PyPI - mkv-episode-matcher - Versions diffs - 0.1.5__py3-none-any.whl → 0.1.9__py3-none-any.whl - Mend

mkv-episode-matcher 0.1.5__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mkv-episode-matcher might be problematic. Click here for more details.

Files changed (22)

mkv_episode_matcher/__init__.py +1 -1
mkv_episode_matcher/__main__.py +2 -2
mkv_episode_matcher/libraries/pgs2srt/.gitignore +2 -2
mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/SubZero.py +295 -295
mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/dictionaries/data.py +249 -249
mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/post_processing.py +215 -215
mkv_episode_matcher/libraries/pgs2srt/README.md +26 -26
mkv_episode_matcher/libraries/pgs2srt/imagemaker.py +87 -87
mkv_episode_matcher/libraries/pgs2srt/pgs2srt.py +121 -121
mkv_episode_matcher/libraries/pgs2srt/pgsreader.py +221 -221
mkv_episode_matcher/libraries/pgs2srt/requirements.txt +4 -4
mkv_episode_matcher/mkv_to_srt.py +174 -174
mkv_episode_matcher/notebooks/get_subtitles_test.ipynb +252 -0
mkv_episode_matcher/requirements.txt +6 -7
mkv_episode_matcher/utils.py +5 -2
{mkv_episode_matcher-0.1.5.dist-info → mkv_episode_matcher-0.1.9.dist-info}/METADATA +53 -37
mkv_episode_matcher-0.1.9.dist-info/RECORD +25 -0
{mkv_episode_matcher-0.1.5.dist-info → mkv_episode_matcher-0.1.9.dist-info}/WHEEL +2 -1
mkv_episode_matcher-0.1.9.dist-info/top_level.txt +1 -0
mkv_episode_matcher/libraries/pgs2srt/.git +0 -1
mkv_episode_matcher-0.1.5.dist-info/RECORD +0 -24
{mkv_episode_matcher-0.1.5.dist-info → mkv_episode_matcher-0.1.9.dist-info}/entry_points.txt +0 -0

mkv_episode_matcher/libraries/pgs2srt/imagemaker.py CHANGED Viewed

@@ -1,87 +1,87 @@
-import numpy as np
-from PIL import Image
-def read_rle_bytes(ods_bytes):
-    pixels = []
-    line_builder = []
-    i = 0
-    while i < len(ods_bytes):
-        if ods_bytes[i]:
-            incr = 1
-            color = ods_bytes[i]
-            length = 1
-        else:
-            check = ods_bytes[i + 1]
-            if check == 0:
-                incr = 2
-                color = 0
-                length = 0
-                pixels.append(line_builder)
-                line_builder = []
-            elif check < 64:
-                incr = 2
-                color = 0
-                length = check
-            elif check < 128:
-                incr = 3
-                color = 0
-                length = ((check - 64) << 8) + ods_bytes[i + 2]
-            elif check < 192:
-                incr = 3
-                color = ods_bytes[i + 2]
-                length = check - 128
-            else:
-                incr = 4
-                color = ods_bytes[i + 3]
-                length = ((check - 192) << 8) + ods_bytes[i + 2]
-        line_builder.extend([color] * length)
-        i += incr
-    if line_builder:
-        print(f'Probably an error; hanging pixels: {line_builder}')
-    return pixels
-def ycbcr2rgb(ar):
-    xform = np.array([[1, 0, 1.402], [1, -0.34414, -.71414], [1, 1.772, 0]])
-    rgb = ar.astype(float)
-    # Subtracting by 128 the R and G channels
-    rgb[:, [1, 2]] -= 128
-    # .dot is multiplication of the matrices and xform.T is a transpose of the array axes
-    rgb = rgb.dot(xform.T)
-    # Makes any pixel value greater than 255 just be 255 (Max for RGB colorspace)
-    np.putmask(rgb, rgb > 255, 255)
-    # Sets any pixel value less than 0 to 0 (Min for RGB colorspace)
-    np.putmask(rgb, rgb < 0, 0)
-    return np.uint8(rgb)
-def px_rgb_a(ods, pds, swap):
-    px = read_rle_bytes(ods.img_data)
-    px = np.array([[255] * (ods.width - len(l)) + l for l in px], dtype=np.uint8)
-    # Extract the YCbCrA palette data, swapping channels if requested.
-    if swap:
-        ycbcr = np.array([(entry.Y, entry.Cb, entry.Cr) for entry in pds.palette])
-    else:
-        ycbcr = np.array([(entry.Y, entry.Cr, entry.Cb) for entry in pds.palette])
-    try:
-        rgb = ycbcr2rgb(ycbcr)
-    except AttributeError:
-        print("Error: The image is not in YCbCr format.")
-        exit(1)
-    # Separate the Alpha channel from the YCbCr palette data
-    a = [entry.Alpha for entry in pds.palette]
-    a = np.array([[a[x] for x in l] for l in px], dtype=np.uint8)
-    return px, rgb, a
-def make_image(ods, pds, swap=False):
-    px, rgb, a = px_rgb_a(ods, pds, swap)
-    alpha = Image.fromarray(a, mode='L')
-    img = Image.fromarray(px, mode='P')
-    img.putalpha(alpha)
-    img.putpalette(rgb)
-    return img
+import numpy as np
+from PIL import Image
+def read_rle_bytes(ods_bytes):
+    pixels = []
+    line_builder = []
+    i = 0
+    while i < len(ods_bytes):
+        if ods_bytes[i]:
+            incr = 1
+            color = ods_bytes[i]
+            length = 1
+        else:
+            check = ods_bytes[i + 1]
+            if check == 0:
+                incr = 2
+                color = 0
+                length = 0
+                pixels.append(line_builder)
+                line_builder = []
+            elif check < 64:
+                incr = 2
+                color = 0
+                length = check
+            elif check < 128:
+                incr = 3
+                color = 0
+                length = ((check - 64) << 8) + ods_bytes[i + 2]
+            elif check < 192:
+                incr = 3
+                color = ods_bytes[i + 2]
+                length = check - 128
+            else:
+                incr = 4
+                color = ods_bytes[i + 3]
+                length = ((check - 192) << 8) + ods_bytes[i + 2]
+        line_builder.extend([color] * length)
+        i += incr
+    if line_builder:
+        print(f'Probably an error; hanging pixels: {line_builder}')
+    return pixels
+def ycbcr2rgb(ar):
+    xform = np.array([[1, 0, 1.402], [1, -0.34414, -.71414], [1, 1.772, 0]])
+    rgb = ar.astype(float)
+    # Subtracting by 128 the R and G channels
+    rgb[:, [1, 2]] -= 128
+    # .dot is multiplication of the matrices and xform.T is a transpose of the array axes
+    rgb = rgb.dot(xform.T)
+    # Makes any pixel value greater than 255 just be 255 (Max for RGB colorspace)
+    np.putmask(rgb, rgb > 255, 255)
+    # Sets any pixel value less than 0 to 0 (Min for RGB colorspace)
+    np.putmask(rgb, rgb < 0, 0)
+    return np.uint8(rgb)
+def px_rgb_a(ods, pds, swap):
+    px = read_rle_bytes(ods.img_data)
+    px = np.array([[255] * (ods.width - len(l)) + l for l in px], dtype=np.uint8)
+    # Extract the YCbCrA palette data, swapping channels if requested.
+    if swap:
+        ycbcr = np.array([(entry.Y, entry.Cb, entry.Cr) for entry in pds.palette])
+    else:
+        ycbcr = np.array([(entry.Y, entry.Cr, entry.Cb) for entry in pds.palette])
+    try:
+        rgb = ycbcr2rgb(ycbcr)
+    except AttributeError:
+        print("Error: The image is not in YCbCr format.")
+        exit(1)
+    # Separate the Alpha channel from the YCbCr palette data
+    a = [entry.Alpha for entry in pds.palette]
+    a = np.array([[a[x] for x in l] for l in px], dtype=np.uint8)
+    return px, rgb, a
+def make_image(ods, pds, swap=False):
+    px, rgb, a = px_rgb_a(ods, pds, swap)
+    alpha = Image.fromarray(a, mode='L')
+    img = Image.fromarray(px, mode='P')
+    img.putalpha(alpha)
+    img.putpalette(rgb)
+    return img

mkv_episode_matcher/libraries/pgs2srt/pgs2srt.py CHANGED Viewed

@@ -1,121 +1,121 @@
-#!/usr/bin/env python3
-import argparse
-import re
-from datetime import datetime, timedelta
-import pytesseract
-from imagemaker import make_image
-from pgsreader import PGSReader
-from PIL import Image, ImageOps
-from Libraries.SubZero.post_processing import CommonFixes, FixOCR
-parser = argparse.ArgumentParser(description='Convert PGS subtitles to SubRip format.')
-parser.add_argument('input', type=str, help="The input file (a .sup file).")
-parser.add_argument('--output', type=str, help="The output file (a .srt file).")
-parser.add_argument('--oem', type=int, help="The OCR Engine Mode to use (Default: 1).", default=1, choices=range(4))
-parser.add_argument('--language', type=str, help="The language to use (Default: eng).", default='eng')
-parser.add_argument('--fix_common', help='Fixes common whitespace/punctuation issues.',
-                    dest='fix_common', action='store_true')
-parser.add_argument('--fix_common_ocr', help='Fixes common OCR issues for supported languages.',
-                    dest='fix_ocr', action='store_true')
-args = parser.parse_args()
-assert args.input is not None
-# Unescape escaped spaces
-file = args.input.replace("\\ ", " ")
-print(f"Parsing: {file}")
-# Load a PGS/SUP file.
-pgs = PGSReader(file)
-# Set index
-i = 0
-# Complete subtitle track index
-si = 0
-tesseract_lang = args.language
-tesseract_config = f"-c tessedit_char_blacklist=[] --psm 6 --oem {args.oem}"
-# If an output file for the subrip output is provided, use that.
-# Otherwise remove the ".sup" extension from the input and append
-# ".srt".
-output_file = args.output if args.output is not None else (args.input.replace('.sup', '') + '.srt')
-# SubRip output
-output = ""
-fix_common = CommonFixes() if args.fix_common else None
-fix_ocr = FixOCR(args.language) if args.fix_ocr else None
-# Iterate the pgs generator
-for ds in pgs.iter_displaysets():
-    try:
-        # If set has image, parse the image
-        if ds.has_image:
-            # Get Palette Display Segment
-            pds = ds.pds[0]
-            # Get Object Display Segment
-            ods = ds.ods[0]
-            if pds and ods:
-                # Create and show the bitmap image and convert it to RGBA
-                src = make_image(ods, pds).convert('RGBA')
-                # Create grayscale image with black background
-                img = Image.new("L", src.size, "BLACK")
-                # Paste the subtitle bitmap
-                img.paste(src, (0, 0), src)
-                # Invert images so the text is readable by Tesseract
-                img = ImageOps.invert(img)
-                # Parse the image with tesesract
-                text = pytesseract.image_to_string(img, lang=tesseract_lang, config=tesseract_config).strip()
-                # Replace "|" with "I"
-                # Works better than blacklisting "|" in Tesseract,
-                # which results in I becoming "!" "i" and "1"
-                text = re.sub(r'[|/\\]', 'I', text)
-                text = re.sub(r'[_]', 'L', text)
-                if args.fix_common:
-                    text = fix_common.process(text)
-                if args.fix_ocr:
-                    text = fix_ocr.modify(text)
-                start = datetime.fromtimestamp(ods.presentation_timestamp / 1000)
-                start = start + timedelta(hours=-1)
-        else:
-            # Get Presentation Composition Segment
-            pcs = ds.pcs[0]
-            if pcs:
-                end = datetime.fromtimestamp(pcs.presentation_timestamp / 1000)
-                end = end + timedelta(hours=-1)
-                if isinstance(start, datetime) and isinstance(end, datetime) and len(text):
-                    si = si + 1
-                    sub_output = str(si) + "\n"
-                    sub_output += start.strftime("%H:%M:%S,%f")[0:12] + \
-                        " --> " + end.strftime("%H:%M:%S,%f")[0:12] + "\n"
-                    sub_output += text + "\n\n"
-                    output += sub_output
-                    start = end = text = None
-        i = i + 1
-    except Exception as e:
-        print(e)
-        exit(1)
-f = open(output_file, "w")
-f.write(output)
-f.close()
-print(f"Saved to: {output_file}")
+#!/usr/bin/env python3
+import argparse
+import re
+from datetime import datetime, timedelta
+import pytesseract
+from imagemaker import make_image
+from pgsreader import PGSReader
+from PIL import Image, ImageOps
+from Libraries.SubZero.post_processing import CommonFixes, FixOCR
+parser = argparse.ArgumentParser(description='Convert PGS subtitles to SubRip format.')
+parser.add_argument('input', type=str, help="The input file (a .sup file).")
+parser.add_argument('--output', type=str, help="The output file (a .srt file).")
+parser.add_argument('--oem', type=int, help="The OCR Engine Mode to use (Default: 1).", default=1, choices=range(4))
+parser.add_argument('--language', type=str, help="The language to use (Default: eng).", default='eng')
+parser.add_argument('--fix_common', help='Fixes common whitespace/punctuation issues.',
+                    dest='fix_common', action='store_true')
+parser.add_argument('--fix_common_ocr', help='Fixes common OCR issues for supported languages.',
+                    dest='fix_ocr', action='store_true')
+args = parser.parse_args()
+assert args.input is not None
+# Unescape escaped spaces
+file = args.input.replace("\\ ", " ")
+print(f"Parsing: {file}")
+# Load a PGS/SUP file.
+pgs = PGSReader(file)
+# Set index
+i = 0
+# Complete subtitle track index
+si = 0
+tesseract_lang = args.language
+tesseract_config = f"-c tessedit_char_blacklist=[] --psm 6 --oem {args.oem}"
+# If an output file for the subrip output is provided, use that.
+# Otherwise remove the ".sup" extension from the input and append
+# ".srt".
+output_file = args.output if args.output is not None else (args.input.replace('.sup', '') + '.srt')
+# SubRip output
+output = ""
+fix_common = CommonFixes() if args.fix_common else None
+fix_ocr = FixOCR(args.language) if args.fix_ocr else None
+# Iterate the pgs generator
+for ds in pgs.iter_displaysets():
+    try:
+        # If set has image, parse the image
+        if ds.has_image:
+            # Get Palette Display Segment
+            pds = ds.pds[0]
+            # Get Object Display Segment
+            ods = ds.ods[0]
+            if pds and ods:
+                # Create and show the bitmap image and convert it to RGBA
+                src = make_image(ods, pds).convert('RGBA')
+                # Create grayscale image with black background
+                img = Image.new("L", src.size, "BLACK")
+                # Paste the subtitle bitmap
+                img.paste(src, (0, 0), src)
+                # Invert images so the text is readable by Tesseract
+                img = ImageOps.invert(img)
+                # Parse the image with tesesract
+                text = pytesseract.image_to_string(img, lang=tesseract_lang, config=tesseract_config).strip()
+                # Replace "|" with "I"
+                # Works better than blacklisting "|" in Tesseract,
+                # which results in I becoming "!" "i" and "1"
+                text = re.sub(r'[|/\\]', 'I', text)
+                text = re.sub(r'[_]', 'L', text)
+                if args.fix_common:
+                    text = fix_common.process(text)
+                if args.fix_ocr:
+                    text = fix_ocr.modify(text)
+                start = datetime.fromtimestamp(ods.presentation_timestamp / 1000)
+                start = start + timedelta(hours=-1)
+        else:
+            # Get Presentation Composition Segment
+            pcs = ds.pcs[0]
+            if pcs:
+                end = datetime.fromtimestamp(pcs.presentation_timestamp / 1000)
+                end = end + timedelta(hours=-1)
+                if isinstance(start, datetime) and isinstance(end, datetime) and len(text):
+                    si = si + 1
+                    sub_output = str(si) + "\n"
+                    sub_output += start.strftime("%H:%M:%S,%f")[0:12] + \
+                        " --> " + end.strftime("%H:%M:%S,%f")[0:12] + "\n"
+                    sub_output += text + "\n\n"
+                    output += sub_output
+                    start = end = text = None
+        i = i + 1
+    except Exception as e:
+        print(e)
+        exit(1)
+f = open(output_file, "w")
+f.write(output)
+f.close()
+print(f"Saved to: {output_file}")

mkv-episode-matcher 0.1.5__py3-none-any.whl → 0.1.9__py3-none-any.whl

Potentially problematic release.

mkv-episode-matcher 0.1.5__py3-none-any.whl → 0.1.9__py3-none-any.whl