mkv-episode-matcher 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



@@ -0,0 +1,215 @@
+ # MIT License
+ #
+ # Copyright (c) 2018 Hannes Tismer
+ #
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
+ # of this software and associated documentation files (the "Software"), to deal
+ # in the Software without restriction, including without limitation the rights
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ # copies of the Software, and to permit persons to whom the Software is
+ # furnished to do so, subject to the following conditions:
+ #
+ # The above copyright notice and this permission notice shall be included in all
+ # copies or substantial portions of the Software.
+ #
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ # SOFTWARE.
+ #
+ #
+ # Copyright for portions of project Sub-Zero are held by Bram Walet, 2014 as part of project Subliminal.bundle.
+ # The original license is supplied below.
+ #
+ # The MIT License (MIT)
+ #
+ # Copyright (c) 2014 Bram Walet
+ #
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
+ # of this software and associated documentation files (the "Software"), to deal
+ # in the Software without restriction, including without limitation the rights
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ # copies of the Software, and to permit persons to whom the Software is
+ # furnished to do so, subject to the following conditions:
+ #
+ # The above copyright notice and this permission notice shall be included in all
+ # copies or substantial portions of the Software.
+ #
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ # SOFTWARE.
+
+
+ import re
+
+ from tld import get_tld
+
+ from Libraries.SubZero.dictionaries.data import data
+ from Libraries.SubZero.SubZero import (
+     MultipleLineProcessor,
+     MultipleWordReProcessor,
+     NReProcessor,
+     ReProcessor,
+     SubtitleTextModification,
+     WholeLineProcessor,
+ )
+
+
+ class CommonFixes(SubtitleTextModification):
+     identifier = "common"
+     description = "Basic common fixes"
+     exclusive = True
+     order = 40
+
+     long_description = "Fix common and whitespace/punctuation issues in subtitles"
+
+     processors = [
+         # normalize hyphens
+         NReProcessor(re.compile(r'(?u)([‑‐﹘﹣])'), "-", name="CM_hyphens"),
+
+         # -- = em dash
+         NReProcessor(re.compile(r'(?u)(\w|\b|\s|^)(-\s?-{1,2})'), r"\1—", name="CM_multidash"),
+
+         # line = _/-/\s
+         NReProcessor(re.compile(r'(?u)(^\W*[-_.:<>~"\']+\W*$)'), "", name="CM_non_word_only"),
+
+         # remove >>
+         NReProcessor(re.compile(r'(?u)^\s?>>\s*'), "", name="CM_leading_crocodiles"),
+
+         # line = : text
+         NReProcessor(re.compile(r'(?u)(^\W*:\s*(?=\w+))'), "", name="CM_empty_colon_start"),
+
+         # fix music symbols
+         NReProcessor(re.compile(r'(?u)(^[-\s>~]*[*#¶]+\s+)|(\s*[*#¶]+\s*$)'),
+                      lambda x: "♪ " if x.group(1) else " ♪",
+                      name="CM_music_symbols"),
+
+         # '' = "
+         NReProcessor(re.compile(r'(?u)([\'’ʼ❜‘‛][\'’ʼ❜‘‛]+)'), '"', name="CM_double_apostrophe"),
+
+         # double quotes instead of single quotes inside words
+         NReProcessor(re.compile(r'(?u)([A-zÀ-ž])"([A-zÀ-ž])'), r"\1'\2", name="CM_double_as_single"),
+
+         # normalize quotes
+         NReProcessor(re.compile(r'(?u)(\s*["”“‟„])\s*(["”“‟„]["”“‟„\s]*)'),
+                      lambda match: '"' + (" " if match.group(2).endswith(" ") else ""),
+                      name="CM_normalize_quotes"),
+
+         # normalize single quotes
+         NReProcessor(re.compile(r'(?u)([\'’ʼ❜‘‛])'), "'", name="CM_normalize_squotes"),
+
+         # remove leading ...
+         NReProcessor(re.compile(r'(?u)^\.\.\.[\s]*'), "", name="CM_leading_ellipsis"),
+
+         # remove "downloaded from" tags
+         NReProcessor(re.compile(r'(?ui).+downloaded\s+from.+'), "", name="CM_crap"),
+
+         # no space after ellipsis
+         NReProcessor(re.compile(r'(?u)\.\.\.(?![\s.,!?\'"])(?!$)'), "... ", name="CM_ellipsis_no_space"),
+
+         # no space before spaced ellipsis
+         NReProcessor(re.compile(r'(?u)(?<=[^\s])(?<!\s)\. \. \.'), " . . .", name="CM_ellipsis_no_space2"),
+
+         # multiple spaces
+         NReProcessor(re.compile(r'(?u)[\s]{2,}'), " ", name="CM_multiple_spaces"),
+
+         # more than 3 dots
+         NReProcessor(re.compile(r'(?u)\.{3,}'), "...", name="CM_dots"),
+
+         # no space after starting dash
+         NReProcessor(re.compile(r'(?u)^-(?![\s-])'), "- ", name="CM_dash_space"),
+
+         # remove starting spaced dots (not matching ellipses)
+         NReProcessor(re.compile(r'(?u)^(?!\s?(\.\s\.\s\.)|(\s?\.{3}))(?=\.+\s+)[\s.]*'), "",
+                      name="CM_starting_spacedots"),
+
+         # space missing before doublequote
+         ReProcessor(re.compile(r'(?u)(?<!^)(?<![\s(\["])("[^"]+")'), r' \1', name="CM_space_before_dblquote"),
+
+         # space missing after doublequote
+         ReProcessor(re.compile(r'(?u)("[^"\s][^"]+")([^\s.,!?)\]]+)'), r"\1 \2", name="CM_space_after_dblquote"),
+
+         # space before ending doublequote?
+
+         # replace uppercase I with lowercase L in words
+         NReProcessor(re.compile(r'(?u)([a-zà-ž]+)(I+)'),
+                      lambda match: r'%s%s' % (match.group(1), "l" * len(match.group(2))),
+                      name="CM_uppercase_i_in_word"),
+
+         # fix spaces in numbers (allows for punctuation: ,.:' (comma/dot only fixed if after space, those may be
+         # countdowns otherwise); don't break up ellipses
+         NReProcessor(
+             re.compile(r'(?u)(\b[0-9]+[0-9:\']*(?<!\.\.)\s+(?!\.\.)[0-9,.:\'\s]*(?=[0-9]+)[0-9,.:\'])'),
+             lambda match: match.group(1).replace(" ", "") if match.group(1).count(" ") == 1 else match.group(1),
+             name="CM_spaces_in_numbers"),
+
+         # uppercase after dot
+         # NReProcessor(re.compile(r'(?u)((?<!(?=\s*[A-ZÀ-Ž-_0-9.]\s*))(?:[^.\s])+\.\s+)([a-zà-ž])'),
+         #              lambda match: r'%s%s' % (match.group(1), match.group(2).upper()), name="CM_uppercase_after_dot"),
+
+         # remove double interpunction
+         NReProcessor(re.compile(r'(?u)(\s*[,!?])\s*([,.!?][,.!?\s]*)'),
+                      lambda match: match.group(1).strip() + (" " if match.group(2).endswith(" ") else ""),
+                      name="CM_double_interpunct"),
+
+         # remove spaces before punctuation; don't break spaced ellipses
+         NReProcessor(re.compile(r'(?u)(?:(?<=^)|(?<=\w)) +([!?.,](?![!?.,]| \.))'), r"\1", name="CM_punctuation_space"),
+
+         # add space after punctuation
+         NReProcessor(re.compile(r'(?u)(([^\s]*)([!?.,:])([A-zÀ-ž]{2,}))'),
+                      lambda match: "%s%s %s" % (match.group(2), match.group(3), match.group(4)) if not get_tld(match.group(1), fail_silently=True, fix_protocol=True) else match.group(1),
+                      name="CM_punctuation_space2"),
+
+         # fix lowercase I in english
+         NReProcessor(re.compile(r'(?u)(\b)i(\b)'), r"\1I\2", name="CM_EN_lowercase_i",
+                      # supported=lambda p: p.language == ENGLISH),
+                      ),
+     ]
+
+
+ class FixOCR(SubtitleTextModification):
+     identifier = "OCR_fixes"
+     description = "Fix common OCR issues"
+     exclusive = True
+     order = 10
+     data_dict = None
+
+     long_description = "Fix issues that happen when a subtitle gets converted from bitmap to text through OCR"
+
+     def __init__(self, language):
+         super(FixOCR, self).__init__()
+         data_dict = data.get(language)
+         if not data_dict:
+             # logger.debug("No SnR-data available for language %s", parent.language)
+             return
+
+         self.data_dict = data_dict
+         self.processors = self.get_processors()
+
+     def get_processors(self):
+         if not self.data_dict:
+             return []
+
+         return [
+             # remove broken HI tag colons (ANNOUNCER'., ". instead of :) after at least 3 uppercase chars
+             # don't modify stuff inside quotes
+             NReProcessor(re.compile(r'(?u)(^[^"\'’ʼ❜‘‛”“‟„]*(?<=[A-ZÀ-Ž]{3})[A-ZÀ-Ž-_\s0-9]+)'
+                                     r'(["\'’ʼ❜‘‛”“‟„]*[.,‚،⹁、;]+)(\s*)(?!["\'’ʼ❜‘‛”“‟„])'),
+                          r"\1:\3", name="OCR_fix_HI_colons"),
+             # fix F'bla
+             NReProcessor(re.compile(r'(?u)(\bF)(\')([A-zÀ-ž]*\b)'), r"\1\3", name="OCR_fix_F"),
+             WholeLineProcessor(self.data_dict["WholeLines"], name="OCR_replace_line"),
+             MultipleWordReProcessor(self.data_dict["WholeWords"], name="OCR_replace_word"),
+             MultipleWordReProcessor(self.data_dict["BeginLines"], name="OCR_replace_beginline"),
+             MultipleWordReProcessor(self.data_dict["EndLines"], name="OCR_replace_endline"),
+             MultipleWordReProcessor(self.data_dict["PartialLines"], name="OCR_replace_partialline"),
+             MultipleLineProcessor(self.data_dict["PartialWordsAlways"], name="OCR_replace_partialwordsalways")
+         ]
+
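To make the intent of these processors concrete, here is a minimal sketch (not part of the packaged code) that applies two of the patterns listed above, CM_dots and CM_multiple_spaces, with plain `re.sub`; the sample line is invented, and the real module wraps these patterns in NReProcessor objects rather than calling `re.sub` directly.

```python
import re

line = "Wait....   it's  over there"
# CM_dots: collapse runs of three or more dots into a plain ellipsis
line = re.sub(r'(?u)\.{3,}', "...", line)
# CM_multiple_spaces: squeeze repeated whitespace into a single space
line = re.sub(r'(?u)[\s]{2,}', " ", line)
print(line)  # Wait... it's over there
```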
@@ -0,0 +1,26 @@
+ # pgs2srt
+
+ Uses [pgsreader](https://github.com/EzraBC/pgsreader) and [pytesseract](https://pypi.org/project/pytesseract/) to convert image-based PGS subtitle files (.sup) to text-based SubRip (.srt) files.
+
+ ## Requirements
+ Python3, pip3, and Tesseract
+
+ ## Installation
+ * Run ```git clone https://github.com/PimvanderLoos/pgs2srt.git```
+ * Inside the repo folder, run ```pip3 install -r requirements.txt```
+ * In your .bashrc or .zshrc add ```alias pgs2srt='<absolute path to repo>/pgs2srt.py'```
+
+ ## How to run
+
+ pgs2srt <pgs filename>.sup
+
+ ## Improving accuracy
+ On Debian and Ubuntu, the default trained model files for Tesseract come from the [fast](https://github.com/tesseract-ocr/tessdata_fast) set. While these are a bit faster than the other options, that speed comes at the cost of accuracy. If you want higher accuracy, I'd recommend using either the [legacy](https://github.com/tesseract-ocr/tessdata) or the [best](https://github.com/tesseract-ocr/tessdata_best) trained models. Note that the fast and best options only support the LSTM OCR Engine Mode (oem 1).
+
+ ## Caveats
+
+ This is in no way a perfect converter, and Tesseract will misread some characters. This project is extremely alpha; issues, pull requests, and suggestions are welcome!
+
+
+ ## Credits
+ This project uses the common + OCR fixes developed by [Sub-Zero.bundle](https://github.com/pannal/Sub-Zero.bundle).
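The "Improving accuracy" note above maps directly onto the Tesseract options that pgs2srt.py passes through pytesseract. A minimal sketch, assuming the "best" trained data has been downloaded to a hypothetical local directory; the image path is a placeholder, while `--tessdata-dir`, `--psm`, and `--oem` are standard Tesseract options:

```python
import pytesseract
from PIL import Image

# Hypothetical path: point Tesseract at the "best" trained data and force the LSTM engine (oem 1)
config = "--tessdata-dir /home/user/tessdata_best --psm 6 --oem 1"
text = pytesseract.image_to_string(Image.open("subtitle_frame.png"), lang="eng", config=config)
print(text)
```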
@@ -0,0 +1,87 @@
+ import numpy as np
+ from PIL import Image
+
+
+ def read_rle_bytes(ods_bytes):
+
+     pixels = []
+     line_builder = []
+
+     i = 0
+     while i < len(ods_bytes):
+         if ods_bytes[i]:
+             incr = 1
+             color = ods_bytes[i]
+             length = 1
+         else:
+             check = ods_bytes[i + 1]
+             if check == 0:
+                 incr = 2
+                 color = 0
+                 length = 0
+                 pixels.append(line_builder)
+                 line_builder = []
+             elif check < 64:
+                 incr = 2
+                 color = 0
+                 length = check
+             elif check < 128:
+                 incr = 3
+                 color = 0
+                 length = ((check - 64) << 8) + ods_bytes[i + 2]
+             elif check < 192:
+                 incr = 3
+                 color = ods_bytes[i + 2]
+                 length = check - 128
+             else:
+                 incr = 4
+                 color = ods_bytes[i + 3]
+                 length = ((check - 192) << 8) + ods_bytes[i + 2]
+         line_builder.extend([color] * length)
+         i += incr
+
+     if line_builder:
+         print(f'Probably an error; hanging pixels: {line_builder}')
+
+     return pixels
+
+ def ycbcr2rgb(ar):
+     xform = np.array([[1, 0, 1.402], [1, -0.34414, -.71414], [1, 1.772, 0]])
+     rgb = ar.astype(float)
+     # Subtract 128 from the Cb and Cr channels (columns 1 and 2)
+     rgb[:, [1, 2]] -= 128
+     # .dot is multiplication of the matrices and xform.T is a transpose of the array axes
+     rgb = rgb.dot(xform.T)
+     # Clamp any pixel value greater than 255 to 255 (max for the RGB colorspace)
+     np.putmask(rgb, rgb > 255, 255)
+     # Clamp any pixel value less than 0 to 0 (min for the RGB colorspace)
+     np.putmask(rgb, rgb < 0, 0)
+     return np.uint8(rgb)
+
+ def px_rgb_a(ods, pds, swap):
+     px = read_rle_bytes(ods.img_data)
+     px = np.array([[255] * (ods.width - len(l)) + l for l in px], dtype=np.uint8)
+
+     # Extract the YCbCrA palette data, swapping channels if requested.
+     if swap:
+         ycbcr = np.array([(entry.Y, entry.Cb, entry.Cr) for entry in pds.palette])
+     else:
+         ycbcr = np.array([(entry.Y, entry.Cr, entry.Cb) for entry in pds.palette])
+     try:
+         rgb = ycbcr2rgb(ycbcr)
+     except AttributeError:
+         print("Error: The image is not in YCbCr format.")
+         exit(1)
+     # Separate the Alpha channel from the YCbCr palette data
+     a = [entry.Alpha for entry in pds.palette]
+     a = np.array([[a[x] for x in l] for l in px], dtype=np.uint8)
+
+     return px, rgb, a
+
+ def make_image(ods, pds, swap=False):
+     px, rgb, a = px_rgb_a(ods, pds, swap)
+     alpha = Image.fromarray(a, mode='L')
+     img = Image.fromarray(px, mode='P')
+     img.putalpha(alpha)
+     img.putpalette(rgb)
+     return img
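As a quick usage sketch (not part of the release), `make_image` can be combined with `PGSReader` from pgsreader.py below to dump each subtitle bitmap to a PNG for inspection; the input path and output filenames are illustrative, and the RGBA conversion mirrors what pgs2srt.py does:

```python
from pgsreader import PGSReader
from imagemaker import make_image

# Placeholder input path; pgsreader.py and imagemaker.py must be importable from the working directory
pgs = PGSReader("movie.sup")
for n, ds in enumerate(pgs.iter_displaysets()):
    if ds.has_image:
        # Use the first palette (PDS) and object (ODS) segments of the display set, as pgs2srt.py does
        img = make_image(ds.ods[0], ds.pds[0]).convert("RGBA")
        img.save(f"subtitle_{n:04d}.png")
```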
@@ -0,0 +1,121 @@
+ #!/usr/bin/env python3
+
+ import argparse
+ import re
+ from datetime import datetime, timedelta
+
+ import pytesseract
+ from imagemaker import make_image
+ from pgsreader import PGSReader
+ from PIL import Image, ImageOps
+
+ from Libraries.SubZero.post_processing import CommonFixes, FixOCR
+
+ parser = argparse.ArgumentParser(description='Convert PGS subtitles to SubRip format.')
+
+ parser.add_argument('input', type=str, help="The input file (a .sup file).")
+ parser.add_argument('--output', type=str, help="The output file (a .srt file).")
+ parser.add_argument('--oem', type=int, help="The OCR Engine Mode to use (Default: 1).", default=1, choices=range(4))
+ parser.add_argument('--language', type=str, help="The language to use (Default: eng).", default='eng')
+ parser.add_argument('--fix_common', help='Fixes common whitespace/punctuation issues.',
+                     dest='fix_common', action='store_true')
+ parser.add_argument('--fix_common_ocr', help='Fixes common OCR issues for supported languages.',
+                     dest='fix_ocr', action='store_true')
+
+ args = parser.parse_args()
+
+ assert args.input is not None
+
+ # Unescape escaped spaces
+ file = args.input.replace("\\ ", " ")
+
+ print(f"Parsing: {file}")
+
+ # Load a PGS/SUP file.
+ pgs = PGSReader(file)
+
+ # Set index
+ i = 0
+
+ # Complete subtitle track index
+ si = 0
+
+ tesseract_lang = args.language
+ tesseract_config = f"-c tessedit_char_blacklist=[] --psm 6 --oem {args.oem}"
+
+ # If an output file for the subrip output is provided, use that.
+ # Otherwise remove the ".sup" extension from the input and append
+ # ".srt".
+ output_file = args.output if args.output is not None else (args.input.replace('.sup', '') + '.srt')
+
+ # SubRip output
+ output = ""
+
+ fix_common = CommonFixes() if args.fix_common else None
+ fix_ocr = FixOCR(args.language) if args.fix_ocr else None
+
+ # Iterate the pgs generator
+ for ds in pgs.iter_displaysets():
+     try:
+         # If set has image, parse the image
+         if ds.has_image:
+             # Get Palette Display Segment
+             pds = ds.pds[0]
+             # Get Object Display Segment
+             ods = ds.ods[0]
+
+             if pds and ods:
+                 # Create the bitmap image and convert it to RGBA
+                 src = make_image(ods, pds).convert('RGBA')
+
+                 # Create grayscale image with black background
+                 img = Image.new("L", src.size, "BLACK")
+                 # Paste the subtitle bitmap
+                 img.paste(src, (0, 0), src)
+                 # Invert the image so the text is readable by Tesseract
+                 img = ImageOps.invert(img)
+
+                 # Parse the image with Tesseract
+                 text = pytesseract.image_to_string(img, lang=tesseract_lang, config=tesseract_config).strip()
+
+                 # Replace "|" with "I"
+                 # Works better than blacklisting "|" in Tesseract,
+                 # which results in I becoming "!" "i" and "1"
+                 text = re.sub(r'[|/\\]', 'I', text)
+                 text = re.sub(r'[_]', 'L', text)
+
+                 if args.fix_common:
+                     text = fix_common.process(text)
+                 if args.fix_ocr:
+                     text = fix_ocr.modify(text)
+
+                 start = datetime.fromtimestamp(ods.presentation_timestamp / 1000)
+                 start = start + timedelta(hours=-1)
+
+         else:
+             # Get Presentation Composition Segment
+             pcs = ds.pcs[0]
+
+             if pcs:
+                 end = datetime.fromtimestamp(pcs.presentation_timestamp / 1000)
+                 end = end + timedelta(hours=-1)
+
+                 if isinstance(start, datetime) and isinstance(end, datetime) and len(text):
+                     si = si + 1
+                     sub_output = str(si) + "\n"
+                     sub_output += start.strftime("%H:%M:%S,%f")[0:12] + \
+                         " --> " + end.strftime("%H:%M:%S,%f")[0:12] + "\n"
+                     sub_output += text + "\n\n"
+
+                     output += sub_output
+                     start = end = text = None
+                     i = i + 1
+
+     except Exception as e:
+         print(e)
+         exit(1)
+
+ f = open(output_file, "w")
+ f.write(output)
+ f.close()
+ print(f"Saved to: {output_file}")
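For reference, the `strftime("%H:%M:%S,%f")[0:12]` slice above yields millisecond-precision SubRip timestamps. A small sketch of the block format the loop assembles; the index, times, and text here are made up:

```python
from datetime import datetime

start = datetime(1900, 1, 1, 0, 1, 2, 345000)
end = datetime(1900, 1, 1, 0, 1, 4, 500000)
block = "1\n"
block += start.strftime("%H:%M:%S,%f")[0:12] + " --> " + end.strftime("%H:%M:%S,%f")[0:12] + "\n"
block += "Example subtitle line\n\n"
print(block)
# 1
# 00:01:02,345 --> 00:01:04,500
# Example subtitle line
```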
@@ -0,0 +1,221 @@
+ #!/usr/bin/env python3
+
+ from collections import namedtuple
+ from os.path import split as pathsplit
+
+ # Constants for Segments
+ PDS = int('0x14', 16)
+ ODS = int('0x15', 16)
+ PCS = int('0x16', 16)
+ WDS = int('0x17', 16)
+ END = int('0x80', 16)
+
+ # Named tuple access for static PDS palettes
+ Palette = namedtuple('Palette', "Y Cr Cb Alpha")
+
+ class InvalidSegmentError(Exception):
+     '''Raised when a segment does not match PGS specification'''
+
+
+ class PGSReader:
+
+     def __init__(self, filepath):
+         self.filedir, self.file = pathsplit(filepath)
+         with open(filepath, 'rb') as f:
+             self.bytes = f.read()
+
+
+     def make_segment(self, bytes_):
+         cls = SEGMENT_TYPE[bytes_[10]]
+         return cls(bytes_)
+
+     def iter_segments(self):
+         bytes_ = self.bytes[:]
+         while bytes_:
+             size = 13 + int(bytes_[11:13].hex(), 16)
+             yield self.make_segment(bytes_[:size])
+             bytes_ = bytes_[size:]
+
+     def iter_displaysets(self):
+         ds = []
+         for s in self.iter_segments():
+             ds.append(s)
+             if s.type == 'END':
+                 yield DisplaySet(ds)
+                 ds = []
+
+     @property
+     def segments(self):
+         if not hasattr(self, '_segments'):
+             self._segments = list(self.iter_segments())
+         return self._segments
+
+     @property
+     def displaysets(self):
+         if not hasattr(self, '_displaysets'):
+             self._displaysets = list(self.iter_displaysets())
+         return self._displaysets
+
+ class BaseSegment:
+
+     SEGMENT = {
+         PDS: 'PDS',
+         ODS: 'ODS',
+         PCS: 'PCS',
+         WDS: 'WDS',
+         END: 'END'
+     }
+
+     def __init__(self, bytes_):
+         self.bytes = bytes_
+         if bytes_[:2] != b'PG':
+             raise InvalidSegmentError
+         self.pts = int(bytes_[2:6].hex(), base=16) / 90
+         self.dts = int(bytes_[6:10].hex(), base=16) / 90
+         self.type = self.SEGMENT[bytes_[10]]
+         self.size = int(bytes_[11:13].hex(), base=16)
+         self.data = bytes_[13:]
+
+     def __len__(self):
+         return self.size
+
+     @property
+     def presentation_timestamp(self): return self.pts
+
+     @property
+     def decoding_timestamp(self): return self.dts
+
+     @property
+     def segment_type(self): return self.type
+
+ class PresentationCompositionSegment(BaseSegment):
+
+     class CompositionObject:
+
+         def __init__(self, bytes_):
+             self.bytes = bytes_
+             self.object_id = int(bytes_[0:2].hex(), base=16)
+             self.window_id = bytes_[2]
+             self.cropped = bool(bytes_[3])
+             self.x_offset = int(bytes_[4:6].hex(), base=16)
+             self.y_offset = int(bytes_[6:8].hex(), base=16)
+             if self.cropped:
+                 self.crop_x_offset = int(bytes_[8:10].hex(), base=16)
+                 self.crop_y_offset = int(bytes_[10:12].hex(), base=16)
+                 self.crop_width = int(bytes_[12:14].hex(), base=16)
+                 self.crop_height = int(bytes_[14:16].hex(), base=16)
+
+     STATE = {
+         int('0x00', base=16): 'Normal',
+         int('0x40', base=16): 'Acquisition Point',
+         int('0x80', base=16): 'Epoch Start'
+     }
+
+     def __init__(self, bytes_):
+         BaseSegment.__init__(self, bytes_)
+         self.width = int(self.data[0:2].hex(), base=16)
+         self.height = int(self.data[2:4].hex(), base=16)
+         self.frame_rate = self.data[4]
+         self._num = int(self.data[5:7].hex(), base=16)
+         self._state = self.STATE[self.data[7]]
+         self.palette_update = bool(self.data[8])
+         self.palette_id = self.data[9]
+         self._num_comps = self.data[10]
+
+     @property
+     def composition_number(self): return self._num
+
+     @property
+     def composition_state(self): return self._state
+
+     @property
+     def composition_objects(self):
+         if not hasattr(self, '_composition_objects'):
+             self._composition_objects = self.get_composition_objects()
+             if len(self._composition_objects) != self._num_comps:
+                 print('Warning: Number of composition objects asserted '
+                       'does not match the amount found.')
+         return self._composition_objects
+
+     def get_composition_objects(self):
+         bytes_ = self.data[11:]
+         comps = []
+         while bytes_:
+             length = 8 * (1 + bool(bytes_[3]))
+             comps.append(self.CompositionObject(bytes_[:length]))
+             bytes_ = bytes_[length:]
+         return comps
+
+ class WindowDefinitionSegment(BaseSegment):
+
+     def __init__(self, bytes_):
+         BaseSegment.__init__(self, bytes_)
+         self.num_windows = self.data[0]
+         self.window_id = self.data[1]
+         self.x_offset = int(self.data[2:4].hex(), base=16)
+         self.y_offset = int(self.data[4:6].hex(), base=16)
+         self.width = int(self.data[6:8].hex(), base=16)
+         self.height = int(self.data[8:10].hex(), base=16)
+
+ class PaletteDefinitionSegment(BaseSegment):
+
+     def __init__(self, bytes_):
+         BaseSegment.__init__(self, bytes_)
+         self.palette_id = self.data[0]
+         self.version = self.data[1]
+         self.palette = [Palette(0, 0, 0, 0)] * 256
+         # Slice from byte 2 til end of segment. Divide by 5 to determine number of palette entries
+         # Iterate entries. Explode the 5 bytes into namedtuple Palette. Must be exploded
+         for entry in range(len(self.data[2:]) // 5):
+             i = 2 + entry * 5
+             self.palette[self.data[i]] = Palette(*self.data[i + 1:i + 5])
+
+ class ObjectDefinitionSegment(BaseSegment):
+
+     SEQUENCE = {
+         int('0x40', base=16): 'Last',
+         int('0x80', base=16): 'First',
+         int('0xc0', base=16): 'First and last'
+     }
+
+     def __init__(self, bytes_):
+         BaseSegment.__init__(self, bytes_)
+         self.id = int(self.data[0:2].hex(), base=16)
+         self.version = self.data[2]
+         self.in_sequence = self.SEQUENCE[self.data[3]]
+         self.data_len = int(self.data[4:7].hex(), base=16)
+         self.width = int(self.data[7:9].hex(), base=16)
+         self.height = int(self.data[9:11].hex(), base=16)
+         self.img_data = self.data[11:]
+         if len(self.img_data) != self.data_len - 4:
+             print('Warning: Image data length asserted does not match the '
+                   'length found.')
+
+ class EndSegment(BaseSegment):
+
+     @property
+     def is_end(self): return True
+
+
+ SEGMENT_TYPE = {
+     PDS: PaletteDefinitionSegment,
+     ODS: ObjectDefinitionSegment,
+     PCS: PresentationCompositionSegment,
+     WDS: WindowDefinitionSegment,
+     END: EndSegment
+ }
+
+ class DisplaySet:
+
+     def __init__(self, segments):
+         self.segments = segments
+         self.segment_types = [s.type for s in segments]
+         self.has_image = 'ODS' in self.segment_types
+
+ def segment_by_type_getter(type_):
+     def f(self):
+         return [s for s in self.segments if s.type == type_]
+     return f
+
+ for type_ in BaseSegment.SEGMENT.values():
+     setattr(DisplaySet, type_.lower(), property(segment_by_type_getter(type_)))
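Every segment starts with the same 13-byte header that BaseSegment decodes field by field via `.hex()`. A hypothetical, equivalent reading of that header with `struct`, shown only to document the layout this reader assumes:

```python
import struct

def parse_header(bytes_):
    # 2-byte magic 'PG', 32-bit PTS, 32-bit DTS, 1-byte segment type, 16-bit payload size (big-endian)
    magic, pts, dts, seg_type, size = struct.unpack(">2sIIBH", bytes_[:13])
    assert magic == b"PG"
    # PTS/DTS are 90 kHz ticks; BaseSegment divides by 90 to get milliseconds
    return pts / 90, dts / 90, seg_type, size
```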
@@ -0,0 +1,4 @@
+ pytesseract==0.3.7
+ numpy==1.19.4
+ Pillow==8.2.0
+ tld~=0.12.3
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: mkv-episode-matcher
- Version: 0.1.1
+ Version: 0.1.3
  Summary: The MKV Episode Matcher is a tool for identifying TV series episodes from MKV files and renaming the files accordingly.
  Project-URL: Documentation, https://github.com/Jsakkos/mkv-episode-matcher#readme
  Project-URL: Issues, https://github.com/Jsakkos/mkv-episode-matcher/issues