mkv-episode-matcher 0.1.13__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mkv-episode-matcher might be problematic.
- mkv_episode_matcher/__main__.py +8 -4
- mkv_episode_matcher/episode_identification.py +208 -0
- mkv_episode_matcher/episode_matcher.py +98 -242
- mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/SubZero.py +38 -12
- mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/dictionaries/data.py +16644 -193
- mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/post_processing.py +125 -80
- mkv_episode_matcher/libraries/pgs2srt/imagemaker.py +7 -5
- mkv_episode_matcher/libraries/pgs2srt/pgs2srt.py +49 -20
- mkv_episode_matcher/libraries/pgs2srt/pgsreader.py +53 -49
- mkv_episode_matcher/mkv_to_srt.py +150 -22
- mkv_episode_matcher/speech_to_text.py +90 -0
- mkv_episode_matcher/utils.py +222 -74
- mkv_episode_matcher-0.3.0.dist-info/METADATA +119 -0
- mkv_episode_matcher-0.3.0.dist-info/RECORD +25 -0
- mkv_episode_matcher/notebooks/get_subtitles_test.ipynb +0 -252
- mkv_episode_matcher/notebooks/whisper.ipynb +0 -122
- mkv_episode_matcher-0.1.13.dist-info/METADATA +0 -113
- mkv_episode_matcher-0.1.13.dist-info/RECORD +0 -25
- {mkv_episode_matcher-0.1.13.dist-info → mkv_episode_matcher-0.3.0.dist-info}/WHEEL +0 -0
- {mkv_episode_matcher-0.1.13.dist-info → mkv_episode_matcher-0.3.0.dist-info}/entry_points.txt +0 -0
- {mkv_episode_matcher-0.1.13.dist-info → mkv_episode_matcher-0.3.0.dist-info}/top_level.txt +0 -0
mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/post_processing.py

@@ -49,8 +49,6 @@
 
 import re
 
-from tld import get_tld
-
 from Libraries.SubZero.dictionaries.data import data
 from Libraries.SubZero.SubZero import (
     MultipleLineProcessor,
@@ -60,6 +58,7 @@ from Libraries.SubZero.SubZero import (
     SubtitleTextModification,
     WholeLineProcessor,
 )
+from tld import get_tld
 
 
 class CommonFixes(SubtitleTextModification):
@@ -72,105 +71,134 @@ class CommonFixes(SubtitleTextModification):
 
     processors = [
         # normalize hyphens
-        NReProcessor(re.compile(r
-
+        NReProcessor(re.compile(r"(?u)([‑‐﹘﹣])"), "-", name="CM_hyphens"),
         # -- = em dash
-        NReProcessor(
-
+        NReProcessor(
+            re.compile(r"(?u)(\w|\b|\s|^)(-\s?-{1,2})"), r"\1—", name="CM_multidash"
+        ),
         # line = _/-/\s
-        NReProcessor(
-
+        NReProcessor(
+            re.compile(r'(?u)(^\W*[-_.:<>~"\']+\W*$)'), "", name="CM_non_word_only"
+        ),
         # remove >>
-        NReProcessor(re.compile(r
-
+        NReProcessor(re.compile(r"(?u)^\s?>>\s*"), "", name="CM_leading_crocodiles"),
         # line = : text
-        NReProcessor(
-
+        NReProcessor(
+            re.compile(r"(?u)(^\W*:\s*(?=\w+))"), "", name="CM_empty_colon_start"
+        ),
         # fix music symbols
-        NReProcessor(
-
-
-
+        NReProcessor(
+            re.compile(r"(?u)(^[-\s>~]*[*#¶]+\s+)|(\s*[*#¶]+\s*$)"),
+            lambda x: "♪ " if x.group(1) else " ♪",
+            name="CM_music_symbols",
+        ),
         # '' = "
-        NReProcessor(
-
+        NReProcessor(
+            re.compile(r"(?u)([\'’ʼ❜‘‛][\'’ʼ❜‘‛]+)"), '"', name="CM_double_apostrophe"
+        ),
         # double quotes instead of single quotes inside words
-        NReProcessor(
-
+        NReProcessor(
+            re.compile(r'(?u)([A-zÀ-ž])"([A-zÀ-ž])'),
+            r"\1'\2",
+            name="CM_double_as_single",
+        ),
         # normalize quotes
-        NReProcessor(
-
-
-
+        NReProcessor(
+            re.compile(r'(?u)(\s*["”“‟„])\s*(["”“‟„]["”“‟„\s]*)'),
+            lambda match: '"' + (" " if match.group(2).endswith(" ") else ""),
+            name="CM_normalize_quotes",
+        ),
         # normalize single quotes
-        NReProcessor(re.compile(r
-
+        NReProcessor(re.compile(r"(?u)([\'’ʼ❜‘‛])"), "'", name="CM_normalize_squotes"),
         # remove leading ...
-        NReProcessor(re.compile(r
-
+        NReProcessor(re.compile(r"(?u)^\.\.\.[\s]*"), "", name="CM_leading_ellipsis"),
         # remove "downloaded from" tags
-        NReProcessor(re.compile(r
-
+        NReProcessor(re.compile(r"(?ui).+downloaded\s+from.+"), "", name="CM_crap"),
         # no space after ellipsis
-        NReProcessor(
-
+        NReProcessor(
+            re.compile(r'(?u)\.\.\.(?![\s.,!?\'"])(?!$)'),
+            "... ",
+            name="CM_ellipsis_no_space",
+        ),
         # no space before spaced ellipsis
-        NReProcessor(
-
+        NReProcessor(
+            re.compile(r"(?u)(?<=[^\s])(?<!\s)\. \. \."),
+            " . . .",
+            name="CM_ellipsis_no_space2",
+        ),
         # multiple spaces
-        NReProcessor(re.compile(r
-
+        NReProcessor(re.compile(r"(?u)[\s]{2,}"), " ", name="CM_multiple_spaces"),
         # more than 3 dots
-        NReProcessor(re.compile(r
-
+        NReProcessor(re.compile(r"(?u)\.{3,}"), "...", name="CM_dots"),
         # no space after starting dash
-        NReProcessor(re.compile(r
-
+        NReProcessor(re.compile(r"(?u)^-(?![\s-])"), "- ", name="CM_dash_space"),
         # remove starting spaced dots (not matching ellipses)
-        NReProcessor(
-
-
+        NReProcessor(
+            re.compile(r"(?u)^(?!\s?(\.\s\.\s\.)|(\s?\.{3}))(?=\.+\s+)[\s.]*"),
+            "",
+            name="CM_starting_spacedots",
+        ),
         # space missing before doublequote
-        ReProcessor(
-
+        ReProcessor(
+            re.compile(r'(?u)(?<!^)(?<![\s(\["])("[^"]+")'),
+            r" \1",
+            name="CM_space_before_dblquote",
+        ),
         # space missing after doublequote
-        ReProcessor(
-
+        ReProcessor(
+            re.compile(r'(?u)("[^"\s][^"]+")([^\s.,!?)\]]+)'),
+            r"\1 \2",
+            name="CM_space_after_dblquote",
+        ),
         # space before ending doublequote?
-
         # replace uppercase I with lowercase L in words
-        NReProcessor(
-
-
-
+        NReProcessor(
+            re.compile(r"(?u)([a-zà-ž]+)(I+)"),
+            lambda match: r"{}{}".format(match.group(1), "l" * len(match.group(2))),
+            name="CM_uppercase_i_in_word",
+        ),
         # fix spaces in numbers (allows for punctuation: ,.:' (comma/dot only fixed if after space, those may be
         # countdowns otherwise); don't break up ellipses
         NReProcessor(
-            re.compile(
-
-
-
+            re.compile(
+                r"(?u)(\b[0-9]+[0-9:\']*(?<!\.\.)\s+(?!\.\.)[0-9,.:\'\s]*(?=[0-9]+)[0-9,.:\'])"
+            ),
+            lambda match: match.group(1).replace(" ", "")
+            if match.group(1).count(" ") == 1
+            else match.group(1),
+            name="CM_spaces_in_numbers",
+        ),
         # uppercase after dot
         # NReProcessor(re.compile(r'(?u)((?<!(?=\s*[A-ZÀ-Ž-_0-9.]\s*))(?:[^.\s])+\.\s+)([a-zà-ž])'),
         #               lambda match: r'%s%s' % (match.group(1), match.group(2).upper()), name="CM_uppercase_after_dot"),
-
         # remove double interpunction
-        NReProcessor(
-
-
-
+        NReProcessor(
+            re.compile(r"(?u)(\s*[,!?])\s*([,.!?][,.!?\s]*)"),
+            lambda match: match.group(1).strip()
+            + (" " if match.group(2).endswith(" ") else ""),
+            name="CM_double_interpunct",
+        ),
         # remove spaces before punctuation; don't break spaced ellipses
-        NReProcessor(
-
+        NReProcessor(
+            re.compile(r"(?u)(?:(?<=^)|(?<=\w)) +([!?.,](?![!?.,]| \.))"),
+            r"\1",
+            name="CM_punctuation_space",
+        ),
         # add space after punctuation
-        NReProcessor(
-
-
-
+        NReProcessor(
+            re.compile(r"(?u)(([^\s]*)([!?.,:])([A-zÀ-ž]{2,}))"),
+            lambda match: f"{match.group(2)}{match.group(3)} {match.group(4)}"
+            if not get_tld(match.group(1), fail_silently=True, fix_protocol=True)
+            else match.group(1),
+            name="CM_punctuation_space2",
+        ),
         # fix lowercase I in english
-        NReProcessor(
-
-
+        NReProcessor(
+            re.compile(r"(?u)(\b)i(\b)"),
+            r"\1I\2",
+            name="CM_EN_lowercase_i",
+            # supported=lambda p: p.language == ENGLISH),
+        ),
     ]
 
 
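The CommonFixes processors above are ordinary regex substitutions wrapped in NReProcessor. As a rough illustration only (standard-library re, a few of the rules from this hunk applied in an arbitrary order, not through the SubZero processor chain):

import re

line = ">> -Wait....we saw  him"
line = re.sub(r"(?u)^\s?>>\s*", "", line)        # CM_leading_crocodiles: strip leading ">>"
line = re.sub(r"(?u)^-(?![\s-])", "- ", line)    # CM_dash_space: space after a starting dash
line = re.sub(r"(?u)\.{3,}", "...", line)        # CM_dots: collapse runs of 4+ dots
line = re.sub(r'(?u)\.\.\.(?![\s.,!?\'"])(?!$)', "... ", line)  # CM_ellipsis_no_space
line = re.sub(r"(?u)[\s]{2,}", " ", line)        # CM_multiple_spaces
print(line)  # - Wait... we saw him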
@@ -200,16 +228,33 @@ class FixOCR(SubtitleTextModification):
         return [
             # remove broken HI tag colons (ANNOUNCER'., ". instead of :) after at least 3 uppercase chars
             # don't modify stuff inside quotes
-            NReProcessor(
-
-
+            NReProcessor(
+                re.compile(
+                    r'(?u)(^[^"\'’ʼ❜‘‛”“‟„]*(?<=[A-ZÀ-Ž]{3})[A-ZÀ-Ž-_\s0-9]+)'
+                    r'(["\'’ʼ❜‘‛”“‟„]*[.,‚،⹁、;]+)(\s*)(?!["\'’ʼ❜‘‛”“‟„])'
+                ),
+                r"\1:\3",
+                name="OCR_fix_HI_colons",
+            ),
             # fix F'bla
-            NReProcessor(
+            NReProcessor(
+                re.compile(r"(?u)(\bF)(\')([A-zÀ-ž]*\b)"), r"\1\3", name="OCR_fix_F"
+            ),
             WholeLineProcessor(self.data_dict["WholeLines"], name="OCR_replace_line"),
-            MultipleWordReProcessor(
-
-
-            MultipleWordReProcessor(
-
+            MultipleWordReProcessor(
+                self.data_dict["WholeWords"], name="OCR_replace_word"
+            ),
+            MultipleWordReProcessor(
+                self.data_dict["BeginLines"], name="OCR_replace_beginline"
+            ),
+            MultipleWordReProcessor(
+                self.data_dict["EndLines"], name="OCR_replace_endline"
+            ),
+            MultipleWordReProcessor(
+                self.data_dict["PartialLines"], name="OCR_replace_partialline"
+            ),
+            MultipleLineProcessor(
+                self.data_dict["PartialWordsAlways"],
+                name="OCR_replace_partialwordsalways",
+            ),
         ]
-
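Most of the FixOCR pipeline is driven by the replacement tables referenced via self.data_dict (WholeLines, WholeWords, BeginLines, and so on), but the two inline rules shown above are plain regexes. A minimal sketch of OCR_fix_F outside the processor framework:

import re

# OCR_fix_F: drop the stray apostrophe OCR tends to insert after a capital F
ocr_line = "F'orget it, F'red."
print(re.sub(r"(?u)(\bF)(\')([A-zÀ-ž]*\b)", r"\1\3", ocr_line))  # Forget it, Fred.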
mkv_episode_matcher/libraries/pgs2srt/imagemaker.py

@@ -3,7 +3,6 @@ from PIL import Image
 
 
 def read_rle_bytes(ods_bytes):
-
     pixels = []
     line_builder = []
 
@@ -41,12 +40,13 @@ def read_rle_bytes(ods_bytes):
         i += incr
 
     if line_builder:
-        print(f
+        print(f"Probably an error; hanging pixels: {line_builder}")
 
     return pixels
 
+
 def ycbcr2rgb(ar):
-    xform = np.array([[1, 0, 1.402], [1, -0.34414,
+    xform = np.array([[1, 0, 1.402], [1, -0.34414, -0.71414], [1, 1.772, 0]])
     rgb = ar.astype(float)
     # Subtracting by 128 the R and G channels
     rgb[:, [1, 2]] -= 128
@@ -58,6 +58,7 @@ def ycbcr2rgb(ar):
     np.putmask(rgb, rgb < 0, 0)
     return np.uint8(rgb)
 
+
 def px_rgb_a(ods, pds, swap):
     px = read_rle_bytes(ods.img_data)
     px = np.array([[255] * (ods.width - len(l)) + l for l in px], dtype=np.uint8)
@@ -78,10 +79,11 @@ def px_rgb_a(ods, pds, swap):
 
     return px, rgb, a
 
+
 def make_image(ods, pds, swap=False):
     px, rgb, a = px_rgb_a(ods, pds, swap)
-    alpha = Image.fromarray(a, mode=
-    img = Image.fromarray(px, mode=
+    alpha = Image.fromarray(a, mode="L")
+    img = Image.fromarray(px, mode="P")
     img.putalpha(alpha)
     img.putpalette(rgb)
     return img
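The restored xform matrix in ycbcr2rgb is the usual full-range YCbCr-to-RGB conversion. A small sketch of the same arithmetic on a single pixel (NumPy only; sample values chosen arbitrarily, vector ordered [Y, Cb, Cr]):

import numpy as np

xform = np.array([[1, 0, 1.402], [1, -0.34414, -0.71414], [1, 1.772, 0]])
y, cb, cr = 81.0, 90.0, 240.0                      # a reddish sample
rgb = xform @ np.array([y, cb - 128, cr - 128])    # chroma is centred on 128
print(np.clip(rgb, 0, 255).round())                # [238.  14.  14.]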
mkv_episode_matcher/libraries/pgs2srt/pgs2srt.py

@@ -6,21 +6,36 @@ from datetime import datetime, timedelta
 
 import pytesseract
 from imagemaker import make_image
+from Libraries.SubZero.post_processing import CommonFixes, FixOCR
 from pgsreader import PGSReader
 from PIL import Image, ImageOps
 
-
-
-parser =
-
-parser.add_argument(
-
-
-
-
-
-
+parser = argparse.ArgumentParser(description="Convert PGS subtitles to SubRip format.")
+
+parser.add_argument("input", type=str, help="The input file (a .sup file).")
+parser.add_argument("--output", type=str, help="The output file (a .srt file).")
+parser.add_argument(
+    "--oem",
+    type=int,
+    help="The OCR Engine Mode to use (Default: 1).",
+    default=1,
+    choices=range(4),
+)
+parser.add_argument(
+    "--language", type=str, help="The language to use (Default: eng).", default="eng"
+)
+parser.add_argument(
+    "--fix_common",
+    help="Fixes common whitespace/punctuation issues.",
+    dest="fix_common",
+    action="store_true",
+)
+parser.add_argument(
+    "--fix_common_ocr",
+    help="Fixes common OCR issues for supported languages.",
+    dest="fix_ocr",
+    action="store_true",
+)
 
 args = parser.parse_args()
 
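A quick sanity check of the CLI surface defined above, reusing the parser built in this hunk (the file names are placeholders):

args = parser.parse_args(["movie.track_3.sup", "--output", "movie.srt", "--fix_common"])
assert args.input == "movie.track_3.sup" and args.output == "movie.srt"
assert args.oem == 1 and args.language == "eng"          # defaults from add_argument
assert args.fix_common is True and args.fix_ocr is False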
@@ -46,7 +61,11 @@ tesseract_config = f"-c tessedit_char_blacklist=[] --psm 6 --oem {args.oem}"
 # If an output file for the subrip output is provided, use that.
 # Otherwise remove the ".sup" extension from the input and append
 # ".srt".
-output_file =
+output_file = (
+    args.output
+    if args.output is not None
+    else (args.input.replace(".sup", "") + ".srt")
+)
 
 # SubRip output
 output = ""
@@ -66,7 +85,7 @@ for ds in pgs.iter_displaysets():
 
     if pds and ods:
         # Create and show the bitmap image and convert it to RGBA
-        src = make_image(ods, pds).convert(
+        src = make_image(ods, pds).convert("RGBA")
 
         # Create grayscale image with black background
         img = Image.new("L", src.size, "BLACK")
@@ -76,13 +95,15 @@ for ds in pgs.iter_displaysets():
         img = ImageOps.invert(img)
 
         # Parse the image with tesesract
-        text = pytesseract.image_to_string(
+        text = pytesseract.image_to_string(
+            img, lang=tesseract_lang, config=tesseract_config
+        ).strip()
 
         # Replace "|" with "I"
         # Works better than blacklisting "|" in Tesseract,
         # which results in I becoming "!" "i" and "1"
-        text = re.sub(r
-        text = re.sub(r
+        text = re.sub(r"[|/\\]", "I", text)
+        text = re.sub(r"[_]", "L", text)
 
         if args.fix_common:
             text = fix_common.process(text)
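The two re.sub calls above undo common Tesseract confusions on subtitle fonts; a minimal before/after on a made-up OCR result:

import re

raw = "|t's over, _arry. We|| done."
text = re.sub(r"[|/\\]", "I", raw)   # replace "|", "/" and "\" with "I"
text = re.sub(r"[_]", "L", text)     # replace "_" with "L"
print(text)  # It's over, Larry. WeII done.

The leftover mid-word "I"s in "WeII" are what the CM_uppercase_i_in_word rule in CommonFixes later rewrites to "Well" when --fix_common is enabled.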
@@ -100,11 +121,19 @@ for ds in pgs.iter_displaysets():
             end = datetime.fromtimestamp(pcs.presentation_timestamp / 1000)
             end = end + timedelta(hours=-1)
 
-            if
+            if (
+                isinstance(start, datetime)
+                and isinstance(end, datetime)
+                and len(text)
+            ):
                 si = si + 1
                 sub_output = str(si) + "\n"
-                sub_output +=
-
+                sub_output += (
+                    start.strftime("%H:%M:%S,%f")[0:12]
+                    + " --> "
+                    + end.strftime("%H:%M:%S,%f")[0:12]
+                    + "\n"
+                )
                 sub_output += text + "\n\n"
 
                 output += sub_output
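The cue block above gets its millisecond precision by slicing strftime output; a deterministic sketch of that formatting with arbitrary timestamps:

from datetime import datetime

start = datetime(2000, 1, 1, 0, 1, 23, 123000)   # cue starts at 00:01:23.123
end = datetime(2000, 1, 1, 0, 1, 25, 500000)     # cue ends at 00:01:25.500
line = start.strftime("%H:%M:%S,%f")[0:12] + " --> " + end.strftime("%H:%M:%S,%f")[0:12]
print(line)  # 00:01:23,123 --> 00:01:25,500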
mkv_episode_matcher/libraries/pgs2srt/pgsreader.py

@@ -4,27 +4,26 @@ from collections import namedtuple
 from os.path import split as pathsplit
 
 # Constants for Segments
-PDS = int(
-ODS = int(
-PCS = int(
-WDS = int(
-END = int(
+PDS = int("0x14", 16)
+ODS = int("0x15", 16)
+PCS = int("0x16", 16)
+WDS = int("0x17", 16)
+END = int("0x80", 16)
 
 # Named tuple access for static PDS palettes
-Palette = namedtuple(
+Palette = namedtuple("Palette", "Y Cr Cb Alpha")
+
 
 class InvalidSegmentError(Exception):
-
+    """Raised when a segment does not match PGS specification"""
 
 
 class PGSReader:
-
     def __init__(self, filepath):
         self.filedir, self.file = pathsplit(filepath)
-        with open(filepath,
+        with open(filepath, "rb") as f:
             self.bytes = f.read()
 
-
     def make_segment(self, bytes_):
         cls = SEGMENT_TYPE[bytes_[10]]
         return cls(bytes_)
@@ -40,35 +39,29 @@ class PGSReader:
         ds = []
         for s in self.iter_segments():
             ds.append(s)
-            if s.type ==
+            if s.type == "END":
                 yield DisplaySet(ds)
                 ds = []
 
     @property
     def segments(self):
-        if not hasattr(self,
+        if not hasattr(self, "_segments"):
             self._segments = list(self.iter_segments())
         return self._segments
 
     @property
     def displaysets(self):
-        if not hasattr(self,
+        if not hasattr(self, "_displaysets"):
             self._displaysets = list(self.iter_displaysets())
         return self._displaysets
 
-class BaseSegment:
 
-
-
-        ODS: 'ODS',
-        PCS: 'PCS',
-        WDS: 'WDS',
-        END: 'END'
-    }
+class BaseSegment:
+    SEGMENT = {PDS: "PDS", ODS: "ODS", PCS: "PCS", WDS: "WDS", END: "END"}
 
     def __init__(self, bytes_):
         self.bytes = bytes_
-        if bytes_[:2] != b
+        if bytes_[:2] != b"PG":
             raise InvalidSegmentError
         self.pts = int(bytes_[2:6].hex(), base=16) / 90
         self.dts = int(bytes_[6:10].hex(), base=16) / 90
@@ -80,18 +73,20 @@ class BaseSegment:
         return self.size
 
     @property
-    def presentation_timestamp(self):
+    def presentation_timestamp(self):
+        return self.pts
 
     @property
-    def decoding_timestamp(self):
+    def decoding_timestamp(self):
+        return self.dts
 
     @property
-    def segment_type(self):
+    def segment_type(self):
+        return self.type
 
-class PresentationCompositionSegment(BaseSegment):
 
+class PresentationCompositionSegment(BaseSegment):
     class CompositionObject:
-
         def __init__(self, bytes_):
             self.bytes = bytes_
             self.object_id = int(bytes_[0:2].hex(), base=16)
@@ -106,9 +101,9 @@ class PresentationCompositionSegment(BaseSegment):
             self.crop_height = int(bytes_[14:16].hex(), base=16)
 
     STATE = {
-        int(
-        int(
-        int(
+        int("0x00", base=16): "Normal",
+        int("0x40", base=16): "Acquisition Point",
+        int("0x80", base=16): "Epoch Start",
     }
 
     def __init__(self, bytes_):
@@ -123,18 +118,22 @@ class PresentationCompositionSegment(BaseSegment):
         self._num_comps = self.data[10]
 
     @property
-    def composition_number(self):
+    def composition_number(self):
+        return self._num
 
     @property
-    def composition_state(self):
+    def composition_state(self):
+        return self._state
 
     @property
     def composition_objects(self):
-        if not hasattr(self,
+        if not hasattr(self, "_composition_objects"):
             self._composition_objects = self.get_composition_objects()
             if len(self._composition_objects) != self._num_comps:
-                print(
-
+                print(
+                    "Warning: Number of composition objects asserted "
+                    "does not match the amount found."
+                )
         return self._composition_objects
 
     def get_composition_objects(self):
@@ -146,8 +145,8 @@ class PresentationCompositionSegment(BaseSegment):
             bytes_ = bytes_[length:]
         return comps
 
-class WindowDefinitionSegment(BaseSegment):
 
+class WindowDefinitionSegment(BaseSegment):
     def __init__(self, bytes_):
         BaseSegment.__init__(self, bytes_)
         self.num_windows = self.data[0]
@@ -157,8 +156,8 @@ class WindowDefinitionSegment(BaseSegment):
         self.width = int(self.data[6:8].hex(), base=16)
         self.height = int(self.data[8:10].hex(), base=16)
 
-class PaletteDefinitionSegment(BaseSegment):
 
+class PaletteDefinitionSegment(BaseSegment):
     def __init__(self, bytes_):
         BaseSegment.__init__(self, bytes_)
         self.palette_id = self.data[0]
@@ -168,14 +167,14 @@ class PaletteDefinitionSegment(BaseSegment):
         # Iterate entries. Explode the 5 bytes into namedtuple Palette. Must be exploded
         for entry in range(len(self.data[2:]) // 5):
             i = 2 + entry * 5
-            self.palette[self.data[i]] = Palette(*self.data[i + 1:i + 5])
+            self.palette[self.data[i]] = Palette(*self.data[i + 1 : i + 5])
 
-class ObjectDefinitionSegment(BaseSegment):
 
+class ObjectDefinitionSegment(BaseSegment):
     SEQUENCE = {
-        int(
-        int(
-        int(
+        int("0x40", base=16): "Last",
+        int("0x80", base=16): "First",
+        int("0xc0", base=16): "First and last",
     }
 
     def __init__(self, bytes_):
@@ -188,13 +187,15 @@ class ObjectDefinitionSegment(BaseSegment):
         self.height = int(self.data[9:11].hex(), base=16)
         self.img_data = self.data[11:]
         if len(self.img_data) != self.data_len - 4:
-            print(
-
+            print(
+                "Warning: Image data length asserted does not match the length found."
+            )
 
-class EndSegment(BaseSegment):
 
+class EndSegment(BaseSegment):
     @property
-    def is_end(self):
+    def is_end(self):
+        return True
 
 
 SEGMENT_TYPE = {
@@ -202,20 +203,23 @@ SEGMENT_TYPE = {
     ODS: ObjectDefinitionSegment,
     PCS: PresentationCompositionSegment,
     WDS: WindowDefinitionSegment,
-    END: EndSegment
+    END: EndSegment,
 }
 
-class DisplaySet:
 
+class DisplaySet:
     def __init__(self, segments):
         self.segments = segments
         self.segment_types = [s.type for s in segments]
-        self.has_image =
+        self.has_image = "ODS" in self.segment_types
+
 
 def segment_by_type_getter(type_):
     def f(self):
         return [s for s in self.segments if s.type == type_]
+
     return f
 
+
 for type_ in BaseSegment.SEGMENT.values():
     setattr(DisplaySet, type_.lower(), property(segment_by_type_getter(type_)))