lattifai-1.2.0-py3-none-any.whl → lattifai-1.2.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/__init__.py +0 -24
- lattifai/alignment/__init__.py +10 -1
- lattifai/alignment/lattice1_aligner.py +66 -58
- lattifai/alignment/lattice1_worker.py +1 -6
- lattifai/alignment/punctuation.py +38 -0
- lattifai/alignment/segmenter.py +1 -1
- lattifai/alignment/sentence_splitter.py +350 -0
- lattifai/alignment/text_align.py +440 -0
- lattifai/alignment/tokenizer.py +91 -220
- lattifai/caption/__init__.py +82 -6
- lattifai/caption/caption.py +335 -1143
- lattifai/caption/formats/__init__.py +199 -0
- lattifai/caption/formats/base.py +211 -0
- lattifai/caption/formats/gemini.py +722 -0
- lattifai/caption/formats/json.py +194 -0
- lattifai/caption/formats/lrc.py +309 -0
- lattifai/caption/formats/nle/__init__.py +9 -0
- lattifai/caption/formats/nle/audition.py +561 -0
- lattifai/caption/formats/nle/avid.py +423 -0
- lattifai/caption/formats/nle/fcpxml.py +549 -0
- lattifai/caption/formats/nle/premiere.py +589 -0
- lattifai/caption/formats/pysubs2.py +642 -0
- lattifai/caption/formats/sbv.py +147 -0
- lattifai/caption/formats/tabular.py +338 -0
- lattifai/caption/formats/textgrid.py +193 -0
- lattifai/caption/formats/ttml.py +652 -0
- lattifai/caption/formats/vtt.py +469 -0
- lattifai/caption/parsers/__init__.py +9 -0
- lattifai/caption/{text_parser.py → parsers/text_parser.py} +4 -2
- lattifai/caption/standardize.py +636 -0
- lattifai/caption/utils.py +474 -0
- lattifai/cli/__init__.py +2 -1
- lattifai/cli/caption.py +108 -1
- lattifai/cli/transcribe.py +4 -9
- lattifai/cli/youtube.py +4 -1
- lattifai/client.py +48 -84
- lattifai/config/__init__.py +11 -1
- lattifai/config/alignment.py +9 -2
- lattifai/config/caption.py +267 -23
- lattifai/config/media.py +20 -0
- lattifai/diarization/__init__.py +41 -1
- lattifai/mixin.py +36 -18
- lattifai/transcription/base.py +6 -1
- lattifai/transcription/lattifai.py +19 -54
- lattifai/utils.py +81 -13
- lattifai/workflow/__init__.py +28 -4
- lattifai/workflow/file_manager.py +2 -5
- lattifai/youtube/__init__.py +43 -0
- lattifai/youtube/client.py +1170 -0
- lattifai/youtube/types.py +23 -0
- lattifai-1.2.2.dist-info/METADATA +615 -0
- lattifai-1.2.2.dist-info/RECORD +76 -0
- {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/entry_points.txt +1 -2
- lattifai/caption/gemini_reader.py +0 -371
- lattifai/caption/gemini_writer.py +0 -173
- lattifai/cli/app_installer.py +0 -142
- lattifai/cli/server.py +0 -44
- lattifai/server/app.py +0 -427
- lattifai/workflow/youtube.py +0 -577
- lattifai-1.2.0.dist-info/METADATA +0 -1133
- lattifai-1.2.0.dist-info/RECORD +0 -57
- {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/WHEEL +0 -0
- {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/licenses/LICENSE +0 -0
- {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/top_level.txt +0 -0
lattifai/alignment/tokenizer.py
CHANGED
@@ -2,12 +2,13 @@ import gzip
 import pickle
 import re
 from collections import defaultdict
-from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar
+from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar, Union

 import numpy as np

-from lattifai.caption import Supervision
-from …
+# from lattifai.caption import Supervision
+from lhotse.supervision import SupervisionSegment as Supervision  # NOTE: Transcriber SupervisionSegment
+
 from lattifai.caption import normalize_text as normalize_html_text
 from lattifai.errors import (
     LATTICE_DECODING_FAILURE_HELP,
@@ -16,12 +17,10 @@ from lattifai.errors import (
     QuotaExceededError,
 )

-
-
-
-
-
-GROUPING_SEPARATOR = "✹"
+from .phonemizer import G2Phonemizer
+from .punctuation import PUNCTUATION, PUNCTUATION_SPACE
+from .sentence_splitter import SentenceSplitter
+from .text_align import TextAlignResult

 MAXIMUM_WORD_LENGTH = 40

@@ -79,8 +78,11 @@ def tokenize_multilingual_text(text: str, keep_spaces: bool = True, attach_punct
     ['Kühlschrank']
     >>> tokenize_multilingual_text("Hello, World!", attach_punctuation=True)
     ['Hello,', ' ', 'World!']
+    >>> tokenize_multilingual_text("[AED], World!", keep_spaces=False, attach_punctuation=True)
+    ['[AED],', 'World!']
     """
     # Regex pattern:
+    # - \[[A-Z_]+\] matches bracketed annotations like [APPLAUSE], [MUSIC], [SPEAKER_01]
     # - [a-zA-Z0-9\u00C0-\u024F]+ matches Latin letters (including accented chars like ü, ö, ä, ß, é, etc.)
     # - (?:'[a-zA-Z]{1,2})? optionally matches contractions like 's, 't, 'm, 'll, 're, 've
     # - [\u4e00-\u9fff] matches CJK characters
@@ -89,7 +91,7 @@ def tokenize_multilingual_text(text: str, keep_spaces: bool = True, attach_punct
     # - \u00C0-\u00FF: Latin-1 Supplement (À-ÿ)
     # - \u0100-\u017F: Latin Extended-A
     # - \u0180-\u024F: Latin Extended-B
-    pattern = re.compile(r"([a-zA-Z0-9\u00C0-\u024F]+(?:'[a-zA-Z]{1,2})?|[\u4e00-\u9fff]|.)")
+    pattern = re.compile(r"(\[[A-Z_]+\]|[a-zA-Z0-9\u00C0-\u024F]+(?:'[a-zA-Z]{1,2})?|[\u4e00-\u9fff]|.)")

     # filter(None, ...) removes any empty strings from re.findall results
     tokens = list(filter(None, pattern.findall(text)))
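The only functional change in this hunk is the new leading alternative \[[A-Z_]+\], which turns bracketed annotations into single tokens instead of per-character fragments. A minimal sketch of the pattern in isolation (the space filtering and attach_punctuation merging shown in the doctests happen later in the function, not in the regex):

import re

# 1.2.2 pattern: bracketed annotations are tried before the word/CJK/catch-all alternatives.
pattern = re.compile(r"(\[[A-Z_]+\]|[a-zA-Z0-9\u00C0-\u024F]+(?:'[a-zA-Z]{1,2})?|[\u4e00-\u9fff]|.)")
print([t for t in pattern.findall("[APPLAUSE] Hello, World!") if t.strip()])
# -> ['[APPLAUSE]', 'Hello', ',', 'World', '!']

# The 1.2.0 pattern split the same annotation into three tokens.
old = re.compile(r"([a-zA-Z0-9\u00C0-\u024F]+(?:'[a-zA-Z]{1,2})?|[\u4e00-\u9fff]|.)")
print([t for t in old.findall("[APPLAUSE]") if t.strip()])
# -> ['[', 'APPLAUSE', ']']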
@@ -126,84 +128,12 @@ class LatticeTokenizer:
         self.g2p_model: Any = None  # Placeholder for G2P model
         self.dictionaries = defaultdict(lambda: [])
         self.oov_word = "<unk>"
-        self.sentence_splitter = None
+        self.sentence_splitter: Optional[SentenceSplitter] = None
         self.device = "cpu"

     def init_sentence_splitter(self):
-        if self.sentence_splitter is …
-
-
-        import onnxruntime as ort
-        from wtpsplit import SaT
-
-        providers = []
-        device = self.device
-        if device.startswith("cuda") and ort.get_all_providers().count("CUDAExecutionProvider") > 0:
-            providers.append("CUDAExecutionProvider")
-        elif device.startswith("mps") and ort.get_all_providers().count("MPSExecutionProvider") > 0:
-            providers.append("MPSExecutionProvider")
-
-        if self.model_hub == "modelscope":
-            from modelscope.hub.snapshot_download import snapshot_download as ms_snapshot
-
-            downloaded_path = ms_snapshot("LattifAI/OmniTokenizer")
-            sat = SaT(
-                f"{downloaded_path}/sat-3l-sm",
-                tokenizer_name_or_path=f"{downloaded_path}/xlm-roberta-base",
-                ort_providers=providers + ["CPUExecutionProvider"],
-            )
-        else:
-            sat = SaT(
-                "sat-3l-sm",
-                ort_providers=providers + ["CPUExecutionProvider"],
-            )
-        self.sentence_splitter = sat
-
-    @staticmethod
-    def _resplit_special_sentence_types(sentence: str) -> List[str]:
-        """
-        Re-split special sentence types.
-
-        Examples:
-            '[APPLAUSE] >> MIRA MURATI:' -> ['[APPLAUSE]', '>> MIRA MURATI:']
-            '[MUSIC] >> SPEAKER:' -> ['[MUSIC]', '>> SPEAKER:']
-
-        Special handling patterns:
-        1. Separate special marks at the beginning (e.g., [APPLAUSE], [MUSIC], etc.) from subsequent speaker marks
-        2. Use speaker marks (>> or other separators) as split points
-
-        Args:
-            sentence: Input sentence string
-
-        Returns:
-            List of re-split sentences. If no special marks are found, returns the original sentence in a list
-        """
-        # Detect special mark patterns: [SOMETHING] >> SPEAKER:
-        # or other forms like [SOMETHING] SPEAKER:
-
-        # Pattern 1: [mark] HTML-encoded separator speaker:
-        pattern1 = r"^(\[[^\]]+\])\s+(>>|>>)\s+(.+)$"
-        match1 = re.match(pattern1, sentence.strip())
-        if match1:
-            special_mark = match1.group(1)
-            separator = match1.group(2)
-            speaker_part = match1.group(3)
-            return [special_mark, f"{separator} {speaker_part}"]
-
-        # Pattern 2: [mark] speaker:
-        pattern2 = r"^(\[[^\]]+\])\s+([^:]+:)(.*)$"
-        match2 = re.match(pattern2, sentence.strip())
-        if match2:
-            special_mark = match2.group(1)
-            speaker_label = match2.group(2)
-            remaining = match2.group(3).strip()
-            if remaining:
-                return [special_mark, f"{speaker_label} {remaining}"]
-            else:
-                return [special_mark, speaker_label]
-
-        # If no special pattern matches, return the original sentence
-        return [sentence]
+        if self.sentence_splitter is None:
+            self.sentence_splitter = SentenceSplitter(device=self.device, model_hub=self.model_hub)

     @classmethod
     def from_pretrained(
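For context, the deleted provider-selection and SaT-loading logic is not gone: per the file list it moves into the new lattifai/alignment/sentence_splitter.py (+350 lines), behind the SentenceSplitter class constructed above. A rough sketch of the underlying wtpsplit usage, reconstructed from the deleted lines (the actual SentenceSplitter internals are not shown in this diff, so treat this as an approximation):

from wtpsplit import SaT

# CPU-only variant of the deleted code path: load the small SaT segmentation
# model and split a batch of texts. The threshold and batch_size mirror the
# values the old split_sentences passed to SaT.split.
sat = SaT("sat-3l-sm", ort_providers=["CPUExecutionProvider"])
sentences = list(
    sat.split(["Hello world. How are you? I'm fine."], threshold=0.15, strip_whitespace=True, batch_size=8)
)
print(sentences)  # one list of sentence strings per input text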
@@ -308,127 +238,45 @@ class LatticeTokenizer:

         return {}

-    def split_sentences(self, supervisions: List[Supervision], strip_whitespace=True) -> List[
+    def split_sentences(self, supervisions: List[Supervision], strip_whitespace=True) -> List[Supervision]:
         """Split supervisions into sentences using the sentence splitter.

-
+        Careful about speaker changes.
         """
-        … (old lines 316-349 not captured in this diff view) …
-        assert len(speakers) == len(texts), f"len(speakers)={len(speakers)} != len(texts)={len(texts)}"
-        sentences = self.sentence_splitter.split(texts, threshold=0.15, strip_whitespace=strip_whitespace, batch_size=8)
-
-        supervisions, remainder = [], ""
-        for k, (_speaker, _sentences) in enumerate(zip(speakers, sentences)):
-            # Prepend remainder from previous iteration to the first sentence
-            if _sentences and remainder:
-                _sentences[0] = remainder + _sentences[0]
-                remainder = ""
-
-            if not _sentences:
-                continue
-
-            # Process and re-split special sentence types
-            processed_sentences = []
-            for s, _sentence in enumerate(_sentences):
-                if remainder:
-                    _sentence = remainder + _sentence
-                    remainder = ""
-                # Detect and split special sentence types: e.g., '[APPLAUSE] >> MIRA MURATI:' -> ['[APPLAUSE]', '>> MIRA MURATI:']  # noqa: E501
-                resplit_parts = self._resplit_special_sentence_types(_sentence)
-                if any(resplit_parts[-1].endswith(sp) for sp in [":", ":"]):
-                    if s < len(_sentences) - 1:
-                        _sentences[s + 1] = resplit_parts[-1] + " " + _sentences[s + 1]
-                    else:  # last part
-                        remainder = resplit_parts[-1] + " "
-                    processed_sentences.extend(resplit_parts[:-1])
-                else:
-                    processed_sentences.extend(resplit_parts)
-            _sentences = processed_sentences
-
-            if not _sentences:
-                if remainder:
-                    _sentences, remainder = [remainder.strip()], ""
-                else:
-                    continue
+        self.init_sentence_splitter()
+        return self.sentence_splitter.split_sentences(supervisions, strip_whitespace=strip_whitespace)
+
+    def tokenize(
+        self, supervisions: Union[List[Supervision], TextAlignResult], split_sentence: bool = False, boost: float = 0.0
+    ) -> Tuple[str, Dict[str, Any]]:
+        if isinstance(supervisions[0], Supervision):
+            if split_sentence:
+                supervisions = self.split_sentences(supervisions)
+
+            pronunciation_dictionaries = self.prenormalize([s.text for s in supervisions])
+            response = self.client_wrapper.post(
+                "tokenize",
+                json={
+                    "model_name": self.model_name,
+                    "supervisions": [s.to_dict() for s in supervisions],
+                    "pronunciation_dictionaries": pronunciation_dictionaries,
+                },
+            )
+        else:
+            pronunciation_dictionaries = self.prenormalize([s.text for s in supervisions[0]])
+            pronunciation_dictionaries.update(self.prenormalize([s.text for s in supervisions[1]]))
+
+            response = self.client_wrapper.post(
+                "difftokenize",
+                json={
+                    "model_name": self.model_name,
+                    "supervisions": [s.to_dict() for s in supervisions[0]],
+                    "transcription": [s.to_dict() for s in supervisions[1]],
+                    "pronunciation_dictionaries": pronunciation_dictionaries,
+                    "boost": boost,
+                },
+            )

-            if any(_sentences[-1].endswith(ep) for ep in END_PUNCTUATION):
-                supervisions.extend(
-                    Supervision(text=text, speaker=(_speaker if s == 0 else None)) for s, text in enumerate(_sentences)
-                )
-                _speaker = None  # reset speaker after use
-            else:
-                supervisions.extend(
-                    Supervision(text=text, speaker=(_speaker if s == 0 else None))
-                    for s, text in enumerate(_sentences[:-1])
-                )
-                remainder = _sentences[-1] + " " + remainder
-                if k < len(speakers) - 1 and speakers[k + 1] is not None:  # next speaker is set
-                    supervisions.append(
-                        Supervision(text=remainder.strip(), speaker=_speaker if len(_sentences) == 1 else None)
-                    )
-                    remainder = ""
-                elif len(_sentences) == 1:
-                    if k == len(speakers) - 1:
-                        pass  # keep _speaker for the last supervision
-                    else:
-                        assert speakers[k + 1] is None
-                        speakers[k + 1] = _speaker
-                else:
-                    assert len(_sentences) > 1
-                    _speaker = None  # reset speaker if sentence not ended
-
-        if remainder.strip():
-            supervisions.append(Supervision(text=remainder.strip(), speaker=_speaker))
-
-        return supervisions
-
-    def tokenize(self, supervisions: List[Supervision], split_sentence: bool = False) -> Tuple[str, Dict[str, Any]]:
-        if split_sentence:
-            self.init_sentence_splitter()
-            supervisions = self.split_sentences(supervisions)
-
-        pronunciation_dictionaries = self.prenormalize([s.text for s in supervisions])
-        response = self.client_wrapper.post(
-            "tokenize",
-            json={
-                "model_name": self.model_name,
-                "supervisions": [s.to_dict() for s in supervisions],
-                "pronunciation_dictionaries": pronunciation_dictionaries,
-            },
-        )
         if response.status_code == 402:
             raise QuotaExceededError(response.json().get("detail", "Quota exceeded"))
         if response.status_code != 200:
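tokenize is now polymorphic: a plain List[Supervision] goes to the "tokenize" endpoint, while a TextAlignResult goes to "difftokenize". Judging by the supervisions[0] / supervisions[1] indexing above, a TextAlignResult behaves like a pair of supervision lists (caption text and ASR transcription). A hedged sketch of the two call shapes (variable names and the boost value are illustrative, not from the package):

# Single-source path: optional sentence splitting, then the "tokenize" endpoint.
lattice_id, extra = tokenizer.tokenize(supervisions, split_sentence=True)

# Diff path: caption/transcription pair, "difftokenize" endpoint; `boost`
# presumably biases decoding toward the reference text.
lattice_id, extra = tokenizer.tokenize(text_align_result, boost=0.5)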
@@ -445,28 +293,47 @@ class LatticeTokenizer:
         self,
         lattice_id: str,
         lattice_results: Tuple[np.ndarray, Any, Any, float, float],
-        supervisions: List[Supervision],
+        supervisions: Union[List[Supervision], TextAlignResult],
         return_details: bool = False,
         start_margin: float = 0.08,
         end_margin: float = 0.20,
     ) -> List[Supervision]:
         emission, results, labels, frame_shift, offset, channel = lattice_results  # noqa: F841
-        … (old lines 454-469 not captured in this diff view) …
+        if isinstance(supervisions[0], Supervision):
+            response = self.client_wrapper.post(
+                "detokenize",
+                json={
+                    "model_name": self.model_name,
+                    "lattice_id": lattice_id,
+                    "frame_shift": frame_shift,
+                    "results": [t.to_dict() for t in results[0]],
+                    "labels": labels[0],
+                    "offset": offset,
+                    "channel": channel,
+                    "return_details": False if return_details is None else return_details,
+                    "destroy_lattice": True,
+                    "start_margin": start_margin,
+                    "end_margin": end_margin,
+                },
+            )
+        else:
+            response = self.client_wrapper.post(
+                "diffdetokenize",
+                json={
+                    "model_name": self.model_name,
+                    "lattice_id": lattice_id,
+                    "frame_shift": frame_shift,
+                    "results": [t.to_dict() for t in results[0]],
+                    "labels": labels[0],
+                    "offset": offset,
+                    "channel": channel,
+                    "return_details": False if return_details is None else return_details,
+                    "destroy_lattice": True,
+                    "start_margin": start_margin,
+                    "end_margin": end_margin,
+                },
+            )
+
         if response.status_code == 400:
             raise LatticeDecodingError(
                 lattice_id,
@@ -487,7 +354,11 @@ class LatticeTokenizer:
         # Add emission confidence scores for segments and word-level alignments
         _add_confidence_scores(alignments, emission, labels[0], frame_shift, offset)

-        alignments = _update_alignments_speaker(supervisions, alignments)
+        if isinstance(supervisions[0], Supervision):
+            alignments = _update_alignments_speaker(supervisions, alignments)
+        else:
+            # NOTE: Text Diff Alignment >> speaker has been handled in the backend service
+            pass

         return alignments

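The two detokenize branches above post identical payloads and differ only in the endpoint name, so the whole tokenize/detokenize dispatch reduces to one probe on the first element. A hypothetical consolidation sketch (the helper name is mine, not in the package):

def _detokenize_endpoint(supervisions) -> str:
    # List[Supervision] -> "detokenize"; TextAlignResult pair -> "diffdetokenize".
    return "detokenize" if isinstance(supervisions[0], Supervision) else "diffdetokenize"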
lattifai/caption/__init__.py
CHANGED
@@ -1,20 +1,96 @@
-
+"""Caption processing module for LattifAI.

-
+This module provides comprehensive caption/subtitle processing capabilities:
+- Multi-format reading and writing (SRT, VTT, ASS, TTML, etc.)
+- Professional NLE integration (Avid, Final Cut Pro, Premiere Pro, DaVinci Resolve)
+- Audio workstation support (Pro Tools, Adobe Audition)
+- Advanced features: timecode offset, overlap resolution, word-level timing
+"""

-from ..config.caption import InputCaptionFormat
+from ..config.caption import InputCaptionFormat, OutputCaptionFormat
 from .caption import Caption
-from .…
-from .…
+from .formats.gemini import GeminiReader, GeminiSegment, GeminiWriter
+from .formats.nle.audition import (
+    AuditionCSVConfig,
+    AuditionCSVWriter,
+    EdiMarkerConfig,
+    EdiMarkerWriter,
+)
+
+# Professional NLE format writers (re-exported from formats/)
+from .formats.nle.avid import AvidDSConfig, AvidDSWriter, FrameRate
+from .formats.nle.fcpxml import FCPXMLConfig, FCPXMLStyle, FCPXMLWriter
+from .formats.nle.premiere import PremiereXMLConfig, PremiereXMLWriter
+from .formats.ttml import TTMLConfig, TTMLFormat, TTMLRegion, TTMLStyle
+from .parsers.text_parser import normalize_text
+from .standardize import (
+    CaptionStandardizer,
+    CaptionValidator,
+    StandardizationConfig,
+    ValidationResult,
+    apply_margins_to_captions,
+    standardize_captions,
+)
 from .supervision import Supervision
-
+
+# Create TTMLWriter alias for backward compatibility
+TTMLWriter = TTMLFormat
+
+# Utility functions
+from .utils import (
+    CollisionMode,
+    TimecodeOffset,
+    apply_timecode_offset,
+    detect_overlaps,
+    format_srt_timestamp,
+    generate_srt_content,
+    resolve_overlaps,
+    split_long_lines,
+)

 __all__ = [
+    # Core classes
     "Caption",
     "Supervision",
+    # Standardization
+    "CaptionStandardizer",
+    "CaptionValidator",
+    "StandardizationConfig",
+    "ValidationResult",
+    "standardize_captions",
+    "apply_margins_to_captions",
+    # Gemini format support
     "GeminiReader",
     "GeminiWriter",
     "GeminiSegment",
+    # Text utilities
     "normalize_text",
+    # Format types
     "InputCaptionFormat",
+    "OutputCaptionFormat",
+    # Professional format writers
+    "AvidDSWriter",
+    "AvidDSConfig",
+    "FCPXMLWriter",
+    "FCPXMLConfig",
+    "FCPXMLStyle",
+    "PremiereXMLWriter",
+    "PremiereXMLConfig",
+    "AuditionCSVWriter",
+    "AuditionCSVConfig",
+    "EdiMarkerWriter",
+    "EdiMarkerConfig",
+    "TTMLWriter",
+    "TTMLConfig",
+    "TTMLStyle",
+    "TTMLRegion",
+    # Utilities
+    "CollisionMode",
+    "TimecodeOffset",
+    "apply_timecode_offset",
+    "resolve_overlaps",
+    "detect_overlaps",
+    "split_long_lines",
+    "format_srt_timestamp",
+    "generate_srt_content",
 ]
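Every symbol above is re-exported at the package root, so existing `from lattifai.caption import …` call sites keep resolving after the reorganization into formats/ and parsers/. A quick sanity check using only names from the new __all__:

from lattifai.caption import (
    Caption,
    Supervision,
    TTMLWriter,  # backward-compatibility alias
    TimecodeOffset,
    detect_overlaps,
    resolve_overlaps,
)

# The alias is the TTMLFormat class itself, not a wrapper.
assert TTMLWriter.__name__ == "TTMLFormat"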