lattifai 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/_init.py +20 -0
- lattifai/alignment/__init__.py +9 -1
- lattifai/alignment/lattice1_aligner.py +175 -54
- lattifai/alignment/lattice1_worker.py +47 -4
- lattifai/alignment/punctuation.py +38 -0
- lattifai/alignment/segmenter.py +3 -2
- lattifai/alignment/text_align.py +441 -0
- lattifai/alignment/tokenizer.py +134 -65
- lattifai/audio2.py +162 -183
- lattifai/cli/__init__.py +2 -1
- lattifai/cli/alignment.py +5 -0
- lattifai/cli/caption.py +111 -4
- lattifai/cli/transcribe.py +2 -6
- lattifai/cli/youtube.py +7 -1
- lattifai/client.py +72 -123
- lattifai/config/__init__.py +28 -0
- lattifai/config/alignment.py +14 -0
- lattifai/config/caption.py +45 -31
- lattifai/config/client.py +16 -0
- lattifai/config/event.py +102 -0
- lattifai/config/media.py +20 -0
- lattifai/config/transcription.py +25 -1
- lattifai/data/__init__.py +8 -0
- lattifai/data/caption.py +228 -0
- lattifai/diarization/__init__.py +41 -1
- lattifai/errors.py +78 -53
- lattifai/event/__init__.py +65 -0
- lattifai/event/lattifai.py +166 -0
- lattifai/mixin.py +49 -32
- lattifai/transcription/base.py +8 -2
- lattifai/transcription/gemini.py +147 -16
- lattifai/transcription/lattifai.py +25 -63
- lattifai/types.py +1 -1
- lattifai/utils.py +7 -13
- lattifai/workflow/__init__.py +28 -4
- lattifai/workflow/file_manager.py +2 -5
- lattifai/youtube/__init__.py +43 -0
- lattifai/youtube/client.py +1265 -0
- lattifai/youtube/types.py +23 -0
- lattifai-1.3.0.dist-info/METADATA +678 -0
- lattifai-1.3.0.dist-info/RECORD +57 -0
- {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/entry_points.txt +1 -2
- lattifai/__init__.py +0 -88
- lattifai/alignment/sentence_splitter.py +0 -219
- lattifai/caption/__init__.py +0 -20
- lattifai/caption/caption.py +0 -1467
- lattifai/caption/gemini_reader.py +0 -462
- lattifai/caption/gemini_writer.py +0 -173
- lattifai/caption/supervision.py +0 -34
- lattifai/caption/text_parser.py +0 -145
- lattifai/cli/app_installer.py +0 -142
- lattifai/cli/server.py +0 -44
- lattifai/server/app.py +0 -427
- lattifai/workflow/youtube.py +0 -577
- lattifai-1.2.1.dist-info/METADATA +0 -1134
- lattifai-1.2.1.dist-info/RECORD +0 -58
- {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/WHEEL +0 -0
- {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/licenses/LICENSE +0 -0
- {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/top_level.txt +0 -0
lattifai/alignment/tokenizer.py
CHANGED
|
@@ -2,11 +2,11 @@ import gzip
|
|
|
2
2
|
import pickle
|
|
3
3
|
import re
|
|
4
4
|
from collections import defaultdict
|
|
5
|
-
from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar
|
|
5
|
+
from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar, Union
|
|
6
6
|
|
|
7
7
|
import numpy as np
|
|
8
8
|
|
|
9
|
-
from lattifai.caption import Supervision
|
|
9
|
+
from lattifai.caption import SentenceSplitter, Supervision
|
|
10
10
|
from lattifai.caption import normalize_text as normalize_html_text
|
|
11
11
|
from lattifai.errors import (
|
|
12
12
|
LATTICE_DECODING_FAILURE_HELP,
|
|
@@ -15,14 +15,8 @@ from lattifai.errors import (
|
|
|
15
15
|
QuotaExceededError,
|
|
16
16
|
)
|
|
17
17
|
|
|
18
|
-
from .
|
|
19
|
-
from .
|
|
20
|
-
|
|
21
|
-
PUNCTUATION = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~'
|
|
22
|
-
PUNCTUATION_SPACE = PUNCTUATION + " "
|
|
23
|
-
STAR_TOKEN = "※"
|
|
24
|
-
|
|
25
|
-
GROUPING_SEPARATOR = "✹"
|
|
18
|
+
from .punctuation import PUNCTUATION, PUNCTUATION_SPACE
|
|
19
|
+
from .text_align import TextAlignResult
|
|
26
20
|
|
|
27
21
|
MAXIMUM_WORD_LENGTH = 40
|
|
28
22
|
|
|
@@ -80,8 +74,11 @@ def tokenize_multilingual_text(text: str, keep_spaces: bool = True, attach_punct
|
|
|
80
74
|
['Kühlschrank']
|
|
81
75
|
>>> tokenize_multilingual_text("Hello, World!", attach_punctuation=True)
|
|
82
76
|
['Hello,', ' ', 'World!']
|
|
77
|
+
>>> tokenize_multilingual_text("[AED], World!", keep_spaces=False, attach_punctuation=True)
|
|
78
|
+
['[AED],', 'World!']
|
|
83
79
|
"""
|
|
84
80
|
# Regex pattern:
|
|
81
|
+
# - \[[A-Z_]+\] matches bracketed annotations like [APPLAUSE], [MUSIC], [SPEAKER_01]
|
|
85
82
|
# - [a-zA-Z0-9\u00C0-\u024F]+ matches Latin letters (including accented chars like ü, ö, ä, ß, é, etc.)
|
|
86
83
|
# - (?:'[a-zA-Z]{1,2})? optionally matches contractions like 's, 't, 'm, 'll, 're, 've
|
|
87
84
|
# - [\u4e00-\u9fff] matches CJK characters
|
|
@@ -90,7 +87,7 @@ def tokenize_multilingual_text(text: str, keep_spaces: bool = True, attach_punct
|
|
|
90
87
|
# - \u00C0-\u00FF: Latin-1 Supplement (À-ÿ)
|
|
91
88
|
# - \u0100-\u017F: Latin Extended-A
|
|
92
89
|
# - \u0180-\u024F: Latin Extended-B
|
|
93
|
-
pattern = re.compile(r"([a-zA-Z0-9\u00C0-\u024F]+(?:'[a-zA-Z]{1,2})?|[\u4e00-\u9fff]|.)")
|
|
90
|
+
pattern = re.compile(r"(\[[A-Z_]+\]|[a-zA-Z0-9\u00C0-\u024F]+(?:'[a-zA-Z]{1,2})?|[\u4e00-\u9fff]|.)")
|
|
94
91
|
|
|
95
92
|
# filter(None, ...) removes any empty strings from re.findall results
|
|
96
93
|
tokens = list(filter(None, pattern.findall(text)))
|
|
@@ -173,13 +170,16 @@ class LatticeTokenizer:
|
|
|
173
170
|
tokenizer.dictionaries = defaultdict(list, data["dictionaries"])
|
|
174
171
|
tokenizer.oov_word = data["oov_word"]
|
|
175
172
|
|
|
173
|
+
# Lazy load G2P model only if it exists (avoids PyTorch dependency)
|
|
176
174
|
g2pp_model_path = f"{model_path}/g2pp.bin" if Path(f"{model_path}/g2pp.bin").exists() else None
|
|
177
|
-
if
|
|
178
|
-
|
|
175
|
+
g2p_model_path = f"{model_path}/g2p.bin" if Path(f"{model_path}/g2p.bin").exists() else None
|
|
176
|
+
g2p_path = g2pp_model_path or g2p_model_path
|
|
177
|
+
if g2p_path:
|
|
178
|
+
from .phonemizer import G2Phonemizer
|
|
179
|
+
|
|
180
|
+
tokenizer.g2p_model = G2Phonemizer(g2p_path, device=device)
|
|
179
181
|
else:
|
|
180
|
-
|
|
181
|
-
if g2p_model_path:
|
|
182
|
-
tokenizer.g2p_model = G2Phonemizer(g2p_model_path, device=device)
|
|
182
|
+
tokenizer.g2p_model = None
|
|
183
183
|
|
|
184
184
|
tokenizer.device = device
|
|
185
185
|
tokenizer.add_special_tokens()
|
|
@@ -245,19 +245,55 @@ class LatticeTokenizer:
|
|
|
245
245
|
self.init_sentence_splitter()
|
|
246
246
|
return self.sentence_splitter.split_sentences(supervisions, strip_whitespace=strip_whitespace)
|
|
247
247
|
|
|
248
|
-
def
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
248
|
+
def _get_client_info(self) -> Dict[str, Optional[str]]:
|
|
249
|
+
"""Get client identification info for usage tracking."""
|
|
250
|
+
try:
|
|
251
|
+
from importlib.metadata import version
|
|
252
|
+
|
|
253
|
+
return {"client_name": "python-sdk", "client_version": version("lattifai")}
|
|
254
|
+
except Exception:
|
|
255
|
+
return {"client_name": "python-sdk", "client_version": "unknown"}
|
|
256
|
+
|
|
257
|
+
def tokenize(
|
|
258
|
+
self,
|
|
259
|
+
supervisions: Union[List[Supervision], TextAlignResult],
|
|
260
|
+
split_sentence: bool = False,
|
|
261
|
+
boost: float = 0.0,
|
|
262
|
+
transition_penalty: Optional[float] = 0.0,
|
|
263
|
+
) -> Tuple[str, Dict[str, Any]]:
|
|
264
|
+
client_info = self._get_client_info()
|
|
265
|
+
|
|
266
|
+
if isinstance(supervisions[0], Supervision):
|
|
267
|
+
if split_sentence:
|
|
268
|
+
supervisions = self.split_sentences(supervisions)
|
|
269
|
+
|
|
270
|
+
pronunciation_dictionaries = self.prenormalize([s.text for s in supervisions])
|
|
271
|
+
response = self.client_wrapper.post(
|
|
272
|
+
"tokenize",
|
|
273
|
+
json={
|
|
274
|
+
"model_name": self.model_name,
|
|
275
|
+
"supervisions": [s.to_dict() for s in supervisions],
|
|
276
|
+
"pronunciation_dictionaries": pronunciation_dictionaries,
|
|
277
|
+
**client_info,
|
|
278
|
+
"transition_penalty": transition_penalty,
|
|
279
|
+
},
|
|
280
|
+
)
|
|
281
|
+
else:
|
|
282
|
+
pronunciation_dictionaries = self.prenormalize([s.text for s in supervisions[0]])
|
|
283
|
+
pronunciation_dictionaries.update(self.prenormalize([s.text for s in supervisions[1]]))
|
|
284
|
+
|
|
285
|
+
response = self.client_wrapper.post(
|
|
286
|
+
"difftokenize",
|
|
287
|
+
json={
|
|
288
|
+
"model_name": self.model_name,
|
|
289
|
+
"supervisions": [s.to_dict() for s in supervisions[0]],
|
|
290
|
+
"transcription": [s.to_dict() for s in supervisions[1]],
|
|
291
|
+
"pronunciation_dictionaries": pronunciation_dictionaries,
|
|
292
|
+
"boost": boost,
|
|
293
|
+
**client_info,
|
|
294
|
+
},
|
|
295
|
+
)
|
|
296
|
+
|
|
261
297
|
if response.status_code == 402:
|
|
262
298
|
raise QuotaExceededError(response.json().get("detail", "Quota exceeded"))
|
|
263
299
|
if response.status_code != 200:
|
|
@@ -274,28 +310,51 @@ class LatticeTokenizer:
|
|
|
274
310
|
self,
|
|
275
311
|
lattice_id: str,
|
|
276
312
|
lattice_results: Tuple[np.ndarray, Any, Any, float, float],
|
|
277
|
-
supervisions: List[Supervision],
|
|
313
|
+
supervisions: Union[List[Supervision], TextAlignResult],
|
|
278
314
|
return_details: bool = False,
|
|
279
315
|
start_margin: float = 0.08,
|
|
280
316
|
end_margin: float = 0.20,
|
|
317
|
+
check_sanity: bool = True,
|
|
281
318
|
) -> List[Supervision]:
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
"
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
319
|
+
emission_stats, results, labels, frame_shift, offset, channel = lattice_results # noqa: F841
|
|
320
|
+
# emission_stats is a dict with 'max_probs' and 'aligned_probs' (unified for batch and streaming)
|
|
321
|
+
if isinstance(supervisions[0], Supervision):
|
|
322
|
+
response = self.client_wrapper.post(
|
|
323
|
+
"detokenize",
|
|
324
|
+
json={
|
|
325
|
+
"model_name": self.model_name,
|
|
326
|
+
"lattice_id": lattice_id,
|
|
327
|
+
"frame_shift": frame_shift,
|
|
328
|
+
"results": [t.to_dict() for t in results[0]],
|
|
329
|
+
"labels": labels[0],
|
|
330
|
+
"offset": offset,
|
|
331
|
+
"channel": channel,
|
|
332
|
+
"return_details": False if return_details is None else return_details,
|
|
333
|
+
"destroy_lattice": True,
|
|
334
|
+
"start_margin": start_margin,
|
|
335
|
+
"end_margin": end_margin,
|
|
336
|
+
"check_sanity": check_sanity,
|
|
337
|
+
},
|
|
338
|
+
)
|
|
339
|
+
else:
|
|
340
|
+
response = self.client_wrapper.post(
|
|
341
|
+
"diffdetokenize",
|
|
342
|
+
json={
|
|
343
|
+
"model_name": self.model_name,
|
|
344
|
+
"lattice_id": lattice_id,
|
|
345
|
+
"frame_shift": frame_shift,
|
|
346
|
+
"results": [t.to_dict() for t in results[0]],
|
|
347
|
+
"labels": labels[0],
|
|
348
|
+
"offset": offset,
|
|
349
|
+
"channel": channel,
|
|
350
|
+
"return_details": False if return_details is None else return_details,
|
|
351
|
+
"destroy_lattice": True,
|
|
352
|
+
"start_margin": start_margin,
|
|
353
|
+
"end_margin": end_margin,
|
|
354
|
+
"check_sanity": check_sanity,
|
|
355
|
+
},
|
|
356
|
+
)
|
|
357
|
+
|
|
299
358
|
if response.status_code == 400:
|
|
300
359
|
raise LatticeDecodingError(
|
|
301
360
|
lattice_id,
|
|
@@ -312,19 +371,21 @@ class LatticeTokenizer:
|
|
|
312
371
|
|
|
313
372
|
alignments = [Supervision.from_dict(s) for s in result["supervisions"]]
|
|
314
373
|
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
_add_confidence_scores(alignments, emission, labels[0], frame_shift, offset)
|
|
374
|
+
# Add emission confidence scores for segments and word-level alignments
|
|
375
|
+
_add_confidence_scores(alignments, emission_stats, frame_shift, offset)
|
|
318
376
|
|
|
319
|
-
|
|
377
|
+
if isinstance(supervisions[0], Supervision):
|
|
378
|
+
alignments = _update_alignments_speaker(supervisions, alignments)
|
|
379
|
+
else:
|
|
380
|
+
# NOTE: Text Diff Alignment >> speaker has been handled in the backend service
|
|
381
|
+
pass
|
|
320
382
|
|
|
321
383
|
return alignments
|
|
322
384
|
|
|
323
385
|
|
|
324
386
|
def _add_confidence_scores(
|
|
325
387
|
supervisions: List[Supervision],
|
|
326
|
-
|
|
327
|
-
labels: List[int],
|
|
388
|
+
emission_stats: Dict[str, np.ndarray],
|
|
328
389
|
frame_shift: float,
|
|
329
390
|
offset: float = 0.0,
|
|
330
391
|
) -> None:
|
|
@@ -337,29 +398,37 @@ def _add_confidence_scores(
|
|
|
337
398
|
|
|
338
399
|
Args:
|
|
339
400
|
supervisions: List of Supervision objects to add scores to (modified in-place)
|
|
340
|
-
|
|
341
|
-
labels: Token labels corresponding to aligned tokens
|
|
401
|
+
emission_stats: Dict with 'max_probs' and 'aligned_probs' arrays
|
|
342
402
|
frame_shift: Frame shift in seconds for converting frames to time
|
|
403
|
+
offset: Time offset in seconds
|
|
343
404
|
"""
|
|
344
|
-
|
|
405
|
+
max_probs = emission_stats["max_probs"]
|
|
406
|
+
aligned_probs = emission_stats["aligned_probs"]
|
|
407
|
+
diffprobs_full = max_probs - aligned_probs
|
|
345
408
|
|
|
346
409
|
for supervision in supervisions:
|
|
347
410
|
start_frame = int((supervision.start - offset) / frame_shift)
|
|
348
411
|
end_frame = int((supervision.end - offset) / frame_shift)
|
|
349
412
|
|
|
350
|
-
#
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
413
|
+
# Clamp to valid range
|
|
414
|
+
start_frame = max(0, min(start_frame, len(diffprobs_full) - 1))
|
|
415
|
+
end_frame = max(start_frame + 1, min(end_frame, len(diffprobs_full)))
|
|
416
|
+
|
|
417
|
+
diffprobs = diffprobs_full[start_frame:end_frame]
|
|
418
|
+
if len(diffprobs) > 0:
|
|
419
|
+
supervision.score = round(1.0 - diffprobs.mean().item(), ndigits=4)
|
|
355
420
|
|
|
356
|
-
#
|
|
421
|
+
# Word-level confidence
|
|
357
422
|
if hasattr(supervision, "alignment") and supervision.alignment:
|
|
358
423
|
words = supervision.alignment.get("word", [])
|
|
359
424
|
for w, item in enumerate(words):
|
|
360
|
-
start = int((item.start - offset) / frame_shift)
|
|
361
|
-
end = int((item.end - offset) / frame_shift)
|
|
362
|
-
|
|
425
|
+
start = int((item.start - offset) / frame_shift)
|
|
426
|
+
end = int((item.end - offset) / frame_shift)
|
|
427
|
+
start = max(0, min(start, len(diffprobs_full) - 1))
|
|
428
|
+
end = max(start + 1, min(end, len(diffprobs_full)))
|
|
429
|
+
word_diffprobs = diffprobs_full[start:end]
|
|
430
|
+
if len(word_diffprobs) > 0:
|
|
431
|
+
words[w] = item._replace(score=round(1.0 - word_diffprobs.mean().item(), ndigits=4))
|
|
363
432
|
|
|
364
433
|
|
|
365
434
|
def _update_alignments_speaker(supervisions: List[Supervision], alignments: List[Supervision]) -> List[Supervision]:
|