lattifai 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. lattifai/_init.py +20 -0
  2. lattifai/alignment/__init__.py +9 -1
  3. lattifai/alignment/lattice1_aligner.py +175 -54
  4. lattifai/alignment/lattice1_worker.py +47 -4
  5. lattifai/alignment/punctuation.py +38 -0
  6. lattifai/alignment/segmenter.py +3 -2
  7. lattifai/alignment/text_align.py +441 -0
  8. lattifai/alignment/tokenizer.py +134 -65
  9. lattifai/audio2.py +162 -183
  10. lattifai/cli/__init__.py +2 -1
  11. lattifai/cli/alignment.py +5 -0
  12. lattifai/cli/caption.py +111 -4
  13. lattifai/cli/transcribe.py +2 -6
  14. lattifai/cli/youtube.py +7 -1
  15. lattifai/client.py +72 -123
  16. lattifai/config/__init__.py +28 -0
  17. lattifai/config/alignment.py +14 -0
  18. lattifai/config/caption.py +45 -31
  19. lattifai/config/client.py +16 -0
  20. lattifai/config/event.py +102 -0
  21. lattifai/config/media.py +20 -0
  22. lattifai/config/transcription.py +25 -1
  23. lattifai/data/__init__.py +8 -0
  24. lattifai/data/caption.py +228 -0
  25. lattifai/diarization/__init__.py +41 -1
  26. lattifai/errors.py +78 -53
  27. lattifai/event/__init__.py +65 -0
  28. lattifai/event/lattifai.py +166 -0
  29. lattifai/mixin.py +49 -32
  30. lattifai/transcription/base.py +8 -2
  31. lattifai/transcription/gemini.py +147 -16
  32. lattifai/transcription/lattifai.py +25 -63
  33. lattifai/types.py +1 -1
  34. lattifai/utils.py +7 -13
  35. lattifai/workflow/__init__.py +28 -4
  36. lattifai/workflow/file_manager.py +2 -5
  37. lattifai/youtube/__init__.py +43 -0
  38. lattifai/youtube/client.py +1265 -0
  39. lattifai/youtube/types.py +23 -0
  40. lattifai-1.3.0.dist-info/METADATA +678 -0
  41. lattifai-1.3.0.dist-info/RECORD +57 -0
  42. {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/entry_points.txt +1 -2
  43. lattifai/__init__.py +0 -88
  44. lattifai/alignment/sentence_splitter.py +0 -219
  45. lattifai/caption/__init__.py +0 -20
  46. lattifai/caption/caption.py +0 -1467
  47. lattifai/caption/gemini_reader.py +0 -462
  48. lattifai/caption/gemini_writer.py +0 -173
  49. lattifai/caption/supervision.py +0 -34
  50. lattifai/caption/text_parser.py +0 -145
  51. lattifai/cli/app_installer.py +0 -142
  52. lattifai/cli/server.py +0 -44
  53. lattifai/server/app.py +0 -427
  54. lattifai/workflow/youtube.py +0 -577
  55. lattifai-1.2.1.dist-info/METADATA +0 -1134
  56. lattifai-1.2.1.dist-info/RECORD +0 -58
  57. {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/WHEEL +0 -0
  58. {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/licenses/LICENSE +0 -0
  59. {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/top_level.txt +0 -0
@@ -2,11 +2,11 @@ import gzip
2
2
  import pickle
3
3
  import re
4
4
  from collections import defaultdict
5
- from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar
5
+ from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar, Union
6
6
 
7
7
  import numpy as np
8
8
 
9
- from lattifai.caption import Supervision
9
+ from lattifai.caption import SentenceSplitter, Supervision
10
10
  from lattifai.caption import normalize_text as normalize_html_text
11
11
  from lattifai.errors import (
12
12
  LATTICE_DECODING_FAILURE_HELP,
@@ -15,14 +15,8 @@ from lattifai.errors import (
15
15
  QuotaExceededError,
16
16
  )
17
17
 
18
- from .phonemizer import G2Phonemizer
19
- from .sentence_splitter import SentenceSplitter
20
-
21
- PUNCTUATION = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~'
22
- PUNCTUATION_SPACE = PUNCTUATION + " "
23
- STAR_TOKEN = "※"
24
-
25
- GROUPING_SEPARATOR = "✹"
18
+ from .punctuation import PUNCTUATION, PUNCTUATION_SPACE
19
+ from .text_align import TextAlignResult
26
20
 
27
21
  MAXIMUM_WORD_LENGTH = 40
28
22
 
@@ -80,8 +74,11 @@ def tokenize_multilingual_text(text: str, keep_spaces: bool = True, attach_punct
80
74
  ['Kühlschrank']
81
75
  >>> tokenize_multilingual_text("Hello, World!", attach_punctuation=True)
82
76
  ['Hello,', ' ', 'World!']
77
+ >>> tokenize_multilingual_text("[AED], World!", keep_spaces=False, attach_punctuation=True)
78
+ ['[AED],', 'World!']
83
79
  """
84
80
  # Regex pattern:
81
+ # - \[[A-Z_]+\] matches bracketed annotations like [APPLAUSE], [MUSIC], [SPEAKER_01]
85
82
  # - [a-zA-Z0-9\u00C0-\u024F]+ matches Latin letters (including accented chars like ü, ö, ä, ß, é, etc.)
86
83
  # - (?:'[a-zA-Z]{1,2})? optionally matches contractions like 's, 't, 'm, 'll, 're, 've
87
84
  # - [\u4e00-\u9fff] matches CJK characters
@@ -90,7 +87,7 @@ def tokenize_multilingual_text(text: str, keep_spaces: bool = True, attach_punct
90
87
  # - \u00C0-\u00FF: Latin-1 Supplement (À-ÿ)
91
88
  # - \u0100-\u017F: Latin Extended-A
92
89
  # - \u0180-\u024F: Latin Extended-B
93
- pattern = re.compile(r"([a-zA-Z0-9\u00C0-\u024F]+(?:'[a-zA-Z]{1,2})?|[\u4e00-\u9fff]|.)")
90
+ pattern = re.compile(r"(\[[A-Z_]+\]|[a-zA-Z0-9\u00C0-\u024F]+(?:'[a-zA-Z]{1,2})?|[\u4e00-\u9fff]|.)")
94
91
 
95
92
  # filter(None, ...) removes any empty strings from re.findall results
96
93
  tokens = list(filter(None, pattern.findall(text)))
@@ -173,13 +170,16 @@ class LatticeTokenizer:
173
170
  tokenizer.dictionaries = defaultdict(list, data["dictionaries"])
174
171
  tokenizer.oov_word = data["oov_word"]
175
172
 
173
+ # Lazy load G2P model only if it exists (avoids PyTorch dependency)
176
174
  g2pp_model_path = f"{model_path}/g2pp.bin" if Path(f"{model_path}/g2pp.bin").exists() else None
177
- if g2pp_model_path:
178
- tokenizer.g2p_model = G2Phonemizer(g2pp_model_path, device=device)
175
+ g2p_model_path = f"{model_path}/g2p.bin" if Path(f"{model_path}/g2p.bin").exists() else None
176
+ g2p_path = g2pp_model_path or g2p_model_path
177
+ if g2p_path:
178
+ from .phonemizer import G2Phonemizer
179
+
180
+ tokenizer.g2p_model = G2Phonemizer(g2p_path, device=device)
179
181
  else:
180
- g2p_model_path = f"{model_path}/g2p.bin" if Path(f"{model_path}/g2p.bin").exists() else None
181
- if g2p_model_path:
182
- tokenizer.g2p_model = G2Phonemizer(g2p_model_path, device=device)
182
+ tokenizer.g2p_model = None
183
183
 
184
184
  tokenizer.device = device
185
185
  tokenizer.add_special_tokens()
@@ -245,19 +245,55 @@ class LatticeTokenizer:
245
245
  self.init_sentence_splitter()
246
246
  return self.sentence_splitter.split_sentences(supervisions, strip_whitespace=strip_whitespace)
247
247
 
248
- def tokenize(self, supervisions: List[Supervision], split_sentence: bool = False) -> Tuple[str, Dict[str, Any]]:
249
- if split_sentence:
250
- supervisions = self.split_sentences(supervisions)
251
-
252
- pronunciation_dictionaries = self.prenormalize([s.text for s in supervisions])
253
- response = self.client_wrapper.post(
254
- "tokenize",
255
- json={
256
- "model_name": self.model_name,
257
- "supervisions": [s.to_dict() for s in supervisions],
258
- "pronunciation_dictionaries": pronunciation_dictionaries,
259
- },
260
- )
248
+ def _get_client_info(self) -> Dict[str, Optional[str]]:
249
+ """Get client identification info for usage tracking."""
250
+ try:
251
+ from importlib.metadata import version
252
+
253
+ return {"client_name": "python-sdk", "client_version": version("lattifai")}
254
+ except Exception:
255
+ return {"client_name": "python-sdk", "client_version": "unknown"}
256
+
257
+ def tokenize(
258
+ self,
259
+ supervisions: Union[List[Supervision], TextAlignResult],
260
+ split_sentence: bool = False,
261
+ boost: float = 0.0,
262
+ transition_penalty: Optional[float] = 0.0,
263
+ ) -> Tuple[str, Dict[str, Any]]:
264
+ client_info = self._get_client_info()
265
+
266
+ if isinstance(supervisions[0], Supervision):
267
+ if split_sentence:
268
+ supervisions = self.split_sentences(supervisions)
269
+
270
+ pronunciation_dictionaries = self.prenormalize([s.text for s in supervisions])
271
+ response = self.client_wrapper.post(
272
+ "tokenize",
273
+ json={
274
+ "model_name": self.model_name,
275
+ "supervisions": [s.to_dict() for s in supervisions],
276
+ "pronunciation_dictionaries": pronunciation_dictionaries,
277
+ **client_info,
278
+ "transition_penalty": transition_penalty,
279
+ },
280
+ )
281
+ else:
282
+ pronunciation_dictionaries = self.prenormalize([s.text for s in supervisions[0]])
283
+ pronunciation_dictionaries.update(self.prenormalize([s.text for s in supervisions[1]]))
284
+
285
+ response = self.client_wrapper.post(
286
+ "difftokenize",
287
+ json={
288
+ "model_name": self.model_name,
289
+ "supervisions": [s.to_dict() for s in supervisions[0]],
290
+ "transcription": [s.to_dict() for s in supervisions[1]],
291
+ "pronunciation_dictionaries": pronunciation_dictionaries,
292
+ "boost": boost,
293
+ **client_info,
294
+ },
295
+ )
296
+
261
297
  if response.status_code == 402:
262
298
  raise QuotaExceededError(response.json().get("detail", "Quota exceeded"))
263
299
  if response.status_code != 200:
@@ -274,28 +310,51 @@ class LatticeTokenizer:
274
310
  self,
275
311
  lattice_id: str,
276
312
  lattice_results: Tuple[np.ndarray, Any, Any, float, float],
277
- supervisions: List[Supervision],
313
+ supervisions: Union[List[Supervision], TextAlignResult],
278
314
  return_details: bool = False,
279
315
  start_margin: float = 0.08,
280
316
  end_margin: float = 0.20,
317
+ check_sanity: bool = True,
281
318
  ) -> List[Supervision]:
282
- emission, results, labels, frame_shift, offset, channel = lattice_results # noqa: F841
283
- response = self.client_wrapper.post(
284
- "detokenize",
285
- json={
286
- "model_name": self.model_name,
287
- "lattice_id": lattice_id,
288
- "frame_shift": frame_shift,
289
- "results": [t.to_dict() for t in results[0]],
290
- "labels": labels[0],
291
- "offset": offset,
292
- "channel": channel,
293
- "return_details": False if return_details is None else return_details,
294
- "destroy_lattice": True,
295
- "start_margin": start_margin,
296
- "end_margin": end_margin,
297
- },
298
- )
319
+ emission_stats, results, labels, frame_shift, offset, channel = lattice_results # noqa: F841
320
+ # emission_stats is a dict with 'max_probs' and 'aligned_probs' (unified for batch and streaming)
321
+ if isinstance(supervisions[0], Supervision):
322
+ response = self.client_wrapper.post(
323
+ "detokenize",
324
+ json={
325
+ "model_name": self.model_name,
326
+ "lattice_id": lattice_id,
327
+ "frame_shift": frame_shift,
328
+ "results": [t.to_dict() for t in results[0]],
329
+ "labels": labels[0],
330
+ "offset": offset,
331
+ "channel": channel,
332
+ "return_details": False if return_details is None else return_details,
333
+ "destroy_lattice": True,
334
+ "start_margin": start_margin,
335
+ "end_margin": end_margin,
336
+ "check_sanity": check_sanity,
337
+ },
338
+ )
339
+ else:
340
+ response = self.client_wrapper.post(
341
+ "diffdetokenize",
342
+ json={
343
+ "model_name": self.model_name,
344
+ "lattice_id": lattice_id,
345
+ "frame_shift": frame_shift,
346
+ "results": [t.to_dict() for t in results[0]],
347
+ "labels": labels[0],
348
+ "offset": offset,
349
+ "channel": channel,
350
+ "return_details": False if return_details is None else return_details,
351
+ "destroy_lattice": True,
352
+ "start_margin": start_margin,
353
+ "end_margin": end_margin,
354
+ "check_sanity": check_sanity,
355
+ },
356
+ )
357
+
299
358
  if response.status_code == 400:
300
359
  raise LatticeDecodingError(
301
360
  lattice_id,
@@ -312,19 +371,21 @@ class LatticeTokenizer:
312
371
 
313
372
  alignments = [Supervision.from_dict(s) for s in result["supervisions"]]
314
373
 
315
- if emission is not None and return_details:
316
- # Add emission confidence scores for segments and word-level alignments
317
- _add_confidence_scores(alignments, emission, labels[0], frame_shift, offset)
374
+ # Add emission confidence scores for segments and word-level alignments
375
+ _add_confidence_scores(alignments, emission_stats, frame_shift, offset)
318
376
 
319
- alignments = _update_alignments_speaker(supervisions, alignments)
377
+ if isinstance(supervisions[0], Supervision):
378
+ alignments = _update_alignments_speaker(supervisions, alignments)
379
+ else:
380
+ # NOTE: Text Diff Alignment >> speaker has been handled in the backend service
381
+ pass
320
382
 
321
383
  return alignments
322
384
 
323
385
 
324
386
  def _add_confidence_scores(
325
387
  supervisions: List[Supervision],
326
- emission: np.ndarray,
327
- labels: List[int],
388
+ emission_stats: Dict[str, np.ndarray],
328
389
  frame_shift: float,
329
390
  offset: float = 0.0,
330
391
  ) -> None:
@@ -337,29 +398,37 @@ def _add_confidence_scores(
337
398
 
338
399
  Args:
339
400
  supervisions: List of Supervision objects to add scores to (modified in-place)
340
- emission: Emission tensor with shape [batch, time, vocab_size]
341
- labels: Token labels corresponding to aligned tokens
401
+ emission_stats: Dict with 'max_probs' and 'aligned_probs' arrays
342
402
  frame_shift: Frame shift in seconds for converting frames to time
403
+ offset: Time offset in seconds
343
404
  """
344
- tokens = np.array(labels, dtype=np.int64)
405
+ max_probs = emission_stats["max_probs"]
406
+ aligned_probs = emission_stats["aligned_probs"]
407
+ diffprobs_full = max_probs - aligned_probs
345
408
 
346
409
  for supervision in supervisions:
347
410
  start_frame = int((supervision.start - offset) / frame_shift)
348
411
  end_frame = int((supervision.end - offset) / frame_shift)
349
412
 
350
- # Compute segment-level confidence
351
- probabilities = np.exp(emission[0, start_frame:end_frame])
352
- aligned = probabilities[range(0, end_frame - start_frame), tokens[start_frame:end_frame]]
353
- diffprobs = np.max(probabilities, axis=-1) - aligned
354
- supervision.score = round(1.0 - diffprobs.mean(), ndigits=4)
413
+ # Clamp to valid range
414
+ start_frame = max(0, min(start_frame, len(diffprobs_full) - 1))
415
+ end_frame = max(start_frame + 1, min(end_frame, len(diffprobs_full)))
416
+
417
+ diffprobs = diffprobs_full[start_frame:end_frame]
418
+ if len(diffprobs) > 0:
419
+ supervision.score = round(1.0 - diffprobs.mean().item(), ndigits=4)
355
420
 
356
- # Compute word-level confidence if alignment exists
421
+ # Word-level confidence
357
422
  if hasattr(supervision, "alignment") and supervision.alignment:
358
423
  words = supervision.alignment.get("word", [])
359
424
  for w, item in enumerate(words):
360
- start = int((item.start - offset) / frame_shift) - start_frame
361
- end = int((item.end - offset) / frame_shift) - start_frame
362
- words[w] = item._replace(score=round(1.0 - diffprobs[start:end].mean(), ndigits=4))
425
+ start = int((item.start - offset) / frame_shift)
426
+ end = int((item.end - offset) / frame_shift)
427
+ start = max(0, min(start, len(diffprobs_full) - 1))
428
+ end = max(start + 1, min(end, len(diffprobs_full)))
429
+ word_diffprobs = diffprobs_full[start:end]
430
+ if len(word_diffprobs) > 0:
431
+ words[w] = item._replace(score=round(1.0 - word_diffprobs.mean().item(), ndigits=4))
363
432
 
364
433
 
365
434
  def _update_alignments_speaker(supervisions: List[Supervision], alignments: List[Supervision]) -> List[Supervision]: