lattifai 1.2.0__py3-none-any.whl → 1.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. lattifai/__init__.py +0 -24
  2. lattifai/alignment/__init__.py +10 -1
  3. lattifai/alignment/lattice1_aligner.py +66 -58
  4. lattifai/alignment/lattice1_worker.py +1 -6
  5. lattifai/alignment/punctuation.py +38 -0
  6. lattifai/alignment/segmenter.py +1 -1
  7. lattifai/alignment/sentence_splitter.py +350 -0
  8. lattifai/alignment/text_align.py +440 -0
  9. lattifai/alignment/tokenizer.py +91 -220
  10. lattifai/caption/__init__.py +82 -6
  11. lattifai/caption/caption.py +335 -1143
  12. lattifai/caption/formats/__init__.py +199 -0
  13. lattifai/caption/formats/base.py +211 -0
  14. lattifai/caption/formats/gemini.py +722 -0
  15. lattifai/caption/formats/json.py +194 -0
  16. lattifai/caption/formats/lrc.py +309 -0
  17. lattifai/caption/formats/nle/__init__.py +9 -0
  18. lattifai/caption/formats/nle/audition.py +561 -0
  19. lattifai/caption/formats/nle/avid.py +423 -0
  20. lattifai/caption/formats/nle/fcpxml.py +549 -0
  21. lattifai/caption/formats/nle/premiere.py +589 -0
  22. lattifai/caption/formats/pysubs2.py +642 -0
  23. lattifai/caption/formats/sbv.py +147 -0
  24. lattifai/caption/formats/tabular.py +338 -0
  25. lattifai/caption/formats/textgrid.py +193 -0
  26. lattifai/caption/formats/ttml.py +652 -0
  27. lattifai/caption/formats/vtt.py +469 -0
  28. lattifai/caption/parsers/__init__.py +9 -0
  29. lattifai/caption/{text_parser.py → parsers/text_parser.py} +4 -2
  30. lattifai/caption/standardize.py +636 -0
  31. lattifai/caption/utils.py +474 -0
  32. lattifai/cli/__init__.py +2 -1
  33. lattifai/cli/caption.py +108 -1
  34. lattifai/cli/transcribe.py +4 -9
  35. lattifai/cli/youtube.py +4 -1
  36. lattifai/client.py +48 -84
  37. lattifai/config/__init__.py +11 -1
  38. lattifai/config/alignment.py +9 -2
  39. lattifai/config/caption.py +267 -23
  40. lattifai/config/media.py +20 -0
  41. lattifai/diarization/__init__.py +41 -1
  42. lattifai/mixin.py +36 -18
  43. lattifai/transcription/base.py +6 -1
  44. lattifai/transcription/lattifai.py +19 -54
  45. lattifai/utils.py +81 -13
  46. lattifai/workflow/__init__.py +28 -4
  47. lattifai/workflow/file_manager.py +2 -5
  48. lattifai/youtube/__init__.py +43 -0
  49. lattifai/youtube/client.py +1170 -0
  50. lattifai/youtube/types.py +23 -0
  51. lattifai-1.2.2.dist-info/METADATA +615 -0
  52. lattifai-1.2.2.dist-info/RECORD +76 -0
  53. {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/entry_points.txt +1 -2
  54. lattifai/caption/gemini_reader.py +0 -371
  55. lattifai/caption/gemini_writer.py +0 -173
  56. lattifai/cli/app_installer.py +0 -142
  57. lattifai/cli/server.py +0 -44
  58. lattifai/server/app.py +0 -427
  59. lattifai/workflow/youtube.py +0 -577
  60. lattifai-1.2.0.dist-info/METADATA +0 -1133
  61. lattifai-1.2.0.dist-info/RECORD +0 -57
  62. {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/WHEEL +0 -0
  63. {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/licenses/LICENSE +0 -0
  64. {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,469 @@
1
+ """WebVTT format with YouTube VTT word-level timestamp support.
2
+
3
+ This module provides a unified VTT format handler that:
4
+ - Reads both standard VTT and YouTube VTT (with word-level timestamps)
5
+ - Writes standard VTT or YouTube VTT (when karaoke_config.enabled=True)
6
+
7
+ YouTube VTT format uses word-level tags like:
8
+ Word1<00:00:10.559><c> Word2</c><00:00:11.000><c> Word3</c>
9
+ """
10
+
11
+ import re
12
+ from pathlib import Path
13
+ from typing import Dict, List, Optional
14
+
15
+ import pysubs2
16
+ from lhotse.supervision import AlignmentItem
17
+
18
+ from ...config.caption import KaraokeConfig
19
+ from ..parsers.text_parser import normalize_text as normalize_text_fn
20
+ from ..parsers.text_parser import parse_speaker_text
21
+ from ..supervision import Supervision
22
+ from . import register_format
23
+ from .base import FormatHandler
24
+
25
+
26
@register_format("vtt")
class VTTFormat(FormatHandler):
    """WebVTT format with YouTube VTT word-level timestamp support.

    Reading:
        - Auto-detects YouTube VTT format (with word-level timestamps)
        - Falls back to standard VTT parsing via pysubs2

    Writing:
        - Standard VTT by default
        - YouTube VTT style when word_level=True and karaoke_config.enabled=True

    YouTube VTT uses per-word tags of the form:
        Word1<00:00:10.559><c> Word2</c><00:00:11.000><c> Word3</c>
    """

    extensions = [".vtt"]
    description = "Web Video Text Tracks - HTML5 standard with YouTube VTT support"

    # Pattern used to detect YouTube VTT word-level timestamps, e.g. <00:00:10.559><c>
    YOUTUBE_VTT_PATTERN = re.compile(r"<\d{2}:\d{2}:\d{2}[.,]\d{3}><c>")

    @classmethod
    def can_read(cls, source) -> bool:
        """Check if source is a VTT file.

        Raw content must start with the mandatory "WEBVTT" signature;
        otherwise the path is accepted when it ends with ".vtt".
        """
        if cls.is_content(source):
            return source.strip().startswith("WEBVTT")
        try:
            path_str = str(source).lower()
            return path_str.endswith(".vtt")
        except Exception:
            return False

    @classmethod
    def _is_youtube_vtt(cls, content: str) -> bool:
        """Check if content is YouTube VTT format with word-level timestamps."""
        return bool(cls.YOUTUBE_VTT_PATTERN.search(content))

    @classmethod
    def read(
        cls,
        source,
        normalize_text: bool = True,
        **kwargs,
    ) -> List[Supervision]:
        """Read VTT format, auto-detecting YouTube VTT word-level timestamps.

        Args:
            source: File path or content string
            normalize_text: Whether to normalize text

        Returns:
            List of Supervision objects
        """
        if cls.is_content(source):
            content = source
        else:
            with open(source, "r", encoding="utf-8") as f:
                content = f.read()

        # Auto-detect YouTube VTT format
        if cls._is_youtube_vtt(content):
            return cls._read_youtube_vtt(content, normalize_text)
        # Standard VTT: pass the original source through unchanged.
        # (When `source` is raw content, `content == source`, so this is
        # equivalent to the conditional hand-off and simpler.)
        return cls._read_standard_vtt(source, normalize_text)

    @classmethod
    def _read_standard_vtt(cls, source, normalize_text: bool = True) -> List[Supervision]:
        """Read standard VTT using pysubs2.

        Tries the explicit "vtt" parser first; on failure falls back to
        pysubs2's format auto-detection.
        """
        try:
            if cls.is_content(source):
                subs = pysubs2.SSAFile.from_string(source, format_="vtt")
            else:
                subs = pysubs2.load(str(source), encoding="utf-8", format_="vtt")
        except Exception:
            if cls.is_content(source):
                subs = pysubs2.SSAFile.from_string(source)
            else:
                subs = pysubs2.load(str(source), encoding="utf-8")

        supervisions = []
        for event in subs.events:
            text = event.text
            if normalize_text:
                text = normalize_text_fn(text)

            # Speaker may be embedded in the text ("Name: ...") or carried
            # in the event's Name field; inline speaker wins.
            speaker, text = parse_speaker_text(text)

            supervisions.append(
                Supervision(
                    text=text,
                    speaker=speaker or event.name or None,
                    # pysubs2 times are in milliseconds
                    start=event.start / 1000.0 if event.start is not None else 0,
                    duration=(event.end - event.start) / 1000.0 if event.end is not None else 0,
                )
            )

        return supervisions

    @classmethod
    def _read_youtube_vtt(cls, content: str, normalize_text: bool = True) -> List[Supervision]:
        """Parse YouTube VTT format with word-level timestamps.

        YouTube auto-caption files interleave "rolling" duplicate cues
        (10ms cues that repeat the previous line without word timestamps)
        with the real word-timed cues; those duplicates are detected and
        skipped/merged before building Supervisions.
        """
        supervisions = []

        # Cue header lines: 00:00:14.280 --> 00:00:17.269
        timestamp_pattern = re.compile(r"(\d{2}:\d{2}:\d{2}[.,]\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2}[.,]\d{3})")

        # Word-level timestamps: <00:00:10.559><c> word</c>
        word_timestamp_pattern = re.compile(r"<(\d{2}:\d{2}:\d{2}[.,]\d{3})><c>\s*([^<]+)</c>")

        # First word of a line appears before the first inline timestamp
        first_word_pattern = re.compile(r"^([^<\n]+?)<(\d{2}:\d{2}:\d{2}[.,]\d{3})>")

        def parse_timestamp(ts: str) -> float:
            """Convert an HH:MM:SS.mmm (or comma-separated) string to seconds."""
            ts = ts.replace(",", ".")
            parts = ts.split(":")
            hours = int(parts[0])
            minutes = int(parts[1])
            seconds = float(parts[2])
            return hours * 3600 + minutes * 60 + seconds

        def has_word_timestamps(text: str) -> bool:
            """Check if text contains word-level timestamps."""
            return bool(word_timestamp_pattern.search(text) or first_word_pattern.match(text))

        lines = content.split("\n")
        i = 0

        # First pass: collect all cues with their content lines
        all_cues = []
        while i < len(lines):
            line = lines[i]
            ts_match = timestamp_pattern.search(line)
            if ts_match:
                cue_start = parse_timestamp(ts_match.group(1))
                cue_end = parse_timestamp(ts_match.group(2))

                cue_lines = []
                i += 1
                while i < len(lines):
                    if timestamp_pattern.search(lines[i]):
                        break
                    stripped = lines[i].strip()
                    # Two consecutive blank lines after content ends the cue
                    if not stripped and cue_lines and not lines[i - 1].strip():
                        break
                    if stripped:
                        cue_lines.append(lines[i])
                    i += 1

                all_cues.append({"start": cue_start, "end": cue_end, "lines": cue_lines})
                continue
            i += 1

        # Second pass: identify rolling duplicate cues to skip, and text
        # that needs to be merged back into an earlier real cue.
        cues_to_skip = set()
        cues_to_merge_text = {}

        for idx in range(len(all_cues) - 1):
            cue = all_cues[idx]
            duration = cue["end"] - cue["start"]

            # YouTube's rolling cues last exactly 10ms and carry no word tags
            if abs(duration - 0.010) < 0.001:
                cue_text = "\n".join(cue["lines"])
                if not has_word_timestamps(cue_text):
                    next_cue = all_cues[idx + 1]
                    if abs(next_cue["start"] - cue["end"]) < 0.001:
                        cues_to_skip.add(idx)

                        next_cue_text = "\n".join(next_cue["lines"])
                        if not has_word_timestamps(next_cue_text):
                            # The untimed follow-up cue's last line belongs to
                            # the most recent cue we are keeping.
                            for prev_idx in range(idx - 1, -1, -1):
                                if prev_idx not in cues_to_skip:
                                    if len(next_cue["lines"]) > 1:
                                        append_text = next_cue["lines"][-1].strip()
                                        if append_text:
                                            cues_to_merge_text[prev_idx] = append_text
                                    cues_to_skip.add(idx + 1)
                                    break

        # Third pass: build Supervisions from the remaining cues
        for idx, cue in enumerate(all_cues):
            if idx in cues_to_skip:
                continue

            cue_start = cue["start"]
            cue_end = cue["end"]
            cue_lines = cue["lines"]

            word_alignments = []
            text_parts = []

            for cue_line in cue_lines:
                cue_line = cue_line.strip()
                if not cue_line:
                    continue

                word_matches = word_timestamp_pattern.findall(cue_line)
                first_match = first_word_pattern.match(cue_line)

                if word_matches or first_match:
                    if first_match:
                        # First word has no leading tag; it starts at the cue
                        # start and ends at the first inline timestamp.
                        first_word = first_match.group(1).strip()
                        first_word_next_ts = parse_timestamp(first_match.group(2))
                        if first_word:
                            text_parts.append(first_word)
                            word_alignments.append(
                                AlignmentItem(
                                    symbol=first_word,
                                    start=cue_start,
                                    duration=max(0.01, first_word_next_ts - cue_start),
                                )
                            )

                    for word_idx, (ts, word) in enumerate(word_matches):
                        word_start = parse_timestamp(ts)
                        word = word.strip()
                        if not word:
                            continue

                        text_parts.append(word)

                        # Each word ends where the next one starts; the last
                        # word runs to the end of the cue.
                        if word_idx + 1 < len(word_matches):
                            next_ts = parse_timestamp(word_matches[word_idx + 1][0])
                            duration = next_ts - word_start
                        else:
                            duration = cue_end - word_start

                        word_alignments.append(
                            AlignmentItem(
                                symbol=word,
                                start=word_start,
                                duration=max(0.01, duration),
                            )
                        )

            if not text_parts:
                continue

            full_text = " ".join(text_parts)
            if idx in cues_to_merge_text:
                full_text += " " + cues_to_merge_text[idx]

            if normalize_text:
                full_text = normalize_text_fn(full_text)

            # Prefer the word alignments' span over the cue header times
            if word_alignments:
                sup_start = word_alignments[0].start
                sup_end = word_alignments[-1].start + word_alignments[-1].duration
            else:
                sup_start = cue_start
                sup_end = cue_end

            supervisions.append(
                Supervision(
                    text=full_text,
                    start=sup_start,
                    duration=max(0.0, sup_end - sup_start),
                    alignment={"word": word_alignments} if word_alignments else None,
                )
            )

        return supervisions

    @classmethod
    def extract_metadata(cls, source, **kwargs) -> Dict[str, str]:
        """Extract metadata from the VTT header.

        Scans only the first 4KB / first 10 lines for "Kind:", "Language:"
        and "NOTE key: value" entries. Returns {} on read failure.
        """
        if cls.is_content(source):
            content = source[:4096]
        else:
            try:
                with open(source, "r", encoding="utf-8") as f:
                    content = f.read(4096)
            except Exception:
                return {}

        metadata = {}
        lines = content.split("\n")
        for line in lines[:10]:
            line = line.strip()
            if line.startswith("Kind:"):
                metadata["kind"] = line.split(":", 1)[1].strip()
            elif line.startswith("Language:"):
                metadata["language"] = line.split(":", 1)[1].strip()
            elif line.startswith("NOTE"):
                match = re.search(r"NOTE\s+(\w+):\s*(.+)", line)
                if match:
                    key, value = match.groups()
                    metadata[key.lower()] = value.strip()

        return metadata

    @classmethod
    def write(
        cls,
        supervisions: List[Supervision],
        output_path,
        include_speaker: bool = True,
        **kwargs,
    ) -> Path:
        """Write VTT to file and return the output path."""
        output_path = Path(output_path)
        content = cls.to_bytes(supervisions, include_speaker=include_speaker, **kwargs)
        output_path.write_bytes(content)
        return output_path

    @classmethod
    def to_bytes(
        cls,
        supervisions: List[Supervision],
        include_speaker: bool = True,
        fps: float = 25.0,
        word_level: bool = False,
        karaoke_config: Optional[KaraokeConfig] = None,
        metadata: Optional[Dict] = None,
        **kwargs,
    ) -> bytes:
        """Convert to VTT bytes with optional karaoke and metadata preservation.

        Args:
            supervisions: List of supervision segments
            include_speaker: Whether to include speaker in output
            fps: Frames per second (not used for VTT)
            word_level: If True and alignment exists, output word-per-segment or karaoke
            karaoke_config: Karaoke configuration. When enabled, output YouTube VTT
                style with word-level timestamps: <00:00:10.559><c> word</c>
            metadata: Optional metadata dict containing kind and language

        Returns:
            VTT content as bytes
        """
        from .base import expand_to_word_supervisions

        karaoke_enabled = karaoke_config is not None and karaoke_config.enabled

        # If karaoke enabled, output YouTube VTT style
        if word_level and karaoke_enabled:
            return cls._to_youtube_vtt_bytes(supervisions, include_speaker, metadata)

        # If word_level only (no karaoke), expand to word-per-segment
        if word_level:
            supervisions = expand_to_word_supervisions(supervisions)

        # Build VTT with metadata header
        return cls._to_vtt_bytes_with_metadata(supervisions, include_speaker, metadata)

    @classmethod
    def _to_vtt_bytes_with_metadata(
        cls,
        supervisions: List[Supervision],
        include_speaker: bool = True,
        metadata: Optional[Dict] = None,
    ) -> bytes:
        """Generate standard VTT with an optional Kind/Language metadata header."""
        lines = ["WEBVTT"]

        if metadata:
            if metadata.get("kind"):
                lines.append(f"Kind: {metadata['kind']}")
            if metadata.get("language"):
                lines.append(f"Language: {metadata['language']}")

        lines.append("")

        subs = pysubs2.SSAFile()
        for sup in supervisions:
            text = sup.text or ""
            if cls._should_include_speaker(sup, include_speaker):
                text = f"{sup.speaker} {text}"
            subs.append(
                pysubs2.SSAEvent(
                    start=int(sup.start * 1000),
                    end=int(sup.end * 1000),
                    text=text,
                    name=sup.speaker or "",
                )
            )

        # pysubs2 emits its own "WEBVTT" header; drop it (and the blank lines
        # that follow) and splice the cue body under our header instead.
        vtt_content = subs.to_string(format_="vtt")
        vtt_lines = vtt_content.split("\n")
        started = False
        for line in vtt_lines[1:]:
            if not started and not line.strip():
                continue
            started = True
            lines.append(line)

        return "\n".join(lines).encode("utf-8")

    @classmethod
    def _to_youtube_vtt_bytes(
        cls,
        supervisions: List[Supervision],
        include_speaker: bool = True,
        metadata: Optional[Dict] = None,
    ) -> bytes:
        """Generate YouTube VTT format with word-level timestamps.

        Format: <00:00:10.559><c> word</c>
        """

        def format_timestamp(seconds: float) -> str:
            """Format seconds into HH:MM:SS.mmm.

            Derives every field from the total rounded millisecond count so
            that rounding carries correctly across field boundaries. (The
            previous implementation bumped only the seconds field when the
            milliseconds rounded to 1000, which could emit invalid values
            like "00:00:60.000" for inputs such as 59.9996.)
            """
            total_ms = int(round(seconds * 1000))
            h, rem = divmod(total_ms, 3_600_000)
            m, rem = divmod(rem, 60_000)
            s, ms = divmod(rem, 1000)
            return f"{h:02d}:{m:02d}:{s:02d}.{ms:03d}"

        lines = ["WEBVTT"]

        if metadata:
            if metadata.get("kind"):
                lines.append(f"Kind: {metadata['kind']}")
            if metadata.get("language"):
                lines.append(f"Language: {metadata['language']}")

        lines.append("")

        for sup in sorted(supervisions, key=lambda x: x.start):
            text = sup.text or ""
            alignment = getattr(sup, "alignment", None)
            words = alignment.get("word") if alignment else None

            if words:
                # Cue spans the word alignments, not the supervision times
                cue_start = words[0].start
                cue_end = words[-1].end
                lines.append(f"{format_timestamp(cue_start)} --> {format_timestamp(cue_end)}")

                text_parts = []
                for i, word in enumerate(words):
                    symbol = word.symbol
                    if i == 0 and include_speaker and sup.speaker:
                        symbol = f"{sup.speaker}: {symbol}"
                    text_parts.append(f"<{format_timestamp(word.start)}><c> {symbol}</c>")
                lines.append("".join(text_parts))
            else:
                # No word alignment: emit a plain cue
                lines.append(f"{format_timestamp(sup.start)} --> {format_timestamp(sup.end)}")
                if include_speaker and sup.speaker:
                    text = f"{sup.speaker}: {text}"
                lines.append(text)
            lines.append("")

        return "\n".join(lines).encode("utf-8")
@@ -0,0 +1,9 @@
1
+ """Text parsing utilities for caption processing."""
2
+
3
+ from .text_parser import normalize_text, parse_speaker_text, parse_timestamp_text
4
+
5
+ __all__ = [
6
+ "normalize_text",
7
+ "parse_speaker_text",
8
+ "parse_timestamp_text",
9
+ ]
@@ -69,7 +69,8 @@ def parse_speaker_text(line) -> Tuple[Optional[str], str]:
69
69
 
70
70
  match = SPEAKER_LATTIFAI.match(line)
71
71
  if match:
72
- assert len(match.groups()) == 2, match.groups()
72
+ if len(match.groups()) != 2:
73
+ raise ValueError(f"Expected 2 groups in SPEAKER_LATTIFAI match, got {match.groups()}")
73
74
  if not match.group(1):
74
75
  logging.error(f"ParseSub LINE [{line}]")
75
76
  else:
@@ -77,7 +78,8 @@ def parse_speaker_text(line) -> Tuple[Optional[str], str]:
77
78
 
78
79
  match = SPEAKER_PATTERN2.match(line)
79
80
  if match:
80
- assert len(match.groups()) == 2, match.groups()
81
+ if len(match.groups()) != 2:
82
+ raise ValueError(f"Expected 2 groups in SPEAKER_PATTERN2 match, got {match.groups()}")
81
83
  return match.group(1).strip(), match.group(2).strip()
82
84
 
83
85
  return None, line