lattifai 1.2.0__py3-none-any.whl → 1.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. lattifai/__init__.py +0 -24
  2. lattifai/alignment/__init__.py +10 -1
  3. lattifai/alignment/lattice1_aligner.py +66 -58
  4. lattifai/alignment/lattice1_worker.py +1 -6
  5. lattifai/alignment/punctuation.py +38 -0
  6. lattifai/alignment/segmenter.py +1 -1
  7. lattifai/alignment/sentence_splitter.py +350 -0
  8. lattifai/alignment/text_align.py +440 -0
  9. lattifai/alignment/tokenizer.py +91 -220
  10. lattifai/caption/__init__.py +82 -6
  11. lattifai/caption/caption.py +335 -1143
  12. lattifai/caption/formats/__init__.py +199 -0
  13. lattifai/caption/formats/base.py +211 -0
  14. lattifai/caption/formats/gemini.py +722 -0
  15. lattifai/caption/formats/json.py +194 -0
  16. lattifai/caption/formats/lrc.py +309 -0
  17. lattifai/caption/formats/nle/__init__.py +9 -0
  18. lattifai/caption/formats/nle/audition.py +561 -0
  19. lattifai/caption/formats/nle/avid.py +423 -0
  20. lattifai/caption/formats/nle/fcpxml.py +549 -0
  21. lattifai/caption/formats/nle/premiere.py +589 -0
  22. lattifai/caption/formats/pysubs2.py +642 -0
  23. lattifai/caption/formats/sbv.py +147 -0
  24. lattifai/caption/formats/tabular.py +338 -0
  25. lattifai/caption/formats/textgrid.py +193 -0
  26. lattifai/caption/formats/ttml.py +652 -0
  27. lattifai/caption/formats/vtt.py +469 -0
  28. lattifai/caption/parsers/__init__.py +9 -0
  29. lattifai/caption/{text_parser.py → parsers/text_parser.py} +4 -2
  30. lattifai/caption/standardize.py +636 -0
  31. lattifai/caption/utils.py +474 -0
  32. lattifai/cli/__init__.py +2 -1
  33. lattifai/cli/caption.py +108 -1
  34. lattifai/cli/transcribe.py +4 -9
  35. lattifai/cli/youtube.py +4 -1
  36. lattifai/client.py +48 -84
  37. lattifai/config/__init__.py +11 -1
  38. lattifai/config/alignment.py +9 -2
  39. lattifai/config/caption.py +267 -23
  40. lattifai/config/media.py +20 -0
  41. lattifai/diarization/__init__.py +41 -1
  42. lattifai/mixin.py +36 -18
  43. lattifai/transcription/base.py +6 -1
  44. lattifai/transcription/lattifai.py +19 -54
  45. lattifai/utils.py +81 -13
  46. lattifai/workflow/__init__.py +28 -4
  47. lattifai/workflow/file_manager.py +2 -5
  48. lattifai/youtube/__init__.py +43 -0
  49. lattifai/youtube/client.py +1170 -0
  50. lattifai/youtube/types.py +23 -0
  51. lattifai-1.2.2.dist-info/METADATA +615 -0
  52. lattifai-1.2.2.dist-info/RECORD +76 -0
  53. {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/entry_points.txt +1 -2
  54. lattifai/caption/gemini_reader.py +0 -371
  55. lattifai/caption/gemini_writer.py +0 -173
  56. lattifai/cli/app_installer.py +0 -142
  57. lattifai/cli/server.py +0 -44
  58. lattifai/server/app.py +0 -427
  59. lattifai/workflow/youtube.py +0 -577
  60. lattifai-1.2.0.dist-info/METADATA +0 -1133
  61. lattifai-1.2.0.dist-info/RECORD +0 -57
  62. {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/WHEEL +0 -0
  63. {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/licenses/LICENSE +0 -0
  64. {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,194 @@
+ """JSON format handler for structured caption data.
+
+ JSON is the most flexible format for storing caption data, supporting:
+ - Segment-level timing (start, end)
+ - Word-level alignment (words array with per-word timestamps)
+ - Speaker labels
+ - Custom metadata
+
+ Example JSON structure:
+ ```json
+ [
+     {
+         "text": "Hello world",
+         "start": 0.0,
+         "end": 2.5,
+         "speaker": "Speaker 1",
+         "words": [
+             {"word": "Hello", "start": 0.0, "end": 0.5},
+             {"word": "world", "start": 0.6, "end": 2.5}
+         ]
+     }
+ ]
+ ```
+ """
+
+ import json
+ from pathlib import Path
+ from typing import List
+
+ from ..parsers.text_parser import normalize_text as normalize_text_fn
+ from ..supervision import Supervision
+ from . import register_format
+ from .base import FormatHandler
+
+
+ @register_format("json")
+ class JSONFormat(FormatHandler):
+     """JSON format for structured caption data.
+
+     Features:
+     - Preserves full segment structure with timing
+     - Supports word-level alignment in 'words' field
+     - Round-trip compatible (read/write preserves all data)
+     - Human-readable with indentation
+
+     Input format (read):
+     - Array of objects with: text, start, duration/end
+     - Optional: speaker, words (array of word timing objects)
+     - Words can have: word, start, duration or end
+
+     Output format (write):
+     - word_level=False: Standard segment output
+     - word_level=True: Includes 'words' array with per-word timestamps
+     """
+
+     extensions = [".json"]
+     description = "JSON - structured caption data with word-level support"
+
+     @classmethod
+     def read(cls, source, normalize_text: bool = True, **kwargs) -> List[Supervision]:
+         """Read JSON format.
+
+         Args:
+             source: File path or JSON string content
+             normalize_text: Whether to normalize text content
+
+         Returns:
+             List of Supervision objects with alignment data if present
+
+         Supports word-level alignment data in the 'words' field.
+         Each word item should have: word, start, duration (or end).
+         """
+         from lhotse.supervision import AlignmentItem
+
+         if cls.is_content(source):
+             data = json.loads(source)
+         else:
+             with open(source, "r", encoding="utf-8") as f:
+                 data = json.load(f)
+
+         supervisions = []
+         for item in data:
+             text = item.get("text", "")
+             if normalize_text:
+                 text = normalize_text_fn(text)
+
+             # Parse word-level alignment if present
+             alignment = None
+             if "words" in item and item["words"]:
+                 word_alignments = []
+                 for word_item in item["words"]:
+                     word_text = word_item.get("word", "")
+                     word_start = word_item.get("start", 0)
+                     # Support both 'duration' and 'end' fields
+                     if "duration" in word_item:
+                         word_duration = word_item["duration"]
+                     elif "end" in word_item:
+                         word_duration = word_item["end"] - word_start
+                     else:
+                         word_duration = 0
+                     word_alignments.append(AlignmentItem(symbol=word_text, start=word_start, duration=word_duration))
+                 if word_alignments:
+                     alignment = {"word": word_alignments}
+
+             # Support both 'duration' and 'end' fields for segment timing
+             start = item.get("start", 0)
+             if "duration" in item:
+                 duration = item["duration"]
+             elif "end" in item:
+                 duration = item["end"] - start
+             else:
+                 duration = 0
+
+             supervisions.append(
+                 Supervision(
+                     text=text,
+                     start=start,
+                     duration=duration,
+                     speaker=item.get("speaker"),
+                     alignment=alignment,
+                 )
+             )
+
+         return supervisions
+
+     @classmethod
+     def write(
+         cls,
+         supervisions: List[Supervision],
+         output_path,
+         include_speaker: bool = True,
+         word_level: bool = False,
+         **kwargs,
+     ) -> Path:
+         """Write JSON format.
+
+         Args:
+             supervisions: List of Supervision objects
+             output_path: Output file path
+             include_speaker: Whether to include speaker field
+             word_level: If True, include 'words' field with word-level timestamps
+
+         Returns:
+             Path to written file
+         """
+         output_path = Path(output_path)
+         content = cls.to_bytes(supervisions, include_speaker=include_speaker, word_level=word_level)
+         output_path.write_bytes(content)
+         return output_path
+
+     @classmethod
+     def to_bytes(
+         cls, supervisions: List[Supervision], include_speaker: bool = True, word_level: bool = False, **kwargs
+     ) -> bytes:
+         """Convert to JSON format bytes.
+
+         Args:
+             supervisions: List of Supervision objects
+             include_speaker: Whether to include speaker field
+             word_level: If True, include 'words' field with word-level timestamps
+
+         Returns:
+             JSON content as UTF-8 encoded bytes
+
+         Note:
+             Unlike other formats (SRT, VTT, LRC) that expand word_level=True to
+             one segment per word, JSON preserves the original structure and adds
+             a 'words' array inside each segment. This allows round-trip compatibility
+             and preserves all timing information.
+         """
+         data = []
+         for sup in supervisions:
+             item = {
+                 "text": sup.text,
+                 "start": sup.start,
+                 "end": sup.end,
+             }
+             if include_speaker and sup.speaker:
+                 item["speaker"] = sup.speaker
+
+             # Add words field when word_level=True and alignment exists
+             if word_level and sup.alignment and "word" in sup.alignment:
+                 item["words"] = [
+                     {
+                         "word": w.symbol,
+                         "start": w.start,
+                         "end": w.start + w.duration,
+                     }
+                     for w in sup.alignment["word"]
+                 ]
+
+             data.append(item)
+
+         return json.dumps(data, ensure_ascii=False, indent=4).encode("utf-8")
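
For reference, a minimal round-trip sketch of the `JSONFormat` handler added above. The import paths (`lattifai.caption.formats.json`, `lattifai.caption.supervision`) follow the relative imports shown in this diff, and the `captions.json` file name is purely illustrative; this is a usage sketch, not confirmed public API.

```python
# Round-trip sketch for the JSONFormat handler shown in the hunk above.
# Assumption: Supervision accepts text/start/duration/speaker keyword arguments,
# as it is constructed that way inside JSONFormat.read in this diff.
from lattifai.caption.formats.json import JSONFormat
from lattifai.caption.supervision import Supervision

segments = [
    Supervision(text="Hello world", start=0.0, duration=2.5, speaker="Speaker 1"),
]

# Write segment-level JSON; word_level=False keeps one object per segment.
out = JSONFormat.write(segments, "captions.json", include_speaker=True)

# Read it back: 'end' is converted to duration, and a 'words' array (if present)
# becomes word-level alignment on the Supervision.
restored = JSONFormat.read(out, normalize_text=False)
assert restored[0].text == "Hello world"
```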
@@ -0,0 +1,309 @@
+ """Enhanced LRC format handler.
+
+ LRC (LyRiCs) is a file format for synchronized song lyrics. Enhanced LRC
+ adds word-level timestamps for karaoke applications.
+
+ Standard LRC:
+     [00:15.20]Hello beautiful world
+
+ Enhanced LRC (word-level):
+     [00:15.20]<00:15.20>Hello <00:15.65>beautiful <00:16.40>world
+
+ Metadata tags:
+     [ar:Artist Name]
+     [ti:Song Title]
+     [al:Album Name]
+     [offset:±milliseconds]
+ """
+
+ import re
+ from pathlib import Path
+ from typing import Dict, List, Optional, Union
+
+ from lhotse.supervision import AlignmentItem
+ from lhotse.utils import Pathlike
+
+ from ...config.caption import KaraokeConfig
+ from ..supervision import Supervision
+ from . import register_format
+ from .base import FormatHandler
+
+
+ @register_format("lrc")
+ class LRCFormat(FormatHandler):
+     """Enhanced LRC format with word-level timing support."""
+
+     extensions = [".lrc"]
+     description = "Enhanced LRC - karaoke lyrics format"
+
+     @classmethod
+     def is_content(cls, source) -> bool:
+         """Check if source is LRC content rather than a file path.
+
+         Overrides base class to also detect LRC content by timestamp pattern.
+         """
+         if not isinstance(source, str):
+             return False
+         # If it has newlines or is very long, it's likely content
+         if "\n" in source or len(source) > 500:
+             return True
+         # LRC-specific: check for timestamp pattern at start
+         if source.strip().startswith("[") and re.match(r"\[\d+:\d+", source):
+             return True
+         return False
+
+     @classmethod
+     def extract_metadata(cls, source: Union[Pathlike, str], **kwargs) -> Dict[str, str]:
+         """Extract LRC metadata tags.
+
+         Extracts standard LRC metadata:
+         - ar: Artist name
+         - ti: Title
+         - al: Album
+         - by: Creator
+         - offset: Time offset in milliseconds
+         - length: Song length
+
+         Returns:
+             Dict with lrc_* prefixed keys for metadata preservation
+         """
+         if cls.is_content(source):
+             content = source
+         else:
+             try:
+                 content = Path(str(source)).read_text(encoding="utf-8")
+             except Exception:
+                 return {}
+
+         metadata = {}
+         # Pattern to match [key:value] metadata tags
+         meta_pattern = re.compile(r"^\[([a-z]+):(.+)\]$", re.IGNORECASE)
+
+         for line in content.split("\n")[:50]:  # Only check first 50 lines
+             line = line.strip()
+             match = meta_pattern.match(line)
+             if match:
+                 key, value = match.groups()
+                 key = key.lower()
+                 # Store with lrc_ prefix to avoid conflicts
+                 if key in ("ar", "ti", "al", "by", "offset", "length", "re", "ve"):
+                     metadata[f"lrc_{key}"] = value.strip()
+
+         return metadata
+
+     @classmethod
+     def read(
+         cls,
+         source,
+         normalize_text: bool = True,
+         **kwargs,
+     ) -> List[Supervision]:
+         """Read LRC file and return list of Supervision objects.
+
+         Parses both standard LRC and enhanced LRC with word-level timestamps.
+
+         Args:
+             source: File path or string content
+             normalize_text: Whether to normalize text (currently unused)
+             **kwargs: Additional options
+
+         Returns:
+             List of Supervision objects with timing and optional word alignment
+         """
+         if cls.is_content(source):
+             content = source
+         else:
+             content = Path(source).read_text(encoding="utf-8")
+
+         supervisions = []
+         # Match line timestamp: [mm:ss.xx] or [mm:ss.xxx]
+         line_pattern = re.compile(r"\[(\d+):(\d+)\.(\d+)\](.+)")
+         # Match word timestamp: <mm:ss.xx> or <mm:ss.xxx>
+         word_pattern = re.compile(r"<(\d+):(\d+)\.(\d+)>([^<]+)")
+
+         for line in content.split("\n"):
+             line = line.strip()
+             # Skip empty lines and metadata
+             if not line or line.startswith("[ar:") or line.startswith("[ti:"):
+                 continue
+             if line.startswith("[al:") or line.startswith("[offset:"):
+                 continue
+             if line.startswith("[by:") or line.startswith("[length:"):
+                 continue
+
+             match = line_pattern.match(line)
+             if match:
+                 mins, secs, frac, text = match.groups()
+                 # Handle centisecond vs millisecond
+                 if len(frac) == 2:
+                     start = int(mins) * 60 + int(secs) + int(frac) / 100
+                 else:
+                     start = int(mins) * 60 + int(secs) + int(frac) / 1000
+
+                 # Extract word-level alignment
+                 words = word_pattern.findall(text)
+                 alignment = None
+                 if words:
+                     alignment = {"word": []}
+                     for w_mins, w_secs, w_frac, w_text in words:
+                         if len(w_frac) == 2:
+                             w_start = int(w_mins) * 60 + int(w_secs) + int(w_frac) / 100
+                         else:
+                             w_start = int(w_mins) * 60 + int(w_secs) + int(w_frac) / 1000
+                         alignment["word"].append(
+                             AlignmentItem(
+                                 symbol=w_text.strip(),
+                                 start=w_start,
+                                 duration=0,  # LRC doesn't store duration
+                             )
+                         )
+                     # Clean text (remove timestamp tags)
+                     text = re.sub(r"<\d+:\d+\.\d+>", "", text)
+
+                 supervisions.append(
+                     Supervision(
+                         text=text.strip(),
+                         start=start,
+                         duration=0,  # Will calculate below
+                         alignment=alignment,
+                     )
+                 )
+
+         # Calculate duration from next segment
+         for i, sup in enumerate(supervisions):
+             if i + 1 < len(supervisions):
+                 sup.duration = supervisions[i + 1].start - sup.start
+             else:
+                 sup.duration = 5.0  # Default 5 seconds for last line
+
+         return supervisions
+
+     @classmethod
+     def write(
+         cls,
+         supervisions: List[Supervision],
+         output_path,
+         include_speaker: bool = True,
+         word_level: bool = False,
+         karaoke_config: Optional[KaraokeConfig] = None,
+         **kwargs,
+     ) -> Path:
+         """Write supervisions to LRC file.
+
+         Args:
+             supervisions: List of Supervision objects to write
+             output_path: Path to output file
+             include_speaker: Whether to include speaker labels in text
+             word_level: Enable word-level output
+             karaoke_config: Karaoke configuration. When provided with enabled=True,
+                 use enhanced LRC with inline timestamps
+             **kwargs: Additional options
+
+         Returns:
+             Path to the written file
+         """
+         output_path = Path(output_path)
+         content = cls.to_bytes(
+             supervisions,
+             include_speaker=include_speaker,
+             word_level=word_level,
+             karaoke_config=karaoke_config,
+             **kwargs,
+         )
+         output_path.write_bytes(content)
+         return output_path
+
+     @classmethod
+     def to_bytes(
+         cls,
+         supervisions: List[Supervision],
+         include_speaker: bool = True,
+         word_level: bool = False,
+         karaoke_config: Optional[KaraokeConfig] = None,
+         metadata: Optional[Dict] = None,
+         **kwargs,
+     ) -> bytes:
+         """Convert supervisions to LRC format bytes.
+
+         Args:
+             supervisions: List of Supervision objects
+             include_speaker: Whether to include speaker labels
+             word_level: Enable word-level output
+             karaoke_config: Karaoke configuration. When provided with enabled=True,
+                 use enhanced LRC with inline timestamps
+             metadata: Optional metadata dict containing lrc_* keys to restore
+
+         Returns:
+             Caption content as bytes
+         """
+         config = karaoke_config or KaraokeConfig(enabled=False)
+         karaoke_enabled = config.enabled
+         lines = []
+
+         # Restore metadata from Caption.metadata (lrc_* keys)
+         if metadata:
+             lrc_meta_keys = ["ar", "ti", "al", "by", "offset", "length", "re", "ve"]
+             for key in lrc_meta_keys:
+                 value = metadata.get(f"lrc_{key}")
+                 if value:
+                     lines.append(f"[{key}:{value}]")
+
+         # Also add karaoke config metadata if enabled
+         if karaoke_enabled:
+             for key, value in config.lrc_metadata.items():
+                 # Avoid duplicates
+                 existing_line = f"[{key}:"
+                 if not any(line.startswith(existing_line) for line in lines):
+                     lines.append(f"[{key}:{value}]")
+
+         if lines:
+             lines.append("")
+
+         for sup in supervisions:
+             if word_level and sup.alignment and "word" in sup.alignment:
+                 word_items = sup.alignment["word"]
+                 if karaoke_enabled:
+                     # Enhanced LRC mode: each word has inline timestamp
+                     # Use first word's timestamp for line timing (more accurate)
+                     line_time = cls._format_time(word_items[0].start, config.lrc_precision)
+                     word_parts = []
+                     for word in word_items:
+                         word_time = cls._format_time(word.start, config.lrc_precision)
+                         word_parts.append(f"<{word_time}>{word.symbol}")
+                     lines.append(f"[{line_time}]{' '.join(word_parts)}")
+                 else:
+                     # Word-per-line mode: each word as separate line
+                     for word in sup.alignment["word"]:
+                         word_time = cls._format_time(word.start, config.lrc_precision)
+                         lines.append(f"[{word_time}]{word.symbol}")
+             else:
+                 # Standard LRC mode: only line timestamp
+                 line_time = cls._format_time(sup.start, config.lrc_precision)
+                 text = sup.text or ""
+                 if cls._should_include_speaker(sup, include_speaker):
+                     text = f"{sup.speaker}: {text}"
+                 lines.append(f"[{line_time}]{text}")
+
+         return "\n".join(lines).encode("utf-8")
+
+     @staticmethod
+     def _format_time(seconds: float, precision: str) -> str:
+         """Format time for LRC timestamp.
+
+         Args:
+             seconds: Time in seconds
+             precision: "centisecond" for [mm:ss.xx] or "millisecond" for [mm:ss.xxx]
+
+         Returns:
+             Formatted time string
+         """
+         if seconds < 0:
+             seconds = 0
+         minutes = int(seconds // 60)
+         secs = seconds % 60
+         if precision == "millisecond":
+             return f"{minutes:02d}:{secs:06.3f}"  # 00:15.200
+         return f"{minutes:02d}:{secs:05.2f}"  # 00:15.23
+
+
+ __all__ = ["LRCFormat"]
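
A short sketch of the three output modes implemented in `LRCFormat.to_bytes` above. It assumes a word-aligned `Supervision` shaped like the one `LRCFormat.read` builds, and that `KaraokeConfig(enabled=True)` with its `lrc_precision` default is a valid construction (the config class itself lives in `lattifai/config/caption.py` and is not shown in this section).

```python
# Sketch of the standard, word-per-line, and enhanced (karaoke) LRC writer paths.
# The Supervision/AlignmentItem construction mirrors what LRCFormat.read produces.
from lhotse.supervision import AlignmentItem

from lattifai.caption.formats.lrc import LRCFormat
from lattifai.caption.supervision import Supervision
from lattifai.config.caption import KaraokeConfig

sup = Supervision(
    text="Hello beautiful world",
    start=15.2,
    duration=1.5,
    alignment={"word": [
        AlignmentItem(symbol="Hello", start=15.2, duration=0.45),
        AlignmentItem(symbol="beautiful", start=15.65, duration=0.75),
        AlignmentItem(symbol="world", start=16.4, duration=0.3),
    ]},
)

# Standard LRC: one "[mm:ss.xx]text" line per segment.
print(LRCFormat.to_bytes([sup]).decode())

# Word-per-line LRC: word_level=True without a karaoke config.
print(LRCFormat.to_bytes([sup], word_level=True).decode())

# Enhanced (karaoke) LRC: inline "<mm:ss.xx>word" tags on a single line.
karaoke = KaraokeConfig(enabled=True)  # assumption: enabled is a constructor kwarg
print(LRCFormat.to_bytes([sup], word_level=True, karaoke_config=karaoke).decode())
```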
@@ -0,0 +1,9 @@
+ """Professional NLE format handlers.
+
+ This module provides format handlers for professional non-linear editing systems
+ and digital audio workstations.
+ """
+
+ from . import audition, avid, fcpxml, premiere
+
+ __all__ = ["audition", "avid", "fcpxml", "premiere"]
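
The submodule imports above are load-bearing: each NLE module presumably registers itself via the same `@register_format` decorator seen in `json.py` and `lrc.py`, so importing `lattifai.caption.formats.nle` is what makes the Audition, Avid, FCPXML, and Premiere handlers discoverable. A rough sketch of how such a decorator-based registry typically works; this is illustrative only, not the actual implementation in `formats/__init__.py`, which is not shown in this section.

```python
# Hypothetical sketch of a decorator-based format registry (not the real one).
from typing import Dict, Type

FORMAT_REGISTRY: Dict[str, Type] = {}

def register_format(name: str):
    """Register a FormatHandler subclass under a short format name."""
    def decorator(cls):
        FORMAT_REGISTRY[name] = cls
        return cls
    return decorator

# Importing a module such as formats.nle.premiere would then populate
# FORMAT_REGISTRY as a side effect, which is why nle/__init__.py imports
# its submodules even though it exports nothing else.
```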