lattifai-1.2.1-py3-none-any.whl → lattifai-1.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. lattifai/__init__.py +20 -0
  2. lattifai/alignment/__init__.py +9 -1
  3. lattifai/alignment/lattice1_aligner.py +175 -54
  4. lattifai/alignment/lattice1_worker.py +47 -4
  5. lattifai/alignment/punctuation.py +38 -0
  6. lattifai/alignment/segmenter.py +3 -2
  7. lattifai/alignment/text_align.py +441 -0
  8. lattifai/alignment/tokenizer.py +134 -65
  9. lattifai/audio2.py +162 -183
  10. lattifai/cli/__init__.py +2 -1
  11. lattifai/cli/alignment.py +5 -0
  12. lattifai/cli/caption.py +111 -4
  13. lattifai/cli/transcribe.py +2 -6
  14. lattifai/cli/youtube.py +7 -1
  15. lattifai/client.py +72 -123
  16. lattifai/config/__init__.py +28 -0
  17. lattifai/config/alignment.py +14 -0
  18. lattifai/config/caption.py +45 -31
  19. lattifai/config/client.py +16 -0
  20. lattifai/config/event.py +102 -0
  21. lattifai/config/media.py +20 -0
  22. lattifai/config/transcription.py +25 -1
  23. lattifai/data/__init__.py +8 -0
  24. lattifai/data/caption.py +228 -0
  25. lattifai/diarization/__init__.py +41 -1
  26. lattifai/errors.py +78 -53
  27. lattifai/event/__init__.py +65 -0
  28. lattifai/event/lattifai.py +166 -0
  29. lattifai/mixin.py +49 -32
  30. lattifai/transcription/base.py +8 -2
  31. lattifai/transcription/gemini.py +147 -16
  32. lattifai/transcription/lattifai.py +25 -63
  33. lattifai/types.py +1 -1
  34. lattifai/utils.py +7 -13
  35. lattifai/workflow/__init__.py +28 -4
  36. lattifai/workflow/file_manager.py +2 -5
  37. lattifai/youtube/__init__.py +43 -0
  38. lattifai/youtube/client.py +1265 -0
  39. lattifai/youtube/types.py +23 -0
  40. lattifai-1.3.0.dist-info/METADATA +678 -0
  41. lattifai-1.3.0.dist-info/RECORD +57 -0
  42. {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/entry_points.txt +1 -2
  43. lattifai/__init__.py +0 -88
  44. lattifai/alignment/sentence_splitter.py +0 -219
  45. lattifai/caption/__init__.py +0 -20
  46. lattifai/caption/caption.py +0 -1467
  47. lattifai/caption/gemini_reader.py +0 -462
  48. lattifai/caption/gemini_writer.py +0 -173
  49. lattifai/caption/supervision.py +0 -34
  50. lattifai/caption/text_parser.py +0 -145
  51. lattifai/cli/app_installer.py +0 -142
  52. lattifai/cli/server.py +0 -44
  53. lattifai/server/app.py +0 -427
  54. lattifai/workflow/youtube.py +0 -577
  55. lattifai-1.2.1.dist-info/METADATA +0 -1134
  56. lattifai-1.2.1.dist-info/RECORD +0 -58
  57. {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/WHEEL +0 -0
  58. {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/licenses/LICENSE +0 -0
  59. {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/top_level.txt +0 -0
lattifai/cli/youtube.py CHANGED
@@ -11,6 +11,7 @@ from lattifai.config import (
11
11
  CaptionConfig,
12
12
  ClientConfig,
13
13
  DiarizationConfig,
14
+ EventConfig,
14
15
  MediaConfig,
15
16
  TranscriptionConfig,
16
17
  )
@@ -25,6 +26,7 @@ def youtube(
25
26
  caption: Annotated[Optional[CaptionConfig], run.Config[CaptionConfig]] = None,
26
27
  transcription: Annotated[Optional[TranscriptionConfig], run.Config[TranscriptionConfig]] = None,
27
28
  diarization: Annotated[Optional[DiarizationConfig], run.Config[DiarizationConfig]] = None,
29
+ event: Annotated[Optional[EventConfig], run.Config[EventConfig]] = None,
28
30
  use_transcription: bool = False,
29
31
  ):
30
32
  """
@@ -44,7 +46,8 @@ def youtube(
44
46
  Args:
45
47
  yt_url: YouTube video URL (can be provided as positional argument)
46
48
  media: Media configuration for controlling formats and output directories.
47
- Fields: input_path (YouTube URL), output_dir, output_format, force_overwrite
49
+ Fields: input_path (YouTube URL), output_dir, output_format, force_overwrite,
50
+ audio_track_id (default: "original"), quality (default: "best")
48
51
  client: API client configuration.
49
52
  Fields: api_key, timeout, max_retries
50
53
  alignment: Alignment configuration (model selection and inference settings).
@@ -113,6 +116,7 @@ def youtube(
113
116
  caption_config=caption_config,
114
117
  transcription_config=transcription,
115
118
  diarization_config=diarization,
119
+ event_config=event,
116
120
  )
117
121
 
118
122
  # Call the client's youtube method
@@ -129,6 +133,8 @@ def youtube(
129
133
  channel_selector=media_config.channel_selector,
130
134
  streaming_chunk_secs=media_config.streaming_chunk_secs,
131
135
  use_transcription=use_transcription,
136
+ audio_track_id=media_config.audio_track_id,
137
+ quality=media_config.quality,
132
138
  )
133
139
 
134
140
 
lattifai/client.py CHANGED
@@ -9,8 +9,16 @@ from lhotse.utils import Pathlike
9
9
 
10
10
  from lattifai.alignment import Lattice1Aligner, Segmenter
11
11
  from lattifai.audio2 import AudioData, AudioLoader
12
- from lattifai.caption import Caption, InputCaptionFormat
13
- from lattifai.config import AlignmentConfig, CaptionConfig, ClientConfig, DiarizationConfig, TranscriptionConfig
12
+ from lattifai.caption import InputCaptionFormat
13
+ from lattifai.config import (
14
+ AlignmentConfig,
15
+ CaptionConfig,
16
+ ClientConfig,
17
+ DiarizationConfig,
18
+ EventConfig,
19
+ TranscriptionConfig,
20
+ )
21
+ from lattifai.data import Caption
14
22
  from lattifai.errors import (
15
23
  AlignmentError,
16
24
  CaptionProcessingError,
@@ -22,6 +30,7 @@ from lattifai.utils import safe_print
22
30
 
23
31
  if TYPE_CHECKING:
24
32
  from lattifai.diarization import LattifAIDiarizer # noqa: F401
33
+ from lattifai.event import LattifAIEventDetector # noqa: F401
25
34
 
26
35
 
27
36
  class LattifAI(LattifAIClientMixin, SyncAPIClient):
@@ -41,6 +50,7 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
41
50
  caption_config: Optional[CaptionConfig] = None,
42
51
  transcription_config: Optional[TranscriptionConfig] = None,
43
52
  diarization_config: Optional[DiarizationConfig] = None,
53
+ event_config: Optional[EventConfig] = None,
44
54
  ) -> None:
45
55
  __doc__ = LattifAIClientMixin._INIT_DOC.format(
46
56
  client_class="LattifAI",
@@ -59,8 +69,8 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
59
69
  self.config = client_config
60
70
 
61
71
  # Initialize all configs with defaults
62
- alignment_config, transcription_config, diarization_config = self._init_configs(
63
- alignment_config, transcription_config, diarization_config
72
+ alignment_config, transcription_config, diarization_config, event_config = self._init_configs(
73
+ alignment_config, transcription_config, diarization_config, event_config
64
74
  )
65
75
 
66
76
  # Store configs
@@ -82,6 +92,14 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
82
92
 
83
93
  self.diarizer = LattifAIDiarizer(config=self.diarization_config)
84
94
 
95
+ # Initialize event detector if enabled
96
+ self.event_config = event_config
97
+ self.event_detector: Optional["LattifAIEventDetector"] = None
98
+ if self.event_config.enabled:
99
+ from lattifai.event import LattifAIEventDetector # noqa: F811
100
+
101
+ self.event_detector = LattifAIEventDetector(config=self.event_config)
102
+
85
103
  # Initialize shared components (transcriber, downloader)
86
104
  self._init_shared_components(transcription_config)
87
105
 
@@ -123,9 +141,16 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
123
141
  alignment_strategy = self.aligner.config.strategy
124
142
 
125
143
  if alignment_strategy != "entire" or caption.transcription:
126
- safe_print(colorful.cyan(f"🔄 Using segmented alignment strategy: {alignment_strategy}"))
144
+ safe_print(colorful.cyan(f"🔄 Using segmented alignment strategy: {alignment_strategy}"))
127
145
 
128
146
  if caption.supervisions and alignment_strategy == "transcription":
147
+ from lattifai.alignment.text_align import align_supervisions_and_transcription
148
+
149
+ if "gemini" in self.transcriber.name.lower():
150
+ raise ValueError(
151
+ f"Transcription-based alignment is not supported for {self.transcriber.name} "
152
+ "(Gemini's timestamp is not reliable)."
153
+ )
129
154
  if not caption.transcription:
130
155
  transcript = self._transcribe(
131
156
  media_audio,
@@ -134,118 +159,30 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
134
159
  output_dir=Path(str(output_caption_path)).parent if output_caption_path else None,
135
160
  )
136
161
  caption.transcription = transcript.supervisions or transcript.transcription
137
- caption.audio_events = transcript.audio_events
138
- assert caption.transcription, "Transcription is empty after transcription step."
139
-
140
- # Align caption.supervisions with transcription to get segments
141
- import regex
142
- from error_align import error_align # noqa: F401
143
- from error_align.utils import DELIMITERS, NUMERIC_TOKEN, STANDARD_TOKEN, OpType
144
-
145
- JOIN_TOKEN = "❄"
146
- if JOIN_TOKEN not in DELIMITERS:
147
- DELIMITERS.add(JOIN_TOKEN)
148
-
149
- def custom_tokenizer(text: str) -> list:
150
- """Default tokenizer that splits text into words based on whitespace.
151
-
152
- Args:
153
- text (str): The input text to tokenize.
154
-
155
- Returns:
156
- list: A list of tokens (words).
157
-
158
- """
159
- # Escape JOIN_TOKEN for use in regex pattern
160
- escaped_join_token = regex.escape(JOIN_TOKEN)
161
- return list(
162
- regex.finditer(
163
- rf"({NUMERIC_TOKEN})|({STANDARD_TOKEN}|{escaped_join_token})",
164
- text,
165
- regex.UNICODE | regex.VERBOSE,
166
- )
167
- )
162
+ caption.event = transcript.event
163
+ if not caption.transcription:
164
+ raise ValueError("Transcription is empty after transcription step.")
168
165
 
169
166
  if split_sentence or self.caption_config.split_sentence:
170
167
  caption.supervisions = self.aligner.tokenizer.split_sentences(caption.supervisions)
171
168
 
172
- ref = f"{JOIN_TOKEN}".join(sup.text for sup in caption.supervisions)
173
- hyp = f"{JOIN_TOKEN}".join(sup.text for sup in caption.transcription)
174
- alignments = error_align(ref, hyp, tokenizer=custom_tokenizer)
175
-
176
- idx = 0
177
- for k, align in enumerate(alignments):
178
- if align.hyp == JOIN_TOKEN and align.op_type == OpType.MATCH:
179
- # safe_print(f"Segment {k}: JOIN_TOKEN detected, creating segment.")
180
-
181
- # Find first non-None ref_slice starting from idx
182
- ref_start = 0
183
- for i in range(idx, k + 1):
184
- if i < len(alignments) and alignments[i].ref_slice is not None:
185
- ref_start = alignments[i].ref_slice.start
186
- break
187
-
188
- # Find last non-None ref_slice up to current position
189
- ref_stop = len(ref)
190
- for i in range(k, idx - 1, -1):
191
- if i < len(alignments) and alignments[i].ref_slice is not None:
192
- ref_stop = alignments[i].ref_slice.stop
193
- break
194
-
195
- # Find first non-None hyp_slice starting from idx
196
- hyp_start = 0
197
- for i in range(idx, k + 1):
198
- if i < len(alignments) and alignments[i].hyp_slice is not None:
199
- hyp_start = alignments[i].hyp_slice.start
200
- break
201
-
202
- # Find last non-None hyp_slice up to current position
203
- hyp_stop = len(hyp)
204
- for i in range(k, idx - 1, -1):
205
- if i < len(alignments) and alignments[i].hyp_slice is not None:
206
- hyp_stop = alignments[i].hyp_slice.stop
207
- break
208
-
209
- safe_print(f"[REF]: {ref[ref_start:ref_stop]}")
210
- safe_print(f"[HYP]: {hyp[hyp_start:hyp_stop]}\n")
211
- idx = k + 1
212
-
213
- # last part - handle remaining alignments after last JOIN_TOKEN
214
- if idx < len(alignments):
215
- # Find first non-None ref_slice starting from idx
216
- ref_start = 0
217
- for i in range(idx, len(alignments)):
218
- if alignments[i].ref_slice is not None:
219
- ref_start = alignments[i].ref_slice.start
220
- break
221
-
222
- # Find last non-None ref_slice from end
223
- ref_stop = len(ref)
224
- for i in range(len(alignments) - 1, idx - 1, -1):
225
- if alignments[i].ref_slice is not None:
226
- ref_stop = alignments[i].ref_slice.stop
227
- break
228
-
229
- # Find first non-None hyp_slice starting from idx
230
- hyp_start = 0
231
- for i in range(idx, len(alignments)):
232
- if alignments[i].hyp_slice is not None:
233
- hyp_start = alignments[i].hyp_slice.start
234
- break
235
-
236
- # Find last non-None hyp_slice from end
237
- hyp_stop = len(hyp)
238
- for i in range(len(alignments) - 1, idx - 1, -1):
239
- if alignments[i].hyp_slice is not None:
240
- hyp_stop = alignments[i].hyp_slice.stop
241
- break
242
-
243
- safe_print(f"[REF]: {ref[ref_start:ref_stop + 1]}")
244
- safe_print(f"[HYP]: {hyp[hyp_start:hyp_stop + 1]}\n")
245
-
246
- raise NotImplementedError("Transcription-based segmentation is not yet implemented.")
169
+ matches = align_supervisions_and_transcription(
170
+ caption, max_duration=media_audio.duration, verbose=True
171
+ )
172
+
173
+ skipalign = False
174
+ matches = sorted(matches, key=lambda x: x[2].WER.WER) # sort by WER
175
+ segments = [(m[3].start[1], m[3].end[1], m, skipalign) for m in matches]
176
+ for segment in segments:
177
+ # transcription segments -> sentence splitting
178
+ segment[2][1] = self.aligner.tokenizer.split_sentences(segment[2][1])
247
179
  else:
248
180
  if caption.transcription:
181
+ if "gemini" in self.transcriber.name.lower():
182
+ raise ValueError(
183
+ f"Transcription-based alignment is not supported for {self.transcriber.name} "
184
+ "(Gemini's timestamp is not reliable)."
185
+ )
249
186
  if not caption.supervisions: # youtube + transcription case
250
187
  segments = [(sup.start, sup.end, [sup], not sup.text) for sup in caption.transcription]
251
188
  else:
@@ -266,7 +203,7 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
266
203
  sr = media_audio.sampling_rate
267
204
  supervisions, alignments = [], []
268
205
  for i, (start, end, _supervisions, skipalign) in enumerate(segments, 1):
269
- print(
206
+ safe_print(
270
207
  colorful.green(
271
208
  f" ⏩ aligning segment {i:04d}/{len(segments):04d}: {start:8.2f}s - {end:8.2f}s"
272
209
  )
@@ -286,8 +223,7 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
286
223
  media_audio,
287
224
  _supervisions,
288
225
  split_sentence=split_sentence or self.caption_config.split_sentence,
289
- return_details=self.caption_config.word_level
290
- or (output_caption_path and str(output_caption_path).endswith(".TextGrid")),
226
+ return_details=True,
291
227
  emission=emission,
292
228
  offset=offset,
293
229
  verbose=False,
@@ -295,14 +231,16 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
295
231
 
296
232
  supervisions.extend(_supervisions)
297
233
  alignments.extend(_alignments)
234
+
235
+ # sort by start
236
+ alignments = sorted(alignments, key=lambda x: x.start)
298
237
  else:
299
238
  # Step 2-4: Standard single-pass alignment
300
239
  supervisions, alignments = self.aligner.alignment(
301
240
  media_audio,
302
241
  caption.supervisions,
303
242
  split_sentence=split_sentence or self.caption_config.split_sentence,
304
- return_details=self.caption_config.word_level
305
- or (output_caption_path and str(output_caption_path).endswith(".TextGrid")),
243
+ return_details=True,
306
244
  )
307
245
 
308
246
  # Update caption with aligned results
@@ -316,13 +254,15 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
316
254
  if self.config.profile:
317
255
  self.aligner.profile()
318
256
 
319
- except (CaptionProcessingError, LatticeEncodingError, AlignmentError, LatticeDecodingError):
257
+ except (CaptionProcessingError, LatticeEncodingError) as e:
320
258
  # Re-raise our specific errors as-is
321
- raise
259
+ raise e
260
+ except LatticeDecodingError as e:
261
+ raise e
322
262
  except Exception as e:
323
263
  # Catch any unexpected errors and wrap them
324
264
  raise AlignmentError(
325
- "Unexpected error during alignment process",
265
+ message="Unexpected error during alignment process",
326
266
  media_path=str(input_media),
327
267
  caption_path=str(input_caption),
328
268
  context={"original_error": str(e), "error_type": e.__class__.__name__},
@@ -337,6 +277,13 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
337
277
  output_caption_path=output_caption_path,
338
278
  )
339
279
 
280
+ # Step 6: Event detection
281
+ if self.event_config.enabled and self.event_detector:
282
+ safe_print(colorful.cyan("🔊 Performing audio event detection..."))
283
+ caption = self.event_detector.detect_and_update_caption(caption, media_audio)
284
+ if output_caption_path:
285
+ self._write_caption(caption, output_caption_path)
286
+
340
287
  return caption
341
288
 
342
289
  def speaker_diarization(
@@ -367,12 +314,12 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
367
314
  diarization_file = Path(str(output_caption_path)).with_suffix(".SpkDiar")
368
315
  if diarization_file.exists():
369
316
  safe_print(colorful.cyan(f"Reading existing speaker diarization from {diarization_file}"))
370
- caption.read_speaker_diarization(diarization_file)
317
+ caption.read_diarization(diarization_file)
371
318
 
372
319
  diarization, alignments = self.diarizer.diarize_with_alignments(
373
320
  input_media,
374
321
  caption.alignments,
375
- diarization=caption.speaker_diarization,
322
+ diarization=caption.diarization,
376
323
  alignment_fn=self.aligner.alignment,
377
324
  transcribe_fn=self.transcriber.transcribe_numpy if self.transcriber else None,
378
325
  separate_fn=self.aligner.separate if self.aligner.worker.separator_ort else None,
@@ -380,7 +327,7 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
380
327
  output_path=output_caption_path,
381
328
  )
382
329
  caption.alignments = alignments
383
- caption.speaker_diarization = diarization
330
+ caption.diarization = diarization
384
331
 
385
332
  # Write output if requested
386
333
  if output_caption_path:
@@ -400,6 +347,8 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
400
347
  use_transcription: bool = False,
401
348
  channel_selector: Optional[str | int] = "average",
402
349
  streaming_chunk_secs: Optional[float] = None,
350
+ audio_track_id: Optional[str] = "original",
351
+ quality: str = "best",
403
352
  ) -> Caption:
404
353
  # Prepare output directory and media format
405
354
  output_dir = self._prepare_youtube_output_dir(output_dir)
@@ -408,7 +357,7 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
408
357
  safe_print(colorful.cyan(f"🎬 Starting YouTube workflow for: {url}"))
409
358
 
410
359
  # Step 1: Download media
411
- media_file = self._download_media_sync(url, output_dir, media_format, force_overwrite)
360
+ media_file = self._download_media_sync(url, output_dir, media_format, force_overwrite, audio_track_id, quality)
412
361
 
413
362
  media_audio = self.audio_loader(
414
363
  media_file, channel_selector=channel_selector, streaming_chunk_secs=streaming_chunk_secs
@@ -1,16 +1,44 @@
1
1
  """Configuration system for LattifAI using nemo_run."""
2
2
 
3
+ # Re-export caption config classes from lattifai-captions package
4
+ from lattifai.caption.config import (
5
+ ALL_CAPTION_FORMATS,
6
+ CAPTION_FORMATS,
7
+ INPUT_CAPTION_FORMATS,
8
+ OUTPUT_CAPTION_FORMATS,
9
+ CaptionFonts,
10
+ CaptionStyle,
11
+ InputCaptionFormat,
12
+ KaraokeConfig,
13
+ OutputCaptionFormat,
14
+ StandardizationConfig,
15
+ )
16
+
3
17
  from .alignment import AlignmentConfig
18
+
19
+ # CaptionConfig is defined in lattifai-python (workflow config)
4
20
  from .caption import CaptionConfig
5
21
  from .client import ClientConfig
6
22
  from .diarization import DiarizationConfig
23
+ from .event import EventConfig
7
24
  from .media import AUDIO_FORMATS, MEDIA_FORMATS, VIDEO_FORMATS, MediaConfig
8
25
  from .transcription import TranscriptionConfig
9
26
 
10
27
  __all__ = [
28
+ "EventConfig",
11
29
  "ClientConfig",
12
30
  "AlignmentConfig",
13
31
  "CaptionConfig",
32
+ "CaptionFonts",
33
+ "CaptionStyle",
34
+ "KaraokeConfig",
35
+ "StandardizationConfig",
36
+ "InputCaptionFormat",
37
+ "OutputCaptionFormat",
38
+ "INPUT_CAPTION_FORMATS",
39
+ "OUTPUT_CAPTION_FORMATS",
40
+ "ALL_CAPTION_FORMATS",
41
+ "CAPTION_FORMATS",
14
42
  "TranscriptionConfig",
15
43
  "DiarizationConfig",
16
44
  "MediaConfig",
@@ -93,6 +93,20 @@ class AlignmentConfig:
93
93
  Default: 0.20. Typical range: 0.0-0.5.
94
94
  """
95
95
 
96
+ boost: float = 5.0
97
+ """Boost for preferring supervisions over transcription in diff alignment decoding graph.
98
+ A positive value encourages the decoder to prefer supervision text over ASR transcription.
99
+ Only effective when strategy='transcription'. Has no effect with 'entire' or 'caption' strategies.
100
+ Default: 5.0. Typical range: 0.0-10.0.
101
+ """
102
+
103
+ transition_penalty: float = 0.0
104
+ """Penalty for token transitions in the decoding graph to discourage duration=1 tokens.
105
+ A negative value penalizes transitions (moving to next token), making the model prefer
106
+ self-loops (staying on current token longer). This helps prevent spurious short-duration alignments.
107
+ Default: 0.0 (no penalty). Typical range: -1.0 to 0.0 (e.g., -0.5).
108
+ """
109
+
96
110
  client_wrapper: Optional["SyncAPIClient"] = field(default=None, repr=False)
97
111
  """Reference to the SyncAPIClient instance. Auto-set during client initialization."""
98
112
 
@@ -1,28 +1,18 @@
1
- """Caption I/O configuration for LattifAI."""
1
+ """Caption I/O configuration for LattifAI SDK."""
2
2
 
3
3
  from dataclasses import dataclass
4
4
  from pathlib import Path
5
- from typing import TYPE_CHECKING, Literal, Optional
5
+ from typing import Optional
6
6
 
7
- from lhotse.utils import Pathlike
8
-
9
- # Supported caption formats for reading/writing
10
- CAPTION_FORMATS = ["srt", "vtt", "ass", "ssa", "sub", "sbv", "txt", "md", "ttml", "sami", "smi"]
11
-
12
- # Input caption formats (includes special formats like 'auto' and 'gemini')
13
- INPUT_CAPTION_FORMATS = ["srt", "vtt", "ass", "ssa", "sub", "sbv", "txt", "ttml", "sami", "smi", "auto", "gemini"]
14
-
15
- # Output caption formats (includes special formats like 'TextGrid' and 'json')
16
- OUTPUT_CAPTION_FORMATS = ["srt", "vtt", "ass", "ssa", "sub", "sbv", "txt", "ttml", "sami", "smi", "TextGrid", "json"]
17
-
18
- # All caption formats combined (for file detection)
19
- ALL_CAPTION_FORMATS = list(set(CAPTION_FORMATS + ["TextGrid", "json", "gemini"]))
20
-
21
- # Type aliases for better type hints
22
- InputCaptionFormat = Literal["auto", "srt", "vtt", "ass", "ssa", "sub", "sbv", "txt", "ttml", "sami", "smi", "gemini"]
23
- OutputCaptionFormat = Literal[
24
- "srt", "vtt", "ass", "ssa", "sub", "sbv", "txt", "ttml", "sami", "smi", "TextGrid", "json"
25
- ]
7
+ from lattifai.caption.config import (
8
+ INPUT_CAPTION_FORMATS,
9
+ OUTPUT_CAPTION_FORMATS,
10
+ InputCaptionFormat,
11
+ KaraokeConfig,
12
+ OutputCaptionFormat,
13
+ StandardizationConfig,
14
+ )
15
+ from lattifai.caption.supervision import Pathlike
26
16
 
27
17
 
28
18
  @dataclass
@@ -34,13 +24,20 @@ class CaptionConfig:
34
24
  """
35
25
 
36
26
  input_format: InputCaptionFormat = "auto"
37
- """Input caption format: 'auto', 'srt', 'vtt', 'ass', 'txt', or 'json'."""
27
+ """Input caption format. Supports: 'auto' (detect),
28
+ standard formats (srt, vtt, ass, ssa, sub, sbv, txt, sami, smi),
29
+ tabular (csv, tsv, aud, json),
30
+ specialized (textgrid, gemini),
31
+ NLE (avid_ds, fcpxml, premiere_xml, audition_csv).
32
+ Note: VTT format auto-detects YouTube VTT with word-level timestamps.
33
+ """
38
34
 
39
35
  input_path: Optional[str] = None
40
36
  """Path to input caption file."""
41
37
 
42
38
  output_format: OutputCaptionFormat = "srt"
43
- """Output caption format: 'srt', 'vtt', 'ass', 'txt', or 'json'."""
39
+ """Output caption format. Supports: standard formats, tabular, specialized, TTML profiles (ttml, imsc1, ebu_tt_d),
40
+ NLE (avid_ds, fcpxml, premiere_xml, audition_csv, edimarker_csv)."""
44
41
 
45
42
  output_path: Optional[str] = None
46
43
  """Path to output caption file."""
@@ -57,12 +54,21 @@ class CaptionConfig:
57
54
  word_level: bool = False
58
55
  """Include word-level timestamps in alignment results (useful for karaoke, dubbing)."""
59
56
 
57
+ karaoke: Optional[KaraokeConfig] = None
58
+ """Karaoke configuration when word_level=True (e.g., ASS \\kf tags, enhanced LRC).
59
+ When None with word_level=True, outputs word-per-segment instead of karaoke styling.
60
+ When provided, karaoke.enabled controls whether karaoke styling is applied."""
61
+
60
62
  encoding: str = "utf-8"
61
63
  """Character encoding for reading/writing caption files (default: utf-8)."""
62
64
 
63
65
  source_lang: Optional[str] = None
64
66
  """Source language code for the caption content (e.g., 'en', 'zh', 'de')."""
65
67
 
68
+ standardization: Optional[StandardizationConfig] = None
69
+ """Standardization configuration for broadcast-grade captions.
70
+ When provided, captions will be standardized according to Netflix/BBC guidelines."""
71
+
66
72
  def __post_init__(self):
67
73
  """Validate configuration after initialization."""
68
74
  self._normalize_paths()
@@ -86,14 +92,17 @@ class CaptionConfig:
86
92
  return True
87
93
 
88
94
  def _normalize_paths(self) -> None:
89
- """Normalize and expand input/output paths."""
95
+ """Normalize and expand input/output paths.
96
+
97
+ Uses Path.resolve() to get absolute paths and prevent path traversal issues.
98
+ """
90
99
  # Expand and normalize input path if provided, but don't require it to exist yet
91
100
  # (it might be set later after downloading captions)
92
101
  if self.input_path is not None:
93
- self.input_path = str(Path(self.input_path).expanduser())
102
+ self.input_path = str(Path(self.input_path).expanduser().resolve())
94
103
 
95
104
  if self.output_path is not None:
96
- self.output_path = str(Path(self.output_path).expanduser())
105
+ self.output_path = str(Path(self.output_path).expanduser().resolve())
97
106
  output_dir = Path(self.output_path).parent
98
107
  output_dir.mkdir(parents=True, exist_ok=True)
99
108
 
@@ -154,7 +163,7 @@ class CaptionConfig:
154
163
  if not self.input_path:
155
164
  raise ValueError("input_path is required but not set in CaptionConfig")
156
165
 
157
- input_file = Path(self.input_path).expanduser()
166
+ input_file = Path(self.input_path).expanduser().resolve()
158
167
  if not input_file.exists():
159
168
  raise FileNotFoundError(
160
169
  f"Input caption file does not exist: '{input_file}'. " "Please check the path and try again."
@@ -164,15 +173,20 @@ class CaptionConfig:
164
173
  f"Input caption path is not a file: '{input_file}'. " "Expected a valid caption file path."
165
174
  )
166
175
 
167
- def check_sanity(self) -> bool:
168
- """Perform sanity checks on the configuration."""
169
- assert self.is_input_path_existed(), "Input caption path must be provided and exist."
176
+ def check_sanity(self) -> None:
177
+ """Perform sanity checks on the configuration.
178
+
179
+ Raises:
180
+ ValueError: If input path is not provided or does not exist.
181
+ """
182
+ if not self.is_input_path_existed():
183
+ raise ValueError("Input caption path must be provided and exist.")
170
184
 
171
185
  def is_input_path_existed(self) -> bool:
172
186
  """Check if input caption path is provided and exists."""
173
187
  if self.input_path is None:
174
188
  return False
175
189
 
176
- input_file = Path(self.input_path).expanduser()
190
+ input_file = Path(self.input_path).expanduser().resolve()
177
191
  self.input_path = str(input_file)
178
192
  return input_file.exists() and input_file.is_file()
lattifai/config/client.py CHANGED
@@ -31,6 +31,13 @@ class ClientConfig:
31
31
  When True, prints detailed timing information for various stages of the process.
32
32
  """
33
33
 
34
+ # Client identification for usage tracking
35
+ client_name: Optional[str] = field(default="python-sdk")
36
+ """Client identifier for usage tracking (e.g., 'python-sdk', 'claude-plugin')."""
37
+
38
+ client_version: Optional[str] = field(default=None)
39
+ """Client version for usage tracking. If None, uses lattifai package version."""
40
+
34
41
  def __post_init__(self):
35
42
  """Validate and auto-populate configuration after initialization."""
36
43
 
@@ -44,6 +51,15 @@ class ClientConfig:
44
51
  if self.api_key is None:
45
52
  object.__setattr__(self, "api_key", os.environ.get("LATTIFAI_API_KEY"))
46
53
 
54
+ # Auto-load client version from package if not provided
55
+ if self.client_version is None:
56
+ try:
57
+ from importlib.metadata import version
58
+
59
+ object.__setattr__(self, "client_version", version("lattifai"))
60
+ except Exception:
61
+ object.__setattr__(self, "client_version", "unknown")
62
+
47
63
  # Validate API parameters
48
64
  if self.timeout <= 0:
49
65
  raise ValueError("timeout must be greater than 0")