lattifai 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. lattifai/_init.py +20 -0
  2. lattifai/alignment/__init__.py +9 -1
  3. lattifai/alignment/lattice1_aligner.py +175 -54
  4. lattifai/alignment/lattice1_worker.py +47 -4
  5. lattifai/alignment/punctuation.py +38 -0
  6. lattifai/alignment/segmenter.py +3 -2
  7. lattifai/alignment/text_align.py +441 -0
  8. lattifai/alignment/tokenizer.py +134 -65
  9. lattifai/audio2.py +162 -183
  10. lattifai/cli/__init__.py +2 -1
  11. lattifai/cli/alignment.py +5 -0
  12. lattifai/cli/caption.py +111 -4
  13. lattifai/cli/transcribe.py +2 -6
  14. lattifai/cli/youtube.py +7 -1
  15. lattifai/client.py +72 -123
  16. lattifai/config/__init__.py +28 -0
  17. lattifai/config/alignment.py +14 -0
  18. lattifai/config/caption.py +45 -31
  19. lattifai/config/client.py +16 -0
  20. lattifai/config/event.py +102 -0
  21. lattifai/config/media.py +20 -0
  22. lattifai/config/transcription.py +25 -1
  23. lattifai/data/__init__.py +8 -0
  24. lattifai/data/caption.py +228 -0
  25. lattifai/diarization/__init__.py +41 -1
  26. lattifai/errors.py +78 -53
  27. lattifai/event/__init__.py +65 -0
  28. lattifai/event/lattifai.py +166 -0
  29. lattifai/mixin.py +49 -32
  30. lattifai/transcription/base.py +8 -2
  31. lattifai/transcription/gemini.py +147 -16
  32. lattifai/transcription/lattifai.py +25 -63
  33. lattifai/types.py +1 -1
  34. lattifai/utils.py +7 -13
  35. lattifai/workflow/__init__.py +28 -4
  36. lattifai/workflow/file_manager.py +2 -5
  37. lattifai/youtube/__init__.py +43 -0
  38. lattifai/youtube/client.py +1265 -0
  39. lattifai/youtube/types.py +23 -0
  40. lattifai-1.3.0.dist-info/METADATA +678 -0
  41. lattifai-1.3.0.dist-info/RECORD +57 -0
  42. {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/entry_points.txt +1 -2
  43. lattifai/__init__.py +0 -88
  44. lattifai/alignment/sentence_splitter.py +0 -219
  45. lattifai/caption/__init__.py +0 -20
  46. lattifai/caption/caption.py +0 -1467
  47. lattifai/caption/gemini_reader.py +0 -462
  48. lattifai/caption/gemini_writer.py +0 -173
  49. lattifai/caption/supervision.py +0 -34
  50. lattifai/caption/text_parser.py +0 -145
  51. lattifai/cli/app_installer.py +0 -142
  52. lattifai/cli/server.py +0 -44
  53. lattifai/server/app.py +0 -427
  54. lattifai/workflow/youtube.py +0 -577
  55. lattifai-1.2.1.dist-info/METADATA +0 -1134
  56. lattifai-1.2.1.dist-info/RECORD +0 -58
  57. {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/WHEEL +0 -0
  58. {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/licenses/LICENSE +0 -0
  59. {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/top_level.txt +0 -0
lattifai/errors.py CHANGED
@@ -1,10 +1,42 @@
1
1
  """Error handling and exception classes for LattifAI SDK."""
2
2
 
3
+ import functools
3
4
  import traceback
4
5
  from typing import Any, Dict, Optional
5
6
 
6
7
  import colorful
7
8
 
9
+
10
def format_exception(e: "LattifAIError") -> str:
    """Format an exception's traceback, keeping only frames that mention lattifai.

    ``traceback.format_exception`` returns one string per stack frame; each
    frame string holds the ``File "...", line N, in func`` header and, when the
    source is available, the source line (and any caret markers) after an
    internal newline. Frames are therefore kept or dropped as whole entries,
    decided by the header line only, so that a third-party frame whose *source
    text* happens to mention "lattifai" is not retained by accident.

    Args:
        e: The exception to format. Its ``__traceback__`` is used as the stack.

    Returns:
        The traceback text with the ``Traceback ...`` banner, chained-exception
        separators and the final exception message preserved, and with every
        frame whose header does not contain ``"lattifai"`` removed.
    """
    tb_lines = traceback.format_exception(type(e), e, e.__traceback__)
    kept = []

    for entry in tb_lines:
        # Only the first physical line identifies the frame (path + function).
        header = entry.split("\n", 1)[0]
        is_frame = header.lstrip().startswith('File "')

        # Non-frame entries (banner, chaining notes, exception message) are
        # always kept; frame entries are kept only for lattifai code.
        if not is_frame or "lattifai" in header:
            kept.append(entry)

    return "".join(kept)
32
+
33
+
34
+ def _merge_context(kwargs: Dict[str, Any], updates: Dict[str, Any]) -> None:
35
+ """Merge updates into kwargs['context'], creating it if needed."""
36
+ context = kwargs.setdefault("context", {})
37
+ context.update(updates)
38
+
39
+
8
40
  # Error help messages
9
41
  LATTICE_DECODING_FAILURE_HELP = (
10
42
  "Failed to decode lattice alignment. Possible reasons:\n\n"
@@ -76,10 +108,8 @@ class AudioProcessingError(LattifAIError):
76
108
  """Error during audio processing operations."""
77
109
 
78
110
  def __init__(self, message: str, media_path: Optional[str] = None, **kwargs):
79
- context = kwargs.get("context", {})
80
111
  if media_path:
81
- context["media_path"] = media_path
82
- kwargs["context"] = context
112
+ _merge_context(kwargs, {"media_path": media_path})
83
113
  super().__init__(message, **kwargs)
84
114
 
85
115
 
@@ -90,11 +120,9 @@ class AudioLoadError(AudioProcessingError):
90
120
  message = f"Failed to load audio file: {colorful.red(media_path)}"
91
121
  if original_error:
92
122
  message += f" - {colorful.red(str(original_error))}"
93
-
94
- context = kwargs.get("context", {})
95
- context.update({"media_path": media_path, "original_error": str(original_error) if original_error else None})
96
- kwargs["context"] = context
97
-
123
+ _merge_context(
124
+ kwargs, {"media_path": media_path, "original_error": str(original_error) if original_error else None}
125
+ )
98
126
  super().__init__(message, media_path=media_path, **kwargs)
99
127
 
100
128
 
@@ -103,9 +131,7 @@ class AudioFormatError(AudioProcessingError):
103
131
 
104
132
  def __init__(self, media_path: str, format_issue: str, **kwargs):
105
133
  message = f"Audio format error for {colorful.red(media_path)}: {colorful.red(format_issue)}"
106
- context = kwargs.get("context", {})
107
- context.update({"media_path": media_path, "format_issue": format_issue})
108
- kwargs["context"] = context
134
+ _merge_context(kwargs, {"media_path": media_path, "format_issue": format_issue})
109
135
  super().__init__(message, media_path=media_path, **kwargs)
110
136
 
111
137
 
@@ -113,10 +139,8 @@ class CaptionProcessingError(LattifAIError):
113
139
  """Error during caption/text processing operations."""
114
140
 
115
141
  def __init__(self, message: str, caption_path: Optional[str] = None, **kwargs):
116
- context = kwargs.get("context", {})
117
142
  if caption_path:
118
- context["caption_path"] = caption_path
119
- kwargs["context"] = context
143
+ _merge_context(kwargs, {"caption_path": caption_path})
120
144
  super().__init__(message, **kwargs)
121
145
 
122
146
 
@@ -125,9 +149,7 @@ class CaptionParseError(CaptionProcessingError):
125
149
 
126
150
  def __init__(self, caption_path: str, parse_issue: str, **kwargs):
127
151
  message = f"Failed to parse caption file {caption_path}: {parse_issue}"
128
- context = kwargs.get("context", {})
129
- context.update({"caption_path": caption_path, "parse_issue": parse_issue})
130
- kwargs["context"] = context
152
+ _merge_context(kwargs, {"caption_path": caption_path, "parse_issue": parse_issue})
131
153
  super().__init__(message, caption_path=caption_path, **kwargs)
132
154
 
133
155
 
@@ -135,12 +157,13 @@ class AlignmentError(LattifAIError):
135
157
  """Error during audio-text alignment process."""
136
158
 
137
159
  def __init__(self, message: str, media_path: Optional[str] = None, caption_path: Optional[str] = None, **kwargs):
138
- context = kwargs.get("context", {})
160
+ updates = {}
139
161
  if media_path:
140
- context["media_path"] = media_path
162
+ updates["media_path"] = media_path
141
163
  if caption_path:
142
- context["caption_path"] = caption_path
143
- kwargs["context"] = context
164
+ updates["caption_path"] = caption_path
165
+ if updates:
166
+ _merge_context(kwargs, updates)
144
167
  super().__init__(message, **kwargs)
145
168
 
146
169
 
@@ -151,36 +174,44 @@ class LatticeEncodingError(AlignmentError):
151
174
  message = "Failed to generate lattice graph from text"
152
175
  if original_error:
153
176
  message += f": {colorful.red(str(original_error))}"
154
-
155
- context = kwargs.get("context", {})
156
- context.update(
177
+ text_preview = text_content[:100] + "..." if len(text_content) > 100 else text_content
178
+ _merge_context(
179
+ kwargs,
157
180
  {
158
181
  "text_content_length": len(text_content),
159
- "text_preview": text_content[:100] + "..." if len(text_content) > 100 else text_content,
182
+ "text_preview": text_preview,
160
183
  "original_error": str(original_error) if original_error else None,
161
- }
184
+ },
162
185
  )
163
- kwargs["context"] = context
164
186
  super().__init__(message, **kwargs)
165
187
 
166
188
 
167
189
  class LatticeDecodingError(AlignmentError):
168
190
  """Error decoding lattice alignment results."""
169
191
 
170
- def __init__(self, lattice_id: str, original_error: Optional[Exception] = None, **kwargs):
171
- message = f"Failed to decode lattice alignment results for lattice ID: {colorful.red(lattice_id)}"
192
+ def __init__(
193
+ self,
194
+ lattice_id: str,
195
+ message: Optional[str] = None,
196
+ original_error: Optional[Exception] = None,
197
+ skip_help: bool = False,
198
+ **kwargs,
199
+ ):
200
+ message = message or f"Failed to decode lattice alignment results for lattice ID: {colorful.red(lattice_id)}"
172
201
 
173
- # Don't duplicate the help message if it's already in original_error
174
- if original_error and str(original_error) != LATTICE_DECODING_FAILURE_HELP:
175
- message += f" - {colorful.red(str(original_error))}"
202
+ error_str = str(original_error) if original_error else None
203
+ is_help_message = error_str == LATTICE_DECODING_FAILURE_HELP
204
+
205
+ if original_error and not is_help_message:
206
+ message += f" - {colorful.red(error_str)}"
207
+
208
+ context_updates = {"lattice_id": lattice_id}
209
+ if original_error and not is_help_message:
210
+ context_updates["original_error"] = error_str
211
+ _merge_context(kwargs, context_updates)
176
212
 
177
- context = kwargs.get("context", {})
178
- # Don't store the entire help message in context to avoid duplication
179
- if original_error and str(original_error) != LATTICE_DECODING_FAILURE_HELP:
180
- context["original_error"] = str(original_error)
181
- context["lattice_id"] = lattice_id
182
- kwargs["context"] = context
183
213
  super().__init__(message, **kwargs)
214
+ self.skip_help = skip_help
184
215
 
185
216
  def get_message(self) -> str:
186
217
  """Return formatted error message with help text."""
@@ -188,8 +219,9 @@ class LatticeDecodingError(AlignmentError):
188
219
  if self.context and self.context.get("lattice_id"):
189
220
  # Only show essential context (lattice_id), not the duplicated help message
190
221
  base_message += f'\n{colorful.yellow("Lattice ID:")} {self.context["lattice_id"]}'
191
- # Append help message once at the end
192
- base_message += f"\n\n{colorful.yellow(LATTICE_DECODING_FAILURE_HELP)}"
222
+ # Append help message only if not skipped (e.g., when anomaly info is provided)
223
+ if not self.skip_help:
224
+ base_message += f"\n\n{colorful.yellow(LATTICE_DECODING_FAILURE_HELP)}"
193
225
  return base_message
194
226
 
195
227
 
@@ -200,10 +232,9 @@ class ModelLoadError(LattifAIError):
200
232
  message = f"Failed to load model: {colorful.red(model_name)}"
201
233
  if original_error:
202
234
  message += f" - {colorful.red(str(original_error))}"
203
-
204
- context = kwargs.get("context", {})
205
- context.update({"model_name": model_name, "original_error": str(original_error) if original_error else None})
206
- kwargs["context"] = context
235
+ _merge_context(
236
+ kwargs, {"model_name": model_name, "original_error": str(original_error) if original_error else None}
237
+ )
207
238
  super().__init__(message, **kwargs)
208
239
 
209
240
 
@@ -214,10 +245,7 @@ class DependencyError(LattifAIError):
214
245
  message = f"Missing required dependency: {colorful.red(dependency_name)}"
215
246
  if install_command:
216
247
  message += f"\nPlease install it using: {colorful.yellow(install_command)}"
217
-
218
- context = kwargs.get("context", {})
219
- context.update({"dependency_name": dependency_name, "install_command": install_command})
220
- kwargs["context"] = context
248
+ _merge_context(kwargs, {"dependency_name": dependency_name, "install_command": install_command})
221
249
  super().__init__(message, **kwargs)
222
250
 
223
251
 
@@ -225,9 +253,7 @@ class APIError(LattifAIError):
225
253
  """Error communicating with LattifAI API."""
226
254
 
227
255
  def __init__(self, message: str, status_code: Optional[int] = None, response_text: Optional[str] = None, **kwargs):
228
- context = kwargs.get("context", {})
229
- context.update({"status_code": status_code, "response_text": response_text})
230
- kwargs["context"] = context
256
+ _merge_context(kwargs, {"status_code": status_code, "response_text": response_text})
231
257
  super().__init__(message, **kwargs)
232
258
 
233
259
 
@@ -249,14 +275,13 @@ class QuotaExceededError(APIError):
249
275
  def handle_exception(func):
250
276
  """Decorator to handle exceptions and convert them to LattifAI errors."""
251
277
 
278
+ @functools.wraps(func)
252
279
  def wrapper(*args, **kwargs):
253
280
  try:
254
281
  return func(*args, **kwargs)
255
282
  except LattifAIError:
256
- # Re-raise LattifAI errors as-is
257
283
  raise
258
284
  except Exception as e:
259
- # Convert other exceptions to LattifAI errors
260
285
  error_msg = f"Unexpected error in {func.__name__}: {str(e)}"
261
286
  context = {
262
287
  "function": func.__name__,
@@ -0,0 +1,65 @@
1
+ """Audio Event Detection module for LattifAI.
2
+
3
+ This module provides audio event detection capabilities. It can identify various
4
+ audio events including speech, music, singing, and demographic characteristics
5
+ (male, female, child voices).
6
+
7
+ Key Components:
8
+ LattifAIEventDetector: Main class that wraps lattifai_core's
9
+ EventDetector for seamless integration with LattifAI workflows.
10
+
11
+ Features:
12
+ - Multi-class audio event detection (30+ reduced classes or 400+ full classes)
13
+ - Voice Activity Detection (VAD) for speech segmentation
14
+ - Gender/age classification for speech segments
15
+ - Configurable detection thresholds and top-k filtering
16
+ - Support for both bundled and custom pretrained models
17
+
18
+ Detected Event Types:
19
+ - Speech: General speech activity
20
+ - Male/Female/Child: Speaker demographic classification
21
+ - Music: Musical content detection
22
+ - Singing: Vocal music detection
23
+ - Synthetic: Synthetic/electronic sounds
24
+
25
+ Configuration:
26
+ Use EventConfig to control:
27
+ - enabled: Whether to run audio event detection
28
+ - device: GPU/CPU device selection
29
+ - dtype: Model precision (float32, float16, bfloat16)
30
+ - reduced: Use reduced label set (33 vs 400+ classes)
31
+ - top_k: Number of top event classes to detect
32
+ - vad_chunk_size/vad_max_gap: VAD segmentation parameters
33
+
34
+ Example:
35
+ >>> from lattifai.event import LattifAIEventDetector
36
+ >>> from lattifai.config import EventConfig
37
+ >>> from lattifai.audio2 import AudioLoader
38
+ >>>
39
+ >>> config = EventConfig(enabled=True, device="cuda")
40
+ >>> detector = LattifAIEventDetector(config)
41
+ >>>
42
+ >>> audio = AudioLoader.load("speech.wav")
43
+ >>> result = detector.detect(audio)
44
+ >>>
45
+ >>> # Access VAD segments directly
46
+ >>> for start, end in result.vad_segments:
47
+ ... print(f"Speech: {start:.2f} - {end:.2f}")
48
+ >>>
49
+ >>> # Or access the full TextGrid
50
+ >>> print(result.audio_events)
51
+
52
+ Performance Notes:
53
+ - GPU acceleration provides significant speedup (10x+ over CPU)
54
+ - Use dtype="float16" for faster inference with minimal accuracy loss
55
+ - fast_mode=True reduces computation by only detecting top_k classes
56
+ - Long audio files are automatically chunked to manage memory
57
+
58
+ See Also:
59
+ - lattifai.config.EventConfig: Configuration options
60
+ - lattifai_core.event: Core event detection implementation
61
+ """
62
+
63
+ from .lattifai import LattifAIEventDetector
64
+
65
+ __all__ = ["LattifAIEventDetector"]
@@ -0,0 +1,166 @@
1
+ """LattifAI Audio Event Detection implementation."""
2
+
3
+ import logging
4
+ from typing import TYPE_CHECKING, Optional
5
+
6
+ from lattifai.audio2 import AudioData
7
+ from lattifai.config.event import EventConfig
8
+ from lattifai.logging import get_logger
9
+
10
+ if TYPE_CHECKING:
11
+ from lattifai_core.event import LEDOutput
12
+
13
+ from lattifai.data import Caption
14
+
15
+
16
+ formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
17
+ logging.basicConfig(format=formatter, level=logging.INFO)
18
+
19
+
20
class LattifAIEventDetector:
    """
    LattifAI Audio Event Detector.

    This class provides a high-level interface for audio event detection,
    wrapping the core LattifAIEventDetector from lattifai_core. The core
    detector is created lazily, on first access of :attr:`detector`.

    Attributes:
        config: EventConfig configuration object.
        logger: Logger returned by ``get_logger("event")``.

    Example:
        >>> from lattifai.event import LattifAIEventDetector
        >>> from lattifai.config import EventConfig
        >>>
        >>> config = EventConfig(enabled=True, device="cuda")
        >>> detector = LattifAIEventDetector(config)
        >>>
        >>> # Detect events from audio data
        >>> result = detector.detect(audio_data)
        >>>
        >>> # Access VAD segments directly
        >>> for start, end in result.vad_segments:
        ...     print(f"Speech: {start:.2f} - {end:.2f}")
        >>>
        >>> # Or access the full TextGrid
        >>> for tier in result.audio_events.tiers:
        ...     print(f"Event type: {tier.name}")
    """

    def __init__(self, config: EventConfig):
        """
        Initialize LattifAI Audio Event Detector.

        Args:
            config: EventConfig configuration.
        """
        self.config = config
        self.logger = get_logger("event")
        # Populated on first use by the ``detector`` property.
        self._detector = None

    @property
    def name(self) -> str:
        """Human-readable name of the detector."""
        return "LattifAI_EventDetector"

    @property
    def detector(self):
        """Lazy-load and return the audio event detector."""
        if self._detector is None:
            # Imported here so lattifai_core.event is only required when
            # detection is actually used.
            from lattifai_core.event import LattifAIEventDetector as CoreEventDetector

            self._detector = CoreEventDetector.from_pretrained(
                model_path=self.config.model_path,
                device=self.config.device,
                client_wrapper=self.config.client_wrapper,
            )
        return self._detector

    def _detection_kwargs(
        self,
        vad_chunk_size: Optional[float],
        vad_max_gap: Optional[float],
        fast_mode: Optional[bool],
    ) -> dict:
        """Resolve per-call overrides against the config defaults.

        An override is used whenever it is not ``None``; an explicit ``0`` /
        ``0.0`` / ``False`` is therefore honored instead of silently falling
        back to the configured value.
        """
        cfg = self.config
        return {
            "vad_chunk_size": cfg.vad_chunk_size if vad_chunk_size is None else vad_chunk_size,
            "vad_max_gap": cfg.vad_max_gap if vad_max_gap is None else vad_max_gap,
            "fast_mode": cfg.fast_mode if fast_mode is None else fast_mode,
            "custom_aliases": cfg.event_aliases or {},
        }

    def detect(
        self,
        input_media: AudioData,
        vad_chunk_size: Optional[float] = None,
        vad_max_gap: Optional[float] = None,
        fast_mode: Optional[bool] = None,
    ) -> "LEDOutput":
        """
        Detect audio events in the input audio.

        Args:
            input_media: Audio data to analyze.
            vad_chunk_size: Override config vad_chunk_size (``None`` = use config).
            vad_max_gap: Override config vad_max_gap (``None`` = use config).
            fast_mode: Override config fast_mode (``None`` = use config).

        Returns:
            LEDOutput containing audio_events, event_names, vad_segments.
        """
        return self.detector(
            audio=input_media,
            **self._detection_kwargs(vad_chunk_size, vad_max_gap, fast_mode),
        )

    def profiling(self, reset: bool = False) -> str:
        """Get profiling information for the detector.

        Returns an empty string when the core detector has not been loaded yet,
        so that asking for profiling data never triggers a model load.
        """
        if self._detector is None:
            return ""
        return self.detector.profiling(reset=reset, logger=self.logger)

    def detect_and_update_caption(
        self,
        caption: "Caption",
        input_media: AudioData,
        vad_chunk_size: Optional[float] = None,
        vad_max_gap: Optional[float] = None,
        fast_mode: Optional[bool] = None,
    ) -> "Caption":
        """
        Run event detection and update caption with audio events.

        This is the main entry point for integrating event detection with alignment.
        When ``config.event_matching`` is enabled, the caption's supervisions
        (``caption.alignments`` if non-empty, otherwise ``caption.supervisions``)
        are passed through the core detector's ``detect_and_update_supervisions``
        and written back to the same attribute. Otherwise only plain detection
        is run.

        Args:
            caption: Caption to update with event detection results.
            input_media: AudioData to analyze.
            vad_chunk_size: Override config vad_chunk_size (``None`` = use config).
            vad_max_gap: Override config vad_max_gap (``None`` = use config).
            fast_mode: Override config fast_mode (``None`` = use config).

        Returns:
            The same Caption object, with its ``event`` field populated.
        """
        if not self.config.event_matching:
            # Simple detection without event matching
            caption.event = self.detect(
                input_media=input_media,
                vad_chunk_size=vad_chunk_size,
                vad_max_gap=vad_max_gap,
                fast_mode=fast_mode,
            )
            return caption

        # Event matching: update caption timestamps based on detected events.
        # Aligned supervisions take precedence when present.
        supervisions = caption.alignments or caption.supervisions

        led_output, supervisions = self.detector.detect_and_update_supervisions(
            supervisions=supervisions,
            audio=input_media,
            **self._detection_kwargs(vad_chunk_size, vad_max_gap, fast_mode),
            extra_events=self.config.extra_events or None,
            time_tolerance=self.config.time_tolerance,
            update_timestamps=self.config.update_timestamps,
            duplicate_strategy=self.config.duplicate_strategy,
        )
        # Store LEDOutput in caption
        caption.event = led_output

        # Write the updated supervisions back to whichever list they came from.
        if caption.alignments:
            caption.alignments = supervisions
        else:
            caption.supervisions = supervisions

        return caption
lattifai/mixin.py CHANGED
@@ -8,12 +8,12 @@ import colorful
8
8
  from lhotse.utils import Pathlike
9
9
 
10
10
  from lattifai.audio2 import AudioData
11
- from lattifai.caption import Caption
11
+ from lattifai.data import Caption
12
12
  from lattifai.errors import CaptionProcessingError
13
13
  from lattifai.utils import safe_print
14
14
 
15
15
  if TYPE_CHECKING:
16
- from .config import AlignmentConfig, CaptionConfig, ClientConfig, DiarizationConfig, TranscriptionConfig
16
+ from .config import AlignmentConfig, DiarizationConfig, EventConfig, TranscriptionConfig
17
17
 
18
18
 
19
19
  class LattifAIClientMixin:
@@ -170,9 +170,10 @@ class LattifAIClientMixin:
170
170
  alignment_config: Optional["AlignmentConfig"],
171
171
  transcription_config: Optional["TranscriptionConfig"],
172
172
  diarization_config: Optional["DiarizationConfig"] = None,
173
+ event_config: Optional["EventConfig"] = None,
173
174
  ) -> tuple:
174
175
  """Initialize all configs with defaults if not provided."""
175
- from .config import AlignmentConfig, DiarizationConfig, TranscriptionConfig
176
+ from .config import AlignmentConfig, DiarizationConfig, EventConfig, TranscriptionConfig
176
177
 
177
178
  if alignment_config is None:
178
179
  alignment_config = AlignmentConfig()
@@ -180,20 +181,24 @@ class LattifAIClientMixin:
180
181
  transcription_config = TranscriptionConfig()
181
182
  if diarization_config is None:
182
183
  diarization_config = DiarizationConfig()
184
+ if event_config is None:
185
+ event_config = EventConfig()
183
186
 
184
187
  from lattifai.utils import _resolve_model_path
185
188
 
186
- if transcription_config is not None:
187
- transcription_config.lattice_model_path = _resolve_model_path(
188
- alignment_config.model_name, getattr(alignment_config, "model_hub", "huggingface")
189
- )
189
+ model_path = _resolve_model_path(
190
+ alignment_config.model_name, getattr(alignment_config, "model_hub", "modelscope")
191
+ )
192
+ transcription_config.lattice_model_path = model_path
193
+ event_config.model_path = model_path
190
194
 
191
195
  # Set client_wrapper for all configs
192
196
  alignment_config.client_wrapper = self
193
197
  transcription_config.client_wrapper = self
194
198
  diarization_config.client_wrapper = self
199
+ event_config.client_wrapper = self
195
200
 
196
- return alignment_config, transcription_config, diarization_config
201
+ return alignment_config, transcription_config, diarization_config, event_config
197
202
 
198
203
  def _init_shared_components(
199
204
  self,
@@ -220,19 +225,16 @@ class LattifAIClientMixin:
220
225
  def downloader(self):
221
226
  """Lazy load YouTube downloader."""
222
227
  if self._downloader is None:
223
- from .workflow.youtube import YouTubeDownloader
228
+ from .youtube import YouTubeDownloader
224
229
 
225
230
  self._downloader = YouTubeDownloader()
226
231
  return self._downloader
227
232
 
228
233
  def _prepare_youtube_output_dir(self, output_dir: Optional["Pathlike"]) -> Path:
229
234
  """Prepare and return output directory for YouTube downloads."""
230
- if output_dir is None:
231
- output_dir = Path(tempfile.gettempdir()) / "lattifai_youtube"
232
- else:
233
- output_dir = Path(output_dir).expanduser()
234
- output_dir.mkdir(parents=True, exist_ok=True)
235
- return output_dir
235
+ output_path = Path(output_dir).expanduser() if output_dir else Path(tempfile.gettempdir()) / "lattifai_youtube"
236
+ output_path.mkdir(parents=True, exist_ok=True)
237
+ return output_path
236
238
 
237
239
  def _determine_media_format(self, media_format: Optional[str]) -> str:
238
240
  """Determine media format from parameter or config."""
@@ -242,11 +244,11 @@ class LattifAIClientMixin:
242
244
  self, output_caption_path: Optional["Pathlike"], media_file: str, output_dir: Path
243
245
  ) -> Path:
244
246
  """Generate output caption path if not provided."""
245
- if not output_caption_path:
246
- media_name = Path(media_file).stem
247
- output_format = self.caption_config.output_format or "srt"
248
- output_caption_path = output_dir / f"{media_name}_LattifAI.{output_format}"
249
- return Path(output_caption_path)
247
+ if output_caption_path:
248
+ return Path(output_caption_path)
249
+ media_name = Path(media_file).stem
250
+ output_format = self.caption_config.output_format or "srt"
251
+ return output_dir / f"{media_name}_LattifAI.{output_format}"
250
252
 
251
253
  def _validate_transcription_setup(self) -> None:
252
254
  """Validate that transcription is properly configured if requested."""
@@ -287,18 +289,18 @@ class LattifAIClientMixin:
287
289
  format=input_caption_format,
288
290
  normalize_text=normalize_text if normalize_text is not None else self.caption_config.normalize_text,
289
291
  )
290
- diarization_file = Path(str(input_caption)).with_suffix(".SpkDiar")
292
+ diarization_file = Path(str(input_caption)).with_suffix(".Diarization")
291
293
  if diarization_file.exists():
292
294
  if verbose:
293
295
  safe_print(colorful.cyan(f"📖 Step1b: Reading speaker diarization from {diarization_file}"))
294
- caption.read_speaker_diarization(diarization_file)
295
- events_file = Path(str(input_caption)).with_suffix(".AED")
296
- if events_file.exists():
296
+ caption.read_diarization(diarization_file)
297
+ event_file = Path(str(input_caption)).with_suffix(".LED")
298
+ if event_file.exists():
297
299
  if verbose:
298
- safe_print(colorful.cyan(f"📖 Step1c: Reading audio events from {events_file}"))
299
- from tgt import read_textgrid
300
+ safe_print(colorful.cyan(f"📖 Step1c: Reading audio events from {event_file}"))
301
+ from lattifai_core.event import LEDOutput
300
302
 
301
- caption.audio_events = read_textgrid(events_file)
303
+ caption.event = LEDOutput.read(event_file)
302
304
 
303
305
  if verbose:
304
306
  safe_print(colorful.green(f" ✓ Parsed {len(caption)} caption segments"))
@@ -332,11 +334,13 @@ class LattifAIClientMixin:
332
334
  result = caption.write(
333
335
  output_caption_path,
334
336
  include_speaker_in_text=self.caption_config.include_speaker_in_text,
337
+ word_level=self.caption_config.word_level,
338
+ karaoke_config=self.caption_config.karaoke,
335
339
  )
336
340
  diarization_file = Path(str(output_caption_path)).with_suffix(".SpkDiar")
337
- if not diarization_file.exists() and caption.speaker_diarization:
341
+ if not diarization_file.exists() and caption.diarization:
338
342
  safe_print(colorful.green(f" Writing speaker diarization to: {diarization_file}"))
339
- caption.write_speaker_diarization(diarization_file)
343
+ caption.write_diarization(diarization_file)
340
344
 
341
345
  safe_print(colorful.green(f"🎉🎉🎉🎉🎉 Caption file written to: {output_caption_path}"))
342
346
  return result
@@ -353,14 +357,22 @@ class LattifAIClientMixin:
353
357
  output_dir: Path,
354
358
  media_format: str,
355
359
  force_overwrite: bool,
360
+ audio_track_id: Optional[str] = "original",
361
+ quality: str = "best",
356
362
  ) -> str:
357
363
  """Download media from YouTube (async implementation)."""
358
364
  safe_print(colorful.cyan("📥 Downloading media from YouTube..."))
365
+ if audio_track_id:
366
+ safe_print(colorful.cyan(f" Audio track: {audio_track_id}"))
367
+ if quality != "best":
368
+ safe_print(colorful.cyan(f" Quality: {quality}"))
359
369
  media_file = await self.downloader.download_media(
360
370
  url=url,
361
371
  output_dir=str(output_dir),
362
372
  media_format=media_format,
363
373
  force_overwrite=force_overwrite,
374
+ audio_track_id=audio_track_id,
375
+ quality=quality,
364
376
  )
365
377
  safe_print(colorful.green(f" ✓ Media downloaded: {media_file}"))
366
378
  return media_file
@@ -371,11 +383,15 @@ class LattifAIClientMixin:
371
383
  output_dir: Path,
372
384
  media_format: str,
373
385
  force_overwrite: bool,
386
+ audio_track_id: Optional[str] = "original",
387
+ quality: str = "best",
374
388
  ) -> str:
375
389
  """Download media from YouTube (sync wrapper)."""
376
390
  import asyncio
377
391
 
378
- return asyncio.run(self._download_media(url, output_dir, media_format, force_overwrite))
392
+ return asyncio.run(
393
+ self._download_media(url, output_dir, media_format, force_overwrite, audio_track_id, quality)
394
+ )
379
395
 
380
396
  def _transcribe(
381
397
  self,
@@ -408,7 +424,7 @@ class LattifAIClientMixin:
408
424
  # Generate transcript file path
409
425
  transcript_file = output_dir / f"{Path(str(media_file)).stem}_{self.transcriber.file_name}"
410
426
  if transcript_file.exists():
411
- safe_print(colorful.cyan(f" Using existing transcript file: {transcript_file}"))
427
+ safe_print(colorful.cyan(f" Using existing transcript file: {transcript_file}"))
412
428
  transcription = self._read_caption(transcript_file, normalize_text=False)
413
429
  return transcription
414
430
 
@@ -485,11 +501,12 @@ class LattifAIClientMixin:
485
501
  """
486
502
  import asyncio
487
503
 
488
- from lattifai.workflow.youtube import TRANSCRIBE_CHOICE
504
+ from lattifai.workflow.file_manager import TRANSCRIBE_CHOICE
489
505
 
490
506
  transcriber_name = self.transcriber.name
491
507
 
492
508
  async def _async_impl():
509
+ nonlocal use_transcription # Allow modification of outer variable
493
510
  # First check if caption input_path is already provided
494
511
  if self.caption_config.input_path:
495
512
  caption_path = Path(self.caption_config.input_path)