lattifai 1.2.2__py3-none-any.whl → 1.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. lattifai/_init.py +20 -0
  2. lattifai/alignment/__init__.py +2 -3
  3. lattifai/alignment/lattice1_aligner.py +117 -4
  4. lattifai/alignment/lattice1_worker.py +47 -4
  5. lattifai/alignment/segmenter.py +3 -2
  6. lattifai/alignment/text_align.py +2 -1
  7. lattifai/alignment/tokenizer.py +56 -29
  8. lattifai/audio2.py +162 -183
  9. lattifai/cli/alignment.py +5 -0
  10. lattifai/cli/caption.py +6 -6
  11. lattifai/cli/transcribe.py +1 -5
  12. lattifai/cli/youtube.py +3 -0
  13. lattifai/client.py +41 -12
  14. lattifai/config/__init__.py +21 -3
  15. lattifai/config/alignment.py +7 -0
  16. lattifai/config/caption.py +13 -243
  17. lattifai/config/client.py +16 -0
  18. lattifai/config/event.py +102 -0
  19. lattifai/config/transcription.py +25 -1
  20. lattifai/data/__init__.py +8 -0
  21. lattifai/data/caption.py +228 -0
  22. lattifai/errors.py +78 -53
  23. lattifai/event/__init__.py +65 -0
  24. lattifai/event/lattifai.py +166 -0
  25. lattifai/mixin.py +22 -17
  26. lattifai/transcription/base.py +2 -1
  27. lattifai/transcription/gemini.py +147 -16
  28. lattifai/transcription/lattifai.py +8 -11
  29. lattifai/types.py +1 -1
  30. lattifai/youtube/client.py +143 -48
  31. {lattifai-1.2.2.dist-info → lattifai-1.3.1.dist-info}/METADATA +129 -58
  32. lattifai-1.3.1.dist-info/RECORD +57 -0
  33. lattifai/__init__.py +0 -88
  34. lattifai/alignment/sentence_splitter.py +0 -350
  35. lattifai/caption/__init__.py +0 -96
  36. lattifai/caption/caption.py +0 -661
  37. lattifai/caption/formats/__init__.py +0 -199
  38. lattifai/caption/formats/base.py +0 -211
  39. lattifai/caption/formats/gemini.py +0 -722
  40. lattifai/caption/formats/json.py +0 -194
  41. lattifai/caption/formats/lrc.py +0 -309
  42. lattifai/caption/formats/nle/__init__.py +0 -9
  43. lattifai/caption/formats/nle/audition.py +0 -561
  44. lattifai/caption/formats/nle/avid.py +0 -423
  45. lattifai/caption/formats/nle/fcpxml.py +0 -549
  46. lattifai/caption/formats/nle/premiere.py +0 -589
  47. lattifai/caption/formats/pysubs2.py +0 -642
  48. lattifai/caption/formats/sbv.py +0 -147
  49. lattifai/caption/formats/tabular.py +0 -338
  50. lattifai/caption/formats/textgrid.py +0 -193
  51. lattifai/caption/formats/ttml.py +0 -652
  52. lattifai/caption/formats/vtt.py +0 -469
  53. lattifai/caption/parsers/__init__.py +0 -9
  54. lattifai/caption/parsers/text_parser.py +0 -147
  55. lattifai/caption/standardize.py +0 -636
  56. lattifai/caption/supervision.py +0 -34
  57. lattifai/caption/utils.py +0 -474
  58. lattifai-1.2.2.dist-info/RECORD +0 -76
  59. {lattifai-1.2.2.dist-info → lattifai-1.3.1.dist-info}/WHEEL +0 -0
  60. {lattifai-1.2.2.dist-info → lattifai-1.3.1.dist-info}/entry_points.txt +0 -0
  61. {lattifai-1.2.2.dist-info → lattifai-1.3.1.dist-info}/licenses/LICENSE +0 -0
  62. {lattifai-1.2.2.dist-info → lattifai-1.3.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,166 @@
1
+ """LattifAI Audio Event Detection implementation."""
2
+
3
+ import logging
4
+ from typing import TYPE_CHECKING, Optional
5
+
6
+ from lattifai.audio2 import AudioData
7
+ from lattifai.config.event import EventConfig
8
+ from lattifai.logging import get_logger
9
+
10
+ if TYPE_CHECKING:
11
+ from lattifai_core.event import LEDOutput
12
+
13
+ from lattifai.data import Caption
14
+
15
+
16
+ formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
17
+ logging.basicConfig(format=formatter, level=logging.INFO)
18
+
19
+
20
class LattifAIEventDetector:
    """
    LattifAI Audio Event Detector.

    This class provides a high-level interface for audio event detection,
    wrapping the core LattifAIEventDetector from lattifai_core.

    Attributes:
        config: EventConfig configuration object.

    Example:
        >>> from lattifai.event import LattifAIEventDetector
        >>> from lattifai.config import EventConfig
        >>>
        >>> config = EventConfig(enabled=True, device="cuda")
        >>> detector = LattifAIEventDetector(config)
        >>>
        >>> # Detect events from audio data
        >>> result = detector.detect(audio_data)
        >>>
        >>> # Access VAD segments directly
        >>> for start, end in result.vad_segments:
        ...     print(f"Speech: {start:.2f} - {end:.2f}")
        >>>
        >>> # Or access the full TextGrid
        >>> for tier in result.audio_events.tiers:
        ...     print(f"Event type: {tier.name}")
    """

    def __init__(self, config: EventConfig):
        """
        Initialize LattifAI Audio Event Detector.

        Args:
            config: EventConfig configuration.
        """
        self.config = config
        self.logger = get_logger("event")
        # Core detector is constructed lazily on first access (see `detector`).
        self._detector = None

    @property
    def name(self) -> str:
        """Human-readable name of the detector."""
        return "LattifAI_EventDetector"

    @property
    def detector(self):
        """Lazy-load and return the audio event detector."""
        if self._detector is None:
            # Imported here so lattifai_core is only required when detection
            # is actually used.
            from lattifai_core.event import LattifAIEventDetector as CoreEventDetector

            self._detector = CoreEventDetector.from_pretrained(
                model_path=self.config.model_path,
                device=self.config.device,
                client_wrapper=self.config.client_wrapper,
            )
        return self._detector

    def detect(
        self,
        input_media: AudioData,
        vad_chunk_size: Optional[float] = None,
        vad_max_gap: Optional[float] = None,
        fast_mode: Optional[bool] = None,
    ) -> "LEDOutput":
        """
        Detect audio events in the input audio.

        Args:
            input_media: Audio data to analyze.
            vad_chunk_size: Override config vad_chunk_size.
            vad_max_gap: Override config vad_max_gap.
            fast_mode: Override config fast_mode.

        Returns:
            LEDOutput containing audio_events, event_names, vad_segments.
        """
        # Explicit None checks so a caller-supplied 0/0.0 override is honored
        # instead of silently falling back to the config value (matches the
        # fast_mode handling).
        return self.detector(
            audio=input_media,
            vad_chunk_size=vad_chunk_size if vad_chunk_size is not None else self.config.vad_chunk_size,
            vad_max_gap=vad_max_gap if vad_max_gap is not None else self.config.vad_max_gap,
            fast_mode=fast_mode if fast_mode is not None else self.config.fast_mode,
            custom_aliases=self.config.event_aliases or {},
        )

    def profiling(self, reset: bool = False) -> str:
        """Get profiling information for the detector."""
        if self._detector is None:
            # Never instantiated, so nothing was profiled.
            return ""
        return self.detector.profiling(reset=reset, logger=self.logger)

    def detect_and_update_caption(
        self,
        caption: "Caption",
        input_media: AudioData,
        vad_chunk_size: Optional[float] = None,
        vad_max_gap: Optional[float] = None,
        fast_mode: Optional[bool] = None,
    ) -> "Caption":
        """
        Run event detection and update caption with audio events.

        This is the main entry point for integrating event detection with alignment.
        When event_matching is enabled, it also updates caption timestamps for [Event] markers.

        Args:
            caption: Caption to update with event detection results.
            input_media: Audio data to analyze.
            vad_chunk_size: Override config vad_chunk_size.
            vad_max_gap: Override config vad_max_gap.
            fast_mode: Override config fast_mode.

        Returns:
            Updated Caption with event field populated.
        """
        if self.config.event_matching:
            # Event matching: update caption timestamps based on detected events.
            # Prefer aligned supervisions when available.
            supervisions = caption.alignments or caption.supervisions

            led_output, supervisions = self.detector.detect_and_update_supervisions(
                supervisions=supervisions,
                audio=input_media,
                vad_chunk_size=vad_chunk_size if vad_chunk_size is not None else self.config.vad_chunk_size,
                vad_max_gap=vad_max_gap if vad_max_gap is not None else self.config.vad_max_gap,
                fast_mode=fast_mode if fast_mode is not None else self.config.fast_mode,
                custom_aliases=self.config.event_aliases or {},
                extra_events=self.config.extra_events or None,
                time_tolerance=self.config.time_tolerance,
                update_timestamps=self.config.update_timestamps,
                duplicate_strategy=self.config.duplicate_strategy,
            )
            # Store LEDOutput in caption
            caption.event = led_output

            # Write the (possibly retimed) supervisions back to the field
            # they were read from.
            if caption.alignments:
                caption.alignments = supervisions
            else:
                caption.supervisions = supervisions
        else:
            # Simple detection without event matching; detect() applies the
            # config fallbacks itself.
            led_output = self.detect(
                input_media=input_media,
                vad_chunk_size=vad_chunk_size,
                vad_max_gap=vad_max_gap,
                fast_mode=fast_mode,
            )
            caption.event = led_output

        return caption
lattifai/mixin.py CHANGED
@@ -8,12 +8,12 @@ import colorful
8
8
  from lhotse.utils import Pathlike
9
9
 
10
10
  from lattifai.audio2 import AudioData
11
- from lattifai.caption import Caption
11
+ from lattifai.data import Caption
12
12
  from lattifai.errors import CaptionProcessingError
13
13
  from lattifai.utils import safe_print
14
14
 
15
15
  if TYPE_CHECKING:
16
- from .config import AlignmentConfig, CaptionConfig, ClientConfig, DiarizationConfig, TranscriptionConfig
16
+ from .config import AlignmentConfig, DiarizationConfig, EventConfig, TranscriptionConfig
17
17
 
18
18
 
19
19
  class LattifAIClientMixin:
@@ -170,9 +170,10 @@ class LattifAIClientMixin:
170
170
  alignment_config: Optional["AlignmentConfig"],
171
171
  transcription_config: Optional["TranscriptionConfig"],
172
172
  diarization_config: Optional["DiarizationConfig"] = None,
173
+ event_config: Optional["EventConfig"] = None,
173
174
  ) -> tuple:
174
175
  """Initialize all configs with defaults if not provided."""
175
- from .config import AlignmentConfig, DiarizationConfig, TranscriptionConfig
176
+ from .config import AlignmentConfig, DiarizationConfig, EventConfig, TranscriptionConfig
176
177
 
177
178
  if alignment_config is None:
178
179
  alignment_config = AlignmentConfig()
@@ -180,20 +181,24 @@ class LattifAIClientMixin:
180
181
  transcription_config = TranscriptionConfig()
181
182
  if diarization_config is None:
182
183
  diarization_config = DiarizationConfig()
184
+ if event_config is None:
185
+ event_config = EventConfig()
183
186
 
184
187
  from lattifai.utils import _resolve_model_path
185
188
 
186
- if transcription_config is not None:
187
- transcription_config.lattice_model_path = _resolve_model_path(
188
- alignment_config.model_name, getattr(alignment_config, "model_hub", "huggingface")
189
- )
189
+ model_path = _resolve_model_path(
190
+ alignment_config.model_name, getattr(alignment_config, "model_hub", "modelscope")
191
+ )
192
+ transcription_config.lattice_model_path = model_path
193
+ event_config.model_path = model_path
190
194
 
191
195
  # Set client_wrapper for all configs
192
196
  alignment_config.client_wrapper = self
193
197
  transcription_config.client_wrapper = self
194
198
  diarization_config.client_wrapper = self
199
+ event_config.client_wrapper = self
195
200
 
196
- return alignment_config, transcription_config, diarization_config
201
+ return alignment_config, transcription_config, diarization_config, event_config
197
202
 
198
203
  def _init_shared_components(
199
204
  self,
@@ -284,18 +289,18 @@ class LattifAIClientMixin:
284
289
  format=input_caption_format,
285
290
  normalize_text=normalize_text if normalize_text is not None else self.caption_config.normalize_text,
286
291
  )
287
- diarization_file = Path(str(input_caption)).with_suffix(".SpkDiar")
292
+ diarization_file = Path(str(input_caption)).with_suffix(".Diarization")
288
293
  if diarization_file.exists():
289
294
  if verbose:
290
295
  safe_print(colorful.cyan(f"📖 Step1b: Reading speaker diarization from {diarization_file}"))
291
- caption.read_speaker_diarization(diarization_file)
292
- events_file = Path(str(input_caption)).with_suffix(".AED")
293
- if events_file.exists():
296
+ caption.read_diarization(diarization_file)
297
+ event_file = Path(str(input_caption)).with_suffix(".LED")
298
+ if event_file.exists():
294
299
  if verbose:
295
- safe_print(colorful.cyan(f"📖 Step1c: Reading audio events from {events_file}"))
296
- from tgt import read_textgrid
300
+ safe_print(colorful.cyan(f"📖 Step1c: Reading audio events from {event_file}"))
301
+ from lattifai_core.event import LEDOutput
297
302
 
298
- caption.audio_events = read_textgrid(events_file)
303
+ caption.event = LEDOutput.read(event_file)
299
304
 
300
305
  if verbose:
301
306
  safe_print(colorful.green(f" ✓ Parsed {len(caption)} caption segments"))
@@ -333,9 +338,9 @@ class LattifAIClientMixin:
333
338
  karaoke_config=self.caption_config.karaoke,
334
339
  )
335
340
  diarization_file = Path(str(output_caption_path)).with_suffix(".SpkDiar")
336
- if not diarization_file.exists() and caption.speaker_diarization:
341
+ if not diarization_file.exists() and caption.diarization:
337
342
  safe_print(colorful.green(f" Writing speaker diarization to: {diarization_file}"))
338
- caption.write_speaker_diarization(diarization_file)
343
+ caption.write_diarization(diarization_file)
339
344
 
340
345
  safe_print(colorful.green(f"🎉🎉🎉🎉🎉 Caption file written to: {output_caption_path}"))
341
346
  return result
@@ -7,8 +7,9 @@ from typing import List, Optional, Union
7
7
  import numpy as np
8
8
 
9
9
  from lattifai.audio2 import AudioData
10
- from lattifai.caption import Caption, Supervision
10
+ from lattifai.caption import Supervision
11
11
  from lattifai.config import TranscriptionConfig
12
+ from lattifai.data import Caption
12
13
  from lattifai.logging import get_logger
13
14
 
14
15
 
@@ -11,6 +11,7 @@ from google.genai.types import GenerateContentConfig, Part, ThinkingConfig
11
11
  from lattifai.audio2 import AudioData
12
12
  from lattifai.caption import Supervision
13
13
  from lattifai.config import TranscriptionConfig
14
+ from lattifai.data import Caption
14
15
  from lattifai.transcription.base import BaseTranscriber
15
16
  from lattifai.transcription.prompts import get_prompt_loader
16
17
 
@@ -245,18 +246,41 @@ class GeminiTranscriber(BaseTranscriber):
245
246
  return transcript
246
247
 
247
248
  def _get_transcription_prompt(self) -> str:
248
- """Get (and cache) transcription system prompt from prompts module."""
249
+ """Get (and cache) transcription system prompt.
250
+
251
+ Priority:
252
+ 1. Custom prompt from config.prompt (file path or text)
253
+ 2. Default prompt from prompts/gemini/transcription_gem.txt
254
+ """
249
255
  if self._system_prompt is not None:
250
256
  return self._system_prompt
251
257
 
252
- # Load prompt from prompts/gemini/transcription_gem.txt
253
- prompt_loader = get_prompt_loader()
254
- base_prompt = prompt_loader.get_gemini_transcription_prompt()
258
+ # Check for custom prompt
259
+ if self.config.prompt:
260
+ prompt_path = Path(self.config.prompt)
261
+ if prompt_path.exists() and prompt_path.is_file():
262
+ # Load from file
263
+ base_prompt = prompt_path.read_text(encoding="utf-8").strip()
264
+ if self.config.verbose:
265
+ self.logger.info(f"📝 Using custom prompt from file: {prompt_path}")
266
+ else:
267
+ # Use as direct text
268
+ base_prompt = self.config.prompt
269
+ if self.config.verbose:
270
+ self.logger.info("📝 Using custom prompt text")
271
+ else:
272
+ # Load default prompt from prompts/gemini/transcription_gem.txt
273
+ prompt_loader = get_prompt_loader()
274
+ base_prompt = prompt_loader.get_gemini_transcription_prompt()
255
275
 
256
276
  # Add language-specific instruction if configured
257
277
  if self.config.language:
258
278
  base_prompt += f"\n\n* Use {self.config.language} language for transcription."
259
279
 
280
+ # Add media description context if available
281
+ if self.config.description:
282
+ base_prompt += f"\n\n## Media Context\n\n{self.config.description}"
283
+
260
284
  self._system_prompt = base_prompt
261
285
  return self._system_prompt
262
286
 
@@ -287,14 +311,21 @@ class GeminiTranscriber(BaseTranscriber):
287
311
def _get_generation_config(self) -> GenerateContentConfig:
    """Lazily build the generation config since it rarely changes."""
    if self._generation_config is None:
        # Thinking settings are attached only when thinking mode is enabled.
        thinking = (
            ThinkingConfig(
                include_thoughts=self.config.include_thoughts,
                thinking_budget=-1,
            )
            if self.config.thinking
            else None
        )

        self._generation_config = GenerateContentConfig(
            system_instruction=self._get_transcription_prompt(),
            response_modalities=["TEXT"],
            thinking_config=thinking,
            temperature=self.config.temperature,
            top_k=self.config.top_k,
            top_p=self.config.top_p,
        )
    return self._generation_config
300
331
 
@@ -323,23 +354,123 @@ class GeminiTranscriber(BaseTranscriber):
323
354
  ),
324
355
  )
325
356
 
326
- if not response.text:
327
- raise RuntimeError("Empty response from Gemini API")
328
-
329
- transcript = response.text.strip()
357
+ # Extract content based on include_thoughts setting
358
+ if self.config.include_thoughts:
359
+ transcript = self._extract_with_thoughts(response)
360
+ else:
361
+ if not response.text:
362
+ raise RuntimeError("Empty response from Gemini API")
363
+ transcript = response.text.strip()
330
364
 
331
365
  if self.config.verbose:
332
366
  self.logger.info(f"✅ Transcription completed ({source}): {len(transcript)} characters")
333
367
 
334
368
  return transcript
335
369
 
370
+ def _extract_with_thoughts(self, response) -> str:
371
+ """Extract response content including thinking process and metadata."""
372
+ output_parts = []
373
+ thoughts = []
374
+ text_parts = []
375
+
376
+ # Iterate through all parts in the response
377
+ for candidate in response.candidates:
378
+ for part in candidate.content.parts:
379
+ if hasattr(part, "thought") and part.thought:
380
+ # This is a thinking part
381
+ if hasattr(part, "text") and part.text:
382
+ thoughts.append(part.text)
383
+ elif hasattr(part, "text") and part.text:
384
+ # This is a regular text part
385
+ text_parts.append(part.text)
386
+
387
+ # Extract metadata
388
+ metadata_lines = self._extract_response_metadata(response)
389
+ if metadata_lines:
390
+ output_parts.append("---")
391
+ output_parts.extend(metadata_lines)
392
+ output_parts.append("---\n")
393
+
394
+ # Format output with thoughts section if present
395
+ if thoughts:
396
+ output_parts.append("<thinking>")
397
+ output_parts.extend(thoughts)
398
+ output_parts.append("</thinking>\n")
399
+
400
+ output_parts.extend(text_parts)
401
+
402
+ result = "\n".join(output_parts).strip()
403
+ if not result:
404
+ raise RuntimeError("Empty response from Gemini API")
405
+
406
+ return result
407
+
408
+ def _extract_response_metadata(self, response) -> list:
409
+ """Extract useful metadata from Gemini response as YAML frontmatter."""
410
+ lines = []
411
+
412
+ # Model version
413
+ if hasattr(response, "model_version") and response.model_version:
414
+ lines.append(f"model_version: {response.model_version}")
415
+
416
+ # Usage metadata (token counts)
417
+ if hasattr(response, "usage_metadata") and response.usage_metadata:
418
+ usage = response.usage_metadata
419
+ if hasattr(usage, "prompt_token_count"):
420
+ lines.append(f"prompt_tokens: {usage.prompt_token_count}")
421
+ if hasattr(usage, "candidates_token_count"):
422
+ lines.append(f"output_tokens: {usage.candidates_token_count}")
423
+ if hasattr(usage, "total_token_count"):
424
+ lines.append(f"total_tokens: {usage.total_token_count}")
425
+ # Thinking tokens if available
426
+ if hasattr(usage, "thoughts_token_count") and usage.thoughts_token_count:
427
+ lines.append(f"thinking_tokens: {usage.thoughts_token_count}")
428
+
429
+ # Candidate-level metadata
430
+ if response.candidates:
431
+ candidate = response.candidates[0]
432
+
433
+ # Finish reason
434
+ if hasattr(candidate, "finish_reason") and candidate.finish_reason:
435
+ lines.append(f"finish_reason: {candidate.finish_reason}")
436
+
437
+ # Average log probability (confidence indicator)
438
+ if hasattr(candidate, "avg_logprobs") and candidate.avg_logprobs is not None:
439
+ lines.append(f"avg_logprobs: {candidate.avg_logprobs:.4f}")
440
+
441
+ # Citation metadata
442
+ if hasattr(candidate, "citation_metadata") and candidate.citation_metadata:
443
+ citations = getattr(candidate.citation_metadata, "citations", [])
444
+ if citations:
445
+ lines.append("citations:")
446
+ for cite in citations:
447
+ uri = getattr(cite, "uri", "")
448
+ start = getattr(cite, "start_index", "")
449
+ end = getattr(cite, "end_index", "")
450
+ if uri:
451
+ lines.append(f" - uri: {uri}")
452
+ if start or end:
453
+ lines.append(f" range: [{start}, {end}]")
454
+
455
+ return lines
456
+
336
457
def write(
    self, transcript: Union[str, Caption], output_file: Path, encoding: str = "utf-8", cache_event: bool = True
) -> Path:
    """
    Persist transcript to disk and return the file path.

    Supports both raw string (from transcribe_file) and Caption object
    (after conversion in mixin._transcribe).
    """
    target = Path(output_file) if isinstance(output_file, str) else output_file

    if isinstance(transcript, Caption):
        # Caption object - use its write method with gemini format
        transcript.write(target, output_format="gemini")
    else:
        # Raw string from transcription
        target.write_text(transcript, encoding=encoding)

    return target
@@ -6,8 +6,9 @@ from typing import List, Optional, Union
6
6
  import numpy as np
7
7
 
8
8
  from lattifai.audio2 import AudioData
9
- from lattifai.caption import Caption, Supervision
9
+ from lattifai.caption import Supervision
10
10
  from lattifai.config import TranscriptionConfig
11
+ from lattifai.data import Caption
11
12
  from lattifai.transcription.base import BaseTranscriber
12
13
 
13
14
 
@@ -53,8 +54,8 @@ class LattifAITranscriber(BaseTranscriber):
53
54
 
54
55
async def transcribe_file(self, media_file: Union[str, Path, AudioData], language: Optional[str] = None) -> Caption:
    """Transcribe a media file with the core transcriber and wrap the results in a Caption."""
    core = self._ensure_transcriber()
    transcription, event = core.transcribe(media_file, language=language, num_workers=2)
    return Caption.from_transcription_results(transcription=transcription, event=event)
58
59
 
59
60
  def transcribe_numpy(
60
61
  self,
@@ -77,9 +78,7 @@ class LattifAITranscriber(BaseTranscriber):
77
78
  audio, language=language, return_hypotheses=True, progress_bar=False, timestamps=True
78
79
  )[0]
79
80
 
80
- def write(
81
- self, transcript: Caption, output_file: Path, encoding: str = "utf-8", cache_audio_events: bool = True
82
- ) -> Path:
81
+ def write(self, transcript: Caption, output_file: Path, encoding: str = "utf-8", cache_event: bool = True) -> Path:
83
82
  """
84
83
  Persist transcript text to disk and return the file path.
85
84
  """
@@ -87,10 +86,8 @@ class LattifAITranscriber(BaseTranscriber):
87
86
  output_file,
88
87
  include_speaker_in_text=False,
89
88
  )
90
- if cache_audio_events and transcript.audio_events:
91
- from tgt import write_to_file
92
-
93
- events_file = output_file.with_suffix(".AED")
94
- write_to_file(transcript.audio_events, events_file, format="long")
89
+ if cache_event and transcript.event:
90
+ events_file = output_file.with_suffix(".LED")
91
+ transcript.event.write(events_file)
95
92
 
96
93
  return output_file
lattifai/types.py CHANGED
@@ -5,7 +5,7 @@ from typing import List, TypeAlias, Union
5
5
 
6
6
  from lhotse.utils import Pathlike
7
7
 
8
- from .caption import Supervision
8
+ from lattifai.caption import Supervision
9
9
 
10
10
  # Path-like types
11
11
  PathLike: TypeAlias = Pathlike # Re-export for convenience (str | Path)