lattifai 1.2.2__py3-none-any.whl → 1.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/_init.py +20 -0
- lattifai/alignment/__init__.py +2 -3
- lattifai/alignment/lattice1_aligner.py +117 -4
- lattifai/alignment/lattice1_worker.py +47 -4
- lattifai/alignment/segmenter.py +3 -2
- lattifai/alignment/text_align.py +2 -1
- lattifai/alignment/tokenizer.py +56 -29
- lattifai/audio2.py +162 -183
- lattifai/cli/alignment.py +5 -0
- lattifai/cli/caption.py +6 -6
- lattifai/cli/transcribe.py +1 -5
- lattifai/cli/youtube.py +3 -0
- lattifai/client.py +41 -12
- lattifai/config/__init__.py +21 -3
- lattifai/config/alignment.py +7 -0
- lattifai/config/caption.py +13 -243
- lattifai/config/client.py +16 -0
- lattifai/config/event.py +102 -0
- lattifai/config/transcription.py +25 -1
- lattifai/data/__init__.py +8 -0
- lattifai/data/caption.py +228 -0
- lattifai/errors.py +78 -53
- lattifai/event/__init__.py +65 -0
- lattifai/event/lattifai.py +166 -0
- lattifai/mixin.py +22 -17
- lattifai/transcription/base.py +2 -1
- lattifai/transcription/gemini.py +147 -16
- lattifai/transcription/lattifai.py +8 -11
- lattifai/types.py +1 -1
- lattifai/youtube/client.py +143 -48
- {lattifai-1.2.2.dist-info → lattifai-1.3.1.dist-info}/METADATA +129 -58
- lattifai-1.3.1.dist-info/RECORD +57 -0
- lattifai/__init__.py +0 -88
- lattifai/alignment/sentence_splitter.py +0 -350
- lattifai/caption/__init__.py +0 -96
- lattifai/caption/caption.py +0 -661
- lattifai/caption/formats/__init__.py +0 -199
- lattifai/caption/formats/base.py +0 -211
- lattifai/caption/formats/gemini.py +0 -722
- lattifai/caption/formats/json.py +0 -194
- lattifai/caption/formats/lrc.py +0 -309
- lattifai/caption/formats/nle/__init__.py +0 -9
- lattifai/caption/formats/nle/audition.py +0 -561
- lattifai/caption/formats/nle/avid.py +0 -423
- lattifai/caption/formats/nle/fcpxml.py +0 -549
- lattifai/caption/formats/nle/premiere.py +0 -589
- lattifai/caption/formats/pysubs2.py +0 -642
- lattifai/caption/formats/sbv.py +0 -147
- lattifai/caption/formats/tabular.py +0 -338
- lattifai/caption/formats/textgrid.py +0 -193
- lattifai/caption/formats/ttml.py +0 -652
- lattifai/caption/formats/vtt.py +0 -469
- lattifai/caption/parsers/__init__.py +0 -9
- lattifai/caption/parsers/text_parser.py +0 -147
- lattifai/caption/standardize.py +0 -636
- lattifai/caption/supervision.py +0 -34
- lattifai/caption/utils.py +0 -474
- lattifai-1.2.2.dist-info/RECORD +0 -76
- {lattifai-1.2.2.dist-info → lattifai-1.3.1.dist-info}/WHEEL +0 -0
- {lattifai-1.2.2.dist-info → lattifai-1.3.1.dist-info}/entry_points.txt +0 -0
- {lattifai-1.2.2.dist-info → lattifai-1.3.1.dist-info}/licenses/LICENSE +0 -0
- {lattifai-1.2.2.dist-info → lattifai-1.3.1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
"""LattifAI Audio Event Detection implementation."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from typing import TYPE_CHECKING, Optional
|
|
5
|
+
|
|
6
|
+
from lattifai.audio2 import AudioData
|
|
7
|
+
from lattifai.config.event import EventConfig
|
|
8
|
+
from lattifai.logging import get_logger
|
|
9
|
+
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
from lattifai_core.event import LEDOutput
|
|
12
|
+
|
|
13
|
+
from lattifai.data import Caption
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# Root-logger setup that runs as soon as this module is imported.
# `formatter` is kept as a module-level name holding the record layout;
# `logging.basicConfig` does nothing if the root logger already has handlers.
formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
logging.basicConfig(level=logging.INFO, format=formatter)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class LattifAIEventDetector:
    """
    LattifAI Audio Event Detector.

    This class provides a high-level interface for audio event detection,
    wrapping the core LattifAIEventDetector from lattifai_core. The core
    detector is created lazily, the first time ``detector`` is accessed.

    Attributes:
        config: EventConfig configuration object.
        logger: Logger returned by ``get_logger("event")``.

    Example:
        >>> from lattifai.event import LattifAIEventDetector
        >>> from lattifai.config import EventConfig
        >>>
        >>> config = EventConfig(enabled=True, device="cuda")
        >>> detector = LattifAIEventDetector(config)
        >>>
        >>> # Detect events from audio data
        >>> result = detector.detect(audio_data)
        >>>
        >>> # Access VAD segments directly
        >>> for start, end in result.vad_segments:
        ...     print(f"Speech: {start:.2f} - {end:.2f}")
        >>>
        >>> # Or access the full TextGrid
        >>> for tier in result.audio_events.tiers:
        ...     print(f"Event type: {tier.name}")
    """

    def __init__(self, config: "EventConfig"):
        """
        Initialize LattifAI Audio Event Detector.

        Args:
            config: EventConfig configuration.
        """
        self.config = config
        self.logger = get_logger("event")
        # Populated on first access of the ``detector`` property.
        self._detector = None

    @property
    def name(self) -> str:
        """Human-readable name of the detector."""
        return "LattifAI_EventDetector"

    @property
    def detector(self):
        """Lazy-load and return the audio event detector."""
        if self._detector is None:
            # Imported here so lattifai_core is only required once detection is used.
            from lattifai_core.event import LattifAIEventDetector as CoreEventDetector

            self._detector = CoreEventDetector.from_pretrained(
                model_path=self.config.model_path,
                device=self.config.device,
                client_wrapper=self.config.client_wrapper,
            )
        return self._detector

    def _detection_kwargs(
        self,
        vad_chunk_size: Optional[float],
        vad_max_gap: Optional[float],
        fast_mode: Optional[bool],
    ) -> dict:
        """Merge per-call overrides with the config defaults shared by both detection paths."""
        cfg = self.config
        # Only ``None`` means "not provided": an explicit 0 / 0.0 / False override
        # must reach the core detector instead of being replaced by the config value.
        return {
            "vad_chunk_size": cfg.vad_chunk_size if vad_chunk_size is None else vad_chunk_size,
            "vad_max_gap": cfg.vad_max_gap if vad_max_gap is None else vad_max_gap,
            "fast_mode": cfg.fast_mode if fast_mode is None else fast_mode,
            "custom_aliases": cfg.event_aliases or {},
        }

    def detect(
        self,
        input_media: "AudioData",
        vad_chunk_size: Optional[float] = None,
        vad_max_gap: Optional[float] = None,
        fast_mode: Optional[bool] = None,
    ) -> "LEDOutput":
        """
        Detect audio events in the input audio.

        Args:
            input_media: Audio data to analyze.
            vad_chunk_size: Override config vad_chunk_size (any non-None value wins).
            vad_max_gap: Override config vad_max_gap (any non-None value wins).
            fast_mode: Override config fast_mode (any non-None value wins).

        Returns:
            LEDOutput containing audio_events, event_names, vad_segments.
        """
        return self.detector(
            audio=input_media,
            **self._detection_kwargs(vad_chunk_size, vad_max_gap, fast_mode),
        )

    def profiling(self, reset: bool = False) -> str:
        """
        Get profiling information for the detector.

        Returns an empty string when the core detector has not been loaded yet,
        so asking for profiling data never triggers a model load.
        """
        if self._detector is None:
            return ""
        return self.detector.profiling(reset=reset, logger=self.logger)

    def detect_and_update_caption(
        self,
        caption: "Caption",
        input_media: "AudioData",
        vad_chunk_size: Optional[float] = None,
        vad_max_gap: Optional[float] = None,
        fast_mode: Optional[bool] = None,
    ) -> "Caption":
        """
        Run event detection and update caption with audio events.

        This is the main entry point for integrating event detection with alignment.
        When event_matching is enabled, it also updates caption timestamps for [Event] markers.

        Args:
            caption: Caption to update with event detection results (modified in place).
            input_media: AudioData to analyze.
            vad_chunk_size: Override config vad_chunk_size (any non-None value wins).
            vad_max_gap: Override config vad_max_gap (any non-None value wins).
            fast_mode: Override config fast_mode (any non-None value wins).

        Returns:
            The same Caption object, with its ``event`` field populated.
        """
        if not self.config.event_matching:
            # Simple detection without event matching
            caption.event = self.detect(
                input_media=input_media,
                vad_chunk_size=vad_chunk_size,
                vad_max_gap=vad_max_gap,
                fast_mode=fast_mode,
            )
            return caption

        # Event matching: prefer aligned segments, fall back to the raw supervisions.
        supervisions = caption.alignments or caption.supervisions

        led_output, supervisions = self.detector.detect_and_update_supervisions(
            supervisions=supervisions,
            audio=input_media,
            **self._detection_kwargs(vad_chunk_size, vad_max_gap, fast_mode),
            extra_events=self.config.extra_events or None,
            time_tolerance=self.config.time_tolerance,
            update_timestamps=self.config.update_timestamps,
            duplicate_strategy=self.config.duplicate_strategy,
        )
        # Store LEDOutput in caption
        caption.event = led_output

        # Write the updated segments back to whichever list they were taken from.
        if caption.alignments:
            caption.alignments = supervisions
        else:
            caption.supervisions = supervisions

        return caption
|
lattifai/mixin.py
CHANGED
|
@@ -8,12 +8,12 @@ import colorful
|
|
|
8
8
|
from lhotse.utils import Pathlike
|
|
9
9
|
|
|
10
10
|
from lattifai.audio2 import AudioData
|
|
11
|
-
from lattifai.
|
|
11
|
+
from lattifai.data import Caption
|
|
12
12
|
from lattifai.errors import CaptionProcessingError
|
|
13
13
|
from lattifai.utils import safe_print
|
|
14
14
|
|
|
15
15
|
if TYPE_CHECKING:
|
|
16
|
-
from .config import AlignmentConfig,
|
|
16
|
+
from .config import AlignmentConfig, DiarizationConfig, EventConfig, TranscriptionConfig
|
|
17
17
|
|
|
18
18
|
|
|
19
19
|
class LattifAIClientMixin:
|
|
@@ -170,9 +170,10 @@ class LattifAIClientMixin:
|
|
|
170
170
|
alignment_config: Optional["AlignmentConfig"],
|
|
171
171
|
transcription_config: Optional["TranscriptionConfig"],
|
|
172
172
|
diarization_config: Optional["DiarizationConfig"] = None,
|
|
173
|
+
event_config: Optional["EventConfig"] = None,
|
|
173
174
|
) -> tuple:
|
|
174
175
|
"""Initialize all configs with defaults if not provided."""
|
|
175
|
-
from .config import AlignmentConfig, DiarizationConfig, TranscriptionConfig
|
|
176
|
+
from .config import AlignmentConfig, DiarizationConfig, EventConfig, TranscriptionConfig
|
|
176
177
|
|
|
177
178
|
if alignment_config is None:
|
|
178
179
|
alignment_config = AlignmentConfig()
|
|
@@ -180,20 +181,24 @@ class LattifAIClientMixin:
|
|
|
180
181
|
transcription_config = TranscriptionConfig()
|
|
181
182
|
if diarization_config is None:
|
|
182
183
|
diarization_config = DiarizationConfig()
|
|
184
|
+
if event_config is None:
|
|
185
|
+
event_config = EventConfig()
|
|
183
186
|
|
|
184
187
|
from lattifai.utils import _resolve_model_path
|
|
185
188
|
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
189
|
+
model_path = _resolve_model_path(
|
|
190
|
+
alignment_config.model_name, getattr(alignment_config, "model_hub", "modelscope")
|
|
191
|
+
)
|
|
192
|
+
transcription_config.lattice_model_path = model_path
|
|
193
|
+
event_config.model_path = model_path
|
|
190
194
|
|
|
191
195
|
# Set client_wrapper for all configs
|
|
192
196
|
alignment_config.client_wrapper = self
|
|
193
197
|
transcription_config.client_wrapper = self
|
|
194
198
|
diarization_config.client_wrapper = self
|
|
199
|
+
event_config.client_wrapper = self
|
|
195
200
|
|
|
196
|
-
return alignment_config, transcription_config, diarization_config
|
|
201
|
+
return alignment_config, transcription_config, diarization_config, event_config
|
|
197
202
|
|
|
198
203
|
def _init_shared_components(
|
|
199
204
|
self,
|
|
@@ -284,18 +289,18 @@ class LattifAIClientMixin:
|
|
|
284
289
|
format=input_caption_format,
|
|
285
290
|
normalize_text=normalize_text if normalize_text is not None else self.caption_config.normalize_text,
|
|
286
291
|
)
|
|
287
|
-
diarization_file = Path(str(input_caption)).with_suffix(".
|
|
292
|
+
diarization_file = Path(str(input_caption)).with_suffix(".Diarization")
|
|
288
293
|
if diarization_file.exists():
|
|
289
294
|
if verbose:
|
|
290
295
|
safe_print(colorful.cyan(f"📖 Step1b: Reading speaker diarization from {diarization_file}"))
|
|
291
|
-
caption.
|
|
292
|
-
|
|
293
|
-
if
|
|
296
|
+
caption.read_diarization(diarization_file)
|
|
297
|
+
event_file = Path(str(input_caption)).with_suffix(".LED")
|
|
298
|
+
if event_file.exists():
|
|
294
299
|
if verbose:
|
|
295
|
-
safe_print(colorful.cyan(f"📖 Step1c: Reading audio events from {
|
|
296
|
-
from
|
|
300
|
+
safe_print(colorful.cyan(f"📖 Step1c: Reading audio events from {event_file}"))
|
|
301
|
+
from lattifai_core.event import LEDOutput
|
|
297
302
|
|
|
298
|
-
caption.
|
|
303
|
+
caption.event = LEDOutput.read(event_file)
|
|
299
304
|
|
|
300
305
|
if verbose:
|
|
301
306
|
safe_print(colorful.green(f" ✓ Parsed {len(caption)} caption segments"))
|
|
@@ -333,9 +338,9 @@ class LattifAIClientMixin:
|
|
|
333
338
|
karaoke_config=self.caption_config.karaoke,
|
|
334
339
|
)
|
|
335
340
|
diarization_file = Path(str(output_caption_path)).with_suffix(".SpkDiar")
|
|
336
|
-
if not diarization_file.exists() and caption.
|
|
341
|
+
if not diarization_file.exists() and caption.diarization:
|
|
337
342
|
safe_print(colorful.green(f" Writing speaker diarization to: {diarization_file}"))
|
|
338
|
-
caption.
|
|
343
|
+
caption.write_diarization(diarization_file)
|
|
339
344
|
|
|
340
345
|
safe_print(colorful.green(f"🎉🎉🎉🎉🎉 Caption file written to: {output_caption_path}"))
|
|
341
346
|
return result
|
lattifai/transcription/base.py
CHANGED
|
@@ -7,8 +7,9 @@ from typing import List, Optional, Union
|
|
|
7
7
|
import numpy as np
|
|
8
8
|
|
|
9
9
|
from lattifai.audio2 import AudioData
|
|
10
|
-
from lattifai.caption import
|
|
10
|
+
from lattifai.caption import Supervision
|
|
11
11
|
from lattifai.config import TranscriptionConfig
|
|
12
|
+
from lattifai.data import Caption
|
|
12
13
|
from lattifai.logging import get_logger
|
|
13
14
|
|
|
14
15
|
|
lattifai/transcription/gemini.py
CHANGED
|
@@ -11,6 +11,7 @@ from google.genai.types import GenerateContentConfig, Part, ThinkingConfig
|
|
|
11
11
|
from lattifai.audio2 import AudioData
|
|
12
12
|
from lattifai.caption import Supervision
|
|
13
13
|
from lattifai.config import TranscriptionConfig
|
|
14
|
+
from lattifai.data import Caption
|
|
14
15
|
from lattifai.transcription.base import BaseTranscriber
|
|
15
16
|
from lattifai.transcription.prompts import get_prompt_loader
|
|
16
17
|
|
|
@@ -245,18 +246,41 @@ class GeminiTranscriber(BaseTranscriber):
|
|
|
245
246
|
return transcript
|
|
246
247
|
|
|
247
248
|
def _get_transcription_prompt(self) -> str:
|
|
248
|
-
"""Get (and cache) transcription system prompt
|
|
249
|
+
"""Get (and cache) transcription system prompt.
|
|
250
|
+
|
|
251
|
+
Priority:
|
|
252
|
+
1. Custom prompt from config.prompt (file path or text)
|
|
253
|
+
2. Default prompt from prompts/gemini/transcription_gem.txt
|
|
254
|
+
"""
|
|
249
255
|
if self._system_prompt is not None:
|
|
250
256
|
return self._system_prompt
|
|
251
257
|
|
|
252
|
-
#
|
|
253
|
-
|
|
254
|
-
|
|
258
|
+
# Check for custom prompt
|
|
259
|
+
if self.config.prompt:
|
|
260
|
+
prompt_path = Path(self.config.prompt)
|
|
261
|
+
if prompt_path.exists() and prompt_path.is_file():
|
|
262
|
+
# Load from file
|
|
263
|
+
base_prompt = prompt_path.read_text(encoding="utf-8").strip()
|
|
264
|
+
if self.config.verbose:
|
|
265
|
+
self.logger.info(f"📝 Using custom prompt from file: {prompt_path}")
|
|
266
|
+
else:
|
|
267
|
+
# Use as direct text
|
|
268
|
+
base_prompt = self.config.prompt
|
|
269
|
+
if self.config.verbose:
|
|
270
|
+
self.logger.info("📝 Using custom prompt text")
|
|
271
|
+
else:
|
|
272
|
+
# Load default prompt from prompts/gemini/transcription_gem.txt
|
|
273
|
+
prompt_loader = get_prompt_loader()
|
|
274
|
+
base_prompt = prompt_loader.get_gemini_transcription_prompt()
|
|
255
275
|
|
|
256
276
|
# Add language-specific instruction if configured
|
|
257
277
|
if self.config.language:
|
|
258
278
|
base_prompt += f"\n\n* Use {self.config.language} language for transcription."
|
|
259
279
|
|
|
280
|
+
# Add media description context if available
|
|
281
|
+
if self.config.description:
|
|
282
|
+
base_prompt += f"\n\n## Media Context\n\n{self.config.description}"
|
|
283
|
+
|
|
260
284
|
self._system_prompt = base_prompt
|
|
261
285
|
return self._system_prompt
|
|
262
286
|
|
|
@@ -287,14 +311,21 @@ class GeminiTranscriber(BaseTranscriber):
|
|
|
287
311
|
def _get_generation_config(self) -> GenerateContentConfig:
    """Return the generation config, building and caching it on first use.

    The config is stored on ``self._generation_config``; later calls return
    the cached object without rereading ``self.config``.
    """
    if self._generation_config is not None:
        return self._generation_config

    settings = self.config
    # A ThinkingConfig is only attached when thinking mode is switched on.
    # NOTE(review): thinking_budget=-1 presumably lets the model choose its own
    # budget; confirm against the Gemini API docs.
    thinking = (
        ThinkingConfig(include_thoughts=settings.include_thoughts, thinking_budget=-1)
        if settings.thinking
        else None
    )

    self._generation_config = GenerateContentConfig(
        system_instruction=self._get_transcription_prompt(),
        response_modalities=["TEXT"],
        thinking_config=thinking,
        temperature=settings.temperature,
        top_k=settings.top_k,
        top_p=settings.top_p,
    )
    return self._generation_config
|
|
300
331
|
|
|
@@ -323,23 +354,123 @@ class GeminiTranscriber(BaseTranscriber):
|
|
|
323
354
|
),
|
|
324
355
|
)
|
|
325
356
|
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
357
|
+
# Extract content based on include_thoughts setting
|
|
358
|
+
if self.config.include_thoughts:
|
|
359
|
+
transcript = self._extract_with_thoughts(response)
|
|
360
|
+
else:
|
|
361
|
+
if not response.text:
|
|
362
|
+
raise RuntimeError("Empty response from Gemini API")
|
|
363
|
+
transcript = response.text.strip()
|
|
330
364
|
|
|
331
365
|
if self.config.verbose:
|
|
332
366
|
self.logger.info(f"✅ Transcription completed ({source}): {len(transcript)} characters")
|
|
333
367
|
|
|
334
368
|
return transcript
|
|
335
369
|
|
|
370
|
+
def _extract_with_thoughts(self, response) -> str:
|
|
371
|
+
"""Extract response content including thinking process and metadata."""
|
|
372
|
+
output_parts = []
|
|
373
|
+
thoughts = []
|
|
374
|
+
text_parts = []
|
|
375
|
+
|
|
376
|
+
# Iterate through all parts in the response
|
|
377
|
+
for candidate in response.candidates:
|
|
378
|
+
for part in candidate.content.parts:
|
|
379
|
+
if hasattr(part, "thought") and part.thought:
|
|
380
|
+
# This is a thinking part
|
|
381
|
+
if hasattr(part, "text") and part.text:
|
|
382
|
+
thoughts.append(part.text)
|
|
383
|
+
elif hasattr(part, "text") and part.text:
|
|
384
|
+
# This is a regular text part
|
|
385
|
+
text_parts.append(part.text)
|
|
386
|
+
|
|
387
|
+
# Extract metadata
|
|
388
|
+
metadata_lines = self._extract_response_metadata(response)
|
|
389
|
+
if metadata_lines:
|
|
390
|
+
output_parts.append("---")
|
|
391
|
+
output_parts.extend(metadata_lines)
|
|
392
|
+
output_parts.append("---\n")
|
|
393
|
+
|
|
394
|
+
# Format output with thoughts section if present
|
|
395
|
+
if thoughts:
|
|
396
|
+
output_parts.append("<thinking>")
|
|
397
|
+
output_parts.extend(thoughts)
|
|
398
|
+
output_parts.append("</thinking>\n")
|
|
399
|
+
|
|
400
|
+
output_parts.extend(text_parts)
|
|
401
|
+
|
|
402
|
+
result = "\n".join(output_parts).strip()
|
|
403
|
+
if not result:
|
|
404
|
+
raise RuntimeError("Empty response from Gemini API")
|
|
405
|
+
|
|
406
|
+
return result
|
|
407
|
+
|
|
408
|
+
def _extract_response_metadata(self, response) -> list:
|
|
409
|
+
"""Extract useful metadata from Gemini response as YAML frontmatter."""
|
|
410
|
+
lines = []
|
|
411
|
+
|
|
412
|
+
# Model version
|
|
413
|
+
if hasattr(response, "model_version") and response.model_version:
|
|
414
|
+
lines.append(f"model_version: {response.model_version}")
|
|
415
|
+
|
|
416
|
+
# Usage metadata (token counts)
|
|
417
|
+
if hasattr(response, "usage_metadata") and response.usage_metadata:
|
|
418
|
+
usage = response.usage_metadata
|
|
419
|
+
if hasattr(usage, "prompt_token_count"):
|
|
420
|
+
lines.append(f"prompt_tokens: {usage.prompt_token_count}")
|
|
421
|
+
if hasattr(usage, "candidates_token_count"):
|
|
422
|
+
lines.append(f"output_tokens: {usage.candidates_token_count}")
|
|
423
|
+
if hasattr(usage, "total_token_count"):
|
|
424
|
+
lines.append(f"total_tokens: {usage.total_token_count}")
|
|
425
|
+
# Thinking tokens if available
|
|
426
|
+
if hasattr(usage, "thoughts_token_count") and usage.thoughts_token_count:
|
|
427
|
+
lines.append(f"thinking_tokens: {usage.thoughts_token_count}")
|
|
428
|
+
|
|
429
|
+
# Candidate-level metadata
|
|
430
|
+
if response.candidates:
|
|
431
|
+
candidate = response.candidates[0]
|
|
432
|
+
|
|
433
|
+
# Finish reason
|
|
434
|
+
if hasattr(candidate, "finish_reason") and candidate.finish_reason:
|
|
435
|
+
lines.append(f"finish_reason: {candidate.finish_reason}")
|
|
436
|
+
|
|
437
|
+
# Average log probability (confidence indicator)
|
|
438
|
+
if hasattr(candidate, "avg_logprobs") and candidate.avg_logprobs is not None:
|
|
439
|
+
lines.append(f"avg_logprobs: {candidate.avg_logprobs:.4f}")
|
|
440
|
+
|
|
441
|
+
# Citation metadata
|
|
442
|
+
if hasattr(candidate, "citation_metadata") and candidate.citation_metadata:
|
|
443
|
+
citations = getattr(candidate.citation_metadata, "citations", [])
|
|
444
|
+
if citations:
|
|
445
|
+
lines.append("citations:")
|
|
446
|
+
for cite in citations:
|
|
447
|
+
uri = getattr(cite, "uri", "")
|
|
448
|
+
start = getattr(cite, "start_index", "")
|
|
449
|
+
end = getattr(cite, "end_index", "")
|
|
450
|
+
if uri:
|
|
451
|
+
lines.append(f" - uri: {uri}")
|
|
452
|
+
if start or end:
|
|
453
|
+
lines.append(f" range: [{start}, {end}]")
|
|
454
|
+
|
|
455
|
+
return lines
|
|
456
|
+
|
|
336
457
|
def write(
    self, transcript: Union[str, Caption], output_file: Path, encoding: str = "utf-8", cache_event: bool = True
) -> Path:
    """
    Write a transcript to ``output_file`` and return the path written.

    Args:
        transcript: Raw transcript text (from transcribe_file) or a Caption
            object (after conversion in mixin._transcribe).
        output_file: Destination; a plain string is converted to a Path.
        encoding: Text encoding, used only when ``transcript`` is a raw string.
        cache_event: Accepted in the signature but not used by this method.

    Returns:
        The destination as a Path.
    """
    destination = Path(output_file) if isinstance(output_file, str) else output_file

    if not isinstance(transcript, Caption):
        # Raw string from transcription
        destination.write_text(transcript, encoding=encoding)
        return destination

    # Caption object - use its write method with gemini format
    transcript.write(destination, output_format="gemini")
    return destination
|
|
@@ -6,8 +6,9 @@ from typing import List, Optional, Union
|
|
|
6
6
|
import numpy as np
|
|
7
7
|
|
|
8
8
|
from lattifai.audio2 import AudioData
|
|
9
|
-
from lattifai.caption import
|
|
9
|
+
from lattifai.caption import Supervision
|
|
10
10
|
from lattifai.config import TranscriptionConfig
|
|
11
|
+
from lattifai.data import Caption
|
|
11
12
|
from lattifai.transcription.base import BaseTranscriber
|
|
12
13
|
|
|
13
14
|
|
|
@@ -53,8 +54,8 @@ class LattifAITranscriber(BaseTranscriber):
|
|
|
53
54
|
|
|
54
55
|
async def transcribe_file(self, media_file: Union[str, Path, AudioData], language: Optional[str] = None) -> Caption:
    """Transcribe ``media_file`` and wrap the result in a Caption.

    The underlying transcriber returns a ``(transcription, event)`` pair;
    both are passed to ``Caption.from_transcription_results``.
    """
    results = self._ensure_transcriber().transcribe(media_file, language=language, num_workers=2)
    transcription, event = results
    return Caption.from_transcription_results(transcription=transcription, event=event)
|
|
58
59
|
|
|
59
60
|
def transcribe_numpy(
|
|
60
61
|
self,
|
|
@@ -77,9 +78,7 @@ class LattifAITranscriber(BaseTranscriber):
|
|
|
77
78
|
audio, language=language, return_hypotheses=True, progress_bar=False, timestamps=True
|
|
78
79
|
)[0]
|
|
79
80
|
|
|
80
|
-
def write(
|
|
81
|
-
self, transcript: Caption, output_file: Path, encoding: str = "utf-8", cache_audio_events: bool = True
|
|
82
|
-
) -> Path:
|
|
81
|
+
def write(self, transcript: Caption, output_file: Path, encoding: str = "utf-8", cache_event: bool = True) -> Path:
|
|
83
82
|
"""
|
|
84
83
|
Persist transcript text to disk and return the file path.
|
|
85
84
|
"""
|
|
@@ -87,10 +86,8 @@ class LattifAITranscriber(BaseTranscriber):
|
|
|
87
86
|
output_file,
|
|
88
87
|
include_speaker_in_text=False,
|
|
89
88
|
)
|
|
90
|
-
if
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
events_file = output_file.with_suffix(".AED")
|
|
94
|
-
write_to_file(transcript.audio_events, events_file, format="long")
|
|
89
|
+
if cache_event and transcript.event:
|
|
90
|
+
events_file = output_file.with_suffix(".LED")
|
|
91
|
+
transcript.event.write(events_file)
|
|
95
92
|
|
|
96
93
|
return output_file
|
lattifai/types.py
CHANGED