lattifai 1.1.0__py3-none-any.whl → 1.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/__init__.py +0 -25
- lattifai/alignment/lattice1_aligner.py +12 -9
- lattifai/alignment/lattice1_worker.py +124 -155
- lattifai/alignment/segmenter.py +1 -1
- lattifai/alignment/sentence_splitter.py +219 -0
- lattifai/alignment/tokenizer.py +23 -179
- lattifai/audio2.py +1 -1
- lattifai/caption/caption.py +0 -2
- lattifai/caption/gemini_reader.py +151 -60
- lattifai/cli/diarization.py +3 -1
- lattifai/cli/transcribe.py +3 -8
- lattifai/cli/youtube.py +11 -0
- lattifai/client.py +96 -47
- lattifai/config/alignment.py +2 -2
- lattifai/config/client.py +5 -0
- lattifai/mixin.py +17 -8
- lattifai/utils.py +40 -4
- lattifai/workflow/youtube.py +55 -57
- {lattifai-1.1.0.dist-info → lattifai-1.2.1.dist-info}/METADATA +331 -48
- {lattifai-1.1.0.dist-info → lattifai-1.2.1.dist-info}/RECORD +24 -23
- {lattifai-1.1.0.dist-info → lattifai-1.2.1.dist-info}/WHEEL +0 -0
- {lattifai-1.1.0.dist-info → lattifai-1.2.1.dist-info}/entry_points.txt +0 -0
- {lattifai-1.1.0.dist-info → lattifai-1.2.1.dist-info}/licenses/LICENSE +0 -0
- {lattifai-1.1.0.dist-info → lattifai-1.2.1.dist-info}/top_level.txt +0 -0
lattifai/caption/gemini_reader.py
CHANGED

@@ -15,7 +15,8 @@ class GeminiSegment:
     """Represents a segment in the Gemini transcript with metadata."""
 
     text: str
-    timestamp: Optional[float] = None
+    timestamp: Optional[float] = None  # For backward compatibility (start time)
+    end_timestamp: Optional[float] = None  # End time when timestamp is at the end
     speaker: Optional[str] = None
     section: Optional[str] = None
     segment_type: str = "dialogue"  # 'dialogue', 'event', or 'section_header'
@@ -26,6 +27,11 @@ class GeminiSegment:
         """Return start time in seconds."""
         return self.timestamp if self.timestamp is not None else 0.0
 
+    @property
+    def end(self) -> Optional[float]:
+        """Return end time in seconds if available."""
+        return self.end_timestamp
+
 
 class GeminiReader:
     """Parser for YouTube transcript format with speaker labels and timestamps."""
@@ -34,8 +40,12 @@ class GeminiReader:
     TIMESTAMP_PATTERN = re.compile(r"\[(\d{1,2}):(\d{2}):(\d{2})\]|\[(\d{1,2}):(\d{2})\]")
     SECTION_HEADER_PATTERN = re.compile(r"^##\s*\[(\d{1,2}):(\d{2}):(\d{2})\]\s*(.+)$")
     SPEAKER_PATTERN = re.compile(r"^\*\*(.+?[::])\*\*\s*(.+)$")
-
-
+    # Event pattern: [Event] [HH:MM:SS] or [Event] [MM:SS] - prioritize HH:MM:SS format
+    EVENT_PATTERN = re.compile(r"^\[([^\]]+)\]\s*\[(\d{1,2}):(\d{2})(?::(\d{2}))?\]$")
+    # Timestamp at the end indicates end time
+    INLINE_TIMESTAMP_END_PATTERN = re.compile(r"^(.+?)\s*\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]$")
+    # Timestamp at the beginning indicates start time
+    INLINE_TIMESTAMP_START_PATTERN = re.compile(r"^\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]\s*(.+)$")
 
     # New patterns for YouTube link format: [[MM:SS](URL&t=seconds)]
     YOUTUBE_SECTION_PATTERN = re.compile(r"^##\s*\[\[(\d{1,2}):(\d{2})\]\([^)]*&t=(\d+)\)\]\s*(.+)$")
@@ -144,18 +154,22 @@ class GeminiReader:
             if event_match:
                 groups = event_match.groups()
                 event_text = groups[0]
-                # Parse timestamp -
-
-
-
-
+                # Parse timestamp - groups: (event_text, hours/minutes, minutes/seconds, seconds_optional)
+                hours_or_minutes = groups[1]
+                minutes_or_seconds = groups[2]
+                seconds_optional = groups[3]
+
+                if seconds_optional is not None:
+                    # HH:MM:SS format
+                    timestamp = cls.parse_timestamp(hours_or_minutes, minutes_or_seconds, seconds_optional)
                 else:
-
+                    # MM:SS format
+                    timestamp = cls.parse_timestamp(hours_or_minutes, minutes_or_seconds)
 
                 if include_events and timestamp is not None:
                     segments.append(
                         GeminiSegment(
-                            text=event_text.strip(),
+                            text=f"[{event_text.strip()}]",
                             timestamp=timestamp,
                             section=current_section,
                             segment_type="event",
@@ -170,34 +184,44 @@ class GeminiReader:
                 speaker, text_with_timestamp = speaker_match.groups()
                 current_speaker = speaker.strip()
 
-                #
-
+                # Check for timestamp at the beginning (start time)
+                start_match = cls.INLINE_TIMESTAMP_START_PATTERN.match(text_with_timestamp.strip())
+                # Check for timestamp at the end (end time)
+                end_match = cls.INLINE_TIMESTAMP_END_PATTERN.match(text_with_timestamp.strip())
                 youtube_match = cls.YOUTUBE_INLINE_PATTERN.match(text_with_timestamp.strip())
 
-
-
-
+                start_timestamp = None
+                end_timestamp = None
+                text = text_with_timestamp.strip()
+
+                if start_match:
+                    groups = start_match.groups()
+                    # Parse timestamp - can be HH:MM:SS (groups 0,1,2) or MM:SS (groups 3,4)
+                    if groups[0] is not None:  # HH:MM:SS format
+                        start_timestamp = cls.parse_timestamp(groups[0], groups[1], groups[2])
+                    elif groups[3] is not None:  # MM:SS format
+                        start_timestamp = cls.parse_timestamp(groups[3], groups[4])
+                    text = groups[5]  # Text is after timestamp
+                elif end_match:
+                    groups = end_match.groups()
+                    text = groups[0]  # Text is before timestamp
                     # Parse timestamp - can be HH:MM:SS (groups 1,2,3) or MM:SS (groups 4,5)
                     if groups[1] is not None:  # HH:MM:SS format
-
+                        end_timestamp = cls.parse_timestamp(groups[1], groups[2], groups[3])
                     elif groups[4] is not None:  # MM:SS format
-
-                    else:
-                        timestamp = None
+                        end_timestamp = cls.parse_timestamp(groups[4], groups[5])
                 elif youtube_match:
                     groups = youtube_match.groups()
                     text = groups[0]
-                    # Extract seconds from URL parameter
+                    # Extract seconds from URL parameter (treat as end time)
                     url_seconds = groups[3]
-
-                else:
-                    text = text_with_timestamp.strip()
-                    timestamp = None
+                    end_timestamp = cls.parse_timestamp(url_seconds)
 
                 segments.append(
                     GeminiSegment(
                         text=text.strip(),
-                        timestamp=
+                        timestamp=start_timestamp,
+                        end_timestamp=end_timestamp,
                         speaker=current_speaker,
                         section=current_section,
                         segment_type="dialogue",
@@ -207,25 +231,50 @@ class GeminiReader:
                 current_speaker = None  # Reset speaker after use
                 continue
 
-            # Parse plain text with timestamp
-
+            # Parse plain text with timestamp (check both positions)
+            start_match = cls.INLINE_TIMESTAMP_START_PATTERN.match(line)
+            end_match = cls.INLINE_TIMESTAMP_END_PATTERN.match(line)
             youtube_inline_match = cls.YOUTUBE_INLINE_PATTERN.match(line)
 
-
-
-
+            start_timestamp = None
+            end_timestamp = None
+            text = None
+
+            if start_match:
+                groups = start_match.groups()
+                # Parse timestamp - can be HH:MM:SS (groups 0,1,2) or MM:SS (groups 3,4)
+                if groups[0] is not None:  # HH:MM:SS format
+                    start_timestamp = cls.parse_timestamp(groups[0], groups[1], groups[2])
+                elif groups[3] is not None:  # MM:SS format
+                    start_timestamp = cls.parse_timestamp(groups[3], groups[4])
+                text = groups[5]  # Text is after timestamp
+
+                segments.append(
+                    GeminiSegment(
+                        text=text.strip(),
+                        timestamp=start_timestamp,
+                        end_timestamp=None,
+                        speaker=current_speaker,
+                        section=current_section,
+                        segment_type="dialogue",
+                        line_number=line_num,
+                    )
+                )
+                continue
+            elif end_match:
+                groups = end_match.groups()
+                text = groups[0]  # Text is before timestamp
                 # Parse timestamp - can be HH:MM:SS (groups 1,2,3) or MM:SS (groups 4,5)
                 if groups[1] is not None:  # HH:MM:SS format
-
+                    end_timestamp = cls.parse_timestamp(groups[1], groups[2], groups[3])
                 elif groups[4] is not None:  # MM:SS format
-
-                else:
-                    timestamp = None
+                    end_timestamp = cls.parse_timestamp(groups[4], groups[5])
 
                 segments.append(
                     GeminiSegment(
                         text=text.strip(),
-                        timestamp=
+                        timestamp=None,
+                        end_timestamp=end_timestamp,
                         speaker=current_speaker,
                         section=current_section,
                         segment_type="dialogue",
@@ -236,14 +285,15 @@ class GeminiReader:
             elif youtube_inline_match:
                 groups = youtube_inline_match.groups()
                 text = groups[0]
-                # Extract seconds from URL parameter
+                # Extract seconds from URL parameter (treat as end time)
                 url_seconds = groups[3]
-
+                end_timestamp = cls.parse_timestamp(url_seconds)
 
                 segments.append(
                     GeminiSegment(
                         text=text.strip(),
-                        timestamp=
+                        timestamp=None,
+                        end_timestamp=end_timestamp,
                         speaker=current_speaker,
                         section=current_section,
                         segment_type="dialogue",
@@ -280,38 +330,79 @@ class GeminiReader:
        Returns:
            List of Supervision objects ready for alignment
        """
-       segments = cls.read(transcript_path, include_events=
+       segments = cls.read(transcript_path, include_events=True, include_sections=False)
 
-       # Filter to
-       dialogue_segments = [
+       # Filter to dialogue and event segments with timestamps (either start or end)
+       dialogue_segments = [
+           s
+           for s in segments
+           if s.segment_type in ("dialogue", "event") and (s.timestamp is not None or s.end_timestamp is not None)
+       ]
 
        if not dialogue_segments:
            raise ValueError(f"No dialogue segments with timestamps found in {transcript_path}")
 
-       # Sort by timestamp
-       dialogue_segments.sort(key=lambda x: x.timestamp)
+       # Sort by timestamp (use start time if available, otherwise end time)
+       dialogue_segments.sort(key=lambda x: x.timestamp if x.timestamp is not None else x.end_timestamp)
 
        # Convert to Supervision objects
        supervisions: List[Supervision] = []
+       prev_end_time = 0.0
 
        for i, segment in enumerate(dialogue_segments):
-
-
-
-
-
-
-
-
-
-
-
-           start
-
-
-
-
-
+           seg_start = None
+           seg_end = None
+
+           # Determine start and end times based on available timestamps
+           if segment.timestamp is not None:
+               # Has start time
+               seg_start = segment.timestamp
+               if segment.end_timestamp is not None:
+                   # Has both start and end
+                   seg_end = segment.end_timestamp
+               else:
+                   # Only has start, estimate end
+                   if i < len(dialogue_segments) - 1:
+                       # Use next segment's time
+                       next_seg = dialogue_segments[i + 1]
+                       if next_seg.timestamp is not None:
+                           seg_end = next_seg.timestamp
+                       elif next_seg.end_timestamp is not None:
+                           # Next has only end, estimate its start and use that
+                           words_next = len(next_seg.text.split())
+                           estimated_duration_next = words_next * 0.3
+                           seg_end = next_seg.end_timestamp - estimated_duration_next
+
+                   if seg_end is None:
+                       # Estimate based on text length
+                       words = len(segment.text.split())
+                       seg_end = seg_start + words * 0.3
+
+           elif segment.end_timestamp is not None:
+               # Only has end time, need to infer start
+               seg_end = segment.end_timestamp
+               # Use previous segment's end time as start, or estimate based on text
+               if prev_end_time > 0:
+                   seg_start = prev_end_time
+               else:
+                   # Estimate start based on text length
+                   words = len(segment.text.split())
+                   estimated_duration = words * 0.3
+                   seg_start = seg_end - estimated_duration
+
+           if seg_start is not None and seg_end is not None:
+               duration = max(seg_end - seg_start, min_duration)
+               if segment.segment_type == "dialogue":
+                   supervisions.append(
+                       Supervision(
+                           text=segment.text,
+                           start=seg_start,
+                           duration=duration,
+                           id=f"segment_{i:05d}",
+                           speaker=segment.speaker,
+                       )
+                   )
+               prev_end_time = seg_start + duration
 
        # Optionally merge consecutive segments from same speaker
        if merge_consecutive:
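The key behavioral change above is that a timestamp at the start of a line is now read as the segment's start time, while a timestamp at the end of a line is read as its end time; any missing boundary is later estimated at roughly 0.3 seconds per word. Below is a minimal standalone sketch of that classification, reusing the two regexes added in this diff; `to_seconds` and `classify` are illustrative helpers, not package APIs.

import re
from typing import Optional, Tuple

# Patterns copied from the GeminiReader class attributes added in this diff.
INLINE_TIMESTAMP_START_PATTERN = re.compile(r"^\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]\s*(.+)$")
INLINE_TIMESTAMP_END_PATTERN = re.compile(r"^(.+?)\s*\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]$")


def to_seconds(*parts: str) -> float:
    """Convert ("HH", "MM", "SS") or ("MM", "SS") strings to seconds."""
    seconds = 0.0
    for p in parts:
        seconds = seconds * 60 + int(p)
    return seconds


def classify(line: str) -> Tuple[Optional[float], Optional[float], str]:
    """Hypothetical helper: return (start, end, text) for one transcript line."""
    m = INLINE_TIMESTAMP_START_PATTERN.match(line)
    if m:
        g = m.groups()
        start = to_seconds(g[0], g[1], g[2]) if g[0] is not None else to_seconds(g[3], g[4])
        return start, None, g[5]
    m = INLINE_TIMESTAMP_END_PATTERN.match(line)
    if m:
        g = m.groups()
        end = to_seconds(g[1], g[2], g[3]) if g[1] is not None else to_seconds(g[4], g[5])
        return None, end, g[0]
    return None, None, line


print(classify("[01:02] Hello there"))     # (62.0, None, 'Hello there')  -> start time
print(classify("Hello there [00:01:05]"))  # (None, 65.0, 'Hello there')  -> end time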
lattifai/cli/diarization.py
CHANGED

@@ -8,7 +8,7 @@ import nemo_run as run
 from typing_extensions import Annotated
 
 from lattifai.client import LattifAI
-from lattifai.config import CaptionConfig, ClientConfig, DiarizationConfig, MediaConfig
+from lattifai.config import AlignmentConfig, CaptionConfig, ClientConfig, DiarizationConfig, MediaConfig
 from lattifai.utils import safe_print
 
 __all__ = ["diarize"]
@@ -22,6 +22,7 @@ def diarize(
     media: Annotated[Optional[MediaConfig], run.Config[MediaConfig]] = None,
     caption: Annotated[Optional[CaptionConfig], run.Config[CaptionConfig]] = None,
     client: Annotated[Optional[ClientConfig], run.Config[ClientConfig]] = None,
+    alignment: Annotated[Optional[AlignmentConfig], run.Config[AlignmentConfig]] = None,
     diarization: Annotated[Optional[DiarizationConfig], run.Config[DiarizationConfig]] = None,
 ):
     """Run speaker diarization on aligned captions and audio."""
@@ -53,6 +54,7 @@ def diarize(
 
     client_instance = LattifAI(
         client_config=client,
+        alignment_config=alignment,
         caption_config=caption_config,
         diarization_config=diarization_config,
     )
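With this change the diarize entry point can forward alignment settings to the client. A rough non-CLI equivalent of the wiring above; this assumes each config can be default-constructed, and the field value shown is only illustrative.

from lattifai.client import LattifAI
from lattifai.config import AlignmentConfig, CaptionConfig, ClientConfig, DiarizationConfig

client_instance = LattifAI(
    client_config=ClientConfig(),
    alignment_config=AlignmentConfig(device="cpu"),  # now forwarded by the diarize CLI
    caption_config=CaptionConfig(),
    diarization_config=DiarizationConfig(),
)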
lattifai/cli/transcribe.py
CHANGED

@@ -108,12 +108,7 @@ def transcribe(
     is_url = media_config.is_input_remote()
 
     # Prepare output paths
-
-        # For URLs, use output_dir from media_config or current directory
-        output_path = media_config.output_dir
-    else:
-        # For files, use input path directory
-        output_path = Path(media_config.input_path).parent
+    output_dir = media_config.output_dir or Path(media_config.input_path).parent
 
     # Create transcriber
     if not transcription_config.lattice_model_path:
@@ -140,7 +135,7 @@ def transcribe(
     input_path = asyncio.run(
         downloader.download_media(
             url=media_config.input_path,
-            output_dir=str(
+            output_dir=str(output_dir),
             media_format=media_config.normalize_format(),
             force_overwrite=media_config.force_overwrite,
         )
@@ -167,7 +162,7 @@ def transcribe(
     if is_url:
         # For URLs, generate output filename based on transcriber
        output_format = transcriber.file_suffix.lstrip(".")
-        final_output =
+        final_output = output_dir / f"youtube_LattifAI_{transcriber.name}.{output_format}"
     else:
         # For files, use input filename with suffix
         final_output = Path(media_config.input_path).with_suffix(".LattifAI.srt")
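The output-path branches collapse into a single fallback expression: when `media_config.output_dir` is unset, the parent directory of the input path is used, and for URL inputs the final filename is derived from the transcriber. A tiny illustration of the fallback and naming; the values are made up and `transcriber_name` stands in for `transcriber.name`.

from pathlib import Path

output_dir_cfg = None                        # media_config.output_dir not set
input_path = "/data/audio/episode.mp3"
output_dir = output_dir_cfg or Path(input_path).parent

transcriber_name, output_format = "gemini", "json"
final_output = output_dir / f"youtube_LattifAI_{transcriber_name}.{output_format}"
print(final_output)                          # /data/audio/youtube_LattifAI_gemini.json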
lattifai/cli/youtube.py
CHANGED

@@ -25,6 +25,7 @@ def youtube(
     caption: Annotated[Optional[CaptionConfig], run.Config[CaptionConfig]] = None,
     transcription: Annotated[Optional[TranscriptionConfig], run.Config[TranscriptionConfig]] = None,
     diarization: Annotated[Optional[DiarizationConfig], run.Config[DiarizationConfig]] = None,
+    use_transcription: bool = False,
 ):
     """
     Download media from YouTube (when needed) and align captions.
@@ -55,6 +56,11 @@ def youtube(
             Fields: gemini_api_key, model_name, language, device
         diarization: Speaker diarization configuration.
             Fields: enabled, num_speakers, min_speakers, max_speakers, device
+        use_transcription: If True, skip YouTube caption download and directly use
+            transcription.model_name to transcribe. If False (default), first try to
+            download YouTube captions; if download fails (no captions available or
+            errors like HTTP 429), automatically fallback to transcription if
+            transcription.model_name is configured.
 
     Examples:
         # Download from YouTube and align (positional argument)
@@ -108,7 +114,11 @@ def youtube(
         transcription_config=transcription,
         diarization_config=diarization,
     )
+
     # Call the client's youtube method
+    # If use_transcription=True, skip YouTube caption download and use transcription directly.
+    # If use_transcription=False (default), try YouTube captions first; on failure,
+    # automatically fallback to transcription if transcription.model_name is configured.
     return lattifai_client.youtube(
         url=media_config.input_path,
         output_dir=media_config.output_dir,
@@ -118,6 +128,7 @@ def youtube(
         split_sentence=caption_config.split_sentence,
         channel_selector=media_config.channel_selector,
         streaming_chunk_secs=media_config.streaming_chunk_secs,
+        use_transcription=use_transcription,
     )
 
 
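The new `use_transcription` flag changes where captions come from. The control flow described in the docstring, sketched as a standalone function; the names here are hypothetical and not the client's internals.

def choose_caption_source(use_transcription, download_captions, transcribe, model_configured):
    """Illustrative fallback order: transcribe directly, or try captions then fall back."""
    if use_transcription:
        return transcribe()            # skip YouTube caption download entirely
    try:
        return download_captions()     # try YouTube captions first
    except Exception:                  # e.g. no captions available, HTTP 429
        if model_configured:
            return transcribe()        # automatic fallback to transcription
        raise


caption = choose_caption_source(
    use_transcription=False,
    download_captions=lambda: "captions.vtt",
    transcribe=lambda: "gemini_transcript.md",
    model_configured=True,
)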
lattifai/client.py
CHANGED

@@ -56,6 +56,7 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
 
         # Initialize base API client
         super().__init__(config=client_config)
+        self.config = client_config
 
         # Initialize all configs with defaults
         alignment_config, transcription_config, diarization_config = self._init_configs(
@@ -125,38 +126,20 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
        safe_print(colorful.cyan(f"🔄 Using segmented alignment strategy: {alignment_strategy}"))
 
        if caption.supervisions and alignment_strategy == "transcription":
-           # raise NotImplementedError("Transcription-based alignment is not yet implemented.")
-           assert (
-               "gemini" not in self.transcriber.name.lower()
-           ), "Transcription-based alignment is not supported with Gemini transcriber."
-           assert (
-               caption.supervisions
-           ), "Input caption should contain supervisions when using transcription-based alignment."
            if not caption.transcription:
-
-
-
-
-
-
-
-
-
-               # print(colorful.cyan(f"Reading existing transcription from {transcript_file}"))
-               transcript = self._read_caption(transcript_file, verbose=False)
-               caption.transcription = transcript.supervisions
-               caption.audio_events = transcript.audio_events
-
-           if not caption.transcription:
-               transcript = asyncio.run(
-                   self.transcriber.transcribe(media_audio, language=self.caption_config.source_lang)
-               )
-               caption.transcription = transcript.transcription
-               caption.audio_events = transcript.audio_events
+               transcript = self._transcribe(
+                   media_audio,
+                   source_lang=self.caption_config.source_lang,
+                   is_async=False,
+                   output_dir=Path(str(output_caption_path)).parent if output_caption_path else None,
+               )
+               caption.transcription = transcript.supervisions or transcript.transcription
+               caption.audio_events = transcript.audio_events
+           assert caption.transcription, "Transcription is empty after transcription step."
 
            # Align caption.supervisions with transcription to get segments
            import regex
-           from error_align import
+           from error_align import error_align  # noqa: F401
            from error_align.utils import DELIMITERS, NUMERIC_TOKEN, STANDARD_TOKEN, OpType
 
            JOIN_TOKEN = "❄"
@@ -183,21 +166,82 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
                )
            )
 
-
-
-               f"{JOIN_TOKEN}".join(sup.text for sup in caption.transcription),
-               tokenizer=custom_tokenizer,
-           )
+           if split_sentence or self.caption_config.split_sentence:
+               caption.supervisions = self.aligner.tokenizer.split_sentences(caption.supervisions)
 
-           for
-
-
+           ref = f"{JOIN_TOKEN}".join(sup.text for sup in caption.supervisions)
+           hyp = f"{JOIN_TOKEN}".join(sup.text for sup in caption.transcription)
+           alignments = error_align(ref, hyp, tokenizer=custom_tokenizer)
 
-
-
-
-
-
+           idx = 0
+           for k, align in enumerate(alignments):
+               if align.hyp == JOIN_TOKEN and align.op_type == OpType.MATCH:
+                   # safe_print(f"Segment {k}: JOIN_TOKEN detected, creating segment.")
+
+                   # Find first non-None ref_slice starting from idx
+                   ref_start = 0
+                   for i in range(idx, k + 1):
+                       if i < len(alignments) and alignments[i].ref_slice is not None:
+                           ref_start = alignments[i].ref_slice.start
+                           break
+
+                   # Find last non-None ref_slice up to current position
+                   ref_stop = len(ref)
+                   for i in range(k, idx - 1, -1):
+                       if i < len(alignments) and alignments[i].ref_slice is not None:
+                           ref_stop = alignments[i].ref_slice.stop
+                           break
+
+                   # Find first non-None hyp_slice starting from idx
+                   hyp_start = 0
+                   for i in range(idx, k + 1):
+                       if i < len(alignments) and alignments[i].hyp_slice is not None:
+                           hyp_start = alignments[i].hyp_slice.start
+                           break
+
+                   # Find last non-None hyp_slice up to current position
+                   hyp_stop = len(hyp)
+                   for i in range(k, idx - 1, -1):
+                       if i < len(alignments) and alignments[i].hyp_slice is not None:
+                           hyp_stop = alignments[i].hyp_slice.stop
+                           break
+
+                   safe_print(f"[REF]: {ref[ref_start:ref_stop]}")
+                   safe_print(f"[HYP]: {hyp[hyp_start:hyp_stop]}\n")
+                   idx = k + 1
+
+           # last part - handle remaining alignments after last JOIN_TOKEN
+           if idx < len(alignments):
+               # Find first non-None ref_slice starting from idx
+               ref_start = 0
+               for i in range(idx, len(alignments)):
+                   if alignments[i].ref_slice is not None:
+                       ref_start = alignments[i].ref_slice.start
+                       break
+
+               # Find last non-None ref_slice from end
+               ref_stop = len(ref)
+               for i in range(len(alignments) - 1, idx - 1, -1):
+                   if alignments[i].ref_slice is not None:
+                       ref_stop = alignments[i].ref_slice.stop
+                       break
+
+               # Find first non-None hyp_slice starting from idx
+               hyp_start = 0
+               for i in range(idx, len(alignments)):
+                   if alignments[i].hyp_slice is not None:
+                       hyp_start = alignments[i].hyp_slice.start
+                       break
+
+               # Find last non-None hyp_slice from end
+               hyp_stop = len(hyp)
+               for i in range(len(alignments) - 1, idx - 1, -1):
+                   if alignments[i].hyp_slice is not None:
+                       hyp_stop = alignments[i].hyp_slice.stop
+                       break
+
+               safe_print(f"[REF]: {ref[ref_start:ref_stop + 1]}")
+               safe_print(f"[HYP]: {hyp[hyp_start:hyp_stop + 1]}\n")
 
            raise NotImplementedError("Transcription-based segmentation is not yet implemented.")
        else:
@@ -219,6 +263,7 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
            )
 
            # align each segment
+           sr = media_audio.sampling_rate
            supervisions, alignments = [], []
            for i, (start, end, _supervisions, skipalign) in enumerate(segments, 1):
                print(
@@ -233,10 +278,8 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
 
                offset = round(start, 4)
                # Extract audio slice
-
-
-               ]
-               emission = self.aligner.emission(audio_slice_ndarray)
+               audio_slice = media_audio.ndarray[:, int(start * sr) : int(end * sr)]
+               emission = self.aligner.emission(audio_slice)
 
                # Align segment
                _supervisions, _alignments = self.aligner.alignment(
@@ -269,6 +312,10 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
            if output_caption_path:
                self._write_caption(caption, output_caption_path)
 
+           # Profile if enabled
+           if self.config.profile:
+               self.aligner.profile()
+
        except (CaptionProcessingError, LatticeEncodingError, AlignmentError, LatticeDecodingError):
            # Re-raise our specific errors as-is
            raise
@@ -363,7 +410,9 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
        # Step 1: Download media
        media_file = self._download_media_sync(url, output_dir, media_format, force_overwrite)
 
-       media_audio = self.audio_loader(
+       media_audio = self.audio_loader(
+           media_file, channel_selector=channel_selector, streaming_chunk_secs=streaming_chunk_secs
+       )
 
        # Step 2: Get or create captions (download or transcribe)
        caption = self._download_or_transcribe_caption(
@@ -388,7 +437,7 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
            output_caption_path=output_caption_path,
            split_sentence=split_sentence,
            channel_selector=channel_selector,
-           streaming_chunk_secs=
+           streaming_chunk_secs=None,
        )
 
        return caption
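Segmented alignment now slices the loaded waveform directly by sample index, as in `media_audio.ndarray[:, int(start * sr) : int(end * sr)]` above. A small NumPy sketch of that indexing, assuming a (channels, samples) array; the sampling rate and durations are illustrative.

import numpy as np

sr = 16000                                        # sampling rate in Hz (assumed)
audio = np.zeros((1, 60 * sr), dtype=np.float32)  # 60 s of mono audio

start, end = 12.5, 17.25                          # segment boundaries in seconds
audio_slice = audio[:, int(start * sr): int(end * sr)]
print(audio_slice.shape)                          # (1, 76000), i.e. 4.75 s * 16000 samples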
lattifai/config/alignment.py
CHANGED

@@ -28,11 +28,11 @@ class AlignmentConfig:
     """Computation device: 'cpu' for CPU, 'cuda' for NVIDIA GPU, 'mps' for Apple Silicon."""
 
     batch_size: int = 1
-    """Batch size for inference (number of samples processed simultaneously)."""
+    """Batch size for inference (number of samples processed simultaneously, NotImplemented yet)."""
 
     # Segmented Alignment for Long Audio
     trust_caption_timestamps: bool = False
-    """When True, use original caption timestamps as strong reference constraints during alignment.
+    """When True, use original caption.supervisions' timestamps as strong reference constraints during alignment.
     The alignment process will still adjust timestamps but stay close to the input timing.
     Use this when you want to re-segment caption sentence boundaries (caption.split_sentence=True)
     while preserving the approximate timing from the original captions.
lattifai/config/client.py
CHANGED

@@ -26,6 +26,11 @@ class ClientConfig:
     default_headers: Optional[Dict[str, str]] = field(default=None)
     """Optional static headers to include in all requests."""
 
+    profile: bool = False
+    """Enable profiling of client operations tasks.
+    When True, prints detailed timing information for various stages of the process.
+    """
+
     def __post_init__(self):
         """Validate and auto-populate configuration after initialization."""
 
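Combined with the new `self.config = client_config` assignment in `client.py`, this flag gates a call to `self.aligner.profile()` after alignment. A hedged usage sketch; it assumes a bare `ClientConfig()` is otherwise valid in your environment (e.g. remaining fields auto-populated in `__post_init__`).

from lattifai.client import LattifAI
from lattifai.config import ClientConfig

client = LattifAI(client_config=ClientConfig(profile=True))
# After an alignment run, the client calls self.aligner.profile() and prints
# per-stage timing information.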