lattifai 1.2.0__py3-none-any.whl → 1.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/__init__.py +0 -24
- lattifai/alignment/lattice1_aligner.py +1 -1
- lattifai/alignment/lattice1_worker.py +1 -6
- lattifai/alignment/segmenter.py +1 -1
- lattifai/alignment/sentence_splitter.py +219 -0
- lattifai/alignment/tokenizer.py +10 -181
- lattifai/caption/caption.py +0 -2
- lattifai/caption/gemini_reader.py +151 -60
- lattifai/cli/transcribe.py +3 -8
- lattifai/client.py +91 -47
- lattifai/config/alignment.py +2 -2
- lattifai/mixin.py +10 -4
- lattifai/utils.py +74 -0
- {lattifai-1.2.0.dist-info → lattifai-1.2.1.dist-info}/METADATA +2 -1
- {lattifai-1.2.0.dist-info → lattifai-1.2.1.dist-info}/RECORD +19 -18
- {lattifai-1.2.0.dist-info → lattifai-1.2.1.dist-info}/WHEEL +0 -0
- {lattifai-1.2.0.dist-info → lattifai-1.2.1.dist-info}/entry_points.txt +0 -0
- {lattifai-1.2.0.dist-info → lattifai-1.2.1.dist-info}/licenses/LICENSE +0 -0
- {lattifai-1.2.0.dist-info → lattifai-1.2.1.dist-info}/top_level.txt +0 -0
lattifai/caption/gemini_reader.py
CHANGED

@@ -15,7 +15,8 @@ class GeminiSegment:
     """Represents a segment in the Gemini transcript with metadata."""
 
     text: str
-    timestamp: Optional[float] = None
+    timestamp: Optional[float] = None  # For backward compatibility (start time)
+    end_timestamp: Optional[float] = None  # End time when timestamp is at the end
     speaker: Optional[str] = None
     section: Optional[str] = None
     segment_type: str = "dialogue"  # 'dialogue', 'event', or 'section_header'
@@ -26,6 +27,11 @@ class GeminiSegment:
         """Return start time in seconds."""
         return self.timestamp if self.timestamp is not None else 0.0
 
+    @property
+    def end(self) -> Optional[float]:
+        """Return end time in seconds if available."""
+        return self.end_timestamp
+
 
 class GeminiReader:
     """Parser for YouTube transcript format with speaker labels and timestamps."""
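A segment can now carry a start time, an end time, or both. A minimal sketch (illustrative only, not package code; field and property names follow the hunks above) of how the two fields and the new end property behave:

# Illustrative sketch only - standalone dataclass mirroring the fields from the diff above.
from dataclasses import dataclass
from typing import Optional

@dataclass
class Segment:
    text: str
    timestamp: Optional[float] = None       # start time, kept for backward compatibility
    end_timestamp: Optional[float] = None   # end time when the timestamp sits at the end of the line

    @property
    def start(self) -> float:
        return self.timestamp if self.timestamp is not None else 0.0

    @property
    def end(self) -> Optional[float]:
        return self.end_timestamp

seg = Segment(text="Hello there", end_timestamp=12.0)
print(seg.start, seg.end)  # 0.0 12.0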
@@ -34,8 +40,12 @@ class GeminiReader:
     TIMESTAMP_PATTERN = re.compile(r"\[(\d{1,2}):(\d{2}):(\d{2})\]|\[(\d{1,2}):(\d{2})\]")
     SECTION_HEADER_PATTERN = re.compile(r"^##\s*\[(\d{1,2}):(\d{2}):(\d{2})\]\s*(.+)$")
    SPEAKER_PATTERN = re.compile(r"^\*\*(.+?[::])\*\*\s*(.+)$")
-
-
+    # Event pattern: [Event] [HH:MM:SS] or [Event] [MM:SS] - prioritize HH:MM:SS format
+    EVENT_PATTERN = re.compile(r"^\[([^\]]+)\]\s*\[(\d{1,2}):(\d{2})(?::(\d{2}))?\]$")
+    # Timestamp at the end indicates end time
+    INLINE_TIMESTAMP_END_PATTERN = re.compile(r"^(.+?)\s*\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]$")
+    # Timestamp at the beginning indicates start time
+    INLINE_TIMESTAMP_START_PATTERN = re.compile(r"^\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]\s*(.+)$")
 
     # New patterns for YouTube link format: [[MM:SS](URL&t=seconds)]
     YOUTUBE_SECTION_PATTERN = re.compile(r"^##\s*\[\[(\d{1,2}):(\d{2})\]\([^)]*&t=(\d+)\)\]\s*(.+)$")
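The two inline patterns differ only in where the bracketed time sits, which is what later decides whether the value is stored as a start or an end time. A small illustrative check (patterns copied from the hunk above; not package code):

import re

START = re.compile(r"^\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]\s*(.+)$")
END = re.compile(r"^(.+?)\s*\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]$")

m = START.match("[00:12] Hello there")
print(m.groups())  # (None, None, None, '00', '12', 'Hello there') -> MM:SS start time

m = END.match("Hello there [01:02:03]")
print(m.groups())  # ('Hello there', '01', '02', '03', None, None) -> HH:MM:SS end time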
@@ -144,18 +154,22 @@ class GeminiReader:
             if event_match:
                 groups = event_match.groups()
                 event_text = groups[0]
-                # Parse timestamp -
-
-
-
-
+                # Parse timestamp - groups: (event_text, hours/minutes, minutes/seconds, seconds_optional)
+                hours_or_minutes = groups[1]
+                minutes_or_seconds = groups[2]
+                seconds_optional = groups[3]
+
+                if seconds_optional is not None:
+                    # HH:MM:SS format
+                    timestamp = cls.parse_timestamp(hours_or_minutes, minutes_or_seconds, seconds_optional)
                 else:
-
+                    # MM:SS format
+                    timestamp = cls.parse_timestamp(hours_or_minutes, minutes_or_seconds)
 
                 if include_events and timestamp is not None:
                     segments.append(
                         GeminiSegment(
-                            text=event_text.strip(),
+                            text=f"[{event_text.strip()}]",
                             timestamp=timestamp,
                             section=current_section,
                             segment_type="event",
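The optional third capture group is what distinguishes the two clock formats. An illustrative run of the event pattern (regex copied from the diff; the seconds conversion below is a hypothetical stand-in for cls.parse_timestamp, whose exact signature is not shown here):

import re

EVENT = re.compile(r"^\[([^\]]+)\]\s*\[(\d{1,2}):(\d{2})(?::(\d{2}))?\]$")

def to_seconds(*parts: str) -> int:
    # Hypothetical helper standing in for GeminiReader.parse_timestamp
    vals = [int(p) for p in parts]
    if len(vals) == 3:           # HH:MM:SS
        h, m, s = vals
        return h * 3600 + m * 60 + s
    m, s = vals                  # MM:SS
    return m * 60 + s

for line in ("[Applause] [01:02:03]", "[Music] [04:05]"):
    text, a, b, maybe_s = EVENT.match(line).groups()
    ts = to_seconds(a, b, maybe_s) if maybe_s is not None else to_seconds(a, b)
    print(f"[{text}]", ts)       # events are stored with the text wrapped in brackets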
@@ -170,34 +184,44 @@ class GeminiReader:
                 speaker, text_with_timestamp = speaker_match.groups()
                 current_speaker = speaker.strip()
 
-                #
-
+                # Check for timestamp at the beginning (start time)
+                start_match = cls.INLINE_TIMESTAMP_START_PATTERN.match(text_with_timestamp.strip())
+                # Check for timestamp at the end (end time)
+                end_match = cls.INLINE_TIMESTAMP_END_PATTERN.match(text_with_timestamp.strip())
                 youtube_match = cls.YOUTUBE_INLINE_PATTERN.match(text_with_timestamp.strip())
 
-
-
-
+                start_timestamp = None
+                end_timestamp = None
+                text = text_with_timestamp.strip()
+
+                if start_match:
+                    groups = start_match.groups()
+                    # Parse timestamp - can be HH:MM:SS (groups 0,1,2) or MM:SS (groups 3,4)
+                    if groups[0] is not None:  # HH:MM:SS format
+                        start_timestamp = cls.parse_timestamp(groups[0], groups[1], groups[2])
+                    elif groups[3] is not None:  # MM:SS format
+                        start_timestamp = cls.parse_timestamp(groups[3], groups[4])
+                    text = groups[5]  # Text is after timestamp
+                elif end_match:
+                    groups = end_match.groups()
+                    text = groups[0]  # Text is before timestamp
                     # Parse timestamp - can be HH:MM:SS (groups 1,2,3) or MM:SS (groups 4,5)
                     if groups[1] is not None:  # HH:MM:SS format
-
+                        end_timestamp = cls.parse_timestamp(groups[1], groups[2], groups[3])
                     elif groups[4] is not None:  # MM:SS format
-
-                    else:
-                        timestamp = None
+                        end_timestamp = cls.parse_timestamp(groups[4], groups[5])
                 elif youtube_match:
                     groups = youtube_match.groups()
                     text = groups[0]
-                    # Extract seconds from URL parameter
+                    # Extract seconds from URL parameter (treat as end time)
                     url_seconds = groups[3]
-
-                else:
-                    text = text_with_timestamp.strip()
-                    timestamp = None
+                    end_timestamp = cls.parse_timestamp(url_seconds)
 
                 segments.append(
                     GeminiSegment(
                         text=text.strip(),
-                        timestamp=
+                        timestamp=start_timestamp,
+                        end_timestamp=end_timestamp,
                         speaker=current_speaker,
                         section=current_section,
                         segment_type="dialogue",
@@ -207,25 +231,50 @@ class GeminiReader:
                 current_speaker = None  # Reset speaker after use
                 continue
 
-            # Parse plain text with timestamp
-
+            # Parse plain text with timestamp (check both positions)
+            start_match = cls.INLINE_TIMESTAMP_START_PATTERN.match(line)
+            end_match = cls.INLINE_TIMESTAMP_END_PATTERN.match(line)
             youtube_inline_match = cls.YOUTUBE_INLINE_PATTERN.match(line)
 
-
-
-
+            start_timestamp = None
+            end_timestamp = None
+            text = None
+
+            if start_match:
+                groups = start_match.groups()
+                # Parse timestamp - can be HH:MM:SS (groups 0,1,2) or MM:SS (groups 3,4)
+                if groups[0] is not None:  # HH:MM:SS format
+                    start_timestamp = cls.parse_timestamp(groups[0], groups[1], groups[2])
+                elif groups[3] is not None:  # MM:SS format
+                    start_timestamp = cls.parse_timestamp(groups[3], groups[4])
+                text = groups[5]  # Text is after timestamp
+
+                segments.append(
+                    GeminiSegment(
+                        text=text.strip(),
+                        timestamp=start_timestamp,
+                        end_timestamp=None,
+                        speaker=current_speaker,
+                        section=current_section,
+                        segment_type="dialogue",
+                        line_number=line_num,
+                    )
+                )
+                continue
+            elif end_match:
+                groups = end_match.groups()
+                text = groups[0]  # Text is before timestamp
                 # Parse timestamp - can be HH:MM:SS (groups 1,2,3) or MM:SS (groups 4,5)
                 if groups[1] is not None:  # HH:MM:SS format
-
+                    end_timestamp = cls.parse_timestamp(groups[1], groups[2], groups[3])
                 elif groups[4] is not None:  # MM:SS format
-
-                else:
-                    timestamp = None
+                    end_timestamp = cls.parse_timestamp(groups[4], groups[5])
 
                 segments.append(
                     GeminiSegment(
                         text=text.strip(),
-                        timestamp=
+                        timestamp=None,
+                        end_timestamp=end_timestamp,
                        speaker=current_speaker,
                         section=current_section,
                         segment_type="dialogue",
@@ -236,14 +285,15 @@ class GeminiReader:
             elif youtube_inline_match:
                 groups = youtube_inline_match.groups()
                 text = groups[0]
-                # Extract seconds from URL parameter
+                # Extract seconds from URL parameter (treat as end time)
                 url_seconds = groups[3]
-
+                end_timestamp = cls.parse_timestamp(url_seconds)
 
                 segments.append(
                     GeminiSegment(
                         text=text.strip(),
-                        timestamp=
+                        timestamp=None,
+                        end_timestamp=end_timestamp,
                         speaker=current_speaker,
                         section=current_section,
                         segment_type="dialogue",
@@ -280,38 +330,79 @@ class GeminiReader:
         Returns:
             List of Supervision objects ready for alignment
         """
-        segments = cls.read(transcript_path, include_events=
+        segments = cls.read(transcript_path, include_events=True, include_sections=False)
 
-        # Filter to
-        dialogue_segments = [
+        # Filter to dialogue and event segments with timestamps (either start or end)
+        dialogue_segments = [
+            s
+            for s in segments
+            if s.segment_type in ("dialogue", "event") and (s.timestamp is not None or s.end_timestamp is not None)
+        ]
 
         if not dialogue_segments:
             raise ValueError(f"No dialogue segments with timestamps found in {transcript_path}")
 
-        # Sort by timestamp
-        dialogue_segments.sort(key=lambda x: x.timestamp)
+        # Sort by timestamp (use start time if available, otherwise end time)
+        dialogue_segments.sort(key=lambda x: x.timestamp if x.timestamp is not None else x.end_timestamp)
 
         # Convert to Supervision objects
         supervisions: List[Supervision] = []
+        prev_end_time = 0.0
 
         for i, segment in enumerate(dialogue_segments):
-
-
-
-
-
-
-
-
-
-
-            start
-
-
-
-
-
+            seg_start = None
+            seg_end = None
+
+            # Determine start and end times based on available timestamps
+            if segment.timestamp is not None:
+                # Has start time
+                seg_start = segment.timestamp
+                if segment.end_timestamp is not None:
+                    # Has both start and end
+                    seg_end = segment.end_timestamp
+                else:
+                    # Only has start, estimate end
+                    if i < len(dialogue_segments) - 1:
+                        # Use next segment's time
+                        next_seg = dialogue_segments[i + 1]
+                        if next_seg.timestamp is not None:
+                            seg_end = next_seg.timestamp
+                        elif next_seg.end_timestamp is not None:
+                            # Next has only end, estimate its start and use that
+                            words_next = len(next_seg.text.split())
+                            estimated_duration_next = words_next * 0.3
+                            seg_end = next_seg.end_timestamp - estimated_duration_next
+
+                    if seg_end is None:
+                        # Estimate based on text length
+                        words = len(segment.text.split())
+                        seg_end = seg_start + words * 0.3
+
+            elif segment.end_timestamp is not None:
+                # Only has end time, need to infer start
+                seg_end = segment.end_timestamp
+                # Use previous segment's end time as start, or estimate based on text
+                if prev_end_time > 0:
+                    seg_start = prev_end_time
+                else:
+                    # Estimate start based on text length
+                    words = len(segment.text.split())
+                    estimated_duration = words * 0.3
+                    seg_start = seg_end - estimated_duration
+
+            if seg_start is not None and seg_end is not None:
+                duration = max(seg_end - seg_start, min_duration)
+                if segment.segment_type == "dialogue":
+                    supervisions.append(
+                        Supervision(
+                            text=segment.text,
+                            start=seg_start,
+                            duration=duration,
+                            id=f"segment_{i:05d}",
+                            speaker=segment.speaker,
+                        )
+                    )
+                prev_end_time = seg_start + duration
 
         # Optionally merge consecutive segments from same speaker
         if merge_consecutive:
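When a segment carries only one of the two times, the missing one is estimated from neighbouring segments or from a rough speaking-rate heuristic of 0.3 seconds per word. A standalone illustration of that heuristic (illustrative only; the default min_duration value is an assumption):

def estimate_bounds(text, start=None, end=None, prev_end=0.0, min_duration=0.5):
    """Rough sketch of the start/end inference used above (0.3 s per word)."""
    words = len(text.split())
    approx = words * 0.3
    if start is not None and end is None:
        end = start + approx                                   # only start known: extend by estimated duration
    elif end is not None and start is None:
        start = prev_end if prev_end > 0 else end - approx     # only end known: back off from it
    duration = max(end - start, min_duration)
    return start, duration

print(estimate_bounds("three words here", start=10.0))             # (10.0, 0.9)
print(estimate_bounds("a much longer spoken sentence", end=42.0))  # (40.5, 1.5)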
lattifai/cli/transcribe.py
CHANGED

@@ -108,12 +108,7 @@ def transcribe(
     is_url = media_config.is_input_remote()
 
     # Prepare output paths
-
-        # For URLs, use output_dir from media_config or current directory
-        output_path = media_config.output_dir
-    else:
-        # For files, use input path directory
-        output_path = Path(media_config.input_path).parent
+    output_dir = media_config.output_dir or Path(media_config.input_path).parent
 
     # Create transcriber
     if not transcription_config.lattice_model_path:
@@ -140,7 +135,7 @@ def transcribe(
         input_path = asyncio.run(
             downloader.download_media(
                 url=media_config.input_path,
-                output_dir=str(
+                output_dir=str(output_dir),
                 media_format=media_config.normalize_format(),
                 force_overwrite=media_config.force_overwrite,
             )
@@ -167,7 +162,7 @@ def transcribe(
     if is_url:
         # For URLs, generate output filename based on transcriber
         output_format = transcriber.file_suffix.lstrip(".")
-        final_output =
+        final_output = output_dir / f"youtube_LattifAI_{transcriber.name}.{output_format}"
     else:
         # For files, use input filename with suffix
         final_output = Path(media_config.input_path).with_suffix(".LattifAI.srt")
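Both the download step and the final output filename now resolve against a single output_dir, which falls back to the input file's directory when no explicit output directory is configured. Schematically (paths below are made up):

from pathlib import Path

configured_output_dir = None                    # e.g. media_config.output_dir
input_path = "/data/podcasts/episode01.mp3"     # e.g. media_config.input_path

output_dir = configured_output_dir or Path(input_path).parent
print(output_dir)                               # /data/podcasts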
lattifai/client.py
CHANGED

@@ -126,38 +126,20 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
         safe_print(colorful.cyan(f"🔄 Using segmented alignment strategy: {alignment_strategy}"))
 
         if caption.supervisions and alignment_strategy == "transcription":
-            # raise NotImplementedError("Transcription-based alignment is not yet implemented.")
-            assert (
-                "gemini" not in self.transcriber.name.lower()
-            ), "Transcription-based alignment is not supported with Gemini transcriber."
-            assert (
-                caption.supervisions
-            ), "Input caption should contain supervisions when using transcription-based alignment."
             if not caption.transcription:
-
-
-
-
-
-
-
-
-
-                # print(colorful.cyan(f"Reading existing transcription from {transcript_file}"))
-                transcript = self._read_caption(transcript_file, verbose=False)
-                caption.transcription = transcript.supervisions
-                caption.audio_events = transcript.audio_events
-
-            if not caption.transcription:
-                transcript = asyncio.run(
-                    self.transcriber.transcribe(media_audio, language=self.caption_config.source_lang)
-                )
-                caption.transcription = transcript.transcription
-                caption.audio_events = transcript.audio_events
+                transcript = self._transcribe(
+                    media_audio,
+                    source_lang=self.caption_config.source_lang,
+                    is_async=False,
+                    output_dir=Path(str(output_caption_path)).parent if output_caption_path else None,
+                )
+                caption.transcription = transcript.supervisions or transcript.transcription
+                caption.audio_events = transcript.audio_events
+                assert caption.transcription, "Transcription is empty after transcription step."
 
             # Align caption.supervisions with transcription to get segments
             import regex
-            from error_align import
+            from error_align import error_align  # noqa: F401
             from error_align.utils import DELIMITERS, NUMERIC_TOKEN, STANDARD_TOKEN, OpType
 
             JOIN_TOKEN = "❄"
@@ -184,21 +166,82 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
                 )
             )
 
-
-
-                f"{JOIN_TOKEN}".join(sup.text for sup in caption.transcription),
-                tokenizer=custom_tokenizer,
-            )
+            if split_sentence or self.caption_config.split_sentence:
+                caption.supervisions = self.aligner.tokenizer.split_sentences(caption.supervisions)
 
-            for
-
-
+            ref = f"{JOIN_TOKEN}".join(sup.text for sup in caption.supervisions)
+            hyp = f"{JOIN_TOKEN}".join(sup.text for sup in caption.transcription)
+            alignments = error_align(ref, hyp, tokenizer=custom_tokenizer)
 
-
-
-
-
-
+            idx = 0
+            for k, align in enumerate(alignments):
+                if align.hyp == JOIN_TOKEN and align.op_type == OpType.MATCH:
+                    # safe_print(f"Segment {k}: JOIN_TOKEN detected, creating segment.")
+
+                    # Find first non-None ref_slice starting from idx
+                    ref_start = 0
+                    for i in range(idx, k + 1):
+                        if i < len(alignments) and alignments[i].ref_slice is not None:
+                            ref_start = alignments[i].ref_slice.start
+                            break
+
+                    # Find last non-None ref_slice up to current position
+                    ref_stop = len(ref)
+                    for i in range(k, idx - 1, -1):
+                        if i < len(alignments) and alignments[i].ref_slice is not None:
+                            ref_stop = alignments[i].ref_slice.stop
+                            break
+
+                    # Find first non-None hyp_slice starting from idx
+                    hyp_start = 0
+                    for i in range(idx, k + 1):
+                        if i < len(alignments) and alignments[i].hyp_slice is not None:
+                            hyp_start = alignments[i].hyp_slice.start
+                            break
+
+                    # Find last non-None hyp_slice up to current position
+                    hyp_stop = len(hyp)
+                    for i in range(k, idx - 1, -1):
+                        if i < len(alignments) and alignments[i].hyp_slice is not None:
+                            hyp_stop = alignments[i].hyp_slice.stop
+                            break
+
+                    safe_print(f"[REF]: {ref[ref_start:ref_stop]}")
+                    safe_print(f"[HYP]: {hyp[hyp_start:hyp_stop]}\n")
+                    idx = k + 1
+
+            # last part - handle remaining alignments after last JOIN_TOKEN
+            if idx < len(alignments):
+                # Find first non-None ref_slice starting from idx
+                ref_start = 0
+                for i in range(idx, len(alignments)):
+                    if alignments[i].ref_slice is not None:
+                        ref_start = alignments[i].ref_slice.start
+                        break
+
+                # Find last non-None ref_slice from end
+                ref_stop = len(ref)
+                for i in range(len(alignments) - 1, idx - 1, -1):
+                    if alignments[i].ref_slice is not None:
+                        ref_stop = alignments[i].ref_slice.stop
+                        break
+
+                # Find first non-None hyp_slice starting from idx
+                hyp_start = 0
+                for i in range(idx, len(alignments)):
+                    if alignments[i].hyp_slice is not None:
+                        hyp_start = alignments[i].hyp_slice.start
+                        break
+
+                # Find last non-None hyp_slice from end
+                hyp_stop = len(hyp)
+                for i in range(len(alignments) - 1, idx - 1, -1):
+                    if alignments[i].hyp_slice is not None:
+                        hyp_stop = alignments[i].hyp_slice.stop
+                        break
+
+                safe_print(f"[REF]: {ref[ref_start:ref_stop + 1]}")
+                safe_print(f"[HYP]: {hyp[hyp_start:hyp_stop + 1]}\n")
 
             raise NotImplementedError("Transcription-based segmentation is not yet implemented.")
         else:
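The four nearly identical scans above just locate the first and last non-empty slice inside an alignment window. A hedged illustration of that window scan, using a stand-in record with the same ref_slice/hyp_slice attributes the diff relies on (not the actual error_align objects):

from dataclasses import dataclass
from typing import Optional

@dataclass
class Align:                      # stand-in for the error_align alignment records used above
    ref_slice: Optional[slice]
    hyp_slice: Optional[slice]

def window_span(alignments, lo, hi, attr, default_stop):
    """Return (start, stop) of the first/last non-None slice in alignments[lo:hi+1]."""
    start, stop = 0, default_stop
    for a in alignments[lo:hi + 1]:
        s = getattr(a, attr)
        if s is not None:
            start = s.start
            break
    for a in reversed(alignments[lo:hi + 1]):
        s = getattr(a, attr)
        if s is not None:
            stop = s.stop
            break
    return start, stop

aligns = [Align(slice(0, 5), slice(0, 4)), Align(None, None), Align(slice(6, 11), slice(5, 9))]
print(window_span(aligns, 0, 2, "ref_slice", default_stop=11))  # (0, 11)
print(window_span(aligns, 0, 2, "hyp_slice", default_stop=9))   # (0, 9)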
@@ -220,6 +263,7 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
             )
 
         # align each segment
+        sr = media_audio.sampling_rate
         supervisions, alignments = [], []
         for i, (start, end, _supervisions, skipalign) in enumerate(segments, 1):
             print(
@@ -234,10 +278,8 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
 
             offset = round(start, 4)
             # Extract audio slice
-
-
-            ]
-            emission = self.aligner.emission(audio_slice_ndarray)
+            audio_slice = media_audio.ndarray[:, int(start * sr) : int(end * sr)]
+            emission = self.aligner.emission(audio_slice)
 
             # Align segment
             _supervisions, _alignments = self.aligner.alignment(
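The audio slice is now taken directly by sample index, converting the segment's start/end seconds with the sampling rate. A self-contained illustration (array shape and values are assumptions; the package's media_audio object is not used here):

import numpy as np

sr = 16000                               # sampling rate in Hz
audio = np.zeros((1, 10 * sr))           # 10 seconds of mono audio, shape (channels, samples)

start, end = 2.5, 4.0                    # segment boundaries in seconds
audio_slice = audio[:, int(start * sr):int(end * sr)]
print(audio_slice.shape)                 # (1, 24000) -> 1.5 s of samples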
@@ -368,7 +410,9 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
         # Step 1: Download media
         media_file = self._download_media_sync(url, output_dir, media_format, force_overwrite)
 
-        media_audio = self.audio_loader(
+        media_audio = self.audio_loader(
+            media_file, channel_selector=channel_selector, streaming_chunk_secs=streaming_chunk_secs
+        )
 
         # Step 2: Get or create captions (download or transcribe)
         caption = self._download_or_transcribe_caption(
@@ -393,7 +437,7 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
             output_caption_path=output_caption_path,
             split_sentence=split_sentence,
             channel_selector=channel_selector,
-            streaming_chunk_secs=
+            streaming_chunk_secs=None,
         )
 
         return caption
lattifai/config/alignment.py
CHANGED

@@ -28,11 +28,11 @@ class AlignmentConfig:
     """Computation device: 'cpu' for CPU, 'cuda' for NVIDIA GPU, 'mps' for Apple Silicon."""
 
     batch_size: int = 1
-    """Batch size for inference (number of samples processed simultaneously)."""
+    """Batch size for inference (number of samples processed simultaneously, NotImplemented yet)."""
 
     # Segmented Alignment for Long Audio
     trust_caption_timestamps: bool = False
-    """When True, use original caption timestamps as strong reference constraints during alignment.
+    """When True, use original caption.supervisions' timestamps as strong reference constraints during alignment.
     The alignment process will still adjust timestamps but stay close to the input timing.
     Use this when you want to re-segment caption sentence boundaries (caption.split_sentence=True)
     while preserving the approximate timing from the original captions.
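trust_caption_timestamps keeps the aligner anchored to the incoming caption timing while still allowing sentence re-segmentation. A hedged usage sketch (only the fields visible in this hunk; the import path is inferred from the file location and other fields/defaults are assumptions):

# Illustrative only - field names taken from the hunk above.
from lattifai.config.alignment import AlignmentConfig

config = AlignmentConfig(
    device="cpu",                    # 'cpu', 'cuda', or 'mps'
    batch_size=1,                    # batching is not implemented yet
    trust_caption_timestamps=True,   # stay close to the original caption timing
)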
lattifai/mixin.py
CHANGED

@@ -290,12 +290,12 @@ class LattifAIClientMixin:
         diarization_file = Path(str(input_caption)).with_suffix(".SpkDiar")
         if diarization_file.exists():
             if verbose:
-                safe_print(colorful.cyan(f"📖
+                safe_print(colorful.cyan(f"📖 Step1b: Reading speaker diarization from {diarization_file}"))
             caption.read_speaker_diarization(diarization_file)
         events_file = Path(str(input_caption)).with_suffix(".AED")
         if events_file.exists():
             if verbose:
-                safe_print(colorful.cyan(f"📖
+                safe_print(colorful.cyan(f"📖 Step1c: Reading audio events from {events_file}"))
             from tgt import read_textgrid
 
             caption.audio_events = read_textgrid(events_file)
@@ -404,6 +404,14 @@ class LattifAIClientMixin:
         # Transcription mode: use Transcriber to transcribe
         self._validate_transcription_setup()
 
+        if output_dir:
+            # Generate transcript file path
+            transcript_file = output_dir / f"{Path(str(media_file)).stem}_{self.transcriber.file_name}"
+            if transcript_file.exists():
+                safe_print(colorful.cyan(f" Using existing transcript file: {transcript_file}"))
+                transcription = self._read_caption(transcript_file, normalize_text=False)
+                return transcription
+
         safe_print(colorful.cyan(f"🎤 Transcribing({self.transcriber.name}) media: {str(media_file)} ..."))
         transcription = await self.transcriber.transcribe_file(media_file, language=source_lang)
         safe_print(colorful.green(" ✓ Transcription completed."))
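With this change the transcription step becomes idempotent per output directory: if a transcript file with the expected name already exists, it is read back instead of re-transcribing. A simplified sketch of that check (file naming copied from the hunk; the read/transcribe calls are placeholders, not the package API):

from pathlib import Path

def get_transcript(media_file: Path, output_dir: Path, transcriber_file_name: str) -> str:
    """Reuse an existing transcript if present, otherwise produce and save one (placeholder logic)."""
    transcript_file = output_dir / f"{media_file.stem}_{transcriber_file_name}"
    if transcript_file.exists():
        return transcript_file.read_text(encoding="utf-8")    # stand-in for _read_caption(...)
    text = "...run the transcriber here..."                   # stand-in for transcriber.transcribe_file(...)
    transcript_file.write_text(text, encoding="utf-8")
    return text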
@@ -442,8 +450,6 @@ class LattifAIClientMixin:
             safe_print(colorful.yellow(f"First segment: {transcription.transcription[0].text}"))
 
         if output_dir:
-            # Generate transcript file path
-            transcript_file = output_dir / f"{Path(str(media_file)).stem}_{self.transcriber.file_name}"
             await asyncio.to_thread(self.transcriber.write, transcription, transcript_file, encoding="utf-8")
             safe_print(colorful.green(f" ✓ Transcription saved to: {transcript_file}"))
 