lattifai 1.2.0__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -15,7 +15,8 @@ class GeminiSegment:
  """Represents a segment in the Gemini transcript with metadata."""

  text: str
- timestamp: Optional[float] = None
+ timestamp: Optional[float] = None # For backward compatibility (start time)
+ end_timestamp: Optional[float] = None # End time when timestamp is at the end
  speaker: Optional[str] = None
  section: Optional[str] = None
  segment_type: str = "dialogue" # 'dialogue', 'event', or 'section_header'
@@ -26,6 +27,11 @@ class GeminiSegment:
  """Return start time in seconds."""
  return self.timestamp if self.timestamp is not None else 0.0

+ @property
+ def end(self) -> Optional[float]:
+ """Return end time in seconds if available."""
+ return self.end_timestamp
+

  class GeminiReader:
  """Parser for YouTube transcript format with speaker labels and timestamps."""
@@ -34,8 +40,12 @@ class GeminiReader:
  TIMESTAMP_PATTERN = re.compile(r"\[(\d{1,2}):(\d{2}):(\d{2})\]|\[(\d{1,2}):(\d{2})\]")
  SECTION_HEADER_PATTERN = re.compile(r"^##\s*\[(\d{1,2}):(\d{2}):(\d{2})\]\s*(.+)$")
  SPEAKER_PATTERN = re.compile(r"^\*\*(.+?[::])\*\*\s*(.+)$")
- EVENT_PATTERN = re.compile(r"^\[([^\]]+)\]\s*\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]$")
- INLINE_TIMESTAMP_PATTERN = re.compile(r"^(.+?)\s*\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]$")
+ # Event pattern: [Event] [HH:MM:SS] or [Event] [MM:SS] - prioritize HH:MM:SS format
+ EVENT_PATTERN = re.compile(r"^\[([^\]]+)\]\s*\[(\d{1,2}):(\d{2})(?::(\d{2}))?\]$")
+ # Timestamp at the end indicates end time
+ INLINE_TIMESTAMP_END_PATTERN = re.compile(r"^(.+?)\s*\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]$")
+ # Timestamp at the beginning indicates start time
+ INLINE_TIMESTAMP_START_PATTERN = re.compile(r"^\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]\s*(.+)$")

  # New patterns for YouTube link format: [[MM:SS](URL&t=seconds)]
  YOUTUBE_SECTION_PATTERN = re.compile(r"^##\s*\[\[(\d{1,2}):(\d{2})\]\([^)]*&t=(\d+)\)\]\s*(.+)$")
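
The behavioral core of this hunk: a bracketed timestamp is now interpreted by position, a leading one as a start time and a trailing one as an end time, with the start-position check taking precedence in the parsing code further down. A quick illustration of how the two new inline patterns capture their groups (regexes copied from this hunk; the sample lines are invented):

```python
import re

INLINE_TIMESTAMP_END_PATTERN = re.compile(r"^(.+?)\s*\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]$")
INLINE_TIMESTAMP_START_PATTERN = re.compile(r"^\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]\s*(.+)$")

start_m = INLINE_TIMESTAMP_START_PATTERN.match("[00:12] Welcome back to the show")
end_m = INLINE_TIMESTAMP_END_PATTERN.match("Welcome back to the show [00:15]")

print(start_m.groups())  # (None, None, None, '00', '12', 'Welcome back to the show')
print(end_m.groups())    # ('Welcome back to the show', None, None, None, '00', '15')
```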
@@ -144,18 +154,22 @@ class GeminiReader:
  if event_match:
  groups = event_match.groups()
  event_text = groups[0]
- # Parse timestamp - can be HH:MM:SS (groups 1,2,3) or MM:SS (groups 4,5)
- if groups[1] is not None: # HH:MM:SS format
- timestamp = cls.parse_timestamp(groups[1], groups[2], groups[3])
- elif groups[4] is not None: # MM:SS format
- timestamp = cls.parse_timestamp(groups[4], groups[5])
+ # Parse timestamp - groups: (event_text, hours/minutes, minutes/seconds, seconds_optional)
+ hours_or_minutes = groups[1]
+ minutes_or_seconds = groups[2]
+ seconds_optional = groups[3]
+
+ if seconds_optional is not None:
+ # HH:MM:SS format
+ timestamp = cls.parse_timestamp(hours_or_minutes, minutes_or_seconds, seconds_optional)
  else:
- timestamp = None
+ # MM:SS format
+ timestamp = cls.parse_timestamp(hours_or_minutes, minutes_or_seconds)

  if include_events and timestamp is not None:
  segments.append(
  GeminiSegment(
- text=event_text.strip(),
+ text=f"[{event_text.strip()}]",
  timestamp=timestamp,
  section=current_section,
  segment_type="event",
@@ -170,34 +184,44 @@ class GeminiReader:
  speaker, text_with_timestamp = speaker_match.groups()
  current_speaker = speaker.strip()

- # Extract timestamp from the end of the text
- timestamp_match = cls.INLINE_TIMESTAMP_PATTERN.match(text_with_timestamp.strip())
+ # Check for timestamp at the beginning (start time)
+ start_match = cls.INLINE_TIMESTAMP_START_PATTERN.match(text_with_timestamp.strip())
+ # Check for timestamp at the end (end time)
+ end_match = cls.INLINE_TIMESTAMP_END_PATTERN.match(text_with_timestamp.strip())
  youtube_match = cls.YOUTUBE_INLINE_PATTERN.match(text_with_timestamp.strip())

- if timestamp_match:
- groups = timestamp_match.groups()
- text = groups[0]
+ start_timestamp = None
+ end_timestamp = None
+ text = text_with_timestamp.strip()
+
+ if start_match:
+ groups = start_match.groups()
+ # Parse timestamp - can be HH:MM:SS (groups 0,1,2) or MM:SS (groups 3,4)
+ if groups[0] is not None: # HH:MM:SS format
+ start_timestamp = cls.parse_timestamp(groups[0], groups[1], groups[2])
+ elif groups[3] is not None: # MM:SS format
+ start_timestamp = cls.parse_timestamp(groups[3], groups[4])
+ text = groups[5] # Text is after timestamp
+ elif end_match:
+ groups = end_match.groups()
+ text = groups[0] # Text is before timestamp
  # Parse timestamp - can be HH:MM:SS (groups 1,2,3) or MM:SS (groups 4,5)
  if groups[1] is not None: # HH:MM:SS format
- timestamp = cls.parse_timestamp(groups[1], groups[2], groups[3])
+ end_timestamp = cls.parse_timestamp(groups[1], groups[2], groups[3])
  elif groups[4] is not None: # MM:SS format
- timestamp = cls.parse_timestamp(groups[4], groups[5])
- else:
- timestamp = None
+ end_timestamp = cls.parse_timestamp(groups[4], groups[5])
  elif youtube_match:
  groups = youtube_match.groups()
  text = groups[0]
- # Extract seconds from URL parameter
+ # Extract seconds from URL parameter (treat as end time)
  url_seconds = groups[3]
- timestamp = cls.parse_timestamp(url_seconds)
- else:
- text = text_with_timestamp.strip()
- timestamp = None
+ end_timestamp = cls.parse_timestamp(url_seconds)

  segments.append(
  GeminiSegment(
  text=text.strip(),
- timestamp=timestamp,
+ timestamp=start_timestamp,
+ end_timestamp=end_timestamp,
  speaker=current_speaker,
  section=current_section,
  segment_type="dialogue",
@@ -207,25 +231,50 @@ class GeminiReader:
  current_speaker = None # Reset speaker after use
  continue

- # Parse plain text with timestamp at the end
- inline_match = cls.INLINE_TIMESTAMP_PATTERN.match(line)
+ # Parse plain text with timestamp (check both positions)
+ start_match = cls.INLINE_TIMESTAMP_START_PATTERN.match(line)
+ end_match = cls.INLINE_TIMESTAMP_END_PATTERN.match(line)
  youtube_inline_match = cls.YOUTUBE_INLINE_PATTERN.match(line)

- if inline_match:
- groups = inline_match.groups()
- text = groups[0]
+ start_timestamp = None
+ end_timestamp = None
+ text = None
+
+ if start_match:
+ groups = start_match.groups()
+ # Parse timestamp - can be HH:MM:SS (groups 0,1,2) or MM:SS (groups 3,4)
+ if groups[0] is not None: # HH:MM:SS format
+ start_timestamp = cls.parse_timestamp(groups[0], groups[1], groups[2])
+ elif groups[3] is not None: # MM:SS format
+ start_timestamp = cls.parse_timestamp(groups[3], groups[4])
+ text = groups[5] # Text is after timestamp
+
+ segments.append(
+ GeminiSegment(
+ text=text.strip(),
+ timestamp=start_timestamp,
+ end_timestamp=None,
+ speaker=current_speaker,
+ section=current_section,
+ segment_type="dialogue",
+ line_number=line_num,
+ )
+ )
+ continue
+ elif end_match:
+ groups = end_match.groups()
+ text = groups[0] # Text is before timestamp
  # Parse timestamp - can be HH:MM:SS (groups 1,2,3) or MM:SS (groups 4,5)
  if groups[1] is not None: # HH:MM:SS format
- timestamp = cls.parse_timestamp(groups[1], groups[2], groups[3])
+ end_timestamp = cls.parse_timestamp(groups[1], groups[2], groups[3])
  elif groups[4] is not None: # MM:SS format
- timestamp = cls.parse_timestamp(groups[4], groups[5])
- else:
- timestamp = None
+ end_timestamp = cls.parse_timestamp(groups[4], groups[5])

  segments.append(
  GeminiSegment(
  text=text.strip(),
- timestamp=timestamp,
+ timestamp=None,
+ end_timestamp=end_timestamp,
  speaker=current_speaker,
  section=current_section,
  segment_type="dialogue",
@@ -236,14 +285,15 @@ class GeminiReader:
  elif youtube_inline_match:
  groups = youtube_inline_match.groups()
  text = groups[0]
- # Extract seconds from URL parameter
+ # Extract seconds from URL parameter (treat as end time)
  url_seconds = groups[3]
- timestamp = cls.parse_timestamp(url_seconds)
+ end_timestamp = cls.parse_timestamp(url_seconds)

  segments.append(
  GeminiSegment(
  text=text.strip(),
- timestamp=timestamp,
+ timestamp=None,
+ end_timestamp=end_timestamp,
  speaker=current_speaker,
  section=current_section,
  segment_type="dialogue",
@@ -280,38 +330,79 @@ class GeminiReader:
  Returns:
  List of Supervision objects ready for alignment
  """
- segments = cls.read(transcript_path, include_events=False, include_sections=False)
+ segments = cls.read(transcript_path, include_events=True, include_sections=False)

- # Filter to only dialogue segments with timestamps
- dialogue_segments = [s for s in segments if s.segment_type == "dialogue" and s.timestamp is not None]
+ # Filter to dialogue and event segments with timestamps (either start or end)
+ dialogue_segments = [
+ s
+ for s in segments
+ if s.segment_type in ("dialogue", "event") and (s.timestamp is not None or s.end_timestamp is not None)
+ ]

  if not dialogue_segments:
  raise ValueError(f"No dialogue segments with timestamps found in {transcript_path}")

- # Sort by timestamp
- dialogue_segments.sort(key=lambda x: x.timestamp)
+ # Sort by timestamp (use start time if available, otherwise end time)
+ dialogue_segments.sort(key=lambda x: x.timestamp if x.timestamp is not None else x.end_timestamp)

  # Convert to Supervision objects
  supervisions: List[Supervision] = []
+ prev_end_time = 0.0

  for i, segment in enumerate(dialogue_segments):
- # Estimate duration based on next segment
- if i < len(dialogue_segments) - 1:
- duration = dialogue_segments[i + 1].timestamp - segment.timestamp
- else:
- # Last segment: estimate based on text length (rough heuristic)
- words = len(segment.text.split())
- duration = words * 0.3 # ~0.3 seconds per word
-
- supervisions.append(
- Supervision(
- text=segment.text,
- start=segment.timestamp,
- duration=max(duration, min_duration),
- id=f"segment_{i:05d}",
- speaker=segment.speaker,
- )
- )
+ seg_start = None
+ seg_end = None
+
+ # Determine start and end times based on available timestamps
+ if segment.timestamp is not None:
+ # Has start time
+ seg_start = segment.timestamp
+ if segment.end_timestamp is not None:
+ # Has both start and end
+ seg_end = segment.end_timestamp
+ else:
+ # Only has start, estimate end
+ if i < len(dialogue_segments) - 1:
+ # Use next segment's time
+ next_seg = dialogue_segments[i + 1]
+ if next_seg.timestamp is not None:
+ seg_end = next_seg.timestamp
+ elif next_seg.end_timestamp is not None:
+ # Next has only end, estimate its start and use that
+ words_next = len(next_seg.text.split())
+ estimated_duration_next = words_next * 0.3
+ seg_end = next_seg.end_timestamp - estimated_duration_next
+
+ if seg_end is None:
+ # Estimate based on text length
+ words = len(segment.text.split())
+ seg_end = seg_start + words * 0.3
+
+ elif segment.end_timestamp is not None:
+ # Only has end time, need to infer start
+ seg_end = segment.end_timestamp
+ # Use previous segment's end time as start, or estimate based on text
+ if prev_end_time > 0:
+ seg_start = prev_end_time
+ else:
+ # Estimate start based on text length
+ words = len(segment.text.split())
+ estimated_duration = words * 0.3
+ seg_start = seg_end - estimated_duration
+
+ if seg_start is not None and seg_end is not None:
+ duration = max(seg_end - seg_start, min_duration)
+ if segment.segment_type == "dialogue":
+ supervisions.append(
+ Supervision(
+ text=segment.text,
+ start=seg_start,
+ duration=duration,
+ id=f"segment_{i:05d}",
+ speaker=segment.speaker,
+ )
+ )
+ prev_end_time = seg_start + duration

  # Optionally merge consecutive segments from same speaker
  if merge_consecutive:
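
The new branching in this hunk reduces to three rules: a start timestamp is used as-is; a missing end is borrowed from the next segment's start or estimated at roughly 0.3 seconds per word; a segment with only an end timestamp starts where the previous one stopped, or falls back to the same word-count estimate. A condensed, standalone sketch of those rules (not the library method; the tuple format and `SECONDS_PER_WORD` name are illustrative only):

```python
from typing import List, Optional, Tuple

SECONDS_PER_WORD = 0.3  # same rough per-word heuristic the reader uses

def infer_spans(
    segments: List[Tuple[str, Optional[float], Optional[float]]],
    min_duration: float = 0.5,
) -> List[Tuple[float, float]]:
    """Sketch of the start/end inference: (text, start, end) -> (start, duration)."""
    spans = []
    prev_end = 0.0
    for i, (text, start, end) in enumerate(segments):
        estimate = len(text.split()) * SECONDS_PER_WORD
        if start is not None:
            if end is None:
                # Borrow the next segment's start when it has one, else estimate from text length.
                nxt = segments[i + 1] if i + 1 < len(segments) else None
                end = nxt[1] if nxt is not None and nxt[1] is not None else start + estimate
        elif end is not None:
            # Only an end time: start where the previous segment stopped, or back off by the estimate.
            start = prev_end if prev_end > 0 else end - estimate
        else:
            continue  # no timing information at all
        duration = max(end - start, min_duration)
        spans.append((round(start, 2), round(duration, 2)))
        prev_end = start + duration
    return spans

print(infer_spans([("Hello and welcome", 0.0, None), ("Thanks for joining us today", None, 6.0)]))
# [(0.0, 0.9), (0.9, 5.1)]
```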
@@ -108,12 +108,7 @@ def transcribe(
  is_url = media_config.is_input_remote()

  # Prepare output paths
- if is_url:
- # For URLs, use output_dir from media_config or current directory
- output_path = media_config.output_dir
- else:
- # For files, use input path directory
- output_path = Path(media_config.input_path).parent
+ output_dir = media_config.output_dir or Path(media_config.input_path).parent

  # Create transcriber
  if not transcription_config.lattice_model_path:
@@ -140,7 +135,7 @@ def transcribe(
  input_path = asyncio.run(
  downloader.download_media(
  url=media_config.input_path,
- output_dir=str(output_path),
+ output_dir=str(output_dir),
  media_format=media_config.normalize_format(),
  force_overwrite=media_config.force_overwrite,
  )
@@ -167,7 +162,7 @@ def transcribe(
  if is_url:
  # For URLs, generate output filename based on transcriber
  output_format = transcriber.file_suffix.lstrip(".")
- final_output = output_path / f"youtube_LattifAI_{transcriber.name}.{output_format}"
+ final_output = output_dir / f"youtube_LattifAI_{transcriber.name}.{output_format}"
  else:
  # For files, use input filename with suffix
  final_output = Path(media_config.input_path).with_suffix(".LattifAI.srt")
lattifai/client.py CHANGED
@@ -126,38 +126,20 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
  safe_print(colorful.cyan(f"🔄 Using segmented alignment strategy: {alignment_strategy}"))

  if caption.supervisions and alignment_strategy == "transcription":
- # raise NotImplementedError("Transcription-based alignment is not yet implemented.")
- assert (
- "gemini" not in self.transcriber.name.lower()
- ), "Transcription-based alignment is not supported with Gemini transcriber."
- assert (
- caption.supervisions
- ), "Input caption should contain supervisions when using transcription-based alignment."
  if not caption.transcription:
- import asyncio
-
- safe_print(colorful.cyan("📝 Transcribing media for alignment..."))
- if output_caption_path:
- transcript_file = (
- Path(str(output_caption_path)).parent
- / f"{Path(str(media_audio)).stem}_{self.transcriber.file_name}"
- )
- if transcript_file.exists():
- # print(colorful.cyan(f"Reading existing transcription from {transcript_file}"))
- transcript = self._read_caption(transcript_file, verbose=False)
- caption.transcription = transcript.supervisions
- caption.audio_events = transcript.audio_events
-
- if not caption.transcription:
- transcript = asyncio.run(
- self.transcriber.transcribe(media_audio, language=self.caption_config.source_lang)
- )
- caption.transcription = transcript.transcription
- caption.audio_events = transcript.audio_events
+ transcript = self._transcribe(
+ media_audio,
+ source_lang=self.caption_config.source_lang,
+ is_async=False,
+ output_dir=Path(str(output_caption_path)).parent if output_caption_path else None,
+ )
+ caption.transcription = transcript.supervisions or transcript.transcription
+ caption.audio_events = transcript.audio_events
+ assert caption.transcription, "Transcription is empty after transcription step."

  # Align caption.supervisions with transcription to get segments
  import regex
- from error_align import ErrorAlign, error_align # noqa: F401
+ from error_align import error_align # noqa: F401
  from error_align.utils import DELIMITERS, NUMERIC_TOKEN, STANDARD_TOKEN, OpType

  JOIN_TOKEN = "❄"
@@ -184,21 +166,82 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
  )
  )

- alignments = error_align(
- f"{JOIN_TOKEN}".join(sup.text for sup in caption.supervisions),
- f"{JOIN_TOKEN}".join(sup.text for sup in caption.transcription),
- tokenizer=custom_tokenizer,
- )
+ if split_sentence or self.caption_config.split_sentence:
+ caption.supervisions = self.aligner.tokenizer.split_sentences(caption.supervisions)

- for align in alignments:
- if align.hyp == JOIN_TOKEN and align.op_type == OpType.MATCH:
- pass
+ ref = f"{JOIN_TOKEN}".join(sup.text for sup in caption.supervisions)
+ hyp = f"{JOIN_TOKEN}".join(sup.text for sup in caption.transcription)
+ alignments = error_align(ref, hyp, tokenizer=custom_tokenizer)

- # if align.op_type == OpType.MATCH:
- # continue
- # elif align.op_type in (OpType.INSERT, OpType.DELETE, OpType.SUBSTITUTE):
- # # print(colorful.yellow(f"⚠️ Alignment warning: {op}"))
- # pass
+ idx = 0
+ for k, align in enumerate(alignments):
+ if align.hyp == JOIN_TOKEN and align.op_type == OpType.MATCH:
+ # safe_print(f"Segment {k}: JOIN_TOKEN detected, creating segment.")
+
+ # Find first non-None ref_slice starting from idx
+ ref_start = 0
+ for i in range(idx, k + 1):
+ if i < len(alignments) and alignments[i].ref_slice is not None:
+ ref_start = alignments[i].ref_slice.start
+ break
+
+ # Find last non-None ref_slice up to current position
+ ref_stop = len(ref)
+ for i in range(k, idx - 1, -1):
+ if i < len(alignments) and alignments[i].ref_slice is not None:
+ ref_stop = alignments[i].ref_slice.stop
+ break
+
+ # Find first non-None hyp_slice starting from idx
+ hyp_start = 0
+ for i in range(idx, k + 1):
+ if i < len(alignments) and alignments[i].hyp_slice is not None:
+ hyp_start = alignments[i].hyp_slice.start
+ break
+
+ # Find last non-None hyp_slice up to current position
+ hyp_stop = len(hyp)
+ for i in range(k, idx - 1, -1):
+ if i < len(alignments) and alignments[i].hyp_slice is not None:
+ hyp_stop = alignments[i].hyp_slice.stop
+ break
+
+ safe_print(f"[REF]: {ref[ref_start:ref_stop]}")
+ safe_print(f"[HYP]: {hyp[hyp_start:hyp_stop]}\n")
+ idx = k + 1
+
+ # last part - handle remaining alignments after last JOIN_TOKEN
+ if idx < len(alignments):
+ # Find first non-None ref_slice starting from idx
+ ref_start = 0
+ for i in range(idx, len(alignments)):
+ if alignments[i].ref_slice is not None:
+ ref_start = alignments[i].ref_slice.start
+ break
+
+ # Find last non-None ref_slice from end
+ ref_stop = len(ref)
+ for i in range(len(alignments) - 1, idx - 1, -1):
+ if alignments[i].ref_slice is not None:
+ ref_stop = alignments[i].ref_slice.stop
+ break
+
+ # Find first non-None hyp_slice starting from idx
+ hyp_start = 0
+ for i in range(idx, len(alignments)):
+ if alignments[i].hyp_slice is not None:
+ hyp_start = alignments[i].hyp_slice.start
+ break
+
+ # Find last non-None hyp_slice from end
+ hyp_stop = len(hyp)
+ for i in range(len(alignments) - 1, idx - 1, -1):
+ if alignments[i].hyp_slice is not None:
+ hyp_stop = alignments[i].hyp_slice.stop
+ break
+
+ safe_print(f"[REF]: {ref[ref_start:ref_stop + 1]}")
+ safe_print(f"[HYP]: {hyp[hyp_start:hyp_stop + 1]}\n")

  raise NotImplementedError("Transcription-based segmentation is not yet implemented.")
  else:
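
What this debug output exercises is a boundary-recovery trick: the reference supervisions and the transcription are each joined with a rare sentinel character, the two strings are aligned, and every position where the sentinel matches on both sides marks a segment boundary, so the hypothesis text between boundaries can be attributed to one reference supervision (the branch still ends in NotImplementedError, so this is groundwork rather than a finished feature). A rough illustration of the idea, using difflib as a stand-in aligner rather than error_align, whose slice-based results the code above consumes:

```python
import difflib

JOIN = "❄"  # rare sentinel, same role as JOIN_TOKEN above
ref_caps = ["hello and welcome back", "today we talk about alignment"]
hyp_segs = ["hello and welcome bak", "today we talk about the alignment"]

ref, hyp = JOIN.join(ref_caps), JOIN.join(hyp_segs)

# Collect (ref_pos, hyp_pos) pairs where the sentinel is matched on both sides.
matcher = difflib.SequenceMatcher(a=ref, b=hyp, autojunk=False)
boundaries = []
for i, j, n in matcher.get_matching_blocks():
    for k in range(n):
        if ref[i + k] == JOIN:
            boundaries.append((i + k, j + k))

# Print the reference/hypothesis text between consecutive boundaries.
prev_r = prev_h = 0
for r, h in boundaries + [(len(ref), len(hyp))]:
    print("[REF]:", ref[prev_r:r])
    print("[HYP]:", hyp[prev_h:h])
    prev_r, prev_h = r + 1, h + 1
```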
@@ -220,6 +263,7 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
  )

  # align each segment
+ sr = media_audio.sampling_rate
  supervisions, alignments = [], []
  for i, (start, end, _supervisions, skipalign) in enumerate(segments, 1):
  print(
@@ -234,10 +278,8 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):

  offset = round(start, 4)
  # Extract audio slice
- audio_slice_ndarray = media_audio.ndarray[
- :, int(start * media_audio.sampling_rate) : int(end * media_audio.sampling_rate)
- ]
- emission = self.aligner.emission(audio_slice_ndarray)
+ audio_slice = media_audio.ndarray[:, int(start * sr) : int(end * sr)]
+ emission = self.aligner.emission(audio_slice)
  # Align segment

  _supervisions, _alignments = self.aligner.alignment(
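
The refactor only hoists the sampling rate into a local `sr`; the slice itself converts second offsets into sample indices on a (channels, samples) array. For reference, with an assumed 16 kHz mono buffer (numpy used purely for illustration):

```python
import numpy as np

sr = 16000  # assumed sampling rate
audio = np.zeros((1, 10 * sr), dtype=np.float32)  # 10 s of mono audio, shape (channels, samples)

start, end = 2.5, 4.0  # segment boundaries in seconds
audio_slice = audio[:, int(start * sr) : int(end * sr)]
print(audio_slice.shape)  # (1, 24000) -> 1.5 s worth of samples
```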
@@ -368,7 +410,9 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
  # Step 1: Download media
  media_file = self._download_media_sync(url, output_dir, media_format, force_overwrite)

- media_audio = self.audio_loader(media_file, channel_selector=channel_selector)
+ media_audio = self.audio_loader(
+ media_file, channel_selector=channel_selector, streaming_chunk_secs=streaming_chunk_secs
+ )

  # Step 2: Get or create captions (download or transcribe)
  caption = self._download_or_transcribe_caption(
@@ -393,7 +437,7 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
  output_caption_path=output_caption_path,
  split_sentence=split_sentence,
  channel_selector=channel_selector,
- streaming_chunk_secs=streaming_chunk_secs,
+ streaming_chunk_secs=None,
  )

  return caption
@@ -28,11 +28,11 @@ class AlignmentConfig:
  """Computation device: 'cpu' for CPU, 'cuda' for NVIDIA GPU, 'mps' for Apple Silicon."""

  batch_size: int = 1
- """Batch size for inference (number of samples processed simultaneously)."""
+ """Batch size for inference (number of samples processed simultaneously, NotImplemented yet)."""

  # Segmented Alignment for Long Audio
  trust_caption_timestamps: bool = False
- """When True, use original caption timestamps as strong reference constraints during alignment.
+ """When True, use original caption.supervisions' timestamps as strong reference constraints during alignment.
  The alignment process will still adjust timestamps but stay close to the input timing.
  Use this when you want to re-segment caption sentence boundaries (caption.split_sentence=True)
  while preserving the approximate timing from the original captions.
lattifai/mixin.py CHANGED
@@ -290,12 +290,12 @@ class LattifAIClientMixin:
  diarization_file = Path(str(input_caption)).with_suffix(".SpkDiar")
  if diarization_file.exists():
  if verbose:
- safe_print(colorful.cyan(f"📖 Step 1b: Reading speaker diarization from {diarization_file}"))
+ safe_print(colorful.cyan(f"📖 Step1b: Reading speaker diarization from {diarization_file}"))
  caption.read_speaker_diarization(diarization_file)
  events_file = Path(str(input_caption)).with_suffix(".AED")
  if events_file.exists():
  if verbose:
- safe_print(colorful.cyan(f"📖 Step 1c: Reading audio events from {events_file}"))
+ safe_print(colorful.cyan(f"📖 Step1c: Reading audio events from {events_file}"))
  from tgt import read_textgrid

  caption.audio_events = read_textgrid(events_file)
@@ -404,6 +404,14 @@ class LattifAIClientMixin:
  # Transcription mode: use Transcriber to transcribe
  self._validate_transcription_setup()

+ if output_dir:
+ # Generate transcript file path
+ transcript_file = output_dir / f"{Path(str(media_file)).stem}_{self.transcriber.file_name}"
+ if transcript_file.exists():
+ safe_print(colorful.cyan(f" Using existing transcript file: {transcript_file}"))
+ transcription = self._read_caption(transcript_file, normalize_text=False)
+ return transcription
+
  safe_print(colorful.cyan(f"🎤 Transcribing({self.transcriber.name}) media: {str(media_file)} ..."))
  transcription = await self.transcriber.transcribe_file(media_file, language=source_lang)
  safe_print(colorful.green(" ✓ Transcription completed."))
@@ -442,8 +450,6 @@ class LattifAIClientMixin:
  safe_print(colorful.yellow(f"First segment: {transcription.transcription[0].text}"))

  if output_dir:
- # Generate transcript file path
- transcript_file = output_dir / f"{Path(str(media_file)).stem}_{self.transcriber.file_name}"
  await asyncio.to_thread(self.transcriber.write, transcription, transcript_file, encoding="utf-8")
  safe_print(colorful.green(f" ✓ Transcription saved to: {transcript_file}"))