lattifai 1.1.0__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -15,7 +15,8 @@ class GeminiSegment:
  """Represents a segment in the Gemini transcript with metadata."""

  text: str
- timestamp: Optional[float] = None
+ timestamp: Optional[float] = None # For backward compatibility (start time)
+ end_timestamp: Optional[float] = None # End time when timestamp is at the end
  speaker: Optional[str] = None
  section: Optional[str] = None
  segment_type: str = "dialogue" # 'dialogue', 'event', or 'section_header'
@@ -26,6 +27,11 @@ class GeminiSegment:
  """Return start time in seconds."""
  return self.timestamp if self.timestamp is not None else 0.0

+ @property
+ def end(self) -> Optional[float]:
+ """Return end time in seconds if available."""
+ return self.end_timestamp
+

  class GeminiReader:
  """Parser for YouTube transcript format with speaker labels and timestamps."""
@@ -34,8 +40,12 @@ class GeminiReader:
  TIMESTAMP_PATTERN = re.compile(r"\[(\d{1,2}):(\d{2}):(\d{2})\]|\[(\d{1,2}):(\d{2})\]")
  SECTION_HEADER_PATTERN = re.compile(r"^##\s*\[(\d{1,2}):(\d{2}):(\d{2})\]\s*(.+)$")
  SPEAKER_PATTERN = re.compile(r"^\*\*(.+?[::])\*\*\s*(.+)$")
- EVENT_PATTERN = re.compile(r"^\[([^\]]+)\]\s*\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]$")
- INLINE_TIMESTAMP_PATTERN = re.compile(r"^(.+?)\s*\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]$")
+ # Event pattern: [Event] [HH:MM:SS] or [Event] [MM:SS] - prioritize HH:MM:SS format
+ EVENT_PATTERN = re.compile(r"^\[([^\]]+)\]\s*\[(\d{1,2}):(\d{2})(?::(\d{2}))?\]$")
+ # Timestamp at the end indicates end time
+ INLINE_TIMESTAMP_END_PATTERN = re.compile(r"^(.+?)\s*\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]$")
+ # Timestamp at the beginning indicates start time
+ INLINE_TIMESTAMP_START_PATTERN = re.compile(r"^\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]\s*(.+)$")

  # New patterns for YouTube link format: [[MM:SS](URL&t=seconds)]
  YOUTUBE_SECTION_PATTERN = re.compile(r"^##\s*\[\[(\d{1,2}):(\d{2})\]\([^)]*&t=(\d+)\)\]\s*(.+)$")
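Splitting the old INLINE_TIMESTAMP_PATTERN into `_START_` and `_END_` variants is what lets the reader treat a leading `[MM:SS]` as a start time and a trailing one as an end time, and the reworked EVENT_PATTERN makes the seconds group optional. A quick standalone check of the new patterns (regexes copied verbatim from the hunk above; the sample lines are made up):

```python
import re

EVENT_PATTERN = re.compile(r"^\[([^\]]+)\]\s*\[(\d{1,2}):(\d{2})(?::(\d{2}))?\]$")
INLINE_TIMESTAMP_END_PATTERN = re.compile(r"^(.+?)\s*\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]$")
INLINE_TIMESTAMP_START_PATTERN = re.compile(r"^\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]\s*(.+)$")

# Event line: the trailing seconds group is None for MM:SS and set for HH:MM:SS
assert EVENT_PATTERN.match("[Applause] [02:15]").groups() == ("Applause", "02", "15", None)
assert EVENT_PATTERN.match("[Music] [01:02:03]").groups() == ("Music", "01", "02", "03")

# Leading timestamp -> start time; the text is the last group
m = INLINE_TIMESTAMP_START_PATTERN.match("[01:23] Hello there")
assert m.groups() == (None, None, None, "01", "23", "Hello there")

# Trailing timestamp -> end time; the text is the first group
m = INLINE_TIMESTAMP_END_PATTERN.match("Hello there [01:23]")
assert m.groups() == ("Hello there", None, None, None, "01", "23")
```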
@@ -144,18 +154,22 @@ class GeminiReader:
  if event_match:
  groups = event_match.groups()
  event_text = groups[0]
- # Parse timestamp - can be HH:MM:SS (groups 1,2,3) or MM:SS (groups 4,5)
- if groups[1] is not None: # HH:MM:SS format
- timestamp = cls.parse_timestamp(groups[1], groups[2], groups[3])
- elif groups[4] is not None: # MM:SS format
- timestamp = cls.parse_timestamp(groups[4], groups[5])
+ # Parse timestamp - groups: (event_text, hours/minutes, minutes/seconds, seconds_optional)
+ hours_or_minutes = groups[1]
+ minutes_or_seconds = groups[2]
+ seconds_optional = groups[3]
+
+ if seconds_optional is not None:
+ # HH:MM:SS format
+ timestamp = cls.parse_timestamp(hours_or_minutes, minutes_or_seconds, seconds_optional)
  else:
- timestamp = None
+ # MM:SS format
+ timestamp = cls.parse_timestamp(hours_or_minutes, minutes_or_seconds)

  if include_events and timestamp is not None:
  segments.append(
  GeminiSegment(
- text=event_text.strip(),
+ text=f"[{event_text.strip()}]",
  timestamp=timestamp,
  section=current_section,
  segment_type="event",
@@ -170,34 +184,44 @@ class GeminiReader:
  speaker, text_with_timestamp = speaker_match.groups()
  current_speaker = speaker.strip()

- # Extract timestamp from the end of the text
- timestamp_match = cls.INLINE_TIMESTAMP_PATTERN.match(text_with_timestamp.strip())
+ # Check for timestamp at the beginning (start time)
+ start_match = cls.INLINE_TIMESTAMP_START_PATTERN.match(text_with_timestamp.strip())
+ # Check for timestamp at the end (end time)
+ end_match = cls.INLINE_TIMESTAMP_END_PATTERN.match(text_with_timestamp.strip())
  youtube_match = cls.YOUTUBE_INLINE_PATTERN.match(text_with_timestamp.strip())

- if timestamp_match:
- groups = timestamp_match.groups()
- text = groups[0]
+ start_timestamp = None
+ end_timestamp = None
+ text = text_with_timestamp.strip()
+
+ if start_match:
+ groups = start_match.groups()
+ # Parse timestamp - can be HH:MM:SS (groups 0,1,2) or MM:SS (groups 3,4)
+ if groups[0] is not None: # HH:MM:SS format
+ start_timestamp = cls.parse_timestamp(groups[0], groups[1], groups[2])
+ elif groups[3] is not None: # MM:SS format
+ start_timestamp = cls.parse_timestamp(groups[3], groups[4])
+ text = groups[5] # Text is after timestamp
+ elif end_match:
+ groups = end_match.groups()
+ text = groups[0] # Text is before timestamp
  # Parse timestamp - can be HH:MM:SS (groups 1,2,3) or MM:SS (groups 4,5)
  if groups[1] is not None: # HH:MM:SS format
- timestamp = cls.parse_timestamp(groups[1], groups[2], groups[3])
+ end_timestamp = cls.parse_timestamp(groups[1], groups[2], groups[3])
  elif groups[4] is not None: # MM:SS format
- timestamp = cls.parse_timestamp(groups[4], groups[5])
- else:
- timestamp = None
+ end_timestamp = cls.parse_timestamp(groups[4], groups[5])
  elif youtube_match:
  groups = youtube_match.groups()
  text = groups[0]
- # Extract seconds from URL parameter
+ # Extract seconds from URL parameter (treat as end time)
  url_seconds = groups[3]
- timestamp = cls.parse_timestamp(url_seconds)
- else:
- text = text_with_timestamp.strip()
- timestamp = None
+ end_timestamp = cls.parse_timestamp(url_seconds)

  segments.append(
  GeminiSegment(
  text=text.strip(),
- timestamp=timestamp,
+ timestamp=start_timestamp,
+ end_timestamp=end_timestamp,
  speaker=current_speaker,
  section=current_section,
  segment_type="dialogue",
@@ -207,25 +231,50 @@ class GeminiReader:
  current_speaker = None # Reset speaker after use
  continue

- # Parse plain text with timestamp at the end
- inline_match = cls.INLINE_TIMESTAMP_PATTERN.match(line)
+ # Parse plain text with timestamp (check both positions)
+ start_match = cls.INLINE_TIMESTAMP_START_PATTERN.match(line)
+ end_match = cls.INLINE_TIMESTAMP_END_PATTERN.match(line)
  youtube_inline_match = cls.YOUTUBE_INLINE_PATTERN.match(line)

- if inline_match:
- groups = inline_match.groups()
- text = groups[0]
+ start_timestamp = None
+ end_timestamp = None
+ text = None
+
+ if start_match:
+ groups = start_match.groups()
+ # Parse timestamp - can be HH:MM:SS (groups 0,1,2) or MM:SS (groups 3,4)
+ if groups[0] is not None: # HH:MM:SS format
+ start_timestamp = cls.parse_timestamp(groups[0], groups[1], groups[2])
+ elif groups[3] is not None: # MM:SS format
+ start_timestamp = cls.parse_timestamp(groups[3], groups[4])
+ text = groups[5] # Text is after timestamp
+
+ segments.append(
+ GeminiSegment(
+ text=text.strip(),
+ timestamp=start_timestamp,
+ end_timestamp=None,
+ speaker=current_speaker,
+ section=current_section,
+ segment_type="dialogue",
+ line_number=line_num,
+ )
+ )
+ continue
+ elif end_match:
+ groups = end_match.groups()
+ text = groups[0] # Text is before timestamp
  # Parse timestamp - can be HH:MM:SS (groups 1,2,3) or MM:SS (groups 4,5)
  if groups[1] is not None: # HH:MM:SS format
- timestamp = cls.parse_timestamp(groups[1], groups[2], groups[3])
+ end_timestamp = cls.parse_timestamp(groups[1], groups[2], groups[3])
  elif groups[4] is not None: # MM:SS format
- timestamp = cls.parse_timestamp(groups[4], groups[5])
- else:
- timestamp = None
+ end_timestamp = cls.parse_timestamp(groups[4], groups[5])

  segments.append(
  GeminiSegment(
  text=text.strip(),
- timestamp=timestamp,
+ timestamp=None,
+ end_timestamp=end_timestamp,
  speaker=current_speaker,
  section=current_section,
  segment_type="dialogue",
@@ -236,14 +285,15 @@ class GeminiReader:
  elif youtube_inline_match:
  groups = youtube_inline_match.groups()
  text = groups[0]
- # Extract seconds from URL parameter
+ # Extract seconds from URL parameter (treat as end time)
  url_seconds = groups[3]
- timestamp = cls.parse_timestamp(url_seconds)
+ end_timestamp = cls.parse_timestamp(url_seconds)

  segments.append(
  GeminiSegment(
  text=text.strip(),
- timestamp=timestamp,
+ timestamp=None,
+ end_timestamp=end_timestamp,
  speaker=current_speaker,
  section=current_section,
  segment_type="dialogue",
@@ -280,38 +330,79 @@ class GeminiReader:
  Returns:
  List of Supervision objects ready for alignment
  """
- segments = cls.read(transcript_path, include_events=False, include_sections=False)
+ segments = cls.read(transcript_path, include_events=True, include_sections=False)

- # Filter to only dialogue segments with timestamps
- dialogue_segments = [s for s in segments if s.segment_type == "dialogue" and s.timestamp is not None]
+ # Filter to dialogue and event segments with timestamps (either start or end)
+ dialogue_segments = [
+ s
+ for s in segments
+ if s.segment_type in ("dialogue", "event") and (s.timestamp is not None or s.end_timestamp is not None)
+ ]

  if not dialogue_segments:
  raise ValueError(f"No dialogue segments with timestamps found in {transcript_path}")

- # Sort by timestamp
- dialogue_segments.sort(key=lambda x: x.timestamp)
+ # Sort by timestamp (use start time if available, otherwise end time)
+ dialogue_segments.sort(key=lambda x: x.timestamp if x.timestamp is not None else x.end_timestamp)

  # Convert to Supervision objects
  supervisions: List[Supervision] = []
+ prev_end_time = 0.0

  for i, segment in enumerate(dialogue_segments):
- # Estimate duration based on next segment
- if i < len(dialogue_segments) - 1:
- duration = dialogue_segments[i + 1].timestamp - segment.timestamp
- else:
- # Last segment: estimate based on text length (rough heuristic)
- words = len(segment.text.split())
- duration = words * 0.3 # ~0.3 seconds per word
-
- supervisions.append(
- Supervision(
- text=segment.text,
- start=segment.timestamp,
- duration=max(duration, min_duration),
- id=f"segment_{i:05d}",
- speaker=segment.speaker,
- )
- )
+ seg_start = None
+ seg_end = None
+
+ # Determine start and end times based on available timestamps
+ if segment.timestamp is not None:
+ # Has start time
+ seg_start = segment.timestamp
+ if segment.end_timestamp is not None:
+ # Has both start and end
+ seg_end = segment.end_timestamp
+ else:
+ # Only has start, estimate end
+ if i < len(dialogue_segments) - 1:
+ # Use next segment's time
+ next_seg = dialogue_segments[i + 1]
+ if next_seg.timestamp is not None:
+ seg_end = next_seg.timestamp
+ elif next_seg.end_timestamp is not None:
+ # Next has only end, estimate its start and use that
+ words_next = len(next_seg.text.split())
+ estimated_duration_next = words_next * 0.3
+ seg_end = next_seg.end_timestamp - estimated_duration_next
+
+ if seg_end is None:
+ # Estimate based on text length
+ words = len(segment.text.split())
+ seg_end = seg_start + words * 0.3
+
+ elif segment.end_timestamp is not None:
+ # Only has end time, need to infer start
+ seg_end = segment.end_timestamp
+ # Use previous segment's end time as start, or estimate based on text
+ if prev_end_time > 0:
+ seg_start = prev_end_time
+ else:
+ # Estimate start based on text length
+ words = len(segment.text.split())
+ estimated_duration = words * 0.3
+ seg_start = seg_end - estimated_duration
+
+ if seg_start is not None and seg_end is not None:
+ duration = max(seg_end - seg_start, min_duration)
+ if segment.segment_type == "dialogue":
+ supervisions.append(
+ Supervision(
+ text=segment.text,
+ start=seg_start,
+ duration=duration,
+ id=f"segment_{i:05d}",
+ speaker=segment.speaker,
+ )
+ )
+ prev_end_time = seg_start + duration

  # Optionally merge consecutive segments from same speaker
  if merge_consecutive:
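The rewritten loop fills in whichever endpoint a segment is missing: a start-only segment borrows the next segment's start (or an estimate of it) as its end, and an end-only segment starts at the previous segment's computed end, falling back to a ~0.3 seconds-per-word estimate. A small worked example of that arithmetic (plain dicts stand in for GeminiSegment; the numbers are made up):

```python
# Segment 1 has only a start time; segment 2 has only an end time (6 words -> ~1.8 s).
seg1 = {"text": "one two three four five six seven eight nine ten", "start": 10.0, "end": None}
seg2 = {"text": "alpha beta gamma delta epsilon zeta", "start": None, "end": 20.0}

# Segment 1's end: the next segment has no start, so estimate it as end - 0.3 * words.
seg1_end = seg2["end"] - 0.3 * len(seg2["text"].split())   # 20.0 - 1.8 = 18.2
duration1 = seg1_end - seg1["start"]                        # 8.2 s
prev_end_time = seg1["start"] + duration1                   # 18.2

# Segment 2's start: the previous segment's end, since prev_end_time > 0.
seg2_start = prev_end_time
duration2 = seg2["end"] - seg2_start                        # 1.8 s

assert abs(seg1_end - 18.2) < 1e-6 and abs(duration2 - 1.8) < 1e-6
```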
@@ -8,7 +8,7 @@ import nemo_run as run
  from typing_extensions import Annotated

  from lattifai.client import LattifAI
- from lattifai.config import CaptionConfig, ClientConfig, DiarizationConfig, MediaConfig
+ from lattifai.config import AlignmentConfig, CaptionConfig, ClientConfig, DiarizationConfig, MediaConfig
  from lattifai.utils import safe_print

  __all__ = ["diarize"]
@@ -22,6 +22,7 @@ def diarize(
  media: Annotated[Optional[MediaConfig], run.Config[MediaConfig]] = None,
  caption: Annotated[Optional[CaptionConfig], run.Config[CaptionConfig]] = None,
  client: Annotated[Optional[ClientConfig], run.Config[ClientConfig]] = None,
+ alignment: Annotated[Optional[AlignmentConfig], run.Config[AlignmentConfig]] = None,
  diarization: Annotated[Optional[DiarizationConfig], run.Config[DiarizationConfig]] = None,
  ):
  """Run speaker diarization on aligned captions and audio."""
@@ -53,6 +54,7 @@ def diarize(

  client_instance = LattifAI(
  client_config=client,
+ alignment_config=alignment,
  caption_config=caption_config,
  diarization_config=diarization_config,
  )
@@ -108,12 +108,7 @@ def transcribe(
  is_url = media_config.is_input_remote()

  # Prepare output paths
- if is_url:
- # For URLs, use output_dir from media_config or current directory
- output_path = media_config.output_dir
- else:
- # For files, use input path directory
- output_path = Path(media_config.input_path).parent
+ output_dir = media_config.output_dir or Path(media_config.input_path).parent

  # Create transcriber
  if not transcription_config.lattice_model_path:
@@ -140,7 +135,7 @@ def transcribe(
  input_path = asyncio.run(
  downloader.download_media(
  url=media_config.input_path,
- output_dir=str(output_path),
+ output_dir=str(output_dir),
  media_format=media_config.normalize_format(),
  force_overwrite=media_config.force_overwrite,
  )
@@ -167,7 +162,7 @@ def transcribe(
  if is_url:
  # For URLs, generate output filename based on transcriber
  output_format = transcriber.file_suffix.lstrip(".")
- final_output = output_path / f"youtube_LattifAI_{transcriber.name}.{output_format}"
+ final_output = output_dir / f"youtube_LattifAI_{transcriber.name}.{output_format}"
  else:
  # For files, use input filename with suffix
  final_output = Path(media_config.input_path).with_suffix(".LattifAI.srt")
lattifai/cli/youtube.py CHANGED
@@ -25,6 +25,7 @@ def youtube(
  caption: Annotated[Optional[CaptionConfig], run.Config[CaptionConfig]] = None,
  transcription: Annotated[Optional[TranscriptionConfig], run.Config[TranscriptionConfig]] = None,
  diarization: Annotated[Optional[DiarizationConfig], run.Config[DiarizationConfig]] = None,
+ use_transcription: bool = False,
  ):
  """
  Download media from YouTube (when needed) and align captions.
@@ -55,6 +56,11 @@ def youtube(
  Fields: gemini_api_key, model_name, language, device
  diarization: Speaker diarization configuration.
  Fields: enabled, num_speakers, min_speakers, max_speakers, device
+ use_transcription: If True, skip YouTube caption download and directly use
+ transcription.model_name to transcribe. If False (default), first try to
+ download YouTube captions; if the download fails (no captions available or
+ errors like HTTP 429), automatically fall back to transcription if
+ transcription.model_name is configured.

  Examples:
  # Download from YouTube and align (positional argument)
@@ -108,7 +114,11 @@ def youtube(
  transcription_config=transcription,
  diarization_config=diarization,
  )
+
  # Call the client's youtube method
+ # If use_transcription=True, skip YouTube caption download and use transcription directly.
+ # If use_transcription=False (default), try YouTube captions first; on failure,
+ # automatically fall back to transcription if transcription.model_name is configured.
  return lattifai_client.youtube(
  url=media_config.input_path,
  output_dir=media_config.output_dir,
@@ -118,6 +128,7 @@ def youtube(
  split_sentence=caption_config.split_sentence,
  channel_selector=media_config.channel_selector,
  streaming_chunk_secs=media_config.streaming_chunk_secs,
+ use_transcription=use_transcription,
  )

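For context, a hypothetical call showing the new flag end to end. The config constructors and the model name are assumptions made for illustration; only the field names (`input_path`, `model_name`) and the `use_transcription` parameter come from this diff:

```python
from lattifai.cli.youtube import youtube
from lattifai.config import MediaConfig, TranscriptionConfig

# use_transcription=True: skip the YouTube caption download and transcribe directly.
# With the default (False), captions are tried first and transcription is only a fallback.
youtube(
    media=MediaConfig(input_path="https://www.youtube.com/watch?v=..."),
    transcription=TranscriptionConfig(model_name="<your-transcription-model>"),
    use_transcription=True,
)
```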
lattifai/client.py CHANGED
@@ -56,6 +56,7 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):

  # Initialize base API client
  super().__init__(config=client_config)
+ self.config = client_config

  # Initialize all configs with defaults
  alignment_config, transcription_config, diarization_config = self._init_configs(
@@ -125,38 +126,20 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
  safe_print(colorful.cyan(f"🔄 Using segmented alignment strategy: {alignment_strategy}"))

  if caption.supervisions and alignment_strategy == "transcription":
- # raise NotImplementedError("Transcription-based alignment is not yet implemented.")
- assert (
- "gemini" not in self.transcriber.name.lower()
- ), "Transcription-based alignment is not supported with Gemini transcriber."
- assert (
- caption.supervisions
- ), "Input caption should contain supervisions when using transcription-based alignment."
  if not caption.transcription:
- import asyncio
-
- safe_print(colorful.cyan("📝 Transcribing media for alignment..."))
- if output_caption_path:
- transcript_file = (
- Path(str(output_caption_path)).parent
- / f"{Path(str(media_audio)).stem}_{self.transcriber.file_name}"
- )
- if transcript_file.exists():
- # print(colorful.cyan(f"Reading existing transcription from {transcript_file}"))
- transcript = self._read_caption(transcript_file, verbose=False)
- caption.transcription = transcript.supervisions
- caption.audio_events = transcript.audio_events
-
- if not caption.transcription:
- transcript = asyncio.run(
- self.transcriber.transcribe(media_audio, language=self.caption_config.source_lang)
- )
- caption.transcription = transcript.transcription
- caption.audio_events = transcript.audio_events
+ transcript = self._transcribe(
+ media_audio,
+ source_lang=self.caption_config.source_lang,
+ is_async=False,
+ output_dir=Path(str(output_caption_path)).parent if output_caption_path else None,
+ )
+ caption.transcription = transcript.supervisions or transcript.transcription
+ caption.audio_events = transcript.audio_events
+ assert caption.transcription, "Transcription is empty after transcription step."

  # Align caption.supervisions with transcription to get segments
  import regex
- from error_align import ErrorAlign, error_align # noqa: F401
+ from error_align import error_align # noqa: F401
  from error_align.utils import DELIMITERS, NUMERIC_TOKEN, STANDARD_TOKEN, OpType

  JOIN_TOKEN = "❄"
@@ -183,21 +166,82 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
  )
  )

- alignments = error_align(
- f"{JOIN_TOKEN}".join(sup.text for sup in caption.supervisions),
- f"{JOIN_TOKEN}".join(sup.text for sup in caption.transcription),
- tokenizer=custom_tokenizer,
- )
+ if split_sentence or self.caption_config.split_sentence:
+ caption.supervisions = self.aligner.tokenizer.split_sentences(caption.supervisions)

- for align in alignments:
- if align.hyp == JOIN_TOKEN and align.op_type == OpType.MATCH:
- pass
+ ref = f"{JOIN_TOKEN}".join(sup.text for sup in caption.supervisions)
+ hyp = f"{JOIN_TOKEN}".join(sup.text for sup in caption.transcription)
+ alignments = error_align(ref, hyp, tokenizer=custom_tokenizer)

- # if align.op_type == OpType.MATCH:
- # continue
- # elif align.op_type in (OpType.INSERT, OpType.DELETE, OpType.SUBSTITUTE):
- # # print(colorful.yellow(f"⚠️ Alignment warning: {op}"))
- # pass
+ idx = 0
+ for k, align in enumerate(alignments):
+ if align.hyp == JOIN_TOKEN and align.op_type == OpType.MATCH:
+ # safe_print(f"Segment {k}: JOIN_TOKEN detected, creating segment.")
+
+ # Find first non-None ref_slice starting from idx
+ ref_start = 0
+ for i in range(idx, k + 1):
+ if i < len(alignments) and alignments[i].ref_slice is not None:
+ ref_start = alignments[i].ref_slice.start
+ break
+
+ # Find last non-None ref_slice up to current position
+ ref_stop = len(ref)
+ for i in range(k, idx - 1, -1):
+ if i < len(alignments) and alignments[i].ref_slice is not None:
+ ref_stop = alignments[i].ref_slice.stop
+ break
+
+ # Find first non-None hyp_slice starting from idx
+ hyp_start = 0
+ for i in range(idx, k + 1):
+ if i < len(alignments) and alignments[i].hyp_slice is not None:
+ hyp_start = alignments[i].hyp_slice.start
+ break
+
+ # Find last non-None hyp_slice up to current position
+ hyp_stop = len(hyp)
+ for i in range(k, idx - 1, -1):
+ if i < len(alignments) and alignments[i].hyp_slice is not None:
+ hyp_stop = alignments[i].hyp_slice.stop
+ break
+
+ safe_print(f"[REF]: {ref[ref_start:ref_stop]}")
+ safe_print(f"[HYP]: {hyp[hyp_start:hyp_stop]}\n")
+ idx = k + 1
+
+ # last part - handle remaining alignments after last JOIN_TOKEN
+ if idx < len(alignments):
+ # Find first non-None ref_slice starting from idx
+ ref_start = 0
+ for i in range(idx, len(alignments)):
+ if alignments[i].ref_slice is not None:
+ ref_start = alignments[i].ref_slice.start
+ break
+
+ # Find last non-None ref_slice from end
+ ref_stop = len(ref)
+ for i in range(len(alignments) - 1, idx - 1, -1):
+ if alignments[i].ref_slice is not None:
+ ref_stop = alignments[i].ref_slice.stop
+ break
+
+ # Find first non-None hyp_slice starting from idx
+ hyp_start = 0
+ for i in range(idx, len(alignments)):
+ if alignments[i].hyp_slice is not None:
+ hyp_start = alignments[i].hyp_slice.start
+ break
+
+ # Find last non-None hyp_slice from end
+ hyp_stop = len(hyp)
+ for i in range(len(alignments) - 1, idx - 1, -1):
+ if alignments[i].hyp_slice is not None:
+ hyp_stop = alignments[i].hyp_slice.stop
+ break
+
+ safe_print(f"[REF]: {ref[ref_start:ref_stop + 1]}")
+ safe_print(f"[HYP]: {hyp[hyp_start:hyp_stop + 1]}\n")

  raise NotImplementedError("Transcription-based segmentation is not yet implemented.")
  else:
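The loop above joins caption and transcription texts with the ❄ sentinel, runs error_align on the joined strings, and then uses exact matches on the sentinel to cut the reference/hypothesis back into per-supervision spans (for now it only prints them before raising NotImplementedError). A self-contained sketch of that slicing idea with mocked alignment entries; the real error_align objects also carry op_type and may have None slices, which the loop above scans past:

```python
from dataclasses import dataclass
from typing import Optional

JOIN_TOKEN = "❄"

@dataclass
class Align:                      # mock of one alignment entry, not the real error_align type
    hyp: str
    ref_slice: Optional[slice]
    hyp_slice: Optional[slice]
    is_match: bool                # stands in for op_type == OpType.MATCH

ref = f"{JOIN_TOKEN}".join(["hello world", "good bye"])
hyp = f"{JOIN_TOKEN}".join(["hello word", "goodbye"])

# Pretend token-level alignment: one entry per word plus one per sentinel.
alignments = [
    Align("hello", slice(0, 5), slice(0, 5), True),
    Align("word", slice(6, 11), slice(6, 10), False),
    Align(JOIN_TOKEN, slice(11, 12), slice(10, 11), True),   # segment boundary
    Align("goodbye", slice(12, 20), slice(11, 18), False),
]

idx, segments = 0, []
for k, align in enumerate(alignments):
    if align.hyp == JOIN_TOKEN and align.is_match:
        # Cut both strings from the first entry after the previous boundary
        # to the last entry before this sentinel.
        ref_span = slice(alignments[idx].ref_slice.start, alignments[k - 1].ref_slice.stop)
        hyp_span = slice(alignments[idx].hyp_slice.start, alignments[k - 1].hyp_slice.stop)
        segments.append((ref[ref_span], hyp[hyp_span]))
        idx = k + 1
if idx < len(alignments):  # tail after the last sentinel
    segments.append((ref[alignments[idx].ref_slice.start:], hyp[alignments[idx].hyp_slice.start:]))

assert segments == [("hello world", "hello word"), ("good bye", "goodbye")]
```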
@@ -219,6 +263,7 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
  )

  # align each segment
+ sr = media_audio.sampling_rate
  supervisions, alignments = [], []
  for i, (start, end, _supervisions, skipalign) in enumerate(segments, 1):
  print(
@@ -233,10 +278,8 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):

  offset = round(start, 4)
  # Extract audio slice
- audio_slice_ndarray = media_audio.ndarray[
- :, int(start * media_audio.sampling_rate) : int(end * media_audio.sampling_rate)
- ]
- emission = self.aligner.emission(audio_slice_ndarray)
+ audio_slice = media_audio.ndarray[:, int(start * sr) : int(end * sr)]
+ emission = self.aligner.emission(audio_slice)

  # Align segment
  _supervisions, _alignments = self.aligner.alignment(
@@ -269,6 +312,10 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
  if output_caption_path:
  self._write_caption(caption, output_caption_path)

+ # Profile if enabled
+ if self.config.profile:
+ self.aligner.profile()
+

  except (CaptionProcessingError, LatticeEncodingError, AlignmentError, LatticeDecodingError):
  # Re-raise our specific errors as-is
@@ -363,7 +410,9 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
  # Step 1: Download media
  media_file = self._download_media_sync(url, output_dir, media_format, force_overwrite)

- media_audio = self.audio_loader(media_file, channel_selector=channel_selector)
+ media_audio = self.audio_loader(
+ media_file, channel_selector=channel_selector, streaming_chunk_secs=streaming_chunk_secs
+ )

  # Step 2: Get or create captions (download or transcribe)
  caption = self._download_or_transcribe_caption(
@@ -388,7 +437,7 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
  output_caption_path=output_caption_path,
  split_sentence=split_sentence,
  channel_selector=channel_selector,
- streaming_chunk_secs=streaming_chunk_secs,
+ streaming_chunk_secs=None,
  )

  return caption
@@ -28,11 +28,11 @@ class AlignmentConfig:
  """Computation device: 'cpu' for CPU, 'cuda' for NVIDIA GPU, 'mps' for Apple Silicon."""

  batch_size: int = 1
- """Batch size for inference (number of samples processed simultaneously)."""
+ """Batch size for inference (number of samples processed simultaneously; batching is not implemented yet)."""

  # Segmented Alignment for Long Audio
  trust_caption_timestamps: bool = False
- """When True, use original caption timestamps as strong reference constraints during alignment.
+ """When True, use the original caption.supervisions timestamps as strong reference constraints during alignment.
  The alignment process will still adjust timestamps but stay close to the input timing.
  Use this when you want to re-segment caption sentence boundaries (caption.split_sentence=True)
  while preserving the approximate timing from the original captions.
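The docstring tweak above clarifies that trust_caption_timestamps constrains alignment to the timestamps already present on caption.supervisions. A hypothetical configuration sketch (constructor signatures assumed; the field names are the ones shown in this diff):

```python
from lattifai.config import AlignmentConfig, CaptionConfig

# Re-segment sentence boundaries while staying close to the source captions' timing.
alignment = AlignmentConfig(trust_caption_timestamps=True)
caption = CaptionConfig(split_sentence=True)
```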
lattifai/config/client.py CHANGED
@@ -26,6 +26,11 @@ class ClientConfig:
  default_headers: Optional[Dict[str, str]] = field(default=None)
  """Optional static headers to include in all requests."""

+ profile: bool = False
+ """Enable profiling of client operations.
+ When True, prints detailed timing information for the various stages of the process.
+ """
+

  def __post_init__(self):
  """Validate and auto-populate configuration after initialization."""