lattifai 1.0.4__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. lattifai/__init__.py +10 -0
  2. lattifai/alignment/lattice1_aligner.py +64 -15
  3. lattifai/alignment/lattice1_worker.py +135 -50
  4. lattifai/alignment/segmenter.py +3 -2
  5. lattifai/alignment/tokenizer.py +14 -13
  6. lattifai/audio2.py +269 -70
  7. lattifai/caption/caption.py +213 -19
  8. lattifai/cli/__init__.py +2 -0
  9. lattifai/cli/alignment.py +2 -1
  10. lattifai/cli/app_installer.py +35 -33
  11. lattifai/cli/caption.py +9 -19
  12. lattifai/cli/diarization.py +108 -0
  13. lattifai/cli/server.py +3 -1
  14. lattifai/cli/transcribe.py +55 -38
  15. lattifai/cli/youtube.py +1 -0
  16. lattifai/client.py +42 -121
  17. lattifai/config/alignment.py +37 -2
  18. lattifai/config/caption.py +1 -1
  19. lattifai/config/media.py +23 -3
  20. lattifai/config/transcription.py +4 -0
  21. lattifai/diarization/lattifai.py +18 -7
  22. lattifai/errors.py +7 -3
  23. lattifai/mixin.py +45 -16
  24. lattifai/server/app.py +2 -1
  25. lattifai/transcription/__init__.py +1 -1
  26. lattifai/transcription/base.py +21 -2
  27. lattifai/transcription/gemini.py +127 -1
  28. lattifai/transcription/lattifai.py +30 -2
  29. lattifai/utils.py +96 -28
  30. lattifai/workflow/file_manager.py +15 -13
  31. lattifai/workflow/youtube.py +16 -1
  32. {lattifai-1.0.4.dist-info → lattifai-1.1.0.dist-info}/METADATA +86 -22
  33. lattifai-1.1.0.dist-info/RECORD +57 -0
  34. {lattifai-1.0.4.dist-info → lattifai-1.1.0.dist-info}/entry_points.txt +2 -0
  35. {lattifai-1.0.4.dist-info → lattifai-1.1.0.dist-info}/licenses/LICENSE +1 -1
  36. lattifai-1.0.4.dist-info/RECORD +0 -56
  37. {lattifai-1.0.4.dist-info → lattifai-1.1.0.dist-info}/WHEEL +0 -0
  38. {lattifai-1.0.4.dist-info → lattifai-1.1.0.dist-info}/top_level.txt +0 -0
lattifai/cli/diarization.py ADDED
@@ -0,0 +1,108 @@
+ """Speaker diarization CLI entry point with nemo_run."""
+
+ from pathlib import Path
+ from typing import Optional
+
+ import colorful
+ import nemo_run as run
+ from typing_extensions import Annotated
+
+ from lattifai.client import LattifAI
+ from lattifai.config import CaptionConfig, ClientConfig, DiarizationConfig, MediaConfig
+ from lattifai.utils import safe_print
+
+ __all__ = ["diarize"]
+
+
+ @run.cli.entrypoint(name="run", namespace="diarization")
+ def diarize(
+     input_media: Optional[str] = None,
+     input_caption: Optional[str] = None,
+     output_caption: Optional[str] = None,
+     media: Annotated[Optional[MediaConfig], run.Config[MediaConfig]] = None,
+     caption: Annotated[Optional[CaptionConfig], run.Config[CaptionConfig]] = None,
+     client: Annotated[Optional[ClientConfig], run.Config[ClientConfig]] = None,
+     diarization: Annotated[Optional[DiarizationConfig], run.Config[DiarizationConfig]] = None,
+ ):
+     """Run speaker diarization on aligned captions and audio."""
+
+     media_config = media or MediaConfig()
+     caption_config = caption or CaptionConfig()
+     diarization_config = diarization or DiarizationConfig()
+
+     if input_media and media_config.input_path:
+         raise ValueError("Cannot specify both positional input_media and media.input_path.")
+     if input_media:
+         media_config.set_input_path(input_media)
+     if not media_config.input_path:
+         raise ValueError("Input media path must be provided via positional input_media or media.input_path.")
+
+     if input_caption and caption_config.input_path:
+         raise ValueError("Cannot specify both positional input_caption and caption.input_path.")
+     if input_caption:
+         caption_config.set_input_path(input_caption)
+     if not caption_config.input_path:
+         raise ValueError("Input caption path must be provided via positional input_caption or caption.input_path.")
+
+     if output_caption and caption_config.output_path:
+         raise ValueError("Cannot specify both positional output_caption and caption.output_path.")
+     if output_caption:
+         caption_config.set_output_path(output_caption)
+
+     diarization_config.enabled = True
+
+     client_instance = LattifAI(
+         client_config=client,
+         caption_config=caption_config,
+         diarization_config=diarization_config,
+     )
+
+     safe_print(colorful.cyan("🎧 Loading media for diarization..."))
+     media_audio = client_instance.audio_loader(
+         media_config.input_path,
+         channel_selector=media_config.channel_selector,
+         streaming_chunk_secs=media_config.streaming_chunk_secs,
+     )
+
+     safe_print(colorful.cyan("📖 Loading caption segments..."))
+     caption_obj = client_instance._read_caption(
+         caption_config.input_path,
+         input_caption_format=None if caption_config.input_format == "auto" else caption_config.input_format,
+         verbose=False,
+     )
+
+     if not caption_obj.alignments:
+         caption_obj.alignments = caption_obj.supervisions
+
+     if not caption_obj.alignments:
+         raise ValueError("Caption does not contain segments for diarization.")
+
+     if caption_config.output_path:
+         output_path = caption_config.output_path
+     else:
+         from datetime import datetime
+
+         input_caption_path = Path(caption_config.input_path)
+         timestamp = datetime.now().strftime("%Y%m%d_%H")
+         default_output = (
+             input_caption_path.parent / f"{input_caption_path.stem}.diarized.{timestamp}.{caption_config.output_format}"
+         )
+         caption_config.set_output_path(default_output)
+         output_path = caption_config.output_path
+
+     safe_print(colorful.cyan("🗣️ Performing speaker diarization..."))
+     diarized_caption = client_instance.speaker_diarization(
+         input_media=media_audio,
+         caption=caption_obj,
+         output_caption_path=output_path,
+     )
+
+     return diarized_caption
+
+
+ def main():
+     run.cli.main(diarize)
+
+
+ if __name__ == "__main__":
+     main()
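Note: the new module is registered as a nemo_run entrypoint (namespace "diarization", name "run"), so it is presumably reachable from the lai console script in the same way as lai transcribe run; the exact command name depends on the two new entry_points.txt lines, which this diff does not show. At the library level, the same flow the CLI wires together can be sketched directly against the client. This is a minimal, hedged sketch based only on the calls visible above; "talk.wav", "talk.srt", and "talk.diarized.srt" are placeholder paths, and _read_caption is a private helper.

    from lattifai.client import LattifAI
    from lattifai.config import CaptionConfig, DiarizationConfig

    # Mirror what diarize() does above: build a client with diarization enabled.
    client = LattifAI(
        caption_config=CaptionConfig(),
        diarization_config=DiarizationConfig(enabled=True),
    )

    # Load audio, read the existing caption, then attach speaker labels.
    media_audio = client.audio_loader("talk.wav", channel_selector="average")
    caption = client._read_caption("talk.srt", None)
    diarized = client.speaker_diarization(
        input_media=media_audio,
        caption=caption,
        output_caption_path="talk.diarized.srt",
    )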
lattifai/cli/server.py CHANGED
@@ -4,6 +4,8 @@ import os
  import colorful
  import uvicorn
 
+ from lattifai.utils import safe_print
+
 
  def main():
      """Launch the LattifAI Web Interface."""
@@ -29,7 +31,7 @@ def main():
 
      args = parser.parse_args()
 
-     print(colorful.bold_green("🚀 Launching LattifAI Backend Server..."))
+     safe_print(colorful.bold_green("🚀 Launching LattifAI Backend Server..."))
      print(colorful.cyan(f"Server running at http://localhost:{args.port}"))
      print(colorful.yellow(f"Host: {args.host}"))
      print(colorful.yellow(f"Auto-reload: {'disabled' if args.no_reload else 'enabled'}"))
lattifai/cli/transcribe.py CHANGED
@@ -3,10 +3,8 @@
  from typing import Optional
 
  import nemo_run as run
- from lhotse.utils import Pathlike
  from typing_extensions import Annotated
 
- from lattifai.audio2 import AudioLoader, ChannelSelectorType
  from lattifai.cli.alignment import align as alignment_align
  from lattifai.config import (
      AlignmentConfig,
@@ -23,9 +21,8 @@ from lattifai.utils import _resolve_model_path
  def transcribe(
      input: Optional[str] = None,
      output_caption: Optional[str] = None,
-     output_dir: Optional[Pathlike] = None,
-     media_format: str = "mp3",
-     channel_selector: Optional[ChannelSelectorType] = "average",
+     media: Annotated[Optional[MediaConfig], run.Config[MediaConfig]] = None,
+     client: Annotated[Optional[ClientConfig], run.Config[ClientConfig]] = None,
      transcription: Annotated[Optional[TranscriptionConfig], run.Config[TranscriptionConfig]] = None,
  ):
      """
@@ -39,11 +36,8 @@ def transcribe(
      Args:
          input: Path to input audio/video file or YouTube URL (can be provided as positional argument)
          output_caption: Path for output caption file (can be provided as positional argument)
-         output_dir: Directory for output files when using YouTube URL
-         media_format: Media format for YouTube downloads (default: mp3)
-         channel_selector: Audio channel selection strategy (default: average)
-             Options: average, left, right, or an integer channel index.
-             Note: Ignored when input is a URL and Gemini transcriber is used.
+         media: Media configuration for input/output handling.
+             Fields: input_path, output_dir, media_format, channel_selector, streaming_chunk_secs
          transcription: Transcription service configuration.
              Fields: model_name, device, language, gemini_api_key
 
@@ -67,6 +61,11 @@ def transcribe(
          lai transcribe run audio.wav output.srt \\
              transcription.language=zh
 
+         # With MediaConfig settings
+         lai transcribe run audio.wav output.srt \\
+             media.channel_selector=left \\
+             media.streaming_chunk_secs=30.0
+
          # Full configuration with keyword arguments
          lai transcribe run \\
              input=audio.wav \\
@@ -78,68 +77,86 @@ def transcribe(
      from pathlib import Path
 
      import colorful
+     from lattifai_core.client import SyncAPIClient
 
+     from lattifai.audio2 import AudioLoader
      from lattifai.transcription import create_transcriber
+     from lattifai.utils import safe_print
 
-     # Initialize transcription config with defaults
+     # Initialize configs with defaults
+     client_config = client or ClientConfig()
      transcription_config = transcription or TranscriptionConfig()
+     media_config = media or MediaConfig()
+
+     # Initialize client wrapper to properly set client_wrapper
+     client_wrapper = SyncAPIClient(config=client_config)
+     transcription_config.client_wrapper = client_wrapper
+
+     # Initialize client wrapper to properly set client_wrapper
+     client_wrapper = SyncAPIClient(config=client_config)
+     transcription_config.client_wrapper = client_wrapper
 
      # Validate input is required
-     if not input:
-         raise ValueError("Input is required. Provide input as positional argument (file path or URL).")
+     if not input and not media_config.input_path:
+         raise ValueError("Input is required. Provide input as positional argument or media.input_path.")
+
+     # Assign input to media_config if provided
+     if input:
+         media_config.set_input_path(input)
 
      # Detect if input is a URL
-     is_url = input.startswith(("http://", "https://"))
+     is_url = media_config.is_input_remote()
 
      # Prepare output paths
      if is_url:
-         # For URLs, use output_dir
-         if output_dir:
-             output_path = Path(str(output_dir)).expanduser()
-             output_path.mkdir(parents=True, exist_ok=True)
-         else:
-             output_path = Path.cwd()
+         # For URLs, use output_dir from media_config or current directory
+         output_path = media_config.output_dir
      else:
          # For files, use input path directory
-         input_path = Path(str(input))
-         output_path = input_path.parent
+         output_path = Path(media_config.input_path).parent
 
      # Create transcriber
      if not transcription_config.lattice_model_path:
-         transcription_config.lattice_model_path = _resolve_model_path("Lattifai/Lattice-1")
+         transcription_config.lattice_model_path = _resolve_model_path(
+             "LattifAI/Lattice-1", getattr(transcription_config, "model_hub", "huggingface")
+         )
      transcriber = create_transcriber(transcription_config=transcription_config)
 
-     print(colorful.cyan(f"🎤 Starting transcription with {transcriber.name}..."))
-     print(colorful.cyan(f" Input: {input}"))
+     safe_print(colorful.cyan(f"🎤 Starting transcription with {transcriber.name}..."))
+     safe_print(colorful.cyan(f" Input: {media_config.input_path}"))
 
      # Perform transcription
      if is_url and transcriber.supports_url:
          # Check if transcriber supports URL directly
-         print(colorful.cyan(" Transcribing from URL directly..."))
-         transcript = asyncio.run(transcriber.transcribe(input))
+         safe_print(colorful.cyan(" Transcribing from URL directly..."))
+         transcript = asyncio.run(transcriber.transcribe(media_config.input_path))
      else:
          if is_url:
              # Download media first, then transcribe
-             print(colorful.cyan(" Downloading media from URL..."))
+             safe_print(colorful.cyan(" Downloading media from URL..."))
              from lattifai.workflow.youtube import YouTubeDownloader
 
              downloader = YouTubeDownloader()
              input_path = asyncio.run(
                  downloader.download_media(
-                     url=input,
+                     url=media_config.input_path,
                      output_dir=str(output_path),
-                     media_format=media_format,
-                     force_overwrite=False,
+                     media_format=media_config.normalize_format(),
+                     force_overwrite=media_config.force_overwrite,
                  )
              )
-             print(colorful.cyan(f" Media downloaded to: {input_path}"))
+             safe_print(colorful.cyan(f" Media downloaded to: {input_path}"))
          else:
-             input_path = Path(str(input))
+             input_path = Path(media_config.input_path)
 
-         print(colorful.cyan(" Loading audio..."))
+         safe_print(colorful.cyan(" Loading audio..."))
          # For files, load audio first
          audio_loader = AudioLoader(device=transcription_config.device)
-         media_audio = audio_loader(input_path, channel_selector=channel_selector)
+         media_audio = audio_loader(
+             input_path,
+             channel_selector=media_config.channel_selector,
+             streaming_chunk_secs=media_config.streaming_chunk_secs,
+         )
          transcript = asyncio.run(transcriber.transcribe(media_audio))
 
      # Determine output caption path
@@ -153,14 +170,14 @@ def transcribe(
          final_output = output_path / f"youtube_LattifAI_{transcriber.name}.{output_format}"
      else:
          # For files, use input filename with suffix
-         final_output = Path(str(input)).with_suffix(".LattifAI.srt")
+         final_output = Path(media_config.input_path).with_suffix(".LattifAI.srt")
 
-     print(colorful.cyan(f" Output: {final_output}"))
+     safe_print(colorful.cyan(f" Output: {final_output}"))
 
      # Write output
      transcriber.write(transcript, final_output, encoding="utf-8", cache_audio_events=False)
 
-     print(colorful.green(f"🎉 Transcription completed: {final_output}"))
+     safe_print(colorful.green(f"🎉 Transcription completed: {final_output}"))
 
      return transcript
 
lattifai/cli/youtube.py CHANGED
@@ -117,6 +117,7 @@ def youtube(
          force_overwrite=media_config.force_overwrite,
          split_sentence=caption_config.split_sentence,
          channel_selector=media_config.channel_selector,
+         streaming_chunk_secs=media_config.streaming_chunk_secs,
      )
 
 
lattifai/client.py CHANGED
@@ -18,6 +18,7 @@ from lattifai.errors import (
      LatticeEncodingError,
  )
  from lattifai.mixin import LattifAIClientMixin
+ from lattifai.utils import safe_print
 
  if TYPE_CHECKING:
      from lattifai.diarization import LattifAIDiarizer  # noqa: F401
@@ -91,6 +92,7 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
          input_caption_format: Optional[InputCaptionFormat] = None,
          split_sentence: Optional[bool] = None,
          channel_selector: Optional[str | int] = "average",
+         streaming_chunk_secs: Optional[float] = None,
      ) -> Caption:
          try:
              # Step 1: Get caption
@@ -100,10 +102,17 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
              media_audio = self.audio_loader(
                  input_media,
                  channel_selector=channel_selector,
+                 streaming_chunk_secs=streaming_chunk_secs,
              )
 
              if not input_caption:
-                 caption = self._transcribe(media_audio, source_lang=self.caption_config.source_lang, is_async=False)
+                 output_dir = None
+                 if output_caption_path:
+                     output_dir = Path(str(output_caption_path)).parent
+                     output_dir.mkdir(parents=True, exist_ok=True)
+                 caption = self._transcribe(
+                     media_audio, source_lang=self.caption_config.source_lang, is_async=False, output_dir=output_dir
+                 )
              else:
                  caption = self._read_caption(input_caption, input_caption_format)
 
@@ -113,7 +122,7 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
              alignment_strategy = self.aligner.config.strategy
 
              if alignment_strategy != "entire" or caption.transcription:
-                 print(colorful.cyan(f"🔄 Using segmented alignment strategy: {alignment_strategy}"))
+                 safe_print(colorful.cyan(f"🔄 Using segmented alignment strategy: {alignment_strategy}"))
 
              if caption.supervisions and alignment_strategy == "transcription":
                  # raise NotImplementedError("Transcription-based alignment is not yet implemented.")
@@ -126,7 +135,7 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
              if not caption.transcription:
                  import asyncio
 
-                 print(colorful.cyan("📝 Transcribing media for alignment..."))
+                 safe_print(colorful.cyan("📝 Transcribing media for alignment..."))
                  if output_caption_path:
                      transcript_file = (
                          Path(str(output_caption_path)).parent
@@ -223,11 +232,11 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
                      continue
 
                  offset = round(start, 4)
-                 emission = self.aligner.emission(
-                     media_audio.tensor[
-                         :, int(start * media_audio.sampling_rate) : int(end * media_audio.sampling_rate)
-                     ]
-                 )
+                 # Extract audio slice
+                 audio_slice_ndarray = media_audio.ndarray[
+                     :, int(start * media_audio.sampling_rate) : int(end * media_audio.sampling_rate)
+                 ]
+                 emission = self.aligner.emission(audio_slice_ndarray)
 
                  # Align segment
                  _supervisions, _alignments = self.aligner.alignment(
@@ -257,18 +266,9 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
              caption.supervisions = supervisions
              caption.alignments = alignments
 
-             # Step 5: Speaker diarization
-             if self.diarization_config.enabled and self.diarizer:
-                 print(colorful.cyan("🗣️ Performing speaker diarization..."))
-                 caption = self.speaker_diarization(
-                     input_media=media_audio,
-                     caption=caption,
-                     output_caption_path=output_caption_path,
-                 )
-             elif output_caption_path:
+             if output_caption_path:
                  self._write_caption(caption, output_caption_path)
 
-             return caption
          except (CaptionProcessingError, LatticeEncodingError, AlignmentError, LatticeDecodingError):
              # Re-raise our specific errors as-is
              raise
@@ -281,6 +281,17 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
                  context={"original_error": str(e), "error_type": e.__class__.__name__},
              )
 
+         # Step 5: Speaker diarization
+         if self.diarization_config.enabled and self.diarizer:
+             safe_print(colorful.cyan("🗣️ Performing speaker diarization..."))
+             caption = self.speaker_diarization(
+                 input_media=media_audio,
+                 caption=caption,
+                 output_caption_path=output_caption_path,
+             )
+
+         return caption
+
      def speaker_diarization(
          self,
          input_media: AudioData,
@@ -308,11 +319,18 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
          if output_caption_path:
              diarization_file = Path(str(output_caption_path)).with_suffix(".SpkDiar")
              if diarization_file.exists():
-                 print(colorful.cyan(f"Reading existing speaker diarization from {diarization_file}"))
+                 safe_print(colorful.cyan(f"Reading existing speaker diarization from {diarization_file}"))
                  caption.read_speaker_diarization(diarization_file)
 
          diarization, alignments = self.diarizer.diarize_with_alignments(
-             input_media, caption.alignments, diarization=caption.speaker_diarization
+             input_media,
+             caption.alignments,
+             diarization=caption.speaker_diarization,
+             alignment_fn=self.aligner.alignment,
+             transcribe_fn=self.transcriber.transcribe_numpy if self.transcriber else None,
+             separate_fn=self.aligner.separate if self.aligner.worker.separator_ort else None,
+             debug=self.diarizer.config.debug,
+             output_path=output_caption_path,
          )
          caption.alignments = alignments
          caption.speaker_diarization = diarization
@@ -321,105 +339,6 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
          if output_caption_path:
              self._write_caption(caption, output_caption_path)
 
-         if self.diarizer.config.debug:
-             # debug
-             from tgt import Interval, IntervalTier, TextGrid, write_to_file
-
-             debug_tg = TextGrid()
-             transcript_tier = IntervalTier(
-                 start_time=0,
-                 end_time=input_media.duration,
-                 name="transcript",
-                 objects=[Interval(sup.start, sup.end, sup.text) for sup in caption.alignments],
-             )
-             debug_tg.add_tier(transcript_tier)
-
-             speaker_tier = IntervalTier(
-                 start_time=0,
-                 end_time=input_media.duration,
-                 name="speaker",
-                 objects=[Interval(sup.start, sup.end, sup.speaker) for sup in caption.alignments],
-             )
-             debug_tg.add_tier(speaker_tier)
-
-             from collections import defaultdict
-
-             spk2intervals = defaultdict(lambda: [])
-             num_multispk = 0
-
-             segments, skipks = [], []
-             for k, supervision in enumerate(caption.alignments):  # TODO: alignments themselves contain overlaps, e.g. [event]
-                 # supervision = caption.alignments[k]
-                 if supervision.custom.get("speaker", []):
-                     num_multispk += 1
-                 else:
-                     continue
-
-                 if k in skipks:
-                     continue
-
-                 for speaker in supervision.custom.get("speaker", []):
-                     for name, start_time, end_time in speaker:
-                         spk2intervals[name].append(Interval(start_time, end_time, name))
-
-                 _segments = []
-                 if k > 0:
-                     _segments.append(caption.alignments[k - 1])
-                 _segments.append(supervision)
-                 while k + 1 < len(caption.alignments):
-                     skipks.append(k + 1)
-                     next_sup = caption.alignments[k + 1]
-                     if not next_sup.custom.get("speaker", []):
-                         k += 1
-                         break
-                     _segments.append(next_sup)
-                     k += 1
-
-                 if segments:
-                     if _segments[0].start >= segments[-1][-1].end:
-                         segments.append(_segments)
-                     else:
-                         if _segments[1:]:
-                             segments.append(_segments[1:])
-                         else:
-                             pass
-                 else:
-                     segments.append(_segments)
-
-             print(
-                 f"Number of multi-speaker segments: {num_multispk}/{len(caption.alignments)} segments: {len(segments)}"
-             )
-
-             for speaker, intervals in sorted(spk2intervals.items(), key=lambda x: x[0]):
-                 speaker_tier = IntervalTier(
-                     start_time=0, end_time=input_media.duration, name=speaker, objects=intervals
-                 )
-                 debug_tg.add_tier(speaker_tier)
-
-             for tier in caption.speaker_diarization.tiers:
-                 tier.name = f"Diarization-{tier.name}"
-                 debug_tg.add_tier(tier)
-
-             tier = IntervalTier(
-                 start_time=0,
-                 end_time=input_media.duration,
-                 name="resegment",
-                 objects=[
-                     Interval(round(sup.start, 2), round(sup.end, 2), sup.text)
-                     for _segments in segments
-                     for sup in _segments
-                 ],
-             )
-             debug_tg.add_tier(tier)
-
-             # if caption.audio_events:
-             #     for tier in caption.audio_events.tiers:
-             #         # tier.name = f"{tier.name}"
-             #         debug_tg.add_tier(tier)
-
-             debug_tgt_file = Path(str(output_caption_path)).with_suffix(".DiarizationDebug.TextGrid")
-             write_to_file(debug_tg, debug_tgt_file, format="long")
-
          return caption
 
      def youtube(
@@ -433,12 +352,13 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
          split_sentence: Optional[bool] = None,
          use_transcription: bool = False,
          channel_selector: Optional[str | int] = "average",
+         streaming_chunk_secs: Optional[float] = None,
      ) -> Caption:
          # Prepare output directory and media format
          output_dir = self._prepare_youtube_output_dir(output_dir)
          media_format = self._determine_media_format(media_format)
 
-         print(colorful.cyan(f"🎬 Starting YouTube workflow for: {url}"))
+         safe_print(colorful.cyan(f"🎬 Starting YouTube workflow for: {url}"))
 
          # Step 1: Download media
          media_file = self._download_media_sync(url, output_dir, media_format, force_overwrite)
@@ -460,7 +380,7 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
          output_caption_path = self._generate_output_caption_path(output_caption_path, media_file, output_dir)
 
          # Step 4: Perform alignment
-         print(colorful.cyan("🔗 Performing forced alignment..."))
+         safe_print(colorful.cyan("🔗 Performing forced alignment..."))
 
          caption: Caption = self.alignment(
              input_media=media_audio,
@@ -468,6 +388,7 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
              output_caption_path=output_caption_path,
              split_sentence=split_sentence,
              channel_selector=channel_selector,
+             streaming_chunk_secs=streaming_chunk_secs,
          )
 
          return caption
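Taken together, the client.py changes add a streaming_chunk_secs pass-through to audio loading and move the speaker-diarization step (and the final return) outside the alignment try/except. A minimal sketch of the updated public call, assuming default client configuration; the file names and the 30.0-second chunk size are illustrative:

    from lattifai.client import LattifAI

    client = LattifAI()

    # streaming_chunk_secs is new in 1.1.0 and is forwarded to audio_loader();
    # None keeps the previous non-chunked loading behaviour.
    caption = client.alignment(
        input_media="episode.mp3",
        input_caption="episode.srt",
        output_caption_path="episode.aligned.srt",
        channel_selector="average",
        streaming_chunk_secs=30.0,
    )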
lattifai/config/alignment.py CHANGED
@@ -18,8 +18,11 @@ class AlignmentConfig:
      """
 
      # Alignment configuration
-     model_name: str = "Lattifai/Lattice-1"
-     """Model identifier or path to local model directory (e.g., 'Lattifai/Lattice-1')."""
+     model_name: str = "LattifAI/Lattice-1"
+     """Model identifier or path to local model directory (e.g., 'LattifAI/Lattice-1')."""
+
+     model_hub: Literal["huggingface", "modelscope"] = "huggingface"
+     """Which model hub to use when resolving remote model names: 'huggingface' or 'modelscope'."""
 
      device: Literal["cpu", "cuda", "mps", "auto"] = "auto"
      """Computation device: 'cpu' for CPU, 'cuda' for NVIDIA GPU, 'mps' for Apple Silicon."""
@@ -58,6 +61,38 @@ class AlignmentConfig:
      Default: 4.0 seconds. Useful for detecting scene changes or natural breaks in content.
      """
 
+     # Beam search parameters for forced alignment
+     search_beam: int = 200
+     """Search beam size for beam search decoding. Larger values explore more hypotheses but are slower.
+     Default: 200. Typical range: 20-500.
+     """
+
+     output_beam: int = 80
+     """Output beam size for keeping top hypotheses. Should be smaller than search_beam.
+     Default: 80. Typical range: 10-200.
+     """
+
+     min_active_states: int = 400
+     """Minimum number of active states during decoding. Controls memory and search space.
+     Default: 400. Typical range: 30-1000.
+     """
+
+     max_active_states: int = 10000
+     """Maximum number of active states during decoding. Prevents excessive memory usage.
+     Default: 10000. Typical range: 1000-20000.
+     """
+
+     # Alignment timing configuration
+     start_margin: float = 0.08
+     """Maximum start time margin (in seconds) to extend segment boundaries at the beginning.
+     Default: 0.08. Typical range: 0.0-0.5.
+     """
+
+     end_margin: float = 0.20
+     """Maximum end time margin (in seconds) to extend segment boundaries at the end.
+     Default: 0.20. Typical range: 0.0-0.5.
+     """
+
      client_wrapper: Optional["SyncAPIClient"] = field(default=None, repr=False)
      """Reference to the SyncAPIClient instance. Auto-set during client initialization."""
 
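The new AlignmentConfig fields are ordinary dataclass attributes, so they can be overridden in Python or with the dotted CLI syntax used elsewhere in this release (for example alignment.search_beam=300). A small sketch with assumed override values:

    from lattifai.config import AlignmentConfig

    # Wider beams explore more hypotheses (slower); margins pad segment boundaries in seconds.
    config = AlignmentConfig(
        model_hub="huggingface",
        search_beam=300,
        output_beam=100,
        max_active_states=10000,
        start_margin=0.1,
        end_margin=0.25,
    )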
lattifai/config/caption.py CHANGED
@@ -48,7 +48,7 @@ class CaptionConfig:
      include_speaker_in_text: bool = True
      """Preserve speaker labels in caption text content."""
 
-     normalize_text: bool = False
+     normalize_text: bool = True
      """Clean HTML entities and normalize whitespace in caption text."""
 
      split_sentence: bool = False
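Note the behavioural change here: normalize_text now defaults to True, so HTML-entity cleanup and whitespace normalization run unless explicitly disabled. A one-line sketch for callers who want the 1.0.4 behaviour back:

    from lattifai.config import CaptionConfig

    # Opt out of the new default and leave caption text untouched, as in 1.0.4.
    caption_config = CaptionConfig(normalize_text=False)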