lattifai 0.4.5__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. lattifai/__init__.py +61 -47
  2. lattifai/alignment/__init__.py +6 -0
  3. lattifai/alignment/lattice1_aligner.py +119 -0
  4. lattifai/alignment/lattice1_worker.py +185 -0
  5. lattifai/{tokenizer → alignment}/phonemizer.py +4 -4
  6. lattifai/alignment/segmenter.py +166 -0
  7. lattifai/{tokenizer → alignment}/tokenizer.py +244 -169
  8. lattifai/audio2.py +211 -0
  9. lattifai/caption/__init__.py +20 -0
  10. lattifai/caption/caption.py +1275 -0
  11. lattifai/{io → caption}/gemini_reader.py +30 -30
  12. lattifai/{io → caption}/gemini_writer.py +17 -17
  13. lattifai/{io → caption}/supervision.py +4 -3
  14. lattifai/caption/text_parser.py +145 -0
  15. lattifai/cli/__init__.py +17 -0
  16. lattifai/cli/alignment.py +153 -0
  17. lattifai/cli/caption.py +204 -0
  18. lattifai/cli/server.py +19 -0
  19. lattifai/cli/transcribe.py +197 -0
  20. lattifai/cli/youtube.py +128 -0
  21. lattifai/client.py +460 -251
  22. lattifai/config/__init__.py +20 -0
  23. lattifai/config/alignment.py +73 -0
  24. lattifai/config/caption.py +178 -0
  25. lattifai/config/client.py +46 -0
  26. lattifai/config/diarization.py +67 -0
  27. lattifai/config/media.py +335 -0
  28. lattifai/config/transcription.py +84 -0
  29. lattifai/diarization/__init__.py +5 -0
  30. lattifai/diarization/lattifai.py +89 -0
  31. lattifai/errors.py +98 -91
  32. lattifai/logging.py +116 -0
  33. lattifai/mixin.py +552 -0
  34. lattifai/server/app.py +420 -0
  35. lattifai/transcription/__init__.py +76 -0
  36. lattifai/transcription/base.py +108 -0
  37. lattifai/transcription/gemini.py +219 -0
  38. lattifai/transcription/lattifai.py +103 -0
  39. lattifai/{workflows → transcription}/prompts/__init__.py +4 -4
  40. lattifai/types.py +30 -0
  41. lattifai/utils.py +16 -44
  42. lattifai/workflow/__init__.py +22 -0
  43. lattifai/workflow/agents.py +6 -0
  44. lattifai/{workflows → workflow}/base.py +22 -22
  45. lattifai/{workflows → workflow}/file_manager.py +239 -215
  46. lattifai/workflow/youtube.py +564 -0
  47. lattifai-1.0.0.dist-info/METADATA +736 -0
  48. lattifai-1.0.0.dist-info/RECORD +52 -0
  49. {lattifai-0.4.5.dist-info → lattifai-1.0.0.dist-info}/WHEEL +1 -1
  50. lattifai-1.0.0.dist-info/entry_points.txt +13 -0
  51. {lattifai-0.4.5.dist-info → lattifai-1.0.0.dist-info}/licenses/LICENSE +1 -1
  52. lattifai/base_client.py +0 -126
  53. lattifai/bin/__init__.py +0 -3
  54. lattifai/bin/agent.py +0 -325
  55. lattifai/bin/align.py +0 -296
  56. lattifai/bin/cli_base.py +0 -25
  57. lattifai/bin/subtitle.py +0 -210
  58. lattifai/io/__init__.py +0 -42
  59. lattifai/io/reader.py +0 -85
  60. lattifai/io/text_parser.py +0 -75
  61. lattifai/io/utils.py +0 -15
  62. lattifai/io/writer.py +0 -90
  63. lattifai/tokenizer/__init__.py +0 -3
  64. lattifai/workers/__init__.py +0 -3
  65. lattifai/workers/lattice1_alpha.py +0 -284
  66. lattifai/workflows/__init__.py +0 -34
  67. lattifai/workflows/agents.py +0 -10
  68. lattifai/workflows/gemini.py +0 -167
  69. lattifai/workflows/prompts/README.md +0 -22
  70. lattifai/workflows/prompts/gemini/README.md +0 -24
  71. lattifai/workflows/prompts/gemini/transcription_gem.txt +0 -81
  72. lattifai/workflows/youtube.py +0 -931
  73. lattifai-0.4.5.dist-info/METADATA +0 -808
  74. lattifai-0.4.5.dist-info/RECORD +0 -39
  75. lattifai-0.4.5.dist-info/entry_points.txt +0 -3
  76. {lattifai-0.4.5.dist-info → lattifai-1.0.0.dist-info}/top_level.txt +0 -0
lattifai/mixin.py ADDED
@@ -0,0 +1,552 @@
1
+ """Mixin class providing shared functionality for LattifAI clients."""
2
+
3
+ import tempfile
4
+ from pathlib import Path
5
+ from typing import TYPE_CHECKING, Awaitable, Optional, Union
6
+
7
+ import colorful
8
+ from lhotse.utils import Pathlike
9
+
10
+ from lattifai.audio2 import AudioData
11
+ from lattifai.caption import Caption
12
+ from lattifai.errors import CaptionProcessingError
13
+
14
+ if TYPE_CHECKING:
15
+ from .config import AlignmentConfig, CaptionConfig, ClientConfig, DiarizationConfig, TranscriptionConfig
16
+
17
+
18
class LattifAIClientMixin:
    """
    Mixin class providing shared functionality for LattifAI clients.

    This mixin contains common logic for transcription and downloading that is
    used by both synchronous and asynchronous client implementations.
    """

    # Shared docstring templates for class, __init__, alignment, and youtube methods.
    # NOTE(review): the placeholders ({client_class}, {await_keyword}, ...) are
    # presumably filled via str.format() by the concrete sync/async client
    # classes — confirm against the client implementations.
    _CLASS_DOC = """
    {sync_or_async} LattifAI client for audio/video-caption alignment.

    This client provides {sync_or_async_lower} methods for aligning audio/video files with caption/transcript
    text using the Lattice-1 forced alignment model. It supports multiple caption formats
    (SRT, VTT, ASS, TXT) and provides word-level alignment with configurable sentence splitting.

    The client uses a config-driven architecture with four main configuration objects:
    - ClientConfig: API connection settings (API key, base URL, timeout, retries)
    - AlignmentConfig: Model and alignment behavior settings
    - CaptionConfig: Caption I/O format and processing settings
    - TranscriptionConfig: Transcription service settings (optional, for YouTube workflow)

    Example:
        >>> from lattifai import {client_class}, ClientConfig
        >>>
        >>> # Initialize with default settings
        >>> client = {client_class}()
        >>>
        >>> # Or with custom configuration
        >>> config = ClientConfig(api_key="your-api-key")
        >>> client = {client_class}(config=config)
        >>>
        >>> # Perform alignment
        >>> {await_keyword}alignments, output_path = {await_keyword}client.alignment(
        ...     input_media="audio.wav",
        ...     input_caption="caption.srt",
        ...     output_caption_path="aligned.srt"
        ... )

    Attributes:
        aligner: Lattice1Aligner instance for performing forced alignment{async_note}
        captioner: Captioner instance for reading/writing caption files
        transcriber: Optional transcriber instance for audio transcription{transcriber_note}
    """

    _INIT_DOC = """
    Initialize {client_class} {sync_or_async_lower} client.

    Args:
        client_config: Client configuration for API connection settings. If None, uses defaults
            (reads API key from LATTIFAI_API_KEY environment variable).
        alignment_config: Alignment {config_desc}
            If None, uses {default_desc}.
        caption_config: Caption I/O configuration for format handling and processing.
            If None, uses default settings{caption_note}.
        transcription_config: Transcription service configuration{transcription_note}.

    Raises:
        ConfigurationError: If API key is not provided {api_key_source}.
    """

    _ALIGNMENT_DOC = """
    Perform {async_prefix}forced alignment on audio and caption/text.

    This {async_word}method aligns caption text with audio by finding the precise timing of {timing_desc}
    and caption segment. {concurrency_note}

    The alignment process consists of five steps:
    1. Parse the input caption file into segments{async_suffix1}
    2. Generate a lattice graph from caption text{async_suffix2}
    3. Search the lattice using audio features{async_suffix3}
    4. Decode results to extract word-level timings{async_suffix4}
    5. Export aligned captions (if output path provided{async_suffix5})

    Args:
        input_media: Path to audio/video file (WAV, MP3, FLAC, MP4, etc.). Must be readable by ffmpeg.
        input_caption: Path to caption or plain text file to align with audio.
        input_caption_format: Input caption format ('srt', 'vtt', 'ass', 'txt'). If None, {format_default}
            from file extension or uses config default.
        split_sentence: Enable automatic sentence re-splitting for better alignment accuracy.
            If None, uses config default (typically False).
        output_caption_path: Optional path to write aligned caption file. If provided,
            exports results{export_note}.

    Returns:
        Tuple containing:
        - List of Supervision objects with aligned timing information{timing_note}
        - Output caption path (same as input parameter, or None if not provided)

    Raises:
        CaptionProcessingError: If caption file cannot be parsed or output cannot be written.
        LatticeEncodingError: If lattice graph generation fails (invalid text format).
        AlignmentError: If audio alignment fails (audio processing or model inference error).
        LatticeDecodingError: If lattice decoding fails (invalid results from model).

    Example:
        >>> {example_imports}
        >>> {example_code}
    """

    _YOUTUBE_METHOD_DOC = """
    Download and align YouTube video with captions or transcription.

    This end-to-end method handles the complete YouTube alignment workflow:
    1. Downloads media from YouTube in specified format
    2. Downloads captions OR transcribes audio (based on config)
    3. Performs forced alignment with Lattice-1 model
    4. Exports aligned captions

    Args:
        url: YouTube video URL (e.g., https://youtube.com/watch?v=VIDEO_ID)
        output_dir: Directory for downloaded files. If None, uses temporary directory.
        media_format: Media format to download (mp3, mp4, wav, etc.). If None, uses config default.
        source_lang: Specific caption language to download (e.g., 'en', 'zh'). If None, downloads all.
        force_overwrite: Skip confirmation prompts and overwrite existing files.
        output_caption_path: Path for aligned caption output. If None, auto-generates.
        **alignment_kwargs: Additional arguments passed to alignment() method.

    Returns:
        Tuple containing:
        - List of Supervision objects with aligned timing information
        - Output caption path

    Raises:
        ValueError: If transcription is requested but transcriber not configured.
        RuntimeError: If download or transcription fails.
        CaptionProcessingError: If caption processing fails.
        AlignmentError: If alignment fails.

    Example:
        >>> from lattifai import {client_class}
        >>> from lattifai.config import TranscriptionConfig
        >>>
        >>> # With YouTube captions
        >>> client = {client_class}()
        >>> {await_keyword}alignments, path = {await_keyword}client.youtube(
        ...     url="https://youtube.com/watch?v=VIDEO_ID",
        ...     output_dir="./downloads"
        ... )
        >>>
        >>> # With Gemini transcription
        >>> config = TranscriptionConfig(gemini_api_key="YOUR_KEY")
        >>> client = {client_class}(transcription_config=config)
        >>> {await_keyword}alignments, path = {await_keyword}client.youtube(
        ...     url="https://youtube.com/watch?v=VIDEO_ID",
        ...     use_transcription=True
        ... )
    """
+
167
+ def _init_configs(
168
+ self,
169
+ alignment_config: Optional["AlignmentConfig"],
170
+ transcription_config: Optional["TranscriptionConfig"],
171
+ diarization_config: Optional["DiarizationConfig"] = None,
172
+ ) -> tuple:
173
+ """Initialize all configs with defaults if not provided."""
174
+ from .config import AlignmentConfig, DiarizationConfig, TranscriptionConfig
175
+
176
+ if alignment_config is None:
177
+ alignment_config = AlignmentConfig()
178
+ if transcription_config is None:
179
+ transcription_config = TranscriptionConfig()
180
+ if diarization_config is None:
181
+ diarization_config = DiarizationConfig()
182
+
183
+ from lattifai.utils import _resolve_model_path
184
+
185
+ if transcription_config is not None:
186
+ transcription_config.lattice_model_path = _resolve_model_path(alignment_config.model_name)
187
+
188
+ # Set client_wrapper for all configs
189
+ alignment_config.client_wrapper = self
190
+ transcription_config.client_wrapper = self
191
+ diarization_config.client_wrapper = self
192
+
193
+ return alignment_config, transcription_config, diarization_config
194
+
195
+ def _init_shared_components(
196
+ self,
197
+ transcription_config: Optional["TranscriptionConfig"],
198
+ ) -> None:
199
+ """Initialize shared components (transcriber, downloader)."""
200
+ # transcriber (optional, lazy loaded when needed)
201
+ self.transcription_config = transcription_config
202
+ self._transcriber = None
203
+
204
+ # downloader (lazy loaded when needed)
205
+ self._downloader = None
206
+
207
+ @property
208
+ def transcriber(self):
209
+ """Lazy load transcriber based on config."""
210
+ if self._transcriber is None and self.transcription_config:
211
+ from .transcription import create_transcriber
212
+
213
+ self._transcriber = create_transcriber(transcription_config=self.transcription_config)
214
+ return self._transcriber
215
+
216
+ @property
217
+ def downloader(self):
218
+ """Lazy load YouTube downloader."""
219
+ if self._downloader is None:
220
+ from .workflow.youtube import YouTubeDownloader
221
+
222
+ self._downloader = YouTubeDownloader()
223
+ return self._downloader
224
+
225
+ def _prepare_youtube_output_dir(self, output_dir: Optional["Pathlike"]) -> Path:
226
+ """Prepare and return output directory for YouTube downloads."""
227
+ if output_dir is None:
228
+ output_dir = Path(tempfile.gettempdir()) / "lattifai_youtube"
229
+ else:
230
+ output_dir = Path(output_dir).expanduser()
231
+ output_dir.mkdir(parents=True, exist_ok=True)
232
+ return output_dir
233
+
234
+ def _determine_media_format(self, media_format: Optional[str]) -> str:
235
+ """Determine media format from parameter or config."""
236
+ return media_format or "mp3"
237
+
238
+ def _generate_output_caption_path(
239
+ self, output_caption_path: Optional["Pathlike"], media_file: str, output_dir: Path
240
+ ) -> Path:
241
+ """Generate output caption path if not provided."""
242
+ if not output_caption_path:
243
+ media_name = Path(media_file).stem
244
+ output_format = self.caption_config.output_format or "srt"
245
+ output_caption_path = output_dir / f"{media_name}_LattifAI.{output_format}"
246
+ return Path(output_caption_path)
247
+
248
+ def _validate_transcription_setup(self) -> None:
249
+ """Validate that transcription is properly configured if requested."""
250
+ if not self.transcriber:
251
+ raise ValueError(
252
+ "Transcription requested but transcriber not configured. "
253
+ "Provide TranscriptionConfig with valid API key."
254
+ )
255
+
256
+ def _read_caption(
257
+ self,
258
+ input_caption: Union[Pathlike, Caption],
259
+ input_caption_format: Optional[str] = None,
260
+ normalize_text: Optional[bool] = None,
261
+ verbose: bool = True,
262
+ ) -> Caption:
263
+ """
264
+ Read caption file or return Caption object directly.
265
+
266
+ Args:
267
+ input_caption: Path to caption file or Caption object
268
+ input_caption_format: Optional format hint for parsing
269
+
270
+ Returns:
271
+ Caption object
272
+
273
+ Raises:
274
+ CaptionProcessingError: If caption cannot be read
275
+ """
276
+ if isinstance(input_caption, Caption):
277
+ return input_caption
278
+
279
+ try:
280
+ if verbose:
281
+ print(colorful.cyan(f"📖 Step 1: Reading caption file from {input_caption}"))
282
+ caption = Caption.read(
283
+ input_caption,
284
+ format=input_caption_format,
285
+ normalize_text=normalize_text if normalize_text is not None else self.caption_config.normalize_text,
286
+ )
287
+ diarization_file = Path(str(input_caption)).with_suffix(".SpkDiar")
288
+ if diarization_file.exists():
289
+ if verbose:
290
+ print(colorful.cyan(f"📖 Step 1b: Reading speaker diarization from {diarization_file}"))
291
+ caption.read_speaker_diarization(diarization_file)
292
+ events_file = Path(str(input_caption)).with_suffix(".AED")
293
+ if events_file.exists():
294
+ if verbose:
295
+ print(colorful.cyan(f"📖 Step 1c: Reading audio events from {events_file}"))
296
+ from tgt import read_textgrid
297
+
298
+ caption.audio_events = read_textgrid(events_file)
299
+
300
+ if verbose:
301
+ print(colorful.green(f" ✓ Parsed {len(caption)} caption segments"))
302
+ return caption
303
+ except Exception as e:
304
+ raise CaptionProcessingError(
305
+ f"Failed to parse caption file: {input_caption}",
306
+ caption_path=str(input_caption),
307
+ context={"original_error": str(e)},
308
+ )
309
+
310
+ def _write_caption(
311
+ self,
312
+ caption: Caption,
313
+ output_caption_path: Pathlike,
314
+ ) -> Pathlike:
315
+ """
316
+ Write caption to file.
317
+
318
+ Args:
319
+ caption: Caption object to write
320
+ output_caption_path: Output file path
321
+
322
+ Returns:
323
+ Path to written file
324
+
325
+ Raises:
326
+ CaptionProcessingError: If caption cannot be written
327
+ """
328
+ try:
329
+ result = caption.write(
330
+ output_caption_path,
331
+ include_speaker_in_text=self.caption_config.include_speaker_in_text,
332
+ )
333
+ diarization_file = Path(str(output_caption_path)).with_suffix(".SpkDiar")
334
+ if not diarization_file.exists() and caption.speaker_diarization:
335
+ print(colorful.green(f" Writing speaker diarization to: {diarization_file}"))
336
+ caption.write_speaker_diarization(diarization_file)
337
+
338
+ print(colorful.green(f"🎉🎉🎉🎉🎉 Caption file written to: {output_caption_path}"))
339
+ return result
340
+ except Exception as e:
341
+ raise CaptionProcessingError(
342
+ f"Failed to write output file: {output_caption_path}",
343
+ caption_path=str(output_caption_path),
344
+ context={"original_error": str(e)},
345
+ )
346
+
347
+ async def _download_media(
348
+ self,
349
+ url: str,
350
+ output_dir: Path,
351
+ media_format: str,
352
+ force_overwrite: bool,
353
+ ) -> str:
354
+ """Download media from YouTube (async implementation)."""
355
+ print(colorful.cyan("📥 Downloading media from YouTube..."))
356
+ media_file = await self.downloader.download_media(
357
+ url=url,
358
+ output_dir=str(output_dir),
359
+ media_format=media_format,
360
+ force_overwrite=force_overwrite,
361
+ )
362
+ print(colorful.green(f" ✓ Media downloaded: {media_file}"))
363
+ return media_file
364
+
365
+ def _download_media_sync(
366
+ self,
367
+ url: str,
368
+ output_dir: Path,
369
+ media_format: str,
370
+ force_overwrite: bool,
371
+ ) -> str:
372
+ """Download media from YouTube (sync wrapper)."""
373
+ import asyncio
374
+
375
+ return asyncio.run(self._download_media(url, output_dir, media_format, force_overwrite))
376
+
377
+ def _transcribe(
378
+ self,
379
+ media_file: Union[str, Path, AudioData],
380
+ source_lang: Optional[str],
381
+ is_async: bool = False,
382
+ ) -> Caption:
383
+ """
384
+ Get captions by downloading or transcribing.
385
+
386
+ Args:
387
+ url: YouTube video URL
388
+ output_dir: Output directory for caption file
389
+ media_file: Media file path (used to generate caption filename)
390
+ force_overwrite: Force overwrite existing files
391
+ source_lang: Caption language to download
392
+ is_async: If True, returns coroutine; if False, runs synchronously
393
+
394
+ Returns:
395
+ Caption file path (str) or coroutine that returns str
396
+ """
397
+ import asyncio
398
+
399
+ async def _async_impl():
400
+ # Transcription mode: use Transcriber to transcribe
401
+ self._validate_transcription_setup()
402
+
403
+ print(colorful.cyan(f"🎤 Transcribing({self.transcriber.name}) media: {str(media_file)} ..."))
404
+ transcription = await self.transcriber.transcribe_file(media_file, language=source_lang)
405
+ print(colorful.green(" ✓ Transcription completed."))
406
+
407
+ if "Gemini" in self.transcriber.name:
408
+ # write to temp file and use Caption read
409
+ with tempfile.NamedTemporaryFile(suffix=self.transcriber.file_suffix, delete=True) as tmp_file:
410
+ tmp_path = Path(tmp_file.name)
411
+ await asyncio.to_thread(
412
+ self.transcriber.write,
413
+ transcription,
414
+ tmp_path,
415
+ encoding="utf-8",
416
+ )
417
+ transcription = self._read_caption(
418
+ tmp_path, input_caption_format="gemini", normalize_text=False, verbose=False
419
+ )
420
+
421
+ return transcription
422
+
423
+ if is_async:
424
+ return _async_impl()
425
+ else:
426
+ return asyncio.run(_async_impl())
427
+
428
    def _download_or_transcribe_caption(
        self,
        url: str,
        output_dir: Path,
        media_file: Union[str, Path, AudioData],
        force_overwrite: bool,
        source_lang: Optional[str],
        is_async: bool = False,
        use_transcription: bool = False,
    ) -> Union[Union[str, Caption], Awaitable[Union[str, Caption]]]:
        """
        Get captions by downloading from YouTube or by transcribing the media.

        Resolution order:
        1. ``caption_config.input_path`` if set (must exist, else FileNotFoundError).
        2. With ``use_transcription``: reuse an existing transcript file (after a
           user prompt) or run the configured transcriber.
        3. Otherwise: download YouTube captions; the downloader may return the
           ``TRANSCRIBE_CHOICE`` sentinel, in which case this method recurses in
           transcription mode.

        Args:
            url: YouTube video URL.
            output_dir: Output directory for the caption/transcript file.
            media_file: Media file path (used to derive the transcript filename).
            force_overwrite: Overwrite existing files without prompting.
            source_lang: Caption language to download / transcription language hint.
            is_async: If True, returns a coroutine; if False, runs synchronously.
            use_transcription: Force transcription instead of caption download.

        Returns:
            Caption object or caption file path (str), or a coroutine producing
            one when ``is_async`` is True.

        Raises:
            FileNotFoundError: If a configured ``input_path`` does not exist.
            RuntimeError: If the user cancels, or no caption source is available.
            ValueError: If transcription is requested without a transcriber.
        """
        import asyncio

        from lattifai.workflow.youtube import TRANSCRIBE_CHOICE

        # NOTE(review): this dereferences `.name` unconditionally — it assumes the
        # transcriber property never returns None here (transcription_config is
        # defaulted in _init_configs); confirm for clients built without it.
        transcriber_name = self.transcriber.name

        async def _async_impl():
            # First check if caption input_path is already provided
            if self.caption_config.input_path:
                caption_path = Path(self.caption_config.input_path)
                if caption_path.exists():
                    print(colorful.green(f"📄 Using provided caption file: {caption_path}"))
                    return str(caption_path)
                else:
                    raise FileNotFoundError(f"Provided caption path does not exist: {caption_path}")

            # Generate transcript file path
            transcript_file = output_dir / f"{Path(str(media_file)).stem}_{self.transcriber.file_name}"

            if use_transcription:
                # Transcription mode: use Transcriber to transcribe
                self._validate_transcription_setup()

                # Check if transcript file already exists
                if transcript_file.exists() and not force_overwrite:
                    from .workflow.file_manager import FileExistenceManager

                    # Interactive prompt runs in a worker thread to avoid
                    # blocking the event loop.
                    choice = await asyncio.to_thread(
                        FileExistenceManager.prompt_file_selection,
                        file_type=f"{transcriber_name} transcript",
                        files=[str(transcript_file)],
                        operation="transcribe",
                    )

                    if choice == "cancel":
                        raise RuntimeError("Transcription cancelled by user")
                    elif choice == "use" or choice == str(transcript_file):
                        # User chose to use existing file (handles both "use" and file path)
                        if "gemini" in transcriber_name.lower():
                            return str(transcript_file)

                        caption = self._read_caption(transcript_file, normalize_text=False)
                        caption.transcription = caption.supervisions
                        caption.supervisions = None
                        return caption

                    # elif choice == "overwrite": continue to transcribe below

                print(colorful.cyan(f"🎤 Transcribing media with {transcriber_name}..."))
                if self.transcriber.supports_url:
                    transcription = await self.transcriber.transcribe(url, language=source_lang)
                else:
                    transcription = await self.transcriber.transcribe_file(media_file, language=source_lang)

                await asyncio.to_thread(self.transcriber.write, transcription, transcript_file, encoding="utf-8")

                if isinstance(transcription, Caption):
                    caption_file = transcription
                else:
                    caption_file = str(transcript_file)
                print(colorful.green(f" ✓ Transcription completed: {caption_file}"))
            else:
                # Download YouTube captions
                caption_file = await self.downloader.download_captions(
                    url=url,
                    output_dir=str(output_dir),
                    force_overwrite=force_overwrite,
                    source_lang=source_lang,
                    transcriber_name=transcriber_name,
                )

            # The downloader/transcriber handed back the transcript path itself:
            # re-read it as a Caption.
            if str(caption_file) == str(transcript_file):
                # Transcription was used
                caption = self._read_caption(transcript_file, normalize_text=False)
                if transcriber_name and "gemini" not in transcriber_name.lower():
                    caption.transcription = caption.supervisions  # alignment will trust transcription's timestamps
                    caption.supervisions = None
                else:
                    # Gemini transcription's timestamps are not accurate
                    pass

                return caption

            if caption_file == TRANSCRIBE_CHOICE:
                # User opted to transcribe instead of using downloaded captions:
                # recurse with transcription forced on.
                return await self._download_or_transcribe_caption(
                    url=url,
                    output_dir=output_dir,
                    media_file=media_file,
                    force_overwrite=force_overwrite,
                    source_lang=source_lang,
                    is_async=True,
                    use_transcription=True,
                )
            elif not caption_file:
                raise RuntimeError("No caption file available and transcription was declined by user.")

            return caption_file

        if is_async:
            return _async_impl()
        else:
            return asyncio.run(_async_impl())