lattifai 0.4.6__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/__init__.py +42 -27
- lattifai/alignment/__init__.py +6 -0
- lattifai/alignment/lattice1_aligner.py +119 -0
- lattifai/{workers/lattice1_alpha.py → alignment/lattice1_worker.py} +33 -132
- lattifai/{tokenizer → alignment}/phonemizer.py +1 -1
- lattifai/alignment/segmenter.py +166 -0
- lattifai/{tokenizer → alignment}/tokenizer.py +186 -112
- lattifai/audio2.py +211 -0
- lattifai/caption/__init__.py +20 -0
- lattifai/caption/caption.py +1275 -0
- lattifai/{io → caption}/supervision.py +1 -0
- lattifai/{io → caption}/text_parser.py +53 -10
- lattifai/cli/__init__.py +17 -0
- lattifai/cli/alignment.py +153 -0
- lattifai/cli/caption.py +204 -0
- lattifai/cli/server.py +19 -0
- lattifai/cli/transcribe.py +197 -0
- lattifai/cli/youtube.py +128 -0
- lattifai/client.py +455 -246
- lattifai/config/__init__.py +20 -0
- lattifai/config/alignment.py +73 -0
- lattifai/config/caption.py +178 -0
- lattifai/config/client.py +46 -0
- lattifai/config/diarization.py +67 -0
- lattifai/config/media.py +335 -0
- lattifai/config/transcription.py +84 -0
- lattifai/diarization/__init__.py +5 -0
- lattifai/diarization/lattifai.py +89 -0
- lattifai/errors.py +41 -34
- lattifai/logging.py +116 -0
- lattifai/mixin.py +552 -0
- lattifai/server/app.py +420 -0
- lattifai/transcription/__init__.py +76 -0
- lattifai/transcription/base.py +108 -0
- lattifai/transcription/gemini.py +219 -0
- lattifai/transcription/lattifai.py +103 -0
- lattifai/types.py +30 -0
- lattifai/utils.py +3 -31
- lattifai/workflow/__init__.py +22 -0
- lattifai/workflow/agents.py +6 -0
- lattifai/{workflows → workflow}/file_manager.py +81 -57
- lattifai/workflow/youtube.py +564 -0
- lattifai-1.0.0.dist-info/METADATA +736 -0
- lattifai-1.0.0.dist-info/RECORD +52 -0
- {lattifai-0.4.6.dist-info → lattifai-1.0.0.dist-info}/WHEEL +1 -1
- lattifai-1.0.0.dist-info/entry_points.txt +13 -0
- lattifai/base_client.py +0 -126
- lattifai/bin/__init__.py +0 -3
- lattifai/bin/agent.py +0 -324
- lattifai/bin/align.py +0 -295
- lattifai/bin/cli_base.py +0 -25
- lattifai/bin/subtitle.py +0 -210
- lattifai/io/__init__.py +0 -43
- lattifai/io/reader.py +0 -86
- lattifai/io/utils.py +0 -15
- lattifai/io/writer.py +0 -102
- lattifai/tokenizer/__init__.py +0 -3
- lattifai/workers/__init__.py +0 -3
- lattifai/workflows/__init__.py +0 -34
- lattifai/workflows/agents.py +0 -12
- lattifai/workflows/gemini.py +0 -167
- lattifai/workflows/prompts/README.md +0 -22
- lattifai/workflows/prompts/gemini/README.md +0 -24
- lattifai/workflows/prompts/gemini/transcription_gem.txt +0 -81
- lattifai/workflows/youtube.py +0 -931
- lattifai-0.4.6.dist-info/METADATA +0 -806
- lattifai-0.4.6.dist-info/RECORD +0 -39
- lattifai-0.4.6.dist-info/entry_points.txt +0 -3
- /lattifai/{io → caption}/gemini_reader.py +0 -0
- /lattifai/{io → caption}/gemini_writer.py +0 -0
- /lattifai/{workflows → transcription}/prompts/__init__.py +0 -0
- /lattifai/{workflows → workflow}/base.py +0 -0
- {lattifai-0.4.6.dist-info → lattifai-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {lattifai-0.4.6.dist-info → lattifai-1.0.0.dist-info}/top_level.txt +0 -0
lattifai/logging.py
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
"""Unified logging configuration for LattifAI."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import sys
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
# Default log format
# Verbose format including timestamp and logger name, exported for callers
# that want to pass it to setup_logger() explicitly.
DEFAULT_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
# Compact format; used by setup_logger() when no format_string is given.
SIMPLE_FORMAT = "%(levelname)s: %(message)s"
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def setup_logger(
    name: str,
    level: Optional[int] = None,
    format_string: Optional[str] = None,
    handler: Optional[logging.Handler] = None,
) -> logging.Logger:
    """
    Configure and return a logger under the ``lattifai.`` namespace.

    Args:
        name: Logger name; the ``lattifai.`` prefix is prepended when missing.
        level: Logging level; defaults to ``logging.INFO``.
        format_string: Record format; defaults to ``SIMPLE_FORMAT``.
        handler: Handler to attach; defaults to a ``StreamHandler`` on stderr.

    Returns:
        The configured ``logging.Logger`` instance.

    Examples:
        >>> logger = setup_logger(__name__)
        >>> logger.info("Processing started")

        >>> logger = setup_logger("alignment", level=logging.DEBUG)
        >>> logger.debug("Debug information")
    """
    qualified = name if name.startswith("lattifai.") else f"lattifai.{name}"
    logger = logging.getLogger(qualified)

    # The level is applied even when the logger was configured earlier,
    # so repeat calls can still adjust verbosity.
    logger.setLevel(logging.INFO if level is None else level)

    # A logger that already carries handlers was configured before;
    # returning early avoids attaching duplicates.
    if logger.handlers:
        return logger

    if handler is None:
        handler = logging.StreamHandler(sys.stderr)
    if format_string is None:
        format_string = SIMPLE_FORMAT
    handler.setFormatter(logging.Formatter(format_string))
    logger.addHandler(handler)
    return logger
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def get_logger(name: str) -> logging.Logger:
    """
    Return the LattifAI logger for *name*, configuring it on first use.

    Args:
        name: Logger name; the ``lattifai.`` prefix is prepended when missing.

    Returns:
        Logger instance (already-configured loggers are returned untouched).
    """
    qualified = name if name.startswith("lattifai.") else f"lattifai.{name}"
    existing = logging.getLogger(qualified)

    # A handler-less logger has never been configured: give it defaults.
    if not existing.handlers:
        return setup_logger(name)
    return existing
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def set_log_level(level: int) -> None:
    """
    Apply *level* to the package root logger and its attached handlers.

    Args:
        level: Logging level (e.g., logging.DEBUG, logging.INFO)

    Examples:
        >>> from lattifai.logging import set_log_level
        >>> import logging
        >>> set_log_level(logging.DEBUG)
    """
    package_logger = logging.getLogger("lattifai")
    package_logger.setLevel(level)
    # Handlers filter independently of the logger, so update each of them too.
    for attached in package_logger.handlers:
        attached.setLevel(level)
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
# Public API of this module.
__all__ = [
    "setup_logger",
    "get_logger",
    "set_log_level",
    "DEFAULT_FORMAT",
    "SIMPLE_FORMAT",
]
|
lattifai/mixin.py
ADDED
|
@@ -0,0 +1,552 @@
|
|
|
1
|
+
"""Mixin class providing shared functionality for LattifAI clients."""
|
|
2
|
+
|
|
3
|
+
import tempfile
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import TYPE_CHECKING, Awaitable, Optional, Union
|
|
6
|
+
|
|
7
|
+
import colorful
|
|
8
|
+
from lhotse.utils import Pathlike
|
|
9
|
+
|
|
10
|
+
from lattifai.audio2 import AudioData
|
|
11
|
+
from lattifai.caption import Caption
|
|
12
|
+
from lattifai.errors import CaptionProcessingError
|
|
13
|
+
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
from .config import AlignmentConfig, CaptionConfig, ClientConfig, DiarizationConfig, TranscriptionConfig
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class LattifAIClientMixin:
    """
    Mixin class providing shared functionality for LattifAI clients.

    This mixin contains common logic for transcription and downloading that is
    used by both synchronous and asynchronous client implementations.
    """

    # Shared docstring templates for class, __init__, alignment, and youtube methods
    # NOTE(review): the {placeholders} suggest these are filled via str.format by
    # the sync/async client classes — the formatting call sites are outside this
    # file, so confirm against the client implementations.
    _CLASS_DOC = """
    {sync_or_async} LattifAI client for audio/video-caption alignment.

    This client provides {sync_or_async_lower} methods for aligning audio/video files with caption/transcript
    text using the Lattice-1 forced alignment model. It supports multiple caption formats
    (SRT, VTT, ASS, TXT) and provides word-level alignment with configurable sentence splitting.

    The client uses a config-driven architecture with four main configuration objects:
    - ClientConfig: API connection settings (API key, base URL, timeout, retries)
    - AlignmentConfig: Model and alignment behavior settings
    - CaptionConfig: Caption I/O format and processing settings
    - TranscriptionConfig: Transcription service settings (optional, for YouTube workflow)

    Example:
        >>> from lattifai import {client_class}, ClientConfig
        >>>
        >>> # Initialize with default settings
        >>> client = {client_class}()
        >>>
        >>> # Or with custom configuration
        >>> config = ClientConfig(api_key="your-api-key")
        >>> client = {client_class}(config=config)
        >>>
        >>> # Perform alignment
        >>> {await_keyword}alignments, output_path = {await_keyword}client.alignment(
        ...     input_media="audio.wav",
        ...     input_caption="caption.srt",
        ...     output_caption_path="aligned.srt"
        ... )

    Attributes:
        aligner: Lattice1Aligner instance for performing forced alignment{async_note}
        captioner: Captioner instance for reading/writing caption files
        transcriber: Optional transcriber instance for audio transcription{transcriber_note}
    """

    # Template for the clients' __init__ docstring.
    _INIT_DOC = """
    Initialize {client_class} {sync_or_async_lower} client.

    Args:
        client_config: Client configuration for API connection settings. If None, uses defaults
            (reads API key from LATTIFAI_API_KEY environment variable).
        alignment_config: Alignment {config_desc}
            If None, uses {default_desc}.
        caption_config: Caption I/O configuration for format handling and processing.
            If None, uses default settings{caption_note}.
        transcription_config: Transcription service configuration{transcription_note}.

    Raises:
        ConfigurationError: If API key is not provided {api_key_source}.
    """

    # Template for the clients' alignment() docstring.
    _ALIGNMENT_DOC = """
    Perform {async_prefix}forced alignment on audio and caption/text.

    This {async_word}method aligns caption text with audio by finding the precise timing of {timing_desc}
    and caption segment. {concurrency_note}

    The alignment process consists of five steps:
    1. Parse the input caption file into segments{async_suffix1}
    2. Generate a lattice graph from caption text{async_suffix2}
    3. Search the lattice using audio features{async_suffix3}
    4. Decode results to extract word-level timings{async_suffix4}
    5. Export aligned captions (if output path provided{async_suffix5})

    Args:
        input_media: Path to audio/video file (WAV, MP3, FLAC, MP4, etc.). Must be readable by ffmpeg.
        input_caption: Path to caption or plain text file to align with audio.
        input_caption_format: Input caption format ('srt', 'vtt', 'ass', 'txt'). If None, {format_default}
            from file extension or uses config default.
        split_sentence: Enable automatic sentence re-splitting for better alignment accuracy.
            If None, uses config default (typically False).
        output_caption_path: Optional path to write aligned caption file. If provided,
            exports results{export_note}.

    Returns:
        Tuple containing:
            - List of Supervision objects with aligned timing information{timing_note}
            - Output caption path (same as input parameter, or None if not provided)

    Raises:
        CaptionProcessingError: If caption file cannot be parsed or output cannot be written.
        LatticeEncodingError: If lattice graph generation fails (invalid text format).
        AlignmentError: If audio alignment fails (audio processing or model inference error).
        LatticeDecodingError: If lattice decoding fails (invalid results from model).

    Example:
        >>> {example_imports}
        >>> {example_code}
    """

    # Template for the clients' youtube() docstring.
    _YOUTUBE_METHOD_DOC = """
    Download and align YouTube video with captions or transcription.

    This end-to-end method handles the complete YouTube alignment workflow:
    1. Downloads media from YouTube in specified format
    2. Downloads captions OR transcribes audio (based on config)
    3. Performs forced alignment with Lattice-1 model
    4. Exports aligned captions

    Args:
        url: YouTube video URL (e.g., https://youtube.com/watch?v=VIDEO_ID)
        output_dir: Directory for downloaded files. If None, uses temporary directory.
        media_format: Media format to download (mp3, mp4, wav, etc.). If None, uses config default.
        source_lang: Specific caption language to download (e.g., 'en', 'zh'). If None, downloads all.
        force_overwrite: Skip confirmation prompts and overwrite existing files.
        output_caption_path: Path for aligned caption output. If None, auto-generates.
        **alignment_kwargs: Additional arguments passed to alignment() method.

    Returns:
        Tuple containing:
            - List of Supervision objects with aligned timing information
            - Output caption path

    Raises:
        ValueError: If transcription is requested but transcriber not configured.
        RuntimeError: If download or transcription fails.
        CaptionProcessingError: If caption processing fails.
        AlignmentError: If alignment fails.

    Example:
        >>> from lattifai import {client_class}
        >>> from lattifai.config import TranscriptionConfig
        >>>
        >>> # With YouTube captions
        >>> client = {client_class}()
        >>> {await_keyword}alignments, path = {await_keyword}client.youtube(
        ...     url="https://youtube.com/watch?v=VIDEO_ID",
        ...     output_dir="./downloads"
        ... )
        >>>
        >>> # With Gemini transcription
        >>> config = TranscriptionConfig(gemini_api_key="YOUR_KEY")
        >>> client = {client_class}(transcription_config=config)
        >>> {await_keyword}alignments, path = {await_keyword}client.youtube(
        ...     url="https://youtube.com/watch?v=VIDEO_ID",
        ...     use_transcription=True
        ... )
    """
|
|
166
|
+
|
|
167
|
+
def _init_configs(
|
|
168
|
+
self,
|
|
169
|
+
alignment_config: Optional["AlignmentConfig"],
|
|
170
|
+
transcription_config: Optional["TranscriptionConfig"],
|
|
171
|
+
diarization_config: Optional["DiarizationConfig"] = None,
|
|
172
|
+
) -> tuple:
|
|
173
|
+
"""Initialize all configs with defaults if not provided."""
|
|
174
|
+
from .config import AlignmentConfig, DiarizationConfig, TranscriptionConfig
|
|
175
|
+
|
|
176
|
+
if alignment_config is None:
|
|
177
|
+
alignment_config = AlignmentConfig()
|
|
178
|
+
if transcription_config is None:
|
|
179
|
+
transcription_config = TranscriptionConfig()
|
|
180
|
+
if diarization_config is None:
|
|
181
|
+
diarization_config = DiarizationConfig()
|
|
182
|
+
|
|
183
|
+
from lattifai.utils import _resolve_model_path
|
|
184
|
+
|
|
185
|
+
if transcription_config is not None:
|
|
186
|
+
transcription_config.lattice_model_path = _resolve_model_path(alignment_config.model_name)
|
|
187
|
+
|
|
188
|
+
# Set client_wrapper for all configs
|
|
189
|
+
alignment_config.client_wrapper = self
|
|
190
|
+
transcription_config.client_wrapper = self
|
|
191
|
+
diarization_config.client_wrapper = self
|
|
192
|
+
|
|
193
|
+
return alignment_config, transcription_config, diarization_config
|
|
194
|
+
|
|
195
|
+
def _init_shared_components(
|
|
196
|
+
self,
|
|
197
|
+
transcription_config: Optional["TranscriptionConfig"],
|
|
198
|
+
) -> None:
|
|
199
|
+
"""Initialize shared components (transcriber, downloader)."""
|
|
200
|
+
# transcriber (optional, lazy loaded when needed)
|
|
201
|
+
self.transcription_config = transcription_config
|
|
202
|
+
self._transcriber = None
|
|
203
|
+
|
|
204
|
+
# downloader (lazy loaded when needed)
|
|
205
|
+
self._downloader = None
|
|
206
|
+
|
|
207
|
+
@property
|
|
208
|
+
def transcriber(self):
|
|
209
|
+
"""Lazy load transcriber based on config."""
|
|
210
|
+
if self._transcriber is None and self.transcription_config:
|
|
211
|
+
from .transcription import create_transcriber
|
|
212
|
+
|
|
213
|
+
self._transcriber = create_transcriber(transcription_config=self.transcription_config)
|
|
214
|
+
return self._transcriber
|
|
215
|
+
|
|
216
|
+
@property
|
|
217
|
+
def downloader(self):
|
|
218
|
+
"""Lazy load YouTube downloader."""
|
|
219
|
+
if self._downloader is None:
|
|
220
|
+
from .workflow.youtube import YouTubeDownloader
|
|
221
|
+
|
|
222
|
+
self._downloader = YouTubeDownloader()
|
|
223
|
+
return self._downloader
|
|
224
|
+
|
|
225
|
+
def _prepare_youtube_output_dir(self, output_dir: Optional["Pathlike"]) -> Path:
|
|
226
|
+
"""Prepare and return output directory for YouTube downloads."""
|
|
227
|
+
if output_dir is None:
|
|
228
|
+
output_dir = Path(tempfile.gettempdir()) / "lattifai_youtube"
|
|
229
|
+
else:
|
|
230
|
+
output_dir = Path(output_dir).expanduser()
|
|
231
|
+
output_dir.mkdir(parents=True, exist_ok=True)
|
|
232
|
+
return output_dir
|
|
233
|
+
|
|
234
|
+
def _determine_media_format(self, media_format: Optional[str]) -> str:
|
|
235
|
+
"""Determine media format from parameter or config."""
|
|
236
|
+
return media_format or "mp3"
|
|
237
|
+
|
|
238
|
+
def _generate_output_caption_path(
|
|
239
|
+
self, output_caption_path: Optional["Pathlike"], media_file: str, output_dir: Path
|
|
240
|
+
) -> Path:
|
|
241
|
+
"""Generate output caption path if not provided."""
|
|
242
|
+
if not output_caption_path:
|
|
243
|
+
media_name = Path(media_file).stem
|
|
244
|
+
output_format = self.caption_config.output_format or "srt"
|
|
245
|
+
output_caption_path = output_dir / f"{media_name}_LattifAI.{output_format}"
|
|
246
|
+
return Path(output_caption_path)
|
|
247
|
+
|
|
248
|
+
def _validate_transcription_setup(self) -> None:
|
|
249
|
+
"""Validate that transcription is properly configured if requested."""
|
|
250
|
+
if not self.transcriber:
|
|
251
|
+
raise ValueError(
|
|
252
|
+
"Transcription requested but transcriber not configured. "
|
|
253
|
+
"Provide TranscriptionConfig with valid API key."
|
|
254
|
+
)
|
|
255
|
+
|
|
256
|
+
def _read_caption(
|
|
257
|
+
self,
|
|
258
|
+
input_caption: Union[Pathlike, Caption],
|
|
259
|
+
input_caption_format: Optional[str] = None,
|
|
260
|
+
normalize_text: Optional[bool] = None,
|
|
261
|
+
verbose: bool = True,
|
|
262
|
+
) -> Caption:
|
|
263
|
+
"""
|
|
264
|
+
Read caption file or return Caption object directly.
|
|
265
|
+
|
|
266
|
+
Args:
|
|
267
|
+
input_caption: Path to caption file or Caption object
|
|
268
|
+
input_caption_format: Optional format hint for parsing
|
|
269
|
+
|
|
270
|
+
Returns:
|
|
271
|
+
Caption object
|
|
272
|
+
|
|
273
|
+
Raises:
|
|
274
|
+
CaptionProcessingError: If caption cannot be read
|
|
275
|
+
"""
|
|
276
|
+
if isinstance(input_caption, Caption):
|
|
277
|
+
return input_caption
|
|
278
|
+
|
|
279
|
+
try:
|
|
280
|
+
if verbose:
|
|
281
|
+
print(colorful.cyan(f"📖 Step 1: Reading caption file from {input_caption}"))
|
|
282
|
+
caption = Caption.read(
|
|
283
|
+
input_caption,
|
|
284
|
+
format=input_caption_format,
|
|
285
|
+
normalize_text=normalize_text if normalize_text is not None else self.caption_config.normalize_text,
|
|
286
|
+
)
|
|
287
|
+
diarization_file = Path(str(input_caption)).with_suffix(".SpkDiar")
|
|
288
|
+
if diarization_file.exists():
|
|
289
|
+
if verbose:
|
|
290
|
+
print(colorful.cyan(f"📖 Step 1b: Reading speaker diarization from {diarization_file}"))
|
|
291
|
+
caption.read_speaker_diarization(diarization_file)
|
|
292
|
+
events_file = Path(str(input_caption)).with_suffix(".AED")
|
|
293
|
+
if events_file.exists():
|
|
294
|
+
if verbose:
|
|
295
|
+
print(colorful.cyan(f"📖 Step 1c: Reading audio events from {events_file}"))
|
|
296
|
+
from tgt import read_textgrid
|
|
297
|
+
|
|
298
|
+
caption.audio_events = read_textgrid(events_file)
|
|
299
|
+
|
|
300
|
+
if verbose:
|
|
301
|
+
print(colorful.green(f" ✓ Parsed {len(caption)} caption segments"))
|
|
302
|
+
return caption
|
|
303
|
+
except Exception as e:
|
|
304
|
+
raise CaptionProcessingError(
|
|
305
|
+
f"Failed to parse caption file: {input_caption}",
|
|
306
|
+
caption_path=str(input_caption),
|
|
307
|
+
context={"original_error": str(e)},
|
|
308
|
+
)
|
|
309
|
+
|
|
310
|
+
def _write_caption(
|
|
311
|
+
self,
|
|
312
|
+
caption: Caption,
|
|
313
|
+
output_caption_path: Pathlike,
|
|
314
|
+
) -> Pathlike:
|
|
315
|
+
"""
|
|
316
|
+
Write caption to file.
|
|
317
|
+
|
|
318
|
+
Args:
|
|
319
|
+
caption: Caption object to write
|
|
320
|
+
output_caption_path: Output file path
|
|
321
|
+
|
|
322
|
+
Returns:
|
|
323
|
+
Path to written file
|
|
324
|
+
|
|
325
|
+
Raises:
|
|
326
|
+
CaptionProcessingError: If caption cannot be written
|
|
327
|
+
"""
|
|
328
|
+
try:
|
|
329
|
+
result = caption.write(
|
|
330
|
+
output_caption_path,
|
|
331
|
+
include_speaker_in_text=self.caption_config.include_speaker_in_text,
|
|
332
|
+
)
|
|
333
|
+
diarization_file = Path(str(output_caption_path)).with_suffix(".SpkDiar")
|
|
334
|
+
if not diarization_file.exists() and caption.speaker_diarization:
|
|
335
|
+
print(colorful.green(f" Writing speaker diarization to: {diarization_file}"))
|
|
336
|
+
caption.write_speaker_diarization(diarization_file)
|
|
337
|
+
|
|
338
|
+
print(colorful.green(f"🎉🎉🎉🎉🎉 Caption file written to: {output_caption_path}"))
|
|
339
|
+
return result
|
|
340
|
+
except Exception as e:
|
|
341
|
+
raise CaptionProcessingError(
|
|
342
|
+
f"Failed to write output file: {output_caption_path}",
|
|
343
|
+
caption_path=str(output_caption_path),
|
|
344
|
+
context={"original_error": str(e)},
|
|
345
|
+
)
|
|
346
|
+
|
|
347
|
+
async def _download_media(
|
|
348
|
+
self,
|
|
349
|
+
url: str,
|
|
350
|
+
output_dir: Path,
|
|
351
|
+
media_format: str,
|
|
352
|
+
force_overwrite: bool,
|
|
353
|
+
) -> str:
|
|
354
|
+
"""Download media from YouTube (async implementation)."""
|
|
355
|
+
print(colorful.cyan("📥 Downloading media from YouTube..."))
|
|
356
|
+
media_file = await self.downloader.download_media(
|
|
357
|
+
url=url,
|
|
358
|
+
output_dir=str(output_dir),
|
|
359
|
+
media_format=media_format,
|
|
360
|
+
force_overwrite=force_overwrite,
|
|
361
|
+
)
|
|
362
|
+
print(colorful.green(f" ✓ Media downloaded: {media_file}"))
|
|
363
|
+
return media_file
|
|
364
|
+
|
|
365
|
+
def _download_media_sync(
|
|
366
|
+
self,
|
|
367
|
+
url: str,
|
|
368
|
+
output_dir: Path,
|
|
369
|
+
media_format: str,
|
|
370
|
+
force_overwrite: bool,
|
|
371
|
+
) -> str:
|
|
372
|
+
"""Download media from YouTube (sync wrapper)."""
|
|
373
|
+
import asyncio
|
|
374
|
+
|
|
375
|
+
return asyncio.run(self._download_media(url, output_dir, media_format, force_overwrite))
|
|
376
|
+
|
|
377
|
+
    def _transcribe(
        self,
        media_file: Union[str, Path, AudioData],
        source_lang: Optional[str],
        is_async: bool = False,
    ) -> Caption:
        """
        Transcribe a media file with the configured transcriber.

        Args:
            media_file: Media file path or in-memory audio to transcribe.
            source_lang: Language hint passed to the transcriber (None = auto).
            is_async: If True, returns a coroutine; if False, runs synchronously.

        Returns:
            Caption (or a coroutine resolving to one when ``is_async`` is True).
            NOTE(review): the ``-> Caption`` annotation does not reflect the
            coroutine case — confirm callers' expectations.

        Raises:
            ValueError: If no transcriber is configured.
        """
        import asyncio

        async def _async_impl():
            # Transcription mode: use Transcriber to transcribe
            self._validate_transcription_setup()

            print(colorful.cyan(f"🎤 Transcribing({self.transcriber.name}) media: {str(media_file)} ..."))
            transcription = await self.transcriber.transcribe_file(media_file, language=source_lang)
            print(colorful.green(" ✓ Transcription completed."))

            if "Gemini" in self.transcriber.name:
                # write to temp file and use Caption read
                # Gemini output is round-tripped through the "gemini" caption
                # reader to get a normalized Caption object.
                with tempfile.NamedTemporaryFile(suffix=self.transcriber.file_suffix, delete=True) as tmp_file:
                    tmp_path = Path(tmp_file.name)
                    # write() is blocking; run it off the event loop.
                    await asyncio.to_thread(
                        self.transcriber.write,
                        transcription,
                        tmp_path,
                        encoding="utf-8",
                    )
                    transcription = self._read_caption(
                        tmp_path, input_caption_format="gemini", normalize_text=False, verbose=False
                    )

            return transcription

        if is_async:
            return _async_impl()
        else:
            return asyncio.run(_async_impl())
|
|
427
|
+
|
|
428
|
+
    def _download_or_transcribe_caption(
        self,
        url: str,
        output_dir: Path,
        media_file: Union[str, Path, AudioData],
        force_overwrite: bool,
        source_lang: Optional[str],
        is_async: bool = False,
        use_transcription: bool = False,
    ) -> Union[Union[str, Caption], Awaitable[Union[str, Caption]]]:
        """
        Get captions by downloading or transcribing.

        Resolution order: an explicitly configured caption file wins; otherwise
        either the transcriber is run (``use_transcription=True``) or YouTube
        captions are downloaded, with a recursive fallback to transcription if
        the download step signals ``TRANSCRIBE_CHOICE``.

        Args:
            url: YouTube video URL
            output_dir: Output directory for caption file
            media_file: Media file path (used to generate caption filename)
            force_overwrite: Force overwrite existing files
            source_lang: Caption language to download
            is_async: If True, returns coroutine; if False, runs synchronously
            use_transcription: If True, transcribe instead of downloading captions.

        Returns:
            Caption file path (str), Caption object, or coroutine thereof.

        Raises:
            FileNotFoundError: If a configured caption path does not exist.
            RuntimeError: If the user cancels or declines transcription.
        """
        import asyncio

        from lattifai.workflow.youtube import TRANSCRIBE_CHOICE

        # NOTE(review): accessed before _validate_transcription_setup() runs;
        # assumes self.transcriber is never None here — confirm for clients
        # constructed without a transcription config.
        transcriber_name = self.transcriber.name

        async def _async_impl():
            # First check if caption input_path is already provided
            if self.caption_config.input_path:
                caption_path = Path(self.caption_config.input_path)
                if caption_path.exists():
                    print(colorful.green(f"📄 Using provided caption file: {caption_path}"))
                    return str(caption_path)
                else:
                    raise FileNotFoundError(f"Provided caption path does not exist: {caption_path}")

            # Generate transcript file path
            transcript_file = output_dir / f"{Path(str(media_file)).stem}_{self.transcriber.file_name}"

            if use_transcription:
                # Transcription mode: use Transcriber to transcribe
                self._validate_transcription_setup()

                # Check if transcript file already exists
                if transcript_file.exists() and not force_overwrite:
                    from .workflow.file_manager import FileExistenceManager

                    # Interactive prompt is blocking; run it off the event loop.
                    choice = await asyncio.to_thread(
                        FileExistenceManager.prompt_file_selection,
                        file_type=f"{transcriber_name} transcript",
                        files=[str(transcript_file)],
                        operation="transcribe",
                    )

                    if choice == "cancel":
                        raise RuntimeError("Transcription cancelled by user")
                    elif choice == "use" or choice == str(transcript_file):
                        # User chose to use existing file (handles both "use" and file path)
                        if "gemini" in transcriber_name.lower():
                            return str(transcript_file)

                        caption = self._read_caption(transcript_file, normalize_text=False)
                        caption.transcription = caption.supervisions
                        caption.supervisions = None
                        return caption

                    # elif choice == "overwrite": continue to transcribe below

                print(colorful.cyan(f"🎤 Transcribing media with {transcriber_name}..."))
                if self.transcriber.supports_url:
                    transcription = await self.transcriber.transcribe(url, language=source_lang)
                else:
                    transcription = await self.transcriber.transcribe_file(media_file, language=source_lang)

                await asyncio.to_thread(self.transcriber.write, transcription, transcript_file, encoding="utf-8")

                if isinstance(transcription, Caption):
                    caption_file = transcription
                else:
                    caption_file = str(transcript_file)
                print(colorful.green(f" ✓ Transcription completed: {caption_file}"))
            else:
                # Download YouTube captions
                caption_file = await self.downloader.download_captions(
                    url=url,
                    output_dir=str(output_dir),
                    force_overwrite=force_overwrite,
                    source_lang=source_lang,
                    transcriber_name=transcriber_name,
                )

            if str(caption_file) == str(transcript_file):
                # Transcription was used
                caption = self._read_caption(transcript_file, normalize_text=False)
                if transcriber_name and "gemini" not in transcriber_name.lower():
                    caption.transcription = caption.supervisions  # alignment will trust transcription's timestamps
                    caption.supervisions = None
                else:
                    # Gemini transcription's timestamps are not accurate
                    pass

                return caption

            if caption_file == TRANSCRIBE_CHOICE:
                # Downloader asked for transcription instead: recurse once with
                # use_transcription forced on.
                return await self._download_or_transcribe_caption(
                    url=url,
                    output_dir=output_dir,
                    media_file=media_file,
                    force_overwrite=force_overwrite,
                    source_lang=source_lang,
                    is_async=True,
                    use_transcription=True,
                )
            elif not caption_file:
                raise RuntimeError("No caption file available and transcription was declined by user.")

            return caption_file

        if is_async:
            return _async_impl()
        else:
            return asyncio.run(_async_impl())
|