lattifai 0.4.5__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. lattifai/__init__.py +61 -47
  2. lattifai/alignment/__init__.py +6 -0
  3. lattifai/alignment/lattice1_aligner.py +119 -0
  4. lattifai/alignment/lattice1_worker.py +185 -0
  5. lattifai/{tokenizer → alignment}/phonemizer.py +4 -4
  6. lattifai/alignment/segmenter.py +166 -0
  7. lattifai/{tokenizer → alignment}/tokenizer.py +244 -169
  8. lattifai/audio2.py +211 -0
  9. lattifai/caption/__init__.py +20 -0
  10. lattifai/caption/caption.py +1275 -0
  11. lattifai/{io → caption}/gemini_reader.py +30 -30
  12. lattifai/{io → caption}/gemini_writer.py +17 -17
  13. lattifai/{io → caption}/supervision.py +4 -3
  14. lattifai/caption/text_parser.py +145 -0
  15. lattifai/cli/__init__.py +17 -0
  16. lattifai/cli/alignment.py +153 -0
  17. lattifai/cli/caption.py +204 -0
  18. lattifai/cli/server.py +19 -0
  19. lattifai/cli/transcribe.py +197 -0
  20. lattifai/cli/youtube.py +128 -0
  21. lattifai/client.py +460 -251
  22. lattifai/config/__init__.py +20 -0
  23. lattifai/config/alignment.py +73 -0
  24. lattifai/config/caption.py +178 -0
  25. lattifai/config/client.py +46 -0
  26. lattifai/config/diarization.py +67 -0
  27. lattifai/config/media.py +335 -0
  28. lattifai/config/transcription.py +84 -0
  29. lattifai/diarization/__init__.py +5 -0
  30. lattifai/diarization/lattifai.py +89 -0
  31. lattifai/errors.py +98 -91
  32. lattifai/logging.py +116 -0
  33. lattifai/mixin.py +552 -0
  34. lattifai/server/app.py +420 -0
  35. lattifai/transcription/__init__.py +76 -0
  36. lattifai/transcription/base.py +108 -0
  37. lattifai/transcription/gemini.py +219 -0
  38. lattifai/transcription/lattifai.py +103 -0
  39. lattifai/{workflows → transcription}/prompts/__init__.py +4 -4
  40. lattifai/types.py +30 -0
  41. lattifai/utils.py +16 -44
  42. lattifai/workflow/__init__.py +22 -0
  43. lattifai/workflow/agents.py +6 -0
  44. lattifai/{workflows → workflow}/base.py +22 -22
  45. lattifai/{workflows → workflow}/file_manager.py +239 -215
  46. lattifai/workflow/youtube.py +564 -0
  47. lattifai-1.0.0.dist-info/METADATA +736 -0
  48. lattifai-1.0.0.dist-info/RECORD +52 -0
  49. {lattifai-0.4.5.dist-info → lattifai-1.0.0.dist-info}/WHEEL +1 -1
  50. lattifai-1.0.0.dist-info/entry_points.txt +13 -0
  51. {lattifai-0.4.5.dist-info → lattifai-1.0.0.dist-info}/licenses/LICENSE +1 -1
  52. lattifai/base_client.py +0 -126
  53. lattifai/bin/__init__.py +0 -3
  54. lattifai/bin/agent.py +0 -325
  55. lattifai/bin/align.py +0 -296
  56. lattifai/bin/cli_base.py +0 -25
  57. lattifai/bin/subtitle.py +0 -210
  58. lattifai/io/__init__.py +0 -42
  59. lattifai/io/reader.py +0 -85
  60. lattifai/io/text_parser.py +0 -75
  61. lattifai/io/utils.py +0 -15
  62. lattifai/io/writer.py +0 -90
  63. lattifai/tokenizer/__init__.py +0 -3
  64. lattifai/workers/__init__.py +0 -3
  65. lattifai/workers/lattice1_alpha.py +0 -284
  66. lattifai/workflows/__init__.py +0 -34
  67. lattifai/workflows/agents.py +0 -10
  68. lattifai/workflows/gemini.py +0 -167
  69. lattifai/workflows/prompts/README.md +0 -22
  70. lattifai/workflows/prompts/gemini/README.md +0 -24
  71. lattifai/workflows/prompts/gemini/transcription_gem.txt +0 -81
  72. lattifai/workflows/youtube.py +0 -931
  73. lattifai-0.4.5.dist-info/METADATA +0 -808
  74. lattifai-0.4.5.dist-info/RECORD +0 -39
  75. lattifai-0.4.5.dist-info/entry_points.txt +0 -3
  76. {lattifai-0.4.5.dist-info → lattifai-1.0.0.dist-info}/top_level.txt +0 -0
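For orientation only: the renames above (io → caption, tokenizer → alignment, workflows → workflow) imply roughly the following import migration. The 0.4.5 imports are taken from the deleted sources shown below; the 1.0.0 paths are assumptions inferred from the moved files and are not verified against the new package's exports.

    # 0.4.5
    from lattifai.io import GeminiReader, GeminiWriter
    from lattifai.tokenizer import LatticeTokenizer
    from lattifai.workflows import YouTubeSubtitleAgent

    # 1.0.0 (assumed from the file moves; check the 1.0.0 __init__ modules)
    # from lattifai.caption import GeminiReader, GeminiWriter
    # from lattifai.alignment import ...   # tokenizer.py now lives here
    # from lattifai.workflow import ...    # workflows/ renamed to workflow/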
lattifai/io/text_parser.py DELETED
@@ -1,75 +0,0 @@
- import logging
- import re
- from typing import Optional, Tuple
-
- # Speaker marker formats commonly found in subtitles
- SPEAKER_PATTERN = re.compile(r'((?:>>|>>|>|>).*?[:：])\s*(.*)')
-
- # Transcriber Output Example:
- # 26:19.919 --> 26:34.921
- # [SPEAKER_01]: More and more tech giants are entering...
- SPEAKER_LATTIFAI = re.compile(r'(^\[SPEAKER_.*?\][:：])\s*(.*)')
-
- # NISHTHA BHATIA: Hey, everyone.
- # DIETER: Oh, hey, Nishtha.
- # GEMINI: That might
- SPEAKER_PATTERN2 = re.compile(r'^([A-Z]{1,15}(?:\s+[A-Z]{1,15})?[:：])\s*(.*)$')
-
-
- def parse_speaker_text(line) -> Tuple[Optional[str], str]:
-     line = line.replace('\\N', ' ')
-
-     if ':' not in line and '：' not in line:
-         return None, line
-
-     # Match lines starting with >> and strip the leading speaker name and colon
-     match = SPEAKER_PATTERN.match(line)
-     if match:
-         return match.group(1).strip(), match.group(2).strip()
-
-     match = SPEAKER_LATTIFAI.match(line)
-     if match:
-         assert len(match.groups()) == 2, match.groups()
-         if not match.group(1):
-             logging.error(f'ParseSub LINE [{line}]')
-         else:
-             return match.group(1).strip(), match.group(2).strip()
-
-     match = SPEAKER_PATTERN2.match(line)
-     if match:
-         assert len(match.groups()) == 2, match.groups()
-         return match.group(1).strip(), match.group(2).strip()
-
-     return None, line
-
-
- if __name__ == '__main__':
-     pattern = re.compile(r'>>\s*(.*?)\s*[:：]\s*(.*)')
-     pattern = re.compile(r'(>>.*?[:：])\s*(.*)')
-
-     test_strings = [
-         '>>Key: Value',
-         '>> Key with space : Value with space ',
-         '>> 全角键 : 全角值',
-         '>>Key:Value xxx. >>Key:Value',
-     ]
-
-     for text in test_strings:
-         match = pattern.match(text)
-         if match:
-             print(f"Input: '{text}'")
-             print(f" Key: '{match.group(1)}'")
-             print(f" Value: '{match.group(2)}'")
-             print('-------------')
-
-     # pattern2
-     test_strings2 = ['NISHTHA BHATIA: Hey, everyone.', 'DIETER: Oh, hey, Nishtha.', 'GEMINI: That might']
-     for text in test_strings2:
-         match = SPEAKER_PATTERN2.match(text)
-         if match:
-             print(f" Input: '{text}'")
-             print(f"Speaker: '{match.group(1)}'")
-             print(f"Content: '{match.group(2)}'")
-             print('-------------')
-         else:
-             raise ValueError(f"No match for: '{text}'")
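(Illustrative only: a minimal sketch of calling the removed parse_speaker_text helper via its 0.4.5 module path, based on the source above.)

    from lattifai.io.text_parser import parse_speaker_text

    speaker, text = parse_speaker_text('[SPEAKER_01]: Hello there.')
    # speaker == '[SPEAKER_01]:', text == 'Hello there.'

    speaker, text = parse_speaker_text('no speaker marker here')
    # no colon present, so speaker is None and the line is returned unchanged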
lattifai/io/utils.py DELETED
@@ -1,15 +0,0 @@
- """
- Utility constants and helper functions for subtitle I/O operations
- """
-
- # Supported subtitle formats for reading/writing
- SUBTITLE_FORMATS = ['srt', 'vtt', 'ass', 'ssa', 'sub', 'sbv', 'txt', 'md']
-
- # Input subtitle formats (includes special formats like 'auto' and 'gemini')
- INPUT_SUBTITLE_FORMATS = ['srt', 'vtt', 'ass', 'ssa', 'sub', 'sbv', 'txt', 'auto', 'gemini']
-
- # Output subtitle formats (includes special formats like 'TextGrid' and 'json')
- OUTPUT_SUBTITLE_FORMATS = ['srt', 'vtt', 'ass', 'ssa', 'sub', 'sbv', 'txt', 'TextGrid', 'json']
-
- # All subtitle formats combined (for file detection)
- ALL_SUBTITLE_FORMATS = list(set(SUBTITLE_FORMATS + ['TextGrid', 'json', 'gemini']))
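(Illustrative only: these constants supported extension-based format detection; the helper below is hypothetical, not part of the package.)

    from pathlib import Path
    from lattifai.io.utils import ALL_SUBTITLE_FORMATS  # 0.4.5 path

    def looks_like_subtitle(path: str) -> bool:
        # Compare the extension (without the leading dot) against the known formats.
        return Path(path).suffix.lstrip('.') in ALL_SUBTITLE_FORMATS

    looks_like_subtitle('episode.vtt')  # True
    looks_like_subtitle('audio.wav')    # False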
lattifai/io/writer.py DELETED
@@ -1,90 +0,0 @@
- import json
- from abc import ABCMeta
- from typing import Any, List, Optional
-
- import pysubs2
- from lhotse.supervision import AlignmentItem
- from lhotse.utils import Pathlike
-
- from .reader import Supervision
-
-
- class SubtitleWriter(ABCMeta):
-     """Class for writing subtitle files with optional word-level alignment."""
-
-     @classmethod
-     def write(cls, alignments: List[Supervision], output_path: Pathlike) -> Pathlike:
-         if str(output_path)[-4:].lower() == '.txt':
-             with open(output_path, 'w', encoding='utf-8') as f:
-                 for sup in alignments:
-                     word_items = parse_alignment_from_supervision(sup)
-                     if word_items:
-                         for item in word_items:
-                             f.write(f'[{item.start:.2f}-{item.end:.2f}] {item.symbol}\n')
-                     else:
-                         text = f'{sup.speaker} {sup.text}' if sup.speaker is not None else sup.text
-                         f.write(f'[{sup.start:.2f}-{sup.end:.2f}] {text}\n')
-
-         elif str(output_path)[-5:].lower() == '.json':
-             with open(output_path, 'w', encoding='utf-8') as f:
-                 # Enhanced JSON export with word-level alignment
-                 json_data = []
-                 for sup in alignments:
-                     sup_dict = sup.to_dict()
-                     json_data.append(sup_dict)
-                 json.dump(json_data, f, ensure_ascii=False, indent=4)
-         elif str(output_path).endswith('.TextGrid') or str(output_path).endswith('.textgrid'):
-             from tgt import Interval, IntervalTier, TextGrid, write_to_file
-
-             tg = TextGrid()
-             supervisions, words = [], []
-             for supervision in sorted(alignments, key=lambda x: x.start):
-                 text = (
-                     f'{supervision.speaker} {supervision.text}' if supervision.speaker is not None else supervision.text
-                 )
-                 supervisions.append(Interval(supervision.start, supervision.end, text or ''))
-                 # Extract word-level alignment using helper function
-                 word_items = parse_alignment_from_supervision(supervision)
-                 if word_items:
-                     for item in word_items:
-                         words.append(Interval(item.start, item.end, item.symbol))
-
-             tg.add_tier(IntervalTier(name='utterances', objects=supervisions))
-             if words:
-                 tg.add_tier(IntervalTier(name='words', objects=words))
-             write_to_file(tg, output_path, format='long')
-         else:
-             subs = pysubs2.SSAFile()
-             for sup in alignments:
-                 # Add word-level timing as metadata in the subtitle text
-                 word_items = parse_alignment_from_supervision(sup)
-                 if word_items:
-                     for word in word_items:
-                         subs.append(
-                             pysubs2.SSAEvent(start=int(word.start * 1000), end=int(word.end * 1000), text=word.symbol)
-                         )
-                 else:
-                     text = f'{sup.speaker} {sup.text}' if sup.speaker is not None else sup.text
-                     subs.append(pysubs2.SSAEvent(start=int(sup.start * 1000), end=int(sup.end * 1000), text=text or ''))
-             subs.save(output_path)
-
-         return output_path
-
-
- def parse_alignment_from_supervision(supervision: Any) -> Optional[List[AlignmentItem]]:
-     """
-     Extract word-level alignment items from Supervision object.
-
-     Args:
-         supervision: Supervision object with potential alignment data
-
-     Returns:
-         List of AlignmentItem objects, or None if no alignment data present
-     """
-     if not hasattr(supervision, 'alignment') or not supervision.alignment:
-         return None
-
-     if 'word' not in supervision.alignment:
-         return None
-
-     return supervision.alignment['word']
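(Illustrative only: how the removed SubtitleWriter was typically driven, using its 0.4.5 module paths; the alignments list is produced elsewhere, e.g. by the alignment pipeline.)

    from lattifai.io.reader import Supervision
    from lattifai.io.writer import SubtitleWriter

    # Each Supervision carries start/end/text/speaker and optional word-level alignment.
    alignments: list[Supervision] = ...  # produced by the aligner, not constructed here

    # The writer dispatches on the output extension: .txt, .json, .TextGrid/.textgrid,
    # or any pysubs2-supported subtitle format (srt, vtt, ass, ...).
    SubtitleWriter.write(alignments, 'aligned.srt')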
lattifai/tokenizer/__init__.py DELETED
@@ -1,3 +0,0 @@
- from .tokenizer import AsyncLatticeTokenizer, LatticeTokenizer
-
- __all__ = ['LatticeTokenizer', 'AsyncLatticeTokenizer']
lattifai/workers/__init__.py DELETED
@@ -1,3 +0,0 @@
- from .lattice1_alpha import Lattice1AlphaWorker
-
- __all__ = ['Lattice1AlphaWorker']
lattifai/workers/lattice1_alpha.py DELETED
@@ -1,284 +0,0 @@
- import json
- import time
- from collections import defaultdict
- from typing import Any, BinaryIO, Dict, Iterable, Optional, Tuple, Union
-
- import numpy as np
- import onnxruntime as ort
- import soundfile as sf
- import torch
- from lhotse import FbankConfig
- from lhotse.augmentation import get_or_create_resampler
- from lhotse.features.kaldi.layers import Wav2LogFilterBank
- from lhotse.utils import Pathlike
-
- from lattifai.errors import AlignmentError, AudioFormatError, AudioLoadError, DependencyError, ModelLoadError
-
- ChannelSelectorType = Union[int, Iterable[int], str]
-
-
- def resample_audio(
-     audio_sr: Tuple[torch.Tensor, int],
-     sampling_rate: int,
-     device: Optional[str],
-     channel_selector: Optional[ChannelSelectorType] = 'average',
- ) -> torch.Tensor:
-     """
-     return:
-         (1, T)
-     """
-     audio, sr = audio_sr
-
-     if channel_selector is None:
-         # keep the original multi-channel signal
-         tensor = audio
-     elif isinstance(channel_selector, int):
-         assert audio.shape[0] >= channel_selector, f'Invalid channel: {channel_selector}'
-         tensor = audio[channel_selector : channel_selector + 1].clone()
-         del audio
-     elif isinstance(channel_selector, str):
-         assert channel_selector == 'average'
-         tensor = torch.mean(audio.to(device), dim=0, keepdim=True)
-         del audio
-     else:
-         assert isinstance(channel_selector, Iterable)
-         num_channels = audio.shape[0]
-         print(f'Selecting channels {channel_selector} from the signal with {num_channels} channels.')
-         assert isinstance(channel_selector, Iterable)
-         if max(channel_selector) >= num_channels:
-             raise ValueError(
-                 f'Cannot select channel subset {channel_selector} from a signal with {num_channels} channels.'
-             )
-         tensor = audio[channel_selector]
-
-     tensor = tensor.to(device)
-     if sr != sampling_rate:
-         resampler = get_or_create_resampler(sr, sampling_rate).to(device=device)
-         length = tensor.size(-1)
-         chunk_size = sampling_rate * 3600
-         if length > chunk_size:
-             resampled_chunks = []
-             for i in range(0, length, chunk_size):
-                 resampled_chunks.append(resampler(tensor[..., i : i + chunk_size]))
-             tensor = torch.cat(resampled_chunks, dim=-1)
-         else:
-             tensor = resampler(tensor)
-
-     return tensor
-
-
- class Lattice1AlphaWorker:
-     """Worker for processing audio with LatticeGraph."""
-
-     def __init__(self, model_path: Pathlike, device: str = 'cpu', num_threads: int = 8) -> None:
-         try:
-             self.config = json.load(open(f'{model_path}/config.json'))
-         except Exception as e:
-             raise ModelLoadError(f'config from {model_path}', original_error=e)
-
-         # SessionOptions
-         sess_options = ort.SessionOptions()
-         # sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
-         sess_options.intra_op_num_threads = num_threads # CPU cores
-         sess_options.execution_mode = ort.ExecutionMode.ORT_PARALLEL
-         sess_options.add_session_config_entry('session.intra_op.allow_spinning', '0')
-
-         providers = []
-         if device.startswith('cuda') and ort.get_all_providers().count('CUDAExecutionProvider') > 0:
-             providers.append('CUDAExecutionProvider')
-         elif device.startswith('mps') and ort.get_all_providers().count('MPSExecutionProvider') > 0:
-             providers.append('MPSExecutionProvider')
-
-         try:
-             self.acoustic_ort = ort.InferenceSession(
-                 f'{model_path}/acoustic_opt.onnx',
-                 sess_options,
-                 providers=providers + ['CPUExecutionProvider', 'CoreMLExecutionProvider'],
-             )
-         except Exception as e:
-             raise ModelLoadError(f'acoustic model from {model_path}', original_error=e)
-
-         try:
-             config = FbankConfig(num_mel_bins=80, device=device, snip_edges=False)
-             config_dict = config.to_dict()
-             config_dict.pop('device')
-             self.extractor = Wav2LogFilterBank(**config_dict).to(device).eval()
-         except Exception as e:
-             raise ModelLoadError(f'feature extractor for device {device}', original_error=e)
-
-         self.device = torch.device(device)
-         self.timings = defaultdict(lambda: 0.0)
-
-     @torch.inference_mode()
-     def emission(self, audio: torch.Tensor) -> torch.Tensor:
-         _start = time.time()
-         # audio -> features -> emission
-         features = self.extractor(audio) # (1, T, D)
-         if features.shape[1] > 6000:
-             features_list = torch.split(features, 6000, dim=1)
-             emissions = []
-             for features in features_list:
-                 ort_inputs = {
-                     'features': features.cpu().numpy(),
-                     'feature_lengths': np.array([features.size(1)], dtype=np.int64),
-                 }
-                 emission = self.acoustic_ort.run(None, ort_inputs)[0] # (1, T, vocab_size) numpy
-                 emissions.append(emission)
-             emission = torch.cat(
-                 [torch.from_numpy(emission).to(self.device) for emission in emissions], dim=1
-             ) # (1, T, vocab_size)
-         else:
-             ort_inputs = {
-                 'features': features.cpu().numpy(),
-                 'feature_lengths': np.array([features.size(1)], dtype=np.int64),
-             }
-             emission = self.acoustic_ort.run(None, ort_inputs)[0] # (1, T, vocab_size) numpy
-             emission = torch.from_numpy(emission).to(self.device)
-
-         self.timings['emission'] += time.time() - _start
-         return emission # (1, T, vocab_size) torch
-
-     def load_audio(
-         self, audio: Union[Pathlike, BinaryIO], channel_selector: Optional[ChannelSelectorType] = 'average'
-     ) -> Tuple[torch.Tensor, int]:
-         # load audio
-         try:
-             waveform, sample_rate = sf.read(audio, always_2d=True, dtype='float32') # numpy array
-             waveform = waveform.T # (channels, samples)
-         except Exception as primary_error:
-             # Fallback to PyAV for formats not supported by soundfile
-             try:
-                 import av
-             except ImportError:
-                 raise DependencyError(
-                     'av (PyAV)', install_command='pip install av', context={'primary_error': str(primary_error)}
-                 )
-
-             try:
-                 container = av.open(audio)
-                 audio_stream = next((s for s in container.streams if s.type == 'audio'), None)
-
-                 if audio_stream is None:
-                     raise AudioFormatError(str(audio), 'No audio stream found in file')
-
-                 # Resample to target sample rate during decoding
-                 audio_stream.codec_context.format = av.AudioFormat('flt') # 32-bit float
-
-                 frames = []
-                 for frame in container.decode(audio_stream):
-                     # Convert frame to numpy array
-                     array = frame.to_ndarray()
-                     # Ensure shape is (channels, samples)
-                     if array.ndim == 1:
-                         array = array.reshape(1, -1)
-                     elif array.ndim == 2 and array.shape[0] > array.shape[1]:
-                         array = array.T
-                     frames.append(array)
-
-                 container.close()
-
-                 if not frames:
-                     raise AudioFormatError(str(audio), 'No audio data found in file')
-
-                 # Concatenate all frames
-                 waveform = np.concatenate(frames, axis=1)
-                 sample_rate = audio_stream.codec_context.sample_rate
-             except Exception as e:
-                 raise AudioLoadError(str(audio), original_error=e)
-
-         return resample_audio(
-             (torch.from_numpy(waveform), sample_rate),
-             self.config.get('sampling_rate', 16000),
-             device=self.device.type,
-             channel_selector=channel_selector,
-         )
-
-     def alignment(
-         self, audio: Union[Union[Pathlike, BinaryIO], torch.tensor], lattice_graph: Tuple[str, int, float]
-     ) -> Dict[str, Any]:
-         """Process audio with LatticeGraph.
-
-         Args:
-             audio: Audio file path or binary data
-             lattice_graph: LatticeGraph data
-
-         Returns:
-             Processed LatticeGraph
-
-         Raises:
-             AudioLoadError: If audio cannot be loaded
-             DependencyError: If required dependencies are missing
-             AlignmentError: If alignment process fails
-         """
-         # load audio
-         if isinstance(audio, torch.Tensor):
-             waveform = audio
-         else:
-             waveform = self.load_audio(audio) # (1, L)
-
-         _start = time.time()
-         try:
-             emission = self.emission(waveform.to(self.device)) # (1, T, vocab_size)
-         except Exception as e:
-             raise AlignmentError(
-                 'Failed to compute acoustic features from audio',
-                 audio_path=str(audio) if not isinstance(audio, torch.Tensor) else 'tensor',
-                 context={'original_error': str(e)},
-             )
-         self.timings['emission'] += time.time() - _start
-
-         try:
-             import k2
-         except ImportError:
-             raise DependencyError('k2', install_command='pip install install-k2 && python -m install_k2')
-
-         try:
-             from lattifai_core.lattice.decode import align_segments
-         except ImportError:
-             raise DependencyError('lattifai_core', install_command='Contact support for lattifai_core installation')
-
-         lattice_graph_str, final_state, acoustic_scale = lattice_graph
-
-         _start = time.time()
-         try:
-             # graph
-             decoding_graph = k2.Fsa.from_str(lattice_graph_str, acceptor=False)
-             decoding_graph.requires_grad_(False)
-             decoding_graph = k2.arc_sort(decoding_graph)
-             decoding_graph.skip_id = int(final_state)
-             decoding_graph.return_id = int(final_state + 1)
-         except Exception as e:
-             raise AlignmentError(
-                 'Failed to create decoding graph from lattice',
-                 context={'original_error': str(e), 'lattice_graph_length': len(lattice_graph_str)},
-             )
-         self.timings['decoding_graph'] += time.time() - _start
-
-         _start = time.time()
-         if self.device.type == 'mps':
-             device = 'cpu' # k2 does not support mps yet
-         else:
-             device = self.device
-
-         try:
-             results, labels = align_segments(
-                 emission.to(device) * acoustic_scale,
-                 decoding_graph.to(device),
-                 torch.tensor([emission.shape[1]], dtype=torch.int32),
-                 search_beam=200,
-                 output_beam=80,
-                 min_active_states=400,
-                 max_active_states=10000,
-                 subsampling_factor=1,
-                 reject_low_confidence=False,
-             )
-         except Exception as e:
-             raise AlignmentError(
-                 'Failed to perform forced alignment',
-                 audio_path=str(audio) if not isinstance(audio, torch.Tensor) else 'tensor',
-                 context={'original_error': str(e), 'emission_shape': list(emission.shape), 'device': str(device)},
-             )
-         self.timings['align_segments'] += time.time() - _start
-
-         channel = 0
-         return emission, results, labels, 0.02, 0.0, channel # frame_shift=20ms, offset=0.0s
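(Illustrative only: the removed worker's call pattern in 0.4.5. The lattice_graph tuple of (fsa_string, final_state, acoustic_scale) is produced upstream by the tokenizer/service and is shown here as a placeholder.)

    from lattifai.workers import Lattice1AlphaWorker

    worker = Lattice1AlphaWorker(model_path='path/to/lattice1-alpha', device='cpu')
    lattice_graph = ...  # (fsa_string, final_state, acoustic_scale) from the tokenizer

    emission, results, labels, frame_shift, offset, channel = worker.alignment(
        'speech.wav', lattice_graph
    )
    # frame_shift is 0.02 s (20 ms) and offset is 0.0 s, per the return statement above.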
lattifai/workflows/__init__.py DELETED
@@ -1,34 +0,0 @@
- """
- LattifAI Agentic Workflows
-
- This module provides agentic workflow capabilities for automated processing
- of multimedia content through intelligent agent-based pipelines.
- """
-
- # Import transcript processing functionality
- from lattifai.io import (
-     ALL_SUBTITLE_FORMATS,
-     INPUT_SUBTITLE_FORMATS,
-     OUTPUT_SUBTITLE_FORMATS,
-     SUBTITLE_FORMATS,
-     GeminiReader,
-     GeminiWriter,
- )
-
- from .agents import YouTubeSubtitleAgent
- from .base import WorkflowAgent, WorkflowResult, WorkflowStep
- from .file_manager import FileExistenceManager
-
- __all__ = [
-     'WorkflowAgent',
-     'WorkflowStep',
-     'WorkflowResult',
-     'YouTubeSubtitleAgent',
-     'FileExistenceManager',
-     'GeminiReader',
-     'GeminiWriter',
-     'SUBTITLE_FORMATS',
-     'INPUT_SUBTITLE_FORMATS',
-     'OUTPUT_SUBTITLE_FORMATS',
-     'ALL_SUBTITLE_FORMATS',
- ]
lattifai/workflows/agents.py DELETED
@@ -1,10 +0,0 @@
- """
- Subtitle Agents
-
- An agentic workflow for processing YouTube(or more) videos through:
- 1. URL processing and audio download
- 2. Gemini 2.5 Pro transcription
- 3. LattifAI alignment
- """
-
- from .youtube import YouTubeSubtitleAgent
lattifai/workflows/gemini.py DELETED
@@ -1,167 +0,0 @@
- """
- Gemini 2.5 Pro transcription module
- """
-
- import asyncio
- from typing import Optional
-
- # Import Google GenAI SDK
- from google import genai
- from google.genai.types import GenerateContentConfig, Part, ThinkingConfig
-
- from .base import setup_workflow_logger
- from .prompts import get_prompt_loader
-
-
- class GeminiTranscriber:
-     """Gemini 2.5 Pro audio transcription using the specified Gem
-
-     Configuration (in __init__):
-     - api_key: Gemini API key (required)
-
-     Runtime parameters (in __call__):
-     - youtube_url: YouTube URL to transcribe
-     """
-
-     # The specific Gem URL provided by the user
-     GEM_URL = 'https://gemini.google.com/gem/1870ly7xvW2hU_umtv-LedGsjywT0sQiN'
-
-     def __init__(self, api_key: Optional[str] = None):
-         self.api_key = api_key
-         self.logger = setup_workflow_logger('gemini')
-         self.prompt_loader = get_prompt_loader()
-
-         if not self.api_key:
-             self.logger.warning(
-                 '⚠️ Gemini API key not provided. API key will be required when calling transcription methods.'
-             )
-
-     async def __call__(self, youtube_url: str) -> str:
-         """Main entry point for transcription"""
-         return await self.transcribe_url(youtube_url)
-
-     async def transcribe_url(self, youtube_url: str) -> str:
-         """
-         Transcribe audio from YouTube URL using Gemini 2.5 Pro Gem
-
-         Args:
-             youtube_url: YouTube URL to transcribe
-
-         Returns:
-             Transcribed text
-         """
-         if not self.api_key:
-             raise ValueError('Gemini API key is required for transcription')
-
-         self.logger.info(f'🎤 Starting Gemini transcription for: {youtube_url}')
-
-         try:
-             # Initialize client
-             client = genai.Client(api_key=self.api_key)
-
-             # Load prompt from Gem configuration
-             system_prompt = self.prompt_loader.get_gemini_transcription_prompt()
-
-             # Generate transcription with extended thinking
-             self.logger.info('🔄 Sending request to Gemini 2.5 Pro...')
-             config = GenerateContentConfig(
-                 system_instruction=system_prompt,
-                 # Enable thinking by including it in response modalities
-                 response_modalities=['TEXT'],
-                 thinking_config=ThinkingConfig(
-                     include_thoughts=False,
-                     thinking_budget=-1,
-                 ),
-             )
-             response = await asyncio.get_event_loop().run_in_executor(
-                 None,
-                 lambda: client.models.generate_content(
-                     model='gemini-2.5-pro',
-                     contents=Part.from_uri(file_uri=youtube_url, mime_type='video/*'),
-                     config=config,
-                 ),
-             )
-
-             if not response.text:
-                 raise RuntimeError('Empty response from Gemini API')
-
-             transcript = response.text.strip()
-
-             self.logger.info(f'✅ Transcription completed: {len(transcript)} characters')
-             return transcript
-
-         except ImportError:
-             raise RuntimeError('Google GenAI SDK not installed. Please install with: pip install google-genai')
-         except Exception as e:
-             self.logger.error(f'Gemini transcription failed: {str(e)}')
-             raise RuntimeError(f'Gemini transcription failed: {str(e)}')
-
-     async def transcribe_file(self, media_file_path: str) -> str:
-         """
-         Transcribe audio/video from local file using Gemini 2.5 Pro
-
-         Args:
-             media_file_path: Path to local audio file
-
-         Returns:
-             Transcribed text
-         """
-         if not self.api_key:
-             raise ValueError('Gemini API key is required for transcription')
-
-         self.logger.info(f'🎤 Starting Gemini transcription for file: {media_file_path}')
-
-         try:
-             # Initialize client
-             client = genai.Client(api_key=self.api_key)
-
-             # Load prompt from Gem configuration
-             system_prompt = self.prompt_loader.get_gemini_transcription_prompt()
-
-             # Upload audio file
-             self.logger.info('📤 Uploading audio file to Gemini...')
-             media_file = client.files.upload(path=media_file_path)
-
-             # Generate transcription with extended thinking
-             # Note: For thinking mode, you may want to use 'gemini-2.0-flash-thinking-exp' or similar models
-             self.logger.info('🔄 Sending transcription request...')
-             config = GenerateContentConfig(
-                 system_instruction=system_prompt,
-                 # Enable thinking by including it in response modalities
-                 response_modalities=['TEXT'],
-                 thinking_config=ThinkingConfig(
-                     include_thoughts=False,
-                     thinking_budget=-1,
-                 ),
-             )
-             response = await asyncio.get_event_loop().run_in_executor(
-                 None,
-                 lambda: client.models.generate_content(
-                     model='gemini-2.5-pro',
-                     contents=Part.from_uri(file_uri=media_file.uri, mime_type=media_file.mime_type),
-                     config=config,
-                 ),
-             )
-
-             if not response.text:
-                 raise RuntimeError('Empty response from Gemini API')
-
-             transcript = response.text.strip()
-
-             self.logger.info(f'✅ Transcription completed: {len(transcript)} characters')
-             return transcript
-
-         except ImportError:
-             raise RuntimeError('Google GenAI SDK not installed. Please install with: pip install google-genai')
-         except Exception as e:
-             self.logger.error(f'Gemini transcription failed: {str(e)}')
-             raise RuntimeError(f'Gemini transcription failed: {str(e)}')
-
-     def get_gem_info(self) -> dict:
-         """Get information about the Gem being used"""
-         return {
-             'gem_name': 'Audio Transcription Gem',
-             'gem_url': self.GEM_URL,
-             'model': 'Gemini 2.5 Pro',
-             'description': 'Specialized Gem for media content transcribe',
-         }
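(Illustrative only: how the removed GeminiTranscriber was invoked, per the async entry points above; the URL and file name are placeholders.)

    import asyncio
    from lattifai.workflows.gemini import GeminiTranscriber

    async def main():
        transcriber = GeminiTranscriber(api_key='YOUR_GEMINI_API_KEY')
        # Transcribe a YouTube URL directly...
        transcript = await transcriber('https://www.youtube.com/watch?v=...')
        # ...or upload and transcribe a local media file.
        transcript = await transcriber.transcribe_file('podcast.mp3')
        print(transcript)

    asyncio.run(main())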