lattifai 0.4.5__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/__init__.py +61 -47
- lattifai/alignment/__init__.py +6 -0
- lattifai/alignment/lattice1_aligner.py +119 -0
- lattifai/alignment/lattice1_worker.py +185 -0
- lattifai/{tokenizer → alignment}/phonemizer.py +4 -4
- lattifai/alignment/segmenter.py +166 -0
- lattifai/{tokenizer → alignment}/tokenizer.py +244 -169
- lattifai/audio2.py +211 -0
- lattifai/caption/__init__.py +20 -0
- lattifai/caption/caption.py +1275 -0
- lattifai/{io → caption}/gemini_reader.py +30 -30
- lattifai/{io → caption}/gemini_writer.py +17 -17
- lattifai/{io → caption}/supervision.py +4 -3
- lattifai/caption/text_parser.py +145 -0
- lattifai/cli/__init__.py +17 -0
- lattifai/cli/alignment.py +153 -0
- lattifai/cli/caption.py +204 -0
- lattifai/cli/server.py +19 -0
- lattifai/cli/transcribe.py +197 -0
- lattifai/cli/youtube.py +128 -0
- lattifai/client.py +460 -251
- lattifai/config/__init__.py +20 -0
- lattifai/config/alignment.py +73 -0
- lattifai/config/caption.py +178 -0
- lattifai/config/client.py +46 -0
- lattifai/config/diarization.py +67 -0
- lattifai/config/media.py +335 -0
- lattifai/config/transcription.py +84 -0
- lattifai/diarization/__init__.py +5 -0
- lattifai/diarization/lattifai.py +89 -0
- lattifai/errors.py +98 -91
- lattifai/logging.py +116 -0
- lattifai/mixin.py +552 -0
- lattifai/server/app.py +420 -0
- lattifai/transcription/__init__.py +76 -0
- lattifai/transcription/base.py +108 -0
- lattifai/transcription/gemini.py +219 -0
- lattifai/transcription/lattifai.py +103 -0
- lattifai/{workflows → transcription}/prompts/__init__.py +4 -4
- lattifai/types.py +30 -0
- lattifai/utils.py +16 -44
- lattifai/workflow/__init__.py +22 -0
- lattifai/workflow/agents.py +6 -0
- lattifai/{workflows → workflow}/base.py +22 -22
- lattifai/{workflows → workflow}/file_manager.py +239 -215
- lattifai/workflow/youtube.py +564 -0
- lattifai-1.0.0.dist-info/METADATA +736 -0
- lattifai-1.0.0.dist-info/RECORD +52 -0
- {lattifai-0.4.5.dist-info → lattifai-1.0.0.dist-info}/WHEEL +1 -1
- lattifai-1.0.0.dist-info/entry_points.txt +13 -0
- {lattifai-0.4.5.dist-info → lattifai-1.0.0.dist-info}/licenses/LICENSE +1 -1
- lattifai/base_client.py +0 -126
- lattifai/bin/__init__.py +0 -3
- lattifai/bin/agent.py +0 -325
- lattifai/bin/align.py +0 -296
- lattifai/bin/cli_base.py +0 -25
- lattifai/bin/subtitle.py +0 -210
- lattifai/io/__init__.py +0 -42
- lattifai/io/reader.py +0 -85
- lattifai/io/text_parser.py +0 -75
- lattifai/io/utils.py +0 -15
- lattifai/io/writer.py +0 -90
- lattifai/tokenizer/__init__.py +0 -3
- lattifai/workers/__init__.py +0 -3
- lattifai/workers/lattice1_alpha.py +0 -284
- lattifai/workflows/__init__.py +0 -34
- lattifai/workflows/agents.py +0 -10
- lattifai/workflows/gemini.py +0 -167
- lattifai/workflows/prompts/README.md +0 -22
- lattifai/workflows/prompts/gemini/README.md +0 -24
- lattifai/workflows/prompts/gemini/transcription_gem.txt +0 -81
- lattifai/workflows/youtube.py +0 -931
- lattifai-0.4.5.dist-info/METADATA +0 -808
- lattifai-0.4.5.dist-info/RECORD +0 -39
- lattifai-0.4.5.dist-info/entry_points.txt +0 -3
- {lattifai-0.4.5.dist-info → lattifai-1.0.0.dist-info}/top_level.txt +0 -0
lattifai/io/text_parser.py
DELETED
@@ -1,75 +0,0 @@
import logging
import re
from typing import Optional, Tuple

# Speaker label formats commonly found in subtitles
SPEAKER_PATTERN = re.compile(r'((?:>>|＞＞|>|＞).*?[:：])\s*(.*)')

# Transcriber Output Example:
# 26:19.919 --> 26:34.921
# [SPEAKER_01]: 越来越多的科技巨头入...
SPEAKER_LATTIFAI = re.compile(r'(^\[SPEAKER_.*?\][:：])\s*(.*)')

# NISHTHA BHATIA: Hey, everyone.
# DIETER: Oh, hey, Nishtha.
# GEMINI: That might
SPEAKER_PATTERN2 = re.compile(r'^([A-Z]{1,15}(?:\s+[A-Z]{1,15})?[:：])\s*(.*)$')


def parse_speaker_text(line) -> Tuple[Optional[str], str]:
    line = line.replace('\\N', ' ')

    if ':' not in line and '：' not in line:
        return None, line

    # Match lines starting with >> and strip the leading name and colon
    match = SPEAKER_PATTERN.match(line)
    if match:
        return match.group(1).strip(), match.group(2).strip()

    match = SPEAKER_LATTIFAI.match(line)
    if match:
        assert len(match.groups()) == 2, match.groups()
        if not match.group(1):
            logging.error(f'ParseSub LINE [{line}]')
        else:
            return match.group(1).strip(), match.group(2).strip()

    match = SPEAKER_PATTERN2.match(line)
    if match:
        assert len(match.groups()) == 2, match.groups()
        return match.group(1).strip(), match.group(2).strip()

    return None, line


if __name__ == '__main__':
    pattern = re.compile(r'>>\s*(.*?)\s*[:：]\s*(.*)')
    pattern = re.compile(r'(>>.*?[:：])\s*(.*)')

    test_strings = [
        '>>Key: Value',
        '>> Key with space : Value with space ',
        '>> 全角键 ： 全角值',
        '>>Key:Value xxx. >>Key:Value',
    ]

    for text in test_strings:
        match = pattern.match(text)
        if match:
            print(f"Input: '{text}'")
            print(f" Key: '{match.group(1)}'")
            print(f" Value: '{match.group(2)}'")
            print('-------------')

    # pattern2
    test_strings2 = ['NISHTHA BHATIA: Hey, everyone.', 'DIETER: Oh, hey, Nishtha.', 'GEMINI: That might']
    for text in test_strings2:
        match = SPEAKER_PATTERN2.match(text)
        if match:
            print(f" Input: '{text}'")
            print(f"Speaker: '{match.group(1)}'")
            print(f"Content: '{match.group(2)}'")
            print('-------------')
        else:
            raise ValueError(f"No match for: '{text}'")
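For orientation, a minimal usage sketch of the removed parse_speaker_text helper. It assumes the 0.4.5 wheel is installed (in 1.0.0 the equivalent parsing moved to lattifai/caption/text_parser.py), and the caption lines are invented examples matching the three patterns above.

# Hypothetical usage of the removed parser (lattifai 0.4.5 only).
from lattifai.io.text_parser import parse_speaker_text

lines = [
    '>> ALICE: welcome back to the show',  # ">>"-style caption label
    '[SPEAKER_01]: thanks for having me',  # diarized transcriber label
    'DIETER: Oh, hey, Nishtha.',           # ALL-CAPS name label
    'no speaker label on this line',       # falls through unchanged
]
for line in lines:
    speaker, text = parse_speaker_text(line)
    print(repr(speaker), '->', repr(text))  # speaker is None when no label matched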
lattifai/io/utils.py
DELETED
@@ -1,15 +0,0 @@
"""
Utility constants and helper functions for subtitle I/O operations
"""

# Supported subtitle formats for reading/writing
SUBTITLE_FORMATS = ['srt', 'vtt', 'ass', 'ssa', 'sub', 'sbv', 'txt', 'md']

# Input subtitle formats (includes special formats like 'auto' and 'gemini')
INPUT_SUBTITLE_FORMATS = ['srt', 'vtt', 'ass', 'ssa', 'sub', 'sbv', 'txt', 'auto', 'gemini']

# Output subtitle formats (includes special formats like 'TextGrid' and 'json')
OUTPUT_SUBTITLE_FORMATS = ['srt', 'vtt', 'ass', 'ssa', 'sub', 'sbv', 'txt', 'TextGrid', 'json']

# All subtitle formats combined (for file detection)
ALL_SUBTITLE_FORMATS = list(set(SUBTITLE_FORMATS + ['TextGrid', 'json', 'gemini']))
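These constants are plain allow-lists. As a hedged illustration of the kind of extension check they backed, a short sketch assuming the 0.4.5 wheel; the check_output_format helper is invented for this example and is not part of the package.

# Illustrative only: gate an output path against the removed constant (lattifai 0.4.5).
from pathlib import Path

from lattifai.io import OUTPUT_SUBTITLE_FORMATS

def check_output_format(path: str) -> str:
    suffix = Path(path).suffix.lstrip('.')
    if suffix not in OUTPUT_SUBTITLE_FORMATS:
        raise ValueError(f'Unsupported output format: {suffix!r}')
    return suffix

print(check_output_format('episode_01.vtt'))  # vtt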
lattifai/io/writer.py
DELETED
@@ -1,90 +0,0 @@
import json
from abc import ABCMeta
from typing import Any, List, Optional

import pysubs2
from lhotse.supervision import AlignmentItem
from lhotse.utils import Pathlike

from .reader import Supervision


class SubtitleWriter(ABCMeta):
    """Class for writing subtitle files with optional word-level alignment."""

    @classmethod
    def write(cls, alignments: List[Supervision], output_path: Pathlike) -> Pathlike:
        if str(output_path)[-4:].lower() == '.txt':
            with open(output_path, 'w', encoding='utf-8') as f:
                for sup in alignments:
                    word_items = parse_alignment_from_supervision(sup)
                    if word_items:
                        for item in word_items:
                            f.write(f'[{item.start:.2f}-{item.end:.2f}] {item.symbol}\n')
                    else:
                        text = f'{sup.speaker} {sup.text}' if sup.speaker is not None else sup.text
                        f.write(f'[{sup.start:.2f}-{sup.end:.2f}] {text}\n')

        elif str(output_path)[-5:].lower() == '.json':
            with open(output_path, 'w', encoding='utf-8') as f:
                # Enhanced JSON export with word-level alignment
                json_data = []
                for sup in alignments:
                    sup_dict = sup.to_dict()
                    json_data.append(sup_dict)
                json.dump(json_data, f, ensure_ascii=False, indent=4)
        elif str(output_path).endswith('.TextGrid') or str(output_path).endswith('.textgrid'):
            from tgt import Interval, IntervalTier, TextGrid, write_to_file

            tg = TextGrid()
            supervisions, words = [], []
            for supervision in sorted(alignments, key=lambda x: x.start):
                text = (
                    f'{supervision.speaker} {supervision.text}' if supervision.speaker is not None else supervision.text
                )
                supervisions.append(Interval(supervision.start, supervision.end, text or ''))
                # Extract word-level alignment using helper function
                word_items = parse_alignment_from_supervision(supervision)
                if word_items:
                    for item in word_items:
                        words.append(Interval(item.start, item.end, item.symbol))

            tg.add_tier(IntervalTier(name='utterances', objects=supervisions))
            if words:
                tg.add_tier(IntervalTier(name='words', objects=words))
            write_to_file(tg, output_path, format='long')
        else:
            subs = pysubs2.SSAFile()
            for sup in alignments:
                # Add word-level timing as metadata in the subtitle text
                word_items = parse_alignment_from_supervision(sup)
                if word_items:
                    for word in word_items:
                        subs.append(
                            pysubs2.SSAEvent(start=int(word.start * 1000), end=int(word.end * 1000), text=word.symbol)
                        )
                else:
                    text = f'{sup.speaker} {sup.text}' if sup.speaker is not None else sup.text
                    subs.append(pysubs2.SSAEvent(start=int(sup.start * 1000), end=int(sup.end * 1000), text=text or ''))
            subs.save(output_path)

        return output_path


def parse_alignment_from_supervision(supervision: Any) -> Optional[List[AlignmentItem]]:
    """
    Extract word-level alignment items from Supervision object.

    Args:
        supervision: Supervision object with potential alignment data

    Returns:
        List of AlignmentItem objects, or None if no alignment data present
    """
    if not hasattr(supervision, 'alignment') or not supervision.alignment:
        return None

    if 'word' not in supervision.alignment:
        return None

    return supervision.alignment['word']
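A rough call sketch for the removed writer, assuming the 0.4.5 wheel. The Supervision keyword arguments are guessed from the attributes the writer reads (start, end, text, speaker) and the segment values are made up; in 1.0.0 this responsibility moves into lattifai/caption/caption.py.

# Hypothetical usage of the removed SubtitleWriter (lattifai 0.4.5 only).
from lattifai.io.reader import Supervision
from lattifai.io.writer import SubtitleWriter

segments = [
    Supervision(start=0.0, end=2.5, text='Hello and welcome.', speaker='[SPEAKER_00]:'),
    Supervision(start=2.5, end=4.0, text='Glad to be here.', speaker='[SPEAKER_01]:'),
]
# The output format is picked from the file extension: .txt, .json, .TextGrid, or any pysubs2 format.
SubtitleWriter.write(segments, 'episode_01.srt')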
lattifai/tokenizer/__init__.py
DELETED
lattifai/workers/__init__.py
DELETED
lattifai/workers/lattice1_alpha.py
DELETED
@@ -1,284 +0,0 @@
import json
import time
from collections import defaultdict
from typing import Any, BinaryIO, Dict, Iterable, Optional, Tuple, Union

import numpy as np
import onnxruntime as ort
import soundfile as sf
import torch
from lhotse import FbankConfig
from lhotse.augmentation import get_or_create_resampler
from lhotse.features.kaldi.layers import Wav2LogFilterBank
from lhotse.utils import Pathlike

from lattifai.errors import AlignmentError, AudioFormatError, AudioLoadError, DependencyError, ModelLoadError

ChannelSelectorType = Union[int, Iterable[int], str]


def resample_audio(
    audio_sr: Tuple[torch.Tensor, int],
    sampling_rate: int,
    device: Optional[str],
    channel_selector: Optional[ChannelSelectorType] = 'average',
) -> torch.Tensor:
    """
    return:
        (1, T)
    """
    audio, sr = audio_sr

    if channel_selector is None:
        # keep the original multi-channel signal
        tensor = audio
    elif isinstance(channel_selector, int):
        assert audio.shape[0] >= channel_selector, f'Invalid channel: {channel_selector}'
        tensor = audio[channel_selector : channel_selector + 1].clone()
        del audio
    elif isinstance(channel_selector, str):
        assert channel_selector == 'average'
        tensor = torch.mean(audio.to(device), dim=0, keepdim=True)
        del audio
    else:
        assert isinstance(channel_selector, Iterable)
        num_channels = audio.shape[0]
        print(f'Selecting channels {channel_selector} from the signal with {num_channels} channels.')
        assert isinstance(channel_selector, Iterable)
        if max(channel_selector) >= num_channels:
            raise ValueError(
                f'Cannot select channel subset {channel_selector} from a signal with {num_channels} channels.'
            )
        tensor = audio[channel_selector]

    tensor = tensor.to(device)
    if sr != sampling_rate:
        resampler = get_or_create_resampler(sr, sampling_rate).to(device=device)
        length = tensor.size(-1)
        chunk_size = sampling_rate * 3600
        if length > chunk_size:
            resampled_chunks = []
            for i in range(0, length, chunk_size):
                resampled_chunks.append(resampler(tensor[..., i : i + chunk_size]))
            tensor = torch.cat(resampled_chunks, dim=-1)
        else:
            tensor = resampler(tensor)

    return tensor


class Lattice1AlphaWorker:
    """Worker for processing audio with LatticeGraph."""

    def __init__(self, model_path: Pathlike, device: str = 'cpu', num_threads: int = 8) -> None:
        try:
            self.config = json.load(open(f'{model_path}/config.json'))
        except Exception as e:
            raise ModelLoadError(f'config from {model_path}', original_error=e)

        # SessionOptions
        sess_options = ort.SessionOptions()
        # sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
        sess_options.intra_op_num_threads = num_threads  # CPU cores
        sess_options.execution_mode = ort.ExecutionMode.ORT_PARALLEL
        sess_options.add_session_config_entry('session.intra_op.allow_spinning', '0')

        providers = []
        if device.startswith('cuda') and ort.get_all_providers().count('CUDAExecutionProvider') > 0:
            providers.append('CUDAExecutionProvider')
        elif device.startswith('mps') and ort.get_all_providers().count('MPSExecutionProvider') > 0:
            providers.append('MPSExecutionProvider')

        try:
            self.acoustic_ort = ort.InferenceSession(
                f'{model_path}/acoustic_opt.onnx',
                sess_options,
                providers=providers + ['CPUExecutionProvider', 'CoreMLExecutionProvider'],
            )
        except Exception as e:
            raise ModelLoadError(f'acoustic model from {model_path}', original_error=e)

        try:
            config = FbankConfig(num_mel_bins=80, device=device, snip_edges=False)
            config_dict = config.to_dict()
            config_dict.pop('device')
            self.extractor = Wav2LogFilterBank(**config_dict).to(device).eval()
        except Exception as e:
            raise ModelLoadError(f'feature extractor for device {device}', original_error=e)

        self.device = torch.device(device)
        self.timings = defaultdict(lambda: 0.0)

    @torch.inference_mode()
    def emission(self, audio: torch.Tensor) -> torch.Tensor:
        _start = time.time()
        # audio -> features -> emission
        features = self.extractor(audio)  # (1, T, D)
        if features.shape[1] > 6000:
            features_list = torch.split(features, 6000, dim=1)
            emissions = []
            for features in features_list:
                ort_inputs = {
                    'features': features.cpu().numpy(),
                    'feature_lengths': np.array([features.size(1)], dtype=np.int64),
                }
                emission = self.acoustic_ort.run(None, ort_inputs)[0]  # (1, T, vocab_size) numpy
                emissions.append(emission)
            emission = torch.cat(
                [torch.from_numpy(emission).to(self.device) for emission in emissions], dim=1
            )  # (1, T, vocab_size)
        else:
            ort_inputs = {
                'features': features.cpu().numpy(),
                'feature_lengths': np.array([features.size(1)], dtype=np.int64),
            }
            emission = self.acoustic_ort.run(None, ort_inputs)[0]  # (1, T, vocab_size) numpy
            emission = torch.from_numpy(emission).to(self.device)

        self.timings['emission'] += time.time() - _start
        return emission  # (1, T, vocab_size) torch

    def load_audio(
        self, audio: Union[Pathlike, BinaryIO], channel_selector: Optional[ChannelSelectorType] = 'average'
    ) -> Tuple[torch.Tensor, int]:
        # load audio
        try:
            waveform, sample_rate = sf.read(audio, always_2d=True, dtype='float32')  # numpy array
            waveform = waveform.T  # (channels, samples)
        except Exception as primary_error:
            # Fallback to PyAV for formats not supported by soundfile
            try:
                import av
            except ImportError:
                raise DependencyError(
                    'av (PyAV)', install_command='pip install av', context={'primary_error': str(primary_error)}
                )

            try:
                container = av.open(audio)
                audio_stream = next((s for s in container.streams if s.type == 'audio'), None)

                if audio_stream is None:
                    raise AudioFormatError(str(audio), 'No audio stream found in file')

                # Resample to target sample rate during decoding
                audio_stream.codec_context.format = av.AudioFormat('flt')  # 32-bit float

                frames = []
                for frame in container.decode(audio_stream):
                    # Convert frame to numpy array
                    array = frame.to_ndarray()
                    # Ensure shape is (channels, samples)
                    if array.ndim == 1:
                        array = array.reshape(1, -1)
                    elif array.ndim == 2 and array.shape[0] > array.shape[1]:
                        array = array.T
                    frames.append(array)

                container.close()

                if not frames:
                    raise AudioFormatError(str(audio), 'No audio data found in file')

                # Concatenate all frames
                waveform = np.concatenate(frames, axis=1)
                sample_rate = audio_stream.codec_context.sample_rate
            except Exception as e:
                raise AudioLoadError(str(audio), original_error=e)

        return resample_audio(
            (torch.from_numpy(waveform), sample_rate),
            self.config.get('sampling_rate', 16000),
            device=self.device.type,
            channel_selector=channel_selector,
        )

    def alignment(
        self, audio: Union[Union[Pathlike, BinaryIO], torch.tensor], lattice_graph: Tuple[str, int, float]
    ) -> Dict[str, Any]:
        """Process audio with LatticeGraph.

        Args:
            audio: Audio file path or binary data
            lattice_graph: LatticeGraph data

        Returns:
            Processed LatticeGraph

        Raises:
            AudioLoadError: If audio cannot be loaded
            DependencyError: If required dependencies are missing
            AlignmentError: If alignment process fails
        """
        # load audio
        if isinstance(audio, torch.Tensor):
            waveform = audio
        else:
            waveform = self.load_audio(audio)  # (1, L)

        _start = time.time()
        try:
            emission = self.emission(waveform.to(self.device))  # (1, T, vocab_size)
        except Exception as e:
            raise AlignmentError(
                'Failed to compute acoustic features from audio',
                audio_path=str(audio) if not isinstance(audio, torch.Tensor) else 'tensor',
                context={'original_error': str(e)},
            )
        self.timings['emission'] += time.time() - _start

        try:
            import k2
        except ImportError:
            raise DependencyError('k2', install_command='pip install install-k2 && python -m install_k2')

        try:
            from lattifai_core.lattice.decode import align_segments
        except ImportError:
            raise DependencyError('lattifai_core', install_command='Contact support for lattifai_core installation')

        lattice_graph_str, final_state, acoustic_scale = lattice_graph

        _start = time.time()
        try:
            # graph
            decoding_graph = k2.Fsa.from_str(lattice_graph_str, acceptor=False)
            decoding_graph.requires_grad_(False)
            decoding_graph = k2.arc_sort(decoding_graph)
            decoding_graph.skip_id = int(final_state)
            decoding_graph.return_id = int(final_state + 1)
        except Exception as e:
            raise AlignmentError(
                'Failed to create decoding graph from lattice',
                context={'original_error': str(e), 'lattice_graph_length': len(lattice_graph_str)},
            )
        self.timings['decoding_graph'] += time.time() - _start

        _start = time.time()
        if self.device.type == 'mps':
            device = 'cpu'  # k2 does not support mps yet
        else:
            device = self.device

        try:
            results, labels = align_segments(
                emission.to(device) * acoustic_scale,
                decoding_graph.to(device),
                torch.tensor([emission.shape[1]], dtype=torch.int32),
                search_beam=200,
                output_beam=80,
                min_active_states=400,
                max_active_states=10000,
                subsampling_factor=1,
                reject_low_confidence=False,
            )
        except Exception as e:
            raise AlignmentError(
                'Failed to perform forced alignment',
                audio_path=str(audio) if not isinstance(audio, torch.Tensor) else 'tensor',
                context={'original_error': str(e), 'emission_shape': list(emission.shape), 'device': str(device)},
            )
        self.timings['align_segments'] += time.time() - _start

        channel = 0
        return emission, results, labels, 0.02, 0.0, channel  # frame_shift=20ms, offset=0.0s
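For context, a rough driver sketch for the removed worker, assuming the 0.4.5 wheel plus a local model directory containing config.json and acoustic_opt.onnx and a local audio file; both paths below are placeholders. Full alignment additionally needs k2, lattifai_core, and a lattice graph tuple from the tokenizer, so only feature extraction is exercised here.

# Hypothetical driver for the removed Lattice1AlphaWorker (lattifai 0.4.5 only).
from lattifai.workers.lattice1_alpha import Lattice1AlphaWorker

worker = Lattice1AlphaWorker('models/lattice1-alpha', device='cpu', num_threads=4)  # placeholder model dir

waveform = worker.load_audio('episode_01.wav')  # (1, T) tensor at the model's sampling rate
emission = worker.emission(waveform)            # (1, frames, vocab_size) acoustic posteriors
print(emission.shape, dict(worker.timings))

# With k2 and lattifai_core installed, forced alignment additionally takes a
# (graph_str, final_state, acoustic_scale) tuple built upstream:
# emission, results, labels, frame_shift, offset, channel = worker.alignment(waveform, lattice_graph)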
lattifai/workflows/__init__.py
DELETED
@@ -1,34 +0,0 @@
"""
LattifAI Agentic Workflows

This module provides agentic workflow capabilities for automated processing
of multimedia content through intelligent agent-based pipelines.
"""

# Import transcript processing functionality
from lattifai.io import (
    ALL_SUBTITLE_FORMATS,
    INPUT_SUBTITLE_FORMATS,
    OUTPUT_SUBTITLE_FORMATS,
    SUBTITLE_FORMATS,
    GeminiReader,
    GeminiWriter,
)

from .agents import YouTubeSubtitleAgent
from .base import WorkflowAgent, WorkflowResult, WorkflowStep
from .file_manager import FileExistenceManager

__all__ = [
    'WorkflowAgent',
    'WorkflowStep',
    'WorkflowResult',
    'YouTubeSubtitleAgent',
    'FileExistenceManager',
    'GeminiReader',
    'GeminiWriter',
    'SUBTITLE_FORMATS',
    'INPUT_SUBTITLE_FORMATS',
    'OUTPUT_SUBTITLE_FORMATS',
    'ALL_SUBTITLE_FORMATS',
]
lattifai/workflows/agents.py
DELETED
lattifai/workflows/gemini.py
DELETED
@@ -1,167 +0,0 @@
"""
Gemini 2.5 Pro transcription module
"""

import asyncio
from typing import Optional

# Import Google GenAI SDK
from google import genai
from google.genai.types import GenerateContentConfig, Part, ThinkingConfig

from .base import setup_workflow_logger
from .prompts import get_prompt_loader


class GeminiTranscriber:
    """Gemini 2.5 Pro audio transcription using the specified Gem

    Configuration (in __init__):
    - api_key: Gemini API key (required)

    Runtime parameters (in __call__):
    - youtube_url: YouTube URL to transcribe
    """

    # The specific Gem URL provided by the user
    GEM_URL = 'https://gemini.google.com/gem/1870ly7xvW2hU_umtv-LedGsjywT0sQiN'

    def __init__(self, api_key: Optional[str] = None):
        self.api_key = api_key
        self.logger = setup_workflow_logger('gemini')
        self.prompt_loader = get_prompt_loader()

        if not self.api_key:
            self.logger.warning(
                '⚠️ Gemini API key not provided. API key will be required when calling transcription methods.'
            )

    async def __call__(self, youtube_url: str) -> str:
        """Main entry point for transcription"""
        return await self.transcribe_url(youtube_url)

    async def transcribe_url(self, youtube_url: str) -> str:
        """
        Transcribe audio from YouTube URL using Gemini 2.5 Pro Gem

        Args:
            youtube_url: YouTube URL to transcribe

        Returns:
            Transcribed text
        """
        if not self.api_key:
            raise ValueError('Gemini API key is required for transcription')

        self.logger.info(f'🎤 Starting Gemini transcription for: {youtube_url}')

        try:
            # Initialize client
            client = genai.Client(api_key=self.api_key)

            # Load prompt from Gem configuration
            system_prompt = self.prompt_loader.get_gemini_transcription_prompt()

            # Generate transcription with extended thinking
            self.logger.info('🔄 Sending request to Gemini 2.5 Pro...')
            config = GenerateContentConfig(
                system_instruction=system_prompt,
                # Enable thinking by including it in response modalities
                response_modalities=['TEXT'],
                thinking_config=ThinkingConfig(
                    include_thoughts=False,
                    thinking_budget=-1,
                ),
            )
            response = await asyncio.get_event_loop().run_in_executor(
                None,
                lambda: client.models.generate_content(
                    model='gemini-2.5-pro',
                    contents=Part.from_uri(file_uri=youtube_url, mime_type='video/*'),
                    config=config,
                ),
            )

            if not response.text:
                raise RuntimeError('Empty response from Gemini API')

            transcript = response.text.strip()

            self.logger.info(f'✅ Transcription completed: {len(transcript)} characters')
            return transcript

        except ImportError:
            raise RuntimeError('Google GenAI SDK not installed. Please install with: pip install google-genai')
        except Exception as e:
            self.logger.error(f'Gemini transcription failed: {str(e)}')
            raise RuntimeError(f'Gemini transcription failed: {str(e)}')

    async def transcribe_file(self, media_file_path: str) -> str:
        """
        Transcribe audio/video from local file using Gemini 2.5 Pro

        Args:
            media_file_path: Path to local audio file

        Returns:
            Transcribed text
        """
        if not self.api_key:
            raise ValueError('Gemini API key is required for transcription')

        self.logger.info(f'🎤 Starting Gemini transcription for file: {media_file_path}')

        try:
            # Initialize client
            client = genai.Client(api_key=self.api_key)

            # Load prompt from Gem configuration
            system_prompt = self.prompt_loader.get_gemini_transcription_prompt()

            # Upload audio file
            self.logger.info('📤 Uploading audio file to Gemini...')
            media_file = client.files.upload(path=media_file_path)

            # Generate transcription with extended thinking
            # Note: For thinking mode, you may want to use 'gemini-2.0-flash-thinking-exp' or similar models
            self.logger.info('🔄 Sending transcription request...')
            config = GenerateContentConfig(
                system_instruction=system_prompt,
                # Enable thinking by including it in response modalities
                response_modalities=['TEXT'],
                thinking_config=ThinkingConfig(
                    include_thoughts=False,
                    thinking_budget=-1,
                ),
            )
            response = await asyncio.get_event_loop().run_in_executor(
                None,
                lambda: client.models.generate_content(
                    model='gemini-2.5-pro',
                    contents=Part.from_uri(file_uri=media_file.uri, mime_type=media_file.mime_type),
                    config=config,
                ),
            )

            if not response.text:
                raise RuntimeError('Empty response from Gemini API')

            transcript = response.text.strip()

            self.logger.info(f'✅ Transcription completed: {len(transcript)} characters')
            return transcript

        except ImportError:
            raise RuntimeError('Google GenAI SDK not installed. Please install with: pip install google-genai')
        except Exception as e:
            self.logger.error(f'Gemini transcription failed: {str(e)}')
            raise RuntimeError(f'Gemini transcription failed: {str(e)}')

    def get_gem_info(self) -> dict:
        """Get information about the Gem being used"""
        return {
            'gem_name': 'Audio Transcription Gem',
            'gem_url': self.GEM_URL,
            'model': 'Gemini 2.5 Pro',
            'description': 'Specialized Gem for media content transcribe',
        }
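Finally, a hedged sketch of how the removed transcriber was invoked before lattifai/transcription/gemini.py replaced it; it assumes the 0.4.5 wheel plus the google-genai SDK, and the environment variable, URL, and file name below are placeholders.

# Hypothetical usage of the removed GeminiTranscriber (lattifai 0.4.5 only).
import asyncio
import os

from lattifai.workflows.gemini import GeminiTranscriber

async def main() -> None:
    transcriber = GeminiTranscriber(api_key=os.environ['GEMINI_API_KEY'])
    # Either pass a YouTube URL directly ...
    transcript = await transcriber('https://www.youtube.com/watch?v=VIDEO_ID')
    # ... or upload a local media file instead:
    # transcript = await transcriber.transcribe_file('episode_01.mp3')
    print(transcript[:500])

asyncio.run(main())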