lattifai 0.4.3__py3-none-any.whl → 0.4.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lattifai/bin/agent.py CHANGED
@@ -302,6 +302,10 @@ if not check_dependencies():
  if __name__ == '__main__':
      import os

+     from dotenv import find_dotenv, load_dotenv
+
+     load_dotenv(find_dotenv(usecwd=True))
+
      asyncio.run(
          _run_youtube_workflow(
              # url='https://www.youtube.com/watch?v=7nv1snJRCEI',
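For context, a minimal sketch of what the added startup lines do: find_dotenv(usecwd=True) searches for a .env file starting from the current working directory, and load_dotenv() exports its entries into the process environment. 'SOME_API_KEY' below is a placeholder name, not a variable the package is known to use.

import os

from dotenv import find_dotenv, load_dotenv

load_dotenv(find_dotenv(usecwd=True))
print(os.environ.get('SOME_API_KEY'))  # placeholder; any variable defined in .env is now visible here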
lattifai/io/text_parser.py CHANGED
@@ -8,12 +8,12 @@ SPEAKER_PATTERN = re.compile(r'((?:>>|>>|>|>).*?[::])\s*(.*)')
  # Transcriber Output Example:
  # 26:19.919 --> 26:34.921
  # [SPEAKER_01]: 越来越多的科技巨头入...
- SPEAKER_LATTIFAI = re.compile(r'(^\[SPEAKER_.*?\]:)\s*(.*)')
+ SPEAKER_LATTIFAI = re.compile(r'(^\[SPEAKER_.*?\][::])\s*(.*)')

  # NISHTHA BHATIA: Hey, everyone.
  # DIETER: Oh, hey, Nishtha.
  # GEMINI: That might
- SPEAKER_PATTERN2 = re.compile(r'^([A-Z]{1,15}(?:\s+[A-Z]{1,15})?)[::]\s*(.*)$')
+ SPEAKER_PATTERN2 = re.compile(r'^([A-Z]{1,15}(?:\s+[A-Z]{1,15})?[::])\s*(.*)$')


  def parse_speaker_text(line) -> Tuple[Optional[str], str]:
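An illustrative check of the revised capture groups (assumes lattifai 0.4.5 is installed; expected output shown in comments): both patterns now include the trailing colon in group 1, matching the convention already used by SPEAKER_PATTERN.

from lattifai.io.text_parser import SPEAKER_LATTIFAI, SPEAKER_PATTERN2

m = SPEAKER_LATTIFAI.match('[SPEAKER_01]: Hello there.')
print(m.group(1), '|', m.group(2))    # '[SPEAKER_01]:' | 'Hello there.'

m2 = SPEAKER_PATTERN2.match('NISHTHA BHATIA: Hey, everyone.')
print(m2.group(1), '|', m2.group(2))  # 'NISHTHA BHATIA:' | 'Hey, everyone.'  (the colon was previously outside group 1)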
lattifai/tokenizer/tokenizer.py CHANGED
@@ -148,7 +148,7 @@ class LatticeTokenizer:
          oov_words = []
          for text in texts:
              words = text.lower().replace('-', ' ').replace('—', ' ').replace('–', ' ').split()
-             oovs = [w for w in words if w not in self.words]
+             oovs = [w.strip(PUNCTUATION) for w in words if w not in self.words]
              if oovs:
                  oov_words.extend([w for w in oovs if (w not in self.words and len(w) <= MAXIMUM_WORD_LENGTH)])

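A standalone sketch of what stripping punctuation before the second OOV check changes. PUNCTUATION below is a stand-in using string.punctuation; the module's actual constant may differ.

import string

PUNCTUATION = string.punctuation                                     # assumption, not the packaged value
vocab = {'hello', 'world'}
words = 'hello, world!'.lower().split()

old_oovs = [w for w in words if w not in vocab]                      # ['hello,', 'world!']
new_oovs = [w.strip(PUNCTUATION) for w in words if w not in vocab]   # ['hello', 'world']
# The stripped forms are re-checked against the vocabulary downstream, so trailing
# punctuation no longer causes known words to be reported as out-of-vocabulary.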
lattifai/workers/lattice1_alpha.py CHANGED
@@ -1,25 +1,70 @@
  import json
  import time
  from collections import defaultdict
- from typing import Any, BinaryIO, Dict, Tuple, Union
+ from typing import Any, BinaryIO, Dict, Iterable, Optional, Tuple, Union

  import numpy as np
  import onnxruntime as ort
- import resampy
  import soundfile as sf
  import torch
  from lhotse import FbankConfig
- from lhotse.audio import read_audio
+ from lhotse.augmentation import get_or_create_resampler
  from lhotse.features.kaldi.layers import Wav2LogFilterBank
  from lhotse.utils import Pathlike

- from lattifai.errors import (
-     AlignmentError,
-     AudioFormatError,
-     AudioLoadError,
-     DependencyError,
-     ModelLoadError,
- )
+ from lattifai.errors import AlignmentError, AudioFormatError, AudioLoadError, DependencyError, ModelLoadError
+
+ ChannelSelectorType = Union[int, Iterable[int], str]
+
+
+ def resample_audio(
+     audio_sr: Tuple[torch.Tensor, int],
+     sampling_rate: int,
+     device: Optional[str],
+     channel_selector: Optional[ChannelSelectorType] = 'average',
+ ) -> torch.Tensor:
+     """
+     return:
+         (1, T)
+     """
+     audio, sr = audio_sr
+
+     if channel_selector is None:
+         # keep the original multi-channel signal
+         tensor = audio
+     elif isinstance(channel_selector, int):
+         assert audio.shape[0] >= channel_selector, f'Invalid channel: {channel_selector}'
+         tensor = audio[channel_selector : channel_selector + 1].clone()
+         del audio
+     elif isinstance(channel_selector, str):
+         assert channel_selector == 'average'
+         tensor = torch.mean(audio.to(device), dim=0, keepdim=True)
+         del audio
+     else:
+         assert isinstance(channel_selector, Iterable)
+         num_channels = audio.shape[0]
+         print(f'Selecting channels {channel_selector} from the signal with {num_channels} channels.')
+         assert isinstance(channel_selector, Iterable)
+         if max(channel_selector) >= num_channels:
+             raise ValueError(
+                 f'Cannot select channel subset {channel_selector} from a signal with {num_channels} channels.'
+             )
+         tensor = audio[channel_selector]
+
+     tensor = tensor.to(device)
+     if sr != sampling_rate:
+         resampler = get_or_create_resampler(sr, sampling_rate).to(device=device)
+         length = tensor.size(-1)
+         chunk_size = sampling_rate * 3600
+         if length > chunk_size:
+             resampled_chunks = []
+             for i in range(0, length, chunk_size):
+                 resampled_chunks.append(resampler(tensor[..., i : i + chunk_size]))
+             tensor = torch.cat(resampled_chunks, dim=-1)
+         else:
+             tensor = resampler(tensor)
+
+     return tensor


  class Lattice1AlphaWorker:
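A self-contained usage sketch of the new resample_audio helper on a synthetic stereo signal (assumes lattifai 0.4.5, torch, and lhotse are installed; the values are illustrative only):

import torch

from lattifai.workers.lattice1_alpha import resample_audio

stereo = torch.randn(2, 44100)   # (channels, samples): one second of 44.1 kHz noise
mono_16k = resample_audio((stereo, 44100), 16000, device='cpu', channel_selector='average')
print(mono_16k.shape)            # roughly (1, 16000): averaged to mono and resampled to 16 kHz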
@@ -48,7 +93,7 @@ class Lattice1AlphaWorker:
              self.acoustic_ort = ort.InferenceSession(
                  f'{model_path}/acoustic_opt.onnx',
                  sess_options,
-                 providers=providers + ['CoreMLExecutionProvider', 'CPUExecutionProvider'],
+                 providers=providers + ['CPUExecutionProvider', 'CoreMLExecutionProvider'],
              )
          except Exception as e:
              raise ModelLoadError(f'acoustic model from {model_path}', original_error=e)
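In onnxruntime the providers list is a priority order, so the reordering above prefers the CPU provider over CoreML for this model. A quick way to inspect what a session actually selected ('acoustic_opt.onnx' is a placeholder path):

import onnxruntime as ort

preferred = ['CPUExecutionProvider', 'CoreMLExecutionProvider']
available = ort.get_available_providers()
session = ort.InferenceSession('acoustic_opt.onnx', providers=[p for p in preferred if p in available])
print(session.get_providers())   # providers in use, highest priority first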
@@ -93,17 +138,13 @@ class Lattice1AlphaWorker:
          self.timings['emission'] += time.time() - _start
          return emission  # (1, T, vocab_size) torch

-     def load_audio(self, audio: Union[Pathlike, BinaryIO]) -> Tuple[torch.Tensor, int]:
+     def load_audio(
+         self, audio: Union[Pathlike, BinaryIO], channel_selector: Optional[ChannelSelectorType] = 'average'
+     ) -> Tuple[torch.Tensor, int]:
          # load audio
          try:
-             waveform, sample_rate = read_audio(audio)  # numpy array
-             if len(waveform.shape) == 1:
-                 waveform = waveform.reshape([1, -1])  # (1, L)
-             else:  # make sure channel first
-                 if waveform.shape[0] > waveform.shape[1]:
-                     waveform = waveform.transpose(0, 1)
-                 # average multiple channels
-                 waveform = np.mean(waveform, axis=0, keepdims=True)  # (1, L)
+             waveform, sample_rate = sf.read(audio, always_2d=True, dtype='float32')  # numpy array
+             waveform = waveform.T  # (channels, samples)
          except Exception as primary_error:
              # Fallback to PyAV for formats not supported by soundfile
              try:
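The soundfile-based loading convention used above, shown standalone ('example.wav' is a placeholder file):

import soundfile as sf

waveform, sample_rate = sf.read('example.wav', always_2d=True, dtype='float32')
print(waveform.shape)   # (samples, channels): always 2-D, even for mono files
waveform = waveform.T   # (channels, samples): the channel-first layout resample_audio expects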
@@ -141,23 +182,16 @@ class Lattice1AlphaWorker:

                  # Concatenate all frames
                  waveform = np.concatenate(frames, axis=1)
-                 # Average multiple channels to mono
-                 if waveform.shape[0] > 1:
-                     waveform = np.mean(waveform, axis=0, keepdims=True)
-
                  sample_rate = audio_stream.codec_context.sample_rate
              except Exception as e:
                  raise AudioLoadError(str(audio), original_error=e)

-         try:
-             if sample_rate != self.config['sample_rate']:
-                 waveform = resampy.resample(waveform, sample_rate, self.config['sample_rate'], axis=1)
-         except Exception:
-             raise AudioFormatError(
-                 str(audio), f'Failed to resample from {sample_rate}Hz to {self.config["sample_rate"]}Hz'
-             )
-
-         return torch.from_numpy(waveform).to(self.device)  # (1, L)
+         return resample_audio(
+             (torch.from_numpy(waveform), sample_rate),
+             self.config.get('sampling_rate', 16000),
+             device=self.device.type,
+             channel_selector=channel_selector,
+         )

      def alignment(
          self, audio: Union[Union[Pathlike, BinaryIO], torch.tensor], lattice_graph: Tuple[str, int, float]
@@ -231,9 +265,9 @@ class Lattice1AlphaWorker:
              emission.to(device) * acoustic_scale,
              decoding_graph.to(device),
              torch.tensor([emission.shape[1]], dtype=torch.int32),
-             search_beam=100,
-             output_beam=40,
-             min_active_states=200,
+             search_beam=200,
+             output_beam=80,
+             min_active_states=400,
              max_active_states=10000,
              subsampling_factor=1,
              reject_low_confidence=False,
lattifai/workflows/youtube.py CHANGED
@@ -708,7 +708,7 @@ class YouTubeSubtitleAgent(WorkflowAgent):

          # If subtitle was already downloaded in step 1 and user selected it, use it directly
          if downloaded_subtitle_path and downloaded_subtitle_path != 'gemini':
-             self.logger.info(f'📥 Using subtitle downloaded in previous step: {downloaded_subtitle_path}')
+             self.logger.info(f'📥 Using subtitle: {downloaded_subtitle_path}')
              return {'subtitle_path': downloaded_subtitle_path}

          # Check for existing subtitles if subtitle was not downloaded yet
lattifai-0.4.3.dist-info/METADATA → lattifai-0.4.5.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: lattifai
- Version: 0.4.3
+ Version: 0.4.5
  Summary: Lattifai Python SDK: Seamless Integration with Lattifai's Speech and Video AI Services
  Author-email: Lattifai Technologies <tech@lattifai.com>
  Maintainer-email: Lattice <tech@lattifai.com>
@@ -57,7 +57,6 @@ Requires-Dist: pysubs2
  Requires-Dist: praatio
  Requires-Dist: tgt
  Requires-Dist: onnxruntime
- Requires-Dist: resampy
  Requires-Dist: g2p-phonemizer==0.1.1
  Requires-Dist: wtpsplit>=2.1.6
  Requires-Dist: av
lattifai-0.4.3.dist-info/RECORD → lattifai-0.4.5.dist-info/RECORD CHANGED
@@ -4,7 +4,7 @@ lattifai/client.py,sha256=YvK25fcXwKREYCkq_TUBdEZh7I9RNEwCbgW4qUha2ho,13236
  lattifai/errors.py,sha256=5i_acoly1g-TLAID8QnhzQshwOXfgLL55mHsdwzlNGA,10814
  lattifai/utils.py,sha256=CzVwNc08u8lm7XavCMJskXHfni0xsZ-EgpcMkRywVm8,4736
  lattifai/bin/__init__.py,sha256=QWbmVUbzqwPQNeOV_g-bOsz53w37v-tbZ3uFrSj-7Mg,90
- lattifai/bin/agent.py,sha256=jc7qSuVV2_EkToEu2qYodXgGoTup_na6IgP25kFmfwk,9734
+ lattifai/bin/agent.py,sha256=8nRh0GC1M4__-qKQtxZspcyNnUm8DOVCWQf3sRblEOI,9826
  lattifai/bin/align.py,sha256=JJHk5uzmYGYhpA3ricHdmBLzJWC2aRyF0k4ANnap50w,8151
  lattifai/bin/cli_base.py,sha256=gvPUi9Z0eGwBJ8Es5xq1z00YzFPlocYiR2zpL7ekyhw,626
  lattifai/bin/subtitle.py,sha256=UZMPh71O2X1UwbfZ9VWlhzxkz78viz8KWwoVsDpewK0,6577
@@ -13,27 +13,27 @@ lattifai/io/gemini_reader.py,sha256=WDZA93MSrUAsa5j-ZDXLdPXzEIoREymEy-rMAED_6f4,
  lattifai/io/gemini_writer.py,sha256=rlXO9zx6kQhqTi9K9izE69-8S-2GPOIiJHPwZyebpiM,6515
  lattifai/io/reader.py,sha256=h4T8dveLHXqSonma0J50iyjqkxH26tujeoPbnLx05nA,3333
  lattifai/io/supervision.py,sha256=iBDRiDJ0hddo__SoEZau2cdEIBFnXZNLgSWFjtJd-lM,871
- lattifai/io/text_parser.py,sha256=LQHgcEYXaSdhwUo9rP6P_31Z6RMv_BTP1YSKzXji4bk,2386
+ lattifai/io/text_parser.py,sha256=5rO2qgFLfWIcoU0K-FGBgs2qG7r6Uz2J2LklGTgSUyA,2391
  lattifai/io/utils.py,sha256=4drRwcM1n7AYhdJcF51EZxMTy_Ut_1GKtdWpRhPuVmg,686
  lattifai/io/writer.py,sha256=8n9ZBuXuVOCFwzr1hqrnXpZ-fARTsepebwjKgRuueWE,3872
  lattifai/tokenizer/__init__.py,sha256=y-FyfO7tLga9b46pkCC6jdSBKOFZS-jFfHcqUieGEyU,120
  lattifai/tokenizer/phonemizer.py,sha256=SfRi1KIMpmaao6OVmR1h_I_3QU-vrE6D5bh72Afg5XM,1759
- lattifai/tokenizer/tokenizer.py,sha256=6XlHehhwahVWgUMj79LqMs13xaRTa17beOZqico4vLE,19186
+ lattifai/tokenizer/tokenizer.py,sha256=uLtGeT6ad_vPuoXNw8Rz43bAdKQWheI_tfHAV47FyqQ,19205
  lattifai/workers/__init__.py,sha256=s6YfkIq4FDIAzY9sPjRpXnJfszj2repqnMTqydRM5Zw,83
- lattifai/workers/lattice1_alpha.py,sha256=1lCq0-bgWMXvYslAbCTFgHC0p6UWPto1y0wkTw9WrmQ,10177
+ lattifai/workers/lattice1_alpha.py,sha256=4FX91eTmcTDZt78NrMY8EgkwlwBKlU4Qw39qcQeqiTg,11469
  lattifai/workflows/__init__.py,sha256=mTOdwQQs2YY1s0JsVGsATb2TWPhpNo7bRiAAJW92740,830
  lattifai/workflows/agents.py,sha256=ZnxyEb-Li8gQw3Z7V3K7jjnT7sQAtG1uCFuXGX77IcE,227
  lattifai/workflows/base.py,sha256=ZSePq2O2hB59K5CMbk_iGiOM9FSHOVQdG3t8Oyz9gwE,6264
  lattifai/workflows/file_manager.py,sha256=5km59cB4s9PcJgMMqZ6YqRY8sTqvTzd1qyQ2T5bHlfI,31985
  lattifai/workflows/gemini.py,sha256=kpp3GiHyui8RZLWRQMx7vojBscCi2lZSxrjM1VHo_mc,6144
- lattifai/workflows/youtube.py,sha256=y1ez32G40CtHAo7Kz7J_DO96ljOwW-3SP9Sm2uPe5hY,38785
+ lattifai/workflows/youtube.py,sha256=tRy9LV5mwgQfQ3jfL_aiyB2catDHqCS2Hmy9Lk3-VGs,38757
  lattifai/workflows/prompts/README.md,sha256=X49KWSQVdjWxxWUp4R2w3ZqKrAOi6_kDNHh1hMaQ4PE,694
  lattifai/workflows/prompts/__init__.py,sha256=i3kMT5sg_W9LbPcda0xmZWLg0tPjXGVI3iKtHrBng3o,1351
  lattifai/workflows/prompts/gemini/README.md,sha256=rt7f7yDGtaobKBo95LG3u56mqa3ABOXQd0UVgJYtYuo,781
  lattifai/workflows/prompts/gemini/transcription_gem.txt,sha256=cljzZ--BDgnnKzqVCakr-fTp2Xk38UOsUquvruNX-LU,4600
- lattifai-0.4.3.dist-info/licenses/LICENSE,sha256=LNuoH5jpXXNKgjQ3XLwztFq8D3O7kZI-LSg81o4ym2M,1065
- lattifai-0.4.3.dist-info/METADATA,sha256=-WztCYs961omWHWJMd1PcM6KLjQgbGYzYl72xGqMYdE,26710
- lattifai-0.4.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- lattifai-0.4.3.dist-info/entry_points.txt,sha256=fCgo8-LKA_9C7_jmEGsZPJko0woXHtEh0iRbpO7PYzI,69
- lattifai-0.4.3.dist-info/top_level.txt,sha256=tHSoXF26r-IGfbIP_JoYATqbmf14h5NrnNJGH4j5reI,9
- lattifai-0.4.3.dist-info/RECORD,,
+ lattifai-0.4.5.dist-info/licenses/LICENSE,sha256=LNuoH5jpXXNKgjQ3XLwztFq8D3O7kZI-LSg81o4ym2M,1065
+ lattifai-0.4.5.dist-info/METADATA,sha256=e0ojxc-4xgpgpWZC4wdXlTySQYliuLCa-MGG1F80U6E,26687
+ lattifai-0.4.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ lattifai-0.4.5.dist-info/entry_points.txt,sha256=fCgo8-LKA_9C7_jmEGsZPJko0woXHtEh0iRbpO7PYzI,69
+ lattifai-0.4.5.dist-info/top_level.txt,sha256=tHSoXF26r-IGfbIP_JoYATqbmf14h5NrnNJGH4j5reI,9
+ lattifai-0.4.5.dist-info/RECORD,,