lattifai 0.4.3__py3-none-any.whl → 0.4.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lattifai/bin/agent.py CHANGED
@@ -302,6 +302,10 @@ if not check_dependencies():
  if __name__ == '__main__':
      import os

+     from dotenv import find_dotenv, load_dotenv
+
+     load_dotenv(find_dotenv(usecwd=True))
+
      asyncio.run(
          _run_youtube_workflow(
              # url='https://www.youtube.com/watch?v=7nv1snJRCEI',
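For context, a minimal sketch of what the added startup lines do: find_dotenv(usecwd=True) searches for a .env file starting from the current working directory, and load_dotenv() exports its entries into the process environment. 'SOME_API_KEY' below is a placeholder name, not a variable the package is known to use.

import os

from dotenv import find_dotenv, load_dotenv

load_dotenv(find_dotenv(usecwd=True))
print(os.environ.get('SOME_API_KEY'))  # placeholder; any variable defined in .env is now visible here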
lattifai/io/text_parser.py CHANGED
@@ -8,12 +8,12 @@ SPEAKER_PATTERN = re.compile(r'((?:>>|>>|>|>).*?[::])\s*(.*)')
  # Transcriber Output Example:
  # 26:19.919 --> 26:34.921
  # [SPEAKER_01]: 越来越多的科技巨头入...
- SPEAKER_LATTIFAI = re.compile(r'(^\[SPEAKER_.*?\]:)\s*(.*)')
+ SPEAKER_LATTIFAI = re.compile(r'(^\[SPEAKER_.*?\][::])\s*(.*)')

  # NISHTHA BHATIA: Hey, everyone.
  # DIETER: Oh, hey, Nishtha.
  # GEMINI: That might
- SPEAKER_PATTERN2 = re.compile(r'^([A-Z]{1,15}(?:\s+[A-Z]{1,15})?)[::]\s*(.*)$')
+ SPEAKER_PATTERN2 = re.compile(r'^([A-Z]{1,15}(?:\s+[A-Z]{1,15})?[::])\s*(.*)$')


  def parse_speaker_text(line) -> Tuple[Optional[str], str]:
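An illustrative check of the revised capture groups (assumes lattifai 0.4.5 is installed; expected output shown in comments): both patterns now include the trailing colon in group 1, matching the convention already used by SPEAKER_PATTERN.

from lattifai.io.text_parser import SPEAKER_LATTIFAI, SPEAKER_PATTERN2

m = SPEAKER_LATTIFAI.match('[SPEAKER_01]: Hello there.')
print(m.group(1), '|', m.group(2))    # '[SPEAKER_01]:' | 'Hello there.'

m2 = SPEAKER_PATTERN2.match('NISHTHA BHATIA: Hey, everyone.')
print(m2.group(1), '|', m2.group(2))  # 'NISHTHA BHATIA:' | 'Hey, everyone.'  (the colon was previously outside group 1)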
lattifai/tokenizer/tokenizer.py CHANGED
@@ -148,7 +148,7 @@ class LatticeTokenizer:
          oov_words = []
          for text in texts:
              words = text.lower().replace('-', ' ').replace('—', ' ').replace('–', ' ').split()
-             oovs = [w for w in words if w not in self.words]
+             oovs = [w.strip(PUNCTUATION) for w in words if w not in self.words]
              if oovs:
                  oov_words.extend([w for w in oovs if (w not in self.words and len(w) <= MAXIMUM_WORD_LENGTH)])

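A standalone sketch of what stripping punctuation before the second OOV check changes. PUNCTUATION below is a stand-in using string.punctuation; the module's actual constant may differ.

import string

PUNCTUATION = string.punctuation                                     # assumption, not the packaged value
vocab = {'hello', 'world'}
words = 'hello, world!'.lower().split()

old_oovs = [w for w in words if w not in vocab]                      # ['hello,', 'world!']
new_oovs = [w.strip(PUNCTUATION) for w in words if w not in vocab]   # ['hello', 'world']
# The stripped forms are re-checked against the vocabulary downstream, so trailing
# punctuation no longer causes known words to be reported as out-of-vocabulary.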
lattifai/workers/lattice1_alpha.py CHANGED
@@ -1,25 +1,70 @@
  import json
  import time
  from collections import defaultdict
- from typing import Any, BinaryIO, Dict, Tuple, Union
+ from typing import Any, BinaryIO, Dict, Iterable, Optional, Tuple, Union

  import numpy as np
  import onnxruntime as ort
- import resampy
  import soundfile as sf
  import torch
  from lhotse import FbankConfig
- from lhotse.audio import read_audio
+ from lhotse.augmentation import get_or_create_resampler
  from lhotse.features.kaldi.layers import Wav2LogFilterBank
  from lhotse.utils import Pathlike

- from lattifai.errors import (
-     AlignmentError,
-     AudioFormatError,
-     AudioLoadError,
-     DependencyError,
-     ModelLoadError,
- )
+ from lattifai.errors import AlignmentError, AudioFormatError, AudioLoadError, DependencyError, ModelLoadError
+
+ ChannelSelectorType = Union[int, Iterable[int], str]
+
+
+ def resample_audio(
+     audio_sr: Tuple[torch.Tensor, int],
+     sampling_rate: int,
+     device: Optional[str],
+     channel_selector: Optional[ChannelSelectorType] = 'average',
+ ) -> torch.Tensor:
+     """
+     return:
+         (1, T)
+     """
+     audio, sr = audio_sr
+
+     if channel_selector is None:
+         # keep the original multi-channel signal
+         tensor = audio
+     elif isinstance(channel_selector, int):
+         assert audio.shape[0] >= channel_selector, f'Invalid channel: {channel_selector}'
+         tensor = audio[channel_selector : channel_selector + 1].clone()
+         del audio
+     elif isinstance(channel_selector, str):
+         assert channel_selector == 'average'
+         tensor = torch.mean(audio.to(device), dim=0, keepdim=True)
+         del audio
+     else:
+         assert isinstance(channel_selector, Iterable)
+         num_channels = audio.shape[0]
+         print(f'Selecting channels {channel_selector} from the signal with {num_channels} channels.')
+         assert isinstance(channel_selector, Iterable)
+         if max(channel_selector) >= num_channels:
+             raise ValueError(
+                 f'Cannot select channel subset {channel_selector} from a signal with {num_channels} channels.'
+             )
+         tensor = audio[channel_selector]
+
+     tensor = tensor.to(device)
+     if sr != sampling_rate:
+         resampler = get_or_create_resampler(sr, sampling_rate).to(device=device)
+         length = tensor.size(-1)
+         chunk_size = sampling_rate * 3600
+         if length > chunk_size:
+             resampled_chunks = []
+             for i in range(0, length, chunk_size):
+                 resampled_chunks.append(resampler(tensor[..., i : i + chunk_size]))
+             tensor = torch.cat(resampled_chunks, dim=-1)
+         else:
+             tensor = resampler(tensor)
+
+     return tensor


  class Lattice1AlphaWorker:
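A self-contained usage sketch of the new resample_audio helper on a synthetic stereo signal (assumes lattifai 0.4.5, torch, and lhotse are installed; the values are illustrative only):

import torch

from lattifai.workers.lattice1_alpha import resample_audio

stereo = torch.randn(2, 44100)   # (channels, samples): one second of 44.1 kHz noise
mono_16k = resample_audio((stereo, 44100), 16000, device='cpu', channel_selector='average')
print(mono_16k.shape)            # roughly (1, 16000): averaged to mono and resampled to 16 kHz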
@@ -48,7 +93,7 @@ class Lattice1AlphaWorker:
              self.acoustic_ort = ort.InferenceSession(
                  f'{model_path}/acoustic_opt.onnx',
                  sess_options,
-                 providers=providers + ['CoreMLExecutionProvider', 'CPUExecutionProvider'],
+                 providers=providers + ['CPUExecutionProvider', 'CoreMLExecutionProvider'],
              )
          except Exception as e:
              raise ModelLoadError(f'acoustic model from {model_path}', original_error=e)
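In onnxruntime the providers list is a priority order, so the reordering above prefers the CPU provider over CoreML for this model. A quick way to inspect what a session actually selected ('acoustic_opt.onnx' is a placeholder path):

import onnxruntime as ort

preferred = ['CPUExecutionProvider', 'CoreMLExecutionProvider']
available = ort.get_available_providers()
session = ort.InferenceSession('acoustic_opt.onnx', providers=[p for p in preferred if p in available])
print(session.get_providers())   # providers in use, highest priority first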
@@ -93,17 +138,13 @@ class Lattice1AlphaWorker:
          self.timings['emission'] += time.time() - _start
          return emission  # (1, T, vocab_size) torch

-     def load_audio(self, audio: Union[Pathlike, BinaryIO]) -> Tuple[torch.Tensor, int]:
+     def load_audio(
+         self, audio: Union[Pathlike, BinaryIO], channel_selector: Optional[ChannelSelectorType] = 'average'
+     ) -> Tuple[torch.Tensor, int]:
          # load audio
          try:
-             waveform, sample_rate = read_audio(audio)  # numpy array
-             if len(waveform.shape) == 1:
-                 waveform = waveform.reshape([1, -1])  # (1, L)
-             else:  # make sure channel first
-                 if waveform.shape[0] > waveform.shape[1]:
-                     waveform = waveform.transpose(0, 1)
-                 # average multiple channels
-                 waveform = np.mean(waveform, axis=0, keepdims=True)  # (1, L)
+             waveform, sample_rate = sf.read(audio, always_2d=True, dtype='float32')  # numpy array
+             waveform = waveform.T  # (channels, samples)
          except Exception as primary_error:
              # Fallback to PyAV for formats not supported by soundfile
              try:
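The soundfile-based loading convention used above, shown standalone ('example.wav' is a placeholder file):

import soundfile as sf

waveform, sample_rate = sf.read('example.wav', always_2d=True, dtype='float32')
print(waveform.shape)   # (samples, channels): always 2-D, even for mono files
waveform = waveform.T   # (channels, samples): the channel-first layout resample_audio expects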
@@ -141,23 +182,16 @@ class Lattice1AlphaWorker:

                  # Concatenate all frames
                  waveform = np.concatenate(frames, axis=1)
-                 # Average multiple channels to mono
-                 if waveform.shape[0] > 1:
-                     waveform = np.mean(waveform, axis=0, keepdims=True)
-
                  sample_rate = audio_stream.codec_context.sample_rate
              except Exception as e:
                  raise AudioLoadError(str(audio), original_error=e)

-         try:
-             if sample_rate != self.config['sample_rate']:
-                 waveform = resampy.resample(waveform, sample_rate, self.config['sample_rate'], axis=1)
-         except Exception:
-             raise AudioFormatError(
-                 str(audio), f'Failed to resample from {sample_rate}Hz to {self.config["sample_rate"]}Hz'
-             )
-
-         return torch.from_numpy(waveform).to(self.device)  # (1, L)
+         return resample_audio(
+             (torch.from_numpy(waveform), sample_rate),
+             self.config.get('sampling_rate', 16000),
+             device=self.device.type,
+             channel_selector=channel_selector,
+         )

      def alignment(
          self, audio: Union[Union[Pathlike, BinaryIO], torch.tensor], lattice_graph: Tuple[str, int, float]
@@ -231,9 +265,9 @@ class Lattice1AlphaWorker:
              emission.to(device) * acoustic_scale,
              decoding_graph.to(device),
              torch.tensor([emission.shape[1]], dtype=torch.int32),
-             search_beam=100,
-             output_beam=40,
-             min_active_states=200,
+             search_beam=200,
+             output_beam=80,
+             min_active_states=400,
              max_active_states=10000,
              subsampling_factor=1,
              reject_low_confidence=False,
lattifai/workflows/youtube.py CHANGED
@@ -708,7 +708,7 @@ class YouTubeSubtitleAgent(WorkflowAgent):

          # If subtitle was already downloaded in step 1 and user selected it, use it directly
          if downloaded_subtitle_path and downloaded_subtitle_path != 'gemini':
-             self.logger.info(f'📥 Using subtitle downloaded in previous step: {downloaded_subtitle_path}')
+             self.logger.info(f'📥 Using subtitle: {downloaded_subtitle_path}')
              return {'subtitle_path': downloaded_subtitle_path}

          # Check for existing subtitles if subtitle was not downloaded yet
lattifai-0.4.3.dist-info/METADATA → lattifai-0.4.5.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: lattifai
- Version: 0.4.3
+ Version: 0.4.5
  Summary: Lattifai Python SDK: Seamless Integration with Lattifai's Speech and Video AI Services
  Author-email: Lattifai Technologies <tech@lattifai.com>
  Maintainer-email: Lattice <tech@lattifai.com>
@@ -57,7 +57,6 @@ Requires-Dist: pysubs2
  Requires-Dist: praatio
  Requires-Dist: tgt
  Requires-Dist: onnxruntime
- Requires-Dist: resampy
  Requires-Dist: g2p-phonemizer==0.1.1
  Requires-Dist: wtpsplit>=2.1.6
  Requires-Dist: av
lattifai-0.4.3.dist-info/RECORD → lattifai-0.4.5.dist-info/RECORD CHANGED
@@ -4,7 +4,7 @@ lattifai/client.py,sha256=YvK25fcXwKREYCkq_TUBdEZh7I9RNEwCbgW4qUha2ho,13236
  lattifai/errors.py,sha256=5i_acoly1g-TLAID8QnhzQshwOXfgLL55mHsdwzlNGA,10814
  lattifai/utils.py,sha256=CzVwNc08u8lm7XavCMJskXHfni0xsZ-EgpcMkRywVm8,4736
  lattifai/bin/__init__.py,sha256=QWbmVUbzqwPQNeOV_g-bOsz53w37v-tbZ3uFrSj-7Mg,90
- lattifai/bin/agent.py,sha256=jc7qSuVV2_EkToEu2qYodXgGoTup_na6IgP25kFmfwk,9734
+ lattifai/bin/agent.py,sha256=8nRh0GC1M4__-qKQtxZspcyNnUm8DOVCWQf3sRblEOI,9826
  lattifai/bin/align.py,sha256=JJHk5uzmYGYhpA3ricHdmBLzJWC2aRyF0k4ANnap50w,8151
  lattifai/bin/cli_base.py,sha256=gvPUi9Z0eGwBJ8Es5xq1z00YzFPlocYiR2zpL7ekyhw,626
  lattifai/bin/subtitle.py,sha256=UZMPh71O2X1UwbfZ9VWlhzxkz78viz8KWwoVsDpewK0,6577
@@ -13,27 +13,27 @@ lattifai/io/gemini_reader.py,sha256=WDZA93MSrUAsa5j-ZDXLdPXzEIoREymEy-rMAED_6f4,
  lattifai/io/gemini_writer.py,sha256=rlXO9zx6kQhqTi9K9izE69-8S-2GPOIiJHPwZyebpiM,6515
  lattifai/io/reader.py,sha256=h4T8dveLHXqSonma0J50iyjqkxH26tujeoPbnLx05nA,3333
  lattifai/io/supervision.py,sha256=iBDRiDJ0hddo__SoEZau2cdEIBFnXZNLgSWFjtJd-lM,871
- lattifai/io/text_parser.py,sha256=LQHgcEYXaSdhwUo9rP6P_31Z6RMv_BTP1YSKzXji4bk,2386
+ lattifai/io/text_parser.py,sha256=5rO2qgFLfWIcoU0K-FGBgs2qG7r6Uz2J2LklGTgSUyA,2391
  lattifai/io/utils.py,sha256=4drRwcM1n7AYhdJcF51EZxMTy_Ut_1GKtdWpRhPuVmg,686
  lattifai/io/writer.py,sha256=8n9ZBuXuVOCFwzr1hqrnXpZ-fARTsepebwjKgRuueWE,3872
  lattifai/tokenizer/__init__.py,sha256=y-FyfO7tLga9b46pkCC6jdSBKOFZS-jFfHcqUieGEyU,120
  lattifai/tokenizer/phonemizer.py,sha256=SfRi1KIMpmaao6OVmR1h_I_3QU-vrE6D5bh72Afg5XM,1759
- lattifai/tokenizer/tokenizer.py,sha256=6XlHehhwahVWgUMj79LqMs13xaRTa17beOZqico4vLE,19186
+ lattifai/tokenizer/tokenizer.py,sha256=uLtGeT6ad_vPuoXNw8Rz43bAdKQWheI_tfHAV47FyqQ,19205
  lattifai/workers/__init__.py,sha256=s6YfkIq4FDIAzY9sPjRpXnJfszj2repqnMTqydRM5Zw,83
- lattifai/workers/lattice1_alpha.py,sha256=1lCq0-bgWMXvYslAbCTFgHC0p6UWPto1y0wkTw9WrmQ,10177
+ lattifai/workers/lattice1_alpha.py,sha256=4FX91eTmcTDZt78NrMY8EgkwlwBKlU4Qw39qcQeqiTg,11469
  lattifai/workflows/__init__.py,sha256=mTOdwQQs2YY1s0JsVGsATb2TWPhpNo7bRiAAJW92740,830
  lattifai/workflows/agents.py,sha256=ZnxyEb-Li8gQw3Z7V3K7jjnT7sQAtG1uCFuXGX77IcE,227
  lattifai/workflows/base.py,sha256=ZSePq2O2hB59K5CMbk_iGiOM9FSHOVQdG3t8Oyz9gwE,6264
  lattifai/workflows/file_manager.py,sha256=5km59cB4s9PcJgMMqZ6YqRY8sTqvTzd1qyQ2T5bHlfI,31985
  lattifai/workflows/gemini.py,sha256=kpp3GiHyui8RZLWRQMx7vojBscCi2lZSxrjM1VHo_mc,6144
- lattifai/workflows/youtube.py,sha256=y1ez32G40CtHAo7Kz7J_DO96ljOwW-3SP9Sm2uPe5hY,38785
+ lattifai/workflows/youtube.py,sha256=tRy9LV5mwgQfQ3jfL_aiyB2catDHqCS2Hmy9Lk3-VGs,38757
  lattifai/workflows/prompts/README.md,sha256=X49KWSQVdjWxxWUp4R2w3ZqKrAOi6_kDNHh1hMaQ4PE,694
  lattifai/workflows/prompts/__init__.py,sha256=i3kMT5sg_W9LbPcda0xmZWLg0tPjXGVI3iKtHrBng3o,1351
  lattifai/workflows/prompts/gemini/README.md,sha256=rt7f7yDGtaobKBo95LG3u56mqa3ABOXQd0UVgJYtYuo,781
  lattifai/workflows/prompts/gemini/transcription_gem.txt,sha256=cljzZ--BDgnnKzqVCakr-fTp2Xk38UOsUquvruNX-LU,4600
- lattifai-0.4.3.dist-info/licenses/LICENSE,sha256=LNuoH5jpXXNKgjQ3XLwztFq8D3O7kZI-LSg81o4ym2M,1065
- lattifai-0.4.3.dist-info/METADATA,sha256=-WztCYs961omWHWJMd1PcM6KLjQgbGYzYl72xGqMYdE,26710
- lattifai-0.4.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- lattifai-0.4.3.dist-info/entry_points.txt,sha256=fCgo8-LKA_9C7_jmEGsZPJko0woXHtEh0iRbpO7PYzI,69
- lattifai-0.4.3.dist-info/top_level.txt,sha256=tHSoXF26r-IGfbIP_JoYATqbmf14h5NrnNJGH4j5reI,9
- lattifai-0.4.3.dist-info/RECORD,,
+ lattifai-0.4.5.dist-info/licenses/LICENSE,sha256=LNuoH5jpXXNKgjQ3XLwztFq8D3O7kZI-LSg81o4ym2M,1065
+ lattifai-0.4.5.dist-info/METADATA,sha256=e0ojxc-4xgpgpWZC4wdXlTySQYliuLCa-MGG1F80U6E,26687
+ lattifai-0.4.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ lattifai-0.4.5.dist-info/entry_points.txt,sha256=fCgo8-LKA_9C7_jmEGsZPJko0woXHtEh0iRbpO7PYzI,69
+ lattifai-0.4.5.dist-info/top_level.txt,sha256=tHSoXF26r-IGfbIP_JoYATqbmf14h5NrnNJGH4j5reI,9
+ lattifai-0.4.5.dist-info/RECORD,,