lattifai 0.2.2__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,6 +9,7 @@ import resampy
9
9
  import soundfile as sf
10
10
  import torch
11
11
  from lhotse import FbankConfig
12
+ from lhotse.audio import read_audio
12
13
  from lhotse.features.kaldi.layers import Wav2LogFilterBank
13
14
  from lhotse.utils import Pathlike
14
15
 
@@ -76,13 +77,55 @@ class Lattice1AlphaWorker:
76
77
 
77
78
  def load_audio(self, audio: Union[Pathlike, BinaryIO]) -> Tuple[torch.Tensor, int]:
78
79
  # load audio
79
- waveform, sample_rate = sf.read(audio, always_2d=True, dtype='float32')
80
- if waveform.shape[1] > 1: # TODO: support choose channel
81
- waveform = np.mean(waveform, axis=1, keepdims=True)
80
+ try:
81
+ waveform, sample_rate = read_audio(audio) # numpy array
82
+ if len(waveform.shape) == 1:
83
+ waveform = waveform.reshape([1, -1]) # (1, L)
84
+ else: # make sure channel first
85
+ if waveform.shape[0] > waveform.shape[1]:
86
+ waveform = waveform.transpose(0, 1)
87
+ # average multiple channels
88
+ waveform = np.mean(waveform, axis=0, keepdims=True) # (1, L)
89
+ except Exception:
90
+ # Fallback to PyAV for formats not supported by soundfile
91
+ import av
92
+
93
+ container = av.open(audio)
94
+ audio_stream = next((s for s in container.streams if s.type == 'audio'), None)
95
+
96
+ if audio_stream is None:
97
+ raise ValueError(f'No audio stream found in {audio}')
98
+
99
+ # Resample to target sample rate during decoding
100
+ audio_stream.codec_context.format = av.AudioFormat('flt') # 32-bit float
101
+
102
+ frames = []
103
+ for frame in container.decode(audio_stream):
104
+ # Convert frame to numpy array
105
+ array = frame.to_ndarray()
106
+ # Ensure shape is (channels, samples)
107
+ if array.ndim == 1:
108
+ array = array.reshape(1, -1)
109
+ elif array.ndim == 2 and array.shape[0] > array.shape[1]:
110
+ array = array.T
111
+ frames.append(array)
112
+
113
+ container.close()
114
+
115
+ if not frames:
116
+ raise ValueError(f'No audio data found in {audio}')
117
+
118
+ # Concatenate all frames
119
+ waveform = np.concatenate(frames, axis=1)
120
+ # Average multiple channels to mono
121
+ if waveform.shape[0] > 1:
122
+ waveform = np.mean(waveform, axis=0, keepdims=True)
123
+
124
+ sample_rate = audio_stream.codec_context.sample_rate
125
+
82
126
  if sample_rate != self.config['sample_rate']:
83
- waveform = resampy.resample(waveform, sample_rate, self.config['sample_rate'], axis=0)
84
- waveform = torch.from_numpy(waveform.T).to(self.device) # (1, L)
85
- return waveform
127
+ waveform = resampy.resample(waveform, sample_rate, self.config['sample_rate'], axis=1)
128
+ return torch.from_numpy(waveform).to(self.device) # (1, L)
86
129
 
87
130
  def alignment(
88
131
  self, audio: Union[Union[Pathlike, BinaryIO], torch.tensor], lattice_graph: Tuple[str, int, float]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: lattifai
3
- Version: 0.2.2
3
+ Version: 0.2.4
4
4
  Summary: Lattifai Python SDK: Seamless Integration with Lattifai's Speech and Video AI Services
5
5
  Author-email: Lattifai Technologies <tech@lattifai.com>
6
6
  Maintainer-email: Lattice <tech@lattifai.com>
@@ -61,6 +61,7 @@ Requires-Dist: onnxruntime
61
61
  Requires-Dist: resampy
62
62
  Requires-Dist: g2p-phonemizer==0.1.1
63
63
  Requires-Dist: wtpsplit>=2.1.6
64
+ Requires-Dist: av
64
65
  Provides-Extra: numpy
65
66
  Requires-Dist: numpy; extra == "numpy"
66
67
  Provides-Extra: test
@@ -13,10 +13,10 @@ lattifai/tokenizer/__init__.py,sha256=aqv44PDtq6g3oFFKW_l4HSR5ywT5W8eP1dHHywIvBf
13
13
  lattifai/tokenizer/phonemizer.py,sha256=SfRi1KIMpmaao6OVmR1h_I_3QU-vrE6D5bh72Afg5XM,1759
14
14
  lattifai/tokenizer/tokenizer.py,sha256=Yuo0pLPQnF2uX0Fm5g8i5vtcADn7GeLpSqdGpMJgTww,11492
15
15
  lattifai/workers/__init__.py,sha256=s6YfkIq4FDIAzY9sPjRpXnJfszj2repqnMTqydRM5Zw,83
16
- lattifai/workers/lattice1_alpha.py,sha256=1VFo59EcygEctTHOhkcII8v3_mrj8JEJ8Fcaqk_7LVo,5762
17
- lattifai-0.2.2.dist-info/licenses/LICENSE,sha256=LNuoH5jpXXNKgjQ3XLwztFq8D3O7kZI-LSg81o4ym2M,1065
18
- lattifai-0.2.2.dist-info/METADATA,sha256=4vmPOYKsIlvADiw0zUDQ2dbDpe-vOV-o5A0Hs1p7xfg,10971
19
- lattifai-0.2.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
20
- lattifai-0.2.2.dist-info/entry_points.txt,sha256=CwTI2NbJvF9msIHboAfTA99cmDr_HOWoODjS8R64JOw,131
21
- lattifai-0.2.2.dist-info/top_level.txt,sha256=-OVWZ68YYFcTN13ARkLasp2OUappe9wEVq-CKes7jM4,17
22
- lattifai-0.2.2.dist-info/RECORD,,
16
+ lattifai/workers/lattice1_alpha.py,sha256=fnimZqhPQ1fBCUjcDVblnFGTWP0vbNLRM7E7lOdHJu8,7428
17
+ lattifai-0.2.4.dist-info/licenses/LICENSE,sha256=LNuoH5jpXXNKgjQ3XLwztFq8D3O7kZI-LSg81o4ym2M,1065
18
+ lattifai-0.2.4.dist-info/METADATA,sha256=7SNTA_Egpv3F5rd0F20-4MigC7muz5x6kyr_xxj4CIk,10989
19
+ lattifai-0.2.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
20
+ lattifai-0.2.4.dist-info/entry_points.txt,sha256=CwTI2NbJvF9msIHboAfTA99cmDr_HOWoODjS8R64JOw,131
21
+ lattifai-0.2.4.dist-info/top_level.txt,sha256=-OVWZ68YYFcTN13ARkLasp2OUappe9wEVq-CKes7jM4,17
22
+ lattifai-0.2.4.dist-info/RECORD,,