pysilero-vad 1.0.0__py3-none-any.whl → 2.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pysilero_vad/__init__.py CHANGED
@@ -1,18 +1,25 @@
1
1
  import logging
2
2
  from pathlib import Path
3
- from typing import Final, Union
3
+ from typing import Final, Iterable, Union
4
4
 
5
5
  import numpy as np
6
6
  import onnxruntime
7
7
 
8
- _RATE: Final = 16000
8
+ _RATE: Final = 16000 # Hz (16 kHz)
9
9
  _MAX_WAV: Final = 32767
10
10
  _DIR = Path(__file__).parent
11
11
  _DEFAULT_ONNX_PATH = _DIR / "models" / "silero_vad.onnx"
12
+ _CONTEXT_SIZE: Final = 64 # context samples for 16 kHz audio
13
+ _CHUNK_SAMPLES: Final = 512
14
+ _CHUNK_BYTES: Final = _CHUNK_SAMPLES * 2 # 16-bit
12
15
 
13
16
  _LOGGER = logging.getLogger()
14
17
 
15
18
 
19
+ class InvalidChunkSizeError(Exception):
20
+ """Error raised when chunk size is not correct."""
21
+
22
+
16
23
  class SileroVoiceActivityDetector:
17
24
  """Detects speech/silence using Silero VAD.
18
25
 
@@ -30,31 +37,71 @@ class SileroVoiceActivityDetector:
30
37
  onnx_path, providers=["CPUExecutionProvider"], sess_options=opts
31
38
  )
32
39
 
33
- self._h = np.zeros((2, 1, 64)).astype("float32")
34
- self._c = np.zeros((2, 1, 64)).astype("float32")
40
+ self._context = np.zeros((1, _CONTEXT_SIZE), dtype=np.float32)
41
+ self._state = np.zeros((2, 1, 128), dtype=np.float32)
42
+ self._sr = np.array(_RATE, dtype=np.int64)
43
+
44
+ @staticmethod
45
+ def chunk_samples() -> int:
46
+ """Return number of samples required for an audio chunk."""
47
+ return _CHUNK_SAMPLES
48
+
49
+ @staticmethod
50
+ def chunk_bytes() -> int:
51
+ """Return number of bytes required for an audio chunk."""
52
+ return _CHUNK_BYTES
35
53
 
36
54
  def reset(self) -> None:
37
55
  """Reset state."""
38
- self._h = np.zeros((2, 1, 64)).astype("float32")
39
- self._c = np.zeros((2, 1, 64)).astype("float32")
56
+ self._state = np.zeros((2, 1, 128)).astype("float32")
40
57
 
41
58
  def __call__(self, audio: bytes) -> float:
42
- """Return probability of speech in audio [0-1].
59
+ """Return probability of speech [0-1] in a single audio chunk.
43
60
 
44
- Audio must be 16Khz 16-bit mono PCM.
61
+ Audio *must* be 512 samples of 16Khz 16-bit mono PCM.
62
+ """
63
+ return self.process_chunk(audio)
64
+
65
+ def process_chunk(self, audio: bytes) -> float:
66
+ """Return probability of speech [0-1] in a single audio chunk.
67
+
68
+ Audio *must* be 512 samples of 16Khz 16-bit mono PCM.
45
69
  """
70
+ if len(audio) != _CHUNK_BYTES:
71
+ # Window size is fixed at 512 samples in v5
72
+ raise InvalidChunkSizeError
73
+
46
74
  audio_array = np.frombuffer(audio, dtype=np.int16).astype(np.float32) / _MAX_WAV
47
75
 
48
- # Add batch dimension
49
- audio_array = np.expand_dims(audio_array, 0)
76
+ # Add batch dimension and context
77
+ audio_array = np.concatenate(
78
+ (self._context, audio_array[np.newaxis, :]), axis=1
79
+ )
80
+ self._context = audio_array[:, -_CONTEXT_SIZE:]
50
81
 
82
+ # ort_inputs = {"input": audio_array, "state": self._state, "sr": self._sr}
51
83
  ort_inputs = {
52
- "input": audio_array,
53
- "h": self._h,
54
- "c": self._c,
55
- "sr": np.array(_RATE, dtype=np.int64),
84
+ "input": audio_array[:, : _CHUNK_SAMPLES + _CONTEXT_SIZE],
85
+ "state": self._state,
86
+ "sr": self._sr,
56
87
  }
57
88
  ort_outs = self.session.run(None, ort_inputs)
58
- out, self._h, self._c = ort_outs
89
+ out, self._state = ort_outs
59
90
 
60
91
  return out.squeeze()
92
+
93
+ def process_chunks(self, audio: bytes) -> Iterable[float]:
94
+ """Return probability of speech in audio [0-1] for each chunk of audio.
95
+
96
+ Audio must be 16Khz 16-bit mono PCM.
97
+ """
98
+ if len(audio) < _CHUNK_BYTES:
99
+ # Window size is fixed at 512 samples in v5
100
+ raise InvalidChunkSizeError
101
+
102
+ num_audio_bytes = len(audio)
103
+ audio_idx = 0
104
+
105
+ while (audio_idx + _CHUNK_BYTES) < num_audio_bytes:
106
+ yield self.process_chunk(audio[audio_idx : audio_idx + _CHUNK_BYTES])
107
+ audio_idx += _CHUNK_BYTES
Binary file
@@ -1,25 +1,27 @@
1
- Metadata-Version: 2.1
2
- Name: pysilero-vad
3
- Version: 1.0.0
1
+ Metadata-Version: 2.2
2
+ Name: pysilero_vad
3
+ Version: 2.0.1
4
4
  Summary: Pre-packaged voice activity detector using silero-vad
5
- Home-page: http://github.com/rhasspy/pysilero-vad
6
- Author: Michael Hansen
7
- Author-email: mike@rhasspy.org
5
+ Author-email: Michael Hansen <mike@rhasspy.org>
8
6
  License: MIT
9
- Keywords: voice activity vad
7
+ Project-URL: Source Code, http://github.com/rhasspy/pysilero-vad
8
+ Keywords: voice,activity,vad
9
+ Platform: any
10
10
  Classifier: Development Status :: 3 - Alpha
11
11
  Classifier: Intended Audience :: Developers
12
- Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
12
+ Classifier: Topic :: Text Processing :: Linguistic
13
13
  Classifier: License :: OSI Approved :: MIT License
14
- Classifier: Programming Language :: Python :: 3.7
15
14
  Classifier: Programming Language :: Python :: 3.8
16
15
  Classifier: Programming Language :: Python :: 3.9
17
16
  Classifier: Programming Language :: Python :: 3.10
18
17
  Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Programming Language :: Python :: 3.13
20
+ Requires-Python: >=3.8.0
19
21
  Description-Content-Type: text/markdown
20
22
  License-File: LICENSE.md
21
- Requires-Dist: onnxruntime <2,>=1.10.0
22
- Requires-Dist: numpy <1.26
23
+ Requires-Dist: onnxruntime<2,>=1.18.0
24
+ Requires-Dist: numpy<2
23
25
 
24
26
  # pySilero VAD
25
27
 
@@ -34,7 +36,10 @@ from pysilero_vad import SileroVoiceActivityDetector
34
36
 
35
37
  vad = SileroVoiceActivityDetector()
36
38
 
37
- # Audio must be 16Khz, 16-bit mono PCM
39
+ # Audio must be 16Khz, 16-bit mono PCM with correct chunk size
40
+ # See also: vad.chunk_samples()
41
+ assert len(audio_bytes) == vad.chunk_bytes()
42
+
38
43
  if vad(audio_bytes) >= 0.5:
39
44
  print("Speech")
40
45
  else:
@@ -0,0 +1,9 @@
1
+ pysilero_vad/__init__.py,sha256=_QtP_z0JjpOkSHMaqRFuSI9Bf0oL-k8IJE0hmZdZDLk,3433
2
+ pysilero_vad/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ pysilero_vad/models/silero_vad.onnx,sha256=a5nL_Tkka2cG-Y7BPHxQxrKZGB8kdPoFy8gEaswnQ5Y,2313101
4
+ pysilero_vad-2.0.1.dist-info/LICENSE.md,sha256=E3RtUJ105V6iJl--8gS7fNv4SoMVsCB-mIMmy1Q4cCg,1071
5
+ pysilero_vad-2.0.1.dist-info/METADATA,sha256=l2Xc-Dw2iaRvnXJ0e9lb69cUGn4LJAamNx138UAQh_A,1410
6
+ pysilero_vad-2.0.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
7
+ pysilero_vad-2.0.1.dist-info/top_level.txt,sha256=QQlOVbq_uDMukkVxjBFRi8eOwSrzJDrbP8YY1MCeMIs,13
8
+ pysilero_vad-2.0.1.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
9
+ pysilero_vad-2.0.1.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.42.0)
2
+ Generator: setuptools (75.8.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -0,0 +1 @@
1
+
@@ -1,8 +0,0 @@
1
- pysilero_vad/__init__.py,sha256=k0kb-HkhJwqD_O5YsOAQhV0Zbk7gnAy5XRTy2iiLQXY,1708
2
- pysilero_vad/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
- pysilero_vad/models/silero_vad.onnx,sha256=o16_Uv085fFGmyo2FY26dhvEe5c-ozgrMYbKFbH1ryg,1807522
4
- pysilero_vad-1.0.0.dist-info/LICENSE.md,sha256=E3RtUJ105V6iJl--8gS7fNv4SoMVsCB-mIMmy1Q4cCg,1071
5
- pysilero_vad-1.0.0.dist-info/METADATA,sha256=-0F8V6kxyed4OmFwVuJkSZ9lbvnvnmEny0INFxmnHbQ,1219
6
- pysilero_vad-1.0.0.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
7
- pysilero_vad-1.0.0.dist-info/top_level.txt,sha256=QQlOVbq_uDMukkVxjBFRi8eOwSrzJDrbP8YY1MCeMIs,13
8
- pysilero_vad-1.0.0.dist-info/RECORD,,