pysilero-vad 1.0.0__py3-none-any.whl → 2.0.1__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
pysilero_vad/__init__.py CHANGED
@@ -1,18 +1,25 @@
1
1
  import logging
2
2
  from pathlib import Path
3
- from typing import Final, Union
3
+ from typing import Final, Iterable, Union
4
4
 
5
5
  import numpy as np
6
6
  import onnxruntime
7
7
 
8
- _RATE: Final = 16000
8
+ _RATE: Final = 16000 # Hz
9
9
  _MAX_WAV: Final = 32767
10
10
  _DIR = Path(__file__).parent
11
11
  _DEFAULT_ONNX_PATH = _DIR / "models" / "silero_vad.onnx"
12
+ _CONTEXT_SIZE: Final = 64 # samples of context for 16 kHz audio
13
+ _CHUNK_SAMPLES: Final = 512
14
+ _CHUNK_BYTES: Final = _CHUNK_SAMPLES * 2 # 16-bit
12
15
 
13
16
  _LOGGER = logging.getLogger()
14
17
 
15
18
 
19
+ class InvalidChunkSizeError(Exception):
20
+ """Error raised when chunk size is not correct."""
21
+
22
+
16
23
  class SileroVoiceActivityDetector:
17
24
  """Detects speech/silence using Silero VAD.
18
25
 
@@ -30,31 +37,71 @@ class SileroVoiceActivityDetector:
30
37
  onnx_path, providers=["CPUExecutionProvider"], sess_options=opts
31
38
  )
32
39
 
33
- self._h = np.zeros((2, 1, 64)).astype("float32")
34
- self._c = np.zeros((2, 1, 64)).astype("float32")
40
+ self._context = np.zeros((1, _CONTEXT_SIZE), dtype=np.float32)
41
+ self._state = np.zeros((2, 1, 128), dtype=np.float32)
42
+ self._sr = np.array(_RATE, dtype=np.int64)
43
+
44
+ @staticmethod
45
+ def chunk_samples() -> int:
46
+ """Return number of samples required for an audio chunk."""
47
+ return _CHUNK_SAMPLES
48
+
49
+ @staticmethod
50
+ def chunk_bytes() -> int:
51
+ """Return number of bytes required for an audio chunk."""
52
+ return _CHUNK_BYTES
35
53
 
36
54
  def reset(self) -> None:
37
55
  """Reset state."""
38
- self._h = np.zeros((2, 1, 64)).astype("float32")
39
- self._c = np.zeros((2, 1, 64)).astype("float32")
56
+ self._state = np.zeros((2, 1, 128)).astype("float32")
40
57
 
41
58
  def __call__(self, audio: bytes) -> float:
42
- """Return probability of speech in audio [0-1].
59
+ """Return probability of speech [0-1] in a single audio chunk.
43
60
 
44
- Audio must be 16Khz 16-bit mono PCM.
61
+ Audio *must* be 512 samples of 16 kHz 16-bit mono PCM.
62
+ """
63
+ return self.process_chunk(audio)
64
+
65
+ def process_chunk(self, audio: bytes) -> float:
66
+ """Return probability of speech [0-1] in a single audio chunk.
67
+
68
+ Audio *must* be 512 samples of 16 kHz 16-bit mono PCM.
45
69
  """
70
+ if len(audio) != _CHUNK_BYTES:
71
+ # Window size is fixed at 512 samples in v5
72
+ raise InvalidChunkSizeError
73
+
46
74
  audio_array = np.frombuffer(audio, dtype=np.int16).astype(np.float32) / _MAX_WAV
47
75
 
48
- # Add batch dimension
49
- audio_array = np.expand_dims(audio_array, 0)
76
+ # Add batch dimension and context
77
+ audio_array = np.concatenate(
78
+ (self._context, audio_array[np.newaxis, :]), axis=1
79
+ )
80
+ self._context = audio_array[:, -_CONTEXT_SIZE:]
50
81
 
82
+ # ort_inputs = {"input": audio_array, "state": self._state, "sr": self._sr}
51
83
  ort_inputs = {
52
- "input": audio_array,
53
- "h": self._h,
54
- "c": self._c,
55
- "sr": np.array(_RATE, dtype=np.int64),
84
+ "input": audio_array[:, : _CHUNK_SAMPLES + _CONTEXT_SIZE],
85
+ "state": self._state,
86
+ "sr": self._sr,
56
87
  }
57
88
  ort_outs = self.session.run(None, ort_inputs)
58
- out, self._h, self._c = ort_outs
89
+ out, self._state = ort_outs
59
90
 
60
91
  return out.squeeze()
92
+
93
+ def process_chunks(self, audio: bytes) -> Iterable[float]:
94
+ """Return probability of speech [0-1] for each chunk of audio.
95
+
96
+ Audio must be 16 kHz 16-bit mono PCM.
97
+ """
98
+ if len(audio) < _CHUNK_BYTES:
99
+ # Window size is fixed at 512 samples in v5
100
+ raise InvalidChunkSizeError
101
+
102
+ num_audio_bytes = len(audio)
103
+ audio_idx = 0
104
+
105
+ while (audio_idx + _CHUNK_BYTES) < num_audio_bytes:
106
+ yield self.process_chunk(audio[audio_idx : audio_idx + _CHUNK_BYTES])
107
+ audio_idx += _CHUNK_BYTES
Binary file
@@ -1,25 +1,27 @@
1
- Metadata-Version: 2.1
2
- Name: pysilero-vad
3
- Version: 1.0.0
1
+ Metadata-Version: 2.2
2
+ Name: pysilero_vad
3
+ Version: 2.0.1
4
4
  Summary: Pre-packaged voice activity detector using silero-vad
5
- Home-page: http://github.com/rhasspy/pysilero-vad
6
- Author: Michael Hansen
7
- Author-email: mike@rhasspy.org
5
+ Author-email: Michael Hansen <mike@rhasspy.org>
8
6
  License: MIT
9
- Keywords: voice activity vad
7
+ Project-URL: Source Code, http://github.com/rhasspy/pysilero-vad
8
+ Keywords: voice,activity,vad
9
+ Platform: any
10
10
  Classifier: Development Status :: 3 - Alpha
11
11
  Classifier: Intended Audience :: Developers
12
- Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
12
+ Classifier: Topic :: Text Processing :: Linguistic
13
13
  Classifier: License :: OSI Approved :: MIT License
14
- Classifier: Programming Language :: Python :: 3.7
15
14
  Classifier: Programming Language :: Python :: 3.8
16
15
  Classifier: Programming Language :: Python :: 3.9
17
16
  Classifier: Programming Language :: Python :: 3.10
18
17
  Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Programming Language :: Python :: 3.13
20
+ Requires-Python: >=3.8.0
19
21
  Description-Content-Type: text/markdown
20
22
  License-File: LICENSE.md
21
- Requires-Dist: onnxruntime <2,>=1.10.0
22
- Requires-Dist: numpy <1.26
23
+ Requires-Dist: onnxruntime<2,>=1.18.0
24
+ Requires-Dist: numpy<2
23
25
 
24
26
  # pySilero VAD
25
27
 
@@ -34,7 +36,10 @@ from pysilero_vad import SileroVoiceActivityDetector
34
36
 
35
37
  vad = SileroVoiceActivityDetector()
36
38
 
37
- # Audio must be 16Khz, 16-bit mono PCM
39
+ # Audio must be 16 kHz, 16-bit mono PCM with correct chunk size
40
+ # See also: vad.chunk_samples()
41
+ assert len(audio_bytes) == vad.chunk_bytes()
42
+
38
43
  if vad(audio_bytes) >= 0.5:
39
44
  print("Speech")
40
45
  else:
@@ -0,0 +1,9 @@
1
+ pysilero_vad/__init__.py,sha256=_QtP_z0JjpOkSHMaqRFuSI9Bf0oL-k8IJE0hmZdZDLk,3433
2
+ pysilero_vad/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ pysilero_vad/models/silero_vad.onnx,sha256=a5nL_Tkka2cG-Y7BPHxQxrKZGB8kdPoFy8gEaswnQ5Y,2313101
4
+ pysilero_vad-2.0.1.dist-info/LICENSE.md,sha256=E3RtUJ105V6iJl--8gS7fNv4SoMVsCB-mIMmy1Q4cCg,1071
5
+ pysilero_vad-2.0.1.dist-info/METADATA,sha256=l2Xc-Dw2iaRvnXJ0e9lb69cUGn4LJAamNx138UAQh_A,1410
6
+ pysilero_vad-2.0.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
7
+ pysilero_vad-2.0.1.dist-info/top_level.txt,sha256=QQlOVbq_uDMukkVxjBFRi8eOwSrzJDrbP8YY1MCeMIs,13
8
+ pysilero_vad-2.0.1.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
9
+ pysilero_vad-2.0.1.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.42.0)
2
+ Generator: setuptools (75.8.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -0,0 +1 @@
1
+
@@ -1,8 +0,0 @@
1
- pysilero_vad/__init__.py,sha256=k0kb-HkhJwqD_O5YsOAQhV0Zbk7gnAy5XRTy2iiLQXY,1708
2
- pysilero_vad/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
- pysilero_vad/models/silero_vad.onnx,sha256=o16_Uv085fFGmyo2FY26dhvEe5c-ozgrMYbKFbH1ryg,1807522
4
- pysilero_vad-1.0.0.dist-info/LICENSE.md,sha256=E3RtUJ105V6iJl--8gS7fNv4SoMVsCB-mIMmy1Q4cCg,1071
5
- pysilero_vad-1.0.0.dist-info/METADATA,sha256=-0F8V6kxyed4OmFwVuJkSZ9lbvnvnmEny0INFxmnHbQ,1219
6
- pysilero_vad-1.0.0.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
7
- pysilero_vad-1.0.0.dist-info/top_level.txt,sha256=QQlOVbq_uDMukkVxjBFRi8eOwSrzJDrbP8YY1MCeMIs,13
8
- pysilero_vad-1.0.0.dist-info/RECORD,,