pysilero-vad 1.0.0__py3-none-any.whl → 2.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pysilero_vad/__init__.py +62 -15
- pysilero_vad/models/silero_vad.onnx +0 -0
- {pysilero_vad-1.0.0.dist-info → pysilero_vad-2.0.1.dist-info}/METADATA +17 -12
- pysilero_vad-2.0.1.dist-info/RECORD +9 -0
- {pysilero_vad-1.0.0.dist-info → pysilero_vad-2.0.1.dist-info}/WHEEL +1 -1
- pysilero_vad-2.0.1.dist-info/zip-safe +1 -0
- pysilero_vad-1.0.0.dist-info/RECORD +0 -8
- {pysilero_vad-1.0.0.dist-info → pysilero_vad-2.0.1.dist-info}/LICENSE.md +0 -0
- {pysilero_vad-1.0.0.dist-info → pysilero_vad-2.0.1.dist-info}/top_level.txt +0 -0
pysilero_vad/__init__.py
CHANGED
@@ -1,18 +1,25 @@
|
|
1
1
|
import logging
|
2
2
|
from pathlib import Path
|
3
|
-
from typing import Final, Union
|
3
|
+
from typing import Final, Iterable, Union
|
4
4
|
|
5
5
|
import numpy as np
|
6
6
|
import onnxruntime
|
7
7
|
|
8
|
-
_RATE: Final = 16000
|
8
|
+
_RATE: Final = 16000 # Hz
|
9
9
|
_MAX_WAV: Final = 32767
|
10
10
|
_DIR = Path(__file__).parent
|
11
11
|
_DEFAULT_ONNX_PATH = _DIR / "models" / "silero_vad.onnx"
|
12
|
+
_CONTEXT_SIZE: Final = 64 # 16Khz
|
13
|
+
_CHUNK_SAMPLES: Final = 512
|
14
|
+
_CHUNK_BYTES: Final = _CHUNK_SAMPLES * 2 # 16-bit
|
12
15
|
|
13
16
|
_LOGGER = logging.getLogger()
|
14
17
|
|
15
18
|
|
19
|
+
class InvalidChunkSizeError(Exception):
|
20
|
+
"""Error raised when chunk size is not correct."""
|
21
|
+
|
22
|
+
|
16
23
|
class SileroVoiceActivityDetector:
|
17
24
|
"""Detects speech/silence using Silero VAD.
|
18
25
|
|
@@ -30,31 +37,71 @@ class SileroVoiceActivityDetector:
|
|
30
37
|
onnx_path, providers=["CPUExecutionProvider"], sess_options=opts
|
31
38
|
)
|
32
39
|
|
33
|
-
self.
|
34
|
-
self.
|
40
|
+
self._context = np.zeros((1, _CONTEXT_SIZE), dtype=np.float32)
|
41
|
+
self._state = np.zeros((2, 1, 128), dtype=np.float32)
|
42
|
+
self._sr = np.array(_RATE, dtype=np.int64)
|
43
|
+
|
44
|
+
@staticmethod
|
45
|
+
def chunk_samples() -> int:
|
46
|
+
"""Return number of samples required for an audio chunk."""
|
47
|
+
return _CHUNK_SAMPLES
|
48
|
+
|
49
|
+
@staticmethod
|
50
|
+
def chunk_bytes() -> int:
|
51
|
+
"""Return number of bytes required for an audio chunk."""
|
52
|
+
return _CHUNK_BYTES
|
35
53
|
|
36
54
|
def reset(self) -> None:
|
37
55
|
"""Reset state."""
|
38
|
-
self.
|
39
|
-
self._c = np.zeros((2, 1, 64)).astype("float32")
|
56
|
+
self._state = np.zeros((2, 1, 128)).astype("float32")
|
40
57
|
|
41
58
|
def __call__(self, audio: bytes) -> float:
|
42
|
-
"""Return probability of speech
|
59
|
+
"""Return probability of speech [0-1] in a single audio chunk.
|
43
60
|
|
44
|
-
Audio must be 16Khz 16-bit mono PCM.
|
61
|
+
Audio *must* be 512 samples of 16Khz 16-bit mono PCM.
|
62
|
+
"""
|
63
|
+
return self.process_chunk(audio)
|
64
|
+
|
65
|
+
def process_chunk(self, audio: bytes) -> float:
|
66
|
+
"""Return probability of speech [0-1] in a single audio chunk.
|
67
|
+
|
68
|
+
Audio *must* be 512 samples of 16Khz 16-bit mono PCM.
|
45
69
|
"""
|
70
|
+
if len(audio) != _CHUNK_BYTES:
|
71
|
+
# Window size is fixed at 512 samples in v5
|
72
|
+
raise InvalidChunkSizeError
|
73
|
+
|
46
74
|
audio_array = np.frombuffer(audio, dtype=np.int16).astype(np.float32) / _MAX_WAV
|
47
75
|
|
48
|
-
# Add batch dimension
|
49
|
-
audio_array = np.
|
76
|
+
# Add batch dimension and context
|
77
|
+
audio_array = np.concatenate(
|
78
|
+
(self._context, audio_array[np.newaxis, :]), axis=1
|
79
|
+
)
|
80
|
+
self._context = audio_array[:, -_CONTEXT_SIZE:]
|
50
81
|
|
82
|
+
# ort_inputs = {"input": audio_array, "state": self._state, "sr": self._sr}
|
51
83
|
ort_inputs = {
|
52
|
-
"input": audio_array,
|
53
|
-
"
|
54
|
-
"
|
55
|
-
"sr": np.array(_RATE, dtype=np.int64),
|
84
|
+
"input": audio_array[:, : _CHUNK_SAMPLES + _CONTEXT_SIZE],
|
85
|
+
"state": self._state,
|
86
|
+
"sr": self._sr,
|
56
87
|
}
|
57
88
|
ort_outs = self.session.run(None, ort_inputs)
|
58
|
-
out, self.
|
89
|
+
out, self._state = ort_outs
|
59
90
|
|
60
91
|
return out.squeeze()
|
92
|
+
|
93
|
+
def process_chunks(self, audio: bytes) -> Iterable[float]:
|
94
|
+
"""Return probability of speech in audio [0-1] for each chunk of audio.
|
95
|
+
|
96
|
+
Audio must be 16Khz 16-bit mono PCM.
|
97
|
+
"""
|
98
|
+
if len(audio) < _CHUNK_BYTES:
|
99
|
+
# Window size is fixed at 512 samples in v5
|
100
|
+
raise InvalidChunkSizeError
|
101
|
+
|
102
|
+
num_audio_bytes = len(audio)
|
103
|
+
audio_idx = 0
|
104
|
+
|
105
|
+
while (audio_idx + _CHUNK_BYTES) < num_audio_bytes:
|
106
|
+
yield self.process_chunk(audio[audio_idx : audio_idx + _CHUNK_BYTES])
|
107
|
+
audio_idx += _CHUNK_BYTES
|
pysilero_vad/models/silero_vad.onnx
Binary file
|
{pysilero_vad-1.0.0.dist-info → pysilero_vad-2.0.1.dist-info}/METADATA
@@ -1,25 +1,27 @@
|
|
1
|
-
Metadata-Version: 2.
|
2
|
-
Name:
|
3
|
-
Version:
|
1
|
+
Metadata-Version: 2.2
|
2
|
+
Name: pysilero_vad
|
3
|
+
Version: 2.0.1
|
4
4
|
Summary: Pre-packaged voice activity detector using silero-vad
|
5
|
-
|
6
|
-
Author: Michael Hansen
|
7
|
-
Author-email: mike@rhasspy.org
|
5
|
+
Author-email: Michael Hansen <mike@rhasspy.org>
|
8
6
|
License: MIT
|
9
|
-
|
7
|
+
Project-URL: Source Code, http://github.com/rhasspy/pysilero-vad
|
8
|
+
Keywords: voice,activity,vad
|
9
|
+
Platform: any
|
10
10
|
Classifier: Development Status :: 3 - Alpha
|
11
11
|
Classifier: Intended Audience :: Developers
|
12
|
-
Classifier: Topic ::
|
12
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
13
13
|
Classifier: License :: OSI Approved :: MIT License
|
14
|
-
Classifier: Programming Language :: Python :: 3.7
|
15
14
|
Classifier: Programming Language :: Python :: 3.8
|
16
15
|
Classifier: Programming Language :: Python :: 3.9
|
17
16
|
Classifier: Programming Language :: Python :: 3.10
|
18
17
|
Classifier: Programming Language :: Python :: 3.11
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
19
|
+
Classifier: Programming Language :: Python :: 3.13
|
20
|
+
Requires-Python: >=3.8.0
|
19
21
|
Description-Content-Type: text/markdown
|
20
22
|
License-File: LICENSE.md
|
21
|
-
Requires-Dist: onnxruntime
|
22
|
-
Requires-Dist: numpy
|
23
|
+
Requires-Dist: onnxruntime<2,>=1.18.0
|
24
|
+
Requires-Dist: numpy<2
|
23
25
|
|
24
26
|
# pySilero VAD
|
25
27
|
|
@@ -34,7 +36,10 @@ from pysilero_vad import SileroVoiceActivityDetector
|
|
34
36
|
|
35
37
|
vad = SileroVoiceActivityDetector()
|
36
38
|
|
37
|
-
# Audio must be 16Khz, 16-bit mono PCM
|
39
|
+
# Audio must be 16Khz, 16-bit mono PCM with correct chunk size
|
40
|
+
# See also: vad.chunk_samples()
|
41
|
+
assert len(audio_bytes) == vad.chunk_bytes()
|
42
|
+
|
38
43
|
if vad(audio_bytes) >= 0.5:
|
39
44
|
print("Speech")
|
40
45
|
else:
|
pysilero_vad-2.0.1.dist-info/RECORD
@@ -0,0 +1,9 @@
|
|
1
|
+
pysilero_vad/__init__.py,sha256=_QtP_z0JjpOkSHMaqRFuSI9Bf0oL-k8IJE0hmZdZDLk,3433
|
2
|
+
pysilero_vad/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
|
+
pysilero_vad/models/silero_vad.onnx,sha256=a5nL_Tkka2cG-Y7BPHxQxrKZGB8kdPoFy8gEaswnQ5Y,2313101
|
4
|
+
pysilero_vad-2.0.1.dist-info/LICENSE.md,sha256=E3RtUJ105V6iJl--8gS7fNv4SoMVsCB-mIMmy1Q4cCg,1071
|
5
|
+
pysilero_vad-2.0.1.dist-info/METADATA,sha256=l2Xc-Dw2iaRvnXJ0e9lb69cUGn4LJAamNx138UAQh_A,1410
|
6
|
+
pysilero_vad-2.0.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
7
|
+
pysilero_vad-2.0.1.dist-info/top_level.txt,sha256=QQlOVbq_uDMukkVxjBFRi8eOwSrzJDrbP8YY1MCeMIs,13
|
8
|
+
pysilero_vad-2.0.1.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
9
|
+
pysilero_vad-2.0.1.dist-info/RECORD,,
|
pysilero_vad-2.0.1.dist-info/zip-safe
@@ -0,0 +1 @@
|
|
1
|
+
|
pysilero_vad-1.0.0.dist-info/RECORD
@@ -1,8 +0,0 @@
|
|
1
|
-
pysilero_vad/__init__.py,sha256=k0kb-HkhJwqD_O5YsOAQhV0Zbk7gnAy5XRTy2iiLQXY,1708
|
2
|
-
pysilero_vad/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
|
-
pysilero_vad/models/silero_vad.onnx,sha256=o16_Uv085fFGmyo2FY26dhvEe5c-ozgrMYbKFbH1ryg,1807522
|
4
|
-
pysilero_vad-1.0.0.dist-info/LICENSE.md,sha256=E3RtUJ105V6iJl--8gS7fNv4SoMVsCB-mIMmy1Q4cCg,1071
|
5
|
-
pysilero_vad-1.0.0.dist-info/METADATA,sha256=-0F8V6kxyed4OmFwVuJkSZ9lbvnvnmEny0INFxmnHbQ,1219
|
6
|
-
pysilero_vad-1.0.0.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
|
7
|
-
pysilero_vad-1.0.0.dist-info/top_level.txt,sha256=QQlOVbq_uDMukkVxjBFRi8eOwSrzJDrbP8YY1MCeMIs,13
|
8
|
-
pysilero_vad-1.0.0.dist-info/RECORD,,
|
{pysilero_vad-1.0.0.dist-info → pysilero_vad-2.0.1.dist-info}/LICENSE.md
File without changes
|
{pysilero_vad-1.0.0.dist-info → pysilero_vad-2.0.1.dist-info}/top_level.txt
File without changes
|