pysilero-vad 1.0.0__py3-none-any.whl → 2.0.1__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- pysilero_vad/__init__.py +62 -15
- pysilero_vad/models/silero_vad.onnx +0 -0
- {pysilero_vad-1.0.0.dist-info → pysilero_vad-2.0.1.dist-info}/METADATA +17 -12
- pysilero_vad-2.0.1.dist-info/RECORD +9 -0
- {pysilero_vad-1.0.0.dist-info → pysilero_vad-2.0.1.dist-info}/WHEEL +1 -1
- pysilero_vad-2.0.1.dist-info/zip-safe +1 -0
- pysilero_vad-1.0.0.dist-info/RECORD +0 -8
- {pysilero_vad-1.0.0.dist-info → pysilero_vad-2.0.1.dist-info}/LICENSE.md +0 -0
- {pysilero_vad-1.0.0.dist-info → pysilero_vad-2.0.1.dist-info}/top_level.txt +0 -0
pysilero_vad/__init__.py
CHANGED
@@ -1,18 +1,25 @@
|
|
1
1
|
import logging
|
2
2
|
from pathlib import Path
|
3
|
-
from typing import Final, Union
|
3
|
+
from typing import Final, Iterable, Union
|
4
4
|
|
5
5
|
import numpy as np
|
6
6
|
import onnxruntime
|
7
7
|
|
8
|
-
_RATE: Final = 16000
|
8
|
+
_RATE: Final = 16000 # Hz (16 kHz sample rate)
|
9
9
|
_MAX_WAV: Final = 32767
|
10
10
|
_DIR = Path(__file__).parent
|
11
11
|
_DEFAULT_ONNX_PATH = _DIR / "models" / "silero_vad.onnx"
|
12
|
+
_CONTEXT_SIZE: Final = 64 # samples of previous audio prepended to each chunk (16 kHz)
|
13
|
+
_CHUNK_SAMPLES: Final = 512
|
14
|
+
_CHUNK_BYTES: Final = _CHUNK_SAMPLES * 2 # 16-bit
|
12
15
|
|
13
16
|
_LOGGER = logging.getLogger()
|
14
17
|
|
15
18
|
|
19
|
+
class InvalidChunkSizeError(Exception):
|
20
|
+
"""Error raised when chunk size is not correct."""
|
21
|
+
|
22
|
+
|
16
23
|
class SileroVoiceActivityDetector:
|
17
24
|
"""Detects speech/silence using Silero VAD.
|
18
25
|
|
@@ -30,31 +37,71 @@ class SileroVoiceActivityDetector:
|
|
30
37
|
onnx_path, providers=["CPUExecutionProvider"], sess_options=opts
|
31
38
|
)
|
32
39
|
|
33
|
-
self.
|
34
|
-
self.
|
40
|
+
self._context = np.zeros((1, _CONTEXT_SIZE), dtype=np.float32)
|
41
|
+
self._state = np.zeros((2, 1, 128), dtype=np.float32)
|
42
|
+
self._sr = np.array(_RATE, dtype=np.int64)
|
43
|
+
|
44
|
+
@staticmethod
|
45
|
+
def chunk_samples() -> int:
|
46
|
+
"""Return number of samples required for an audio chunk."""
|
47
|
+
return _CHUNK_SAMPLES
|
48
|
+
|
49
|
+
@staticmethod
|
50
|
+
def chunk_bytes() -> int:
|
51
|
+
"""Return number of bytes required for an audio chunk."""
|
52
|
+
return _CHUNK_BYTES
|
35
53
|
|
36
54
|
def reset(self) -> None:
|
37
55
|
"""Reset state."""
|
38
|
-
self.
|
39
|
-
self._c = np.zeros((2, 1, 64)).astype("float32")
|
56
|
+
self._state = np.zeros((2, 1, 128)).astype("float32")
|
40
57
|
|
41
58
|
def __call__(self, audio: bytes) -> float:
|
42
|
-
"""Return probability of speech
|
59
|
+
"""Return probability of speech [0-1] in a single audio chunk.
|
43
60
|
|
44
|
-
Audio must be 16Khz 16-bit mono PCM.
|
61
|
+
Audio *must* be 512 samples of 16 kHz 16-bit mono PCM.
|
62
|
+
"""
|
63
|
+
return self.process_chunk(audio)
|
64
|
+
|
65
|
+
def process_chunk(self, audio: bytes) -> float:
|
66
|
+
"""Return probability of speech [0-1] in a single audio chunk.
|
67
|
+
|
68
|
+
Audio *must* be 512 samples of 16 kHz 16-bit mono PCM.
|
45
69
|
"""
|
70
|
+
if len(audio) != _CHUNK_BYTES:
|
71
|
+
# Window size is fixed at 512 samples in v5
|
72
|
+
raise InvalidChunkSizeError
|
73
|
+
|
46
74
|
audio_array = np.frombuffer(audio, dtype=np.int16).astype(np.float32) / _MAX_WAV
|
47
75
|
|
48
|
-
# Add batch dimension
|
49
|
-
audio_array = np.
|
76
|
+
# Add batch dimension and context
|
77
|
+
audio_array = np.concatenate(
|
78
|
+
(self._context, audio_array[np.newaxis, :]), axis=1
|
79
|
+
)
|
80
|
+
self._context = audio_array[:, -_CONTEXT_SIZE:]
|
50
81
|
|
82
|
+
# ort_inputs = {"input": audio_array, "state": self._state, "sr": self._sr}
|
51
83
|
ort_inputs = {
|
52
|
-
"input": audio_array,
|
53
|
-
"
|
54
|
-
"
|
55
|
-
"sr": np.array(_RATE, dtype=np.int64),
|
84
|
+
"input": audio_array[:, : _CHUNK_SAMPLES + _CONTEXT_SIZE],
|
85
|
+
"state": self._state,
|
86
|
+
"sr": self._sr,
|
56
87
|
}
|
57
88
|
ort_outs = self.session.run(None, ort_inputs)
|
58
|
-
out, self.
|
89
|
+
out, self._state = ort_outs
|
59
90
|
|
60
91
|
return out.squeeze()
|
92
|
+
|
93
|
+
def process_chunks(self, audio: bytes) -> Iterable[float]:
|
94
|
+
"""Return probability of speech [0-1] for each 512-sample chunk of audio.
|
95
|
+
|
96
|
+
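Audio must be 16 kHz 16-bit mono PCM and at least one 512-sample chunk long; trailing bytes that do not fill a whole chunk are not processed. NOTE(review): the loop condition uses `<`, so the final chunk of exactly chunk-aligned audio (including a single 512-sample chunk) is not yielded; confirm whether `<=` was intended.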
|
97
|
+
"""
|
98
|
+
if len(audio) < _CHUNK_BYTES:
|
99
|
+
# Window size is fixed at 512 samples in v5
|
100
|
+
raise InvalidChunkSizeError
|
101
|
+
|
102
|
+
num_audio_bytes = len(audio)
|
103
|
+
audio_idx = 0
|
104
|
+
|
105
|
+
while (audio_idx + _CHUNK_BYTES) < num_audio_bytes:
|
106
|
+
yield self.process_chunk(audio[audio_idx : audio_idx + _CHUNK_BYTES])
|
107
|
+
audio_idx += _CHUNK_BYTES
|
Binary file
|
@@ -1,25 +1,27 @@
|
|
1
|
-
Metadata-Version: 2.
|
2
|
-
Name:
|
3
|
-
Version:
|
1
|
+
Metadata-Version: 2.2
|
2
|
+
Name: pysilero_vad
|
3
|
+
Version: 2.0.1
|
4
4
|
Summary: Pre-packaged voice activity detector using silero-vad
|
5
|
-
|
6
|
-
Author: Michael Hansen
|
7
|
-
Author-email: mike@rhasspy.org
|
5
|
+
Author-email: Michael Hansen <mike@rhasspy.org>
|
8
6
|
License: MIT
|
9
|
-
|
7
|
+
Project-URL: Source Code, http://github.com/rhasspy/pysilero-vad
|
8
|
+
Keywords: voice,activity,vad
|
9
|
+
Platform: any
|
10
10
|
Classifier: Development Status :: 3 - Alpha
|
11
11
|
Classifier: Intended Audience :: Developers
|
12
|
-
Classifier: Topic ::
|
12
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
13
13
|
Classifier: License :: OSI Approved :: MIT License
|
14
|
-
Classifier: Programming Language :: Python :: 3.7
|
15
14
|
Classifier: Programming Language :: Python :: 3.8
|
16
15
|
Classifier: Programming Language :: Python :: 3.9
|
17
16
|
Classifier: Programming Language :: Python :: 3.10
|
18
17
|
Classifier: Programming Language :: Python :: 3.11
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
19
|
+
Classifier: Programming Language :: Python :: 3.13
|
20
|
+
Requires-Python: >=3.8.0
|
19
21
|
Description-Content-Type: text/markdown
|
20
22
|
License-File: LICENSE.md
|
21
|
-
Requires-Dist: onnxruntime
|
22
|
-
Requires-Dist: numpy
|
23
|
+
Requires-Dist: onnxruntime<2,>=1.18.0
|
24
|
+
Requires-Dist: numpy<2
|
23
25
|
|
24
26
|
# pySilero VAD
|
25
27
|
|
@@ -34,7 +36,10 @@ from pysilero_vad import SileroVoiceActivityDetector
|
|
34
36
|
|
35
37
|
vad = SileroVoiceActivityDetector()
|
36
38
|
|
37
|
-
# Audio must be 16Khz, 16-bit mono PCM
|
39
|
+
# Audio must be 16 kHz, 16-bit mono PCM with the correct chunk size
|
40
|
+
# See also: vad.chunk_samples()
|
41
|
+
assert len(audio_bytes) == vad.chunk_bytes()
|
42
|
+
|
38
43
|
if vad(audio_bytes) >= 0.5:
|
39
44
|
print("Speech")
|
40
45
|
else:
|
@@ -0,0 +1,9 @@
|
|
1
|
+
pysilero_vad/__init__.py,sha256=_QtP_z0JjpOkSHMaqRFuSI9Bf0oL-k8IJE0hmZdZDLk,3433
|
2
|
+
pysilero_vad/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
|
+
pysilero_vad/models/silero_vad.onnx,sha256=a5nL_Tkka2cG-Y7BPHxQxrKZGB8kdPoFy8gEaswnQ5Y,2313101
|
4
|
+
pysilero_vad-2.0.1.dist-info/LICENSE.md,sha256=E3RtUJ105V6iJl--8gS7fNv4SoMVsCB-mIMmy1Q4cCg,1071
|
5
|
+
pysilero_vad-2.0.1.dist-info/METADATA,sha256=l2Xc-Dw2iaRvnXJ0e9lb69cUGn4LJAamNx138UAQh_A,1410
|
6
|
+
pysilero_vad-2.0.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
7
|
+
pysilero_vad-2.0.1.dist-info/top_level.txt,sha256=QQlOVbq_uDMukkVxjBFRi8eOwSrzJDrbP8YY1MCeMIs,13
|
8
|
+
pysilero_vad-2.0.1.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
9
|
+
pysilero_vad-2.0.1.dist-info/RECORD,,
|
@@ -0,0 +1 @@
|
|
1
|
+
|
@@ -1,8 +0,0 @@
|
|
1
|
-
pysilero_vad/__init__.py,sha256=k0kb-HkhJwqD_O5YsOAQhV0Zbk7gnAy5XRTy2iiLQXY,1708
|
2
|
-
pysilero_vad/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
|
-
pysilero_vad/models/silero_vad.onnx,sha256=o16_Uv085fFGmyo2FY26dhvEe5c-ozgrMYbKFbH1ryg,1807522
|
4
|
-
pysilero_vad-1.0.0.dist-info/LICENSE.md,sha256=E3RtUJ105V6iJl--8gS7fNv4SoMVsCB-mIMmy1Q4cCg,1071
|
5
|
-
pysilero_vad-1.0.0.dist-info/METADATA,sha256=-0F8V6kxyed4OmFwVuJkSZ9lbvnvnmEny0INFxmnHbQ,1219
|
6
|
-
pysilero_vad-1.0.0.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
|
7
|
-
pysilero_vad-1.0.0.dist-info/top_level.txt,sha256=QQlOVbq_uDMukkVxjBFRi8eOwSrzJDrbP8YY1MCeMIs,13
|
8
|
-
pysilero_vad-1.0.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|