pysilero-vad 2.0.1__tar.gz
Sign up to get free protection for your applications and to get access to all the features.
- pysilero_vad-2.0.1/LICENSE.md +21 -0
- pysilero_vad-2.0.1/PKG-INFO +48 -0
- pysilero_vad-2.0.1/README.md +23 -0
- pysilero_vad-2.0.1/pyproject.toml +43 -0
- pysilero_vad-2.0.1/pysilero_vad/__init__.py +107 -0
- pysilero_vad-2.0.1/pysilero_vad/models/silero_vad.onnx +0 -0
- pysilero_vad-2.0.1/pysilero_vad/py.typed +0 -0
- pysilero_vad-2.0.1/pysilero_vad.egg-info/PKG-INFO +48 -0
- pysilero_vad-2.0.1/pysilero_vad.egg-info/SOURCES.txt +14 -0
- pysilero_vad-2.0.1/pysilero_vad.egg-info/dependency_links.txt +1 -0
- pysilero_vad-2.0.1/pysilero_vad.egg-info/requires.txt +2 -0
- pysilero_vad-2.0.1/pysilero_vad.egg-info/top_level.txt +1 -0
- pysilero_vad-2.0.1/pysilero_vad.egg-info/zip-safe +1 -0
- pysilero_vad-2.0.1/setup.cfg +21 -0
- pysilero_vad-2.0.1/tests/test_vad.py +44 -0
@@ -0,0 +1,21 @@
|
|
1
|
+
MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2023 Michael Hansen
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
@@ -0,0 +1,48 @@
|
|
1
|
+
Metadata-Version: 2.2
|
2
|
+
Name: pysilero_vad
|
3
|
+
Version: 2.0.1
|
4
|
+
Summary: Pre-packaged voice activity detector using silero-vad
|
5
|
+
Author-email: Michael Hansen <mike@rhasspy.org>
|
6
|
+
License: MIT
|
7
|
+
Project-URL: Source Code, http://github.com/rhasspy/pysilero-vad
|
8
|
+
Keywords: voice,activity,vad
|
9
|
+
Platform: any
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
11
|
+
Classifier: Intended Audience :: Developers
|
12
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
14
|
+
Classifier: Programming Language :: Python :: 3.8
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
19
|
+
Classifier: Programming Language :: Python :: 3.13
|
20
|
+
Requires-Python: >=3.8.0
|
21
|
+
Description-Content-Type: text/markdown
|
22
|
+
License-File: LICENSE.md
|
23
|
+
Requires-Dist: onnxruntime<2,>=1.18.0
|
24
|
+
Requires-Dist: numpy<2
|
25
|
+
|
26
|
+
# pySilero VAD
|
27
|
+
|
28
|
+
A pre-packaged voice activity detector using [silero-vad](https://github.com/snakers4/silero-vad).
|
29
|
+
|
30
|
+
``` sh
|
31
|
+
pip install pysilero-vad
|
32
|
+
```
|
33
|
+
|
34
|
+
``` python
|
35
|
+
from pysilero_vad import SileroVoiceActivityDetector
|
36
|
+
|
37
|
+
vad = SileroVoiceActivityDetector()
|
38
|
+
|
39
|
+
# Audio must be 16 kHz, 16-bit mono PCM with correct chunk size
|
40
|
+
# See also: vad.chunk_samples()
|
41
|
+
assert len(audio_bytes) == vad.chunk_bytes()
|
42
|
+
|
43
|
+
if vad(audio_bytes) >= 0.5:
|
44
|
+
print("Speech")
|
45
|
+
else:
|
46
|
+
print("Silence")
|
47
|
+
```
|
48
|
+
|
@@ -0,0 +1,23 @@
|
|
1
|
+
# pySilero VAD
|
2
|
+
|
3
|
+
A pre-packaged voice activity detector using [silero-vad](https://github.com/snakers4/silero-vad).
|
4
|
+
|
5
|
+
``` sh
|
6
|
+
pip install pysilero-vad
|
7
|
+
```
|
8
|
+
|
9
|
+
``` python
|
10
|
+
from pysilero_vad import SileroVoiceActivityDetector
|
11
|
+
|
12
|
+
vad = SileroVoiceActivityDetector()
|
13
|
+
|
14
|
+
# Audio must be 16 kHz, 16-bit mono PCM with correct chunk size
|
15
|
+
# See also: vad.chunk_samples()
|
16
|
+
assert len(audio_bytes) == vad.chunk_bytes()
|
17
|
+
|
18
|
+
if vad(audio_bytes) >= 0.5:
|
19
|
+
print("Speech")
|
20
|
+
else:
|
21
|
+
print("Silence")
|
22
|
+
```
|
23
|
+
|
@@ -0,0 +1,43 @@
|
|
1
|
+
[build-system]
|
2
|
+
requires = ["setuptools>=62.3"]
|
3
|
+
build-backend = "setuptools.build_meta"
|
4
|
+
|
5
|
+
[project]
|
6
|
+
name = "pysilero_vad"
|
7
|
+
version = "2.0.1"
|
8
|
+
license = {text = "MIT"}
|
9
|
+
description = "Pre-packaged voice activity detector using silero-vad"
|
10
|
+
readme = "README.md"
|
11
|
+
authors = [
|
12
|
+
{name = "Michael Hansen", email = "mike@rhasspy.org"}
|
13
|
+
]
|
14
|
+
keywords = ["voice", "activity", "vad"]
|
15
|
+
classifiers = [
|
16
|
+
"Development Status :: 3 - Alpha",
|
17
|
+
"Intended Audience :: Developers",
|
18
|
+
"Topic :: Text Processing :: Linguistic",
|
19
|
+
"License :: OSI Approved :: MIT License",
|
20
|
+
"Programming Language :: Python :: 3.8",
|
21
|
+
"Programming Language :: Python :: 3.9",
|
22
|
+
"Programming Language :: Python :: 3.10",
|
23
|
+
"Programming Language :: Python :: 3.11",
|
24
|
+
"Programming Language :: Python :: 3.12",
|
25
|
+
"Programming Language :: Python :: 3.13",
|
26
|
+
]
|
27
|
+
requires-python = ">=3.8.0"
|
28
|
+
dependencies = [
|
29
|
+
"onnxruntime>=1.18.0,<2",
|
30
|
+
"numpy<2"
|
31
|
+
]
|
32
|
+
|
33
|
+
[project.urls]
|
34
|
+
"Source Code" = "http://github.com/rhasspy/pysilero-vad"
|
35
|
+
|
36
|
+
[tool.setuptools]
|
37
|
+
platforms = ["any"]
|
38
|
+
zip-safe = true
|
39
|
+
include-package-data = true
|
40
|
+
|
41
|
+
[tool.setuptools.packages.find]
|
42
|
+
include = ["pysilero_vad"]
|
43
|
+
exclude = ["tests", "tests.*"]
|
@@ -0,0 +1,107 @@
|
|
1
|
+
import logging
|
2
|
+
from pathlib import Path
|
3
|
+
from typing import Final, Iterable, Union
|
4
|
+
|
5
|
+
import numpy as np
|
6
|
+
import onnxruntime
|
7
|
+
|
8
|
+
_RATE: Final = 16000  # Hz
_MAX_WAV: Final = 32767  # largest signed 16-bit sample value; divisor for scaling PCM
_DIR = Path(__file__).parent
_DEFAULT_ONNX_PATH = _DIR / "models" / "silero_vad.onnx"
_CONTEXT_SIZE: Final = 64  # samples of preceding audio prepended to each chunk (16 kHz)
_CHUNK_SAMPLES: Final = 512
_CHUNK_BYTES: Final = _CHUNK_SAMPLES * 2  # 16-bit
_STATE_SHAPE: Final = (2, 1, 128)  # shape of the state array passed to/returned by the model

# Use a module-level logger so applications can configure this package separately
# from the root logger.
_LOGGER = logging.getLogger(__name__)


class InvalidChunkSizeError(Exception):
    """Error raised when chunk size is not correct."""


class SileroVoiceActivityDetector:
    """Detects speech/silence using Silero VAD.

    The detector is stateful: the model state returned by each inference and the
    last ``_CONTEXT_SIZE`` samples of the previous chunk are kept and fed into
    the next call. Call :meth:`reset` before starting a new audio stream.

    https://github.com/snakers4/silero-vad
    """

    def __init__(self, onnx_path: Union[str, Path] = _DEFAULT_ONNX_PATH) -> None:
        """Load the ONNX model and initialise the streaming state.

        Args:
            onnx_path: Path to the Silero VAD ONNX model. Defaults to the model
                bundled with this package.
        """
        onnx_path = str(onnx_path)

        # Run inference on a single CPU thread.
        opts = onnxruntime.SessionOptions()
        opts.inter_op_num_threads = 1
        opts.intra_op_num_threads = 1

        self.session = onnxruntime.InferenceSession(
            onnx_path, providers=["CPUExecutionProvider"], sess_options=opts
        )

        self._sr = np.array(_RATE, dtype=np.int64)
        self.reset()

    @staticmethod
    def chunk_samples() -> int:
        """Return number of samples required for an audio chunk."""
        return _CHUNK_SAMPLES

    @staticmethod
    def chunk_bytes() -> int:
        """Return number of bytes required for an audio chunk."""
        return _CHUNK_BYTES

    def reset(self) -> None:
        """Reset the model state and the audio context.

        Both are cleared so that no audio from a previous stream influences
        the first chunk of the next one.
        """
        self._context = np.zeros((1, _CONTEXT_SIZE), dtype=np.float32)
        self._state = np.zeros(_STATE_SHAPE, dtype=np.float32)

    def __call__(self, audio: bytes) -> float:
        """Return probability of speech [0-1] in a single audio chunk.

        Audio *must* be 512 samples of 16 kHz 16-bit mono PCM.
        """
        return self.process_chunk(audio)

    def process_chunk(self, audio: bytes) -> float:
        """Return probability of speech [0-1] in a single audio chunk.

        Audio *must* be 512 samples of 16 kHz 16-bit mono PCM.

        Raises:
            InvalidChunkSizeError: If ``audio`` is not exactly
                ``chunk_bytes()`` bytes long.
        """
        if len(audio) != _CHUNK_BYTES:
            # Window size is fixed at 512 samples in v5
            raise InvalidChunkSizeError(
                f"Expected {_CHUNK_BYTES} bytes of audio, got {len(audio)}"
            )

        samples = np.frombuffer(audio, dtype=np.int16).astype(np.float32) / _MAX_WAV

        # Add batch dimension and prepend the tail of the previous chunk.
        model_input = np.concatenate((self._context, samples[np.newaxis, :]), axis=1)

        # Remember the tail of this chunk for the next call.
        self._context = model_input[:, -_CONTEXT_SIZE:]

        ort_inputs = {
            "input": model_input,
            "state": self._state,
            "sr": self._sr,
        }
        out, self._state = self.session.run(None, ort_inputs)

        # Convert the single-element output array to a plain Python float.
        return float(out.squeeze())

    def process_chunks(self, audio: bytes) -> Iterable[float]:
        """Return probability of speech in audio [0-1] for each chunk of audio.

        Audio must be 16 kHz 16-bit mono PCM. Every complete chunk of
        ``chunk_bytes()`` bytes is processed in order; a trailing partial chunk
        is ignored.

        Raises:
            InvalidChunkSizeError: If ``audio`` is shorter than one chunk
                (raised when iteration starts).
        """
        num_audio_bytes = len(audio)
        if num_audio_bytes < _CHUNK_BYTES:
            # Window size is fixed at 512 samples in v5
            raise InvalidChunkSizeError(
                f"Expected at least {_CHUNK_BYTES} bytes of audio, got {num_audio_bytes}"
            )

        # The last valid start offset is inclusive, so audio that is an exact
        # multiple of the chunk size has its final chunk processed too.
        last_start = num_audio_bytes - _CHUNK_BYTES
        for start in range(0, last_start + 1, _CHUNK_BYTES):
            yield self.process_chunk(audio[start : start + _CHUNK_BYTES])
|
Binary file
|
File without changes
|
@@ -0,0 +1,48 @@
|
|
1
|
+
Metadata-Version: 2.2
|
2
|
+
Name: pysilero_vad
|
3
|
+
Version: 2.0.1
|
4
|
+
Summary: Pre-packaged voice activity detector using silero-vad
|
5
|
+
Author-email: Michael Hansen <mike@rhasspy.org>
|
6
|
+
License: MIT
|
7
|
+
Project-URL: Source Code, http://github.com/rhasspy/pysilero-vad
|
8
|
+
Keywords: voice,activity,vad
|
9
|
+
Platform: any
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
11
|
+
Classifier: Intended Audience :: Developers
|
12
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
14
|
+
Classifier: Programming Language :: Python :: 3.8
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
19
|
+
Classifier: Programming Language :: Python :: 3.13
|
20
|
+
Requires-Python: >=3.8.0
|
21
|
+
Description-Content-Type: text/markdown
|
22
|
+
License-File: LICENSE.md
|
23
|
+
Requires-Dist: onnxruntime<2,>=1.18.0
|
24
|
+
Requires-Dist: numpy<2
|
25
|
+
|
26
|
+
# pySilero VAD
|
27
|
+
|
28
|
+
A pre-packaged voice activity detector using [silero-vad](https://github.com/snakers4/silero-vad).
|
29
|
+
|
30
|
+
``` sh
|
31
|
+
pip install pysilero-vad
|
32
|
+
```
|
33
|
+
|
34
|
+
``` python
|
35
|
+
from pysilero_vad import SileroVoiceActivityDetector
|
36
|
+
|
37
|
+
vad = SileroVoiceActivityDetector()
|
38
|
+
|
39
|
+
# Audio must be 16 kHz, 16-bit mono PCM with correct chunk size
|
40
|
+
# See also: vad.chunk_samples()
|
41
|
+
assert len(audio_bytes) == vad.chunk_bytes()
|
42
|
+
|
43
|
+
if vad(audio_bytes) >= 0.5:
|
44
|
+
print("Speech")
|
45
|
+
else:
|
46
|
+
print("Silence")
|
47
|
+
```
|
48
|
+
|
@@ -0,0 +1,14 @@
|
|
1
|
+
LICENSE.md
|
2
|
+
README.md
|
3
|
+
pyproject.toml
|
4
|
+
setup.cfg
|
5
|
+
pysilero_vad/__init__.py
|
6
|
+
pysilero_vad/py.typed
|
7
|
+
pysilero_vad.egg-info/PKG-INFO
|
8
|
+
pysilero_vad.egg-info/SOURCES.txt
|
9
|
+
pysilero_vad.egg-info/dependency_links.txt
|
10
|
+
pysilero_vad.egg-info/requires.txt
|
11
|
+
pysilero_vad.egg-info/top_level.txt
|
12
|
+
pysilero_vad.egg-info/zip-safe
|
13
|
+
pysilero_vad/models/silero_vad.onnx
|
14
|
+
tests/test_vad.py
|
@@ -0,0 +1 @@
|
|
1
|
+
|
@@ -0,0 +1 @@
|
|
1
|
+
pysilero_vad
|
@@ -0,0 +1 @@
|
|
1
|
+
|
@@ -0,0 +1,21 @@
|
|
1
|
+
[flake8]
|
2
|
+
max-line-length = 88
|
3
|
+
ignore =
|
4
|
+
E501,
|
5
|
+
W503,
|
6
|
+
E203,
|
7
|
+
D202,
|
8
|
+
W504
|
9
|
+
|
10
|
+
[isort]
|
11
|
+
multi_line_output = 3
|
12
|
+
include_trailing_comma = True
|
13
|
+
force_grid_wrap = 0
|
14
|
+
use_parentheses = True
|
15
|
+
line_length = 88
|
16
|
+
indent = " "
|
17
|
+
|
18
|
+
[egg_info]
|
19
|
+
tag_build =
|
20
|
+
tag_date = 0
|
21
|
+
|
@@ -0,0 +1,44 @@
|
|
1
|
+
import wave
|
2
|
+
from pathlib import Path
|
3
|
+
from typing import Union
|
4
|
+
|
5
|
+
import pytest
|
6
|
+
from pysilero_vad import SileroVoiceActivityDetector, InvalidChunkSizeError
|
7
|
+
|
8
|
+
_DIR = Path(__file__).parent


def _load_wav(wav_path: Union[str, Path]) -> bytes:
    """Read all frames of a 16 kHz, 16-bit, mono WAV file as raw PCM bytes.

    Fails an assertion if the file's sample rate, sample width or channel
    count differs from that format.
    """
    with wave.open(str(wav_path), "rb") as reader:
        sample_rate = reader.getframerate()
        assert sample_rate == 16000

        sample_width = reader.getsampwidth()
        assert sample_width == 2

        channels = reader.getnchannels()
        assert channels == 1

        frame_count = reader.getnframes()
        return reader.readframes(frame_count)
|
19
|
+
|
20
|
+
|
21
|
+
def test_silence() -> None:
    """Test VAD on recorded silence."""
    vad = SileroVoiceActivityDetector()
    probabilities = list(vad.process_chunks(_load_wav(_DIR / "silence.wav")))

    # all() is vacuously true for an empty sequence, so first make sure the
    # detector actually produced at least one probability.
    assert probabilities
    assert all(p < 0.5 for p in probabilities)
|
25
|
+
|
26
|
+
|
27
|
+
def test_speech() -> None:
    """Recorded speech must be detected in at least one chunk."""
    detector = SileroVoiceActivityDetector()
    speech_audio = _load_wav(_DIR / "speech.wav")
    assert any(prob >= 0.5 for prob in detector.process_chunks(speech_audio))
|
31
|
+
|
32
|
+
def test_invalid_chunk_size() -> None:
    """Only chunks of exactly ``chunk_bytes()`` bytes are accepted."""
    detector = SileroVoiceActivityDetector()
    expected_size = SileroVoiceActivityDetector.chunk_bytes()

    # A correctly sized chunk is processed without raising.
    detector(bytes(expected_size))

    # A chunk twice as large and a chunk half as large are both rejected.
    for wrong_size in (expected_size * 2, expected_size // 2):
        with pytest.raises(InvalidChunkSizeError):
            detector(bytes(wrong_size))
|