pysilero-vad 2.0.1__tar.gz

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Michael Hansen
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,48 @@
1
+ Metadata-Version: 2.2
2
+ Name: pysilero_vad
3
+ Version: 2.0.1
4
+ Summary: Pre-packaged voice activity detector using silero-vad
5
+ Author-email: Michael Hansen <mike@rhasspy.org>
6
+ License: MIT
7
+ Project-URL: Source Code, http://github.com/rhasspy/pysilero-vad
8
+ Keywords: voice,activity,vad
9
+ Platform: any
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Topic :: Text Processing :: Linguistic
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3.8
15
+ Classifier: Programming Language :: Python :: 3.9
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Programming Language :: Python :: 3.13
20
+ Requires-Python: >=3.8.0
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE.md
23
+ Requires-Dist: onnxruntime<2,>=1.18.0
24
+ Requires-Dist: numpy<2
25
+
26
+ # pySilero VAD
27
+
28
+ A pre-packaged voice activity detector using [silero-vad](https://github.com/snakers4/silero-vad).
29
+
30
+ ``` sh
31
+ pip install pysilero-vad
32
+ ```
33
+
34
+ ``` python
35
+ from pysilero_vad import SileroVoiceActivityDetector
36
+
37
+ vad = SileroVoiceActivityDetector()
38
+
39
+ # Audio must be 16Khz, 16-bit mono PCM with correct chunk size
40
+ # See also: vad.chunk_samples()
41
+ assert len(audio_bytes) == vad.chunk_bytes()
42
+
43
+ if vad(audio_bytes) >= 0.5:
44
+ print("Speech")
45
+ else:
46
+ print("Silence")
47
+ ```
48
+
@@ -0,0 +1,23 @@
1
+ # pySilero VAD
2
+
3
+ A pre-packaged voice activity detector using [silero-vad](https://github.com/snakers4/silero-vad).
4
+
5
+ ``` sh
6
+ pip install pysilero-vad
7
+ ```
8
+
9
+ ``` python
10
+ from pysilero_vad import SileroVoiceActivityDetector
11
+
12
+ vad = SileroVoiceActivityDetector()
13
+
14
+ # Audio must be 16 kHz, 16-bit mono PCM with the correct chunk size
15
+ # See also: vad.chunk_samples()
16
+ assert len(audio_bytes) == vad.chunk_bytes()
17
+
18
+ if vad(audio_bytes) >= 0.5:
19
+ print("Speech")
20
+ else:
21
+ print("Silence")
22
+ ```
23
+
@@ -0,0 +1,43 @@
1
+ [build-system]
2
+ requires = ["setuptools>=62.3"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "pysilero_vad"
7
+ version = "2.0.1"
8
+ license = {text = "MIT"}
9
+ description = "Pre-packaged voice activity detector using silero-vad"
10
+ readme = "README.md"
11
+ authors = [
12
+ {name = "Michael Hansen", email = "mike@rhasspy.org"}
13
+ ]
14
+ keywords = ["voice", "activity", "vad"]
15
+ classifiers = [
16
+ "Development Status :: 3 - Alpha",
17
+ "Intended Audience :: Developers",
18
+ "Topic :: Text Processing :: Linguistic",
19
+ "License :: OSI Approved :: MIT License",
20
+ "Programming Language :: Python :: 3.8",
21
+ "Programming Language :: Python :: 3.9",
22
+ "Programming Language :: Python :: 3.10",
23
+ "Programming Language :: Python :: 3.11",
24
+ "Programming Language :: Python :: 3.12",
25
+ "Programming Language :: Python :: 3.13",
26
+ ]
27
+ requires-python = ">=3.8.0"
28
+ dependencies = [
29
+ "onnxruntime>=1.18.0,<2",
30
+ "numpy<2"
31
+ ]
32
+
33
+ [project.urls]
34
+ "Source Code" = "http://github.com/rhasspy/pysilero-vad"
35
+
36
+ [tool.setuptools]
37
+ platforms = ["any"]
38
+ zip-safe = true
39
+ include-package-data = true
40
+
41
+ [tool.setuptools.packages.find]
42
+ include = ["pysilero_vad"]
43
+ exclude = ["tests", "tests.*"]
@@ -0,0 +1,107 @@
1
+ import logging
2
+ from pathlib import Path
3
+ from typing import Final, Iterable, Union
4
+
5
+ import numpy as np
6
+ import onnxruntime
7
+
8
# Sample rate, in Hz, that is passed to the model as its "sr" input.
_RATE: Final = 16000  # Hz
# Largest positive 16-bit signed sample; int16 PCM is divided by this value
# to scale it into floating point before inference.
_MAX_WAV: Final = 32767
_DIR = Path(__file__).parent
# Model file bundled with the package.
_DEFAULT_ONNX_PATH = _DIR / "models" / "silero_vad.onnx"
# Number of trailing samples from the previous model input that are prepended
# to the next chunk (value used for 16 kHz audio).
_CONTEXT_SIZE: Final = 64
# Fixed window size, in samples, of a single audio chunk.
_CHUNK_SAMPLES: Final = 512
_CHUNK_BYTES: Final = _CHUNK_SAMPLES * 2  # 16-bit samples => 2 bytes each

# Module-scoped logger (not the root logger) so applications can configure
# this package's logging independently.
_LOGGER = logging.getLogger(__name__)
17
+
18
+
19
class InvalidChunkSizeError(Exception):
    """Signals that an audio buffer does not have the required chunk size."""
21
+
22
+
23
class SileroVoiceActivityDetector:
    """Detects speech/silence using Silero VAD.

    https://github.com/snakers4/silero-vad

    The detector is stateful: every processed chunk updates both the model
    state returned by the ONNX session and a short window of trailing audio
    samples (the "context") that is prepended to the next chunk. Call
    :meth:`reset` before feeding audio from an unrelated stream.
    """

    def __init__(self, onnx_path: Union[str, Path] = _DEFAULT_ONNX_PATH) -> None:
        """Load the ONNX model and initialise the streaming state.

        Args:
            onnx_path: Path to the Silero VAD ONNX model. Defaults to the
                model file bundled with this package.
        """
        opts = onnxruntime.SessionOptions()
        # Keep inference single-threaded.
        opts.inter_op_num_threads = 1
        opts.intra_op_num_threads = 1

        self.session = onnxruntime.InferenceSession(
            str(onnx_path), providers=["CPUExecutionProvider"], sess_options=opts
        )

        # Trailing samples of the previous model input, prepended to each chunk.
        self._context = np.zeros((1, _CONTEXT_SIZE), dtype=np.float32)
        # State tensor that is fed to the model and replaced by its output.
        self._state = np.zeros((2, 1, 128), dtype=np.float32)
        # Sample rate input of the model.
        self._sr = np.array(_RATE, dtype=np.int64)

    @staticmethod
    def chunk_samples() -> int:
        """Return number of samples required for an audio chunk."""
        return _CHUNK_SAMPLES

    @staticmethod
    def chunk_bytes() -> int:
        """Return number of bytes required for an audio chunk."""
        return _CHUNK_BYTES

    def reset(self) -> None:
        """Reset the streaming state.

        Clears both the model state and the audio context so that samples
        from a previous stream cannot influence the next prediction.
        """
        self._context = np.zeros((1, _CONTEXT_SIZE), dtype=np.float32)
        self._state = np.zeros((2, 1, 128), dtype=np.float32)

    def __call__(self, audio: bytes) -> float:
        """Return probability of speech [0-1] in a single audio chunk.

        Audio *must* be 512 samples of 16 kHz 16-bit mono PCM.
        See :meth:`process_chunk`.
        """
        return self.process_chunk(audio)

    def process_chunk(self, audio: bytes) -> float:
        """Return probability of speech [0-1] in a single audio chunk.

        Audio *must* be 512 samples of 16 kHz 16-bit mono PCM.

        Args:
            audio: Exactly ``chunk_bytes()`` bytes of raw PCM audio.

        Returns:
            The model output for this chunk as a Python float.

        Raises:
            InvalidChunkSizeError: If ``audio`` is not exactly one chunk long.
        """
        if len(audio) != _CHUNK_BYTES:
            # Window size is fixed at 512 samples in v5
            raise InvalidChunkSizeError(
                f"Expected {_CHUNK_BYTES} bytes ({_CHUNK_SAMPLES} samples), "
                f"got {len(audio)} bytes"
            )

        # Scale 16-bit integer samples into floating point.
        samples = np.frombuffer(audio, dtype=np.int16).astype(np.float32) / _MAX_WAV

        # Add batch dimension and prepend the context from the previous call.
        model_input = np.concatenate(
            (self._context, samples[np.newaxis, :]), axis=1
        )
        # Remember the tail of this input as the context for the next chunk.
        self._context = model_input[:, -_CONTEXT_SIZE:]

        ort_inputs = {
            "input": model_input,
            "state": self._state,
            "sr": self._sr,
        }
        # The session returns the prediction and the updated state.
        out, self._state = self.session.run(None, ort_inputs)

        # Convert the single-element array into a plain float, as annotated.
        return float(out.squeeze())

    def process_chunks(self, audio: bytes) -> Iterable[float]:
        """Return probability of speech in audio [0-1] for each chunk of audio.

        Audio must be 16 kHz 16-bit mono PCM. The audio is split into
        consecutive full chunks of ``chunk_bytes()`` bytes; trailing bytes
        that do not fill a whole chunk are ignored.

        This is a generator, so validation and inference only happen once
        iteration starts.

        Args:
            audio: Raw PCM audio containing at least one full chunk.

        Yields:
            The speech probability of each full chunk, in order.

        Raises:
            InvalidChunkSizeError: If ``audio`` is shorter than one chunk.
        """
        num_audio_bytes = len(audio)
        if num_audio_bytes < _CHUNK_BYTES:
            # Window size is fixed at 512 samples in v5
            raise InvalidChunkSizeError(
                f"Expected at least {_CHUNK_BYTES} bytes ({_CHUNK_SAMPLES} samples), "
                f"got {num_audio_bytes} bytes"
            )

        # The "+ 1" makes the last full chunk inclusive, so audio whose length
        # is an exact multiple of the chunk size yields its final chunk too.
        last_start = num_audio_bytes - _CHUNK_BYTES
        for start in range(0, last_start + 1, _CHUNK_BYTES):
            yield self.process_chunk(audio[start : start + _CHUNK_BYTES])
File without changes
@@ -0,0 +1,48 @@
1
+ Metadata-Version: 2.2
2
+ Name: pysilero_vad
3
+ Version: 2.0.1
4
+ Summary: Pre-packaged voice activity detector using silero-vad
5
+ Author-email: Michael Hansen <mike@rhasspy.org>
6
+ License: MIT
7
+ Project-URL: Source Code, http://github.com/rhasspy/pysilero-vad
8
+ Keywords: voice,activity,vad
9
+ Platform: any
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Topic :: Text Processing :: Linguistic
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3.8
15
+ Classifier: Programming Language :: Python :: 3.9
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Programming Language :: Python :: 3.13
20
+ Requires-Python: >=3.8.0
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE.md
23
+ Requires-Dist: onnxruntime<2,>=1.18.0
24
+ Requires-Dist: numpy<2
25
+
26
+ # pySilero VAD
27
+
28
+ A pre-packaged voice activity detector using [silero-vad](https://github.com/snakers4/silero-vad).
29
+
30
+ ``` sh
31
+ pip install pysilero-vad
32
+ ```
33
+
34
+ ``` python
35
+ from pysilero_vad import SileroVoiceActivityDetector
36
+
37
+ vad = SileroVoiceActivityDetector()
38
+
39
+ # Audio must be 16Khz, 16-bit mono PCM with correct chunk size
40
+ # See also: vad.chunk_samples()
41
+ assert len(audio_bytes) == vad.chunk_bytes()
42
+
43
+ if vad(audio_bytes) >= 0.5:
44
+ print("Speech")
45
+ else:
46
+ print("Silence")
47
+ ```
48
+
@@ -0,0 +1,14 @@
1
+ LICENSE.md
2
+ README.md
3
+ pyproject.toml
4
+ setup.cfg
5
+ pysilero_vad/__init__.py
6
+ pysilero_vad/py.typed
7
+ pysilero_vad.egg-info/PKG-INFO
8
+ pysilero_vad.egg-info/SOURCES.txt
9
+ pysilero_vad.egg-info/dependency_links.txt
10
+ pysilero_vad.egg-info/requires.txt
11
+ pysilero_vad.egg-info/top_level.txt
12
+ pysilero_vad.egg-info/zip-safe
13
+ pysilero_vad/models/silero_vad.onnx
14
+ tests/test_vad.py
@@ -0,0 +1,2 @@
1
+ onnxruntime<2,>=1.18.0
2
+ numpy<2
@@ -0,0 +1 @@
1
+ pysilero_vad
@@ -0,0 +1,21 @@
1
+ [flake8]
2
+ max-line-length = 88
3
+ ignore =
4
+ E501,
5
+ W503,
6
+ E203,
7
+ D202,
8
+ W504
9
+
10
+ [isort]
11
+ multi_line_output = 3
12
+ include_trailing_comma = True
13
+ force_grid_wrap = 0
14
+ use_parentheses = True
15
+ line_length = 88
16
+ indent = " "
17
+
18
+ [egg_info]
19
+ tag_build =
20
+ tag_date = 0
21
+
@@ -0,0 +1,44 @@
1
+ import wave
2
+ from pathlib import Path
3
+ from typing import Union
4
+
5
+ import pytest
6
+ from pysilero_vad import SileroVoiceActivityDetector, InvalidChunkSizeError
7
+
8
+ # Directory containing this test module; the WAV files used by the tests are loaded from it.
+ _DIR = Path(__file__).parent
9
+
10
+
11
def _load_wav(wav_path: Union[str, Path]) -> bytes:
    """Read all frames of a WAV file and return them as raw PCM bytes.

    The file is asserted to be 16 kHz, 16-bit (2 bytes per sample), mono;
    an AssertionError is raised otherwise.
    """
    with wave.open(str(wav_path), "rb") as reader:
        sample_rate = reader.getframerate()
        assert sample_rate == 16000

        sample_width = reader.getsampwidth()
        assert sample_width == 2

        channel_count = reader.getnchannels()
        assert channel_count == 1

        frame_count = reader.getnframes()
        return reader.readframes(frame_count)
19
+
20
+
21
def test_silence() -> None:
    """Every chunk of the recorded silence must score below the 0.5 threshold."""
    detector = SileroVoiceActivityDetector()
    silence_audio = _load_wav(_DIR / "silence.wav")
    probabilities = detector.process_chunks(silence_audio)
    assert all(probability < 0.5 for probability in probabilities)
25
+
26
+
27
def test_speech() -> None:
    """At least one chunk of the recorded speech must reach the 0.5 threshold."""
    detector = SileroVoiceActivityDetector()
    speech_audio = _load_wav(_DIR / "speech.wav")
    probabilities = detector.process_chunks(speech_audio)
    assert any(probability >= 0.5 for probability in probabilities)
31
+
32
def test_invalid_chunk_size() -> None:
    """A single call accepts exactly one chunk and rejects any other size."""
    detector = SileroVoiceActivityDetector()

    # A zero-filled buffer of exactly one chunk is accepted.
    detector(bytes(SileroVoiceActivityDetector.chunk_bytes()))

    # A buffer twice as long as a chunk is rejected.
    oversized = bytes(SileroVoiceActivityDetector.chunk_bytes() * 2)
    with pytest.raises(InvalidChunkSizeError):
        detector(oversized)

    # A buffer half as long as a chunk is rejected.
    undersized = bytes(SileroVoiceActivityDetector.chunk_bytes() // 2)
    with pytest.raises(InvalidChunkSizeError):
        detector(undersized)