speaker-detector 0.1.4__tar.gz → 0.1.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {speaker_detector-0.1.4 → speaker_detector-0.1.6}/PKG-INFO +28 -3
- {speaker_detector-0.1.4 → speaker_detector-0.1.6}/README.md +21 -1
- {speaker_detector-0.1.4 → speaker_detector-0.1.6}/pyproject.toml +17 -4
- {speaker_detector-0.1.4 → speaker_detector-0.1.6}/speaker_detector/cli.py +12 -26
- speaker_detector-0.1.6/speaker_detector/core.py +116 -0
- speaker_detector-0.1.6/speaker_detector/model/classifier.ckpt +0 -0
- speaker_detector-0.1.6/speaker_detector/model/embedding_model.ckpt +0 -0
- speaker_detector-0.1.6/speaker_detector/model/hyperparams.yaml +58 -0
- speaker_detector-0.1.6/speaker_detector/model/label_encoder.ckpt +7207 -0
- speaker_detector-0.1.6/speaker_detector/model/mean_var_norm_emb.ckpt +0 -0
- speaker_detector-0.1.6/speaker_detector/server copy.py +296 -0
- speaker_detector-0.1.6/speaker_detector/server.py +82 -0
- speaker_detector-0.1.6/speaker_detector/state.py +69 -0
- speaker_detector-0.1.6/speaker_detector/web/static/favicon.ico +0 -0
- speaker_detector-0.1.6/speaker_detector/web/static/index.html +29 -0
- speaker_detector-0.1.6/speaker_detector/web/static/scripts/loader copy.js +10 -0
- speaker_detector-0.1.6/speaker_detector/web/static/scripts/loader.js +14 -0
- speaker_detector-0.1.6/speaker_detector/web/static/scripts/script copy.js +954 -0
- speaker_detector-0.1.6/speaker_detector/web/static/scripts/script.js +22 -0
- speaker_detector-0.1.6/speaker_detector/web/static/style.css +133 -0
- {speaker_detector-0.1.4 → speaker_detector-0.1.6}/speaker_detector.egg-info/PKG-INFO +28 -3
- speaker_detector-0.1.6/speaker_detector.egg-info/SOURCES.txt +28 -0
- speaker_detector-0.1.6/speaker_detector.egg-info/requires.txt +9 -0
- speaker_detector-0.1.4/speaker_detector/analyze.py +0 -59
- speaker_detector-0.1.4/speaker_detector/combine.py +0 -22
- speaker_detector-0.1.4/speaker_detector/core.py +0 -103
- speaker_detector-0.1.4/speaker_detector/export_embeddings.py +0 -41
- speaker_detector-0.1.4/speaker_detector/export_model.py +0 -40
- speaker_detector-0.1.4/speaker_detector/generate_summary.py +0 -110
- speaker_detector-0.1.4/speaker_detector.egg-info/SOURCES.txt +0 -18
- speaker_detector-0.1.4/speaker_detector.egg-info/requires.txt +0 -4
- {speaker_detector-0.1.4 → speaker_detector-0.1.6}/setup.cfg +0 -0
- {speaker_detector-0.1.4 → speaker_detector-0.1.6}/speaker_detector/__main__.py +0 -0
- {speaker_detector-0.1.4/speaker_detector → speaker_detector-0.1.6/speaker_detector/model}/ECAPA_TDNN.py +0 -0
- {speaker_detector-0.1.4/speaker_detector → speaker_detector-0.1.6/speaker_detector/web/static}/__init__.py +0 -0
- {speaker_detector-0.1.4 → speaker_detector-0.1.6}/speaker_detector.egg-info/dependency_links.txt +0 -0
- {speaker_detector-0.1.4 → speaker_detector-0.1.6}/speaker_detector.egg-info/entry_points.txt +0 -0
- {speaker_detector-0.1.4 → speaker_detector-0.1.6}/speaker_detector.egg-info/top_level.txt +0 -0
@@ -1,7 +1,7 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: speaker-detector
|
3
|
-
Version: 0.1.4
|
4
|
-
Summary: A CLI tool for speaker enrollment and identification using SpeechBrain.
|
3
|
+
Version: 0.1.6
|
4
|
+
Summary: A CLI + Web tool for speaker enrollment and identification using SpeechBrain.
|
5
5
|
Author-email: Lara Whybrow <lara.whybrow@gmail.com>
|
6
6
|
License: MIT
|
7
7
|
Project-URL: Homepage, https://github.com/P0llen/speaker-detector
|
@@ -22,7 +22,14 @@ Description-Content-Type: text/markdown
|
|
22
22
|
Requires-Dist: torch
|
23
23
|
Requires-Dist: torchaudio
|
24
24
|
Requires-Dist: speechbrain
|
25
|
-
Requires-Dist:
|
25
|
+
Requires-Dist: flask
|
26
|
+
Requires-Dist: flask-cors
|
27
|
+
Requires-Dist: numpy
|
28
|
+
Requires-Dist: sounddevice
|
29
|
+
Requires-Dist: soundfile
|
30
|
+
Requires-Dist: pydub
|
31
|
+
|
32
|
+
Note: This project is still in development while I tune the system toward the best-performing approach. Contributions are welcome — feel free to join me on the project.
|
26
33
|
|
27
34
|
# speaker-detector 🎙️
|
28
35
|
|
@@ -45,6 +52,12 @@ Install from [TestPyPI](https://test.pypi.org/):
|
|
45
52
|
|
46
53
|
```bash
|
47
54
|
pip install --index-url https://test.pypi.org/simple/ speaker-detector
|
55
|
+
|
56
|
+
# If installation fails on WSL Ubuntu because of a stale requirements file, you may need to run: pip install --break-system-packages soundfile
|
57
|
+
|
58
|
+
# Contributors: to run server.py from this version, use the -m module flag:
|
59
|
+
python3 -m speaker_detector.server
|
60
|
+
|
48
61
|
```
|
49
62
|
|
50
63
|
## 🚀 Usage
|
@@ -99,3 +112,15 @@ onnxruntime
|
|
99
112
|
|
100
113
|
|
101
114
|
NB: When pushing to GitHub, do not include any .identifier files.
|
115
|
+
|
116
|
+
You can manually clean up stale embeddings that don’t match any existing speaker folder with a quick script:
|
117
|
+
|
118
|
+
# Run inside your project root
|
119
|
+
cd storage/embeddings
|
120
|
+
for f in *.pt; do
|
121
|
+
speaker="${f%.pt}"
|
122
|
+
if [ ! -d "../speakers/$speaker" ]; then
|
123
|
+
echo "Deleting stale embedding: $f"
|
124
|
+
rm "$f"
|
125
|
+
fi
|
126
|
+
done
|
@@ -1,3 +1,5 @@
|
|
1
|
+
Note: This project is still in development while I tune the system toward the best-performing approach. Contributions are welcome — feel free to join me on the project.
|
2
|
+
|
1
3
|
# speaker-detector 🎙️
|
2
4
|
|
3
5
|
A lightweight CLI tool for speaker enrollment and voice identification, powered by [SpeechBrain](https://speechbrain.readthedocs.io/).
|
@@ -19,6 +21,12 @@ Install from [TestPyPI](https://test.pypi.org/):
|
|
19
21
|
|
20
22
|
```bash
|
21
23
|
pip install --index-url https://test.pypi.org/simple/ speaker-detector
|
24
|
+
|
25
|
+
# If installation fails on WSL Ubuntu because of a stale requirements file, you may need to run: pip install --break-system-packages soundfile
|
26
|
+
|
27
|
+
# Contributors: to run server.py from this version, use the -m module flag:
|
28
|
+
python3 -m speaker_detector.server
|
29
|
+
|
22
30
|
```
|
23
31
|
|
24
32
|
## 🚀 Usage
|
@@ -72,4 +80,16 @@ onnxruntime
|
|
72
80
|
|
73
81
|
|
74
82
|
|
75
|
-
NB: When pushing to Github, do not include any .identifier files.
|
83
|
+
NB: When pushing to GitHub, do not include any .identifier files.
|
84
|
+
|
85
|
+
You can manually clean up stale embeddings that don’t match any existing speaker folder with a quick script:
|
86
|
+
|
87
|
+
# Run inside your project root
|
88
|
+
cd storage/embeddings
|
89
|
+
for f in *.pt; do
|
90
|
+
speaker="${f%.pt}"
|
91
|
+
if [ ! -d "../speakers/$speaker" ]; then
|
92
|
+
echo "Deleting stale embedding: $f"
|
93
|
+
rm "$f"
|
94
|
+
fi
|
95
|
+
done
|
@@ -4,8 +4,8 @@ build-backend = "setuptools.build_meta"
|
|
4
4
|
|
5
5
|
[project]
|
6
6
|
name = "speaker-detector"
|
7
|
-
version = "0.1.4"
|
8
|
-
description = "A CLI tool for speaker enrollment and identification using SpeechBrain."
|
7
|
+
version = "0.1.6"
|
8
|
+
description = "A CLI + Web tool for speaker enrollment and identification using SpeechBrain."
|
9
9
|
readme = "README.md"
|
10
10
|
requires-python = ">=3.8"
|
11
11
|
license = { text = "MIT" }
|
@@ -31,7 +31,12 @@ dependencies = [
|
|
31
31
|
"torch",
|
32
32
|
"torchaudio",
|
33
33
|
"speechbrain",
|
34
|
-
"
|
34
|
+
"flask",
|
35
|
+
"flask-cors",
|
36
|
+
"numpy",
|
37
|
+
"sounddevice",
|
38
|
+
"soundfile",
|
39
|
+
"pydub"
|
35
40
|
]
|
36
41
|
|
37
42
|
[project.scripts]
|
@@ -47,4 +52,12 @@ Documentation = "https://github.com/P0llen/speaker-detector#readme"
|
|
47
52
|
packages = ["speaker_detector"]
|
48
53
|
|
49
54
|
[tool.setuptools.package-data]
|
50
|
-
speaker_detector = [
|
55
|
+
speaker_detector = [
|
56
|
+
"*.json",
|
57
|
+
"*.yaml",
|
58
|
+
"model/*",
|
59
|
+
"web/static/*",
|
60
|
+
"web/static/scripts/*",
|
61
|
+
"web/static/templates/*",
|
62
|
+
"web/static/templates/components/*"
|
63
|
+
]
|
@@ -21,20 +21,9 @@ def main():
|
|
21
21
|
# ---- list-speakers ----
|
22
22
|
subparsers.add_parser("list-speakers", help="List enrolled speakers")
|
23
23
|
|
24
|
-
# ----
|
25
|
-
|
26
|
-
|
27
|
-
model_parser.add_argument("--out", default="speaker_embedding.onnx", help="Output ONNX file")
|
28
|
-
|
29
|
-
# ---- export-speaker-json ----
|
30
|
-
emb_parser = subparsers.add_parser("export-speaker-json", help="Convert enrolled .pt file to browser-friendly .json")
|
31
|
-
emb_parser.add_argument("--pt", required=True, help="Path to enrolled_speakers.pt")
|
32
|
-
emb_parser.add_argument("--out", default="speakers.json", help="Output .json file for browser")
|
33
|
-
|
34
|
-
# ---- combine ----
|
35
|
-
comb_parser = subparsers.add_parser("combine", help="Combine individual .pt files into enrolled_speakers.pt")
|
36
|
-
comb_parser.add_argument("--folder", required=True, help="Folder with individual .pt files")
|
37
|
-
comb_parser.add_argument("--out", required=True, help="Output .pt file path")
|
24
|
+
# ---- rebuild ----
|
25
|
+
rebuild_cmd = subparsers.add_parser("rebuild", help="Rebuild embeddings")
|
26
|
+
rebuild_cmd.add_argument("--name", help="Name of the speaker to rebuild (leave empty to rebuild all)", default=None)
|
38
27
|
|
39
28
|
# ---- Parse arguments ----
|
40
29
|
args = parser.parse_args()
|
@@ -46,10 +35,8 @@ def main():
|
|
46
35
|
os.environ["PYTHONWARNINGS"] = "ignore"
|
47
36
|
|
48
37
|
# ---- Import modules after filtering warnings ----
|
49
|
-
from .core import enroll_speaker, identify_speaker, list_speakers
|
50
|
-
from .
|
51
|
-
from .export_embeddings import export_embeddings_to_json
|
52
|
-
from .combine import combine_embeddings_from_folder
|
38
|
+
from .core import enroll_speaker, identify_speaker, list_speakers, rebuild_embedding
|
39
|
+
from .utils.analyze import rebuild_all_embeddings
|
53
40
|
|
54
41
|
# ---- Command Dispatch ----
|
55
42
|
if args.command == "enroll":
|
@@ -69,14 +56,13 @@ def main():
|
|
69
56
|
else:
|
70
57
|
print("⚠️ No speakers enrolled yet.")
|
71
58
|
|
72
|
-
elif args.command == "
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
combine_embeddings_from_folder(args.folder, args.out)
|
59
|
+
elif args.command == "rebuild":
|
60
|
+
if args.name:
|
61
|
+
rebuild_embedding(args.name)
|
62
|
+
print(f"🔁 Rebuilt: {args.name}")
|
63
|
+
else:
|
64
|
+
rebuild_all_embeddings()
|
65
|
+
print("🔁 Rebuilt all embeddings.")
|
80
66
|
|
81
67
|
else:
|
82
68
|
parser.print_help()
|
@@ -0,0 +1,116 @@
|
|
1
|
+
# core.py
|
2
|
+
|
3
|
+
from pathlib import Path
|
4
|
+
import torch
|
5
|
+
import torchaudio
|
6
|
+
from speechbrain.inference import SpeakerRecognition
|
7
|
+
|
8
|
+
# ── DIRECTORIES ──────────────────────────────────────────────────────────────
|
9
|
+
# Storage root: a "storage" folder two levels above this file (i.e. next to the
# package directory), with one sub-folder per kind of artefact.
BASE_DIR = Path(__file__).resolve().parents[1].joinpath("storage")
SPEAKER_AUDIO_DIR = BASE_DIR.joinpath("speakers")
EMBEDDINGS_DIR = BASE_DIR.joinpath("embeddings")
NOISE_DIR = BASE_DIR.joinpath("background_noise")

# Create all storage folders at import time so later globbing/saving never
# hits a missing directory.
for _storage_dir in (SPEAKER_AUDIO_DIR, EMBEDDINGS_DIR, NOISE_DIR):
    _storage_dir.mkdir(parents=True, exist_ok=True)
del _storage_dir
|
17
|
+
|
18
|
+
# ── MODEL LOADING ────────────────────────────────────────────────────────────
|
19
|
+
# Load the pretrained speaker-recognition model once, at import time.
# The checkpoint directory is anchored to the "model" folder that sits next to
# this file rather than to a path relative to the caller's working directory,
# so the same files are found no matter where the CLI or server is started.
MODEL = SpeakerRecognition.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    savedir=str(Path(__file__).resolve().parent / "model"),
)
|
23
|
+
|
24
|
+
# ── EMBEDDING HELPERS ────────────────────────────────────────────────────────
|
25
|
+
def get_embedding(audio_path: str, sample_rate: int = 16000) -> torch.Tensor:
    """Return the speaker embedding of one audio file as a CPU tensor.

    Args:
        audio_path: Path of the recording to embed.
        sample_rate: Rate in Hz the waveform is converted to before encoding.
            NOTE(review): 16 kHz is assumed to be what the pretrained ECAPA
            model expects — confirm against the model card.

    Returns:
        The squeezed, detached output of ``MODEL.encode_batch``.

    Raises:
        ValueError: If the file decodes to zero samples.
    """
    signal, fs = torchaudio.load(audio_path)
    if signal.numel() == 0:
        raise ValueError(f"{audio_path} is empty.")

    # torchaudio yields (channels, samples). Passed through unchanged, every
    # channel would be encoded as its own batch item and squeeze() would hand
    # back a 2-D tensor for stereo files, which the cosine scoring (dim=0 on
    # 1-D vectors) cannot use. Fold multi-channel audio down to mono first.
    if signal.dim() == 2 and signal.size(0) > 1:
        signal = signal.mean(dim=0, keepdim=True)

    # Bring every recording to one common rate so embeddings are comparable.
    if fs != sample_rate:
        signal = torchaudio.functional.resample(signal, fs, sample_rate)

    return MODEL.encode_batch(signal).squeeze().detach().cpu()
|
30
|
+
|
31
|
+
def average_embeddings(paths: "list[str]") -> torch.Tensor:
    """Return the element-wise mean of the embeddings of several audio files.

    The parameter annotation is a string so that it is not evaluated at
    definition time; a bare ``list[str]`` raises TypeError on Python 3.8.

    Args:
        paths: Audio file paths; each one is embedded with ``get_embedding``.

    Returns:
        The mean over the stacked per-file embeddings (dim 0).

    Raises:
        RuntimeError: If ``paths`` yields no files. ``torch.stack`` would also
            raise RuntimeError here, but with a message that does not point at
            the actual cause.
    """
    embeddings = [get_embedding(p) for p in paths]
    if not embeddings:
        raise RuntimeError("No audio paths given; cannot average embeddings.")
    return torch.stack(embeddings).mean(dim=0)
|
34
|
+
|
35
|
+
# ── ENROLL / IMPROVE ─────────────────────────────────────────────────────────
|
36
|
+
def enroll_speaker(audio_path: str, speaker_id: str) -> None:
    """Store a recording for a speaker and write that speaker's embedding.

    The audio is copied into the speaker's folder as ``<n>.wav`` and the
    embedding of this recording is saved as ``<speaker_id>.pt``.

    Args:
        audio_path: Path of the recording to enroll.
        speaker_id: Name of the speaker; used as folder and embedding name.

    Raises:
        ValueError: If the recording decodes to zero samples.
    """
    # Decode and validate before touching storage, so an unreadable or empty
    # file does not leave an empty speaker folder behind.
    waveform, sr = torchaudio.load(audio_path)
    if waveform.numel() == 0:
        raise ValueError("Cannot enroll empty audio file.")

    speaker_dir = SPEAKER_AUDIO_DIR / speaker_id
    speaker_dir.mkdir(parents=True, exist_ok=True)

    # Start at count + 1, then skip any name already taken: when the numbering
    # has gaps (e.g. 1.wav and 3.wav exist) count + 1 would otherwise overwrite
    # an existing recording.
    index = len(list(speaker_dir.glob("*.wav"))) + 1
    dest_path = speaker_dir / f"{index}.wav"
    while dest_path.exists():
        index += 1
        dest_path = speaker_dir / f"{index}.wav"
    torchaudio.save(str(dest_path), waveform, sr)

    # NOTE(review): the stored embedding reflects only this recording, not the
    # mean over all of the speaker's recordings — confirm that callers run
    # rebuild_embedding when an averaged embedding is wanted.
    emb = get_embedding(audio_path)
    torch.save(emb, EMBEDDINGS_DIR / f"{speaker_id}.pt")
|
50
|
+
|
51
|
+
def rebuild_embedding(speaker_id: str) -> None:
    """Recompute a speaker's embedding as the mean over their stored .wav files.

    Args:
        speaker_id: Name of the speaker folder under the speaker audio directory.

    Raises:
        RuntimeError: If that folder contains no ``*.wav`` recordings.
    """
    recording_paths = [
        str(recording)
        for recording in (SPEAKER_AUDIO_DIR / speaker_id).glob("*.wav")
    ]
    if len(recording_paths) == 0:
        raise RuntimeError(f"No recordings for {speaker_id}.")

    # Overwrites any previous embedding file for this speaker.
    target_file = EMBEDDINGS_DIR / f"{speaker_id}.pt"
    torch.save(average_embeddings(recording_paths), target_file)
|
58
|
+
|
59
|
+
# ── BACKGROUND NOISE MODELING ────────────────────────────────────────────────
|
60
|
+
def compute_background_embedding() -> None:
    """Average the embeddings of all noise samples and save the result.

    The mean embedding is written to ``background_noise.pt`` in the
    embeddings directory.

    Raises:
        RuntimeError: If the noise directory holds no ``*.wav`` files.
    """
    noise_files = []
    for sample in NOISE_DIR.glob("*.wav"):
        noise_files.append(str(sample))

    if len(noise_files) < 1:
        raise RuntimeError("No background noise samples.")

    noise_embedding = average_embeddings(noise_files)
    torch.save(noise_embedding, EMBEDDINGS_DIR.joinpath("background_noise.pt"))
|
66
|
+
|
67
|
+
# ── IDENTIFICATION ───────────────────────────────────────────────────────────
|
68
|
+
def identify_speaker(audio_path: str, threshold: float = 0.25) -> "tuple[str, float]":
    """Match a recording against every stored embedding by cosine similarity.

    The return annotation is a string so that it is not evaluated at
    definition time; a bare ``tuple[str, float]`` raises TypeError on
    Python 3.8.

    Args:
        audio_path: Path of the recording to identify.
        threshold: Minimum similarity for the best candidate to be accepted
            when it does not lead the runner-up by more than 0.1.

    Returns:
        ``(name, score)`` for an accepted match, ``("unknown", score)`` when the
        best candidate is rejected, ``("unknown", 0.0)`` when no embedding could
        be scored, or ``("error", 0.0)`` when the recording could not be
        embedded. Scores are rounded to three decimals.
    """
    try:
        test_emb = get_embedding(audio_path)
    except Exception:
        # Deliberate best-effort: a bad input is reported, not raised.
        return "error", 0.0

    scores = {}
    for emb_path in EMBEDDINGS_DIR.glob("*.pt"):
        name = emb_path.stem
        try:
            # map_location keeps the stored tensor on the CPU, the same device
            # as test_emb, regardless of where it was saved from.
            emb = torch.load(emb_path, map_location="cpu")
            score = torch.nn.functional.cosine_similarity(emb, test_emb, dim=0).item()
            scores[name] = score
        except Exception:
            # Skip unreadable or incompatible embedding files. Catching
            # Exception rather than using a bare except lets KeyboardInterrupt
            # and SystemExit propagate.
            continue

    if not scores:
        return "unknown", 0.0

    sorted_scores = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
    best, best_score = sorted_scores[0]
    # With a single candidate the runner-up score is taken as 0.0.
    second_score = sorted_scores[1][1] if len(sorted_scores) > 1 else 0.0
    # Accept on a clear margin over the runner-up OR on the absolute threshold.
    auto_thresh = (best_score - second_score) > 0.1
    match = auto_thresh or best_score >= threshold

    return (best, round(best_score, 3)) if match else ("unknown", round(best_score, 3))
|
94
|
+
|
95
|
+
# ── REBUILD CHECKING ─────────────────────────────────────────────────────────
|
96
|
+
def list_speakers() -> "list[str]":
    """Return the names of all speaker folders, sorted alphabetically.

    The return annotation is a string so that it is not evaluated at
    definition time; a bare ``list[str]`` raises TypeError on Python 3.8.
    Sorting makes the result deterministic, since ``iterdir()`` yields entries
    in arbitrary order.
    """
    return sorted(p.name for p in SPEAKER_AUDIO_DIR.iterdir() if p.is_dir())
|
98
|
+
|
99
|
+
def speaker_needs_rebuild(speaker_id: str) -> bool:
    """Tell whether a speaker's stored embedding is missing or out of date.

    Args:
        speaker_id: Name of the speaker to check.

    Returns:
        True when no embedding file exists for the speaker, or when at least
        one of the speaker's ``*.wav`` files has a modification time later
        than the embedding file's; False otherwise.
    """
    embedding_file = EMBEDDINGS_DIR / f"{speaker_id}.pt"
    if not embedding_file.exists():
        return True

    built_at = embedding_file.stat().st_mtime
    recordings = (SPEAKER_AUDIO_DIR / speaker_id).glob("*.wav")
    return any(recording.stat().st_mtime > built_at for recording in recordings)
|
109
|
+
|
110
|
+
def get_speakers_needing_rebuild() -> "list[str]":
    """Return every listed speaker for which ``speaker_needs_rebuild`` is True.

    The return annotation is a string so that it is not evaluated at
    definition time; a bare ``list[str]`` raises TypeError on Python 3.8.
    """
    return [s for s in list_speakers() if speaker_needs_rebuild(s)]
|
112
|
+
|
113
|
+
|
114
|
+
|
115
|
+
# ── ALIAS FOR COMPATIBILITY ──────────────────────────────────────────────────
|
116
|
+
# Second module-level name bound to the same function object as rebuild_embedding.
rebuild_embeddings_for_speaker = rebuild_embedding
|
Binary file
|
Binary file
|
@@ -0,0 +1,58 @@
|
|
1
|
+
# ############################################################################
|
2
|
+
# Model: ECAPA big for Speaker verification
|
3
|
+
# ############################################################################
|
4
|
+
|
5
|
+
# Feature parameters
|
6
|
+
n_mels: 80
|
7
|
+
|
8
|
+
# Pretrain folder (HuggingFace)
|
9
|
+
pretrained_path: speechbrain/spkrec-ecapa-voxceleb
|
10
|
+
|
11
|
+
# Output parameters
|
12
|
+
out_n_neurons: 7205
|
13
|
+
|
14
|
+
# Model params
|
15
|
+
compute_features: !new:speechbrain.lobes.features.Fbank
|
16
|
+
n_mels: !ref <n_mels>
|
17
|
+
|
18
|
+
mean_var_norm: !new:speechbrain.processing.features.InputNormalization
|
19
|
+
norm_type: sentence
|
20
|
+
std_norm: False
|
21
|
+
|
22
|
+
embedding_model: !new:speechbrain.lobes.models.ECAPA_TDNN.ECAPA_TDNN
|
23
|
+
input_size: !ref <n_mels>
|
24
|
+
channels: [1024, 1024, 1024, 1024, 3072]
|
25
|
+
kernel_sizes: [5, 3, 3, 3, 1]
|
26
|
+
dilations: [1, 2, 3, 4, 1]
|
27
|
+
attention_channels: 128
|
28
|
+
lin_neurons: 192
|
29
|
+
|
30
|
+
classifier: !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier
|
31
|
+
input_size: 192
|
32
|
+
out_neurons: !ref <out_n_neurons>
|
33
|
+
|
34
|
+
mean_var_norm_emb: !new:speechbrain.processing.features.InputNormalization
|
35
|
+
norm_type: global
|
36
|
+
std_norm: False
|
37
|
+
|
38
|
+
modules:
|
39
|
+
compute_features: !ref <compute_features>
|
40
|
+
mean_var_norm: !ref <mean_var_norm>
|
41
|
+
embedding_model: !ref <embedding_model>
|
42
|
+
mean_var_norm_emb: !ref <mean_var_norm_emb>
|
43
|
+
classifier: !ref <classifier>
|
44
|
+
|
45
|
+
label_encoder: !new:speechbrain.dataio.encoder.CategoricalEncoder
|
46
|
+
|
47
|
+
|
48
|
+
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
|
49
|
+
loadables:
|
50
|
+
embedding_model: !ref <embedding_model>
|
51
|
+
mean_var_norm_emb: !ref <mean_var_norm_emb>
|
52
|
+
classifier: !ref <classifier>
|
53
|
+
label_encoder: !ref <label_encoder>
|
54
|
+
paths:
|
55
|
+
embedding_model: !ref <pretrained_path>/embedding_model.ckpt
|
56
|
+
mean_var_norm_emb: !ref <pretrained_path>/mean_var_norm_emb.ckpt
|
57
|
+
classifier: !ref <pretrained_path>/classifier.ckpt
|
58
|
+
label_encoder: !ref <pretrained_path>/label_encoder.txt
|