speaker-detector 0.1.5__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- speaker_detector/cli.py +12 -26
- speaker_detector/core.py +78 -65
- speaker_detector/model/classifier.ckpt +0 -0
- speaker_detector/model/embedding_model.ckpt +0 -0
- speaker_detector/model/hyperparams.yaml +58 -0
- speaker_detector/model/label_encoder.ckpt +7207 -0
- speaker_detector/model/mean_var_norm_emb.ckpt +0 -0
- speaker_detector/server copy.py +296 -0
- speaker_detector/server.py +82 -0
- speaker_detector/state.py +69 -0
- speaker_detector/web/static/favicon.ico +0 -0
- speaker_detector/web/static/index.html +29 -0
- speaker_detector/web/static/scripts/loader copy.js +10 -0
- speaker_detector/web/static/scripts/loader.js +14 -0
- speaker_detector/web/static/scripts/script copy.js +954 -0
- speaker_detector/web/static/scripts/script.js +22 -0
- speaker_detector/web/static/style.css +133 -0
- {speaker_detector-0.1.5.dist-info → speaker_detector-0.1.6.dist-info}/METADATA +28 -3
- speaker_detector-0.1.6.dist-info/RECORD +25 -0
- {speaker_detector-0.1.5.dist-info → speaker_detector-0.1.6.dist-info}/WHEEL +1 -1
- speaker_detector/analyze.py +0 -59
- speaker_detector/combine.py +0 -22
- speaker_detector/export_embeddings.py +0 -62
- speaker_detector/export_model.py +0 -40
- speaker_detector/generate_summary.py +0 -110
- speaker_detector-0.1.5.dist-info/RECORD +0 -15
- /speaker_detector/{ECAPA_TDNN.py → model/ECAPA_TDNN.py} +0 -0
- /speaker_detector/{__init__.py → web/static/__init__.py} +0 -0
- {speaker_detector-0.1.5.dist-info → speaker_detector-0.1.6.dist-info}/entry_points.txt +0 -0
- {speaker_detector-0.1.5.dist-info → speaker_detector-0.1.6.dist-info}/top_level.txt +0 -0
speaker_detector/cli.py
CHANGED
@@ -21,20 +21,9 @@ def main():
|
|
21
21
|
# ---- list-speakers ----
|
22
22
|
subparsers.add_parser("list-speakers", help="List enrolled speakers")
|
23
23
|
|
24
|
-
# ----
|
25
|
-
|
26
|
-
|
27
|
-
model_parser.add_argument("--out", default="speaker_embedding.onnx", help="Output ONNX file")
|
28
|
-
|
29
|
-
# ---- export-speaker-json ----
|
30
|
-
emb_parser = subparsers.add_parser("export-speaker-json", help="Convert enrolled .pt file to browser-friendly .json")
|
31
|
-
emb_parser.add_argument("--pt", required=True, help="Path to enrolled_speakers.pt")
|
32
|
-
emb_parser.add_argument("--out", default="speakers.json", help="Output .json file for browser")
|
33
|
-
|
34
|
-
# ---- combine ----
|
35
|
-
comb_parser = subparsers.add_parser("combine", help="Combine individual .pt files into enrolled_speakers.pt")
|
36
|
-
comb_parser.add_argument("--folder", required=True, help="Folder with individual .pt files")
|
37
|
-
comb_parser.add_argument("--out", required=True, help="Output .pt file path")
|
24
|
+
# ---- rebuild ----
|
25
|
+
rebuild_cmd = subparsers.add_parser("rebuild", help="Rebuild embeddings")
|
26
|
+
rebuild_cmd.add_argument("--name", help="Name of the speaker to rebuild (leave empty to rebuild all)", default=None)
|
38
27
|
|
39
28
|
# ---- Parse arguments ----
|
40
29
|
args = parser.parse_args()
|
@@ -46,10 +35,8 @@ def main():
|
|
46
35
|
os.environ["PYTHONWARNINGS"] = "ignore"
|
47
36
|
|
48
37
|
# ---- Import modules after filtering warnings ----
|
49
|
-
from .core import enroll_speaker, identify_speaker, list_speakers
|
50
|
-
from .
|
51
|
-
from .export_embeddings import export_embeddings_to_json
|
52
|
-
from .combine import combine_embeddings_from_folder
|
38
|
+
from .core import enroll_speaker, identify_speaker, list_speakers, rebuild_embedding
|
39
|
+
from .utils.analyze import rebuild_all_embeddings
|
53
40
|
|
54
41
|
# ---- Command Dispatch ----
|
55
42
|
if args.command == "enroll":
|
@@ -69,14 +56,13 @@ def main():
|
|
69
56
|
else:
|
70
57
|
print("⚠️ No speakers enrolled yet.")
|
71
58
|
|
72
|
-
elif args.command == "
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
combine_embeddings_from_folder(args.folder, args.out)
|
59
|
+
elif args.command == "rebuild":
|
60
|
+
if args.name:
|
61
|
+
rebuild_embedding(args.name)
|
62
|
+
print(f"🔁 Rebuilt: {args.name}")
|
63
|
+
else:
|
64
|
+
rebuild_all_embeddings()
|
65
|
+
print("🔁 Rebuilt all embeddings.")
|
80
66
|
|
81
67
|
else:
|
82
68
|
parser.print_help()
|
speaker_detector/core.py
CHANGED
@@ -1,103 +1,116 @@
|
|
1
|
-
|
1
|
+
# core.py
|
2
|
+
|
2
3
|
from pathlib import Path
|
3
|
-
import torchaudio
|
4
4
|
import torch
|
5
|
+
import torchaudio
|
6
|
+
from speechbrain.inference import SpeakerRecognition
|
5
7
|
|
6
|
-
#
|
8
|
+
# ── DIRECTORIES ──────────────────────────────────────────────────────────────
|
7
9
|
BASE_DIR = Path(__file__).resolve().parent.parent / "storage"
|
8
10
|
SPEAKER_AUDIO_DIR = BASE_DIR / "speakers"
|
9
11
|
EMBEDDINGS_DIR = BASE_DIR / "embeddings"
|
12
|
+
NOISE_DIR = BASE_DIR / "background_noise"
|
10
13
|
|
11
|
-
# Ensure they exist
|
12
14
|
SPEAKER_AUDIO_DIR.mkdir(parents=True, exist_ok=True)
|
13
15
|
EMBEDDINGS_DIR.mkdir(parents=True, exist_ok=True)
|
16
|
+
NOISE_DIR.mkdir(parents=True, exist_ok=True)
|
14
17
|
|
15
|
-
#
|
18
|
+
# ── MODEL LOADING ────────────────────────────────────────────────────────────
|
16
19
|
MODEL = SpeakerRecognition.from_hparams(
|
17
|
-
source="speechbrain/spkrec-ecapa-voxceleb",
|
20
|
+
source="speechbrain/spkrec-ecapa-voxceleb",
|
21
|
+
savedir="model"
|
18
22
|
)
|
19
23
|
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
24
|
+
# ── EMBEDDING HELPERS ────────────────────────────────────────────────────────
|
25
|
+
def get_embedding(audio_path: str) -> torch.Tensor:
|
26
|
+
signal, fs = torchaudio.load(audio_path)
|
27
|
+
if signal.numel() == 0:
|
28
|
+
raise ValueError(f"{audio_path} is empty.")
|
29
|
+
return MODEL.encode_batch(signal).squeeze().detach().cpu()
|
30
|
+
|
31
|
+
def average_embeddings(paths: list[str]) -> torch.Tensor:
|
32
|
+
embeddings = [get_embedding(p) for p in paths]
|
33
|
+
return torch.stack(embeddings).mean(dim=0)
|
34
|
+
|
35
|
+
# ── ENROLL / IMPROVE ─────────────────────────────────────────────────────────
|
36
|
+
def enroll_speaker(audio_path: str, speaker_id: str) -> None:
|
30
37
|
speaker_dir = SPEAKER_AUDIO_DIR / speaker_id
|
31
38
|
speaker_dir.mkdir(parents=True, exist_ok=True)
|
32
39
|
|
33
|
-
# Save audio sample
|
34
40
|
existing = list(speaker_dir.glob("*.wav"))
|
35
|
-
|
36
|
-
dest_path = speaker_dir / f"{new_index}.wav"
|
41
|
+
dest_path = speaker_dir / f"{len(existing)+1}.wav"
|
37
42
|
|
38
|
-
waveform,
|
43
|
+
waveform, sr = torchaudio.load(audio_path)
|
39
44
|
if waveform.numel() == 0:
|
40
45
|
raise ValueError("Cannot enroll empty audio file.")
|
46
|
+
torchaudio.save(str(dest_path), waveform, sr)
|
41
47
|
|
42
|
-
torchaudio.save(str(dest_path), waveform, sample_rate)
|
43
|
-
print(f"🎙 Saved {speaker_id}'s recording #{new_index} → {dest_path}")
|
44
|
-
|
45
|
-
# Save embedding
|
46
48
|
emb = get_embedding(audio_path)
|
47
|
-
|
48
|
-
torch.save(emb, emb_path)
|
49
|
-
print(f"🧠 Saved embedding for {speaker_id} → {emb_path}")
|
49
|
+
torch.save(emb, EMBEDDINGS_DIR / f"{speaker_id}.pt")
|
50
50
|
|
51
|
-
def
|
51
|
+
def rebuild_embedding(speaker_id: str) -> None:
|
52
|
+
speaker_dir = SPEAKER_AUDIO_DIR / speaker_id
|
53
|
+
wavs = list(speaker_dir.glob("*.wav"))
|
54
|
+
if not wavs:
|
55
|
+
raise RuntimeError(f"No recordings for {speaker_id}.")
|
56
|
+
emb = average_embeddings([str(w) for w in wavs])
|
57
|
+
torch.save(emb, EMBEDDINGS_DIR / f"{speaker_id}.pt")
|
58
|
+
|
59
|
+
# ── BACKGROUND NOISE MODELING ────────────────────────────────────────────────
|
60
|
+
def compute_background_embedding() -> None:
|
61
|
+
paths = [str(p) for p in NOISE_DIR.glob("*.wav")]
|
62
|
+
if not paths:
|
63
|
+
raise RuntimeError("No background noise samples.")
|
64
|
+
emb = average_embeddings(paths)
|
65
|
+
torch.save(emb, EMBEDDINGS_DIR / "background_noise.pt")
|
66
|
+
|
67
|
+
# ── IDENTIFICATION ───────────────────────────────────────────────────────────
|
68
|
+
def identify_speaker(audio_path: str, threshold: float = 0.25) -> tuple[str, float]:
|
52
69
|
try:
|
53
70
|
test_emb = get_embedding(audio_path)
|
54
|
-
except Exception
|
55
|
-
return
|
71
|
+
except Exception:
|
72
|
+
return "error", 0.0
|
56
73
|
|
57
74
|
scores = {}
|
58
75
|
for emb_path in EMBEDDINGS_DIR.glob("*.pt"):
|
59
|
-
|
76
|
+
name = emb_path.stem
|
60
77
|
try:
|
61
|
-
|
62
|
-
score = torch.nn.functional.cosine_similarity(
|
63
|
-
scores[
|
64
|
-
except
|
78
|
+
emb = torch.load(emb_path)
|
79
|
+
score = torch.nn.functional.cosine_similarity(emb, test_emb, dim=0).item()
|
80
|
+
scores[name] = score
|
81
|
+
except:
|
65
82
|
continue
|
66
83
|
|
67
84
|
if not scores:
|
68
|
-
return
|
85
|
+
return "unknown", 0.0
|
69
86
|
|
70
87
|
sorted_scores = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
|
71
|
-
best,
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
result = {
|
76
|
-
"speaker": best[0] if is_match else "unknown",
|
77
|
-
"score": round(best[1], 3),
|
78
|
-
"all_scores": {k: round(v, 3) for k, v in sorted_scores}
|
79
|
-
}
|
80
|
-
return result
|
81
|
-
|
82
|
-
def list_speakers():
|
83
|
-
speakers = []
|
84
|
-
for dir in SPEAKER_AUDIO_DIR.iterdir():
|
85
|
-
if dir.is_dir():
|
86
|
-
count = len(list(dir.glob("*.wav")))
|
87
|
-
speakers.append(f"{dir.name} ({count} recording{'s' if count != 1 else ''})")
|
88
|
-
print(f"📋 Found {len(speakers)} enrolled speaker(s): {speakers}")
|
89
|
-
return [s.split()[0] for s in speakers]
|
90
|
-
|
91
|
-
def rebuild_embedding(speaker_id):
|
92
|
-
speaker_dir = SPEAKER_AUDIO_DIR / speaker_id
|
93
|
-
wavs = list(speaker_dir.glob("*.wav"))
|
88
|
+
best, best_score = sorted_scores[0]
|
89
|
+
second_score = sorted_scores[1][1] if len(sorted_scores) > 1 else 0.0
|
90
|
+
auto_thresh = (best_score - second_score) > 0.1
|
91
|
+
match = auto_thresh or best_score >= threshold
|
94
92
|
|
95
|
-
if
|
96
|
-
raise RuntimeError(f"No recordings found for {speaker_id}.")
|
93
|
+
return (best, round(best_score, 3)) if match else ("unknown", round(best_score, 3))
|
97
94
|
|
98
|
-
|
99
|
-
|
95
|
+
# ── REBUILD CHECKING ─────────────────────────────────────────────────────────
|
96
|
+
def list_speakers() -> list[str]:
|
97
|
+
return [p.name for p in SPEAKER_AUDIO_DIR.iterdir() if p.is_dir()]
|
100
98
|
|
99
|
+
def speaker_needs_rebuild(speaker_id: str) -> bool:
|
100
|
+
speaker_dir = SPEAKER_AUDIO_DIR / speaker_id
|
101
101
|
emb_path = EMBEDDINGS_DIR / f"{speaker_id}.pt"
|
102
|
-
|
103
|
-
|
102
|
+
if not emb_path.exists():
|
103
|
+
return True
|
104
|
+
emb_mtime = emb_path.stat().st_mtime
|
105
|
+
for wav in speaker_dir.glob("*.wav"):
|
106
|
+
if wav.stat().st_mtime > emb_mtime:
|
107
|
+
return True
|
108
|
+
return False
|
109
|
+
|
110
|
+
def get_speakers_needing_rebuild() -> list[str]:
|
111
|
+
return [s for s in list_speakers() if speaker_needs_rebuild(s)]
|
112
|
+
|
113
|
+
|
114
|
+
|
115
|
+
# ── ALIAS FOR COMPATIBILITY ──────────────────────────────────────────────────
|
116
|
+
rebuild_embeddings_for_speaker = rebuild_embedding
|
Binary file
|
Binary file
|
@@ -0,0 +1,58 @@
|
|
1
|
+
# ############################################################################
|
2
|
+
# Model: ECAPA big for Speaker verification
|
3
|
+
# ############################################################################
|
4
|
+
|
5
|
+
# Feature parameters
|
6
|
+
n_mels: 80
|
7
|
+
|
8
|
+
# Pretrain folder (HuggingFace)
|
9
|
+
pretrained_path: speechbrain/spkrec-ecapa-voxceleb
|
10
|
+
|
11
|
+
# Output parameters
|
12
|
+
out_n_neurons: 7205
|
13
|
+
|
14
|
+
# Model params
|
15
|
+
compute_features: !new:speechbrain.lobes.features.Fbank
|
16
|
+
n_mels: !ref <n_mels>
|
17
|
+
|
18
|
+
mean_var_norm: !new:speechbrain.processing.features.InputNormalization
|
19
|
+
norm_type: sentence
|
20
|
+
std_norm: False
|
21
|
+
|
22
|
+
embedding_model: !new:speechbrain.lobes.models.ECAPA_TDNN.ECAPA_TDNN
|
23
|
+
input_size: !ref <n_mels>
|
24
|
+
channels: [1024, 1024, 1024, 1024, 3072]
|
25
|
+
kernel_sizes: [5, 3, 3, 3, 1]
|
26
|
+
dilations: [1, 2, 3, 4, 1]
|
27
|
+
attention_channels: 128
|
28
|
+
lin_neurons: 192
|
29
|
+
|
30
|
+
classifier: !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier
|
31
|
+
input_size: 192
|
32
|
+
out_neurons: !ref <out_n_neurons>
|
33
|
+
|
34
|
+
mean_var_norm_emb: !new:speechbrain.processing.features.InputNormalization
|
35
|
+
norm_type: global
|
36
|
+
std_norm: False
|
37
|
+
|
38
|
+
modules:
|
39
|
+
compute_features: !ref <compute_features>
|
40
|
+
mean_var_norm: !ref <mean_var_norm>
|
41
|
+
embedding_model: !ref <embedding_model>
|
42
|
+
mean_var_norm_emb: !ref <mean_var_norm_emb>
|
43
|
+
classifier: !ref <classifier>
|
44
|
+
|
45
|
+
label_encoder: !new:speechbrain.dataio.encoder.CategoricalEncoder
|
46
|
+
|
47
|
+
|
48
|
+
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
|
49
|
+
loadables:
|
50
|
+
embedding_model: !ref <embedding_model>
|
51
|
+
mean_var_norm_emb: !ref <mean_var_norm_emb>
|
52
|
+
classifier: !ref <classifier>
|
53
|
+
label_encoder: !ref <label_encoder>
|
54
|
+
paths:
|
55
|
+
embedding_model: !ref <pretrained_path>/embedding_model.ckpt
|
56
|
+
mean_var_norm_emb: !ref <pretrained_path>/mean_var_norm_emb.ckpt
|
57
|
+
classifier: !ref <pretrained_path>/classifier.ckpt
|
58
|
+
label_encoder: !ref <pretrained_path>/label_encoder.txt
|