speaker-detector 0.1.5__tar.gz → 0.1.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. {speaker_detector-0.1.5 → speaker_detector-0.1.6}/PKG-INFO +28 -3
  2. {speaker_detector-0.1.5 → speaker_detector-0.1.6}/README.md +21 -1
  3. {speaker_detector-0.1.5 → speaker_detector-0.1.6}/pyproject.toml +17 -4
  4. {speaker_detector-0.1.5 → speaker_detector-0.1.6}/speaker_detector/cli.py +12 -26
  5. speaker_detector-0.1.6/speaker_detector/core.py +116 -0
  6. speaker_detector-0.1.6/speaker_detector/model/classifier.ckpt +0 -0
  7. speaker_detector-0.1.6/speaker_detector/model/embedding_model.ckpt +0 -0
  8. speaker_detector-0.1.6/speaker_detector/model/hyperparams.yaml +58 -0
  9. speaker_detector-0.1.6/speaker_detector/model/label_encoder.ckpt +7207 -0
  10. speaker_detector-0.1.6/speaker_detector/model/mean_var_norm_emb.ckpt +0 -0
  11. speaker_detector-0.1.6/speaker_detector/server copy.py +296 -0
  12. speaker_detector-0.1.6/speaker_detector/server.py +82 -0
  13. speaker_detector-0.1.6/speaker_detector/state.py +69 -0
  14. speaker_detector-0.1.6/speaker_detector/web/static/favicon.ico +0 -0
  15. speaker_detector-0.1.6/speaker_detector/web/static/index.html +29 -0
  16. speaker_detector-0.1.6/speaker_detector/web/static/scripts/loader copy.js +10 -0
  17. speaker_detector-0.1.6/speaker_detector/web/static/scripts/loader.js +14 -0
  18. speaker_detector-0.1.6/speaker_detector/web/static/scripts/script copy.js +954 -0
  19. speaker_detector-0.1.6/speaker_detector/web/static/scripts/script.js +22 -0
  20. speaker_detector-0.1.6/speaker_detector/web/static/style.css +133 -0
  21. {speaker_detector-0.1.5 → speaker_detector-0.1.6}/speaker_detector.egg-info/PKG-INFO +28 -3
  22. speaker_detector-0.1.6/speaker_detector.egg-info/SOURCES.txt +28 -0
  23. speaker_detector-0.1.6/speaker_detector.egg-info/requires.txt +9 -0
  24. speaker_detector-0.1.5/speaker_detector/analyze.py +0 -59
  25. speaker_detector-0.1.5/speaker_detector/combine.py +0 -22
  26. speaker_detector-0.1.5/speaker_detector/core.py +0 -103
  27. speaker_detector-0.1.5/speaker_detector/export_embeddings.py +0 -62
  28. speaker_detector-0.1.5/speaker_detector/export_model.py +0 -40
  29. speaker_detector-0.1.5/speaker_detector/generate_summary.py +0 -110
  30. speaker_detector-0.1.5/speaker_detector.egg-info/SOURCES.txt +0 -18
  31. speaker_detector-0.1.5/speaker_detector.egg-info/requires.txt +0 -4
  32. {speaker_detector-0.1.5 → speaker_detector-0.1.6}/setup.cfg +0 -0
  33. {speaker_detector-0.1.5 → speaker_detector-0.1.6}/speaker_detector/__main__.py +0 -0
  34. {speaker_detector-0.1.5/speaker_detector → speaker_detector-0.1.6/speaker_detector/model}/ECAPA_TDNN.py +0 -0
  35. {speaker_detector-0.1.5/speaker_detector → speaker_detector-0.1.6/speaker_detector/web/static}/__init__.py +0 -0
  36. {speaker_detector-0.1.5 → speaker_detector-0.1.6}/speaker_detector.egg-info/dependency_links.txt +0 -0
  37. {speaker_detector-0.1.5 → speaker_detector-0.1.6}/speaker_detector.egg-info/entry_points.txt +0 -0
  38. {speaker_detector-0.1.5 → speaker_detector-0.1.6}/speaker_detector.egg-info/top_level.txt +0 -0
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: speaker-detector
3
- Version: 0.1.5
4
- Summary: A CLI tool for speaker enrollment and identification using SpeechBrain.
3
+ Version: 0.1.6
4
+ Summary: A CLI + Web tool for speaker enrollment and identification using SpeechBrain.
5
5
  Author-email: Lara Whybrow <lara.whybrow@gmail.com>
6
6
  License: MIT
7
7
  Project-URL: Homepage, https://github.com/P0llen/speaker-detector
@@ -22,7 +22,14 @@ Description-Content-Type: text/markdown
22
22
  Requires-Dist: torch
23
23
  Requires-Dist: torchaudio
24
24
  Requires-Dist: speechbrain
25
- Requires-Dist: onnx
25
+ Requires-Dist: flask
26
+ Requires-Dist: flask-cors
27
+ Requires-Dist: numpy
28
+ Requires-Dist: sounddevice
29
+ Requires-Dist: soundfile
30
+ Requires-Dist: pydub
31
+
32
+ Note: This project is still under active development while I tune the system for the best-performing approach. Contributions are welcome.
26
33
 
27
34
  # speaker-detector 🎙️
28
35
 
@@ -45,6 +52,12 @@ Install from [TestPyPI](https://test.pypi.org/):
45
52
 
46
53
  ```bash
47
54
  pip install --index-url https://test.pypi.org/simple/ speaker-detector
55
+
56
+ # If installation fails on WSL Ubuntu (for example with a stale requirements file), you may need: pip install --break-system-packages soundfile
57
+
58
+ # If you are contributing and want to run server.py, start it as a module with the -m flag:
59
+ python3 -m speaker_detector.server
60
+
48
61
  ```
49
62
 
50
63
  ## 🚀 Usage
@@ -99,3 +112,15 @@ onnxruntime
99
112
 
100
113
 
101
114
  NB: When pushing to GitHub, do not include any .identifier files.
115
+
116
+ You can manually clean up stale embeddings that don’t match any existing speaker folder with a quick script:
117
+
118
+ ```bash
+ # Run inside your project root
119
+ cd storage/embeddings
120
+ for f in *.pt; do
121
+ speaker="${f%.pt}"
122
+ if [ ! -d "../speakers/$speaker" ]; then
123
+ echo "Deleting stale embedding: $f"
124
+ rm "$f"
125
+ fi
126
+ done
+ ```
@@ -1,3 +1,5 @@
1
+ Note: This project is still under active development while I tune the system for the best-performing approach. Contributions are welcome.
2
+
1
3
  # speaker-detector 🎙️
2
4
 
3
5
  A lightweight CLI tool for speaker enrollment and voice identification, powered by [SpeechBrain](https://speechbrain.readthedocs.io/).
@@ -19,6 +21,12 @@ Install from [TestPyPI](https://test.pypi.org/):
19
21
 
20
22
  ```bash
21
23
  pip install --index-url https://test.pypi.org/simple/ speaker-detector
24
+
25
+ # If installation fails on WSL Ubuntu (for example with a stale requirements file), you may need: pip install --break-system-packages soundfile
26
+
27
+ # If you are contributing and want to run server.py, start it as a module with the -m flag:
28
+ python3 -m speaker_detector.server
29
+
22
30
  ```
23
31
 
24
32
  ## 🚀 Usage
@@ -72,4 +80,16 @@ onnxruntime
72
80
 
73
81
 
74
82
 
75
- NB: When pushing to Github, do not include any .identifier files.
83
+ NB: When pushing to GitHub, do not include any .identifier files.
84
+
85
+ You can manually clean up stale embeddings that don’t match any existing speaker folder with a quick script:
86
+
87
+ ```bash
+ # Run inside your project root
88
+ cd storage/embeddings
89
+ for f in *.pt; do
90
+ speaker="${f%.pt}"
91
+ if [ ! -d "../speakers/$speaker" ]; then
92
+ echo "Deleting stale embedding: $f"
93
+ rm "$f"
94
+ fi
95
+ done
+ ```
@@ -4,8 +4,8 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "speaker-detector"
7
- version = "0.1.5"
8
- description = "A CLI tool for speaker enrollment and identification using SpeechBrain."
7
+ version = "0.1.6"
8
+ description = "A CLI + Web tool for speaker enrollment and identification using SpeechBrain."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
11
11
  license = { text = "MIT" }
@@ -31,7 +31,12 @@ dependencies = [
31
31
  "torch",
32
32
  "torchaudio",
33
33
  "speechbrain",
34
- "onnx"
34
+ "flask",
35
+ "flask-cors",
36
+ "numpy",
37
+ "sounddevice",
38
+ "soundfile",
39
+ "pydub"
35
40
  ]
36
41
 
37
42
  [project.scripts]
@@ -47,4 +52,12 @@ Documentation = "https://github.com/P0llen/speaker-detector#readme"
47
52
  packages = ["speaker_detector"]
48
53
 
49
54
  [tool.setuptools.package-data]
50
- speaker_detector = ["*.onnx", "*.json", "*.yaml", "models/*"]
55
+ speaker_detector = [
56
+ "*.json",
57
+ "*.yaml",
58
+ "model/*",
59
+ "web/static/*",
60
+ "web/static/scripts/*",
61
+ "web/static/templates/*",
62
+ "web/static/templates/components/*"
63
+ ]
@@ -21,20 +21,9 @@ def main():
21
21
  # ---- list-speakers ----
22
22
  subparsers.add_parser("list-speakers", help="List enrolled speakers")
23
23
 
24
- # ---- export-model ----
25
- model_parser = subparsers.add_parser("export-model", help="Export ECAPA model to ONNX")
26
- model_parser.add_argument("--pt", required=True, help="Path to embedding_model.ckpt")
27
- model_parser.add_argument("--out", default="speaker_embedding.onnx", help="Output ONNX file")
28
-
29
- # ---- export-speaker-json ----
30
- emb_parser = subparsers.add_parser("export-speaker-json", help="Convert enrolled .pt file to browser-friendly .json")
31
- emb_parser.add_argument("--pt", required=True, help="Path to enrolled_speakers.pt")
32
- emb_parser.add_argument("--out", default="speakers.json", help="Output .json file for browser")
33
-
34
- # ---- combine ----
35
- comb_parser = subparsers.add_parser("combine", help="Combine individual .pt files into enrolled_speakers.pt")
36
- comb_parser.add_argument("--folder", required=True, help="Folder with individual .pt files")
37
- comb_parser.add_argument("--out", required=True, help="Output .pt file path")
24
+ # ---- rebuild ----
25
+ rebuild_cmd = subparsers.add_parser("rebuild", help="Rebuild embeddings")
26
+ rebuild_cmd.add_argument("--name", help="Name of the speaker to rebuild (leave empty to rebuild all)", default=None)
38
27
 
39
28
  # ---- Parse arguments ----
40
29
  args = parser.parse_args()
@@ -46,10 +35,8 @@ def main():
46
35
  os.environ["PYTHONWARNINGS"] = "ignore"
47
36
 
48
37
  # ---- Import modules after filtering warnings ----
49
- from .core import enroll_speaker, identify_speaker, list_speakers
50
- from .export_model import export_model_to_onnx
51
- from .export_embeddings import export_embeddings_to_json
52
- from .combine import combine_embeddings_from_folder
38
+ from .core import enroll_speaker, identify_speaker, list_speakers, rebuild_embedding
39
+ from .utils.analyze import rebuild_all_embeddings
53
40
 
54
41
  # ---- Command Dispatch ----
55
42
  if args.command == "enroll":
@@ -69,14 +56,13 @@ def main():
69
56
  else:
70
57
  print("⚠️ No speakers enrolled yet.")
71
58
 
72
- elif args.command == "export-model":
73
- export_model_to_onnx(args.pt, args.out)
74
-
75
- elif args.command == "export-speaker-json":
76
- export_embeddings_to_json(args.pt, args.out)
77
-
78
- elif args.command == "combine":
79
- combine_embeddings_from_folder(args.folder, args.out)
59
+ elif args.command == "rebuild":
60
+ if args.name:
61
+ rebuild_embedding(args.name)
62
+ print(f"🔁 Rebuilt: {args.name}")
63
+ else:
64
+ rebuild_all_embeddings()
65
+ print("🔁 Rebuilt all embeddings.")
80
66
 
81
67
  else:
82
68
  parser.print_help()
@@ -0,0 +1,116 @@
1
+ # core.py
2
+
3
+ from pathlib import Path
4
+ import torch
5
+ import torchaudio
6
+ from speechbrain.inference import SpeakerRecognition
7
+
8
+ # ── DIRECTORIES ──────────────────────────────────────────────────────────────
9
+ BASE_DIR = Path(__file__).resolve().parent.parent / "storage"
10
+ SPEAKER_AUDIO_DIR = BASE_DIR / "speakers"
11
+ EMBEDDINGS_DIR = BASE_DIR / "embeddings"
12
+ NOISE_DIR = BASE_DIR / "background_noise"
13
+
14
+ SPEAKER_AUDIO_DIR.mkdir(parents=True, exist_ok=True)
15
+ EMBEDDINGS_DIR.mkdir(parents=True, exist_ok=True)
16
+ NOISE_DIR.mkdir(parents=True, exist_ok=True)
17
+
18
+ # ── MODEL LOADING ────────────────────────────────────────────────────────────
19
+ MODEL = SpeakerRecognition.from_hparams(
20
+ source="speechbrain/spkrec-ecapa-voxceleb",
21
+ savedir="model"
22
+ )
23
+
24
+ # ── EMBEDDING HELPERS ────────────────────────────────────────────────────────
25
def get_embedding(audio_path: str) -> torch.Tensor:
    """Compute a speaker embedding for a single audio file.

    The file is loaded with torchaudio, mixed down to one channel and
    resampled to 16 kHz before being handed to ``MODEL.encode_batch``.

    Args:
        audio_path: Path of the audio file to embed.

    Returns:
        The model output with singleton dimensions squeezed out, detached
        from the autograd graph and moved to the CPU.

    Raises:
        ValueError: If the loaded file contains no samples.
    """
    target_sr = 16000  # sample rate the spkrec-ecapa-voxceleb model expects

    signal, fs = torchaudio.load(audio_path)
    if signal.numel() == 0:
        raise ValueError(f"{audio_path} is empty.")

    # torchaudio.load returns (channels, samples). encode_batch interprets the
    # first axis as the batch, so a stereo file would produce one embedding
    # per channel and a 2-D result. Average the channels into one instead.
    if signal.dim() > 1 and signal.shape[0] > 1:
        signal = signal.mean(dim=0, keepdim=True)

    # Recordings at another sample rate would otherwise be embedded as if they
    # were 16 kHz, which skews the resulting embedding.
    if fs != target_sr:
        signal = torchaudio.functional.resample(signal, fs, target_sr)

    return MODEL.encode_batch(signal).squeeze().detach().cpu()
30
+
31
def average_embeddings(paths: list[str]) -> torch.Tensor:
    """Return the element-wise mean of the embeddings of several audio files.

    Args:
        paths: Audio file paths; each one is embedded with ``get_embedding``.

    Returns:
        The mean over the stacked per-file embeddings (reduced along dim 0).
    """
    stacked = torch.stack(list(map(get_embedding, paths)))
    return stacked.mean(dim=0)
34
+
35
+ # ── ENROLL / IMPROVE ─────────────────────────────────────────────────────────
36
def enroll_speaker(audio_path: str, speaker_id: str) -> None:
    """Store a new recording for a speaker and refresh the speaker's embedding.

    The audio is copied into the speaker's folder under ``SPEAKER_AUDIO_DIR``
    as ``<n>.wav`` and the embedding saved in ``EMBEDDINGS_DIR`` is recomputed
    as the average over every recording stored for that speaker.

    Args:
        audio_path: Path of the recording to enroll.
        speaker_id: Name of the speaker; used as folder and embedding file name.

    Raises:
        ValueError: If the recording contains no samples.
    """
    # Validate first so that a bad recording does not leave behind an empty
    # speaker folder (which list_speakers would then report as a speaker).
    waveform, sr = torchaudio.load(audio_path)
    if waveform.numel() == 0:
        raise ValueError("Cannot enroll empty audio file.")

    speaker_dir = SPEAKER_AUDIO_DIR / speaker_id
    speaker_dir.mkdir(parents=True, exist_ok=True)

    # Start from count + 1, then skip forward past any name already in use:
    # after a recording has been deleted, count + 1 may collide with an
    # existing file and would silently overwrite it.
    index = len(list(speaker_dir.glob("*.wav"))) + 1
    dest_path = speaker_dir / f"{index}.wav"
    while dest_path.exists():
        index += 1
        dest_path = speaker_dir / f"{index}.wav"
    torchaudio.save(str(dest_path), waveform, sr)

    # Average over all stored recordings rather than embedding only the newest
    # one; otherwise each additional enrollment discards the earlier samples
    # and the fresh .pt file hides the problem from speaker_needs_rebuild.
    recordings = sorted(str(p) for p in speaker_dir.glob("*.wav"))
    emb = average_embeddings(recordings)
    torch.save(emb, EMBEDDINGS_DIR / f"{speaker_id}.pt")
50
+
51
def rebuild_embedding(speaker_id: str) -> None:
    """Recompute one speaker's embedding from all of their stored recordings.

    Args:
        speaker_id: Name of the speaker folder under ``SPEAKER_AUDIO_DIR``.

    Raises:
        RuntimeError: If the speaker folder holds no ``*.wav`` files.
    """
    recordings = [str(wav) for wav in (SPEAKER_AUDIO_DIR / speaker_id).glob("*.wav")]
    if len(recordings) == 0:
        raise RuntimeError(f"No recordings for {speaker_id}.")

    target = EMBEDDINGS_DIR / f"{speaker_id}.pt"
    torch.save(average_embeddings(recordings), target)
58
+
59
+ # ── BACKGROUND NOISE MODELING ────────────────────────────────────────────────
60
def compute_background_embedding() -> None:
    """Average the embeddings of all background-noise samples and save the result.

    Reads every ``*.wav`` in ``NOISE_DIR`` and writes the averaged embedding to
    ``background_noise.pt`` inside ``EMBEDDINGS_DIR``.

    Raises:
        RuntimeError: If ``NOISE_DIR`` contains no ``*.wav`` files.
    """
    noise_files = list(map(str, NOISE_DIR.glob("*.wav")))
    if len(noise_files) == 0:
        raise RuntimeError("No background noise samples.")

    noise_emb = average_embeddings(noise_files)
    torch.save(noise_emb, EMBEDDINGS_DIR / "background_noise.pt")
66
+
67
+ # ── IDENTIFICATION ───────────────────────────────────────────────────────────
68
def identify_speaker(audio_path: str, threshold: float = 0.25) -> tuple[str, float]:
    """Match an audio file against every stored embedding.

    Each ``*.pt`` file in ``EMBEDDINGS_DIR`` is compared to the embedding of
    ``audio_path`` by cosine similarity. The best candidate is accepted when
    its score reaches ``threshold`` or, if at least two candidates were
    scored, when it beats the runner-up by more than 0.1.

    Args:
        audio_path: Path of the recording to identify.
        threshold: Minimum cosine similarity for an accepted match.

    Returns:
        ``(name, score)`` for an accepted match, ``("unknown", score)`` when
        the best candidate is rejected, ``("unknown", 0.0)`` when no embedding
        could be scored, and ``("error", 0.0)`` when the input could not be
        embedded. Scores are rounded to three decimals.
    """
    try:
        test_emb = get_embedding(audio_path)
    except Exception:
        return "error", 0.0

    scores = {}
    for emb_path in EMBEDDINGS_DIR.glob("*.pt"):
        try:
            # NOTE(security): torch.load unpickles the file; only embeddings
            # written by this package should be placed in EMBEDDINGS_DIR.
            emb = torch.load(emb_path, map_location="cpu")
            scores[emb_path.stem] = torch.nn.functional.cosine_similarity(
                emb, test_emb, dim=0
            ).item()
        except Exception:
            # Best effort: skip unreadable or shape-incompatible embeddings,
            # but let KeyboardInterrupt / SystemExit propagate.
            continue

    if not scores:
        return "unknown", 0.0

    ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
    best, best_score = ranked[0]

    # The margin rule only makes sense with a real runner-up. With a single
    # candidate, comparing against an implicit 0.0 would accept any score
    # above 0.1 and silently bypass the threshold.
    if len(ranked) > 1:
        clear_margin = (best_score - ranked[1][1]) > 0.1
    else:
        clear_margin = False

    if clear_margin or best_score >= threshold:
        return best, round(best_score, 3)
    return "unknown", round(best_score, 3)
94
+
95
+ # ── REBUILD CHECKING ─────────────────────────────────────────────────────────
96
def list_speakers() -> list[str]:
    """Return the names of all sub-directories of ``SPEAKER_AUDIO_DIR``."""
    names = []
    for entry in SPEAKER_AUDIO_DIR.iterdir():
        if entry.is_dir():
            names.append(entry.name)
    return names
98
+
99
def speaker_needs_rebuild(speaker_id: str) -> bool:
    """Tell whether a speaker's stored embedding is missing or out of date.

    Args:
        speaker_id: Name of the speaker folder under ``SPEAKER_AUDIO_DIR``.

    Returns:
        True when no embedding file exists for the speaker, or when any of the
        speaker's ``*.wav`` files was modified after the embedding file;
        False otherwise.
    """
    emb_file = EMBEDDINGS_DIR / f"{speaker_id}.pt"
    if not emb_file.exists():
        return True

    cutoff = emb_file.stat().st_mtime
    recordings = (SPEAKER_AUDIO_DIR / speaker_id).glob("*.wav")
    return any(rec.stat().st_mtime > cutoff for rec in recordings)
109
+
110
def get_speakers_needing_rebuild() -> list[str]:
    """Return every listed speaker for which ``speaker_needs_rebuild`` is true."""
    return list(filter(speaker_needs_rebuild, list_speakers()))
112
+
113
+
114
+
115
+ # ── ALIAS FOR COMPATIBILITY ──────────────────────────────────────────────────
116
+ rebuild_embeddings_for_speaker = rebuild_embedding
@@ -0,0 +1,58 @@
1
+ # ############################################################################
2
+ # Model: ECAPA big for Speaker verification
3
+ # ############################################################################
4
+
5
+ # Feature parameters
6
+ n_mels: 80
7
+
8
+ # Pretrain folder (HuggingFace)
9
+ pretrained_path: speechbrain/spkrec-ecapa-voxceleb
10
+
11
+ # Output parameters
12
+ out_n_neurons: 7205
13
+
14
+ # Model params
15
+ compute_features: !new:speechbrain.lobes.features.Fbank
16
+ n_mels: !ref <n_mels>
17
+
18
+ mean_var_norm: !new:speechbrain.processing.features.InputNormalization
19
+ norm_type: sentence
20
+ std_norm: False
21
+
22
+ embedding_model: !new:speechbrain.lobes.models.ECAPA_TDNN.ECAPA_TDNN
23
+ input_size: !ref <n_mels>
24
+ channels: [1024, 1024, 1024, 1024, 3072]
25
+ kernel_sizes: [5, 3, 3, 3, 1]
26
+ dilations: [1, 2, 3, 4, 1]
27
+ attention_channels: 128
28
+ lin_neurons: 192
29
+
30
+ classifier: !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier
31
+ input_size: 192
32
+ out_neurons: !ref <out_n_neurons>
33
+
34
+ mean_var_norm_emb: !new:speechbrain.processing.features.InputNormalization
35
+ norm_type: global
36
+ std_norm: False
37
+
38
+ modules:
39
+ compute_features: !ref <compute_features>
40
+ mean_var_norm: !ref <mean_var_norm>
41
+ embedding_model: !ref <embedding_model>
42
+ mean_var_norm_emb: !ref <mean_var_norm_emb>
43
+ classifier: !ref <classifier>
44
+
45
+ label_encoder: !new:speechbrain.dataio.encoder.CategoricalEncoder
46
+
47
+
48
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
49
+ loadables:
50
+ embedding_model: !ref <embedding_model>
51
+ mean_var_norm_emb: !ref <mean_var_norm_emb>
52
+ classifier: !ref <classifier>
53
+ label_encoder: !ref <label_encoder>
54
+ paths:
55
+ embedding_model: !ref <pretrained_path>/embedding_model.ckpt
56
+ mean_var_norm_emb: !ref <pretrained_path>/mean_var_norm_emb.ckpt
57
+ classifier: !ref <pretrained_path>/classifier.ckpt
58
+ label_encoder: !ref <pretrained_path>/label_encoder.txt