speaker-detector 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. speaker_detector/cli.py +12 -26
  2. speaker_detector/core.py +78 -65
  3. speaker_detector/model/classifier.ckpt +0 -0
  4. speaker_detector/model/embedding_model.ckpt +0 -0
  5. speaker_detector/model/hyperparams.yaml +58 -0
  6. speaker_detector/model/label_encoder.ckpt +7207 -0
  7. speaker_detector/model/mean_var_norm_emb.ckpt +0 -0
  8. speaker_detector/server copy.py +296 -0
  9. speaker_detector/server.py +82 -0
  10. speaker_detector/state.py +69 -0
  11. speaker_detector/web/static/favicon.ico +0 -0
  12. speaker_detector/web/static/index.html +29 -0
  13. speaker_detector/web/static/scripts/loader copy.js +10 -0
  14. speaker_detector/web/static/scripts/loader.js +14 -0
  15. speaker_detector/web/static/scripts/script copy.js +954 -0
  16. speaker_detector/web/static/scripts/script.js +22 -0
  17. speaker_detector/web/static/style.css +133 -0
  18. {speaker_detector-0.1.4.dist-info → speaker_detector-0.1.6.dist-info}/METADATA +28 -3
  19. speaker_detector-0.1.6.dist-info/RECORD +25 -0
  20. {speaker_detector-0.1.4.dist-info → speaker_detector-0.1.6.dist-info}/WHEEL +1 -1
  21. speaker_detector/analyze.py +0 -59
  22. speaker_detector/combine.py +0 -22
  23. speaker_detector/export_embeddings.py +0 -41
  24. speaker_detector/export_model.py +0 -40
  25. speaker_detector/generate_summary.py +0 -110
  26. speaker_detector-0.1.4.dist-info/RECORD +0 -15
  27. /speaker_detector/{ECAPA_TDNN.py → model/ECAPA_TDNN.py} +0 -0
  28. /speaker_detector/{__init__.py → web/static/__init__.py} +0 -0
  29. {speaker_detector-0.1.4.dist-info → speaker_detector-0.1.6.dist-info}/entry_points.txt +0 -0
  30. {speaker_detector-0.1.4.dist-info → speaker_detector-0.1.6.dist-info}/top_level.txt +0 -0
speaker_detector/cli.py CHANGED
@@ -21,20 +21,9 @@ def main():
     # ---- list-speakers ----
     subparsers.add_parser("list-speakers", help="List enrolled speakers")
 
-    # ---- export-model ----
-    model_parser = subparsers.add_parser("export-model", help="Export ECAPA model to ONNX")
-    model_parser.add_argument("--pt", required=True, help="Path to embedding_model.ckpt")
-    model_parser.add_argument("--out", default="speaker_embedding.onnx", help="Output ONNX file")
-
-    # ---- export-speaker-json ----
-    emb_parser = subparsers.add_parser("export-speaker-json", help="Convert enrolled .pt file to browser-friendly .json")
-    emb_parser.add_argument("--pt", required=True, help="Path to enrolled_speakers.pt")
-    emb_parser.add_argument("--out", default="speakers.json", help="Output .json file for browser")
-
-    # ---- combine ----
-    comb_parser = subparsers.add_parser("combine", help="Combine individual .pt files into enrolled_speakers.pt")
-    comb_parser.add_argument("--folder", required=True, help="Folder with individual .pt files")
-    comb_parser.add_argument("--out", required=True, help="Output .pt file path")
+    # ---- rebuild ----
+    rebuild_cmd = subparsers.add_parser("rebuild", help="Rebuild embeddings")
+    rebuild_cmd.add_argument("--name", help="Name of the speaker to rebuild (leave empty to rebuild all)", default=None)
 
     # ---- Parse arguments ----
     args = parser.parse_args()
@@ -46,10 +35,8 @@ def main():
     os.environ["PYTHONWARNINGS"] = "ignore"
 
     # ---- Import modules after filtering warnings ----
-    from .core import enroll_speaker, identify_speaker, list_speakers
-    from .export_model import export_model_to_onnx
-    from .export_embeddings import export_embeddings_to_json
-    from .combine import combine_embeddings_from_folder
+    from .core import enroll_speaker, identify_speaker, list_speakers, rebuild_embedding
+    from .utils.analyze import rebuild_all_embeddings
 
     # ---- Command Dispatch ----
     if args.command == "enroll":
@@ -69,14 +56,13 @@ def main():
         else:
             print("⚠️ No speakers enrolled yet.")
 
-    elif args.command == "export-model":
-        export_model_to_onnx(args.pt, args.out)
-
-    elif args.command == "export-speaker-json":
-        export_embeddings_to_json(args.pt, args.out)
-
-    elif args.command == "combine":
-        combine_embeddings_from_folder(args.folder, args.out)
+    elif args.command == "rebuild":
+        if args.name:
+            rebuild_embedding(args.name)
+            print(f"🔁 Rebuilt: {args.name}")
+        else:
+            rebuild_all_embeddings()
+            print("🔁 Rebuilt all embeddings.")
 
     else:
         parser.print_help()
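
The net effect on the CLI surface: the export-model, export-speaker-json and combine subcommands are gone, replaced by a single rebuild subcommand. Below is a minimal sketch of the library calls that command now dispatches to; the speaker name "alice" is a hypothetical example, and note that rebuild_all_embeddings is imported from .utils.analyze, a module that does not appear in the files-changed list above.

# Hypothetical use of the functions behind the new "rebuild" command.
from speaker_detector.core import rebuild_embedding

rebuild_embedding("alice")   # re-averages every saved .wav for the enrolled speaker "alice"

# `speaker-detector rebuild` with no --name instead calls rebuild_all_embeddings()
# from speaker_detector.utils.analyze, per the new import in cli.py.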
speaker_detector/core.py CHANGED
@@ -1,103 +1,116 @@
-from speechbrain.pretrained import SpeakerRecognition
+# core.py
+
 from pathlib import Path
-import torchaudio
 import torch
+import torchaudio
+from speechbrain.inference import SpeakerRecognition
 
-# Storage directories
+# ── DIRECTORIES ──────────────────────────────────────────────────────────────
 BASE_DIR = Path(__file__).resolve().parent.parent / "storage"
 SPEAKER_AUDIO_DIR = BASE_DIR / "speakers"
 EMBEDDINGS_DIR = BASE_DIR / "embeddings"
+NOISE_DIR = BASE_DIR / "background_noise"
 
-# Ensure they exist
 SPEAKER_AUDIO_DIR.mkdir(parents=True, exist_ok=True)
 EMBEDDINGS_DIR.mkdir(parents=True, exist_ok=True)
+NOISE_DIR.mkdir(parents=True, exist_ok=True)
 
-# Load model once
+# ── MODEL LOADING ────────────────────────────────────────────────────────────
 MODEL = SpeakerRecognition.from_hparams(
-    source="speechbrain/spkrec-ecapa-voxceleb", savedir="model"
+    source="speechbrain/spkrec-ecapa-voxceleb",
+    savedir="model"
 )
 
-def get_embedding(audio_path):
-    try:
-        signal, fs = torchaudio.load(audio_path)
-        if signal.numel() == 0:
-            raise ValueError(f"{audio_path} is empty.")
-        return MODEL.encode_batch(signal).squeeze().detach().cpu()
-    except Exception as e:
-        raise RuntimeError(f"Failed to embed {audio_path}: {e}")
-
-def enroll_speaker(audio_path, speaker_id):
+# ── EMBEDDING HELPERS ────────────────────────────────────────────────────────
+def get_embedding(audio_path: str) -> torch.Tensor:
+    signal, fs = torchaudio.load(audio_path)
+    if signal.numel() == 0:
+        raise ValueError(f"{audio_path} is empty.")
+    return MODEL.encode_batch(signal).squeeze().detach().cpu()
+
+def average_embeddings(paths: list[str]) -> torch.Tensor:
+    embeddings = [get_embedding(p) for p in paths]
+    return torch.stack(embeddings).mean(dim=0)
+
+# ── ENROLL / IMPROVE ─────────────────────────────────────────────────────────
+def enroll_speaker(audio_path: str, speaker_id: str) -> None:
     speaker_dir = SPEAKER_AUDIO_DIR / speaker_id
     speaker_dir.mkdir(parents=True, exist_ok=True)
 
-    # Save audio sample
     existing = list(speaker_dir.glob("*.wav"))
-    new_index = len(existing) + 1
-    dest_path = speaker_dir / f"{new_index}.wav"
+    dest_path = speaker_dir / f"{len(existing)+1}.wav"
 
-    waveform, sample_rate = torchaudio.load(audio_path)
+    waveform, sr = torchaudio.load(audio_path)
     if waveform.numel() == 0:
        raise ValueError("Cannot enroll empty audio file.")
+    torchaudio.save(str(dest_path), waveform, sr)
 
-    torchaudio.save(str(dest_path), waveform, sample_rate)
-    print(f"🎙 Saved {speaker_id}'s recording #{new_index} → {dest_path}")
-
-    # Save embedding
     emb = get_embedding(audio_path)
-    emb_path = EMBEDDINGS_DIR / f"{speaker_id}.pt"
-    torch.save(emb, emb_path)
-    print(f"🧠 Saved embedding for {speaker_id} → {emb_path}")
+    torch.save(emb, EMBEDDINGS_DIR / f"{speaker_id}.pt")
 
-def identify_speaker(audio_path, threshold=0.25):
+def rebuild_embedding(speaker_id: str) -> None:
+    speaker_dir = SPEAKER_AUDIO_DIR / speaker_id
+    wavs = list(speaker_dir.glob("*.wav"))
+    if not wavs:
+        raise RuntimeError(f"No recordings for {speaker_id}.")
+    emb = average_embeddings([str(w) for w in wavs])
+    torch.save(emb, EMBEDDINGS_DIR / f"{speaker_id}.pt")
+
+# ── BACKGROUND NOISE MODELING ────────────────────────────────────────────────
+def compute_background_embedding() -> None:
+    paths = [str(p) for p in NOISE_DIR.glob("*.wav")]
+    if not paths:
+        raise RuntimeError("No background noise samples.")
+    emb = average_embeddings(paths)
+    torch.save(emb, EMBEDDINGS_DIR / "background_noise.pt")
+
+# ── IDENTIFICATION ───────────────────────────────────────────────────────────
+def identify_speaker(audio_path: str, threshold: float = 0.25) -> tuple[str, float]:
     try:
         test_emb = get_embedding(audio_path)
-    except Exception as e:
-        return {"speaker": "error", "score": 0, "error": str(e)}
+    except Exception:
+        return "error", 0.0
 
     scores = {}
     for emb_path in EMBEDDINGS_DIR.glob("*.pt"):
-        speaker_name = emb_path.stem
+        name = emb_path.stem
        try:
-            enrolled_emb = torch.load(emb_path)
-            score = torch.nn.functional.cosine_similarity(enrolled_emb, test_emb, dim=0).item()
-            scores[speaker_name] = score
-        except Exception as e:
+            emb = torch.load(emb_path)
+            score = torch.nn.functional.cosine_similarity(emb, test_emb, dim=0).item()
+            scores[name] = score
+        except:
            continue
 
     if not scores:
-        return {"speaker": "unknown", "score": 0}
+        return "unknown", 0.0
 
     sorted_scores = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
-    best, second = sorted_scores[0], sorted_scores[1] if len(sorted_scores) > 1 else (None, None)
-    auto_thresh = best[1] - (second[1] if second else 0) > 0.1
-    is_match = auto_thresh or best[1] >= threshold
-
-    result = {
-        "speaker": best[0] if is_match else "unknown",
-        "score": round(best[1], 3),
-        "all_scores": {k: round(v, 3) for k, v in sorted_scores}
-    }
-    return result
-
-def list_speakers():
-    speakers = []
-    for dir in SPEAKER_AUDIO_DIR.iterdir():
-        if dir.is_dir():
-            count = len(list(dir.glob("*.wav")))
-            speakers.append(f"{dir.name} ({count} recording{'s' if count != 1 else ''})")
-    print(f"📋 Found {len(speakers)} enrolled speaker(s): {speakers}")
-    return [s.split()[0] for s in speakers]
-
-def rebuild_embedding(speaker_id):
-    speaker_dir = SPEAKER_AUDIO_DIR / speaker_id
-    wavs = list(speaker_dir.glob("*.wav"))
+    best, best_score = sorted_scores[0]
+    second_score = sorted_scores[1][1] if len(sorted_scores) > 1 else 0.0
+    auto_thresh = (best_score - second_score) > 0.1
+    match = auto_thresh or best_score >= threshold
 
-    if not wavs:
-        raise RuntimeError(f"No recordings found for {speaker_id}.")
+    return (best, round(best_score, 3)) if match else ("unknown", round(best_score, 3))
 
-    embeddings = [get_embedding(w) for w in wavs]
-    avg_emb = torch.stack(embeddings).mean(dim=0)
+# ── REBUILD CHECKING ─────────────────────────────────────────────────────────
+def list_speakers() -> list[str]:
+    return [p.name for p in SPEAKER_AUDIO_DIR.iterdir() if p.is_dir()]
 
+def speaker_needs_rebuild(speaker_id: str) -> bool:
+    speaker_dir = SPEAKER_AUDIO_DIR / speaker_id
     emb_path = EMBEDDINGS_DIR / f"{speaker_id}.pt"
-    torch.save(avg_emb, emb_path)
-    print(f"🔁 Rebuilt embedding for {speaker_id}")
+    if not emb_path.exists():
+        return True
+    emb_mtime = emb_path.stat().st_mtime
+    for wav in speaker_dir.glob("*.wav"):
+        if wav.stat().st_mtime > emb_mtime:
+            return True
+    return False
+
+def get_speakers_needing_rebuild() -> list[str]:
+    return [s for s in list_speakers() if speaker_needs_rebuild(s)]
+
+
+
+# ── ALIAS FOR COMPATIBILITY ──────────────────────────────────────────────────
+rebuild_embeddings_for_speaker = rebuild_embedding
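
core.py's public API changes shape in 0.1.6: identify_speaker() now returns a (name, score) tuple instead of the 0.1.4 dict with "speaker", "score" and "all_scores" keys, and list_speakers() returns bare folder names rather than printing recording counts. A minimal caller sketch against the new signatures; "sample.wav" and the scores shown are placeholders:

from speaker_detector.core import identify_speaker, list_speakers

print(list_speakers())                        # e.g. ["alice", "bob"]
name, score = identify_speaker("sample.wav")  # e.g. ("alice", 0.431) or ("unknown", 0.187)
if name not in ("unknown", "error"):          # "error" is returned when embedding the clip fails
    print(f"best match: {name} ({score})")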
Binary file
speaker_detector/model/hyperparams.yaml ADDED
@@ -0,0 +1,58 @@
+# ############################################################################
+# Model: ECAPA big for Speaker verification
+# ############################################################################
+
+# Feature parameters
+n_mels: 80
+
+# Pretrain folder (HuggingFace)
+pretrained_path: speechbrain/spkrec-ecapa-voxceleb
+
+# Output parameters
+out_n_neurons: 7205
+
+# Model params
+compute_features: !new:speechbrain.lobes.features.Fbank
+    n_mels: !ref <n_mels>
+
+mean_var_norm: !new:speechbrain.processing.features.InputNormalization
+    norm_type: sentence
+    std_norm: False
+
+embedding_model: !new:speechbrain.lobes.models.ECAPA_TDNN.ECAPA_TDNN
+    input_size: !ref <n_mels>
+    channels: [1024, 1024, 1024, 1024, 3072]
+    kernel_sizes: [5, 3, 3, 3, 1]
+    dilations: [1, 2, 3, 4, 1]
+    attention_channels: 128
+    lin_neurons: 192
+
+classifier: !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier
+    input_size: 192
+    out_neurons: !ref <out_n_neurons>
+
+mean_var_norm_emb: !new:speechbrain.processing.features.InputNormalization
+    norm_type: global
+    std_norm: False
+
+modules:
+    compute_features: !ref <compute_features>
+    mean_var_norm: !ref <mean_var_norm>
+    embedding_model: !ref <embedding_model>
+    mean_var_norm_emb: !ref <mean_var_norm_emb>
+    classifier: !ref <classifier>
+
+label_encoder: !new:speechbrain.dataio.encoder.CategoricalEncoder
+
+
+pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
+    loadables:
+        embedding_model: !ref <embedding_model>
+        mean_var_norm_emb: !ref <mean_var_norm_emb>
+        classifier: !ref <classifier>
+        label_encoder: !ref <label_encoder>
+    paths:
+        embedding_model: !ref <pretrained_path>/embedding_model.ckpt
+        mean_var_norm_emb: !ref <pretrained_path>/mean_var_norm_emb.ckpt
+        classifier: !ref <pretrained_path>/classifier.ckpt
+        label_encoder: !ref <pretrained_path>/label_encoder.txt
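
The bundled hyperparams.yaml mirrors the upstream spkrec-ecapa-voxceleb recipe: 80-mel filterbank features feed an ECAPA-TDNN that emits 192-dimensional speaker embeddings (lin_neurons), with a 7205-way classifier head retained from VoxCeleb training. A quick sketch to confirm the embedding size that core.py's cosine-similarity matching relies on; the random tensor is a stand-in for one second of 16 kHz audio, and savedir follows the value used in core.py:

import torch
from speechbrain.inference import SpeakerRecognition

model = SpeakerRecognition.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb", savedir="model"
)
emb = model.encode_batch(torch.rand(1, 16000))  # [batch, time] waveform stand-in
print(emb.shape)  # expected: torch.Size([1, 1, 192])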