speaker-detector 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- speaker_detector/cli.py +12 -26
- speaker_detector/core.py +78 -65
- speaker_detector/model/classifier.ckpt +0 -0
- speaker_detector/model/embedding_model.ckpt +0 -0
- speaker_detector/model/hyperparams.yaml +58 -0
- speaker_detector/model/label_encoder.ckpt +7207 -0
- speaker_detector/model/mean_var_norm_emb.ckpt +0 -0
- speaker_detector/server copy.py +296 -0
- speaker_detector/server.py +82 -0
- speaker_detector/state.py +69 -0
- speaker_detector/web/static/favicon.ico +0 -0
- speaker_detector/web/static/index.html +29 -0
- speaker_detector/web/static/scripts/loader copy.js +10 -0
- speaker_detector/web/static/scripts/loader.js +14 -0
- speaker_detector/web/static/scripts/script copy.js +954 -0
- speaker_detector/web/static/scripts/script.js +22 -0
- speaker_detector/web/static/style.css +133 -0
- {speaker_detector-0.1.4.dist-info → speaker_detector-0.1.6.dist-info}/METADATA +28 -3
- speaker_detector-0.1.6.dist-info/RECORD +25 -0
- {speaker_detector-0.1.4.dist-info → speaker_detector-0.1.6.dist-info}/WHEEL +1 -1
- speaker_detector/analyze.py +0 -59
- speaker_detector/combine.py +0 -22
- speaker_detector/export_embeddings.py +0 -41
- speaker_detector/export_model.py +0 -40
- speaker_detector/generate_summary.py +0 -110
- speaker_detector-0.1.4.dist-info/RECORD +0 -15
- /speaker_detector/{ECAPA_TDNN.py → model/ECAPA_TDNN.py} +0 -0
- /speaker_detector/{__init__.py → web/static/__init__.py} +0 -0
- {speaker_detector-0.1.4.dist-info → speaker_detector-0.1.6.dist-info}/entry_points.txt +0 -0
- {speaker_detector-0.1.4.dist-info → speaker_detector-0.1.6.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,22 @@
|
|
1
|
+
import { setupAccordionNav } from "/static/components/accordion-nav/accordion-nav.js";
|
2
|
+
import { setupMicTest } from "/static/components/mic-test/mic-test.js";
|
3
|
+
import { setupEnrollSpeaker } from "/static/components/enroll-speaker/enroll-speaker.js";
|
4
|
+
import { setupIdentifySpeaker } from "/static/components/identify-speaker/identify-speaker.js";
|
5
|
+
import { setupMeetingMode } from "/static/components/meeting-mode/meeting-mode.js";
|
6
|
+
import { setupRecordingsTab } from "/static/components/recordings-tab/recordings-tab.js";
|
7
|
+
import { setupSpeakersList } from "/static/components/speakers-list/speakers-list.js";
|
8
|
+
import { setupCorrection } from "/static/components/correction/correction.js";
|
9
|
+
import { setupMicPopup } from "/static/components/mic-popup/mic-popup.js";
|
10
|
+
|
11
|
+
// ✅ Export setup block so loader can run it later
|
12
|
+
export function runSetup() {
|
13
|
+
setupAccordionNav();
|
14
|
+
setupMicTest();
|
15
|
+
setupEnrollSpeaker();
|
16
|
+
setupIdentifySpeaker();
|
17
|
+
setupMeetingMode();
|
18
|
+
setupRecordingsTab();
|
19
|
+
|
20
|
+
setupCorrection();
|
21
|
+
setupMicPopup();
|
22
|
+
}
|
@@ -0,0 +1,133 @@
|
|
1
|
+
body {
|
2
|
+
font-family: sans-serif;
|
3
|
+
margin: 0;
|
4
|
+
padding: 0;
|
5
|
+
background: #f5f5f5;
|
6
|
+
}
|
7
|
+
|
8
|
+
.header {
|
9
|
+
background: #007acc;
|
10
|
+
color: white;
|
11
|
+
padding: 1rem;
|
12
|
+
text-align: center;
|
13
|
+
}
|
14
|
+
|
15
|
+
.workflow-accordion {
|
16
|
+
display: flex;
|
17
|
+
gap: 10px;
|
18
|
+
padding: 1rem;
|
19
|
+
background: white;
|
20
|
+
border-bottom: 1px solid #ddd;
|
21
|
+
flex-wrap: wrap;
|
22
|
+
}
|
23
|
+
|
24
|
+
.accordion-step {
|
25
|
+
flex: 1;
|
26
|
+
background: #e0e0e0;
|
27
|
+
padding: 10px;
|
28
|
+
border-radius: 8px;
|
29
|
+
text-align: center;
|
30
|
+
cursor: pointer;
|
31
|
+
transition: background 0.2s ease, color 0.2s ease;
|
32
|
+
}
|
33
|
+
|
34
|
+
.accordion-step.active {
|
35
|
+
background: #007acc;
|
36
|
+
color: white;
|
37
|
+
}
|
38
|
+
|
39
|
+
.accordion-content {
|
40
|
+
display: none;
|
41
|
+
padding: 1rem;
|
42
|
+
background: white;
|
43
|
+
border-bottom: 1px solid #ddd;
|
44
|
+
}
|
45
|
+
|
46
|
+
.accordion-content.active {
|
47
|
+
display: block;
|
48
|
+
}
|
49
|
+
|
50
|
+
button {
|
51
|
+
padding: 0.5rem 1rem;
|
52
|
+
margin-top: 0.5rem;
|
53
|
+
background: #007acc;
|
54
|
+
color: white;
|
55
|
+
border: none;
|
56
|
+
border-radius: 5px;
|
57
|
+
cursor: pointer;
|
58
|
+
transition: background 0.2s ease;
|
59
|
+
}
|
60
|
+
|
61
|
+
button:hover {
|
62
|
+
background: #005fa3;
|
63
|
+
}
|
64
|
+
|
65
|
+
.status {
|
66
|
+
margin-top: 0.5rem;
|
67
|
+
font-weight: bold;
|
68
|
+
color: #555;
|
69
|
+
}
|
70
|
+
|
71
|
+
canvas {
|
72
|
+
width: 100%;
|
73
|
+
height: 100px;
|
74
|
+
background: #111;
|
75
|
+
margin: 0.5rem 0;
|
76
|
+
}
|
77
|
+
|
78
|
+
ul {
|
79
|
+
list-style: none;
|
80
|
+
padding: 0;
|
81
|
+
}
|
82
|
+
|
83
|
+
ul li {
|
84
|
+
padding: 0.5rem 0;
|
85
|
+
border-bottom: 1px solid #eee;
|
86
|
+
}
|
87
|
+
|
88
|
+
input[type="text"] {
|
89
|
+
padding: 0.5rem;
|
90
|
+
width: 60%;
|
91
|
+
margin-right: 0.5rem;
|
92
|
+
border: 1px solid #ccc;
|
93
|
+
border-radius: 5px;
|
94
|
+
}
|
95
|
+
|
96
|
+
#timeline {
|
97
|
+
margin-top: 1rem;
|
98
|
+
}
|
99
|
+
|
100
|
+
#identify-result {
|
101
|
+
margin: 0.5rem 0;
|
102
|
+
}
|
103
|
+
|
104
|
+
@media (max-width: 768px) {
|
105
|
+
.workflow-accordion {
|
106
|
+
flex-direction: column;
|
107
|
+
}
|
108
|
+
|
109
|
+
.accordion-step {
|
110
|
+
flex: unset;
|
111
|
+
}
|
112
|
+
}
|
113
|
+
|
114
|
+
#loading-overlay {
|
115
|
+
position: fixed;
|
116
|
+
inset: 0;
|
117
|
+
background: #111;
|
118
|
+
color: white;
|
119
|
+
display: flex;
|
120
|
+
align-items: center;
|
121
|
+
justify-content: center;
|
122
|
+
font-size: 1.2rem;
|
123
|
+
z-index: 9999;
|
124
|
+
}
|
125
|
+
|
126
|
+
.loader {
|
127
|
+
animation: pulse 1.2s infinite ease-in-out;
|
128
|
+
}
|
129
|
+
|
130
|
+
@keyframes pulse {
|
131
|
+
0%, 100% { opacity: 0.4; }
|
132
|
+
50% { opacity: 1; }
|
133
|
+
}
|
@@ -1,7 +1,7 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: speaker-detector
|
3
|
-
Version: 0.1.4
|
4
|
-
Summary: A CLI tool for speaker enrollment and identification using SpeechBrain.
|
3
|
+
Version: 0.1.6
|
4
|
+
Summary: A CLI + Web tool for speaker enrollment and identification using SpeechBrain.
|
5
5
|
Author-email: Lara Whybrow <lara.whybrow@gmail.com>
|
6
6
|
License: MIT
|
7
7
|
Project-URL: Homepage, https://github.com/P0llen/speaker-detector
|
@@ -22,7 +22,14 @@ Description-Content-Type: text/markdown
|
|
22
22
|
Requires-Dist: torch
|
23
23
|
Requires-Dist: torchaudio
|
24
24
|
Requires-Dist: speechbrain
|
25
|
-
Requires-Dist:
|
25
|
+
Requires-Dist: flask
|
26
|
+
Requires-Dist: flask-cors
|
27
|
+
Requires-Dist: numpy
|
28
|
+
Requires-Dist: sounddevice
|
29
|
+
Requires-Dist: soundfile
|
30
|
+
Requires-Dist: pydub
|
31
|
+
|
32
|
+
Note: This project is still in development while I tune the system for the best-performing approach. Contributions are welcome — feel free to join me on the project.
|
26
33
|
|
27
34
|
# speaker-detector 🎙️
|
28
35
|
|
@@ -45,6 +52,12 @@ Install from [TestPyPI](https://test.pypi.org/):
|
|
45
52
|
|
46
53
|
```bash
|
47
54
|
pip install --index-url https://test.pypi.org/simple/ speaker-detector
|
55
|
+
|
56
|
+
If installation fails because of a stale requirements file (for example, on WSL Ubuntu), you may need to run: pip install --break-system-packages soundfile
|
57
|
+
|
58
|
+
If you are contributing and want to run server.py, launch this version as a module using the -m flag:
|
59
|
+
python3 -m speaker_detector.server
|
60
|
+
|
48
61
|
```
|
49
62
|
|
50
63
|
## 🚀 Usage
|
@@ -99,3 +112,15 @@ onnxruntime
|
|
99
112
|
|
100
113
|
|
101
114
|
NB: When pushing to Github, do not include any .identifier files.
|
115
|
+
|
116
|
+
You can manually clean up stale embeddings that don’t match any existing speaker folder with a quick script:
|
117
|
+
|
118
|
+
# Run inside your project root
|
119
|
+
cd storage/embeddings
|
120
|
+
for f in *.pt; do
|
121
|
+
speaker="${f%.pt}"
|
122
|
+
if [ ! -d "../speakers/$speaker" ]; then
|
123
|
+
echo "Deleting stale embedding: $f"
|
124
|
+
rm "$f"
|
125
|
+
fi
|
126
|
+
done
|
@@ -0,0 +1,25 @@
|
|
1
|
+
speaker_detector/__main__.py,sha256=EClCwCzb6h6YBpt0hrnG4h0mlNhNePyg_xBNNSVm1os,65
|
2
|
+
speaker_detector/cli.py,sha256=TxJhu3Pjhg41tkcu--aLtn0vZwBYyoVEef10zqSBzig,2619
|
3
|
+
speaker_detector/core.py,sha256=zct1lNfeGB6Y7WB1rqNVPKyMKXu2g0KCuMkpAiyTLi0,5267
|
4
|
+
speaker_detector/server copy.py,sha256=A1WplNK8yGe9AnEjrSRqHO-uJJsMwIlvEzDhgu72XyY,10723
|
5
|
+
speaker_detector/server.py,sha256=6AkZ2drrE-jfV3M-4jdzobontsVxrGSrmQPykpSWyeo,3516
|
6
|
+
speaker_detector/state.py,sha256=ikcTNsjBzbFNMWfpfDaWkWMOb01Y9wlzvCAu7ifQ30w,2761
|
7
|
+
speaker_detector/model/ECAPA_TDNN.py,sha256=KB5T-ye4c9ZWgTgn_SMH-T_-qYSEHQJJtf3xHjsfNPk,19024
|
8
|
+
speaker_detector/model/classifier.ckpt,sha256=_Z42NP5ovQpCfJXjVMDGdzdPYrP0NORbeFmZUNhg1TU,5534328
|
9
|
+
speaker_detector/model/embedding_model.ckpt,sha256=BXXLZIRea5oQ25vLdNWsMrMmuNyQNSZx00Xi7j0BJqI,83316686
|
10
|
+
speaker_detector/model/hyperparams.yaml,sha256=b3iFT6BLpZ52FDe3aiV106ul5QFt4-m2nwyaUHf7GkE,1919
|
11
|
+
speaker_detector/model/label_encoder.ckpt,sha256=4Tw6Fnu0ESaFZw7oltIOK1Za8Ws6TO6qhon6TSKtuLk,128619
|
12
|
+
speaker_detector/model/mean_var_norm_emb.ckpt,sha256=zXAiWwWze-ZPxaleJDldgEIx1D90suHlpRPbe2mzTDM,1921
|
13
|
+
speaker_detector/web/static/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
14
|
+
speaker_detector/web/static/favicon.ico,sha256=JzURAUfKDB96EFxNo00a0pAG0DQWY8v7YDiBJeRDn9g,496
|
15
|
+
speaker_detector/web/static/index.html,sha256=0Hmfzmyjqm_MyFyoNJY51EzLHFFamKXlFygGDy2TDX0,1218
|
16
|
+
speaker_detector/web/static/style.css,sha256=R5pGvJqh5j95o6RSmpEFDNQMxNM8yQ7ie7Hlms1HZ7U,2025
|
17
|
+
speaker_detector/web/static/scripts/loader copy.js,sha256=BwhTS_ulxb62cwF6qAk1ng-bMBM-29l1rQ7W7QFTYPk,333
|
18
|
+
speaker_detector/web/static/scripts/loader.js,sha256=OWgmKfZ0E7bKaVPR47Q5aA-JKFeD3741K_wS_lqphz4,503
|
19
|
+
speaker_detector/web/static/scripts/script copy.js,sha256=LLcKKjTjXEy9yj5e4gQhgIsItRLYAJe_9V7mppyM8Bc,31494
|
20
|
+
speaker_detector/web/static/scripts/script.js,sha256=UyHWk1HrkWW6ZyxQYRLOPbtCMLHcBbNudMfTJ9k3neA,1023
|
21
|
+
speaker_detector-0.1.6.dist-info/METADATA,sha256=WSOiZtAgOwze7mJFPETyqg7VtTOOHp4nQewa-GHUAlc,5414
|
22
|
+
speaker_detector-0.1.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
23
|
+
speaker_detector-0.1.6.dist-info/entry_points.txt,sha256=2B30ee2cTyeeA49x_TBURl53bDRiLWGK3NWhb9rlK3s,63
|
24
|
+
speaker_detector-0.1.6.dist-info/top_level.txt,sha256=PJ5rfvd3GAbzMbc7-Fwhtufjf6HxzzTiiHociOy7RiM,17
|
25
|
+
speaker_detector-0.1.6.dist-info/RECORD,,
|
speaker_detector/analyze.py
DELETED
@@ -1,59 +0,0 @@
|
|
1
|
-
from pathlib import Path
|
2
|
-
import torchaudio
|
3
|
-
import torch
|
4
|
-
from speaker_detector.core import get_embedding, STORAGE_DIR
|
5
|
-
|
6
|
-
CHUNK_DURATION = 2.5 # seconds
|
7
|
-
|
8
|
-
def match_speaker(embedding, speaker_embeddings):
|
9
|
-
scores = {}
|
10
|
-
for name, emb in speaker_embeddings.items():
|
11
|
-
score = torch.nn.functional.cosine_similarity(emb, embedding, dim=0).item()
|
12
|
-
scores[name] = score
|
13
|
-
if not scores:
|
14
|
-
return "unknown", 0.0
|
15
|
-
best = max(scores.items(), key=lambda kv: kv[1])
|
16
|
-
return best[0], round(best[1], 3)
|
17
|
-
|
18
|
-
def analyze_meeting(wav_path):
|
19
|
-
waveform, sample_rate = torchaudio.load(wav_path)
|
20
|
-
duration_sec = waveform.shape[1] / sample_rate
|
21
|
-
|
22
|
-
chunk_samples = int(CHUNK_DURATION * sample_rate)
|
23
|
-
num_chunks = int(waveform.shape[1] / chunk_samples)
|
24
|
-
|
25
|
-
# Load enrolled speaker embeddings
|
26
|
-
speaker_embeddings = {}
|
27
|
-
for spk_dir in STORAGE_DIR.iterdir():
|
28
|
-
if not spk_dir.is_dir():
|
29
|
-
continue
|
30
|
-
wavs = list(spk_dir.glob("*.wav"))
|
31
|
-
if not wavs:
|
32
|
-
continue
|
33
|
-
# Average multiple embeddings
|
34
|
-
embs = [get_embedding(str(wav)) for wav in wavs]
|
35
|
-
speaker_embeddings[spk_dir.name] = torch.stack(embs).mean(dim=0)
|
36
|
-
|
37
|
-
results = []
|
38
|
-
|
39
|
-
for i in range(num_chunks):
|
40
|
-
start_sample = i * chunk_samples
|
41
|
-
end_sample = start_sample + chunk_samples
|
42
|
-
chunk = waveform[:, start_sample:end_sample]
|
43
|
-
|
44
|
-
tmp_path = Path(wav_path).parent / f"tmp_chunk_{i}.wav"
|
45
|
-
torchaudio.save(str(tmp_path), chunk, sample_rate)
|
46
|
-
|
47
|
-
embedding = get_embedding(str(tmp_path))
|
48
|
-
speaker, score = match_speaker(embedding, speaker_embeddings)
|
49
|
-
|
50
|
-
results.append({
|
51
|
-
"start": round(i * CHUNK_DURATION, 2),
|
52
|
-
"end": round((i + 1) * CHUNK_DURATION, 2),
|
53
|
-
"speaker": speaker,
|
54
|
-
"score": score
|
55
|
-
})
|
56
|
-
|
57
|
-
tmp_path.unlink() # clean up
|
58
|
-
|
59
|
-
return results
|
speaker_detector/combine.py
DELETED
@@ -1,22 +0,0 @@
|
|
1
|
-
import torch
|
2
|
-
import os
|
3
|
-
|
4
|
-
def combine_embeddings_from_folder(folder_path, output_path):
|
5
|
-
speaker_data = {}
|
6
|
-
|
7
|
-
for fname in os.listdir(folder_path):
|
8
|
-
if fname.endswith(".pt"):
|
9
|
-
label = os.path.splitext(fname)[0]
|
10
|
-
fpath = os.path.join(folder_path, fname)
|
11
|
-
tensor = torch.load(fpath, map_location="cpu")
|
12
|
-
if not isinstance(tensor, torch.Tensor):
|
13
|
-
print(f"❌ Skipping {fname}: not a valid tensor")
|
14
|
-
continue
|
15
|
-
speaker_data[label] = tensor
|
16
|
-
|
17
|
-
if not speaker_data:
|
18
|
-
print("⚠️ No valid .pt files found.")
|
19
|
-
return
|
20
|
-
|
21
|
-
torch.save(speaker_data, output_path)
|
22
|
-
print(f"✅ Combined {len(speaker_data)} speakers into {output_path}")
|
@@ -1,41 +0,0 @@
|
|
1
|
-
import torch
|
2
|
-
import json
|
3
|
-
|
4
|
-
def export_embeddings_to_json(pt_path, json_path):
|
5
|
-
"""
|
6
|
-
Converts a .pt file containing speaker embeddings into a
|
7
|
-
JSON file for use in the browser frontend.
|
8
|
-
|
9
|
-
Expected input format:
|
10
|
-
{
|
11
|
-
"lara": tensor([...]),
|
12
|
-
"guest": tensor([...]),
|
13
|
-
...
|
14
|
-
}
|
15
|
-
|
16
|
-
Output format:
|
17
|
-
[
|
18
|
-
{ "label": "lara", "vector": [...] },
|
19
|
-
{ "label": "guest", "vector": [...] },
|
20
|
-
...
|
21
|
-
]
|
22
|
-
"""
|
23
|
-
data = torch.load(pt_path, map_location="cpu")
|
24
|
-
|
25
|
-
if not isinstance(data, dict):
|
26
|
-
raise ValueError("Expected a dict of {label: tensor} in the .pt file")
|
27
|
-
|
28
|
-
converted = []
|
29
|
-
for label, tensor in data.items():
|
30
|
-
if not isinstance(tensor, torch.Tensor):
|
31
|
-
print(f"⚠️ Skipping {label}: not a tensor")
|
32
|
-
continue
|
33
|
-
converted.append({
|
34
|
-
"label": label,
|
35
|
-
"vector": tensor.tolist()
|
36
|
-
})
|
37
|
-
|
38
|
-
with open(json_path, "w") as f:
|
39
|
-
json.dump(converted, f, indent=2)
|
40
|
-
|
41
|
-
print(f"✅ Exported {len(converted)} speaker embeddings to {json_path}")
|
speaker_detector/export_model.py
DELETED
@@ -1,40 +0,0 @@
|
|
1
|
-
import torch
|
2
|
-
from speechbrain.lobes.models.ECAPA_TDNN import ECAPA_TDNN
|
3
|
-
from collections import OrderedDict
|
4
|
-
|
5
|
-
def export_model_to_onnx(ckpt_path, out_path):
|
6
|
-
model = ECAPA_TDNN(
|
7
|
-
input_size=80,
|
8
|
-
channels=[1024, 1024, 1024, 1024, 3072],
|
9
|
-
kernel_sizes=[5, 3, 3, 3, 1],
|
10
|
-
dilations=[1, 2, 3, 4, 1],
|
11
|
-
attention_channels=128,
|
12
|
-
lin_neurons=192,
|
13
|
-
)
|
14
|
-
|
15
|
-
state_dict = torch.load(ckpt_path, map_location="cpu")
|
16
|
-
|
17
|
-
if "model" in state_dict:
|
18
|
-
state_dict = state_dict["model"]
|
19
|
-
|
20
|
-
new_state_dict = OrderedDict()
|
21
|
-
for k, v in state_dict.items():
|
22
|
-
if k.startswith("embedding_model."):
|
23
|
-
k = k[len("embedding_model."):]
|
24
|
-
new_state_dict[k] = v
|
25
|
-
|
26
|
-
model.load_state_dict(new_state_dict)
|
27
|
-
model.eval()
|
28
|
-
|
29
|
-
dummy_input = torch.randn(1, 200, 80)
|
30
|
-
torch.onnx.export(
|
31
|
-
model,
|
32
|
-
dummy_input,
|
33
|
-
out_path,
|
34
|
-
input_names=["features"],
|
35
|
-
output_names=["embedding"],
|
36
|
-
dynamic_axes={"features": {0: "batch", 1: "time"}},
|
37
|
-
opset_version=12,
|
38
|
-
)
|
39
|
-
|
40
|
-
print(f"✅ Exported ECAPA-TDNN to {out_path}")
|
@@ -1,110 +0,0 @@
|
|
1
|
-
import os
|
2
|
-
import torch
|
3
|
-
import torchaudio
|
4
|
-
import requests
|
5
|
-
from pathlib import Path
|
6
|
-
from pydub import AudioSegment
|
7
|
-
from dotenv import load_dotenv
|
8
|
-
from speaker_detector.core import get_embedding, STORAGE_DIR
|
9
|
-
|
10
|
-
load_dotenv()
|
11
|
-
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
12
|
-
|
13
|
-
CHUNK_DURATION = 8 # seconds
|
14
|
-
SCORE_THRESHOLD = 0.6
|
15
|
-
MIN_VALID_DURATION = 1.0 # seconds
|
16
|
-
WHISPER_API_URL = "https://api.openai.com/v1/audio/transcriptions"
|
17
|
-
|
18
|
-
def match_speaker(embedding, speaker_embeddings):
|
19
|
-
scores = {
|
20
|
-
name: torch.nn.functional.cosine_similarity(emb, embedding, dim=0).item()
|
21
|
-
for name, emb in speaker_embeddings.items()
|
22
|
-
}
|
23
|
-
if not scores:
|
24
|
-
return "unknown", 0.0
|
25
|
-
best = max(scores.items(), key=lambda kv: kv[1])
|
26
|
-
return best[0], round(best[1], 3)
|
27
|
-
|
28
|
-
def transcribe_full_audio(wav_path: Path) -> str:
|
29
|
-
try:
|
30
|
-
with open(wav_path, "rb") as f:
|
31
|
-
response = requests.post(
|
32
|
-
WHISPER_API_URL,
|
33
|
-
headers={"Authorization": f"Bearer {OPENAI_API_KEY}"},
|
34
|
-
files={"file": (wav_path.name, f, "audio/wav")},
|
35
|
-
data={
|
36
|
-
"model": "whisper-1",
|
37
|
-
"response_format": "json",
|
38
|
-
"temperature": 0.2,
|
39
|
-
"language": "en",
|
40
|
-
"prompt": "This is a meeting transcription.",
|
41
|
-
},
|
42
|
-
timeout=120
|
43
|
-
)
|
44
|
-
response.raise_for_status()
|
45
|
-
return response.json()["text"].strip()
|
46
|
-
except Exception as e:
|
47
|
-
print(f"❌ Whisper failed: {e}")
|
48
|
-
return ""
|
49
|
-
|
50
|
-
def is_valid_audio(path):
|
51
|
-
try:
|
52
|
-
waveform, sample_rate = torchaudio.load(str(path))
|
53
|
-
duration_sec = waveform.shape[1] / sample_rate
|
54
|
-
return duration_sec >= MIN_VALID_DURATION
|
55
|
-
except Exception:
|
56
|
-
return False
|
57
|
-
|
58
|
-
def generate_summary(meeting_dir: Path):
|
59
|
-
meeting_dir = meeting_dir.resolve()
|
60
|
-
chunk_files = sorted([
|
61
|
-
f for f in meeting_dir.iterdir()
|
62
|
-
if f.name.startswith("chunk_") and f.suffix == ".wav" and is_valid_audio(f)
|
63
|
-
])
|
64
|
-
|
65
|
-
if not chunk_files:
|
66
|
-
return {"warning": "No valid .wav chunks found in meeting folder.", "segments": []}
|
67
|
-
|
68
|
-
# Merge all chunks into one file
|
69
|
-
combined = AudioSegment.empty()
|
70
|
-
for f in chunk_files:
|
71
|
-
combined += AudioSegment.from_wav(f)
|
72
|
-
merged_path = meeting_dir / "combined.wav"
|
73
|
-
combined.export(merged_path, format="wav")
|
74
|
-
|
75
|
-
# Get full transcript
|
76
|
-
full_text = transcribe_full_audio(merged_path)
|
77
|
-
print("🧠 Full transcript:", full_text)
|
78
|
-
|
79
|
-
# Load speaker embeddings
|
80
|
-
speaker_embeddings = {}
|
81
|
-
for spk_dir in STORAGE_DIR.iterdir():
|
82
|
-
if spk_dir.is_dir():
|
83
|
-
wavs = [w for w in spk_dir.glob("*.wav") if is_valid_audio(w)]
|
84
|
-
if wavs:
|
85
|
-
embs = [get_embedding(str(w)) for w in wavs]
|
86
|
-
speaker_embeddings[spk_dir.name] = torch.stack(embs).mean(dim=0)
|
87
|
-
|
88
|
-
segments = []
|
89
|
-
total = len(chunk_files)
|
90
|
-
|
91
|
-
for idx, chunk in enumerate(chunk_files):
|
92
|
-
try:
|
93
|
-
emb = get_embedding(chunk)
|
94
|
-
speaker, score = match_speaker(emb, speaker_embeddings)
|
95
|
-
segment_text = f"[chunk {idx+1}]"
|
96
|
-
segments.append({
|
97
|
-
"timestamp": idx * CHUNK_DURATION,
|
98
|
-
"speaker": speaker if score >= SCORE_THRESHOLD else "unknown",
|
99
|
-
"score": round(score, 3),
|
100
|
-
"text": segment_text,
|
101
|
-
"progress": round((idx + 1) / total * 100)
|
102
|
-
})
|
103
|
-
except Exception as e:
|
104
|
-
print(f"❌ Failed on {chunk.name}: {e}")
|
105
|
-
|
106
|
-
return {
|
107
|
-
"transcript": full_text,
|
108
|
-
"segments": segments if segments else [],
|
109
|
-
"warning": None if segments else "No speaker segments found."
|
110
|
-
}
|
@@ -1,15 +0,0 @@
|
|
1
|
-
speaker_detector/ECAPA_TDNN.py,sha256=KB5T-ye4c9ZWgTgn_SMH-T_-qYSEHQJJtf3xHjsfNPk,19024
|
2
|
-
speaker_detector/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
|
-
speaker_detector/__main__.py,sha256=EClCwCzb6h6YBpt0hrnG4h0mlNhNePyg_xBNNSVm1os,65
|
4
|
-
speaker_detector/analyze.py,sha256=sA8qyzczdHUbJw2_1JIbXn1WpiKC5dHLPRtPPoppJzY,1943
|
5
|
-
speaker_detector/cli.py,sha256=TKci4o4Fru-3NqUkPDRQRvtis2niNEAh9sQWwE5t6Us,3521
|
6
|
-
speaker_detector/combine.py,sha256=yCiqG6VMojz0CxSTPqjx0RrUban8oFIcKlA1zFMzaU4,761
|
7
|
-
speaker_detector/core.py,sha256=lQNOcmZs2IJOqrNKlk1BeVQX6tzc7BSpeP5Gordff-E,3586
|
8
|
-
speaker_detector/export_embeddings.py,sha256=OxNXadzEiMEJgpmCG6HHFncUX7DumFvTOys1R6UMUnw,1151
|
9
|
-
speaker_detector/export_model.py,sha256=qVVT2wSCnsPA8pSAEEyIMkY7Kc8uAgepc03MxBMT3xU,1146
|
10
|
-
speaker_detector/generate_summary.py,sha256=oTWEf2bxTCRIUl8L17-J64FyhRbCPnDjihFluEnBWc8,3726
|
11
|
-
speaker_detector-0.1.4.dist-info/METADATA,sha256=c0l2KJOvthuvSJ_g7QLb23sYDHqXhm4wi8dJYChEB8M,4564
|
12
|
-
speaker_detector-0.1.4.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
|
13
|
-
speaker_detector-0.1.4.dist-info/entry_points.txt,sha256=2B30ee2cTyeeA49x_TBURl53bDRiLWGK3NWhb9rlK3s,63
|
14
|
-
speaker_detector-0.1.4.dist-info/top_level.txt,sha256=PJ5rfvd3GAbzMbc7-Fwhtufjf6HxzzTiiHociOy7RiM,17
|
15
|
-
speaker_detector-0.1.4.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|