speaker_detector-0.1.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- speaker_detector/ECAPA_TDNN.py +633 -0
- speaker_detector/__init__.py +0 -0
- speaker_detector/__main__.py +4 -0
- speaker_detector/analyze.py +59 -0
- speaker_detector/cli.py +82 -0
- speaker_detector/combine.py +22 -0
- speaker_detector/core.py +103 -0
- speaker_detector/export_embeddings.py +41 -0
- speaker_detector/export_model.py +40 -0
- speaker_detector/generate_summary.py +110 -0
- speaker_detector-0.1.3.dist-info/METADATA +101 -0
- speaker_detector-0.1.3.dist-info/RECORD +15 -0
- speaker_detector-0.1.3.dist-info/WHEEL +5 -0
- speaker_detector-0.1.3.dist-info/entry_points.txt +2 -0
- speaker_detector-0.1.3.dist-info/top_level.txt +1 -0
speaker_detector/ECAPA_TDNN.py
ADDED
@@ -0,0 +1,633 @@
"""A popular speaker recognition and diarization model.

Authors
 * Hwidong Na 2020
"""

import torch  # noqa: F401
import torch.nn as nn
import torch.nn.functional as F

from speechbrain.dataio.dataio import length_to_mask
from speechbrain.nnet.CNN import Conv1d as _Conv1d
from speechbrain.nnet.linear import Linear
from speechbrain.nnet.normalization import BatchNorm1d as _BatchNorm1d


# Skip transpose as much as possible for efficiency
class Conv1d(_Conv1d):
    """1D convolution. Skip transpose is used to improve efficiency."""

    def __init__(self, *args, **kwargs):
        super().__init__(skip_transpose=True, *args, **kwargs)


class BatchNorm1d(_BatchNorm1d):
    """1D batch normalization. Skip transpose is used to improve efficiency."""

    def __init__(self, *args, **kwargs):
        super().__init__(skip_transpose=True, *args, **kwargs)


class TDNNBlock(nn.Module):
    """An implementation of TDNN.

    Arguments
    ---------
    in_channels : int
        Number of input channels.
    out_channels : int
        The number of output channels.
    kernel_size : int
        The kernel size of the TDNN blocks.
    dilation : int
        The dilation of the TDNN block.
    activation : torch class
        A class for constructing the activation layers.
    groups : int
        The groups size of the TDNN blocks.
    dropout : float
        Rate of channel dropout during training.

    Example
    -------
    >>> inp_tensor = torch.rand([8, 120, 64]).transpose(1, 2)
    >>> layer = TDNNBlock(64, 64, kernel_size=3, dilation=1)
    >>> out_tensor = layer(inp_tensor).transpose(1, 2)
    >>> out_tensor.shape
    torch.Size([8, 120, 64])
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        dilation,
        activation=nn.ReLU,
        groups=1,
        dropout=0.0,
    ):
        super().__init__()
        self.conv = Conv1d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            dilation=dilation,
            groups=groups,
        )
        self.activation = activation()
        self.norm = BatchNorm1d(input_size=out_channels)
        self.dropout = nn.Dropout1d(p=dropout)

    def forward(self, x):
        """Processes the input tensor x and returns an output tensor."""
        return self.dropout(self.norm(self.activation(self.conv(x))))


class Res2NetBlock(torch.nn.Module):
    """An implementation of Res2NetBlock w/ dilation.

    Arguments
    ---------
    in_channels : int
        The number of channels expected in the input.
    out_channels : int
        The number of output channels.
    scale : int
        The scale of the Res2Net block.
    kernel_size: int
        The kernel size of the Res2Net block.
    dilation : int
        The dilation of the Res2Net block.
    dropout : float
        Rate of channel dropout during training.

    Example
    -------
    >>> inp_tensor = torch.rand([8, 120, 64]).transpose(1, 2)
    >>> layer = Res2NetBlock(64, 64, scale=4, dilation=3)
    >>> out_tensor = layer(inp_tensor).transpose(1, 2)
    >>> out_tensor.shape
    torch.Size([8, 120, 64])
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        scale=8,
        kernel_size=3,
        dilation=1,
        dropout=0.0,
    ):
        super().__init__()
        assert in_channels % scale == 0
        assert out_channels % scale == 0

        in_channel = in_channels // scale
        hidden_channel = out_channels // scale

        self.blocks = nn.ModuleList(
            [
                TDNNBlock(
                    in_channel,
                    hidden_channel,
                    kernel_size=kernel_size,
                    dilation=dilation,
                    dropout=dropout,
                )
                for i in range(scale - 1)
            ]
        )
        self.scale = scale

    def forward(self, x):
        """Processes the input tensor x and returns an output tensor."""
        y = []
        for i, x_i in enumerate(torch.chunk(x, self.scale, dim=1)):
            if i == 0:
                y_i = x_i
            elif i == 1:
                y_i = self.blocks[i - 1](x_i)
            else:
                y_i = self.blocks[i - 1](x_i + y_i)
            y.append(y_i)
        y = torch.cat(y, dim=1)
        return y


class SEBlock(nn.Module):
    """An implementation of squeeze-and-excitation block.

    Arguments
    ---------
    in_channels : int
        The number of input channels.
    se_channels : int
        The number of output channels after squeeze.
    out_channels : int
        The number of output channels.

    Example
    -------
    >>> inp_tensor = torch.rand([8, 120, 64]).transpose(1, 2)
    >>> se_layer = SEBlock(64, 16, 64)
    >>> lengths = torch.rand((8,))
    >>> out_tensor = se_layer(inp_tensor, lengths).transpose(1, 2)
    >>> out_tensor.shape
    torch.Size([8, 120, 64])
    """

    def __init__(self, in_channels, se_channels, out_channels):
        super().__init__()

        self.conv1 = Conv1d(
            in_channels=in_channels, out_channels=se_channels, kernel_size=1
        )
        self.relu = torch.nn.ReLU(inplace=True)
        self.conv2 = Conv1d(
            in_channels=se_channels, out_channels=out_channels, kernel_size=1
        )
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x, lengths=None):
        """Processes the input tensor x and returns an output tensor."""
        L = x.shape[-1]
        if lengths is not None:
            mask = length_to_mask(lengths * L, max_len=L, device=x.device)
            mask = mask.unsqueeze(1)
            total = mask.sum(dim=2, keepdim=True)
            s = (x * mask).sum(dim=2, keepdim=True) / total
        else:
            s = x.mean(dim=2, keepdim=True)

        s = self.relu(self.conv1(s))
        s = self.sigmoid(self.conv2(s))

        return s * x


class AttentiveStatisticsPooling(nn.Module):
    """This class implements an attentive statistic pooling layer for each channel.
    It returns the concatenated mean and std of the input tensor.

    Arguments
    ---------
    channels: int
        The number of input channels.
    attention_channels: int
        The number of attention channels.
    global_context: bool
        Whether to use global context.

    Example
    -------
    >>> inp_tensor = torch.rand([8, 120, 64]).transpose(1, 2)
    >>> asp_layer = AttentiveStatisticsPooling(64)
    >>> lengths = torch.rand((8,))
    >>> out_tensor = asp_layer(inp_tensor, lengths).transpose(1, 2)
    >>> out_tensor.shape
    torch.Size([8, 1, 128])
    """

    def __init__(self, channels, attention_channels=128, global_context=True):
        super().__init__()

        self.eps = 1e-12
        self.global_context = global_context
        if global_context:
            self.tdnn = TDNNBlock(channels * 3, attention_channels, 1, 1)
        else:
            self.tdnn = TDNNBlock(channels, attention_channels, 1, 1)
        self.tanh = nn.Tanh()
        self.conv = Conv1d(
            in_channels=attention_channels, out_channels=channels, kernel_size=1
        )

    def forward(self, x, lengths=None):
        """Calculates mean and std for a batch (input tensor).

        Arguments
        ---------
        x : torch.Tensor
            Tensor of shape [N, C, L].
        lengths : torch.Tensor
            The corresponding relative lengths of the inputs.

        Returns
        -------
        pooled_stats : torch.Tensor
            mean and std of batch
        """
        L = x.shape[-1]

        def _compute_statistics(x, m, dim=2, eps=self.eps):
            mean = (m * x).sum(dim)
            std = torch.sqrt(
                (m * (x - mean.unsqueeze(dim)).pow(2)).sum(dim).clamp(eps)
            )
            return mean, std

        if lengths is None:
            lengths = torch.ones(x.shape[0], device=x.device)

        # Make binary mask of shape [N, 1, L]
        mask = length_to_mask(lengths * L, max_len=L, device=x.device)
        mask = mask.unsqueeze(1)

        # Expand the temporal context of the pooling layer by allowing the
        # self-attention to look at global properties of the utterance.
        if self.global_context:
            # torch.std is unstable for backward computation
            # https://github.com/pytorch/pytorch/issues/4320
            total = mask.sum(dim=2, keepdim=True).float()
            mean, std = _compute_statistics(x, mask / total)
            mean = mean.unsqueeze(2).repeat(1, 1, L)
            std = std.unsqueeze(2).repeat(1, 1, L)
            attn = torch.cat([x, mean, std], dim=1)
        else:
            attn = x

        # Apply layers
        attn = self.conv(self.tanh(self.tdnn(attn)))

        # Filter out zero-paddings
        attn = attn.masked_fill(mask == 0, float("-inf"))

        attn = F.softmax(attn, dim=2)
        mean, std = _compute_statistics(x, attn)
        # Append mean and std of the batch
        pooled_stats = torch.cat((mean, std), dim=1)
        pooled_stats = pooled_stats.unsqueeze(2)

        return pooled_stats


class SERes2NetBlock(nn.Module):
    """An implementation of building block in ECAPA-TDNN, i.e.,
    TDNN-Res2Net-TDNN-SEBlock.

    Arguments
    ---------
    in_channels: int
        Expected size of input channels.
    out_channels: int
        The number of output channels.
    res2net_scale: int
        The scale of the Res2Net block.
    se_channels : int
        The number of output channels after squeeze.
    kernel_size: int
        The kernel size of the TDNN blocks.
    dilation: int
        The dilation of the Res2Net block.
    activation : torch class
        A class for constructing the activation layers.
    groups: int
        Number of blocked connections from input channels to output channels.
    dropout: float
        Rate of channel dropout during training.

    Example
    -------
    >>> x = torch.rand(8, 120, 64).transpose(1, 2)
    >>> conv = SERes2NetBlock(64, 64, res2net_scale=4)
    >>> out = conv(x).transpose(1, 2)
    >>> out.shape
    torch.Size([8, 120, 64])
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        res2net_scale=8,
        se_channels=128,
        kernel_size=1,
        dilation=1,
        activation=torch.nn.ReLU,
        groups=1,
        dropout=0.0,
    ):
        super().__init__()
        self.out_channels = out_channels
        self.tdnn1 = TDNNBlock(
            in_channels,
            out_channels,
            kernel_size=1,
            dilation=1,
            activation=activation,
            groups=groups,
            dropout=dropout,
        )
        self.res2net_block = Res2NetBlock(
            out_channels, out_channels, res2net_scale, kernel_size, dilation
        )
        self.tdnn2 = TDNNBlock(
            out_channels,
            out_channels,
            kernel_size=1,
            dilation=1,
            activation=activation,
            groups=groups,
            dropout=dropout,
        )
        self.se_block = SEBlock(out_channels, se_channels, out_channels)

        self.shortcut = None
        if in_channels != out_channels:
            self.shortcut = Conv1d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=1,
            )

    def forward(self, x, lengths=None):
        """Processes the input tensor x and returns an output tensor."""
        residual = x
        if self.shortcut:
            residual = self.shortcut(x)

        x = self.tdnn1(x)
        x = self.res2net_block(x)
        x = self.tdnn2(x)
        x = self.se_block(x, lengths)

        return x + residual


class ECAPA_TDNN(torch.nn.Module):
    """An implementation of the speaker embedding model in a paper.
    "ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation in
    TDNN Based Speaker Verification" (https://arxiv.org/abs/2005.07143).

    Arguments
    ---------
    input_size : int
        Expected size of the input dimension.
    device : str
        Device used, e.g., "cpu" or "cuda".
    lin_neurons : int
        Number of neurons in linear layers.
    activation : torch class
        A class for constructing the activation layers.
    channels : list of ints
        Output channels for TDNN/SERes2Net layer.
    kernel_sizes : list of ints
        List of kernel sizes for each layer.
    dilations : list of ints
        List of dilations for kernels in each layer.
    attention_channels: int
        The number of attention channels.
    res2net_scale : int
        The scale of the Res2Net block.
    se_channels : int
        The number of output channels after squeeze.
    global_context: bool
        Whether to use global context.
    groups : list of ints
        List of groups for kernels in each layer.
    dropout : float
        Rate of channel dropout during training.

    Example
    -------
    >>> input_feats = torch.rand([5, 120, 80])
    >>> compute_embedding = ECAPA_TDNN(80, lin_neurons=192)
    >>> outputs = compute_embedding(input_feats)
    >>> outputs.shape
    torch.Size([5, 1, 192])
    """

    def __init__(
        self,
        input_size,
        device="cpu",
        lin_neurons=192,
        activation=torch.nn.ReLU,
        channels=[512, 512, 512, 512, 1536],
        kernel_sizes=[5, 3, 3, 3, 1],
        dilations=[1, 2, 3, 4, 1],
        attention_channels=128,
        res2net_scale=8,
        se_channels=128,
        global_context=True,
        groups=[1, 1, 1, 1, 1],
        dropout=0.0,
    ):
        super().__init__()
        assert len(channels) == len(kernel_sizes)
        assert len(channels) == len(dilations)
        self.channels = channels
        self.blocks = nn.ModuleList()

        # The initial TDNN layer
        self.blocks.append(
            TDNNBlock(
                input_size,
                channels[0],
                kernel_sizes[0],
                dilations[0],
                activation,
                groups[0],
                dropout,
            )
        )

        # SE-Res2Net layers
        for i in range(1, len(channels) - 1):
            self.blocks.append(
                SERes2NetBlock(
                    channels[i - 1],
                    channels[i],
                    res2net_scale=res2net_scale,
                    se_channels=se_channels,
                    kernel_size=kernel_sizes[i],
                    dilation=dilations[i],
                    activation=activation,
                    groups=groups[i],
                    dropout=dropout,
                )
            )

        # Multi-layer feature aggregation
        self.mfa = TDNNBlock(
            channels[-2] * (len(channels) - 2),
            channels[-1],
            kernel_sizes[-1],
            dilations[-1],
            activation,
            groups=groups[-1],
            dropout=dropout,
        )

        # Attentive Statistical Pooling
        self.asp = AttentiveStatisticsPooling(
            channels[-1],
            attention_channels=attention_channels,
            global_context=global_context,
        )
        self.asp_bn = BatchNorm1d(input_size=channels[-1] * 2)

        # Final linear transformation
        self.fc = Conv1d(
            in_channels=channels[-1] * 2,
            out_channels=lin_neurons,
            kernel_size=1,
        )

    def forward(self, x, lengths=None):
        """Returns the embedding vector.

        Arguments
        ---------
        x : torch.Tensor
            Tensor of shape (batch, time, channel).
        lengths : torch.Tensor
            Corresponding relative lengths of inputs.

        Returns
        -------
        x : torch.Tensor
            Embedding vector.
        """
        # Minimize transpose for efficiency
        x = x.transpose(1, 2)

        xl = []
        for layer in self.blocks:
            try:
                x = layer(x, lengths=lengths)
            except TypeError:
                x = layer(x)
            xl.append(x)

        # Multi-layer feature aggregation
        x = torch.cat(xl[1:], dim=1)
        x = self.mfa(x)

        # Attentive Statistical Pooling
        x = self.asp(x, lengths=lengths)
        x = self.asp_bn(x)

        # Final linear transformation
        x = self.fc(x)

        x = x.transpose(1, 2)
        return x


class Classifier(torch.nn.Module):
    """This class implements the cosine similarity on the top of features.

    Arguments
    ---------
    input_size : int
        Expected size of input dimension.
    device : str
        Device used, e.g., "cpu" or "cuda".
    lin_blocks : int
        Number of linear layers.
    lin_neurons : int
        Number of neurons in linear layers.
    out_neurons : int
        Number of classes.

    Example
    -------
    >>> classify = Classifier(input_size=2, lin_neurons=2, out_neurons=2)
    >>> outputs = torch.tensor([ [1., -1.], [-9., 1.], [0.9, 0.1], [0.1, 0.9] ])
    >>> outputs = outputs.unsqueeze(1)
    >>> cos = classify(outputs)
    >>> (cos < -1.0).long().sum()
    tensor(0)
    >>> (cos > 1.0).long().sum()
    tensor(0)
    """

    def __init__(
        self,
        input_size,
        device="cpu",
        lin_blocks=0,
        lin_neurons=192,
        out_neurons=1211,
    ):
        super().__init__()
        self.blocks = nn.ModuleList()

        for block_index in range(lin_blocks):
            self.blocks.extend(
                [
                    _BatchNorm1d(input_size=input_size),
                    Linear(input_size=input_size, n_neurons=lin_neurons),
                ]
            )
            input_size = lin_neurons

        # Final Layer
        self.weight = nn.Parameter(
            torch.FloatTensor(out_neurons, input_size, device=device)
        )
        nn.init.xavier_uniform_(self.weight)

    def forward(self, x):
        """Returns the output probabilities over speakers.

        Arguments
        ---------
        x : torch.Tensor
            Torch tensor.

        Returns
        -------
        out : torch.Tensor
            Output probabilities over speakers.
        """
        for layer in self.blocks:
            x = layer(x)

        # Need to be normalized
        x = F.linear(F.normalize(x.squeeze(1)), F.normalize(self.weight))
        return x.unsqueeze(1)
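Taken together, these classes form the usual two-stage pipeline: the `ECAPA_TDNN` encoder maps a (batch, time, features) tensor to one 192-dimensional embedding per utterance, and `Classifier` scores embeddings against speaker classes by cosine similarity. A minimal sketch, assuming the package is installed; tensor shapes are illustrative and the random features stand in for real filterbank inputs:

```python
import torch
from speaker_detector.ECAPA_TDNN import ECAPA_TDNN, Classifier

# Illustrative shapes only: a batch of 5 utterances, 120 frames, 80 mel features.
feats = torch.rand(5, 120, 80)

encoder = ECAPA_TDNN(input_size=80, lin_neurons=192)        # speaker embedding model
classifier = Classifier(input_size=192, out_neurons=1211)   # cosine scoring head

embeddings = encoder(feats)       # -> torch.Size([5, 1, 192])
scores = classifier(embeddings)   # -> torch.Size([5, 1, 1211]), cosine values in [-1, 1]
print(embeddings.shape, scores.shape)
```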
speaker_detector/__init__.py
File without changes
speaker_detector/analyze.py
ADDED
@@ -0,0 +1,59 @@
from pathlib import Path
import torchaudio
import torch
from speaker_detector.core import get_embedding, STORAGE_DIR

CHUNK_DURATION = 2.5  # seconds

def match_speaker(embedding, speaker_embeddings):
    scores = {}
    for name, emb in speaker_embeddings.items():
        score = torch.nn.functional.cosine_similarity(emb, embedding, dim=0).item()
        scores[name] = score
    if not scores:
        return "unknown", 0.0
    best = max(scores.items(), key=lambda kv: kv[1])
    return best[0], round(best[1], 3)

def analyze_meeting(wav_path):
    waveform, sample_rate = torchaudio.load(wav_path)
    duration_sec = waveform.shape[1] / sample_rate

    chunk_samples = int(CHUNK_DURATION * sample_rate)
    num_chunks = int(waveform.shape[1] / chunk_samples)

    # Load enrolled speaker embeddings
    speaker_embeddings = {}
    for spk_dir in STORAGE_DIR.iterdir():
        if not spk_dir.is_dir():
            continue
        wavs = list(spk_dir.glob("*.wav"))
        if not wavs:
            continue
        # Average multiple embeddings
        embs = [get_embedding(str(wav)) for wav in wavs]
        speaker_embeddings[spk_dir.name] = torch.stack(embs).mean(dim=0)

    results = []

    for i in range(num_chunks):
        start_sample = i * chunk_samples
        end_sample = start_sample + chunk_samples
        chunk = waveform[:, start_sample:end_sample]

        tmp_path = Path(wav_path).parent / f"tmp_chunk_{i}.wav"
        torchaudio.save(str(tmp_path), chunk, sample_rate)

        embedding = get_embedding(str(tmp_path))
        speaker, score = match_speaker(embedding, speaker_embeddings)

        results.append({
            "start": round(i * CHUNK_DURATION, 2),
            "end": round((i + 1) * CHUNK_DURATION, 2),
            "speaker": speaker,
            "score": score
        })

        tmp_path.unlink()  # clean up

    return results
speaker_detector/cli.py
ADDED
@@ -0,0 +1,82 @@
import warnings
import argparse
import os

def main():
    parser = argparse.ArgumentParser(prog="speaker-detector", description="Speaker Detector CLI")
    subparsers = parser.add_subparsers(dest="command")

    # ---- Global options ----
    parser.add_argument("--verbose", action="store_true", help="Show detailed logs and warnings")

    # ---- enroll ----
    enroll_cmd = subparsers.add_parser("enroll", help="Enroll a speaker from a .wav file")
    enroll_cmd.add_argument("speaker_id", help="Name/ID of the speaker")
    enroll_cmd.add_argument("audio_path", help="Path to .wav file")

    # ---- identify ----
    identify_cmd = subparsers.add_parser("identify", help="Identify speaker from a .wav file")
    identify_cmd.add_argument("audio_path", help="Path to .wav file")

    # ---- list-speakers ----
    subparsers.add_parser("list-speakers", help="List enrolled speakers")

    # ---- export-model ----
    model_parser = subparsers.add_parser("export-model", help="Export ECAPA model to ONNX")
    model_parser.add_argument("--pt", required=True, help="Path to embedding_model.ckpt")
    model_parser.add_argument("--out", default="speaker_embedding.onnx", help="Output ONNX file")

    # ---- export-speaker-json ----
    emb_parser = subparsers.add_parser("export-speaker-json", help="Convert enrolled .pt file to browser-friendly .json")
    emb_parser.add_argument("--pt", required=True, help="Path to enrolled_speakers.pt")
    emb_parser.add_argument("--out", default="speakers.json", help="Output .json file for browser")

    # ---- combine ----
    comb_parser = subparsers.add_parser("combine", help="Combine individual .pt files into enrolled_speakers.pt")
    comb_parser.add_argument("--folder", required=True, help="Folder with individual .pt files")
    comb_parser.add_argument("--out", required=True, help="Output .pt file path")

    # ---- Parse arguments ----
    args = parser.parse_args()

    # ---- Suppress warnings unless --verbose ----
    if not args.verbose:
        warnings.simplefilter("ignore", category=DeprecationWarning)
        warnings.simplefilter("ignore", category=UserWarning)
        os.environ["PYTHONWARNINGS"] = "ignore"

    # ---- Import modules after filtering warnings ----
    from .core import enroll_speaker, identify_speaker, list_speakers
    from .export_model import export_model_to_onnx
    from .export_embeddings import export_embeddings_to_json
    from .combine import combine_embeddings_from_folder

    # ---- Command Dispatch ----
    if args.command == "enroll":
        enroll_speaker(args.audio_path, args.speaker_id)
        print(f"✅ Enrolled: {args.speaker_id}")

    elif args.command == "identify":
        result = identify_speaker(args.audio_path)
        print(f"🕵️ Identified: {result['speaker']} (score: {result['score']})")

    elif args.command == "list-speakers":
        speakers = list_speakers()
        if speakers:
            print("Enrolled Speakers:")
            for s in speakers:
                print(f" • {s}")
        else:
            print("⚠️ No speakers enrolled yet.")

    elif args.command == "export-model":
        export_model_to_onnx(args.pt, args.out)

    elif args.command == "export-speaker-json":
        export_embeddings_to_json(args.pt, args.out)

    elif args.command == "combine":
        combine_embeddings_from_folder(args.folder, args.out)

    else:
        parser.print_help()
speaker_detector/combine.py
ADDED
@@ -0,0 +1,22 @@
import torch
import os

def combine_embeddings_from_folder(folder_path, output_path):
    speaker_data = {}

    for fname in os.listdir(folder_path):
        if fname.endswith(".pt"):
            label = os.path.splitext(fname)[0]
            fpath = os.path.join(folder_path, fname)
            tensor = torch.load(fpath, map_location="cpu")
            if not isinstance(tensor, torch.Tensor):
                print(f"❌ Skipping {fname}: not a valid tensor")
                continue
            speaker_data[label] = tensor

    if not speaker_data:
        print("⚠️ No valid .pt files found.")
        return

    torch.save(speaker_data, output_path)
    print(f"✅ Combined {len(speaker_data)} speakers into {output_path}")
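A hedged sketch of how `combine_embeddings_from_folder` is meant to be driven from Python. The paths are placeholders, assuming the folder holds one `<speaker>.pt` tensor per enrolled speaker as produced by the enrollment step:

```python
import torch
from speaker_detector.combine import combine_embeddings_from_folder

# Placeholder paths: a folder of per-speaker embedding tensors, one .pt file each.
combine_embeddings_from_folder("data/embeddings/", "data/enrolled_speakers.pt")

# The combined file maps speaker label -> embedding tensor.
enrolled = torch.load("data/enrolled_speakers.pt", map_location="cpu")
for label, emb in enrolled.items():
    print(label, tuple(emb.shape))
```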
speaker_detector/core.py
ADDED
@@ -0,0 +1,103 @@
from speechbrain.pretrained import SpeakerRecognition
from pathlib import Path
import torchaudio
import torch

# Storage directories
BASE_DIR = Path(__file__).resolve().parent.parent / "storage"
SPEAKER_AUDIO_DIR = BASE_DIR / "speakers"
EMBEDDINGS_DIR = BASE_DIR / "embeddings"

# Ensure they exist
SPEAKER_AUDIO_DIR.mkdir(parents=True, exist_ok=True)
EMBEDDINGS_DIR.mkdir(parents=True, exist_ok=True)

# Load model once
MODEL = SpeakerRecognition.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb", savedir="model"
)

def get_embedding(audio_path):
    try:
        signal, fs = torchaudio.load(audio_path)
        if signal.numel() == 0:
            raise ValueError(f"{audio_path} is empty.")
        return MODEL.encode_batch(signal).squeeze().detach().cpu()
    except Exception as e:
        raise RuntimeError(f"Failed to embed {audio_path}: {e}")

def enroll_speaker(audio_path, speaker_id):
    speaker_dir = SPEAKER_AUDIO_DIR / speaker_id
    speaker_dir.mkdir(parents=True, exist_ok=True)

    # Save audio sample
    existing = list(speaker_dir.glob("*.wav"))
    new_index = len(existing) + 1
    dest_path = speaker_dir / f"{new_index}.wav"

    waveform, sample_rate = torchaudio.load(audio_path)
    if waveform.numel() == 0:
        raise ValueError("Cannot enroll empty audio file.")

    torchaudio.save(str(dest_path), waveform, sample_rate)
    print(f"Saved {speaker_id}'s recording #{new_index} → {dest_path}")

    # Save embedding
    emb = get_embedding(audio_path)
    emb_path = EMBEDDINGS_DIR / f"{speaker_id}.pt"
    torch.save(emb, emb_path)
    print(f"🧠 Saved embedding for {speaker_id} → {emb_path}")

def identify_speaker(audio_path, threshold=0.25):
    try:
        test_emb = get_embedding(audio_path)
    except Exception as e:
        return {"speaker": "error", "score": 0, "error": str(e)}

    scores = {}
    for emb_path in EMBEDDINGS_DIR.glob("*.pt"):
        speaker_name = emb_path.stem
        try:
            enrolled_emb = torch.load(emb_path)
            score = torch.nn.functional.cosine_similarity(enrolled_emb, test_emb, dim=0).item()
            scores[speaker_name] = score
        except Exception as e:
            continue

    if not scores:
        return {"speaker": "unknown", "score": 0}

    sorted_scores = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
    best, second = sorted_scores[0], sorted_scores[1] if len(sorted_scores) > 1 else (None, None)
    auto_thresh = best[1] - (second[1] if second else 0) > 0.1
    is_match = auto_thresh or best[1] >= threshold

    result = {
        "speaker": best[0] if is_match else "unknown",
        "score": round(best[1], 3),
        "all_scores": {k: round(v, 3) for k, v in sorted_scores}
    }
    return result

def list_speakers():
    speakers = []
    for dir in SPEAKER_AUDIO_DIR.iterdir():
        if dir.is_dir():
            count = len(list(dir.glob("*.wav")))
            speakers.append(f"{dir.name} ({count} recording{'s' if count != 1 else ''})")
    print(f"Found {len(speakers)} enrolled speaker(s): {speakers}")
    return [s.split()[0] for s in speakers]

def rebuild_embedding(speaker_id):
    speaker_dir = SPEAKER_AUDIO_DIR / speaker_id
    wavs = list(speaker_dir.glob("*.wav"))

    if not wavs:
        raise RuntimeError(f"No recordings found for {speaker_id}.")

    embeddings = [get_embedding(w) for w in wavs]
    avg_emb = torch.stack(embeddings).mean(dim=0)

    emb_path = EMBEDDINGS_DIR / f"{speaker_id}.pt"
    torch.save(avg_emb, emb_path)
    print(f"Rebuilt embedding for {speaker_id}")
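The same enroll/identify flow exposed by the CLI can be driven directly from these functions. A sketch, assuming placeholder .wav recordings; note that importing `speaker_detector.core` downloads the `speechbrain/spkrec-ecapa-voxceleb` model on first use and creates the `storage/` directories next to the package:

```python
from speaker_detector.core import enroll_speaker, identify_speaker, list_speakers

# Placeholder .wav paths for illustration.
enroll_speaker("samples/alice_1.wav", "alice")
enroll_speaker("samples/bob_1.wav", "bob")

print(list_speakers())   # e.g. ["alice", "bob"]

result = identify_speaker("samples/meeting_clip.wav", threshold=0.25)
print(result["speaker"], result["score"], result["all_scores"])
```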
speaker_detector/export_embeddings.py
ADDED
@@ -0,0 +1,41 @@
import torch
import json

def export_embeddings_to_json(pt_path, json_path):
    """
    Converts a .pt file containing speaker embeddings into a
    JSON file for use in the browser frontend.

    Expected input format:
    {
        "lara": tensor([...]),
        "guest": tensor([...]),
        ...
    }

    Output format:
    [
        { "label": "lara", "vector": [...] },
        { "label": "guest", "vector": [...] },
        ...
    ]
    """
    data = torch.load(pt_path, map_location="cpu")

    if not isinstance(data, dict):
        raise ValueError("Expected a dict of {label: tensor} in the .pt file")

    converted = []
    for label, tensor in data.items():
        if not isinstance(tensor, torch.Tensor):
            print(f"⚠️ Skipping {label}: not a tensor")
            continue
        converted.append({
            "label": label,
            "vector": tensor.tolist()
        })

    with open(json_path, "w") as f:
        json.dump(converted, f, indent=2)

    print(f"✅ Exported {len(converted)} speaker embeddings to {json_path}")
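For context, a sketch of what a consumer of the exported JSON does with it. The browser frontend itself is out of scope here, so the equivalent cosine-similarity lookup is shown in Python; `speakers.json` and the random query vector are placeholders:

```python
import json
import torch
import torch.nn.functional as F

with open("speakers.json") as f:
    speakers = json.load(f)   # [{"label": ..., "vector": [...]}, ...]

query = torch.randn(192)      # stand-in for the embedding of the audio to identify

# Pick the enrolled speaker whose stored vector is closest to the query.
best = max(
    speakers,
    key=lambda s: F.cosine_similarity(torch.tensor(s["vector"]), query, dim=0).item(),
)
print("closest enrolled speaker:", best["label"])
```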
speaker_detector/export_model.py
ADDED
@@ -0,0 +1,40 @@
import torch
from speechbrain.lobes.models.ECAPA_TDNN import ECAPA_TDNN
from collections import OrderedDict

def export_model_to_onnx(ckpt_path, out_path):
    model = ECAPA_TDNN(
        input_size=80,
        channels=[1024, 1024, 1024, 1024, 3072],
        kernel_sizes=[5, 3, 3, 3, 1],
        dilations=[1, 2, 3, 4, 1],
        attention_channels=128,
        lin_neurons=192,
    )

    state_dict = torch.load(ckpt_path, map_location="cpu")

    if "model" in state_dict:
        state_dict = state_dict["model"]

    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        if k.startswith("embedding_model."):
            k = k[len("embedding_model."):]
        new_state_dict[k] = v

    model.load_state_dict(new_state_dict)
    model.eval()

    dummy_input = torch.randn(1, 200, 80)
    torch.onnx.export(
        model,
        dummy_input,
        out_path,
        input_names=["features"],
        output_names=["embedding"],
        dynamic_axes={"features": {0: "batch", 1: "time"}},
        opset_version=12,
    )

    print(f"✅ Exported ECAPA-TDNN to {out_path}")
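A sketch of running the exported graph with ONNX Runtime (listed in the README requirements). The checkpoint and output paths are placeholders, and the input must be 80-dimensional filterbank features shaped (batch, time, 80), matching the `dummy_input` used during export:

```python
import numpy as np
import onnxruntime as ort
from speaker_detector.export_model import export_model_to_onnx

# Placeholder paths: a SpeechBrain ECAPA checkpoint and the ONNX file to write.
export_model_to_onnx("models/embedding_model.ckpt", "ecapa_model.onnx")

sess = ort.InferenceSession("ecapa_model.onnx")
feats = np.random.randn(1, 200, 80).astype(np.float32)   # stand-in fbank features
(embedding,) = sess.run(["embedding"], {"features": feats})
print(embedding.shape)   # expected (1, 1, 192)
```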
speaker_detector/generate_summary.py
ADDED
@@ -0,0 +1,110 @@
import os
import torch
import torchaudio
import requests
from pathlib import Path
from pydub import AudioSegment
from dotenv import load_dotenv
from speaker_detector.core import get_embedding, STORAGE_DIR

load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

CHUNK_DURATION = 8  # seconds
SCORE_THRESHOLD = 0.6
MIN_VALID_DURATION = 1.0  # seconds
WHISPER_API_URL = "https://api.openai.com/v1/audio/transcriptions"

def match_speaker(embedding, speaker_embeddings):
    scores = {
        name: torch.nn.functional.cosine_similarity(emb, embedding, dim=0).item()
        for name, emb in speaker_embeddings.items()
    }
    if not scores:
        return "unknown", 0.0
    best = max(scores.items(), key=lambda kv: kv[1])
    return best[0], round(best[1], 3)

def transcribe_full_audio(wav_path: Path) -> str:
    try:
        with open(wav_path, "rb") as f:
            response = requests.post(
                WHISPER_API_URL,
                headers={"Authorization": f"Bearer {OPENAI_API_KEY}"},
                files={"file": (wav_path.name, f, "audio/wav")},
                data={
                    "model": "whisper-1",
                    "response_format": "json",
                    "temperature": 0.2,
                    "language": "en",
                    "prompt": "This is a meeting transcription.",
                },
                timeout=120
            )
        response.raise_for_status()
        return response.json()["text"].strip()
    except Exception as e:
        print(f"❌ Whisper failed: {e}")
        return ""

def is_valid_audio(path):
    try:
        waveform, sample_rate = torchaudio.load(str(path))
        duration_sec = waveform.shape[1] / sample_rate
        return duration_sec >= MIN_VALID_DURATION
    except Exception:
        return False

def generate_summary(meeting_dir: Path):
    meeting_dir = meeting_dir.resolve()
    chunk_files = sorted([
        f for f in meeting_dir.iterdir()
        if f.name.startswith("chunk_") and f.suffix == ".wav" and is_valid_audio(f)
    ])

    if not chunk_files:
        return {"warning": "No valid .wav chunks found in meeting folder.", "segments": []}

    # Merge all chunks into one file
    combined = AudioSegment.empty()
    for f in chunk_files:
        combined += AudioSegment.from_wav(f)
    merged_path = meeting_dir / "combined.wav"
    combined.export(merged_path, format="wav")

    # Get full transcript
    full_text = transcribe_full_audio(merged_path)
    print("🧠 Full transcript:", full_text)

    # Load speaker embeddings
    speaker_embeddings = {}
    for spk_dir in STORAGE_DIR.iterdir():
        if spk_dir.is_dir():
            wavs = [w for w in spk_dir.glob("*.wav") if is_valid_audio(w)]
            if wavs:
                embs = [get_embedding(str(w)) for w in wavs]
                speaker_embeddings[spk_dir.name] = torch.stack(embs).mean(dim=0)

    segments = []
    total = len(chunk_files)

    for idx, chunk in enumerate(chunk_files):
        try:
            emb = get_embedding(chunk)
            speaker, score = match_speaker(emb, speaker_embeddings)
            segment_text = f"[chunk {idx+1}]"
            segments.append({
                "timestamp": idx * CHUNK_DURATION,
                "speaker": speaker if score >= SCORE_THRESHOLD else "unknown",
                "score": round(score, 3),
                "text": segment_text,
                "progress": round((idx + 1) / total * 100)
            })
        except Exception as e:
            print(f"❌ Failed on {chunk.name}: {e}")

    return {
        "transcript": full_text,
        "segments": segments if segments else [],
        "warning": None if segments else "No speaker segments found."
    }
speaker_detector-0.1.3.dist-info/METADATA
ADDED
@@ -0,0 +1,101 @@
Metadata-Version: 2.4
Name: speaker-detector
Version: 0.1.3
Summary: A CLI tool for speaker enrollment and identification using SpeechBrain.
Author-email: Lara Whybrow <lara.whybrow@gmail.com>
License: MIT
Project-URL: Homepage, https://github.com/P0llen/speaker-detector
Project-URL: Repository, https://github.com/P0llen/speaker-detector
Project-URL: Issues, https://github.com/P0llen/speaker-detector/issues
Project-URL: Documentation, https://github.com/P0llen/speaker-detector#readme
Keywords: speaker-recognition,speechbrain,voice,cli,ai
Classifier: Development Status :: 3 - Alpha
Classifier: Intended Audience :: Developers
Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Requires-Python: >=3.8
Description-Content-Type: text/markdown
Requires-Dist: torch
Requires-Dist: torchaudio
Requires-Dist: speechbrain
Requires-Dist: onnx

# speaker-detector

A lightweight CLI tool for speaker enrollment and voice identification, powered by [SpeechBrain](https://speechbrain.readthedocs.io/).

## Features

- ✅ Enroll speakers from .wav audio
- 🕵️ Identify speakers from audio samples
- 🧠 ECAPA-TDNN embedding-based matching
- Simple, fast command-line interface
- Clean file storage in `~/.speaker-detector/`
- Optional `--verbose` mode for debugging

## 📦 Installation

Install from [TestPyPI](https://test.pypi.org/):

```bash
pip install --index-url https://test.pypi.org/simple/ speaker-detector
```

## Usage

## Enroll a speaker:

```bash
speaker-detector record --enroll Lara
```

## 🕵️ Identify a speaker:

```bash
speaker-detector record --test
```

## List enrolled speakers:

```bash
speaker-detector list
```

## Project Structure

~/.speaker-detector/enrollments/ Saved .pt voice embeddings
~/.speaker-detector/recordings/ CLI-recorded .wav audio files

🧹 Clean vs Verbose Mode
By default, warnings from speechbrain, torch, etc. are hidden for a clean CLI experience.
To enable full logs & deprecation warnings:

speaker-detector --verbose identify samples/test_sample.wav

Requirements
Python 3.8+
torch
speechbrain
numpy
soundfile
onnxruntime

| Step | Command | When / Purpose | Output |
| --------------------------------- | ------------------------------------------------------------------------------------------------------------------- | ----------------------------- | ---------------------------------------- |
| **1. Export ECAPA Model to ONNX** | `speaker-detector export-model --pt models/embedding_model.ckpt --out ecapa_model.onnx` | Run once unless model changes | `ecapa_model.onnx` |
| **2. Enroll Speaker** | `speaker-detector enroll <speaker_id> <audio_path>`<br>Example:<br>`speaker-detector enroll Lara samples/lara1.wav` | Run per new speaker | Individual `.pt` files (e.g., `Lara.pt`) |
| **3. Combine Embeddings** | `speaker-detector combine --folder data/embeddings/ --out data/enrolled_speakers.pt` | After enrolling speakers | `enrolled_speakers.pt` |
| **4. Export Speakers to JSON** | `speaker-detector export-speaker-json --pt data/enrolled_speakers.pt --out public/speakers.json` | For frontend use | `speakers.json` |
| **5. Identify Speaker** | `speaker-detector identify samples/test_sample.wav` | Identify speaker from audio | Console output: name + score |
| **6. List Enrolled Speakers** | `speaker-detector list-speakers` | Show all enrolled speakers | Console output: list of IDs |
| **Verbose Mode (optional)** | Add `--verbose` to any command:<br>`speaker-detector --verbose identify samples/test_sample.wav` | Show warnings, detailed logs | Developer debug info |

NB: When pushing to Github, do not include any .identifier files.
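The workflow table in the README maps onto the package's Python modules one-to-one. A condensed sketch of steps 2–4 driven from Python rather than the CLI; the sample and output paths are placeholders and must exist on disk:

```python
from speaker_detector.core import enroll_speaker, EMBEDDINGS_DIR
from speaker_detector.combine import combine_embeddings_from_folder
from speaker_detector.export_embeddings import export_embeddings_to_json

# Step 2: enroll a speaker from a placeholder .wav file.
enroll_speaker("samples/lara1.wav", "Lara")

# Step 3: combine the per-speaker .pt files written by enrollment.
combine_embeddings_from_folder(str(EMBEDDINGS_DIR), "data/enrolled_speakers.pt")

# Step 4: export a browser-friendly speakers.json for the frontend.
export_embeddings_to_json("data/enrolled_speakers.pt", "public/speakers.json")
```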
speaker_detector-0.1.3.dist-info/RECORD
ADDED
@@ -0,0 +1,15 @@
speaker_detector/ECAPA_TDNN.py,sha256=KB5T-ye4c9ZWgTgn_SMH-T_-qYSEHQJJtf3xHjsfNPk,19024
speaker_detector/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
speaker_detector/__main__.py,sha256=EClCwCzb6h6YBpt0hrnG4h0mlNhNePyg_xBNNSVm1os,65
speaker_detector/analyze.py,sha256=sA8qyzczdHUbJw2_1JIbXn1WpiKC5dHLPRtPPoppJzY,1943
speaker_detector/cli.py,sha256=TKci4o4Fru-3NqUkPDRQRvtis2niNEAh9sQWwE5t6Us,3521
speaker_detector/combine.py,sha256=yCiqG6VMojz0CxSTPqjx0RrUban8oFIcKlA1zFMzaU4,761
speaker_detector/core.py,sha256=lQNOcmZs2IJOqrNKlk1BeVQX6tzc7BSpeP5Gordff-E,3586
speaker_detector/export_embeddings.py,sha256=OxNXadzEiMEJgpmCG6HHFncUX7DumFvTOys1R6UMUnw,1151
speaker_detector/export_model.py,sha256=qVVT2wSCnsPA8pSAEEyIMkY7Kc8uAgepc03MxBMT3xU,1146
speaker_detector/generate_summary.py,sha256=oTWEf2bxTCRIUl8L17-J64FyhRbCPnDjihFluEnBWc8,3726
speaker_detector-0.1.3.dist-info/METADATA,sha256=ilNHpXunmGzuyDUxadMHolzhHX2aNjsOBWZe8mHDgQc,4564
speaker_detector-0.1.3.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
speaker_detector-0.1.3.dist-info/entry_points.txt,sha256=2B30ee2cTyeeA49x_TBURl53bDRiLWGK3NWhb9rlK3s,63
speaker_detector-0.1.3.dist-info/top_level.txt,sha256=PJ5rfvd3GAbzMbc7-Fwhtufjf6HxzzTiiHociOy7RiM,17
speaker_detector-0.1.3.dist-info/RECORD,,
speaker_detector-0.1.3.dist-info/top_level.txt
ADDED
@@ -0,0 +1 @@
speaker_detector