agentgui 1.0.177 → 1.0.178
This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/lib/speech.js +14 -7
- package/package.json +1 -1
package/lib/speech.js
CHANGED
|
@@ -218,24 +218,29 @@ async function loadVoiceEmbedding(voiceId) {
|
|
|
218
218
|
}
|
|
219
219
|
}
|
|
220
220
|
|
|
221
|
+
let speakerFeatureExtractor = null;
|
|
222
|
+
|
|
221
223
|
async function getSpeakerEmbeddingPipeline() {
|
|
222
224
|
if (speakerEmbeddingPipeline) return speakerEmbeddingPipeline;
|
|
223
225
|
if (speakerEmbeddingLoading) {
|
|
224
226
|
while (speakerEmbeddingLoading) await new Promise(r => setTimeout(r, 100));
|
|
225
|
-
if (!speakerEmbeddingPipeline) throw new Error('Speaker embedding
|
|
227
|
+
if (!speakerEmbeddingPipeline) throw new Error('Speaker embedding model failed to load');
|
|
226
228
|
return speakerEmbeddingPipeline;
|
|
227
229
|
}
|
|
228
230
|
speakerEmbeddingLoading = true;
|
|
229
231
|
try {
|
|
230
|
-
const {
|
|
232
|
+
const { AutoModelForXVector, AutoFeatureExtractor, env } = await loadTransformers();
|
|
231
233
|
env.allowRemoteModels = true;
|
|
232
|
-
|
|
234
|
+
const modelId = 'Xenova/wavlm-base-plus-sv';
|
|
235
|
+
speakerEmbeddingPipeline = await AutoModelForXVector.from_pretrained(modelId, {
|
|
233
236
|
device: 'cpu',
|
|
234
237
|
dtype: 'fp32',
|
|
235
238
|
});
|
|
239
|
+
speakerFeatureExtractor = await AutoFeatureExtractor.from_pretrained(modelId);
|
|
236
240
|
return speakerEmbeddingPipeline;
|
|
237
241
|
} catch (err) {
|
|
238
242
|
speakerEmbeddingPipeline = null;
|
|
243
|
+
speakerFeatureExtractor = null;
|
|
239
244
|
throw new Error('Speaker embedding model load failed: ' + err.message);
|
|
240
245
|
} finally {
|
|
241
246
|
speakerEmbeddingLoading = false;
|
|
@@ -284,11 +289,13 @@ async function generateEmbeddingFromCustomVoice(voiceId) {
|
|
|
284
289
|
if (audio.length < SAMPLE_RATE_STT * 0.5) {
|
|
285
290
|
throw new Error('Audio too short for embedding extraction');
|
|
286
291
|
}
|
|
287
|
-
const
|
|
288
|
-
const
|
|
292
|
+
const model = await getSpeakerEmbeddingPipeline();
|
|
293
|
+
const inputs = await speakerFeatureExtractor(audio, { sampling_rate: SAMPLE_RATE_STT });
|
|
294
|
+
const output = await model(inputs);
|
|
295
|
+
const embData = output.embeddings.data;
|
|
289
296
|
const embedding = new Float32Array(512);
|
|
290
|
-
for (let i = 0; i < Math.min(512,
|
|
291
|
-
embedding[i] =
|
|
297
|
+
for (let i = 0; i < Math.min(512, embData.length); i++) {
|
|
298
|
+
embedding[i] = embData[i];
|
|
292
299
|
}
|
|
293
300
|
if (!fs.existsSync(DATA_DIR)) fs.mkdirSync(DATA_DIR, { recursive: true });
|
|
294
301
|
const binPath = path.join(DATA_DIR, `speaker_${voiceId}.bin`);
|