@absolutejs/voice 0.0.14 → 0.0.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/angular/index.js +1 -1
- package/dist/client/index.js +1 -1
- package/dist/index.js +75 -9
- package/dist/react/index.js +1 -1
- package/dist/svelte/index.js +1 -1
- package/dist/turnDetection.d.ts +3 -1
- package/dist/types.d.ts +2 -0
- package/dist/vue/index.js +1 -1
- package/package.json +1 -1
package/dist/angular/index.js
CHANGED
package/dist/client/index.js
CHANGED
package/dist/index.js
CHANGED
|
@@ -236,12 +236,59 @@ var toVoiceSessionSummary = (session) => ({
|
|
|
236
236
|
|
|
237
237
|
// src/turnDetection.ts
|
|
238
238
|
var DEFAULT_SILENCE_MS = 700;
|
|
239
|
+
var DEFAULT_SPEECH_THRESHOLD = 0.015;
|
|
240
|
+
// Exposes an audio chunk as raw bytes without copying: a bare ArrayBuffer is
// wrapped whole, while any other input is assumed to be a buffer view and is
// re-viewed over exactly the byte range it already covers.
var toUint8Array = (audio) => audio instanceof ArrayBuffer ? new Uint8Array(audio) : new Uint8Array(audio.buffer, audio.byteOffset, audio.byteLength);
|
|
246
|
+
/**
 * Computes the root-mean-square level of an audio chunk, in the range 0..1.
 *
 * The bytes are interpreted as signed 16-bit samples (presumably PCM16 as
 * sent by the client; confirm against the capture pipeline) and each sample
 * is scaled by 1/32768 before squaring. A trailing odd byte is ignored, and a
 * chunk shorter than one sample yields 0.
 *
 * Samples are read through a DataView rather than an Int16Array because an
 * Int16Array requires a 2-byte aligned start offset and throws RangeError for
 * views that begin at an odd byteOffset (for example a slice of a pooled
 * buffer). The DataView also fixes the byte order to little-endian instead of
 * depending on the host.
 *
 * @param audio An ArrayBuffer, or a buffer view exposing `buffer`,
 *   `byteOffset` and `byteLength`.
 * @returns The RMS level of the chunk, or 0 when it holds no complete sample.
 */
var measureAudioLevel = (audio) => {
  const view = audio instanceof ArrayBuffer ? new DataView(audio) : new DataView(audio.buffer, audio.byteOffset, audio.byteLength);
  const sampleCount = Math.floor(view.byteLength / 2);
  if (sampleCount === 0) {
    return 0;
  }
  let sumSquares = 0;
  for (let index = 0; index < sampleCount; index += 1) {
    // `true` selects little-endian decoding regardless of host byte order.
    const normalized = view.getInt16(index * 2, true) / 32768;
    sumSquares += normalized * normalized;
  }
  return Math.sqrt(sumSquares / sampleCount);
};
|
|
262
|
+
// Trims the value and collapses every run of whitespace to a single space.
var normalizeText = (value) => value.trim().replace(/\s+/g, " ");
/**
 * Joins the texts of a turn's transcripts into one string while collapsing
 * overlapping results.
 *
 * Each transcript text is normalized and compared with the most recently kept
 * text only:
 *   - empty texts are skipped;
 *   - a text already contained in the previous one is dropped;
 *   - a text that contains the previous one replaces it (cumulative results);
 *   - anything else is appended.
 *
 * Containment is checked on whole words: a match that would cut through a
 * word on either side does not count, so a short utterance such as "no" is
 * not swallowed just because the previous text contains "know".
 *
 * @param transcripts Items exposing a string `text` property.
 * @returns The kept texts joined by single spaces ("" when nothing is kept).
 */
var mergeTranscriptTexts = (transcripts) => {
  const wordCharacter = /[\p{L}\p{N}]/u;
  // `undefined` means "past the edge of the string", which is a valid boundary.
  const isWordCharacter = (character) => character !== undefined && wordCharacter.test(character);
  // True when `needle` occurs in `haystack` without splitting a word at either edge.
  const containsPhrase = (haystack, needle) => {
    const needleStartsWithWord = isWordCharacter(needle[0]);
    const needleEndsWithWord = isWordCharacter(needle[needle.length - 1]);
    let index = haystack.indexOf(needle);
    while (index !== -1) {
      const cutsWordBefore = needleStartsWithWord && isWordCharacter(haystack[index - 1]);
      const cutsWordAfter = needleEndsWithWord && isWordCharacter(haystack[index + needle.length]);
      if (!cutsWordBefore && !cutsWordAfter) {
        return true;
      }
      index = haystack.indexOf(needle, index + 1);
    }
    return false;
  };
  const merged = [];
  for (const transcript of transcripts) {
    const nextText = normalizeText(transcript.text);
    if (!nextText) {
      continue;
    }
    const previous = merged[merged.length - 1];
    if (!previous) {
      merged.push(nextText);
      continue;
    }
    if (nextText === previous || containsPhrase(previous, nextText)) {
      // Repeat of (part of) what was already kept.
      continue;
    }
    if (containsPhrase(nextText, previous)) {
      // Cumulative result: the newer text supersedes the older one.
      merged[merged.length - 1] = nextText;
      continue;
    }
    merged.push(nextText);
  }
  return merged.join(" ").trim();
};
|
|
239
286
|
var buildTurnText = (transcripts, partialText) => {
|
|
240
|
-
const finalText = transcripts
|
|
287
|
+
const finalText = mergeTranscriptTexts(transcripts);
|
|
241
288
|
if (finalText) {
|
|
242
289
|
return finalText;
|
|
243
290
|
}
|
|
244
|
-
return partialText
|
|
291
|
+
return normalizeText(partialText);
|
|
245
292
|
};
|
|
246
293
|
|
|
247
294
|
// src/session.ts
|
|
@@ -269,11 +316,13 @@ var createVoiceSession = (options) => {
|
|
|
269
316
|
timeout: options.reconnect.timeout ?? DEFAULT_RECONNECT_TIMEOUT
|
|
270
317
|
};
|
|
271
318
|
const turnDetection = {
|
|
272
|
-
silenceMs: options.turnDetection.silenceMs ?? DEFAULT_SILENCE_MS
|
|
319
|
+
silenceMs: options.turnDetection.silenceMs ?? DEFAULT_SILENCE_MS,
|
|
320
|
+
speechThreshold: options.turnDetection.speechThreshold ?? DEFAULT_SPEECH_THRESHOLD
|
|
273
321
|
};
|
|
274
322
|
let socket = options.socket;
|
|
275
323
|
let sttSession = null;
|
|
276
324
|
let silenceTimer = null;
|
|
325
|
+
let speechDetected = false;
|
|
277
326
|
const clearSilenceTimer = () => {
|
|
278
327
|
if (!silenceTimer) {
|
|
279
328
|
return;
|
|
@@ -315,7 +364,9 @@ var createVoiceSession = (options) => {
|
|
|
315
364
|
}
|
|
316
365
|
};
|
|
317
366
|
const scheduleSilenceCommit = () => {
|
|
318
|
-
|
|
367
|
+
if (silenceTimer) {
|
|
368
|
+
return;
|
|
369
|
+
}
|
|
319
370
|
silenceTimer = setTimeout(() => {
|
|
320
371
|
api.commitTurn("silence");
|
|
321
372
|
}, turnDetection.silenceMs);
|
|
@@ -338,7 +389,7 @@ var createVoiceSession = (options) => {
|
|
|
338
389
|
const handlePartial = async (transcript) => {
|
|
339
390
|
await writeSession((session) => {
|
|
340
391
|
session.currentTurn.lastAudioAt = Date.now();
|
|
341
|
-
session.currentTurn.partialText = transcript.text;
|
|
392
|
+
session.currentTurn.partialText = buildTurnText(session.currentTurn.transcripts, transcript.text);
|
|
342
393
|
session.lastActivityAt = Date.now();
|
|
343
394
|
session.status = "active";
|
|
344
395
|
});
|
|
@@ -346,7 +397,6 @@ var createVoiceSession = (options) => {
|
|
|
346
397
|
transcript,
|
|
347
398
|
type: "partial"
|
|
348
399
|
});
|
|
349
|
-
scheduleSilenceCommit();
|
|
350
400
|
};
|
|
351
401
|
const handleFinal = async (transcript) => {
|
|
352
402
|
await writeSession((session) => {
|
|
@@ -370,7 +420,6 @@ var createVoiceSession = (options) => {
|
|
|
370
420
|
transcript,
|
|
371
421
|
type: "final"
|
|
372
422
|
});
|
|
373
|
-
scheduleSilenceCommit();
|
|
374
423
|
};
|
|
375
424
|
const ensureAdapter = async () => {
|
|
376
425
|
if (sttSession) {
|
|
@@ -472,6 +521,7 @@ var createVoiceSession = (options) => {
|
|
|
472
521
|
currentSession.status = "active";
|
|
473
522
|
currentSession.turns = [...currentSession.turns, turn];
|
|
474
523
|
});
|
|
524
|
+
speechDetected = false;
|
|
475
525
|
logger.info("voice turn committed", {
|
|
476
526
|
reason,
|
|
477
527
|
sessionId: options.id,
|
|
@@ -505,6 +555,7 @@ var createVoiceSession = (options) => {
|
|
|
505
555
|
type: "complete"
|
|
506
556
|
});
|
|
507
557
|
await closeAdapter("complete");
|
|
558
|
+
speechDetected = false;
|
|
508
559
|
await options.route.onComplete({
|
|
509
560
|
api,
|
|
510
561
|
context: options.context,
|
|
@@ -575,6 +626,7 @@ var createVoiceSession = (options) => {
|
|
|
575
626
|
session.reconnect.lastDisconnectAt = Date.now();
|
|
576
627
|
session.status = "reconnecting";
|
|
577
628
|
});
|
|
629
|
+
speechDetected = false;
|
|
578
630
|
},
|
|
579
631
|
fail: async (error) => {
|
|
580
632
|
clearSilenceTimer();
|
|
@@ -589,6 +641,7 @@ var createVoiceSession = (options) => {
|
|
|
589
641
|
type: "error"
|
|
590
642
|
});
|
|
591
643
|
await closeAdapter("failed");
|
|
644
|
+
speechDetected = false;
|
|
592
645
|
await options.route.onError?.({
|
|
593
646
|
api,
|
|
594
647
|
context: options.context,
|
|
@@ -603,11 +656,22 @@ var createVoiceSession = (options) => {
|
|
|
603
656
|
return;
|
|
604
657
|
}
|
|
605
658
|
const adapter = await ensureAdapter();
|
|
659
|
+
const audioLevel = measureAudioLevel(audio);
|
|
606
660
|
await writeSession((currentSession) => {
|
|
607
661
|
currentSession.currentTurn.lastAudioAt = Date.now();
|
|
608
662
|
currentSession.lastActivityAt = Date.now();
|
|
609
663
|
currentSession.status = "active";
|
|
610
664
|
});
|
|
665
|
+
if (audioLevel >= turnDetection.speechThreshold) {
|
|
666
|
+
speechDetected = true;
|
|
667
|
+
clearSilenceTimer();
|
|
668
|
+
} else if (speechDetected) {
|
|
669
|
+
const currentSession = await readSession();
|
|
670
|
+
const hasTurnText = Boolean(buildTurnText(currentSession.currentTurn.transcripts, currentSession.currentTurn.partialText));
|
|
671
|
+
if (hasTurnText) {
|
|
672
|
+
scheduleSilenceCommit();
|
|
673
|
+
}
|
|
674
|
+
}
|
|
611
675
|
await adapter.send(audio);
|
|
612
676
|
},
|
|
613
677
|
snapshot: async () => readSession()
|
|
@@ -776,7 +840,8 @@ var voice = (config) => {
|
|
|
776
840
|
store: config.session,
|
|
777
841
|
stt: config.stt,
|
|
778
842
|
turnDetection: {
|
|
779
|
-
silenceMs: config.turnDetection?.silenceMs ?? 700
|
|
843
|
+
silenceMs: config.turnDetection?.silenceMs ?? 700,
|
|
844
|
+
speechThreshold: config.turnDetection?.speechThreshold ?? 0.015
|
|
780
845
|
}
|
|
781
846
|
});
|
|
782
847
|
if (!current) {
|
|
@@ -811,7 +876,8 @@ var voice = (config) => {
|
|
|
811
876
|
store: config.session,
|
|
812
877
|
stt: config.stt,
|
|
813
878
|
turnDetection: {
|
|
814
|
-
silenceMs: config.turnDetection?.silenceMs ?? 700
|
|
879
|
+
silenceMs: config.turnDetection?.silenceMs ?? 700,
|
|
880
|
+
speechThreshold: config.turnDetection?.speechThreshold ?? 0.015
|
|
815
881
|
}
|
|
816
882
|
});
|
|
817
883
|
runtime.activeSessions.set(sessionId, session);
|
package/dist/react/index.js
CHANGED
package/dist/svelte/index.js
CHANGED
package/dist/turnDetection.d.ts
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
-
import type { Transcript } from './types';
|
|
1
|
+
import type { AudioChunk, Transcript } from './types';
|
|
2
2
|
export declare const DEFAULT_SILENCE_MS = 700;
|
|
3
|
+
export declare const DEFAULT_SPEECH_THRESHOLD = 0.015;
|
|
4
|
+
export declare const measureAudioLevel: (audio: AudioChunk) => number;
|
|
3
5
|
export declare const buildTurnText: (transcripts: Transcript[], partialText: string) => string;
|
package/dist/types.d.ts
CHANGED
|
@@ -211,6 +211,7 @@ export type VoicePluginConfig<TContext = unknown, TSession extends VoiceSessionR
|
|
|
211
211
|
reconnect?: VoiceReconnectConfig;
|
|
212
212
|
turnDetection?: {
|
|
213
213
|
silenceMs?: number;
|
|
214
|
+
speechThreshold?: number;
|
|
214
215
|
};
|
|
215
216
|
logger?: VoiceLogger;
|
|
216
217
|
htmx?: boolean | VoiceHTMXConfig<TSession, NoInfer<TResult>>;
|
|
@@ -224,6 +225,7 @@ export type CreateVoiceSessionOptions<TContext = unknown, TSession extends Voice
|
|
|
224
225
|
reconnect: Required<VoiceReconnectConfig>;
|
|
225
226
|
turnDetection: {
|
|
226
227
|
silenceMs: number;
|
|
228
|
+
speechThreshold: number;
|
|
227
229
|
};
|
|
228
230
|
route: VoiceNormalizedRouteConfig<TContext, TSession, TResult>;
|
|
229
231
|
logger?: VoiceLogger;
|
package/dist/vue/index.js
CHANGED