@absolutejs/voice 0.0.22-beta.597 → 0.0.22-beta.598
This diff shows the changes between two publicly available versions of the package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- package/dist/core/semanticTurn.d.ts +11 -1
- package/dist/index.js +15 -2
- package/dist/testing/index.js +15 -2
- package/package.json +1 -1
|
@@ -1,10 +1,20 @@
|
|
|
1
|
-
import type { Transcript } from "./types";
|
|
1
|
+
import type { AudioFormat, Transcript } from "./types";
|
|
2
2
|
export type VoiceSemanticTurnInput = {
|
|
3
3
|
audioLevel?: number;
|
|
4
4
|
lastFinalTranscript?: Transcript;
|
|
5
5
|
partialText: string;
|
|
6
6
|
silenceMs: number;
|
|
7
7
|
transcripts: Transcript[];
|
|
8
|
+
/**
|
|
9
|
+
* The current turn's buffered user audio (PCM chunks, oldest→newest) and its
|
|
10
|
+
* format. Lets an AUDIO-based end-of-turn detector (e.g. a smart-turn / Whisper
|
|
11
|
+
* EOT model) judge completion from prosody — pitch, pace, trailing intonation —
|
|
12
|
+
* which a transcript-only judge fundamentally cannot see. Undefined when no
|
|
13
|
+
* audio was buffered for the turn (the runtime only stores chunks above the
|
|
14
|
+
* speech threshold).
|
|
15
|
+
*/
|
|
16
|
+
turnAudio?: ReadonlyArray<Uint8Array>;
|
|
17
|
+
turnAudioFormat?: AudioFormat;
|
|
8
18
|
};
|
|
9
19
|
export type VoiceSemanticTurnVerdict = {
|
|
10
20
|
confidence?: number;
|
package/dist/index.js
CHANGED
|
@@ -4137,6 +4137,17 @@ var createVoiceSession = (options) => {
|
|
|
4137
4137
|
pruneTurnAudio();
|
|
4138
4138
|
return currentTurnAudio.map((audio) => audio.chunk);
|
|
4139
4139
|
};
|
|
4140
|
+
const turnAudioInputFormat = recordingConfig?.userInputFormat ?? options.realtimeInputFormat ?? DEFAULT_REALTIME_FORMAT;
|
|
4141
|
+
const getTurnAudioForDetector = () => {
|
|
4142
|
+
if (!options.semanticTurnDetector || currentTurnAudio.length === 0) {
|
|
4143
|
+
return { turnAudio: undefined, turnAudioFormat: undefined };
|
|
4144
|
+
}
|
|
4145
|
+
const turnAudio = currentTurnAudio.map((audio) => {
|
|
4146
|
+
const c = audio.chunk;
|
|
4147
|
+
return c instanceof ArrayBuffer ? new Uint8Array(c) : new Uint8Array(c.buffer, c.byteOffset, c.byteLength);
|
|
4148
|
+
});
|
|
4149
|
+
return { turnAudio, turnAudioFormat: turnAudioInputFormat };
|
|
4150
|
+
};
|
|
4140
4151
|
const clearSilenceTimer = () => {
|
|
4141
4152
|
if (!silenceTimer) {
|
|
4142
4153
|
return;
|
|
@@ -4476,7 +4487,8 @@ var createVoiceSession = (options) => {
|
|
|
4476
4487
|
lastFinalTranscript: transcripts.at(-1),
|
|
4477
4488
|
partialText,
|
|
4478
4489
|
silenceMs,
|
|
4479
|
-
transcripts
|
|
4490
|
+
transcripts,
|
|
4491
|
+
...getTurnAudioForDetector()
|
|
4480
4492
|
}));
|
|
4481
4493
|
endOfTurn = verdict.endOfTurn;
|
|
4482
4494
|
} catch {
|
|
@@ -5261,7 +5273,8 @@ var createVoiceSession = (options) => {
|
|
|
5261
5273
|
lastFinalTranscript: transcript,
|
|
5262
5274
|
partialText: session.currentTurn.partialText,
|
|
5263
5275
|
silenceMs: session.currentTurn.silenceStartedAt !== undefined ? Date.now() - session.currentTurn.silenceStartedAt : 0,
|
|
5264
|
-
transcripts: session.currentTurn.transcripts
|
|
5276
|
+
transcripts: session.currentTurn.transcripts,
|
|
5277
|
+
...getTurnAudioForDetector()
|
|
5265
5278
|
}));
|
|
5266
5279
|
if (verdict.endOfTurn) {
|
|
5267
5280
|
clearSilenceTimer();
|
package/dist/testing/index.js
CHANGED
|
@@ -6364,6 +6364,17 @@ var createVoiceSession = (options) => {
|
|
|
6364
6364
|
pruneTurnAudio();
|
|
6365
6365
|
return currentTurnAudio.map((audio) => audio.chunk);
|
|
6366
6366
|
};
|
|
6367
|
+
const turnAudioInputFormat = recordingConfig?.userInputFormat ?? options.realtimeInputFormat ?? DEFAULT_REALTIME_FORMAT;
|
|
6368
|
+
const getTurnAudioForDetector = () => {
|
|
6369
|
+
if (!options.semanticTurnDetector || currentTurnAudio.length === 0) {
|
|
6370
|
+
return { turnAudio: undefined, turnAudioFormat: undefined };
|
|
6371
|
+
}
|
|
6372
|
+
const turnAudio = currentTurnAudio.map((audio) => {
|
|
6373
|
+
const c = audio.chunk;
|
|
6374
|
+
return c instanceof ArrayBuffer ? new Uint8Array(c) : new Uint8Array(c.buffer, c.byteOffset, c.byteLength);
|
|
6375
|
+
});
|
|
6376
|
+
return { turnAudio, turnAudioFormat: turnAudioInputFormat };
|
|
6377
|
+
};
|
|
6367
6378
|
const clearSilenceTimer = () => {
|
|
6368
6379
|
if (!silenceTimer) {
|
|
6369
6380
|
return;
|
|
@@ -6703,7 +6714,8 @@ var createVoiceSession = (options) => {
|
|
|
6703
6714
|
lastFinalTranscript: transcripts.at(-1),
|
|
6704
6715
|
partialText,
|
|
6705
6716
|
silenceMs,
|
|
6706
|
-
transcripts
|
|
6717
|
+
transcripts,
|
|
6718
|
+
...getTurnAudioForDetector()
|
|
6707
6719
|
}));
|
|
6708
6720
|
endOfTurn = verdict.endOfTurn;
|
|
6709
6721
|
} catch {
|
|
@@ -7488,7 +7500,8 @@ var createVoiceSession = (options) => {
|
|
|
7488
7500
|
lastFinalTranscript: transcript,
|
|
7489
7501
|
partialText: session.currentTurn.partialText,
|
|
7490
7502
|
silenceMs: session.currentTurn.silenceStartedAt !== undefined ? Date.now() - session.currentTurn.silenceStartedAt : 0,
|
|
7491
|
-
transcripts: session.currentTurn.transcripts
|
|
7503
|
+
transcripts: session.currentTurn.transcripts,
|
|
7504
|
+
...getTurnAudioForDetector()
|
|
7492
7505
|
}));
|
|
7493
7506
|
if (verdict.endOfTurn) {
|
|
7494
7507
|
clearSilenceTimer();
|