@absolutejs/voice 0.0.22-beta.596 → 0.0.22-beta.598
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core/semanticTurn.d.ts +11 -1
- package/dist/index.js +37 -2
- package/dist/testing/index.js +37 -2
- package/package.json +1 -1
|
@@ -1,10 +1,20 @@
|
|
|
1
|
-
import type { Transcript } from "./types";
|
|
1
|
+
import type { AudioFormat, Transcript } from "./types";
|
|
2
2
|
export type VoiceSemanticTurnInput = {
|
|
3
3
|
audioLevel?: number;
|
|
4
4
|
lastFinalTranscript?: Transcript;
|
|
5
5
|
partialText: string;
|
|
6
6
|
silenceMs: number;
|
|
7
7
|
transcripts: Transcript[];
|
|
8
|
+
/**
|
|
9
|
+
* The current turn's buffered user audio (PCM chunks, oldest→newest) and its
|
|
10
|
+
* format. Lets an AUDIO-based end-of-turn detector (e.g. a smart-turn / Whisper
|
|
11
|
+
* EOT model) judge completion from prosody — pitch, pace, trailing intonation —
|
|
12
|
+
* which a transcript-only judge fundamentally cannot see. Undefined when no
|
|
13
|
+
* audio was buffered for the turn (the runtime only stores chunks above the
|
|
14
|
+
* speech threshold).
|
|
15
|
+
*/
|
|
16
|
+
turnAudio?: ReadonlyArray<Uint8Array>;
|
|
17
|
+
turnAudioFormat?: AudioFormat;
|
|
8
18
|
};
|
|
9
19
|
export type VoiceSemanticTurnVerdict = {
|
|
10
20
|
confidence?: number;
|
package/dist/index.js
CHANGED
|
@@ -3757,6 +3757,8 @@ var getBufferedAudioDurationMs = (chunks) => chunks.reduce((total, chunk) => tot
|
|
|
3757
3757
|
var STREAM_SENTENCE_BOUNDARY = /[.!?\u2026]['")\]]*\s/;
|
|
3758
3758
|
var STREAM_CLAUSE_BOUNDARY = /[,;:]\s/g;
|
|
3759
3759
|
var MAX_TTS_CHUNK_CHARS = 320;
|
|
3760
|
+
var STREAM_SENTENCE_END = /[.!?\u2026]['")\]]*$/;
|
|
3761
|
+
var STREAM_IDLE_FLUSH_MS = 350;
|
|
3760
3762
|
var nextSpeakableBoundary = (buffer) => {
|
|
3761
3763
|
const match = STREAM_SENTENCE_BOUNDARY.exec(buffer);
|
|
3762
3764
|
return match ? match.index + match[0].length : -1;
|
|
@@ -4135,6 +4137,17 @@ var createVoiceSession = (options) => {
|
|
|
4135
4137
|
pruneTurnAudio();
|
|
4136
4138
|
return currentTurnAudio.map((audio) => audio.chunk);
|
|
4137
4139
|
};
|
|
4140
|
+
const turnAudioInputFormat = recordingConfig?.userInputFormat ?? options.realtimeInputFormat ?? DEFAULT_REALTIME_FORMAT;
|
|
4141
|
+
const getTurnAudioForDetector = () => {
|
|
4142
|
+
if (!options.semanticTurnDetector || currentTurnAudio.length === 0) {
|
|
4143
|
+
return { turnAudio: undefined, turnAudioFormat: undefined };
|
|
4144
|
+
}
|
|
4145
|
+
const turnAudio = currentTurnAudio.map((audio) => {
|
|
4146
|
+
const c = audio.chunk;
|
|
4147
|
+
return c instanceof ArrayBuffer ? new Uint8Array(c) : new Uint8Array(c.buffer, c.byteOffset, c.byteLength);
|
|
4148
|
+
});
|
|
4149
|
+
return { turnAudio, turnAudioFormat: turnAudioInputFormat };
|
|
4150
|
+
};
|
|
4138
4151
|
const clearSilenceTimer = () => {
|
|
4139
4152
|
if (!silenceTimer) {
|
|
4140
4153
|
return;
|
|
@@ -4474,7 +4487,8 @@ var createVoiceSession = (options) => {
|
|
|
4474
4487
|
lastFinalTranscript: transcripts.at(-1),
|
|
4475
4488
|
partialText,
|
|
4476
4489
|
silenceMs,
|
|
4477
|
-
transcripts
|
|
4490
|
+
transcripts,
|
|
4491
|
+
...getTurnAudioForDetector()
|
|
4478
4492
|
}));
|
|
4479
4493
|
endOfTurn = verdict.endOfTurn;
|
|
4480
4494
|
} catch {
|
|
@@ -5259,7 +5273,8 @@ var createVoiceSession = (options) => {
|
|
|
5259
5273
|
lastFinalTranscript: transcript,
|
|
5260
5274
|
partialText: session.currentTurn.partialText,
|
|
5261
5275
|
silenceMs: session.currentTurn.silenceStartedAt !== undefined ? Date.now() - session.currentTurn.silenceStartedAt : 0,
|
|
5262
|
-
transcripts: session.currentTurn.transcripts
|
|
5276
|
+
transcripts: session.currentTurn.transcripts,
|
|
5277
|
+
...getTurnAudioForDetector()
|
|
5263
5278
|
}));
|
|
5264
5279
|
if (verdict.endOfTurn) {
|
|
5265
5280
|
clearSilenceTimer();
|
|
@@ -5448,6 +5463,7 @@ var createVoiceSession = (options) => {
|
|
|
5448
5463
|
let charsSent = 0;
|
|
5449
5464
|
let started = false;
|
|
5450
5465
|
let streamed = false;
|
|
5466
|
+
let idleFlushTimer = null;
|
|
5451
5467
|
let sendChain = Promise.resolve();
|
|
5452
5468
|
let ttsSessionRequest = null;
|
|
5453
5469
|
const ttsStartedAt = Date.now();
|
|
@@ -5507,8 +5523,23 @@ var createVoiceSession = (options) => {
|
|
|
5507
5523
|
}
|
|
5508
5524
|
})();
|
|
5509
5525
|
};
|
|
5526
|
+
const clearIdleFlush = () => {
|
|
5527
|
+
if (idleFlushTimer) {
|
|
5528
|
+
clearTimeout(idleFlushTimer);
|
|
5529
|
+
idleFlushTimer = null;
|
|
5530
|
+
}
|
|
5531
|
+
};
|
|
5532
|
+
const flushOnIdle = () => {
|
|
5533
|
+
idleFlushTimer = null;
|
|
5534
|
+
const pending = buffer.trim();
|
|
5535
|
+
if (pending && STREAM_SENTENCE_END.test(pending)) {
|
|
5536
|
+
flush(buffer);
|
|
5537
|
+
buffer = "";
|
|
5538
|
+
}
|
|
5539
|
+
};
|
|
5510
5540
|
return {
|
|
5511
5541
|
finish: async () => {
|
|
5542
|
+
clearIdleFlush();
|
|
5512
5543
|
if (buffer.trim()) {
|
|
5513
5544
|
flush(buffer);
|
|
5514
5545
|
}
|
|
@@ -5554,6 +5585,10 @@ var createVoiceSession = (options) => {
|
|
|
5554
5585
|
flush(buffer.slice(0, cut));
|
|
5555
5586
|
buffer = buffer.slice(cut);
|
|
5556
5587
|
}
|
|
5588
|
+
clearIdleFlush();
|
|
5589
|
+
if (buffer.trim()) {
|
|
5590
|
+
idleFlushTimer = setTimeout(flushOnIdle, STREAM_IDLE_FLUSH_MS);
|
|
5591
|
+
}
|
|
5557
5592
|
}
|
|
5558
5593
|
};
|
|
5559
5594
|
};
|
package/dist/testing/index.js
CHANGED
|
@@ -5984,6 +5984,8 @@ var getBufferedAudioDurationMs = (chunks) => chunks.reduce((total, chunk) => tot
|
|
|
5984
5984
|
var STREAM_SENTENCE_BOUNDARY = /[.!?\u2026]['")\]]*\s/;
|
|
5985
5985
|
var STREAM_CLAUSE_BOUNDARY = /[,;:]\s/g;
|
|
5986
5986
|
var MAX_TTS_CHUNK_CHARS = 320;
|
|
5987
|
+
var STREAM_SENTENCE_END = /[.!?\u2026]['")\]]*$/;
|
|
5988
|
+
var STREAM_IDLE_FLUSH_MS = 350;
|
|
5987
5989
|
var nextSpeakableBoundary = (buffer) => {
|
|
5988
5990
|
const match = STREAM_SENTENCE_BOUNDARY.exec(buffer);
|
|
5989
5991
|
return match ? match.index + match[0].length : -1;
|
|
@@ -6362,6 +6364,17 @@ var createVoiceSession = (options) => {
|
|
|
6362
6364
|
pruneTurnAudio();
|
|
6363
6365
|
return currentTurnAudio.map((audio) => audio.chunk);
|
|
6364
6366
|
};
|
|
6367
|
+
const turnAudioInputFormat = recordingConfig?.userInputFormat ?? options.realtimeInputFormat ?? DEFAULT_REALTIME_FORMAT;
|
|
6368
|
+
const getTurnAudioForDetector = () => {
|
|
6369
|
+
if (!options.semanticTurnDetector || currentTurnAudio.length === 0) {
|
|
6370
|
+
return { turnAudio: undefined, turnAudioFormat: undefined };
|
|
6371
|
+
}
|
|
6372
|
+
const turnAudio = currentTurnAudio.map((audio) => {
|
|
6373
|
+
const c = audio.chunk;
|
|
6374
|
+
return c instanceof ArrayBuffer ? new Uint8Array(c) : new Uint8Array(c.buffer, c.byteOffset, c.byteLength);
|
|
6375
|
+
});
|
|
6376
|
+
return { turnAudio, turnAudioFormat: turnAudioInputFormat };
|
|
6377
|
+
};
|
|
6365
6378
|
const clearSilenceTimer = () => {
|
|
6366
6379
|
if (!silenceTimer) {
|
|
6367
6380
|
return;
|
|
@@ -6701,7 +6714,8 @@ var createVoiceSession = (options) => {
|
|
|
6701
6714
|
lastFinalTranscript: transcripts.at(-1),
|
|
6702
6715
|
partialText,
|
|
6703
6716
|
silenceMs,
|
|
6704
|
-
transcripts
|
|
6717
|
+
transcripts,
|
|
6718
|
+
...getTurnAudioForDetector()
|
|
6705
6719
|
}));
|
|
6706
6720
|
endOfTurn = verdict.endOfTurn;
|
|
6707
6721
|
} catch {
|
|
@@ -7486,7 +7500,8 @@ var createVoiceSession = (options) => {
|
|
|
7486
7500
|
lastFinalTranscript: transcript,
|
|
7487
7501
|
partialText: session.currentTurn.partialText,
|
|
7488
7502
|
silenceMs: session.currentTurn.silenceStartedAt !== undefined ? Date.now() - session.currentTurn.silenceStartedAt : 0,
|
|
7489
|
-
transcripts: session.currentTurn.transcripts
|
|
7503
|
+
transcripts: session.currentTurn.transcripts,
|
|
7504
|
+
...getTurnAudioForDetector()
|
|
7490
7505
|
}));
|
|
7491
7506
|
if (verdict.endOfTurn) {
|
|
7492
7507
|
clearSilenceTimer();
|
|
@@ -7675,6 +7690,7 @@ var createVoiceSession = (options) => {
|
|
|
7675
7690
|
let charsSent = 0;
|
|
7676
7691
|
let started = false;
|
|
7677
7692
|
let streamed = false;
|
|
7693
|
+
let idleFlushTimer = null;
|
|
7678
7694
|
let sendChain = Promise.resolve();
|
|
7679
7695
|
let ttsSessionRequest = null;
|
|
7680
7696
|
const ttsStartedAt = Date.now();
|
|
@@ -7734,8 +7750,23 @@ var createVoiceSession = (options) => {
|
|
|
7734
7750
|
}
|
|
7735
7751
|
})();
|
|
7736
7752
|
};
|
|
7753
|
+
const clearIdleFlush = () => {
|
|
7754
|
+
if (idleFlushTimer) {
|
|
7755
|
+
clearTimeout(idleFlushTimer);
|
|
7756
|
+
idleFlushTimer = null;
|
|
7757
|
+
}
|
|
7758
|
+
};
|
|
7759
|
+
const flushOnIdle = () => {
|
|
7760
|
+
idleFlushTimer = null;
|
|
7761
|
+
const pending = buffer.trim();
|
|
7762
|
+
if (pending && STREAM_SENTENCE_END.test(pending)) {
|
|
7763
|
+
flush(buffer);
|
|
7764
|
+
buffer = "";
|
|
7765
|
+
}
|
|
7766
|
+
};
|
|
7737
7767
|
return {
|
|
7738
7768
|
finish: async () => {
|
|
7769
|
+
clearIdleFlush();
|
|
7739
7770
|
if (buffer.trim()) {
|
|
7740
7771
|
flush(buffer);
|
|
7741
7772
|
}
|
|
@@ -7781,6 +7812,10 @@ var createVoiceSession = (options) => {
|
|
|
7781
7812
|
flush(buffer.slice(0, cut));
|
|
7782
7813
|
buffer = buffer.slice(cut);
|
|
7783
7814
|
}
|
|
7815
|
+
clearIdleFlush();
|
|
7816
|
+
if (buffer.trim()) {
|
|
7817
|
+
idleFlushTimer = setTimeout(flushOnIdle, STREAM_IDLE_FLUSH_MS);
|
|
7818
|
+
}
|
|
7784
7819
|
}
|
|
7785
7820
|
};
|
|
7786
7821
|
};
|