@absolutejs/voice 0.0.22-beta.596 → 0.0.22-beta.598

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,10 +1,20 @@
1
- import type { Transcript } from "./types";
1
+ import type { AudioFormat, Transcript } from "./types";
2
2
  export type VoiceSemanticTurnInput = {
3
3
  audioLevel?: number;
4
4
  lastFinalTranscript?: Transcript;
5
5
  partialText: string;
6
6
  silenceMs: number;
7
7
  transcripts: Transcript[];
8
+ /**
9
+ * The current turn's buffered user audio (PCM chunks, oldest→newest) and its
10
+ * format. Lets an AUDIO-based end-of-turn detector (e.g. a smart-turn / Whisper
11
+ * EOT model) judge completion from prosody — pitch, pace, trailing intonation —
12
+ * which a transcript-only judge fundamentally cannot see. Undefined when no
13
+ * audio was buffered for the turn (the runtime only stores chunks above the
14
+ * speech threshold).
15
+ */
16
+ turnAudio?: ReadonlyArray<Uint8Array>;
17
+ turnAudioFormat?: AudioFormat;
8
18
  };
9
19
  export type VoiceSemanticTurnVerdict = {
10
20
  confidence?: number;
package/dist/index.js CHANGED
@@ -3757,6 +3757,8 @@ var getBufferedAudioDurationMs = (chunks) => chunks.reduce((total, chunk) => tot
3757
3757
  var STREAM_SENTENCE_BOUNDARY = /[.!?\u2026]['")\]]*\s/;
3758
3758
  var STREAM_CLAUSE_BOUNDARY = /[,;:]\s/g;
3759
3759
  var MAX_TTS_CHUNK_CHARS = 320;
3760
+ var STREAM_SENTENCE_END = /[.!?\u2026]['")\]]*$/;
3761
+ var STREAM_IDLE_FLUSH_MS = 350;
3760
3762
  var nextSpeakableBoundary = (buffer) => {
3761
3763
  const match = STREAM_SENTENCE_BOUNDARY.exec(buffer);
3762
3764
  return match ? match.index + match[0].length : -1;
@@ -4135,6 +4137,17 @@ var createVoiceSession = (options) => {
4135
4137
  pruneTurnAudio();
4136
4138
  return currentTurnAudio.map((audio) => audio.chunk);
4137
4139
  };
4140
+ const turnAudioInputFormat = recordingConfig?.userInputFormat ?? options.realtimeInputFormat ?? DEFAULT_REALTIME_FORMAT;
4141
+ const getTurnAudioForDetector = () => {
4142
+ if (!options.semanticTurnDetector || currentTurnAudio.length === 0) {
4143
+ return { turnAudio: undefined, turnAudioFormat: undefined };
4144
+ }
4145
+ const turnAudio = currentTurnAudio.map((audio) => {
4146
+ const c = audio.chunk;
4147
+ return c instanceof ArrayBuffer ? new Uint8Array(c) : new Uint8Array(c.buffer, c.byteOffset, c.byteLength);
4148
+ });
4149
+ return { turnAudio, turnAudioFormat: turnAudioInputFormat };
4150
+ };
4138
4151
  const clearSilenceTimer = () => {
4139
4152
  if (!silenceTimer) {
4140
4153
  return;
@@ -4474,7 +4487,8 @@ var createVoiceSession = (options) => {
4474
4487
  lastFinalTranscript: transcripts.at(-1),
4475
4488
  partialText,
4476
4489
  silenceMs,
4477
- transcripts
4490
+ transcripts,
4491
+ ...getTurnAudioForDetector()
4478
4492
  }));
4479
4493
  endOfTurn = verdict.endOfTurn;
4480
4494
  } catch {
@@ -5259,7 +5273,8 @@ var createVoiceSession = (options) => {
5259
5273
  lastFinalTranscript: transcript,
5260
5274
  partialText: session.currentTurn.partialText,
5261
5275
  silenceMs: session.currentTurn.silenceStartedAt !== undefined ? Date.now() - session.currentTurn.silenceStartedAt : 0,
5262
- transcripts: session.currentTurn.transcripts
5276
+ transcripts: session.currentTurn.transcripts,
5277
+ ...getTurnAudioForDetector()
5263
5278
  }));
5264
5279
  if (verdict.endOfTurn) {
5265
5280
  clearSilenceTimer();
@@ -5448,6 +5463,7 @@ var createVoiceSession = (options) => {
5448
5463
  let charsSent = 0;
5449
5464
  let started = false;
5450
5465
  let streamed = false;
5466
+ let idleFlushTimer = null;
5451
5467
  let sendChain = Promise.resolve();
5452
5468
  let ttsSessionRequest = null;
5453
5469
  const ttsStartedAt = Date.now();
@@ -5507,8 +5523,23 @@ var createVoiceSession = (options) => {
5507
5523
  }
5508
5524
  })();
5509
5525
  };
5526
+ const clearIdleFlush = () => {
5527
+ if (idleFlushTimer) {
5528
+ clearTimeout(idleFlushTimer);
5529
+ idleFlushTimer = null;
5530
+ }
5531
+ };
5532
+ const flushOnIdle = () => {
5533
+ idleFlushTimer = null;
5534
+ const pending = buffer.trim();
5535
+ if (pending && STREAM_SENTENCE_END.test(pending)) {
5536
+ flush(buffer);
5537
+ buffer = "";
5538
+ }
5539
+ };
5510
5540
  return {
5511
5541
  finish: async () => {
5542
+ clearIdleFlush();
5512
5543
  if (buffer.trim()) {
5513
5544
  flush(buffer);
5514
5545
  }
@@ -5554,6 +5585,10 @@ var createVoiceSession = (options) => {
5554
5585
  flush(buffer.slice(0, cut));
5555
5586
  buffer = buffer.slice(cut);
5556
5587
  }
5588
+ clearIdleFlush();
5589
+ if (buffer.trim()) {
5590
+ idleFlushTimer = setTimeout(flushOnIdle, STREAM_IDLE_FLUSH_MS);
5591
+ }
5557
5592
  }
5558
5593
  };
5559
5594
  };
@@ -5984,6 +5984,8 @@ var getBufferedAudioDurationMs = (chunks) => chunks.reduce((total, chunk) => tot
5984
5984
  var STREAM_SENTENCE_BOUNDARY = /[.!?\u2026]['")\]]*\s/;
5985
5985
  var STREAM_CLAUSE_BOUNDARY = /[,;:]\s/g;
5986
5986
  var MAX_TTS_CHUNK_CHARS = 320;
5987
+ var STREAM_SENTENCE_END = /[.!?\u2026]['")\]]*$/;
5988
+ var STREAM_IDLE_FLUSH_MS = 350;
5987
5989
  var nextSpeakableBoundary = (buffer) => {
5988
5990
  const match = STREAM_SENTENCE_BOUNDARY.exec(buffer);
5989
5991
  return match ? match.index + match[0].length : -1;
@@ -6362,6 +6364,17 @@ var createVoiceSession = (options) => {
6362
6364
  pruneTurnAudio();
6363
6365
  return currentTurnAudio.map((audio) => audio.chunk);
6364
6366
  };
6367
+ const turnAudioInputFormat = recordingConfig?.userInputFormat ?? options.realtimeInputFormat ?? DEFAULT_REALTIME_FORMAT;
6368
+ const getTurnAudioForDetector = () => {
6369
+ if (!options.semanticTurnDetector || currentTurnAudio.length === 0) {
6370
+ return { turnAudio: undefined, turnAudioFormat: undefined };
6371
+ }
6372
+ const turnAudio = currentTurnAudio.map((audio) => {
6373
+ const c = audio.chunk;
6374
+ return c instanceof ArrayBuffer ? new Uint8Array(c) : new Uint8Array(c.buffer, c.byteOffset, c.byteLength);
6375
+ });
6376
+ return { turnAudio, turnAudioFormat: turnAudioInputFormat };
6377
+ };
6365
6378
  const clearSilenceTimer = () => {
6366
6379
  if (!silenceTimer) {
6367
6380
  return;
@@ -6701,7 +6714,8 @@ var createVoiceSession = (options) => {
6701
6714
  lastFinalTranscript: transcripts.at(-1),
6702
6715
  partialText,
6703
6716
  silenceMs,
6704
- transcripts
6717
+ transcripts,
6718
+ ...getTurnAudioForDetector()
6705
6719
  }));
6706
6720
  endOfTurn = verdict.endOfTurn;
6707
6721
  } catch {
@@ -7486,7 +7500,8 @@ var createVoiceSession = (options) => {
7486
7500
  lastFinalTranscript: transcript,
7487
7501
  partialText: session.currentTurn.partialText,
7488
7502
  silenceMs: session.currentTurn.silenceStartedAt !== undefined ? Date.now() - session.currentTurn.silenceStartedAt : 0,
7489
- transcripts: session.currentTurn.transcripts
7503
+ transcripts: session.currentTurn.transcripts,
7504
+ ...getTurnAudioForDetector()
7490
7505
  }));
7491
7506
  if (verdict.endOfTurn) {
7492
7507
  clearSilenceTimer();
@@ -7675,6 +7690,7 @@ var createVoiceSession = (options) => {
7675
7690
  let charsSent = 0;
7676
7691
  let started = false;
7677
7692
  let streamed = false;
7693
+ let idleFlushTimer = null;
7678
7694
  let sendChain = Promise.resolve();
7679
7695
  let ttsSessionRequest = null;
7680
7696
  const ttsStartedAt = Date.now();
@@ -7734,8 +7750,23 @@ var createVoiceSession = (options) => {
7734
7750
  }
7735
7751
  })();
7736
7752
  };
7753
+ const clearIdleFlush = () => {
7754
+ if (idleFlushTimer) {
7755
+ clearTimeout(idleFlushTimer);
7756
+ idleFlushTimer = null;
7757
+ }
7758
+ };
7759
+ const flushOnIdle = () => {
7760
+ idleFlushTimer = null;
7761
+ const pending = buffer.trim();
7762
+ if (pending && STREAM_SENTENCE_END.test(pending)) {
7763
+ flush(buffer);
7764
+ buffer = "";
7765
+ }
7766
+ };
7737
7767
  return {
7738
7768
  finish: async () => {
7769
+ clearIdleFlush();
7739
7770
  if (buffer.trim()) {
7740
7771
  flush(buffer);
7741
7772
  }
@@ -7781,6 +7812,10 @@ var createVoiceSession = (options) => {
7781
7812
  flush(buffer.slice(0, cut));
7782
7813
  buffer = buffer.slice(cut);
7783
7814
  }
7815
+ clearIdleFlush();
7816
+ if (buffer.trim()) {
7817
+ idleFlushTimer = setTimeout(flushOnIdle, STREAM_IDLE_FLUSH_MS);
7818
+ }
7784
7819
  }
7785
7820
  };
7786
7821
  };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@absolutejs/voice",
3
- "version": "0.0.22-beta.596",
3
+ "version": "0.0.22-beta.598",
4
4
  "description": "Voice primitives and Elysia plugin for AbsoluteJS",
5
5
  "repository": {
6
6
  "type": "git",