@absolutejs/voice 0.0.14 → 0.0.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -379,7 +379,7 @@ var createVoiceStreamStore = () => {
379
379
  case "final":
380
380
  state = {
381
381
  ...state,
382
- partial: "",
382
+ partial: action.transcript.text,
383
383
  turns: state.turns.map((turn) => turn)
384
384
  };
385
385
  break;
@@ -375,7 +375,7 @@ var createVoiceStreamStore = () => {
375
375
  case "final":
376
376
  state = {
377
377
  ...state,
378
- partial: "",
378
+ partial: action.transcript.text,
379
379
  turns: state.turns.map((turn) => turn)
380
380
  };
381
381
  break;
package/dist/index.js CHANGED
@@ -236,12 +236,59 @@ var toVoiceSessionSummary = (session) => ({
236
236
 
237
237
  // src/turnDetection.ts
238
238
  var DEFAULT_SILENCE_MS = 700;
239
+ var DEFAULT_SPEECH_THRESHOLD = 0.015;
240
+ var toUint8Array = (audio) => {
241
+ if (audio instanceof ArrayBuffer) {
242
+ return new Uint8Array(audio);
243
+ }
244
+ return new Uint8Array(audio.buffer, audio.byteOffset, audio.byteLength);
245
+ };
246
+ var measureAudioLevel = (audio) => {
247
+ const bytes = toUint8Array(audio);
248
+ if (bytes.byteLength < 2) {
249
+ return 0;
250
+ }
251
+ const samples = new Int16Array(bytes.buffer, bytes.byteOffset, Math.floor(bytes.byteLength / 2));
252
+ if (samples.length === 0) {
253
+ return 0;
254
+ }
255
+ let sumSquares = 0;
256
+ for (const sample of samples) {
257
+ const normalized = sample / 32768;
258
+ sumSquares += normalized * normalized;
259
+ }
260
+ return Math.sqrt(sumSquares / samples.length);
261
+ };
262
/**
 * Trims a string and collapses every run of whitespace to a single space.
 *
 * @param value Text to normalize.
 * @returns The normalized text; empty when `value` is only whitespace.
 */
var normalizeText = (value) => value.trim().replace(/\s+/g, " ");

/**
 * Joins the texts of a list of transcripts into one string, dropping
 * entries that repeat or are superseded by their neighbour.
 *
 * For each non-empty normalized text, compared with the last kept text:
 *  - if it equals, or appears as a whole phrase inside, the last kept text,
 *    it is skipped;
 *  - if the last kept text appears as a whole phrase inside it, it replaces
 *    the last kept text;
 *  - otherwise it is appended.
 *
 * "Whole phrase" means the match is not glued to a letter or digit on either
 * side. A plain substring test would treat "no" as already present in
 * "I don't know" and silently drop it.
 *
 * @param transcripts Objects with a string `text` property.
 * @returns The kept texts joined by single spaces.
 */
var mergeTranscriptTexts = (transcripts) => {
  const wordCharacter = /[\p{L}\p{N}]/u;
  // True when `text[index]` exists and is a letter or digit.
  const isWordCharacterAt = (text, index) =>
    index >= 0 && index < text.length && wordCharacter.test(text[index]);
  // True when `needle` occurs in `haystack` with no letter/digit touching
  // either end of the occurrence.
  const containsPhrase = (haystack, needle) => {
    let start = haystack.indexOf(needle);
    while (start !== -1) {
      const end = start + needle.length;
      if (!isWordCharacterAt(haystack, start - 1) && !isWordCharacterAt(haystack, end)) {
        return true;
      }
      start = haystack.indexOf(needle, start + 1);
    }
    return false;
  };

  const merged = [];
  for (const transcript of transcripts) {
    const nextText = normalizeText(transcript.text);
    if (!nextText) {
      continue;
    }
    const previous = merged[merged.length - 1];
    if (previous === undefined) {
      merged.push(nextText);
      continue;
    }
    if (nextText === previous || containsPhrase(previous, nextText)) {
      continue;
    }
    if (containsPhrase(nextText, previous)) {
      merged[merged.length - 1] = nextText;
      continue;
    }
    merged.push(nextText);
  }
  return merged.join(" ").trim();
};
239
286
  var buildTurnText = (transcripts, partialText) => {
240
- const finalText = transcripts.map((transcript) => transcript.text.trim()).filter(Boolean).join(" ").trim();
287
+ const finalText = mergeTranscriptTexts(transcripts);
241
288
  if (finalText) {
242
289
  return finalText;
243
290
  }
244
- return partialText.trim();
291
+ return normalizeText(partialText);
245
292
  };
246
293
 
247
294
  // src/session.ts
@@ -269,11 +316,13 @@ var createVoiceSession = (options) => {
269
316
  timeout: options.reconnect.timeout ?? DEFAULT_RECONNECT_TIMEOUT
270
317
  };
271
318
  const turnDetection = {
272
- silenceMs: options.turnDetection.silenceMs ?? DEFAULT_SILENCE_MS
319
+ silenceMs: options.turnDetection.silenceMs ?? DEFAULT_SILENCE_MS,
320
+ speechThreshold: options.turnDetection.speechThreshold ?? DEFAULT_SPEECH_THRESHOLD
273
321
  };
274
322
  let socket = options.socket;
275
323
  let sttSession = null;
276
324
  let silenceTimer = null;
325
+ let speechDetected = false;
277
326
  const clearSilenceTimer = () => {
278
327
  if (!silenceTimer) {
279
328
  return;
@@ -315,7 +364,9 @@ var createVoiceSession = (options) => {
315
364
  }
316
365
  };
317
366
  const scheduleSilenceCommit = () => {
318
- clearSilenceTimer();
367
+ if (silenceTimer) {
368
+ return;
369
+ }
319
370
  silenceTimer = setTimeout(() => {
320
371
  api.commitTurn("silence");
321
372
  }, turnDetection.silenceMs);
@@ -338,7 +389,7 @@ var createVoiceSession = (options) => {
338
389
  const handlePartial = async (transcript) => {
339
390
  await writeSession((session) => {
340
391
  session.currentTurn.lastAudioAt = Date.now();
341
- session.currentTurn.partialText = transcript.text;
392
+ session.currentTurn.partialText = buildTurnText(session.currentTurn.transcripts, transcript.text);
342
393
  session.lastActivityAt = Date.now();
343
394
  session.status = "active";
344
395
  });
@@ -346,7 +397,6 @@ var createVoiceSession = (options) => {
346
397
  transcript,
347
398
  type: "partial"
348
399
  });
349
- scheduleSilenceCommit();
350
400
  };
351
401
  const handleFinal = async (transcript) => {
352
402
  await writeSession((session) => {
@@ -370,7 +420,6 @@ var createVoiceSession = (options) => {
370
420
  transcript,
371
421
  type: "final"
372
422
  });
373
- scheduleSilenceCommit();
374
423
  };
375
424
  const ensureAdapter = async () => {
376
425
  if (sttSession) {
@@ -472,6 +521,7 @@ var createVoiceSession = (options) => {
472
521
  currentSession.status = "active";
473
522
  currentSession.turns = [...currentSession.turns, turn];
474
523
  });
524
+ speechDetected = false;
475
525
  logger.info("voice turn committed", {
476
526
  reason,
477
527
  sessionId: options.id,
@@ -505,6 +555,7 @@ var createVoiceSession = (options) => {
505
555
  type: "complete"
506
556
  });
507
557
  await closeAdapter("complete");
558
+ speechDetected = false;
508
559
  await options.route.onComplete({
509
560
  api,
510
561
  context: options.context,
@@ -575,6 +626,7 @@ var createVoiceSession = (options) => {
575
626
  session.reconnect.lastDisconnectAt = Date.now();
576
627
  session.status = "reconnecting";
577
628
  });
629
+ speechDetected = false;
578
630
  },
579
631
  fail: async (error) => {
580
632
  clearSilenceTimer();
@@ -589,6 +641,7 @@ var createVoiceSession = (options) => {
589
641
  type: "error"
590
642
  });
591
643
  await closeAdapter("failed");
644
+ speechDetected = false;
592
645
  await options.route.onError?.({
593
646
  api,
594
647
  context: options.context,
@@ -603,11 +656,22 @@ var createVoiceSession = (options) => {
603
656
  return;
604
657
  }
605
658
  const adapter = await ensureAdapter();
659
+ const audioLevel = measureAudioLevel(audio);
606
660
  await writeSession((currentSession) => {
607
661
  currentSession.currentTurn.lastAudioAt = Date.now();
608
662
  currentSession.lastActivityAt = Date.now();
609
663
  currentSession.status = "active";
610
664
  });
665
+ if (audioLevel >= turnDetection.speechThreshold) {
666
+ speechDetected = true;
667
+ clearSilenceTimer();
668
+ } else if (speechDetected) {
669
+ const currentSession = await readSession();
670
+ const hasTurnText = Boolean(buildTurnText(currentSession.currentTurn.transcripts, currentSession.currentTurn.partialText));
671
+ if (hasTurnText) {
672
+ scheduleSilenceCommit();
673
+ }
674
+ }
611
675
  await adapter.send(audio);
612
676
  },
613
677
  snapshot: async () => readSession()
@@ -776,7 +840,8 @@ var voice = (config) => {
776
840
  store: config.session,
777
841
  stt: config.stt,
778
842
  turnDetection: {
779
- silenceMs: config.turnDetection?.silenceMs ?? 700
843
+ silenceMs: config.turnDetection?.silenceMs ?? 700,
844
+ speechThreshold: config.turnDetection?.speechThreshold ?? 0.015
780
845
  }
781
846
  });
782
847
  if (!current) {
@@ -811,7 +876,8 @@ var voice = (config) => {
811
876
  store: config.session,
812
877
  stt: config.stt,
813
878
  turnDetection: {
814
- silenceMs: config.turnDetection?.silenceMs ?? 700
879
+ silenceMs: config.turnDetection?.silenceMs ?? 700,
880
+ speechThreshold: config.turnDetection?.speechThreshold ?? 0.015
815
881
  }
816
882
  });
817
883
  runtime.activeSessions.set(sessionId, session);
@@ -379,7 +379,7 @@ var createVoiceStreamStore = () => {
379
379
  case "final":
380
380
  state = {
381
381
  ...state,
382
- partial: "",
382
+ partial: action.transcript.text,
383
383
  turns: state.turns.map((turn) => turn)
384
384
  };
385
385
  break;
@@ -376,7 +376,7 @@ var createVoiceStreamStore = () => {
376
376
  case "final":
377
377
  state = {
378
378
  ...state,
379
- partial: "",
379
+ partial: action.transcript.text,
380
380
  turns: state.turns.map((turn) => turn)
381
381
  };
382
382
  break;
@@ -1,3 +1,5 @@
1
- import type { Transcript } from './types';
1
+ import type { AudioChunk, Transcript } from './types';
2
2
  export declare const DEFAULT_SILENCE_MS = 700;
3
+ export declare const DEFAULT_SPEECH_THRESHOLD = 0.015;
4
+ export declare const measureAudioLevel: (audio: AudioChunk) => number;
3
5
  export declare const buildTurnText: (transcripts: Transcript[], partialText: string) => string;
package/dist/types.d.ts CHANGED
@@ -211,6 +211,7 @@ export type VoicePluginConfig<TContext = unknown, TSession extends VoiceSessionR
211
211
  reconnect?: VoiceReconnectConfig;
212
212
  turnDetection?: {
213
213
  silenceMs?: number;
214
+ speechThreshold?: number;
214
215
  };
215
216
  logger?: VoiceLogger;
216
217
  htmx?: boolean | VoiceHTMXConfig<TSession, NoInfer<TResult>>;
@@ -224,6 +225,7 @@ export type CreateVoiceSessionOptions<TContext = unknown, TSession extends Voice
224
225
  reconnect: Required<VoiceReconnectConfig>;
225
226
  turnDetection: {
226
227
  silenceMs: number;
228
+ speechThreshold: number;
227
229
  };
228
230
  route: VoiceNormalizedRouteConfig<TContext, TSession, TResult>;
229
231
  logger?: VoiceLogger;
package/dist/vue/index.js CHANGED
@@ -379,7 +379,7 @@ var createVoiceStreamStore = () => {
379
379
  case "final":
380
380
  state = {
381
381
  ...state,
382
- partial: "",
382
+ partial: action.transcript.text,
383
383
  turns: state.turns.map((turn) => turn)
384
384
  };
385
385
  break;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@absolutejs/voice",
3
- "version": "0.0.14",
3
+ "version": "0.0.16",
4
4
  "description": "Voice primitives and Elysia plugin for AbsoluteJS",
5
5
  "repository": {
6
6
  "type": "git",