@absolutejs/voice 0.0.22-beta.482 → 0.0.22-beta.484

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,22 @@
1
+ import type { CreateVoiceSessionOptions, VoiceSessionRecord } from "./types";
2
/**
 * Assistant pipeline mode. `resolveVoiceAssistantMode` yields "s2s" when a realtime
 * adapter is configured and "cascade" otherwise, unless a mode is supplied explicitly.
 */
+ export type VoiceAssistantMode = "cascade" | "s2s";
3
/**
 * Optional semantic VAD settings; every field may be omitted.
 * Referenced by the `semanticVAD` field of `RealtimeAdapterOpenOptions`.
 * NOTE(review): how each field is interpreted is not visible in this declaration —
 * confirm against the realtime adapter that consumes it.
 */
+ export type VoiceSemanticVADConfig = {
4
+ createResponseAutomatically?: boolean;
5
+ eagerness?: "auto" | "high" | "low" | "medium";
6
+ silenceDurationMs?: number;
7
+ };
8
/** Modality names accepted in `modalities` lists: "audio" or "text". */
+ export type VoiceAssistantModality = "audio" | "text";
9
/**
 * Resolves the assistant mode for a session configuration.
 *
 * An explicit `assistantMode` is returned as-is; otherwise the result is "s2s" when
 * `realtime` is set and "cascade" when it is not.
 *
 * @param options The `realtime`/`stt`/`tts` subset of the session options plus an
 *   optional `assistantMode` override.
 * @returns The resolved mode.
 */
+ export declare const resolveVoiceAssistantMode: <TContext = unknown, TSession extends VoiceSessionRecord = VoiceSessionRecord, TResult = unknown>(options: Pick<CreateVoiceSessionOptions<TContext, TSession, TResult>, "realtime" | "stt" | "tts"> & {
10
+ assistantMode?: VoiceAssistantMode;
11
+ }) => VoiceAssistantMode;
12
/**
 * Summary produced by `describeVoiceAssistantMode`: one boolean per adapter slot
 * (`realtime`, `stt`, `tts`) telling whether it was provided, the resolved mode, and
 * the de-duplicated modalities (["audio"] when none were supplied).
 */
+ export type VoiceAssistantModeDescriptor = {
13
+ hasRealtime: boolean;
14
+ hasSTT: boolean;
15
+ hasTTS: boolean;
16
+ mode: VoiceAssistantMode;
17
+ modalities: VoiceAssistantModality[];
18
+ };
19
/**
 * Describes a session configuration: which adapters are present, the resolved
 * assistant mode, and the modalities in use.
 *
 * @param options The `realtime`/`stt`/`tts` subset of the session options plus an
 *   optional `assistantMode` override and optional `modalities` list.
 * @returns A descriptor; `modalities` is de-duplicated and is ["audio"] when the
 *   option is omitted.
 */
+ export declare const describeVoiceAssistantMode: <TContext = unknown, TSession extends VoiceSessionRecord = VoiceSessionRecord, TResult = unknown>(options: Pick<CreateVoiceSessionOptions<TContext, TSession, TResult>, "realtime" | "stt" | "tts"> & {
20
+ assistantMode?: VoiceAssistantMode;
21
+ modalities?: ReadonlyArray<VoiceAssistantModality>;
22
+ }) => VoiceAssistantModeDescriptor;
package/dist/index.d.ts CHANGED
@@ -77,6 +77,10 @@ export { DEFAULT_VOICE_REDACTION_PATTERNS, createVoiceTranscriptRedactor, redact
77
77
  export type { CreateVoiceTranscriptRedactorOptions, VoiceRedactionPattern, VoiceTranscriptRedactor, } from "./redaction";
78
78
  export { DEFAULT_VOICE_PRICE_BOOK, createVoiceCostAccountant, } from "./costAccounting";
79
79
  export type { CreateVoiceCostAccountantOptions, VoiceCostAccountant, VoiceCostBreakdown, VoiceCostLLMRecord, VoiceCostSTTRecord, VoiceCostTTSRecord, VoiceCostTelephonyRecord, VoicePriceBook, VoiceProviderRates, } from "./costAccounting";
80
+ export { describeVoiceAssistantMode, resolveVoiceAssistantMode, } from "./assistantMode";
81
+ export type { VoiceAssistantMode, VoiceAssistantModality, VoiceAssistantModeDescriptor, VoiceSemanticVADConfig, } from "./assistantMode";
82
+ export { createPunctuationSemanticTurnDetector, createRegexSemanticTurnDetector, } from "./semanticTurn";
83
+ export type { CreatePunctuationSemanticTurnDetectorOptions, CreateRegexSemanticTurnDetectorOptions, VoiceSemanticTurnDetector, VoiceSemanticTurnInput, VoiceSemanticTurnVerdict, } from "./semanticTurn";
80
84
  export { createMonologueAMDDetector } from "./amdDetector";
81
85
  export type { MonologueAMDDetectorOptions, VoiceAMDDetector, VoiceAMDDetectorInput, VoiceAMDVerdict, } from "./amdDetector";
82
86
  export { createVoiceRAGTool } from "./ragTool";
package/dist/index.js CHANGED
@@ -3440,6 +3440,28 @@ var createVoiceMemoryRecordingStore = () => {
3440
3440
  };
3441
3441
  };
3442
3442
 
3443
+ // src/assistantMode.ts
3444
/**
 * Picks the assistant mode for a voice session configuration.
 *
 * A truthy `assistantMode` on the options always wins. Without one, a truthy
 * `realtime` adapter selects "s2s" and anything else falls back to "cascade".
 *
 * @param options Session options carrying `assistantMode` and/or `realtime`.
 * @returns The resolved mode string.
 */
var resolveVoiceAssistantMode = (options) =>
  options.assistantMode || (options.realtime ? "s2s" : "cascade");
3453
/**
 * Summarizes a voice session configuration.
 *
 * Reports whether the `realtime`, `stt` and `tts` options are truthy, the mode
 * chosen by `resolveVoiceAssistantMode`, and the requested modalities with
 * duplicates removed (first occurrence kept). When no `modalities` option is
 * given the list defaults to ["audio"].
 *
 * @param options Session options (adapters, optional `assistantMode`, optional `modalities`).
 * @returns The descriptor object.
 */
var describeVoiceAssistantMode = (options) => {
  const resolvedMode = resolveVoiceAssistantMode(options);
  const requested = options.modalities;
  let uniqueModalities;
  if (requested) {
    // A Set keeps insertion order, so the first occurrence of each entry survives.
    uniqueModalities = [...new Set(requested)];
  } else {
    uniqueModalities = ["audio"];
  }
  return {
    hasRealtime: !!options.realtime,
    hasSTT: !!options.stt,
    hasTTS: !!options.tts,
    modalities: uniqueModalities,
    mode: resolvedMode
  };
};
3464
+
3443
3465
  // src/session.ts
3444
3466
  var DEFAULT_RECONNECT_TIMEOUT = 30000;
3445
3467
  var DEFAULT_MAX_RECONNECT_ATTEMPTS = 10;
@@ -4786,6 +4808,18 @@ var createVoiceSession = (options) => {
4786
4808
  session,
4787
4809
  type: "turn.transcript"
4788
4810
  });
4811
+ if (options.semanticTurnDetector) {
4812
+ const verdict = await Promise.resolve(options.semanticTurnDetector.evaluate({
4813
+ lastFinalTranscript: transcript,
4814
+ partialText: session.currentTurn.partialText,
4815
+ silenceMs: session.currentTurn.silenceStartedAt !== undefined ? Date.now() - session.currentTurn.silenceStartedAt : 0,
4816
+ transcripts: session.currentTurn.transcripts
4817
+ }));
4818
+ if (verdict.endOfTurn) {
4819
+ clearSilenceTimer();
4820
+ await requestTurnCommit("vendor");
4821
+ }
4822
+ }
4789
4823
  };
4790
4824
  const resumePendingTurnCommit = (session) => {
4791
4825
  const pendingText = buildTurnText(session.currentTurn.transcripts, session.currentTurn.partialText, {
@@ -4810,13 +4844,20 @@ var createVoiceSession = (options) => {
4810
4844
  if (!inputAdapter) {
4811
4845
  throw new Error("Voice session requires either an stt or realtime adapter.");
4812
4846
  }
4813
- const openedSession = await inputAdapter.open({
4814
- format: options.realtime ? options.realtimeInputFormat ?? DEFAULT_REALTIME_FORMAT : DEFAULT_FORMAT,
4847
+ const openedSession = await (options.realtime ? options.realtime.open({
4848
+ format: options.realtimeInputFormat ?? DEFAULT_REALTIME_FORMAT,
4815
4849
  languageStrategy: options.languageStrategy,
4816
4850
  lexicon,
4851
+ modalities: options.modalities,
4817
4852
  phraseHints,
4818
4853
  sessionId: options.id
4819
- });
4854
+ }) : inputAdapter.open({
4855
+ format: DEFAULT_FORMAT,
4856
+ languageStrategy: options.languageStrategy,
4857
+ lexicon,
4858
+ phraseHints,
4859
+ sessionId: options.id
4860
+ }));
4820
4861
  const generation = ++adapterGenerationCounter;
4821
4862
  sttSession = openedSession;
4822
4863
  activeAdapterGeneration = generation;
@@ -4986,9 +5027,10 @@ var createVoiceSession = (options) => {
4986
5027
  });
4987
5028
  await appendTrace({
4988
5029
  payload: {
5030
+ assistantMode: resolveVoiceAssistantMode(options),
5031
+ realtimeConfigured: Boolean(options.realtime),
4989
5032
  text: output.assistantText,
4990
- ttsConfigured: Boolean(options.tts),
4991
- realtimeConfigured: Boolean(options.realtime)
5033
+ ttsConfigured: Boolean(options.tts)
4992
5034
  },
4993
5035
  session,
4994
5036
  turnId: turn.id,
@@ -35292,6 +35334,76 @@ var createVoiceCostAccountant = (options = {}) => {
35292
35334
  })
35293
35335
  };
35294
35336
  };
35337
+ // src/semanticTurn.ts
35338
+ var DEFAULT_END_PUNCTUATION = [".", "?", "!"];
35339
+ var DEFAULT_FILLER_WORDS = [
35340
+ "uh",
35341
+ "um",
35342
+ "er",
35343
+ "ah",
35344
+ "like",
35345
+ "you know",
35346
+ "i mean",
35347
+ "well",
35348
+ "so"
35349
+ ];
35350
+ var stripTerminalPunctuation = (text) => text.replace(/[\s.?!]+$/u, "").trim();
35351
+ var createPunctuationSemanticTurnDetector = (options = {}) => {
35352
+ const endPunctuation = options.endPunctuation ?? DEFAULT_END_PUNCTUATION;
35353
+ const fillerWords = (options.fillerWords ?? DEFAULT_FILLER_WORDS).map((word) => word.toLowerCase());
35354
+ const minPartialWords = options.minPartialWords ?? 2;
35355
+ return {
35356
+ evaluate: ({ lastFinalTranscript, partialText }) => {
35357
+ const candidate = partialText.trim().length > 0 ? partialText : lastFinalTranscript?.text ?? "";
35358
+ const trimmed = candidate.trim();
35359
+ if (!trimmed) {
35360
+ return { endOfTurn: false, reason: "empty" };
35361
+ }
35362
+ const wordCount = trimmed.split(/\s+/u).filter(Boolean).length;
35363
+ if (wordCount < minPartialWords) {
35364
+ return { endOfTurn: false, reason: "below-min-words" };
35365
+ }
35366
+ const lastChar = trimmed.at(-1);
35367
+ const endsWithTerminal = typeof lastChar === "string" && endPunctuation.includes(lastChar);
35368
+ if (!endsWithTerminal) {
35369
+ return { endOfTurn: false, reason: "no-terminal-punctuation" };
35370
+ }
35371
+ const lastWord = stripTerminalPunctuation(trimmed).split(/\s+/u).at(-1)?.toLowerCase();
35372
+ if (lastWord && fillerWords.includes(lastWord)) {
35373
+ return { endOfTurn: false, reason: "trailing-filler" };
35374
+ }
35375
+ return {
35376
+ confidence: 0.9,
35377
+ endOfTurn: true,
35378
+ reason: "terminal-punctuation"
35379
+ };
35380
+ }
35381
+ };
35382
+ };
35383
+ var createRegexSemanticTurnDetector = (options) => {
35384
+ const minPartialWords = options.minPartialWords ?? 2;
35385
+ return {
35386
+ evaluate: ({ lastFinalTranscript, partialText }) => {
35387
+ const candidate = partialText.trim().length > 0 ? partialText : lastFinalTranscript?.text ?? "";
35388
+ const trimmed = candidate.trim();
35389
+ if (!trimmed) {
35390
+ return { endOfTurn: false, reason: "empty" };
35391
+ }
35392
+ const wordCount = trimmed.split(/\s+/u).filter(Boolean).length;
35393
+ if (wordCount < minPartialWords) {
35394
+ return { endOfTurn: false, reason: "below-min-words" };
35395
+ }
35396
+ const match = options.endPattern.exec(trimmed);
35397
+ if (!match) {
35398
+ return { endOfTurn: false, reason: "pattern-miss" };
35399
+ }
35400
+ return {
35401
+ endOfTurn: true,
35402
+ reason: "pattern-match"
35403
+ };
35404
+ }
35405
+ };
35406
+ };
35295
35407
  // src/amdDetector.ts
35296
35408
  var createMonologueAMDDetector = (options = {}) => {
35297
35409
  const minMonologueMs = options.minMonologueMs ?? 8000;
@@ -45952,6 +46064,7 @@ export {
45952
46064
  resolveVoiceDiagnosticsTraceFilter,
45953
46065
  resolveVoiceAuditTrailFilter,
45954
46066
  resolveVoiceAuditDeliveryFilter,
46067
+ resolveVoiceAssistantMode,
45955
46068
  resolveVoiceAssistantMemoryNamespace,
45956
46069
  resolveTurnDetectionConfig,
45957
46070
  resolveLatestVoiceCallDebuggerSessionId,
@@ -46146,6 +46259,7 @@ export {
46146
46259
  evaluateVoiceAgentSquadContractEvidence,
46147
46260
  encodeTwilioMulawBase64,
46148
46261
  encodePcmAsWav,
46262
+ describeVoiceAssistantMode,
46149
46263
  deliverVoiceTraceEventsToSinks,
46150
46264
  deliverVoiceObservabilityExport,
46151
46265
  deliverVoiceMonitorIssueNotifications,
@@ -46489,6 +46603,8 @@ export {
46489
46603
  createStoredVoiceExternalObjectMap,
46490
46604
  createStoredVoiceCallReviewArtifact,
46491
46605
  createRiskyTurnCorrectionHandler,
46606
+ createRegexSemanticTurnDetector,
46607
+ createPunctuationSemanticTurnDetector,
46492
46608
  createPlivoVoiceRoutes,
46493
46609
  createPlivoVoiceResponse,
46494
46610
  createPlivoMediaStreamBridge,
@@ -0,0 +1,27 @@
1
+ import type { Transcript } from "./types";
2
/**
 * Input handed to a semantic turn detector's `evaluate`.
 *
 * The built-in punctuation and regex detectors read only `partialText` and, when it
 * is blank, the `text` of `lastFinalTranscript`; the other fields are available to
 * custom detectors.
 */
+ export type VoiceSemanticTurnInput = {
3
+ audioLevel?: number;
4
+ lastFinalTranscript?: Transcript;
5
+ partialText: string;
6
+ silenceMs: number;
7
+ transcripts: Transcript[];
8
+ };
9
/**
 * Result of a semantic turn evaluation.
 *
 * `endOfTurn` is the decision; `reason` and `confidence` are optional extras. Of the
 * built-in detectors, only the punctuation detector sets `confidence` (0.9 on a
 * positive verdict).
 */
+ export type VoiceSemanticTurnVerdict = {
10
+ confidence?: number;
11
+ endOfTurn: boolean;
12
+ reason?: string;
13
+ };
14
/**
 * A pluggable end-of-turn detector. `evaluate` may answer synchronously or return a
 * promise of the verdict.
 */
+ export type VoiceSemanticTurnDetector = {
15
+ evaluate: (input: VoiceSemanticTurnInput) => Promise<VoiceSemanticTurnVerdict> | VoiceSemanticTurnVerdict;
16
+ };
17
/**
 * Options for `createPunctuationSemanticTurnDetector`. All fields are optional.
 *
 * Defaults used by the implementation: `endPunctuation` is `.`, `?`, `!`;
 * `fillerWords` is a built-in list of hesitation words; `minPartialWords` is 2.
 * Filler words are compared case-insensitively.
 */
+ export type CreatePunctuationSemanticTurnDetectorOptions = {
18
+ endPunctuation?: ReadonlyArray<string>;
19
+ fillerWords?: ReadonlyArray<string>;
20
+ minPartialWords?: number;
21
+ };
22
/**
 * Creates a detector that reports end-of-turn when the utterance has enough words,
 * ends with a terminal punctuation mark, and does not end on a filler word.
 */
+ export declare const createPunctuationSemanticTurnDetector: (options?: CreatePunctuationSemanticTurnDetectorOptions) => VoiceSemanticTurnDetector;
23
/**
 * Options for `createRegexSemanticTurnDetector`.
 *
 * `endPattern` is tested against the trimmed utterance; `minPartialWords` is the
 * minimum word count required before the pattern is consulted (default 2).
 */
+ export type CreateRegexSemanticTurnDetectorOptions = {
24
+ endPattern: RegExp;
25
+ minPartialWords?: number;
26
+ };
27
/**
 * Creates a detector that reports end-of-turn when the utterance has enough words
 * and matches `options.endPattern`.
 */
+ export declare const createRegexSemanticTurnDetector: (options: CreateRegexSemanticTurnDetectorOptions) => VoiceSemanticTurnDetector;
@@ -5408,6 +5408,28 @@ var createVoiceMemoryRecordingStore = () => {
5408
5408
  };
5409
5409
  };
5410
5410
 
5411
+ // src/assistantMode.ts
5412
/**
 * Picks the assistant mode for a voice session configuration.
 *
 * A truthy `assistantMode` on the options always wins. Without one, a truthy
 * `realtime` adapter selects "s2s" and anything else falls back to "cascade".
 *
 * @param options Session options carrying `assistantMode` and/or `realtime`.
 * @returns The resolved mode string.
 */
var resolveVoiceAssistantMode = (options) =>
  options.assistantMode || (options.realtime ? "s2s" : "cascade");
5421
/**
 * Summarizes a voice session configuration.
 *
 * Reports whether the `realtime`, `stt` and `tts` options are truthy, the mode
 * chosen by `resolveVoiceAssistantMode`, and the requested modalities with
 * duplicates removed (first occurrence kept). When no `modalities` option is
 * given the list defaults to ["audio"].
 *
 * @param options Session options (adapters, optional `assistantMode`, optional `modalities`).
 * @returns The descriptor object.
 */
var describeVoiceAssistantMode = (options) => {
  const resolvedMode = resolveVoiceAssistantMode(options);
  const requested = options.modalities;
  let uniqueModalities;
  if (requested) {
    // A Set keeps insertion order, so the first occurrence of each entry survives.
    uniqueModalities = [...new Set(requested)];
  } else {
    uniqueModalities = ["audio"];
  }
  return {
    hasRealtime: !!options.realtime,
    hasSTT: !!options.stt,
    hasTTS: !!options.tts,
    modalities: uniqueModalities,
    mode: resolvedMode
  };
};
5432
+
5411
5433
  // src/session.ts
5412
5434
  var DEFAULT_RECONNECT_TIMEOUT = 30000;
5413
5435
  var DEFAULT_MAX_RECONNECT_ATTEMPTS2 = 10;
@@ -6754,6 +6776,18 @@ var createVoiceSession = (options) => {
6754
6776
  session,
6755
6777
  type: "turn.transcript"
6756
6778
  });
6779
+ if (options.semanticTurnDetector) {
6780
+ const verdict = await Promise.resolve(options.semanticTurnDetector.evaluate({
6781
+ lastFinalTranscript: transcript,
6782
+ partialText: session.currentTurn.partialText,
6783
+ silenceMs: session.currentTurn.silenceStartedAt !== undefined ? Date.now() - session.currentTurn.silenceStartedAt : 0,
6784
+ transcripts: session.currentTurn.transcripts
6785
+ }));
6786
+ if (verdict.endOfTurn) {
6787
+ clearSilenceTimer();
6788
+ await requestTurnCommit("vendor");
6789
+ }
6790
+ }
6757
6791
  };
6758
6792
  const resumePendingTurnCommit = (session) => {
6759
6793
  const pendingText = buildTurnText(session.currentTurn.transcripts, session.currentTurn.partialText, {
@@ -6778,13 +6812,20 @@ var createVoiceSession = (options) => {
6778
6812
  if (!inputAdapter) {
6779
6813
  throw new Error("Voice session requires either an stt or realtime adapter.");
6780
6814
  }
6781
- const openedSession = await inputAdapter.open({
6782
- format: options.realtime ? options.realtimeInputFormat ?? DEFAULT_REALTIME_FORMAT : DEFAULT_FORMAT,
6815
+ const openedSession = await (options.realtime ? options.realtime.open({
6816
+ format: options.realtimeInputFormat ?? DEFAULT_REALTIME_FORMAT,
6783
6817
  languageStrategy: options.languageStrategy,
6784
6818
  lexicon,
6819
+ modalities: options.modalities,
6785
6820
  phraseHints,
6786
6821
  sessionId: options.id
6787
- });
6822
+ }) : inputAdapter.open({
6823
+ format: DEFAULT_FORMAT,
6824
+ languageStrategy: options.languageStrategy,
6825
+ lexicon,
6826
+ phraseHints,
6827
+ sessionId: options.id
6828
+ }));
6788
6829
  const generation = ++adapterGenerationCounter;
6789
6830
  sttSession = openedSession;
6790
6831
  activeAdapterGeneration = generation;
@@ -6954,9 +6995,10 @@ var createVoiceSession = (options) => {
6954
6995
  });
6955
6996
  await appendTrace({
6956
6997
  payload: {
6998
+ assistantMode: resolveVoiceAssistantMode(options),
6999
+ realtimeConfigured: Boolean(options.realtime),
6957
7000
  text: output.assistantText,
6958
- ttsConfigured: Boolean(options.tts),
6959
- realtimeConfigured: Boolean(options.realtime)
7001
+ ttsConfigured: Boolean(options.tts)
6960
7002
  },
6961
7003
  session,
6962
7004
  turnId: turn.id,
package/dist/types.d.ts CHANGED
@@ -194,7 +194,10 @@ export type RealtimeAdapterOpenOptions = {
194
194
  format: AudioFormat;
195
195
  languageStrategy?: VoiceLanguageStrategy;
196
196
  lexicon?: VoiceLexiconEntry[];
197
+ modalities?: ReadonlyArray<"audio" | "text">;
197
198
  phraseHints?: VoicePhraseHint[];
199
+ promptCacheKey?: string;
200
+ semanticVAD?: import("./assistantMode").VoiceSemanticVADConfig;
198
201
  signal?: AbortSignal;
199
202
  };
200
203
  export type RealtimeAdapter<TOptions extends RealtimeAdapterOpenOptions = RealtimeAdapterOpenOptions> = {
@@ -731,6 +734,9 @@ export type CreateVoiceSessionOptions<TContext = unknown, TSession extends Voice
731
734
  provider?: string;
732
735
  };
733
736
  redact?: import("./redaction").VoiceTranscriptRedactor;
737
+ semanticTurnDetector?: import("./semanticTurn").VoiceSemanticTurnDetector;
738
+ assistantMode?: import("./assistantMode").VoiceAssistantMode;
739
+ modalities?: ReadonlyArray<"audio" | "text">;
734
740
  reconnect: Required<VoiceReconnectConfig>;
735
741
  phraseHints?: VoicePhraseHint[];
736
742
  sessionMetadata?: Record<string, unknown>;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@absolutejs/voice",
3
- "version": "0.0.22-beta.482",
3
+ "version": "0.0.22-beta.484",
4
4
  "description": "Voice primitives and Elysia plugin for AbsoluteJS",
5
5
  "repository": {
6
6
  "type": "git",