@absolutejs/voice 0.0.22-beta.482 → 0.0.22-beta.484
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/assistantMode.d.ts +22 -0
- package/dist/index.d.ts +4 -0
- package/dist/index.js +121 -5
- package/dist/semanticTurn.d.ts +27 -0
- package/dist/testing/index.js +47 -5
- package/dist/types.d.ts +6 -0
- package/package.json +1 -1
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import type { CreateVoiceSessionOptions, VoiceSessionRecord } from "./types";
|
|
2
|
+
/**
 * Pipeline mode of a voice assistant.
 *
 * `resolveVoiceAssistantMode` returns "s2s" when a realtime adapter is
 * configured and "cascade" otherwise, unless the caller sets the mode
 * explicitly. NOTE(review): "s2s" presumably stands for speech-to-speech and
 * "cascade" for an STT -> LLM -> TTS chain; confirm against the package docs.
 */
export type VoiceAssistantMode = "cascade" | "s2s";
|
|
3
|
+
/**
 * Configuration object accepted as the optional `semanticVAD` field of
 * `RealtimeAdapterOpenOptions` (see the types.d.ts hunk of this diff).
 *
 * NOTE(review): nothing in this diff reads these fields, so their exact
 * meaning is adapter-defined. The names suggest:
 * - `createResponseAutomatically`: whether the provider should start a
 *   response on its own once it detects end of speech (TODO confirm).
 * - `eagerness`: how quickly the provider declares end of turn (TODO confirm).
 * - `silenceDurationMs`: silence, in milliseconds, required before end of
 *   turn (TODO confirm).
 */
export type VoiceSemanticVADConfig = {
|
|
4
|
+
createResponseAutomatically?: boolean;
|
|
5
|
+
eagerness?: "auto" | "high" | "low" | "medium";
|
|
6
|
+
silenceDurationMs?: number;
|
|
7
|
+
};
|
|
8
|
+
/**
 * A modality an assistant session can use. `describeVoiceAssistantMode`
 * reports `["audio"]` when the caller does not pass any modalities.
 */
export type VoiceAssistantModality = "audio" | "text";
|
|
9
|
+
/**
 * Resolves the assistant mode for a set of session options.
 *
 * Resolution order (per the bundled implementation in index.js):
 * 1. a truthy `options.assistantMode` is returned as-is;
 * 2. otherwise "s2s" when `options.realtime` is truthy;
 * 3. otherwise "cascade".
 *
 * `stt` and `tts` are accepted in the options type but do not influence the
 * result.
 *
 * @param options Session options subset plus an optional explicit mode.
 * @returns The resolved assistant mode.
 */
export declare const resolveVoiceAssistantMode: <TContext = unknown, TSession extends VoiceSessionRecord = VoiceSessionRecord, TResult = unknown>(options: Pick<CreateVoiceSessionOptions<TContext, TSession, TResult>, "realtime" | "stt" | "tts"> & {
|
|
10
|
+
assistantMode?: VoiceAssistantMode;
|
|
11
|
+
}) => VoiceAssistantMode;
|
|
12
|
+
/**
 * Summary returned by `describeVoiceAssistantMode`.
 *
 * - `hasRealtime` / `hasSTT` / `hasTTS`: `Boolean(...)` of the corresponding
 *   `realtime` / `stt` / `tts` option.
 * - `mode`: the value produced by `resolveVoiceAssistantMode` for the same
 *   options.
 * - `modalities`: the caller's modalities with duplicates removed, or
 *   `["audio"]` when none were supplied.
 */
export type VoiceAssistantModeDescriptor = {
|
|
13
|
+
hasRealtime: boolean;
|
|
14
|
+
hasSTT: boolean;
|
|
15
|
+
hasTTS: boolean;
|
|
16
|
+
mode: VoiceAssistantMode;
|
|
17
|
+
modalities: VoiceAssistantModality[];
|
|
18
|
+
};
|
|
19
|
+
/**
 * Builds a `VoiceAssistantModeDescriptor` for a set of session options.
 *
 * The mode is resolved with `resolveVoiceAssistantMode`; the adapter flags
 * are the truthiness of `realtime`, `stt` and `tts`; `modalities` is
 * de-duplicated (first occurrence order) and defaults to `["audio"]` when the
 * option is omitted.
 *
 * @param options Session options subset plus optional mode and modalities.
 * @returns A plain descriptor object; the input is not modified.
 */
export declare const describeVoiceAssistantMode: <TContext = unknown, TSession extends VoiceSessionRecord = VoiceSessionRecord, TResult = unknown>(options: Pick<CreateVoiceSessionOptions<TContext, TSession, TResult>, "realtime" | "stt" | "tts"> & {
|
|
20
|
+
assistantMode?: VoiceAssistantMode;
|
|
21
|
+
modalities?: ReadonlyArray<VoiceAssistantModality>;
|
|
22
|
+
}) => VoiceAssistantModeDescriptor;
|
package/dist/index.d.ts
CHANGED
|
@@ -77,6 +77,10 @@ export { DEFAULT_VOICE_REDACTION_PATTERNS, createVoiceTranscriptRedactor, redact
|
|
|
77
77
|
export type { CreateVoiceTranscriptRedactorOptions, VoiceRedactionPattern, VoiceTranscriptRedactor, } from "./redaction";
|
|
78
78
|
export { DEFAULT_VOICE_PRICE_BOOK, createVoiceCostAccountant, } from "./costAccounting";
|
|
79
79
|
export type { CreateVoiceCostAccountantOptions, VoiceCostAccountant, VoiceCostBreakdown, VoiceCostLLMRecord, VoiceCostSTTRecord, VoiceCostTTSRecord, VoiceCostTelephonyRecord, VoicePriceBook, VoiceProviderRates, } from "./costAccounting";
|
|
80
|
+
export { describeVoiceAssistantMode, resolveVoiceAssistantMode, } from "./assistantMode";
|
|
81
|
+
export type { VoiceAssistantMode, VoiceAssistantModality, VoiceAssistantModeDescriptor, VoiceSemanticVADConfig, } from "./assistantMode";
|
|
82
|
+
export { createPunctuationSemanticTurnDetector, createRegexSemanticTurnDetector, } from "./semanticTurn";
|
|
83
|
+
export type { CreatePunctuationSemanticTurnDetectorOptions, CreateRegexSemanticTurnDetectorOptions, VoiceSemanticTurnDetector, VoiceSemanticTurnInput, VoiceSemanticTurnVerdict, } from "./semanticTurn";
|
|
80
84
|
export { createMonologueAMDDetector } from "./amdDetector";
|
|
81
85
|
export type { MonologueAMDDetectorOptions, VoiceAMDDetector, VoiceAMDDetectorInput, VoiceAMDVerdict, } from "./amdDetector";
|
|
82
86
|
export { createVoiceRAGTool } from "./ragTool";
|
package/dist/index.js
CHANGED
|
@@ -3440,6 +3440,28 @@ var createVoiceMemoryRecordingStore = () => {
|
|
|
3440
3440
|
};
|
|
3441
3441
|
};
|
|
3442
3442
|
|
|
3443
|
+
// src/assistantMode.ts
|
|
3444
|
+
/**
 * Resolves the assistant mode for a voice session.
 *
 * An explicitly configured `assistantMode` always wins. Without one, the
 * presence of a realtime adapter selects "s2s"; everything else falls back
 * to "cascade".
 *
 * @param options Session options (`assistantMode`, `realtime` are read).
 * @returns "cascade" or "s2s" (or whatever truthy `assistantMode` was given).
 */
var resolveVoiceAssistantMode = (options) =>
  // `||` keeps the original laziness: `realtime` is only consulted when no
  // explicit mode was provided.
  options.assistantMode || (options.realtime ? "s2s" : "cascade");
|
|
3453
|
+
/**
 * Describes how a voice session is configured.
 *
 * @param options Session options; `realtime`, `stt`, `tts`, `assistantMode`
 *   and `modalities` are read.
 * @returns A descriptor with adapter-presence flags, the resolved mode and
 *   the de-duplicated modalities (`["audio"]` when none were supplied).
 */
var describeVoiceAssistantMode = (options) => {
  const descriptor = {
    hasRealtime: !!options.realtime,
    hasSTT: !!options.stt,
    hasTTS: !!options.tts,
    // Default used when the caller does not specify modalities at all.
    modalities: ["audio"],
    mode: resolveVoiceAssistantMode(options)
  };
  if (options.modalities) {
    // A Set drops duplicates while keeping first-occurrence order.
    descriptor.modalities = [...new Set(options.modalities)];
  }
  return descriptor;
};
|
|
3464
|
+
|
|
3443
3465
|
// src/session.ts
|
|
3444
3466
|
var DEFAULT_RECONNECT_TIMEOUT = 30000;
|
|
3445
3467
|
var DEFAULT_MAX_RECONNECT_ATTEMPTS = 10;
|
|
@@ -4786,6 +4808,18 @@ var createVoiceSession = (options) => {
|
|
|
4786
4808
|
session,
|
|
4787
4809
|
type: "turn.transcript"
|
|
4788
4810
|
});
|
|
4811
|
+
if (options.semanticTurnDetector) {
|
|
4812
|
+
const verdict = await Promise.resolve(options.semanticTurnDetector.evaluate({
|
|
4813
|
+
lastFinalTranscript: transcript,
|
|
4814
|
+
partialText: session.currentTurn.partialText,
|
|
4815
|
+
silenceMs: session.currentTurn.silenceStartedAt !== undefined ? Date.now() - session.currentTurn.silenceStartedAt : 0,
|
|
4816
|
+
transcripts: session.currentTurn.transcripts
|
|
4817
|
+
}));
|
|
4818
|
+
if (verdict.endOfTurn) {
|
|
4819
|
+
clearSilenceTimer();
|
|
4820
|
+
await requestTurnCommit("vendor");
|
|
4821
|
+
}
|
|
4822
|
+
}
|
|
4789
4823
|
};
|
|
4790
4824
|
const resumePendingTurnCommit = (session) => {
|
|
4791
4825
|
const pendingText = buildTurnText(session.currentTurn.transcripts, session.currentTurn.partialText, {
|
|
@@ -4810,13 +4844,20 @@ var createVoiceSession = (options) => {
|
|
|
4810
4844
|
if (!inputAdapter) {
|
|
4811
4845
|
throw new Error("Voice session requires either an stt or realtime adapter.");
|
|
4812
4846
|
}
|
|
4813
|
-
const openedSession = await
|
|
4814
|
-
format: options.
|
|
4847
|
+
const openedSession = await (options.realtime ? options.realtime.open({
|
|
4848
|
+
format: options.realtimeInputFormat ?? DEFAULT_REALTIME_FORMAT,
|
|
4815
4849
|
languageStrategy: options.languageStrategy,
|
|
4816
4850
|
lexicon,
|
|
4851
|
+
modalities: options.modalities,
|
|
4817
4852
|
phraseHints,
|
|
4818
4853
|
sessionId: options.id
|
|
4819
|
-
})
|
|
4854
|
+
}) : inputAdapter.open({
|
|
4855
|
+
format: DEFAULT_FORMAT,
|
|
4856
|
+
languageStrategy: options.languageStrategy,
|
|
4857
|
+
lexicon,
|
|
4858
|
+
phraseHints,
|
|
4859
|
+
sessionId: options.id
|
|
4860
|
+
}));
|
|
4820
4861
|
const generation = ++adapterGenerationCounter;
|
|
4821
4862
|
sttSession = openedSession;
|
|
4822
4863
|
activeAdapterGeneration = generation;
|
|
@@ -4986,9 +5027,10 @@ var createVoiceSession = (options) => {
|
|
|
4986
5027
|
});
|
|
4987
5028
|
await appendTrace({
|
|
4988
5029
|
payload: {
|
|
5030
|
+
assistantMode: resolveVoiceAssistantMode(options),
|
|
5031
|
+
realtimeConfigured: Boolean(options.realtime),
|
|
4989
5032
|
text: output.assistantText,
|
|
4990
|
-
ttsConfigured: Boolean(options.tts)
|
|
4991
|
-
realtimeConfigured: Boolean(options.realtime)
|
|
5033
|
+
ttsConfigured: Boolean(options.tts)
|
|
4992
5034
|
},
|
|
4993
5035
|
session,
|
|
4994
5036
|
turnId: turn.id,
|
|
@@ -35292,6 +35334,76 @@ var createVoiceCostAccountant = (options = {}) => {
|
|
|
35292
35334
|
})
|
|
35293
35335
|
};
|
|
35294
35336
|
};
|
|
35337
|
+
// src/semanticTurn.ts
|
|
35338
|
+
/** Sentence-ending characters used when `options.endPunctuation` is omitted. */
var DEFAULT_END_PUNCTUATION = [".", "?", "!"];
/**
 * Hesitation words/phrases used when `options.fillerWords` is omitted.
 * A candidate that ends with one of these is not treated as a finished turn.
 */
var DEFAULT_FILLER_WORDS = [
  "uh",
  "um",
  "er",
  "ah",
  "like",
  "you know",
  "i mean",
  "well",
  "so"
];
/** Removes trailing whitespace and `.`, `?`, `!` characters from `text`. */
var stripTerminalPunctuation = (text) => text.replace(/[\s.?!]+$/u, "").trim();
/**
 * Creates a turn detector that reports end-of-turn when the current text ends
 * with terminal punctuation and does not trail off into a filler phrase.
 *
 * The candidate text is `partialText` when it is non-blank, otherwise the text
 * of `lastFinalTranscript` (or the empty string).
 *
 * Verdicts, in evaluation order:
 * - `"empty"`: candidate is blank;
 * - `"below-min-words"`: fewer than `minPartialWords` whitespace-separated words;
 * - `"no-terminal-punctuation"`: last character is not in `endPunctuation`;
 * - `"trailing-filler"`: the text (without trailing punctuation) ends with a
 *   filler word or phrase;
 * - `"terminal-punctuation"`: end of turn, confidence 0.9.
 *
 * @param options Optional `endPunctuation`, `fillerWords`, `minPartialWords`
 *   (default 2). `undefined`/`null` selects all defaults.
 * @returns A detector object with a synchronous `evaluate` method.
 */
var createPunctuationSemanticTurnDetector = (options) => {
  const endPunctuation = options?.endPunctuation ?? DEFAULT_END_PUNCTUATION;
  // Normalise fillers once: lower-case, trimmed, single spaces between words,
  // so multi-word fillers such as "you know" compare reliably.
  const fillerPhrases = (options?.fillerWords ?? DEFAULT_FILLER_WORDS)
    .map((phrase) => phrase.toLowerCase().trim().replace(/\s+/gu, " "))
    .filter(Boolean);
  const minPartialWords = options?.minPartialWords ?? 2;
  // Strips the default `.?!` characters AND any caller-configured terminal
  // characters; otherwise custom punctuation stays glued to the last word
  // and the filler check can never match.
  const stripTrailingPunctuation = (text) => {
    let stripped = stripTerminalPunctuation(text);
    while (stripped.length > 0 && endPunctuation.includes(stripped[stripped.length - 1])) {
      stripped = stripTerminalPunctuation(stripped.slice(0, -1));
    }
    return stripped;
  };
  // Matches fillers on word boundaries at the end of the text, so both
  // single-word ("um") and multi-word ("you know") fillers are detected.
  const endsWithFiller = (text) => {
    const normalized = text.toLowerCase().replace(/\s+/gu, " ");
    return fillerPhrases.some((phrase) => normalized === phrase || normalized.endsWith(` ${phrase}`));
  };
  return {
    evaluate: ({ lastFinalTranscript, partialText }) => {
      const candidate = partialText.trim().length > 0 ? partialText : lastFinalTranscript?.text ?? "";
      const trimmed = candidate.trim();
      if (!trimmed) {
        return { endOfTurn: false, reason: "empty" };
      }
      const wordCount = trimmed.split(/\s+/u).filter(Boolean).length;
      if (wordCount < minPartialWords) {
        return { endOfTurn: false, reason: "below-min-words" };
      }
      if (!endPunctuation.includes(trimmed[trimmed.length - 1])) {
        return { endOfTurn: false, reason: "no-terminal-punctuation" };
      }
      if (endsWithFiller(stripTrailingPunctuation(trimmed))) {
        return { endOfTurn: false, reason: "trailing-filler" };
      }
      return {
        confidence: 0.9,
        endOfTurn: true,
        reason: "terminal-punctuation"
      };
    }
  };
};
|
|
35383
|
+
/**
 * Creates a turn detector that reports end-of-turn when the current text
 * matches a caller-supplied regular expression.
 *
 * The candidate text is `partialText` when it is non-blank, otherwise the text
 * of `lastFinalTranscript` (or the empty string).
 *
 * Verdicts, in evaluation order:
 * - `"empty"`: candidate is blank;
 * - `"below-min-words"`: fewer than `minPartialWords` whitespace-separated words;
 * - `"pattern-miss"`: `endPattern` does not match the trimmed candidate;
 * - `"pattern-match"`: end of turn.
 *
 * @param options `endPattern` (required) and optional `minPartialWords`
 *   (default 2).
 * @returns A detector object with a synchronous `evaluate` method.
 */
var createRegexSemanticTurnDetector = (options) => {
  const minPartialWords = options.minPartialWords ?? 2;
  // Work on a private copy so evaluations never touch the caller's
  // `lastIndex` (and are not disturbed by the caller using the same regex).
  const endPattern = new RegExp(options.endPattern);
  return {
    evaluate: ({ lastFinalTranscript, partialText }) => {
      const candidate = partialText.trim().length > 0 ? partialText : lastFinalTranscript?.text ?? "";
      const trimmed = candidate.trim();
      if (!trimmed) {
        return { endOfTurn: false, reason: "empty" };
      }
      const wordCount = trimmed.split(/\s+/u).filter(Boolean).length;
      if (wordCount < minPartialWords) {
        return { endOfTurn: false, reason: "below-min-words" };
      }
      // With the `g` or `y` flag a regex resumes from `lastIndex`; reset it so
      // every evaluation scans from the start and identical input always
      // yields the same verdict.
      endPattern.lastIndex = 0;
      if (!endPattern.test(trimmed)) {
        return { endOfTurn: false, reason: "pattern-miss" };
      }
      return {
        endOfTurn: true,
        reason: "pattern-match"
      };
    }
  };
};
|
|
35295
35407
|
// src/amdDetector.ts
|
|
35296
35408
|
var createMonologueAMDDetector = (options = {}) => {
|
|
35297
35409
|
const minMonologueMs = options.minMonologueMs ?? 8000;
|
|
@@ -45952,6 +46064,7 @@ export {
|
|
|
45952
46064
|
resolveVoiceDiagnosticsTraceFilter,
|
|
45953
46065
|
resolveVoiceAuditTrailFilter,
|
|
45954
46066
|
resolveVoiceAuditDeliveryFilter,
|
|
46067
|
+
resolveVoiceAssistantMode,
|
|
45955
46068
|
resolveVoiceAssistantMemoryNamespace,
|
|
45956
46069
|
resolveTurnDetectionConfig,
|
|
45957
46070
|
resolveLatestVoiceCallDebuggerSessionId,
|
|
@@ -46146,6 +46259,7 @@ export {
|
|
|
46146
46259
|
evaluateVoiceAgentSquadContractEvidence,
|
|
46147
46260
|
encodeTwilioMulawBase64,
|
|
46148
46261
|
encodePcmAsWav,
|
|
46262
|
+
describeVoiceAssistantMode,
|
|
46149
46263
|
deliverVoiceTraceEventsToSinks,
|
|
46150
46264
|
deliverVoiceObservabilityExport,
|
|
46151
46265
|
deliverVoiceMonitorIssueNotifications,
|
|
@@ -46489,6 +46603,8 @@ export {
|
|
|
46489
46603
|
createStoredVoiceExternalObjectMap,
|
|
46490
46604
|
createStoredVoiceCallReviewArtifact,
|
|
46491
46605
|
createRiskyTurnCorrectionHandler,
|
|
46606
|
+
createRegexSemanticTurnDetector,
|
|
46607
|
+
createPunctuationSemanticTurnDetector,
|
|
46492
46608
|
createPlivoVoiceRoutes,
|
|
46493
46609
|
createPlivoVoiceResponse,
|
|
46494
46610
|
createPlivoMediaStreamBridge,
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
import type { Transcript } from "./types";
|
|
2
|
+
/**
 * Snapshot of the current turn handed to a `VoiceSemanticTurnDetector`.
 *
 * As populated by the session code visible in this diff (index.js, after a
 * "turn.transcript" event is appended):
 * - `lastFinalTranscript`: the transcript that triggered the evaluation;
 * - `partialText`: `session.currentTurn.partialText`;
 * - `silenceMs`: `Date.now() - session.currentTurn.silenceStartedAt`, or `0`
 *   when no silence start is recorded;
 * - `transcripts`: `session.currentTurn.transcripts`.
 *
 * `audioLevel` is not set by that call site. NOTE(review): its unit/range is
 * not visible here — confirm before relying on it.
 */
export type VoiceSemanticTurnInput = {
|
|
3
|
+
audioLevel?: number;
|
|
4
|
+
lastFinalTranscript?: Transcript;
|
|
5
|
+
partialText: string;
|
|
6
|
+
silenceMs: number;
|
|
7
|
+
transcripts: Transcript[];
|
|
8
|
+
};
|
|
9
|
+
/**
 * Result of a semantic turn evaluation.
 *
 * - `endOfTurn`: when `true`, the session code visible in this diff clears its
 *   silence timer and requests a turn commit with reason "vendor".
 * - `confidence`: optional score; the bundled punctuation detector sets `0.9`
 *   on a positive verdict. The visible session code does not read it.
 * - `reason`: optional machine-readable explanation, e.g. "empty",
 *   "below-min-words", "no-terminal-punctuation", "trailing-filler",
 *   "terminal-punctuation", "pattern-miss", "pattern-match" from the bundled
 *   detectors.
 */
export type VoiceSemanticTurnVerdict = {
|
|
10
|
+
confidence?: number;
|
|
11
|
+
endOfTurn: boolean;
|
|
12
|
+
reason?: string;
|
|
13
|
+
};
|
|
14
|
+
/**
 * Pluggable end-of-turn detector, supplied to a session through the
 * `semanticTurnDetector` option.
 *
 * `evaluate` may return its verdict synchronously or as a promise; the session
 * wraps the call in `Promise.resolve(...)` and awaits it. The visible call
 * site has no try/catch around the evaluation, so a throwing or rejecting
 * detector propagates to the transcript handler — implementations should
 * handle their own failures.
 */
export type VoiceSemanticTurnDetector = {
|
|
15
|
+
evaluate: (input: VoiceSemanticTurnInput) => Promise<VoiceSemanticTurnVerdict> | VoiceSemanticTurnVerdict;
|
|
16
|
+
};
|
|
17
|
+
/**
 * Options for `createPunctuationSemanticTurnDetector`.
 *
 * - `endPunctuation`: characters that count as sentence-ending; the check is
 *   made against the single last character of the text. Default
 *   `[".", "?", "!"]`.
 * - `fillerWords`: hesitation words that suppress an end-of-turn verdict when
 *   the text ends with one (compared lower-cased). Default: "uh", "um", "er",
 *   "ah", "like", "you know", "i mean", "well", "so".
 * - `minPartialWords`: minimum number of whitespace-separated words required
 *   before a positive verdict is possible. Default `2`.
 */
export type CreatePunctuationSemanticTurnDetectorOptions = {
|
|
18
|
+
endPunctuation?: ReadonlyArray<string>;
|
|
19
|
+
fillerWords?: ReadonlyArray<string>;
|
|
20
|
+
minPartialWords?: number;
|
|
21
|
+
};
|
|
22
|
+
/**
 * Creates a detector that reports end-of-turn when the current text (the
 * non-blank `partialText`, else the last final transcript's text) has at
 * least `minPartialWords` words, ends with terminal punctuation and does not
 * end with a filler word.
 *
 * @param options Optional overrides; omitted fields use the defaults.
 * @returns A detector whose `evaluate` returns synchronously.
 */
export declare const createPunctuationSemanticTurnDetector: (options?: CreatePunctuationSemanticTurnDetectorOptions) => VoiceSemanticTurnDetector;
|
|
23
|
+
/**
 * Options for `createRegexSemanticTurnDetector`.
 *
 * - `endPattern`: regular expression matched against the trimmed candidate
 *   text; a match means end of turn. Prefer patterns without the `g`/`y`
 *   flags, since those flags make `RegExp.prototype.exec` stateful across
 *   calls.
 * - `minPartialWords`: minimum number of whitespace-separated words required
 *   before the pattern is consulted. Default `2`.
 */
export type CreateRegexSemanticTurnDetectorOptions = {
|
|
24
|
+
endPattern: RegExp;
|
|
25
|
+
minPartialWords?: number;
|
|
26
|
+
};
|
|
27
|
+
/**
 * Creates a detector that reports end-of-turn ("pattern-match") when the
 * current text (the non-blank `partialText`, else the last final transcript's
 * text) has at least `minPartialWords` words and matches `endPattern`.
 *
 * @param options Pattern and optional minimum word count.
 * @returns A detector whose `evaluate` returns synchronously.
 */
export declare const createRegexSemanticTurnDetector: (options: CreateRegexSemanticTurnDetectorOptions) => VoiceSemanticTurnDetector;
|
package/dist/testing/index.js
CHANGED
|
@@ -5408,6 +5408,28 @@ var createVoiceMemoryRecordingStore = () => {
|
|
|
5408
5408
|
};
|
|
5409
5409
|
};
|
|
5410
5410
|
|
|
5411
|
+
// src/assistantMode.ts
|
|
5412
|
+
/**
 * Resolves the assistant mode for a voice session.
 *
 * An explicitly configured `assistantMode` always wins. Without one, the
 * presence of a realtime adapter selects "s2s"; everything else falls back
 * to "cascade".
 *
 * @param options Session options (`assistantMode`, `realtime` are read).
 * @returns "cascade" or "s2s" (or whatever truthy `assistantMode` was given).
 */
var resolveVoiceAssistantMode = (options) =>
  // `||` keeps the original laziness: `realtime` is only consulted when no
  // explicit mode was provided.
  options.assistantMode || (options.realtime ? "s2s" : "cascade");
|
|
5421
|
+
/**
 * Describes how a voice session is configured.
 *
 * @param options Session options; `realtime`, `stt`, `tts`, `assistantMode`
 *   and `modalities` are read.
 * @returns A descriptor with adapter-presence flags, the resolved mode and
 *   the de-duplicated modalities (`["audio"]` when none were supplied).
 */
var describeVoiceAssistantMode = (options) => {
  const descriptor = {
    hasRealtime: !!options.realtime,
    hasSTT: !!options.stt,
    hasTTS: !!options.tts,
    // Default used when the caller does not specify modalities at all.
    modalities: ["audio"],
    mode: resolveVoiceAssistantMode(options)
  };
  if (options.modalities) {
    // A Set drops duplicates while keeping first-occurrence order.
    descriptor.modalities = [...new Set(options.modalities)];
  }
  return descriptor;
};
|
|
5432
|
+
|
|
5411
5433
|
// src/session.ts
|
|
5412
5434
|
var DEFAULT_RECONNECT_TIMEOUT = 30000;
|
|
5413
5435
|
var DEFAULT_MAX_RECONNECT_ATTEMPTS2 = 10;
|
|
@@ -6754,6 +6776,18 @@ var createVoiceSession = (options) => {
|
|
|
6754
6776
|
session,
|
|
6755
6777
|
type: "turn.transcript"
|
|
6756
6778
|
});
|
|
6779
|
+
if (options.semanticTurnDetector) {
|
|
6780
|
+
const verdict = await Promise.resolve(options.semanticTurnDetector.evaluate({
|
|
6781
|
+
lastFinalTranscript: transcript,
|
|
6782
|
+
partialText: session.currentTurn.partialText,
|
|
6783
|
+
silenceMs: session.currentTurn.silenceStartedAt !== undefined ? Date.now() - session.currentTurn.silenceStartedAt : 0,
|
|
6784
|
+
transcripts: session.currentTurn.transcripts
|
|
6785
|
+
}));
|
|
6786
|
+
if (verdict.endOfTurn) {
|
|
6787
|
+
clearSilenceTimer();
|
|
6788
|
+
await requestTurnCommit("vendor");
|
|
6789
|
+
}
|
|
6790
|
+
}
|
|
6757
6791
|
};
|
|
6758
6792
|
const resumePendingTurnCommit = (session) => {
|
|
6759
6793
|
const pendingText = buildTurnText(session.currentTurn.transcripts, session.currentTurn.partialText, {
|
|
@@ -6778,13 +6812,20 @@ var createVoiceSession = (options) => {
|
|
|
6778
6812
|
if (!inputAdapter) {
|
|
6779
6813
|
throw new Error("Voice session requires either an stt or realtime adapter.");
|
|
6780
6814
|
}
|
|
6781
|
-
const openedSession = await
|
|
6782
|
-
format: options.
|
|
6815
|
+
const openedSession = await (options.realtime ? options.realtime.open({
|
|
6816
|
+
format: options.realtimeInputFormat ?? DEFAULT_REALTIME_FORMAT,
|
|
6783
6817
|
languageStrategy: options.languageStrategy,
|
|
6784
6818
|
lexicon,
|
|
6819
|
+
modalities: options.modalities,
|
|
6785
6820
|
phraseHints,
|
|
6786
6821
|
sessionId: options.id
|
|
6787
|
-
})
|
|
6822
|
+
}) : inputAdapter.open({
|
|
6823
|
+
format: DEFAULT_FORMAT,
|
|
6824
|
+
languageStrategy: options.languageStrategy,
|
|
6825
|
+
lexicon,
|
|
6826
|
+
phraseHints,
|
|
6827
|
+
sessionId: options.id
|
|
6828
|
+
}));
|
|
6788
6829
|
const generation = ++adapterGenerationCounter;
|
|
6789
6830
|
sttSession = openedSession;
|
|
6790
6831
|
activeAdapterGeneration = generation;
|
|
@@ -6954,9 +6995,10 @@ var createVoiceSession = (options) => {
|
|
|
6954
6995
|
});
|
|
6955
6996
|
await appendTrace({
|
|
6956
6997
|
payload: {
|
|
6998
|
+
assistantMode: resolveVoiceAssistantMode(options),
|
|
6999
|
+
realtimeConfigured: Boolean(options.realtime),
|
|
6957
7000
|
text: output.assistantText,
|
|
6958
|
-
ttsConfigured: Boolean(options.tts)
|
|
6959
|
-
realtimeConfigured: Boolean(options.realtime)
|
|
7001
|
+
ttsConfigured: Boolean(options.tts)
|
|
6960
7002
|
},
|
|
6961
7003
|
session,
|
|
6962
7004
|
turnId: turn.id,
|
package/dist/types.d.ts
CHANGED
|
@@ -194,7 +194,10 @@ export type RealtimeAdapterOpenOptions = {
|
|
|
194
194
|
format: AudioFormat;
|
|
195
195
|
languageStrategy?: VoiceLanguageStrategy;
|
|
196
196
|
lexicon?: VoiceLexiconEntry[];
|
|
197
|
+
modalities?: ReadonlyArray<"audio" | "text">;
|
|
197
198
|
phraseHints?: VoicePhraseHint[];
|
|
199
|
+
promptCacheKey?: string;
|
|
200
|
+
semanticVAD?: import("./assistantMode").VoiceSemanticVADConfig;
|
|
198
201
|
signal?: AbortSignal;
|
|
199
202
|
};
|
|
200
203
|
export type RealtimeAdapter<TOptions extends RealtimeAdapterOpenOptions = RealtimeAdapterOpenOptions> = {
|
|
@@ -731,6 +734,9 @@ export type CreateVoiceSessionOptions<TContext = unknown, TSession extends Voice
|
|
|
731
734
|
provider?: string;
|
|
732
735
|
};
|
|
733
736
|
redact?: import("./redaction").VoiceTranscriptRedactor;
|
|
737
|
+
semanticTurnDetector?: import("./semanticTurn").VoiceSemanticTurnDetector;
|
|
738
|
+
assistantMode?: import("./assistantMode").VoiceAssistantMode;
|
|
739
|
+
modalities?: ReadonlyArray<"audio" | "text">;
|
|
734
740
|
reconnect: Required<VoiceReconnectConfig>;
|
|
735
741
|
phraseHints?: VoicePhraseHint[];
|
|
736
742
|
sessionMetadata?: Record<string, unknown>;
|