@absolutejs/voice 0.0.19 → 0.0.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +387 -4
- package/dist/angular/index.d.ts +1 -0
- package/dist/angular/index.js +669 -3
- package/dist/angular/voice-controller.service.d.ts +21 -0
- package/dist/audioConditioning.d.ts +3 -0
- package/dist/client/actions.d.ts +7 -0
- package/dist/client/connection.d.ts +5 -0
- package/dist/client/controller.d.ts +2 -0
- package/dist/client/htmxBootstrap.js +576 -167
- package/dist/client/index.d.ts +1 -0
- package/dist/client/index.js +486 -3
- package/dist/client/microphone.d.ts +4 -2
- package/dist/correction.d.ts +16 -0
- package/dist/index.d.ts +4 -0
- package/dist/index.js +1314 -283
- package/dist/presets.d.ts +13 -0
- package/dist/react/index.d.ts +1 -0
- package/dist/react/index.js +642 -3
- package/dist/react/useVoiceController.d.ts +20 -0
- package/dist/react/useVoiceStream.d.ts +1 -0
- package/dist/store.d.ts +2 -2
- package/dist/svelte/index.d.ts +1 -0
- package/dist/svelte/index.js +607 -3
- package/dist/testing/benchmark.d.ts +36 -0
- package/dist/testing/fixtures.d.ts +1 -0
- package/dist/testing/index.d.ts +2 -0
- package/dist/testing/index.js +1975 -4
- package/dist/testing/resilience.d.ts +20 -0
- package/dist/testing/sessionBenchmark.d.ts +126 -0
- package/dist/testing/stt.d.ts +1 -0
- package/dist/turnDetection.d.ts +5 -1
- package/dist/turnProfiles.d.ts +6 -0
- package/dist/types.d.ts +198 -8
- package/dist/vue/index.d.ts +1 -0
- package/dist/vue/index.js +660 -3
- package/dist/vue/useVoiceController.d.ts +19 -0
- package/fixtures/README.md +24 -0
- package/fixtures/manifest.json +127 -0
- package/fixtures/pcm/dialogue-three-clean.pcm +0 -0
- package/fixtures/pcm/dialogue-three-mixed.pcm +0 -0
- package/fixtures/pcm/dialogue-two-clean.pcm +0 -0
- package/fixtures/pcm/dialogue-two-noisy.pcm +0 -0
- package/fixtures/pcm/multiturn-three-mixed.pcm +0 -0
- package/fixtures/pcm/multiturn-two-clean.pcm +0 -0
- package/fixtures/pcm/stella-bulgaria-bulgarian20.pcm +0 -0
- package/fixtures/pcm/stella-jamaica-jamaican-creole-english1.pcm +0 -0
- package/fixtures/pcm/stella-liberia-liberian-pidgin-english2.pcm +0 -0
- package/fixtures/pcm/stella-sierra-leone-krio5.pcm +0 -0
- package/package.json +25 -1
package/dist/index.js
CHANGED
|
@@ -69,6 +69,61 @@ var __decorateElement = (array, flags, name, decorators, target, extra) => {
|
|
|
69
69
|
return k || __decoratorMetadata(array, target), desc && __defProp(target, name, desc), p ? k ^ 4 ? extra : desc : target;
|
|
70
70
|
};
|
|
71
71
|
|
|
72
|
+
// src/audioConditioning.ts
|
|
73
|
+
// Default RMS level (on the normalized scale produced by computeRms) that conditioning steers a chunk toward.
var DEFAULT_TARGET_LEVEL = 0.08;
|
|
74
|
+
// Default upper bound on the gain computed for a chunk.
var DEFAULT_MAX_GAIN = 3;
|
|
75
|
+
// Default RMS level below which a chunk is treated as gated (attenuated) audio.
var DEFAULT_NOISE_GATE_THRESHOLD = 0.006;
|
|
76
|
+
// Default multiplier applied to chunks whose RMS falls below the noise-gate threshold.
var DEFAULT_NOISE_GATE_ATTENUATION = 0.15;
|
|
77
|
+
/**
 * Interprets raw audio bytes as signed 16-bit samples.
 *
 * @param {ArrayBuffer | ArrayBufferView} audio - Raw audio bytes.
 * @returns {Int16Array} Samples covering the first `floor(byteLength / 2)` byte pairs;
 *   a trailing odd byte is ignored. The result shares memory with `audio` whenever the
 *   bytes are 2-byte aligned, otherwise it is backed by a copy.
 */
var toInt16Array = (audio) => {
  if (audio instanceof ArrayBuffer) {
    return new Int16Array(audio, 0, Math.floor(audio.byteLength / 2));
  }
  const sampleCount = Math.floor(audio.byteLength / 2);
  if (audio.byteOffset % Int16Array.BYTES_PER_ELEMENT === 0) {
    return new Int16Array(audio.buffer, audio.byteOffset, sampleCount);
  }
  // The Int16Array constructor throws a RangeError for an odd byte offset, which
  // happens for byte views sliced out of a larger buffer. Copy the bytes so the
  // samples start on an aligned boundary.
  const start = audio.byteOffset;
  const alignedCopy = audio.buffer.slice(start, start + sampleCount * Int16Array.BYTES_PER_ELEMENT);
  return new Int16Array(alignedCopy);
};
|
|
83
|
+
/**
 * Computes the root-mean-square level of 16-bit samples after scaling each by 1/32768.
 *
 * @param {Int16Array} samples - Signed 16-bit samples.
 * @returns {number} RMS of the scaled samples, or 0 for an empty input.
 */
var computeRms = (samples) => {
  const count = samples.length;
  if (count === 0) {
    return 0;
  }
  let energy = 0;
  for (let position = 0; position < count; position += 1) {
    const scaled = samples[position] / 32768;
    energy += scaled * scaled;
  }
  return Math.sqrt(energy / count);
};
|
|
94
|
+
/**
 * Fills in audio-conditioning defaults.
 *
 * @param {object} [config] - Partial conditioning options.
 * @returns {object | undefined} A fully populated config with `enabled: true`, or
 *   `undefined` when `config` is missing or has `enabled === false`.
 */
var resolveAudioConditioningConfig = (config) => {
  const isDisabled = !config || config.enabled === false;
  if (isDisabled) {
    return undefined;
  }
  const { maxGain, noiseGateAttenuation, noiseGateThreshold, targetLevel } = config;
  return {
    enabled: true,
    maxGain: maxGain ?? DEFAULT_MAX_GAIN,
    noiseGateAttenuation: noiseGateAttenuation ?? DEFAULT_NOISE_GATE_ATTENUATION,
    noiseGateThreshold: noiseGateThreshold ?? DEFAULT_NOISE_GATE_THRESHOLD,
    targetLevel: targetLevel ?? DEFAULT_TARGET_LEVEL
  };
};
|
|
106
|
+
/**
 * Applies gain and noise gating to one chunk of 16-bit audio.
 *
 * The chunk's RMS decides whether the gate factor is applied (RMS below
 * `noiseGateThreshold`). The gain is `targetLevel / (rms * gateFactor)`, capped at
 * `maxGain`, floored at 0.25, and then multiplied by the gate factor. Each sample is
 * scaled, rounded and clamped to the signed 16-bit range.
 *
 * @param {ArrayBuffer | ArrayBufferView} audio - Raw audio bytes.
 * @param {object} [config] - Resolved conditioning config; falsy disables conditioning.
 * @returns {ArrayBuffer | ArrayBufferView | Uint8Array} `audio` unchanged when there is
 *   no config or no samples, otherwise a new Uint8Array over the conditioned samples.
 */
var conditionAudioChunk = (audio, config) => {
  if (!config) {
    return audio;
  }
  const samples = toInt16Array(audio);
  if (samples.length === 0) {
    return audio;
  }
  const level = computeRms(samples);
  const isGated = level < config.noiseGateThreshold;
  const gateFactor = isGated ? config.noiseGateAttenuation : 1;
  // Floor the reference level so silent or fully gated chunks never divide by zero.
  const referenceLevel = Math.max(level * gateFactor, 0.000001);
  const cappedGain = Math.min(config.maxGain, config.targetLevel / referenceLevel);
  const appliedGain = Math.max(0.25, cappedGain) * gateFactor;
  const conditioned = Int16Array.from(samples, (sample) => {
    const scaled = Math.round(sample * appliedGain);
    return Math.max(-32768, Math.min(32767, scaled));
  });
  return new Uint8Array(conditioned.buffer);
};
|
|
126
|
+
|
|
72
127
|
// src/plugin.ts
|
|
73
128
|
import { Elysia } from "elysia";
|
|
74
129
|
import { resolve } from "path";
|
|
@@ -118,6 +173,10 @@ var defaultMetrics = (input) => {
|
|
|
118
173
|
'<span class="voice-metric-label">Session</span>',
|
|
119
174
|
`<span class="voice-metric-value">${escapeHtml(input.sessionId)}</span>`,
|
|
120
175
|
"</div>",
|
|
176
|
+
input.session?.scenarioId ? `<div class="voice-metric">
|
|
177
|
+
<span class="voice-metric-label">Scenario</span>
|
|
178
|
+
<span class="voice-metric-value">${escapeHtml(input.session.scenarioId)}</span>
|
|
179
|
+
</div>` : "",
|
|
121
180
|
'<div class="voice-metric">',
|
|
122
181
|
'<span class="voice-metric-label">Status</span>',
|
|
123
182
|
`<span class="voice-metric-value">${escapeHtml(input.status)}</span>`,
|
|
@@ -207,24 +266,245 @@ var resolveLogger = (logger) => ({
|
|
|
207
266
|
...logger
|
|
208
267
|
});
|
|
209
268
|
|
|
269
|
+
// src/turnProfiles.ts
|
|
270
|
+
// Baseline turn-detection timings for each turn profile.
var TURN_PROFILE_DEFAULTS = {
  balanced: {
    qualityProfile: "general",
    silenceMs: 1400,
    speechThreshold: 0.012,
    transcriptStabilityMs: 1000
  },
  fast: {
    qualityProfile: "general",
    silenceMs: 700,
    speechThreshold: 0.015,
    transcriptStabilityMs: 450
  },
  "long-form": {
    qualityProfile: "general",
    silenceMs: 2200,
    speechThreshold: 0.01,
    transcriptStabilityMs: 1500
  }
};
// Per-quality-profile overrides; any value present here wins over the turn profile's value.
var QUALITY_PROFILE_DEFAULTS = {
  general: {},
  "accent-heavy": {
    silenceMs: 1200,
    speechThreshold: 0.01,
    transcriptStabilityMs: 1200
  },
  "noisy-room": {
    silenceMs: 2000,
    speechThreshold: 0.02,
    transcriptStabilityMs: 1600
  },
  "short-command": {
    silenceMs: 500,
    speechThreshold: 0.016,
    transcriptStabilityMs: 420
  }
};
var DEFAULT_TURN_PROFILE = "fast";
var DEFAULT_QUALITY_PROFILE = "general";
/**
 * Resolves turn-detection settings from explicit values, the quality profile and the
 * turn profile, in that order of precedence.
 *
 * Profile tables are consulted with own-property lookups. A name that is not a key of
 * its table (including inherited names such as "constructor") uses the default
 * profile's values instead of dereferencing `undefined`. The `profile` and
 * `qualityProfile` fields of the result echo the requested names unchanged.
 *
 * @param {object} [config] - Optional `profile`, `qualityProfile`, `silenceMs`,
 *   `speechThreshold` and `transcriptStabilityMs`.
 * @returns {{profile: string, qualityProfile: string, silenceMs: number,
 *   speechThreshold: number, transcriptStabilityMs: number}} Resolved settings.
 */
var resolveTurnDetectionConfig = (config) => {
  const profile = config?.profile ?? DEFAULT_TURN_PROFILE;
  const qualityProfile = config?.qualityProfile ?? DEFAULT_QUALITY_PROFILE;
  const preset = Object.hasOwn(TURN_PROFILE_DEFAULTS, profile)
    ? TURN_PROFILE_DEFAULTS[profile]
    : TURN_PROFILE_DEFAULTS[DEFAULT_TURN_PROFILE];
  const quality = Object.hasOwn(QUALITY_PROFILE_DEFAULTS, qualityProfile)
    ? QUALITY_PROFILE_DEFAULTS[qualityProfile]
    : QUALITY_PROFILE_DEFAULTS[DEFAULT_QUALITY_PROFILE];
  return {
    profile,
    qualityProfile,
    silenceMs: config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs,
    speechThreshold: config?.speechThreshold ?? quality.speechThreshold ?? preset.speechThreshold,
    transcriptStabilityMs: config?.transcriptStabilityMs ?? quality.transcriptStabilityMs ?? preset.transcriptStabilityMs
  };
};
|
|
323
|
+
|
|
324
|
+
// src/presets.ts
|
|
325
|
+
var PRESET_INPUTS = {
|
|
326
|
+
chat: {
|
|
327
|
+
audioConditioning: {
|
|
328
|
+
enabled: true,
|
|
329
|
+
maxGain: 2.5,
|
|
330
|
+
noiseGateAttenuation: 0,
|
|
331
|
+
noiseGateThreshold: 0.004,
|
|
332
|
+
targetLevel: 0.08
|
|
333
|
+
},
|
|
334
|
+
capture: {
|
|
335
|
+
channelCount: 1,
|
|
336
|
+
sampleRateHz: 16000
|
|
337
|
+
},
|
|
338
|
+
connection: {
|
|
339
|
+
maxReconnectAttempts: 10,
|
|
340
|
+
pingInterval: 30000,
|
|
341
|
+
reconnect: true
|
|
342
|
+
},
|
|
343
|
+
sttLifecycle: "continuous",
|
|
344
|
+
turnDetection: {
|
|
345
|
+
qualityProfile: "short-command",
|
|
346
|
+
profile: "balanced"
|
|
347
|
+
}
|
|
348
|
+
},
|
|
349
|
+
default: {
|
|
350
|
+
capture: {
|
|
351
|
+
channelCount: 1,
|
|
352
|
+
sampleRateHz: 16000
|
|
353
|
+
},
|
|
354
|
+
connection: {
|
|
355
|
+
maxReconnectAttempts: 10,
|
|
356
|
+
pingInterval: 30000,
|
|
357
|
+
reconnect: true
|
|
358
|
+
},
|
|
359
|
+
sttLifecycle: "continuous",
|
|
360
|
+
turnDetection: {
|
|
361
|
+
qualityProfile: "general",
|
|
362
|
+
profile: "fast"
|
|
363
|
+
}
|
|
364
|
+
},
|
|
365
|
+
dictation: {
|
|
366
|
+
audioConditioning: {
|
|
367
|
+
enabled: true,
|
|
368
|
+
maxGain: 2.25,
|
|
369
|
+
noiseGateAttenuation: 0.05,
|
|
370
|
+
noiseGateThreshold: 0.003,
|
|
371
|
+
targetLevel: 0.08
|
|
372
|
+
},
|
|
373
|
+
capture: {
|
|
374
|
+
channelCount: 1,
|
|
375
|
+
sampleRateHz: 16000
|
|
376
|
+
},
|
|
377
|
+
connection: {
|
|
378
|
+
maxReconnectAttempts: 12,
|
|
379
|
+
pingInterval: 30000,
|
|
380
|
+
reconnect: true
|
|
381
|
+
},
|
|
382
|
+
sttLifecycle: "continuous",
|
|
383
|
+
turnDetection: {
|
|
384
|
+
qualityProfile: "accent-heavy",
|
|
385
|
+
profile: "long-form"
|
|
386
|
+
}
|
|
387
|
+
},
|
|
388
|
+
"guided-intake": {
|
|
389
|
+
audioConditioning: {
|
|
390
|
+
enabled: true,
|
|
391
|
+
maxGain: 2.5,
|
|
392
|
+
noiseGateAttenuation: 0,
|
|
393
|
+
noiseGateThreshold: 0.004,
|
|
394
|
+
targetLevel: 0.08
|
|
395
|
+
},
|
|
396
|
+
capture: {
|
|
397
|
+
channelCount: 1,
|
|
398
|
+
sampleRateHz: 16000
|
|
399
|
+
},
|
|
400
|
+
connection: {
|
|
401
|
+
maxReconnectAttempts: 12,
|
|
402
|
+
pingInterval: 30000,
|
|
403
|
+
reconnect: true
|
|
404
|
+
},
|
|
405
|
+
sttLifecycle: "turn-scoped",
|
|
406
|
+
turnDetection: {
|
|
407
|
+
qualityProfile: "accent-heavy",
|
|
408
|
+
profile: "long-form"
|
|
409
|
+
}
|
|
410
|
+
},
|
|
411
|
+
"noisy-room": {
|
|
412
|
+
audioConditioning: {
|
|
413
|
+
enabled: true,
|
|
414
|
+
maxGain: 3,
|
|
415
|
+
noiseGateAttenuation: 0.12,
|
|
416
|
+
noiseGateThreshold: 0.006,
|
|
417
|
+
targetLevel: 0.085
|
|
418
|
+
},
|
|
419
|
+
capture: {
|
|
420
|
+
channelCount: 1,
|
|
421
|
+
sampleRateHz: 16000
|
|
422
|
+
},
|
|
423
|
+
connection: {
|
|
424
|
+
maxReconnectAttempts: 14,
|
|
425
|
+
pingInterval: 45000,
|
|
426
|
+
reconnect: true
|
|
427
|
+
},
|
|
428
|
+
sttLifecycle: "continuous",
|
|
429
|
+
turnDetection: {
|
|
430
|
+
qualityProfile: "noisy-room",
|
|
431
|
+
profile: "long-form",
|
|
432
|
+
silenceMs: 2100,
|
|
433
|
+
speechThreshold: 0.02,
|
|
434
|
+
transcriptStabilityMs: 1650
|
|
435
|
+
}
|
|
436
|
+
},
|
|
437
|
+
reliability: {
|
|
438
|
+
audioConditioning: {
|
|
439
|
+
enabled: true,
|
|
440
|
+
maxGain: 2.9,
|
|
441
|
+
noiseGateAttenuation: 0.08,
|
|
442
|
+
noiseGateThreshold: 0.005,
|
|
443
|
+
targetLevel: 0.08
|
|
444
|
+
},
|
|
445
|
+
capture: {
|
|
446
|
+
channelCount: 1,
|
|
447
|
+
sampleRateHz: 16000
|
|
448
|
+
},
|
|
449
|
+
connection: {
|
|
450
|
+
maxReconnectAttempts: 14,
|
|
451
|
+
pingInterval: 45000,
|
|
452
|
+
reconnect: true
|
|
453
|
+
},
|
|
454
|
+
sttLifecycle: "continuous",
|
|
455
|
+
turnDetection: {
|
|
456
|
+
qualityProfile: "noisy-room",
|
|
457
|
+
profile: "long-form"
|
|
458
|
+
}
|
|
459
|
+
}
|
|
460
|
+
};
|
|
461
|
+
/**
 * Expands a named runtime preset into a fully resolved configuration.
 *
 * @param {string} [name="default"] - Key of PRESET_INPUTS.
 * @returns {object} Resolved `audioConditioning`, `capture`, `connection`, `name`,
 *   `sttLifecycle` and `turnDetection` settings.
 * @throws {TypeError} When `name` is not an own key of PRESET_INPUTS. Previously this
 *   surfaced as an opaque property-access TypeError, or silently produced a bogus
 *   config for inherited names such as "constructor".
 */
var resolveVoiceRuntimePreset = (name = "default") => {
  if (!Object.hasOwn(PRESET_INPUTS, name)) {
    throw new TypeError(`Unknown voice runtime preset "${String(name)}". Expected one of: ${Object.keys(PRESET_INPUTS).join(", ")}`);
  }
  const preset = PRESET_INPUTS[name];
  return {
    audioConditioning: resolveAudioConditioningConfig(preset.audioConditioning),
    capture: {
      channelCount: preset.capture?.channelCount ?? 1,
      sampleRateHz: preset.capture?.sampleRateHz ?? 16000
    },
    connection: {
      ...preset.connection
    },
    name,
    sttLifecycle: preset.sttLifecycle ?? "continuous",
    turnDetection: resolveTurnDetectionConfig(preset.turnDetection)
  };
};
|
|
477
|
+
|
|
210
478
|
// src/store.ts
|
|
211
479
|
var createId = () => crypto.randomUUID();
|
|
212
|
-
var createVoiceSessionRecord = (id) => ({
|
|
480
|
+
/**
 * Builds a new, active session record with an empty current turn.
 *
 * @param {string} id - Session identifier stored on the record.
 * @param {string} [scenarioId] - Scenario identifier stored on the record.
 * @returns {object} Session record stamped with the current time in `createdAt`.
 */
var createVoiceSessionRecord = (id, scenarioId) => {
  const createdAt = Date.now();
  const currentTurn = {
    finalText: "",
    lastSpeechAt: undefined,
    lastTranscriptAt: undefined,
    partialEndedAt: undefined,
    partialStartedAt: undefined,
    partialText: "",
    silenceStartedAt: undefined,
    transcripts: []
  };
  const lastCommittedTurn = {
    committedAt: 0,
    signature: "",
    text: "",
    transcriptIds: []
  };
  return {
    committedTurnIds: [],
    createdAt,
    currentTurn,
    id,
    scenarioId,
    reconnect: { attempts: 0 },
    status: "active",
    transcripts: [],
    turns: [],
    lastCommittedTurn
  };
};
|
|
226
|
-
var resetVoiceSessionRecord = (id, existing) => ({
|
|
227
|
-
...createVoiceSessionRecord(id),
|
|
506
|
+
/**
 * Creates a fresh session record that keeps only the `metadata` of an existing one.
 *
 * @param {string} id - Session identifier.
 * @param {object} [existing] - Previous record; only its `metadata` is carried over.
 * @param {string} [scenarioId] - Scenario identifier for the new record.
 * @returns {object} New session record.
 */
var resetVoiceSessionRecord = (id, existing, scenarioId) => {
  const fresh = createVoiceSessionRecord(id, scenarioId);
  return { ...fresh, metadata: existing?.metadata };
};
|
|
230
510
|
var toVoiceSessionSummary = (session) => ({
|
|
@@ -261,6 +541,61 @@ var measureAudioLevel = (audio) => {
|
|
|
261
541
|
return Math.sqrt(sumSquares / samples.length);
|
|
262
542
|
};
|
|
263
543
|
// Trims the value and collapses every run of whitespace into a single space.
var normalizeText = (value) => value.trim().split(/\s+/).join(" ");
|
|
544
|
+
// Counts whitespace-delimited words. Splitting on a single space over-counted input that
// was not already normalized (" " counted as 2 words, "a  b" as 3); empty tokens are now ignored.
var countWords = (value) => value.trim().split(/\s+/).filter(Boolean).length;
|
|
545
|
+
/**
 * Chooses between two transcript texts after normalizing both.
 *
 * An empty side loses to the other; a text that contains the other wins; otherwise the
 * text with strictly more words wins, with ties going to the current text.
 *
 * @param {string} currentText - Text already held.
 * @param {string} nextText - Newly received text.
 * @returns {string} The preferred, normalized text.
 */
var selectPreferredTranscriptText = (currentText, nextText) => {
  const existing = normalizeText(currentText);
  const incoming = normalizeText(nextText);
  if (!existing || !incoming) {
    return existing || incoming;
  }
  // `includes` also covers the case where both texts are identical.
  if (existing.includes(incoming)) {
    return existing;
  }
  if (incoming.includes(existing)) {
    return incoming;
  }
  return countWords(incoming) > countWords(existing) ? incoming : existing;
};
|
|
565
|
+
/**
 * Joins two consecutive transcript texts, dropping the longest run of words that ends
 * the first text and also begins the second.
 *
 * @param {string} currentText - Earlier text.
 * @param {string} nextText - Later text.
 * @returns {string} Normalized, merged text.
 */
var mergeSequentialTranscriptText = (currentText, nextText) => {
  const head = normalizeText(currentText);
  const tail = normalizeText(nextText);
  if (!head || !tail) {
    return head || tail;
  }
  const headWords = head.split(" ");
  const tailWords = tail.split(" ");
  let overlap = Math.min(headWords.length, tailWords.length);
  while (overlap > 0) {
    const suffixStart = headWords.length - overlap;
    const suffixMatchesPrefix = tailWords
      .slice(0, overlap)
      .every((word, offset) => word === headWords[suffixStart + offset]);
    if (suffixMatchesPrefix) {
      return headWords.concat(tailWords.slice(overlap)).join(" ");
    }
    overlap -= 1;
  }
  return `${head} ${tail}`.trim();
};
|
|
586
|
+
/**
 * Counts how many leading words two texts share after normalization.
 *
 * @param {string} currentText - First text.
 * @param {string} nextText - Second text.
 * @returns {number} Length of the common word prefix.
 */
var countCommonPrefixWords = (currentText, nextText) => {
  const toWords = (text) => normalizeText(text).split(" ").filter(Boolean);
  const leftWords = toWords(currentText);
  const rightWords = toWords(nextText);
  // The first position where the right side runs out or the words differ.
  const firstDifference = leftWords.findIndex(
    (word, position) => position >= rightWords.length || word !== rightWords[position]
  );
  return firstDifference === -1 ? leftWords.length : firstDifference;
};
|
|
264
599
|
var mergeTranscriptTexts = (transcripts) => {
|
|
265
600
|
const merged = [];
|
|
266
601
|
for (const transcript of transcripts) {
|
|
@@ -284,24 +619,141 @@ var mergeTranscriptTexts = (transcripts) => {
|
|
|
284
619
|
}
|
|
285
620
|
return merged.join(" ").trim();
|
|
286
621
|
};
|
|
287
|
-
var buildTurnText = (transcripts, partialText) => {
|
|
622
|
+
/**
 * Produces the text of the current turn from its transcripts and the pending partial.
 *
 * The partial is appended to the merged transcript text when it starts at least 250 ms
 * after the most recent transcript that has a numeric `endedAtMs` and shares no leading
 * words with the merged text. In every other case the preferred one of the two texts
 * is returned.
 *
 * @param {Array<object>} transcripts - Transcripts collected for the turn.
 * @param {string} partialText - Latest partial text.
 * @param {{partialStartedAtMs?: number}} [options] - Timing of the partial.
 * @returns {string} Text for the turn.
 */
var buildTurnText = (transcripts, partialText, options = {}) => {
  const finalText = mergeTranscriptTexts(transcripts);
  const nextPartial = normalizeText(partialText);
  let lastFinalEndedAtMs;
  for (let position = transcripts.length - 1; position >= 0; position -= 1) {
    const endedAtMs = transcripts[position].endedAtMs;
    if (typeof endedAtMs === "number") {
      lastFinalEndedAtMs = endedAtMs;
      break;
    }
  }
  const { partialStartedAtMs } = options;
  const partialStartsAfterGap =
    typeof lastFinalEndedAtMs === "number" &&
    typeof partialStartedAtMs === "number" &&
    partialStartedAtMs - lastFinalEndedAtMs >= 250;
  if (finalText && nextPartial && partialStartsAfterGap && countCommonPrefixWords(finalText, nextPartial) === 0) {
    return mergeSequentialTranscriptText(finalText, nextPartial);
  }
  return selectPreferredTranscriptText(finalText, nextPartial);
};
|
|
294
631
|
|
|
295
632
|
// src/session.ts
|
|
296
633
|
var DEFAULT_RECONNECT_TIMEOUT = 30000;
|
|
297
634
|
var DEFAULT_MAX_RECONNECT_ATTEMPTS = 10;
|
|
635
|
+
var DEFAULT_TRANSCRIPT_STABILITY_MS = 450;
|
|
636
|
+
var DEFAULT_FALLBACK_REPLAY_MS = 8000;
|
|
637
|
+
var DEFAULT_FALLBACK_SETTLE_MS = 220;
|
|
638
|
+
var DEFAULT_FALLBACK_COMPLETION_TIMEOUT_MS = 2500;
|
|
639
|
+
var DEFAULT_FALLBACK_CONFIDENCE_THRESHOLD = 0.6;
|
|
640
|
+
var DEFAULT_FALLBACK_MIN_TEXT_LENGTH = 2;
|
|
641
|
+
var DEFAULT_FALLBACK_MAX_ATTEMPTS_PER_TURN = 1;
|
|
642
|
+
var DEFAULT_DUPLICATE_TURN_WINDOW_MS = 5000;
|
|
643
|
+
// Confidence lead one candidate must hold over the other before selectBetterTurnText decides on confidence.
var FALLBACK_CONFIDENCE_SELECTION_DELTA = 0.05;
|
|
644
|
+
// Relative word-count difference (delta / larger count) at which selectBetterTurnText picks the longer text outright.
var FALLBACK_WORD_COUNT_SELECTION_MARGIN_RATIO = 0.12;
|
|
645
|
+
var DEFAULT_FORMAT = {
|
|
646
|
+
channels: 1,
|
|
647
|
+
container: "raw",
|
|
648
|
+
encoding: "pcm_s16le",
|
|
649
|
+
sampleRateHz: 16000
|
|
650
|
+
};
|
|
298
651
|
// Returns Error instances untouched and wraps any other value in an Error built from its string form.
var toError = (value) => {
  if (value instanceof Error) {
    return value;
  }
  return new Error(String(value));
};
|
|
299
652
|
var createEmptyCurrentTurn = () => ({
|
|
300
653
|
finalText: "",
|
|
654
|
+
lastSpeechAt: undefined,
|
|
655
|
+
lastTranscriptAt: undefined,
|
|
656
|
+
partialEndedAt: undefined,
|
|
657
|
+
partialStartedAt: undefined,
|
|
301
658
|
partialText: "",
|
|
659
|
+
silenceStartedAt: undefined,
|
|
302
660
|
transcripts: []
|
|
303
661
|
});
|
|
304
662
|
// Shallow-copies a transcript so later changes to the copy's top-level fields leave the source untouched.
var cloneTranscript = (transcript) => {
  const copy = { ...transcript };
  return copy;
};
|
|
663
|
+
// Counts the runs of non-whitespace characters in the text.
var countWords2 = (text) => (text.match(/\S+/g) ?? []).length;
|
|
664
|
+
// Collapses every run of whitespace into a single space and strips the ends.
var normalizeText2 = (text) => text.replace(/\s+/g, " ").trim();
|
|
665
|
+
// Converts a chunk's byte length to milliseconds using DEFAULT_FORMAT's sample rate and
// channel count at 2 bytes per sample.
// NOTE(review): assumes every chunk is encoded as DEFAULT_FORMAT — confirm against callers.
var getAudioChunkDurationMs = (chunk) => {
  const bytesPerSecond = DEFAULT_FORMAT.sampleRateHz * DEFAULT_FORMAT.channels * 2;
  return chunk.byteLength / bytesPerSecond * 1000;
};
|
|
666
|
+
// Adds up the durations of all buffered chunks, in milliseconds.
var getBufferedAudioDurationMs = (chunks) => {
  let totalMs = 0;
  for (const chunk of chunks) {
    totalMs = totalMs + getAudioChunkDurationMs(chunk);
  }
  return totalMs;
};
|
|
667
|
+
/**
 * Averages the numeric `confidence` values found on the transcripts.
 *
 * @param {Array<object>} transcripts - Transcripts; entries without a numeric
 *   `confidence` are ignored.
 * @returns {number} Mean confidence, or 0 when no transcript carries one.
 */
var calculateMeanConfidence = (transcripts) => {
  const scores = transcripts
    .filter((transcript) => typeof transcript.confidence === "number")
    .map((transcript) => transcript.confidence);
  if (scores.length === 0) {
    return 0;
  }
  const sum = scores.reduce((running, score) => running + score, 0);
  return sum / scores.length;
};
|
|
681
|
+
/**
 * Summarizes the transcripts selected for a turn.
 *
 * @param {Array<object>} transcripts - Transcripts selected for the turn.
 * @param {*} source - Stored as `source`.
 * @param {*} fallbackUsed - Stored as `fallbackUsed`.
 * @param {*} fallbackDiagnostics - Stored as `fallback`.
 * @param {*} correctionDiagnostics - Stored as `correction`.
 * @returns {object} Counts of final/partial transcripts, the number of confidence
 *   samples and their average (`undefined` without samples), plus the pass-through fields.
 */
var createTurnQuality = (transcripts, source, fallbackUsed, fallbackDiagnostics, correctionDiagnostics) => {
  let confidenceTotal = 0;
  let confidenceSampleCount = 0;
  let finalTranscriptCount = 0;
  for (const transcript of transcripts) {
    if (typeof transcript.confidence === "number") {
      confidenceTotal += transcript.confidence;
      confidenceSampleCount += 1;
    }
    if (transcript.isFinal) {
      finalTranscriptCount += 1;
    }
  }
  const averageConfidence = confidenceSampleCount > 0 ? confidenceTotal / confidenceSampleCount : undefined;
  return {
    averageConfidence,
    confidenceSampleCount,
    correction: correctionDiagnostics,
    fallback: fallbackDiagnostics,
    fallbackUsed,
    finalTranscriptCount,
    partialTranscriptCount: transcripts.length - finalTranscriptCount,
    selectedTranscriptCount: transcripts.length,
    source
  };
};
|
|
696
|
+
// Normalizes correction text with the same whitespace rules as normalizeText2.
var normalizeCorrectionText = (text) => {
  return normalizeText2(text);
};
|
|
697
|
+
/**
 * Decides whether the fallback speech-to-text pass should run for a turn candidate.
 *
 * - "always": always true.
 * - "empty-turn": true when the candidate has fewer words than `minTextLength`.
 * - "low-confidence": true when the mean confidence is above 0 and below
 *   `confidenceThreshold`.
 * - any other trigger: true when either of the two conditions above holds.
 *
 * NOTE(review): `minTextLength` is compared against a word count, not a character
 * count — confirm that this matches the option's documented meaning.
 *
 * @param {{text: string, transcripts: Array<object>}} candidate - Primary result.
 * @param {{trigger: string, minTextLength: number, confidenceThreshold: number}} config
 * @returns {boolean} Whether a fallback attempt is warranted.
 */
var isFallbackNeeded = (candidate, config) => {
  const wordCount = countWords2(normalizeText2(candidate.text));
  switch (config.trigger) {
    case "always":
      return true;
    case "empty-turn":
      return wordCount < config.minTextLength;
    default:
      break;
  }
  const meanConfidence = calculateMeanConfidence(candidate.transcripts);
  const hasLowConfidence = meanConfidence > 0 && meanConfidence < config.confidenceThreshold;
  if (config.trigger === "low-confidence") {
    return hasLowConfidence;
  }
  return hasLowConfidence || wordCount < config.minTextLength;
};
|
|
712
|
+
/**
 * Picks between the primary turn candidate and the fallback candidate.
 *
 * Checks run in order: an empty fallback keeps the primary; an empty primary takes the
 * fallback; a word-count difference of at least
 * FALLBACK_WORD_COUNT_SELECTION_MARGIN_RATIO takes the longer text; a confidence lead
 * greater than FALLBACK_CONFIDENCE_SELECTION_DELTA takes the more confident one; a
 * fallback with more words wins the tiebreak; otherwise the primary is kept.
 *
 * @param {{text: string, wordCount: number, confidence: number}} candidate - Primary.
 * @param {{text: string, wordCount: number, confidence: number}} fallback - Fallback.
 * @returns {{reason: string, winner: object}} The selected candidate and why.
 */
var selectBetterTurnText = (candidate, fallback) => {
  const decide = (reason, winner) => ({ reason, winner });
  if (!fallback.text) {
    return decide("fallback-empty", candidate);
  }
  if (!candidate.text) {
    return decide("primary-empty", fallback);
  }
  const wordCountDelta = fallback.wordCount - candidate.wordCount;
  const largestWordCount = Math.max(candidate.wordCount, fallback.wordCount, 1);
  const wordCountDeltaRatio = Math.abs(wordCountDelta) / largestWordCount;
  if (wordCountDeltaRatio >= FALLBACK_WORD_COUNT_SELECTION_MARGIN_RATIO && wordCountDelta !== 0) {
    return decide("word-count-margin", wordCountDelta > 0 ? fallback : candidate);
  }
  if (fallback.confidence > candidate.confidence + FALLBACK_CONFIDENCE_SELECTION_DELTA) {
    return decide("confidence-margin", fallback);
  }
  if (candidate.confidence > fallback.confidence + FALLBACK_CONFIDENCE_SELECTION_DELTA) {
    return decide("kept-primary", candidate);
  }
  return fallback.wordCount > candidate.wordCount
    ? decide("word-count-tiebreak", fallback)
    : decide("kept-primary", candidate);
};
|
|
305
757
|
var setTurnResult = (session, turnId, input) => {
|
|
306
758
|
session.turns = session.turns.map((turn) => turn.id === turnId ? {
|
|
307
759
|
...turn,
|
|
@@ -318,12 +770,55 @@ var createVoiceSession = (options) => {
|
|
|
318
770
|
};
|
|
319
771
|
const turnDetection = {
|
|
320
772
|
silenceMs: options.turnDetection.silenceMs ?? DEFAULT_SILENCE_MS,
|
|
321
|
-
speechThreshold: options.turnDetection.speechThreshold ?? DEFAULT_SPEECH_THRESHOLD
|
|
773
|
+
speechThreshold: options.turnDetection.speechThreshold ?? DEFAULT_SPEECH_THRESHOLD,
|
|
774
|
+
transcriptStabilityMs: options.turnDetection.transcriptStabilityMs ?? DEFAULT_TRANSCRIPT_STABILITY_MS
|
|
322
775
|
};
|
|
776
|
+
const sttFallback = options.sttFallback ? {
|
|
777
|
+
adapter: options.sttFallback.adapter,
|
|
778
|
+
completionTimeoutMs: options.sttFallback.completionTimeoutMs ?? DEFAULT_FALLBACK_COMPLETION_TIMEOUT_MS,
|
|
779
|
+
confidenceThreshold: options.sttFallback.confidenceThreshold ?? DEFAULT_FALLBACK_CONFIDENCE_THRESHOLD,
|
|
780
|
+
maxAttemptsPerTurn: options.sttFallback.maxAttemptsPerTurn ?? DEFAULT_FALLBACK_MAX_ATTEMPTS_PER_TURN,
|
|
781
|
+
minTextLength: options.sttFallback.minTextLength ?? DEFAULT_FALLBACK_MIN_TEXT_LENGTH,
|
|
782
|
+
replayWindowMs: options.sttFallback.replayWindowMs ?? DEFAULT_FALLBACK_REPLAY_MS,
|
|
783
|
+
settleMs: options.sttFallback.settleMs ?? DEFAULT_FALLBACK_SETTLE_MS,
|
|
784
|
+
trigger: options.sttFallback.trigger ?? "empty-or-low-confidence"
|
|
785
|
+
} : undefined;
|
|
786
|
+
const phraseHints = options.phraseHints ?? [];
|
|
323
787
|
let socket = options.socket;
|
|
324
788
|
let sttSession = null;
|
|
325
789
|
let silenceTimer = null;
|
|
326
790
|
let speechDetected = false;
|
|
791
|
+
let operationQueue = Promise.resolve();
|
|
792
|
+
let adapterGenerationCounter = 0;
|
|
793
|
+
let activeAdapterGeneration = 0;
|
|
794
|
+
const currentTurnAudio = [];
|
|
795
|
+
let fallbackAttemptsForCurrentTurn = 0;
|
|
796
|
+
// Drops the leading buffered chunks that were recorded before the replay window
// (the fallback config's replayWindowMs, or DEFAULT_FALLBACK_REPLAY_MS) began.
const pruneTurnAudio = () => {
  const replayWindowMs = sttFallback?.replayWindowMs ?? DEFAULT_FALLBACK_REPLAY_MS;
  const cutoffAt = Date.now() - replayWindowMs;
  const firstKeptIndex = currentTurnAudio.findIndex((entry) => !(entry.recordedAt < cutoffAt));
  const staleCount = firstKeptIndex === -1 ? currentTurnAudio.length : firstKeptIndex;
  if (staleCount > 0) {
    currentTurnAudio.splice(0, staleCount);
  }
};
|
|
807
|
+
// Stores a private copy of the incoming audio bytes with the time they were recorded,
// then prunes the buffer.
const pushTurnAudio = (audio) => {
  let copiedBytes;
  if (audio instanceof ArrayBuffer) {
    copiedBytes = audio.slice(0);
  } else {
    const start = audio.byteOffset;
    copiedBytes = audio.buffer.slice(start, start + audio.byteLength);
  }
  const entry = {
    chunk: new Uint8Array(copiedBytes),
    recordedAt: Date.now()
  };
  currentTurnAudio.push(entry);
  pruneTurnAudio();
};
|
|
815
|
+
// Returns the buffered chunks that are still inside the replay window, or an empty list
// when no fallback adapter is configured.
const getFallbackWindowAudio = () => {
  const hasFallbackAdapter = Boolean(sttFallback?.adapter);
  if (hasFallbackAdapter) {
    pruneTurnAudio();
    return Array.from(currentTurnAudio, (entry) => entry.chunk);
  }
  return [];
};
|
|
327
822
|
const clearSilenceTimer = () => {
|
|
328
823
|
if (!silenceTimer) {
|
|
329
824
|
return;
|
|
@@ -349,12 +844,28 @@ var createVoiceSession = (options) => {
|
|
|
349
844
|
await options.store.set(options.id, session);
|
|
350
845
|
return session;
|
|
351
846
|
};
|
|
847
|
+
/**
 * Queues an operation so that it starts only after every previously queued operation
 * has settled.
 *
 * @param {string} phase - Label included in the debug log entry.
 * @param {() => unknown} operation - Work to run; may be async.
 * @returns {Promise<unknown>} Settles with the operation's own outcome.
 */
const runSerial = (phase, operation) => {
  const runOperation = async () => {
    logger.debug("voice session operation", {
      phase,
      sessionId: options.id
    });
    return await operation();
  };
  const result = operationQueue.then(runOperation);
  // The queue tail ignores the outcome so one failed operation cannot block later ones.
  const ignoreOutcome = () => undefined;
  operationQueue = result.then(ignoreOutcome, ignoreOutcome);
  return result;
};
|
|
352
862
|
const closeAdapter = async (reason) => {
|
|
353
863
|
if (!sttSession) {
|
|
354
864
|
return;
|
|
355
865
|
}
|
|
356
866
|
const activeSession = sttSession;
|
|
357
867
|
sttSession = null;
|
|
868
|
+
activeAdapterGeneration = 0;
|
|
358
869
|
try {
|
|
359
870
|
await activeSession.close(reason);
|
|
360
871
|
} catch (error) {
|
|
@@ -364,13 +875,87 @@ var createVoiceSession = (options) => {
|
|
|
364
875
|
});
|
|
365
876
|
}
|
|
366
877
|
};
|
|
367
|
-
const
|
|
368
|
-
if (silenceTimer) {
|
|
878
|
+
/**
 * Arms the timer that commits the current turn after `delayMs`.
 *
 * @param {number} delayMs - Delay before the commit is requested.
 * @param {string} reason - Reason passed to `api.commitTurn`.
 * @param {boolean} [reset=true] - When true any pending timer is replaced; when false
 *   a pending timer is left untouched.
 */
const scheduleTurnCommit = (delayMs, reason, reset = true) => {
  if (reset) {
    clearSilenceTimer();
  } else if (silenceTimer) {
    return;
  }
  const onElapsed = () => {
    silenceTimer = null;
    // NOTE(review): the result of commitTurn is not awaited here — confirm that it
    // handles its own failures.
    api.commitTurn(reason);
  };
  silenceTimer = setTimeout(onElapsed, delayMs);
};
|
|
890
|
+
// Schedules a "silence" turn commit, defaulting the delay to the configured silence window.
const scheduleSilenceCommit = (delayMs = turnDetection.silenceMs, reset = true) => {
  return scheduleTurnCommit(delayMs, "silence", reset);
};
|
|
891
|
+
/**
 * Commits the current turn, or postpones the commit until the transcript has been
 * stable for `turnDetection.transcriptStabilityMs`.
 *
 * Nothing happens when the turn has no text. A "manual" reason skips the stability wait.
 *
 * @param {string} reason - Why the commit was requested.
 * @returns {Promise<void>}
 */
const requestTurnCommit = async (reason) => {
  const { currentTurn } = await readSession();
  const text = buildTurnText(currentTurn.transcripts, currentTurn.partialText, {
    partialEndedAtMs: currentTurn.partialEndedAt,
    partialStartedAtMs: currentTurn.partialStartedAt
  });
  if (!text) {
    return;
  }
  const { lastTranscriptAt } = currentTurn;
  if (reason !== "manual" && lastTranscriptAt !== undefined) {
    const transcriptAgeMs = Date.now() - lastTranscriptAt;
    const remainingMs = turnDetection.transcriptStabilityMs - transcriptAgeMs;
    if (transcriptAgeMs < turnDetection.transcriptStabilityMs) {
      // The transcript changed too recently: try again once it has had time to settle.
      scheduleTurnCommit(remainingMs, reason);
      return;
    }
  }
  await commitTurnInternal(reason);
};
|
|
907
|
+
/**
 * Moves the session into the "failed" state and releases its speech-to-text resources.
 *
 * The session is marked failed, a non-recoverable error message is sent to the client,
 * the adapter is closed, speech/fallback state is reset and the route's optional
 * `onError` hook is invoked.
 *
 * Cleanup runs in `finally`, so a rejected `send` no longer leaves the adapter open or
 * the turn audio buffered. The send error still propagates to the caller, and `onError`
 * is not invoked in that case, exactly as before.
 *
 * @param {unknown} error - Failure cause; non-Error values are wrapped.
 * @returns {Promise<void>}
 */
const failInternal = async (error) => {
  clearSilenceTimer();
  const session = await writeSession((currentSession) => {
    currentSession.lastActivityAt = Date.now();
    currentSession.status = "failed";
  });
  const resolvedError = toError(error);
  try {
    await send({
      message: resolvedError.message,
      recoverable: false,
      type: "error"
    });
  } finally {
    await closeAdapter("failed");
    speechDetected = false;
    rewindFallbackTurnAudio();
  }
  await options.route.onError?.({
    api,
    context: options.context,
    error: resolvedError,
    session,
    sessionId: options.id
  });
};
|
|
930
|
+
/**
 * Marks the session completed, notifies the client and releases speech-to-text state.
 *
 * When `result` is provided it is stored on the most recent turn. A session that is
 * already "completed" is not modified again.
 * NOTE(review): the "complete" message, the adapter close and `onComplete` still run
 * for an already completed session — confirm that callers guard against repeat calls.
 *
 * @param {unknown} [result] - Result to attach to the last turn.
 * @returns {Promise<void>}
 */
const completeInternal = async (result) => {
  clearSilenceTimer();
  const markCompleted = (currentSession) => {
    const alreadyCompleted = currentSession.status === "completed";
    if (alreadyCompleted) {
      return;
    }
    currentSession.lastActivityAt = Date.now();
    currentSession.status = "completed";
    const lastTurn = result !== undefined ? currentSession.turns.at(-1) : undefined;
    if (lastTurn) {
      setTurnResult(currentSession, lastTurn.id, { result });
    }
  };
  const session = await writeSession(markCompleted);
  await send({
    sessionId: options.id,
    type: "complete"
  });
  await closeAdapter("complete");
  speechDetected = false;
  rewindFallbackTurnAudio();
  await options.route.onComplete({
    api,
    context: options.context,
    session
  });
};
|
|
375
960
|
const handleError = async (event) => {
|
|
376
961
|
await send({
|
|
@@ -379,18 +964,273 @@ var createVoiceSession = (options) => {
|
|
|
379
964
|
type: "error"
|
|
380
965
|
});
|
|
381
966
|
if (!event.recoverable) {
|
|
382
|
-
await
|
|
967
|
+
await failInternal(event.error);
|
|
383
968
|
}
|
|
384
969
|
};
|
|
385
970
|
/**
 * Reacts to the speech-to-text stream closing.
 *
 * A close flagged `recoverable === false` fails the session; any other close only shuts
 * the adapter down, using the event's reason or "provider stream closed" when it has none.
 *
 * @param {{recoverable?: boolean, reason?: string}} event - Close event from the adapter.
 * @returns {Promise<void>}
 */
const handleClose = async (event) => {
  const isFatal = event.recoverable === false;
  if (isFatal) {
    const failure = new Error(event.reason ?? "Speech-to-text session closed");
    await failInternal(failure);
    return;
  }
  await closeAdapter(event.reason || "provider stream closed");
};
|
|
981
|
+
// Resets per-turn fallback state: the attempt counter goes back to zero and the buffered
// turn audio is emptied in place.
const rewindFallbackTurnAudio = () => {
  fallbackAttemptsForCurrentTurn = 0;
  currentTurnAudio.splice(0, currentTurnAudio.length);
};
|
|
985
|
+
const runFallbackTranscription = async (primaryText, primaryTranscripts) => {
|
|
986
|
+
if (!sttFallback?.adapter || fallbackAttemptsForCurrentTurn >= sttFallback.maxAttemptsPerTurn) {
|
|
987
|
+
return null;
|
|
988
|
+
}
|
|
989
|
+
const candidate = {
|
|
990
|
+
text: primaryText,
|
|
991
|
+
transcripts: primaryTranscripts
|
|
992
|
+
};
|
|
993
|
+
if (!isFallbackNeeded(candidate, sttFallback)) {
|
|
994
|
+
return null;
|
|
995
|
+
}
|
|
996
|
+
fallbackAttemptsForCurrentTurn += 1;
|
|
997
|
+
const replayAudio = getFallbackWindowAudio();
|
|
998
|
+
if (replayAudio.length === 0) {
|
|
999
|
+
return null;
|
|
1000
|
+
}
|
|
1001
|
+
let fallbackSession = null;
|
|
1002
|
+
const fallbackTranscripts = [];
|
|
1003
|
+
let fallbackClosed = false;
|
|
1004
|
+
let fallbackEndOfTurnReceived = false;
|
|
1005
|
+
let fallbackFinalReceived = false;
|
|
1006
|
+
let lastFallbackTranscriptAt = 0;
|
|
1007
|
+
try {
|
|
1008
|
+
fallbackSession = await sttFallback.adapter.open({
|
|
1009
|
+
format: DEFAULT_FORMAT,
|
|
1010
|
+
phraseHints,
|
|
1011
|
+
sessionId: `${options.id}:fallback:${fallbackAttemptsForCurrentTurn}`
|
|
1012
|
+
});
|
|
1013
|
+
} catch (error) {
|
|
1014
|
+
logger.warn("voice stt fallback open failed", {
|
|
1015
|
+
error: toError(error).message,
|
|
1016
|
+
sessionId: options.id
|
|
1017
|
+
});
|
|
1018
|
+
return null;
|
|
1019
|
+
}
|
|
1020
|
+
const unsubscribers = [
|
|
1021
|
+
fallbackSession.on("final", ({ transcript }) => {
|
|
1022
|
+
fallbackFinalReceived = true;
|
|
1023
|
+
lastFallbackTranscriptAt = Date.now();
|
|
1024
|
+
fallbackTranscripts.push(cloneTranscript(transcript));
|
|
1025
|
+
}),
|
|
1026
|
+
fallbackSession.on("partial", ({ transcript }) => {
|
|
1027
|
+
lastFallbackTranscriptAt = Date.now();
|
|
1028
|
+
fallbackTranscripts.push(cloneTranscript(transcript));
|
|
1029
|
+
}),
|
|
1030
|
+
fallbackSession.on("endOfTurn", () => {
|
|
1031
|
+
fallbackEndOfTurnReceived = true;
|
|
1032
|
+
}),
|
|
1033
|
+
fallbackSession.on("error", (event) => {
|
|
1034
|
+
logger.warn("voice stt fallback error", {
|
|
1035
|
+
error: toError(event.error).message,
|
|
1036
|
+
sessionId: options.id
|
|
1037
|
+
});
|
|
1038
|
+
}),
|
|
1039
|
+
fallbackSession.on("close", () => {
|
|
1040
|
+
fallbackClosed = true;
|
|
1041
|
+
})
|
|
1042
|
+
];
|
|
1043
|
+
const closeFallback = async (reason) => {
|
|
1044
|
+
if (!fallbackSession) {
|
|
1045
|
+
return;
|
|
1046
|
+
}
|
|
1047
|
+
try {
|
|
1048
|
+
await fallbackSession.close(reason);
|
|
1049
|
+
} catch (error) {
|
|
1050
|
+
logger.warn("voice stt fallback close failed", {
|
|
1051
|
+
error: toError(error).message,
|
|
1052
|
+
sessionId: options.id
|
|
1053
|
+
});
|
|
1054
|
+
} finally {
|
|
1055
|
+
fallbackSession = null;
|
|
1056
|
+
}
|
|
1057
|
+
};
|
|
1058
|
+
try {
|
|
1059
|
+
for (const chunk of replayAudio) {
|
|
1060
|
+
await fallbackSession.send(chunk);
|
|
1061
|
+
}
|
|
1062
|
+
const replayDurationMs = getBufferedAudioDurationMs(replayAudio);
|
|
1063
|
+
const completionTimeoutMs = Math.max(sttFallback.completionTimeoutMs, Math.min(4000, Math.max(sttFallback.settleMs * 4, Math.round(replayDurationMs * 0.18))));
|
|
1064
|
+
const waitStartedAt = Date.now();
|
|
1065
|
+
while (Date.now() - waitStartedAt < completionTimeoutMs) {
|
|
1066
|
+
const idleMs = lastFallbackTranscriptAt > 0 ? Date.now() - lastFallbackTranscriptAt : Date.now() - waitStartedAt;
|
|
1067
|
+
if (fallbackEndOfTurnReceived && idleMs >= sttFallback.settleMs) {
|
|
1068
|
+
break;
|
|
1069
|
+
}
|
|
1070
|
+
if (fallbackFinalReceived && idleMs >= sttFallback.settleMs) {
|
|
1071
|
+
break;
|
|
1072
|
+
}
|
|
1073
|
+
if (fallbackClosed && (lastFallbackTranscriptAt === 0 || idleMs >= sttFallback.settleMs)) {
|
|
1074
|
+
break;
|
|
1075
|
+
}
|
|
1076
|
+
await Bun.sleep(Math.min(75, Math.max(25, sttFallback.settleMs / 2)));
|
|
1077
|
+
}
|
|
1078
|
+
} catch (error) {
|
|
1079
|
+
logger.warn("voice stt fallback failed", {
|
|
1080
|
+
error: toError(error).message,
|
|
1081
|
+
sessionId: options.id
|
|
1082
|
+
});
|
|
1083
|
+
} finally {
|
|
1084
|
+
await closeFallback("fallback-complete");
|
|
1085
|
+
for (const unsubscribe of unsubscribers) {
|
|
1086
|
+
unsubscribe();
|
|
1087
|
+
}
|
|
1088
|
+
}
|
|
1089
|
+
if (fallbackTranscripts.length === 0) {
|
|
1090
|
+
return null;
|
|
1091
|
+
}
|
|
1092
|
+
const fallbackText = buildTurnText(fallbackTranscripts, "", {});
|
|
1093
|
+
const fallbackConfidence = calculateMeanConfidence(fallbackTranscripts);
|
|
1094
|
+
const fallbackCandidate = {
|
|
1095
|
+
confidence: fallbackConfidence,
|
|
1096
|
+
text: fallbackText,
|
|
1097
|
+
wordCount: countWords2(normalizeText2(fallbackText))
|
|
1098
|
+
};
|
|
1099
|
+
const primaryCandidate = {
|
|
1100
|
+
confidence: calculateMeanConfidence(primaryTranscripts),
|
|
1101
|
+
text: primaryText,
|
|
1102
|
+
wordCount: countWords2(normalizeText2(primaryText))
|
|
1103
|
+
};
|
|
1104
|
+
const selection = selectBetterTurnText(primaryCandidate, fallbackCandidate);
|
|
1105
|
+
const diagnostics = {
|
|
1106
|
+
attempted: true,
|
|
1107
|
+
fallbackConfidence: fallbackCandidate.confidence,
|
|
1108
|
+
fallbackText: fallbackCandidate.text,
|
|
1109
|
+
fallbackWordCount: fallbackCandidate.wordCount,
|
|
1110
|
+
primaryConfidence: primaryCandidate.confidence,
|
|
1111
|
+
primaryText,
|
|
1112
|
+
primaryWordCount: primaryCandidate.wordCount,
|
|
1113
|
+
selected: selection.winner.text === fallbackCandidate.text,
|
|
1114
|
+
selectionReason: selection.reason,
|
|
1115
|
+
trigger: sttFallback.trigger
|
|
1116
|
+
};
|
|
1117
|
+
if (selection.winner.text === primaryCandidate.text) {
|
|
1118
|
+
return {
|
|
1119
|
+
diagnostics,
|
|
1120
|
+
fallbackUsed: false,
|
|
1121
|
+
source: "primary",
|
|
1122
|
+
text: primaryText,
|
|
1123
|
+
transcripts: primaryTranscripts.map((transcript) => ({
|
|
1124
|
+
...transcript,
|
|
1125
|
+
isFinal: true
|
|
1126
|
+
}))
|
|
1127
|
+
};
|
|
1128
|
+
}
|
|
1129
|
+
const candidateTranscripts = fallbackText === fallbackCandidate.text ? fallbackTranscripts : [];
|
|
1130
|
+
return {
|
|
1131
|
+
diagnostics,
|
|
1132
|
+
fallbackUsed: true,
|
|
1133
|
+
source: "fallback",
|
|
1134
|
+
text: selection.winner.text,
|
|
1135
|
+
transcripts: candidateTranscripts.length > 0 ? candidateTranscripts.map((transcript) => ({
|
|
1136
|
+
...transcript,
|
|
1137
|
+
isFinal: true
|
|
1138
|
+
})) : [{ id: createId(), isFinal: false, text: selection.winner.text }]
|
|
1139
|
+
};
|
|
1140
|
+
};
|
|
1141
|
+
const getFinalTranscriptIds = (transcripts) => {
|
|
1142
|
+
const finalTranscriptIds = transcripts.filter((transcript) => transcript.isFinal).map((transcript) => transcript.id);
|
|
1143
|
+
const fallbackIds = transcripts.map((transcript) => transcript.id);
|
|
1144
|
+
return finalTranscriptIds.length > 0 ? finalTranscriptIds : fallbackIds;
|
|
1145
|
+
};
|
|
1146
|
+
const runTurnCorrection = async (input) => {
|
|
1147
|
+
if (!options.route.correctTurn) {
|
|
1148
|
+
return;
|
|
388
1149
|
}
|
|
1150
|
+
const originalText = input.text;
|
|
1151
|
+
const result = await options.route.correctTurn({
|
|
1152
|
+
api,
|
|
1153
|
+
context: options.context,
|
|
1154
|
+
fallback: input.fallbackDiagnostics,
|
|
1155
|
+
phraseHints,
|
|
1156
|
+
session: input.session,
|
|
1157
|
+
text: originalText,
|
|
1158
|
+
transcripts: input.transcripts.map(cloneTranscript)
|
|
1159
|
+
});
|
|
1160
|
+
const nextText = typeof result === "string" ? result : typeof result?.text === "string" ? result.text : originalText;
|
|
1161
|
+
const correctedText = normalizeCorrectionText(nextText);
|
|
1162
|
+
const normalizedOriginal = normalizeCorrectionText(originalText);
|
|
1163
|
+
return {
|
|
1164
|
+
diagnostics: {
|
|
1165
|
+
attempted: true,
|
|
1166
|
+
changed: correctedText.length > 0 && correctedText !== normalizedOriginal,
|
|
1167
|
+
correctedText: correctedText.length > 0 ? correctedText : normalizedOriginal,
|
|
1168
|
+
metadata: typeof result === "object" ? result.metadata : undefined,
|
|
1169
|
+
originalText,
|
|
1170
|
+
provider: typeof result === "object" ? result.provider : undefined,
|
|
1171
|
+
reason: typeof result === "object" ? result.reason : undefined
|
|
1172
|
+
},
|
|
1173
|
+
text: correctedText.length > 0 ? correctedText : originalText
|
|
1174
|
+
};
|
|
1175
|
+
};
|
|
1176
|
+
const ensureCommittedTurnGuard = (session) => {
|
|
1177
|
+
if (!session.lastCommittedTurn) {
|
|
1178
|
+
session.lastCommittedTurn = {
|
|
1179
|
+
committedAt: 0,
|
|
1180
|
+
signature: "",
|
|
1181
|
+
text: "",
|
|
1182
|
+
transcriptIds: []
|
|
1183
|
+
};
|
|
1184
|
+
}
|
|
1185
|
+
return session;
|
|
1186
|
+
};
|
|
1187
|
+
const buildTurnSignature = (session, finalText, transcriptIdsOverride) => {
|
|
1188
|
+
const finalTranscriptIds = transcriptIdsOverride ?? getFinalTranscriptIds(session.currentTurn.transcripts);
|
|
1189
|
+
return `${normalizeText2(finalText)}|${finalTranscriptIds.join(",")}`;
|
|
1190
|
+
};
|
|
1191
|
+
const isDuplicateTurnCommit = (session, finalText) => {
|
|
1192
|
+
const signature = buildTurnSignature(session, finalText);
|
|
1193
|
+
const committedTurn = session.lastCommittedTurn;
|
|
1194
|
+
const isRecent = committedTurn && committedTurn.committedAt > 0 && Date.now() - committedTurn.committedAt < DEFAULT_DUPLICATE_TURN_WINDOW_MS;
|
|
1195
|
+
const committedSignature = committedTurn?.signature ?? "";
|
|
1196
|
+
const committedTranscriptIds = committedTurn?.transcriptIds ?? [];
|
|
1197
|
+
const committedText = normalizeText2(committedTurn?.text ?? "");
|
|
1198
|
+
const isSameText = normalizeText2(finalText) === committedText;
|
|
1199
|
+
const hasNoNewAudioSinceCommit = (session.currentTurn.lastAudioAt ?? 0) <= (committedTurn?.committedAt ?? 0);
|
|
1200
|
+
if (!isRecent) {
|
|
1201
|
+
return false;
|
|
1202
|
+
}
|
|
1203
|
+
if (isSameText && hasNoNewAudioSinceCommit) {
|
|
1204
|
+
return true;
|
|
1205
|
+
}
|
|
1206
|
+
if (signature !== committedSignature) {
|
|
1207
|
+
return false;
|
|
1208
|
+
}
|
|
1209
|
+
const lastSignatureIds = new Set(committedTranscriptIds);
|
|
1210
|
+
const hasNoNewFinalIds = session.currentTurn.transcripts.every((transcript) => !transcript.isFinal || lastSignatureIds.has(transcript.id));
|
|
1211
|
+
return isRecent && hasNoNewFinalIds;
|
|
1212
|
+
};
|
|
1213
|
+
const markTurnCommitted = (session, finalText, committedTranscripts) => {
|
|
1214
|
+
session.lastCommittedTurn = {
|
|
1215
|
+
...session.lastCommittedTurn ?? {},
|
|
1216
|
+
committedAt: Date.now(),
|
|
1217
|
+
signature: buildTurnSignature(session, finalText, getFinalTranscriptIds(committedTranscripts)),
|
|
1218
|
+
text: normalizeText2(finalText),
|
|
1219
|
+
transcriptIds: getFinalTranscriptIds(committedTranscripts)
|
|
1220
|
+
};
|
|
389
1221
|
};
|
|
390
1222
|
const handlePartial = async (transcript) => {
|
|
391
1223
|
await writeSession((session) => {
|
|
392
|
-
session.currentTurn.
|
|
393
|
-
|
|
1224
|
+
const nextPartialStartedAt = transcript.startedAtMs ?? session.currentTurn.partialStartedAt;
|
|
1225
|
+
const nextPartialEndedAt = transcript.endedAtMs ?? session.currentTurn.partialEndedAt;
|
|
1226
|
+
const preferredPartial = selectPreferredTranscriptText(session.currentTurn.partialText, transcript.text);
|
|
1227
|
+
session.currentTurn.lastTranscriptAt = Date.now();
|
|
1228
|
+
session.currentTurn.partialStartedAt = nextPartialStartedAt;
|
|
1229
|
+
session.currentTurn.partialEndedAt = nextPartialEndedAt;
|
|
1230
|
+
session.currentTurn.partialText = buildTurnText(session.currentTurn.transcripts, preferredPartial, {
|
|
1231
|
+
partialEndedAtMs: nextPartialEndedAt,
|
|
1232
|
+
partialStartedAtMs: nextPartialStartedAt
|
|
1233
|
+
});
|
|
394
1234
|
session.lastActivityAt = Date.now();
|
|
395
1235
|
session.status = "active";
|
|
396
1236
|
});
|
|
@@ -412,8 +1252,11 @@ var createVoiceSession = (options) => {
|
|
|
412
1252
|
cloneTranscript(transcript)
|
|
413
1253
|
];
|
|
414
1254
|
}
|
|
415
|
-
session.currentTurn.finalText = buildTurnText(session.currentTurn.transcripts, session.currentTurn.partialText
|
|
416
|
-
|
|
1255
|
+
session.currentTurn.finalText = buildTurnText(session.currentTurn.transcripts, session.currentTurn.partialText, {
|
|
1256
|
+
partialEndedAtMs: session.currentTurn.partialEndedAt,
|
|
1257
|
+
partialStartedAtMs: session.currentTurn.partialStartedAt
|
|
1258
|
+
});
|
|
1259
|
+
session.currentTurn.lastTranscriptAt = Date.now();
|
|
417
1260
|
session.lastActivityAt = Date.now();
|
|
418
1261
|
session.status = "active";
|
|
419
1262
|
});
|
|
@@ -422,36 +1265,60 @@ var createVoiceSession = (options) => {
|
|
|
422
1265
|
type: "final"
|
|
423
1266
|
});
|
|
424
1267
|
};
|
|
1268
|
+
const resumePendingTurnCommit = (session) => {
|
|
1269
|
+
const pendingText = buildTurnText(session.currentTurn.transcripts, session.currentTurn.partialText, {
|
|
1270
|
+
partialEndedAtMs: session.currentTurn.partialEndedAt,
|
|
1271
|
+
partialStartedAtMs: session.currentTurn.partialStartedAt
|
|
1272
|
+
});
|
|
1273
|
+
if (!pendingText) {
|
|
1274
|
+
speechDetected = false;
|
|
1275
|
+
return;
|
|
1276
|
+
}
|
|
1277
|
+
speechDetected = true;
|
|
1278
|
+
const audioAge = session.currentTurn.silenceStartedAt !== undefined ? Date.now() - session.currentTurn.silenceStartedAt : session.currentTurn.lastSpeechAt !== undefined ? Date.now() - session.currentTurn.lastSpeechAt : 0;
|
|
1279
|
+
const transcriptAge = session.currentTurn.lastTranscriptAt !== undefined ? Date.now() - session.currentTurn.lastTranscriptAt : turnDetection.transcriptStabilityMs;
|
|
1280
|
+
const delayMs = Math.max(0, turnDetection.silenceMs - audioAge, turnDetection.transcriptStabilityMs - transcriptAge);
|
|
1281
|
+
scheduleSilenceCommit(delayMs);
|
|
1282
|
+
};
|
|
425
1283
|
const ensureAdapter = async () => {
|
|
426
1284
|
if (sttSession) {
|
|
427
1285
|
return sttSession;
|
|
428
1286
|
}
|
|
429
|
-
|
|
430
|
-
format:
|
|
431
|
-
|
|
432
|
-
container: "raw",
|
|
433
|
-
encoding: "pcm_s16le",
|
|
434
|
-
sampleRateHz: 16000
|
|
435
|
-
},
|
|
1287
|
+
const openedSession = await options.stt.open({
|
|
1288
|
+
format: DEFAULT_FORMAT,
|
|
1289
|
+
phraseHints,
|
|
436
1290
|
sessionId: options.id
|
|
437
1291
|
});
|
|
438
|
-
|
|
439
|
-
|
|
1292
|
+
const generation = ++adapterGenerationCounter;
|
|
1293
|
+
sttSession = openedSession;
|
|
1294
|
+
activeAdapterGeneration = generation;
|
|
1295
|
+
const runAdapterEvent = (phase, handler) => {
|
|
1296
|
+
runSerial(phase, async () => {
|
|
1297
|
+
if (activeAdapterGeneration !== generation) {
|
|
1298
|
+
return;
|
|
1299
|
+
}
|
|
1300
|
+
await handler();
|
|
1301
|
+
});
|
|
1302
|
+
};
|
|
1303
|
+
openedSession.on("partial", ({ transcript }) => {
|
|
1304
|
+
runAdapterEvent("adapter.partial", () => handlePartial(transcript));
|
|
440
1305
|
});
|
|
441
|
-
|
|
442
|
-
handleFinal(transcript);
|
|
1306
|
+
openedSession.on("final", ({ transcript }) => {
|
|
1307
|
+
runAdapterEvent("adapter.final", () => handleFinal(transcript));
|
|
443
1308
|
});
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
1309
|
+
openedSession.on("endOfTurn", ({ reason }) => {
|
|
1310
|
+
runAdapterEvent("adapter.endOfTurn", async () => {
|
|
1311
|
+
clearSilenceTimer();
|
|
1312
|
+
await requestTurnCommit(reason);
|
|
1313
|
+
});
|
|
447
1314
|
});
|
|
448
|
-
|
|
449
|
-
handleError(event);
|
|
1315
|
+
openedSession.on("error", (event) => {
|
|
1316
|
+
runAdapterEvent("adapter.error", () => handleError(event));
|
|
450
1317
|
});
|
|
451
|
-
|
|
452
|
-
handleClose(event);
|
|
1318
|
+
openedSession.on("close", (event) => {
|
|
1319
|
+
runAdapterEvent("adapter.close", () => handleClose(event));
|
|
453
1320
|
});
|
|
454
|
-
return
|
|
1321
|
+
return openedSession;
|
|
455
1322
|
};
|
|
456
1323
|
const completeTurn = async (session, turn) => {
|
|
457
1324
|
const output = await options.route.onTurn({
|
|
@@ -480,207 +1347,267 @@ var createVoiceSession = (options) => {
|
|
|
480
1347
|
});
|
|
481
1348
|
}
|
|
482
1349
|
if (output?.complete) {
|
|
483
|
-
await
|
|
1350
|
+
await completeInternal(output.result);
|
|
484
1351
|
}
|
|
485
1352
|
};
|
|
486
|
-
const
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
1353
|
+
const commitTurnInternal = async (reason = "manual") => {
|
|
1354
|
+
clearSilenceTimer();
|
|
1355
|
+
const session = await readSession();
|
|
1356
|
+
if (session.status === "completed" || session.status === "failed") {
|
|
1357
|
+
return;
|
|
1358
|
+
}
|
|
1359
|
+
const text = buildTurnText(session.currentTurn.transcripts, session.currentTurn.partialText, {
|
|
1360
|
+
partialEndedAtMs: session.currentTurn.partialEndedAt,
|
|
1361
|
+
partialStartedAtMs: session.currentTurn.partialStartedAt
|
|
1362
|
+
});
|
|
1363
|
+
let transcripts = session.currentTurn.transcripts.length ? session.currentTurn.transcripts.map(cloneTranscript) : [];
|
|
1364
|
+
let finalText = text;
|
|
1365
|
+
const transcriptStabilityAge = session.currentTurn.lastTranscriptAt !== undefined ? Date.now() - session.currentTurn.lastTranscriptAt : undefined;
|
|
1366
|
+
const fallbackSelection = await runFallbackTranscription(text, session.currentTurn.transcripts);
|
|
1367
|
+
const source = fallbackSelection?.source ?? "primary";
|
|
1368
|
+
const fallbackUsed = fallbackSelection?.fallbackUsed ?? false;
|
|
1369
|
+
const fallbackDiagnostics = fallbackSelection?.diagnostics;
|
|
1370
|
+
if (fallbackSelection) {
|
|
1371
|
+
finalText = fallbackSelection.text;
|
|
1372
|
+
transcripts = fallbackSelection.transcripts.length ? fallbackSelection.transcripts.map(cloneTranscript) : transcripts.length ? transcripts : [
|
|
1373
|
+
{
|
|
1374
|
+
id: createId(),
|
|
1375
|
+
isFinal: false,
|
|
1376
|
+
text: finalText
|
|
1377
|
+
}
|
|
1378
|
+
];
|
|
1379
|
+
if (fallbackSelection.fallbackUsed) {
|
|
1380
|
+
logger.info("voice fallback turn selected", {
|
|
1381
|
+
reason,
|
|
1382
|
+
sessionId: options.id,
|
|
1383
|
+
text: finalText
|
|
1384
|
+
});
|
|
502
1385
|
}
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
currentSession.lastActivityAt = Date.now();
|
|
522
|
-
currentSession.status = "active";
|
|
523
|
-
currentSession.turns = [...currentSession.turns, turn];
|
|
524
|
-
});
|
|
525
|
-
speechDetected = false;
|
|
526
|
-
logger.info("voice turn committed", {
|
|
1386
|
+
}
|
|
1387
|
+
const correctionSelection = await runTurnCorrection({
|
|
1388
|
+
fallbackDiagnostics,
|
|
1389
|
+
fallbackUsed,
|
|
1390
|
+
session,
|
|
1391
|
+
source,
|
|
1392
|
+
text: finalText,
|
|
1393
|
+
transcripts
|
|
1394
|
+
});
|
|
1395
|
+
const correctionDiagnostics = correctionSelection?.diagnostics;
|
|
1396
|
+
if (correctionSelection) {
|
|
1397
|
+
finalText = correctionSelection.text;
|
|
1398
|
+
}
|
|
1399
|
+
if (!finalText) {
|
|
1400
|
+
return;
|
|
1401
|
+
}
|
|
1402
|
+
if (isDuplicateTurnCommit(session, finalText)) {
|
|
1403
|
+
logger.debug("voice turn commit deduped", {
|
|
527
1404
|
reason,
|
|
528
|
-
sessionId: options.id
|
|
529
|
-
turnId: turn.id
|
|
530
|
-
});
|
|
531
|
-
await send({
|
|
532
|
-
turn,
|
|
533
|
-
type: "turn"
|
|
1405
|
+
sessionId: options.id
|
|
534
1406
|
});
|
|
535
|
-
|
|
536
|
-
}
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
}
|
|
1407
|
+
return;
|
|
1408
|
+
}
|
|
1409
|
+
if (typeof transcriptStabilityAge === "number" && transcriptStabilityAge < turnDetection.transcriptStabilityMs && reason !== "manual") {
|
|
1410
|
+
scheduleTurnCommit(turnDetection.transcriptStabilityMs - transcriptStabilityAge, reason, false);
|
|
1411
|
+
return;
|
|
1412
|
+
}
|
|
1413
|
+
const turn = {
|
|
1414
|
+
committedAt: Date.now(),
|
|
1415
|
+
id: createId(),
|
|
1416
|
+
text: finalText,
|
|
1417
|
+
quality: createTurnQuality(transcripts, source, fallbackUsed, fallbackDiagnostics, correctionDiagnostics),
|
|
1418
|
+
transcripts: transcripts.length > 0 ? transcripts : [
|
|
1419
|
+
{
|
|
1420
|
+
id: createId(),
|
|
1421
|
+
isFinal: false,
|
|
1422
|
+
text: finalText
|
|
552
1423
|
}
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
1424
|
+
]
|
|
1425
|
+
};
|
|
1426
|
+
const updatedSession = await writeSession((currentSession) => {
|
|
1427
|
+
currentSession.committedTurnIds = [
|
|
1428
|
+
...currentSession.committedTurnIds,
|
|
1429
|
+
turn.id
|
|
1430
|
+
];
|
|
1431
|
+
currentSession.currentTurn = createEmptyCurrentTurn();
|
|
1432
|
+
currentSession.lastActivityAt = Date.now();
|
|
1433
|
+
currentSession.status = "active";
|
|
1434
|
+
currentSession.turns = [...currentSession.turns, turn];
|
|
1435
|
+
markTurnCommitted(currentSession, finalText, transcripts);
|
|
1436
|
+
});
|
|
1437
|
+
speechDetected = false;
|
|
1438
|
+
rewindFallbackTurnAudio();
|
|
1439
|
+
logger.info("voice turn committed", {
|
|
1440
|
+
reason,
|
|
1441
|
+
sessionId: options.id,
|
|
1442
|
+
turnId: turn.id
|
|
1443
|
+
});
|
|
1444
|
+
await send({
|
|
1445
|
+
turn,
|
|
1446
|
+
type: "turn"
|
|
1447
|
+
});
|
|
1448
|
+
if (options.sttLifecycle === "turn-scoped") {
|
|
1449
|
+
await closeAdapter("turn-commit");
|
|
1450
|
+
}
|
|
1451
|
+
await completeTurn(updatedSession, turn);
|
|
1452
|
+
};
|
|
1453
|
+
const connectInternal = async (nextSocket) => {
|
|
1454
|
+
socket = nextSocket;
|
|
1455
|
+
const existingSession = await options.store.get(options.id);
|
|
1456
|
+
let session = existingSession ?? createVoiceSessionRecord(options.id, options.scenarioId);
|
|
1457
|
+
if (options.scenarioId && session.scenarioId !== options.scenarioId) {
|
|
1458
|
+
session.scenarioId = options.scenarioId;
|
|
1459
|
+
}
|
|
1460
|
+
ensureCommittedTurnGuard(session);
|
|
1461
|
+
let shouldFireOnSession = !existingSession;
|
|
1462
|
+
if (existingSession?.scenarioId && options.scenarioId && existingSession.scenarioId !== options.scenarioId) {
|
|
1463
|
+
session = resetVoiceSessionRecord(options.id, existingSession, options.scenarioId);
|
|
1464
|
+
shouldFireOnSession = true;
|
|
1465
|
+
}
|
|
1466
|
+
rewindFallbackTurnAudio();
|
|
1467
|
+
if (existingSession?.status === "reconnecting") {
|
|
1468
|
+
const nextAttempts = existingSession.reconnect.attempts + 1;
|
|
1469
|
+
const reconnectExpired = existingSession.reconnect.lastDisconnectAt !== undefined && Date.now() - existingSession.reconnect.lastDisconnectAt > reconnect.timeout;
|
|
1470
|
+
const tooManyAttempts = nextAttempts > reconnect.maxAttempts;
|
|
1471
|
+
if (reconnect.strategy === "fail" && (reconnectExpired || tooManyAttempts)) {
|
|
1472
|
+
await failInternal(new Error("Voice session reconnect policy exhausted"));
|
|
1473
|
+
return;
|
|
1474
|
+
}
|
|
1475
|
+
if (reconnect.strategy === "restart" && (reconnectExpired || tooManyAttempts)) {
|
|
1476
|
+
session = resetVoiceSessionRecord(options.id, existingSession, options.scenarioId);
|
|
1477
|
+
shouldFireOnSession = true;
|
|
1478
|
+
} else {
|
|
1479
|
+
session = {
|
|
1480
|
+
...existingSession,
|
|
1481
|
+
reconnect: {
|
|
1482
|
+
...existingSession.reconnect,
|
|
1483
|
+
attempts: nextAttempts
|
|
1484
|
+
},
|
|
1485
|
+
status: "active"
|
|
1486
|
+
};
|
|
1487
|
+
}
|
|
1488
|
+
}
|
|
1489
|
+
await options.store.set(options.id, session);
|
|
1490
|
+
await send({
|
|
1491
|
+
sessionId: options.id,
|
|
1492
|
+
status: session.status,
|
|
1493
|
+
scenarioId: session.scenarioId,
|
|
1494
|
+
type: "session"
|
|
1495
|
+
});
|
|
1496
|
+
if (shouldFireOnSession) {
|
|
1497
|
+
await options.route.onSession?.({
|
|
561
1498
|
api,
|
|
562
1499
|
context: options.context,
|
|
563
1500
|
session
|
|
564
1501
|
});
|
|
565
|
-
}
|
|
566
|
-
|
|
567
|
-
socket = nextSocket;
|
|
568
|
-
const existingSession = await options.store.get(options.id);
|
|
569
|
-
let session = existingSession ?? createVoiceSessionRecord(options.id);
|
|
570
|
-
let shouldFireOnSession = !existingSession;
|
|
571
|
-
if (existingSession?.status === "reconnecting") {
|
|
572
|
-
const nextAttempts = existingSession.reconnect.attempts + 1;
|
|
573
|
-
const reconnectExpired = existingSession.reconnect.lastDisconnectAt !== undefined && Date.now() - existingSession.reconnect.lastDisconnectAt > reconnect.timeout;
|
|
574
|
-
const tooManyAttempts = nextAttempts > reconnect.maxAttempts;
|
|
575
|
-
if (reconnect.strategy === "fail" && (reconnectExpired || tooManyAttempts)) {
|
|
576
|
-
await api.fail(new Error("Voice session reconnect policy exhausted"));
|
|
577
|
-
return;
|
|
578
|
-
}
|
|
579
|
-
if (reconnect.strategy === "restart" && (reconnectExpired || tooManyAttempts)) {
|
|
580
|
-
session = resetVoiceSessionRecord(options.id, existingSession);
|
|
581
|
-
shouldFireOnSession = true;
|
|
582
|
-
} else {
|
|
583
|
-
session = {
|
|
584
|
-
...existingSession,
|
|
585
|
-
reconnect: {
|
|
586
|
-
...existingSession.reconnect,
|
|
587
|
-
attempts: nextAttempts
|
|
588
|
-
},
|
|
589
|
-
status: "active"
|
|
590
|
-
};
|
|
591
|
-
}
|
|
592
|
-
}
|
|
593
|
-
await options.store.set(options.id, session);
|
|
1502
|
+
}
|
|
1503
|
+
if (session.status === "completed") {
|
|
594
1504
|
await send({
|
|
595
1505
|
sessionId: options.id,
|
|
596
|
-
|
|
597
|
-
type: "session"
|
|
1506
|
+
type: "complete"
|
|
598
1507
|
});
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
1508
|
+
return;
|
|
1509
|
+
}
|
|
1510
|
+
resumePendingTurnCommit(session);
|
|
1511
|
+
await ensureAdapter();
|
|
1512
|
+
};
|
|
1513
|
+
const disconnectInternal = async (event) => {
|
|
1514
|
+
clearSilenceTimer();
|
|
1515
|
+
await closeAdapter(event?.reason);
|
|
1516
|
+
rewindFallbackTurnAudio();
|
|
1517
|
+
if (reconnect.strategy === "fail") {
|
|
1518
|
+
await failInternal(new Error(event?.reason ?? "Voice socket disconnected"));
|
|
1519
|
+
return;
|
|
1520
|
+
}
|
|
1521
|
+
await writeSession((session) => {
|
|
1522
|
+
if (session.status === "completed" || session.status === "failed") {
|
|
611
1523
|
return;
|
|
612
1524
|
}
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
1525
|
+
session.lastActivityAt = Date.now();
|
|
1526
|
+
session.reconnect.lastDisconnectAt = Date.now();
|
|
1527
|
+
session.status = "reconnecting";
|
|
1528
|
+
});
|
|
1529
|
+
speechDetected = false;
|
|
1530
|
+
};
|
|
1531
|
+
const receiveAudioInternal = async (audio) => {
|
|
1532
|
+
const session = await readSession();
|
|
1533
|
+
if (session.status === "completed" || session.status === "failed") {
|
|
1534
|
+
return;
|
|
1535
|
+
}
|
|
1536
|
+
const adapter = await ensureAdapter();
|
|
1537
|
+
const conditionedAudio = conditionAudioChunk(audio, options.audioConditioning);
|
|
1538
|
+
const audioLevel = measureAudioLevel(conditionedAudio);
|
|
1539
|
+
const shouldStoreAudio = speechDetected || audioLevel >= turnDetection.speechThreshold;
|
|
1540
|
+
await writeSession((currentSession) => {
|
|
1541
|
+
currentSession.currentTurn.lastAudioAt = Date.now();
|
|
1542
|
+
currentSession.lastActivityAt = Date.now();
|
|
1543
|
+
currentSession.status = "active";
|
|
1544
|
+
if (audioLevel >= turnDetection.speechThreshold) {
|
|
1545
|
+
currentSession.currentTurn.lastSpeechAt = Date.now();
|
|
1546
|
+
currentSession.currentTurn.silenceStartedAt = undefined;
|
|
1547
|
+
} else if (speechDetected && currentSession.currentTurn.silenceStartedAt === undefined) {
|
|
1548
|
+
currentSession.currentTurn.silenceStartedAt = Date.now();
|
|
621
1549
|
}
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
session.status = "reconnecting";
|
|
629
|
-
});
|
|
630
|
-
speechDetected = false;
|
|
631
|
-
},
|
|
632
|
-
fail: async (error) => {
|
|
1550
|
+
});
|
|
1551
|
+
if (shouldStoreAudio) {
|
|
1552
|
+
pushTurnAudio(conditionedAudio);
|
|
1553
|
+
}
|
|
1554
|
+
if (audioLevel >= turnDetection.speechThreshold) {
|
|
1555
|
+
speechDetected = true;
|
|
633
1556
|
clearSilenceTimer();
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
type: "error"
|
|
643
|
-
});
|
|
644
|
-
await closeAdapter("failed");
|
|
645
|
-
speechDetected = false;
|
|
646
|
-
await options.route.onError?.({
|
|
647
|
-
api,
|
|
648
|
-
context: options.context,
|
|
649
|
-
error: resolvedError,
|
|
650
|
-
session,
|
|
651
|
-
sessionId: options.id
|
|
652
|
-
});
|
|
653
|
-
},
|
|
654
|
-
receiveAudio: async (audio) => {
|
|
655
|
-
const session = await readSession();
|
|
656
|
-
if (session.status === "completed" || session.status === "failed") {
|
|
657
|
-
return;
|
|
1557
|
+
} else if (speechDetected) {
|
|
1558
|
+
const currentSession = await readSession();
|
|
1559
|
+
const hasTurnText = Boolean(buildTurnText(currentSession.currentTurn.transcripts, currentSession.currentTurn.partialText, {
|
|
1560
|
+
partialEndedAtMs: currentSession.currentTurn.partialEndedAt,
|
|
1561
|
+
partialStartedAtMs: currentSession.currentTurn.partialStartedAt
|
|
1562
|
+
}));
|
|
1563
|
+
if (hasTurnText) {
|
|
1564
|
+
scheduleSilenceCommit(turnDetection.silenceMs, false);
|
|
658
1565
|
}
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
if (audioLevel >= turnDetection.speechThreshold) {
|
|
667
|
-
speechDetected = true;
|
|
1566
|
+
}
|
|
1567
|
+
await adapter.send(conditionedAudio);
|
|
1568
|
+
};
|
|
1569
|
+
const api = {
|
|
1570
|
+
id: options.id,
|
|
1571
|
+
close: async (reason) => {
|
|
1572
|
+
await runSerial("api.close", async () => {
|
|
668
1573
|
clearSilenceTimer();
|
|
669
|
-
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
if (hasTurnText) {
|
|
673
|
-
scheduleSilenceCommit();
|
|
674
|
-
}
|
|
675
|
-
}
|
|
676
|
-
await adapter.send(audio);
|
|
1574
|
+
await closeAdapter(reason);
|
|
1575
|
+
await Promise.resolve(socket.close(1000, reason));
|
|
1576
|
+
});
|
|
677
1577
|
},
|
|
678
|
-
|
|
1578
|
+
commitTurn: async (reason = "manual") => runSerial("api.commitTurn", async () => {
|
|
1579
|
+
await commitTurnInternal(reason);
|
|
1580
|
+
}),
|
|
1581
|
+
complete: async (result) => runSerial("api.complete", async () => {
|
|
1582
|
+
await completeInternal(result);
|
|
1583
|
+
}),
|
|
1584
|
+
connect: async (nextSocket) => runSerial("api.connect", async () => {
|
|
1585
|
+
await connectInternal(nextSocket);
|
|
1586
|
+
}),
|
|
1587
|
+
disconnect: async (event) => runSerial("api.disconnect", async () => {
|
|
1588
|
+
await disconnectInternal(event);
|
|
1589
|
+
}),
|
|
1590
|
+
fail: async (error) => runSerial("api.fail", async () => {
|
|
1591
|
+
await failInternal(error);
|
|
1592
|
+
}),
|
|
1593
|
+
receiveAudio: async (audio) => runSerial("api.receiveAudio", async () => {
|
|
1594
|
+
await receiveAudioInternal(audio);
|
|
1595
|
+
}),
|
|
1596
|
+
snapshot: async () => runSerial("api.snapshot", async () => readSession())
|
|
679
1597
|
};
|
|
680
1598
|
return api;
|
|
681
1599
|
};
|
|
682
1600
|
|
|
683
1601
|
// src/plugin.ts
|
|
1602
|
+
var resolveQueryScenario = (query) => {
|
|
1603
|
+
if (typeof query?.scenarioId === "string" && query.scenarioId.trim()) {
|
|
1604
|
+
return query.scenarioId.trim();
|
|
1605
|
+
}
|
|
1606
|
+
if (typeof query?.mode === "string" && query.mode.trim()) {
|
|
1607
|
+
return query.mode.trim();
|
|
1608
|
+
}
|
|
1609
|
+
return null;
|
|
1610
|
+
};
|
|
684
1611
|
var HTMX_BOOTSTRAP_DIST_CANDIDATES = [
|
|
685
1612
|
resolve(import.meta.dir, "client", "htmxBootstrap.js"),
|
|
686
1613
|
resolve(import.meta.dir, "..", "dist", "client", "htmxBootstrap.js")
|
|
@@ -727,6 +1654,21 @@ ${log}` : ""}`);
|
|
|
727
1654
|
};
|
|
728
1655
|
})();
|
|
729
1656
|
// Reports whether `value` is a typed array or DataView; null and primitives are rejected up front.
var isArrayBufferView = (value) => {
  if (value === null || typeof value !== "object") {
    return false;
  }
  return ArrayBuffer.isView(value);
};
|
|
1657
|
+
/**
 * Expands a partial STT fallback configuration into a fully populated one.
 *
 * @param {object | null | undefined} config - Caller-supplied fallback options.
 * @returns {object | undefined} `undefined` when no config is given; otherwise the
 *   adapter plus every tunable, with nullish tunables replaced by their defaults.
 */
var resolveSTTFallbackConfig = (config) => {
  if (!config) {
    return undefined;
  }
  // Defaults are applied with `??`, so explicit falsy values such as 0 are kept.
  const defaults = {
    completionTimeoutMs: 2500,
    confidenceThreshold: 0.6,
    maxAttemptsPerTurn: 1,
    minTextLength: 2,
    replayWindowMs: 8000,
    settleMs: 220,
    trigger: "empty-or-low-confidence"
  };
  const resolved = { adapter: config.adapter };
  for (const [option, fallback] of Object.entries(defaults)) {
    resolved[option] = config[option] ?? fallback;
  }
  return resolved;
};
|
|
730
1672
|
var isVoiceClientMessage = (value) => {
|
|
731
1673
|
if (!value || typeof value !== "object" || !("type" in value)) {
|
|
732
1674
|
return false;
|
|
@@ -739,7 +1681,7 @@ var isVoiceClientMessage = (value) => {
|
|
|
739
1681
|
case "ping":
|
|
740
1682
|
return true;
|
|
741
1683
|
case "start":
|
|
742
|
-
return !("sessionId" in value) || typeof value.sessionId === "string";
|
|
1684
|
+
return (!("sessionId" in value) || typeof value.sessionId === "string") && (!("scenarioId" in value) || typeof value.scenarioId === "string");
|
|
743
1685
|
default:
|
|
744
1686
|
return false;
|
|
745
1687
|
}
|
|
@@ -759,14 +1701,16 @@ var parseClientMessage = (raw) => {
|
|
|
759
1701
|
return null;
|
|
760
1702
|
};
|
|
761
1703
|
/**
 * Resolves the session identity bound to a WebSocket and caches it on the runtime.
 *
 * State already recorded for the socket takes precedence over the handshake
 * query. The query never changes for the lifetime of a connection, whereas the
 * cached state is updated when a `start` message switches the session or
 * scenario; letting the query win would silently undo such a switch on the
 * very next message.
 *
 * @param {{ socketSessions: WeakMap<object, { sessionId: string, scenarioId: string | null }> }} runtime
 *   Plugin runtime holding the per-socket state.
 * @param {object} ws - The socket; `ws.data.query` is read when present.
 * @returns {{ sessionId: string, scenarioId: string | null }} The record now stored for `ws`.
 */
var resolveSessionId = (runtime, ws) => {
  const query = ws.data && typeof ws.data === "object" && "query" in ws.data ? ws.data.query : undefined;
  const existing = runtime.socketSessions.get(ws);
  const queriedSessionId = typeof query?.sessionId === "string" ? query.sessionId.trim() : "";
  // First resolution for a socket: fall back to the query, then to a generated id.
  const sessionId = existing?.sessionId ?? (queriedSessionId || createId());
  const scenarioId = existing?.scenarioId ?? resolveQueryScenario(query) ?? null;
  const resolved = {
    sessionId,
    scenarioId
  };
  runtime.socketSessions.set(ws, resolved);
  return resolved;
};
|
|
771
1715
|
var toAudioChunk = (raw) => {
|
|
772
1716
|
if (raw instanceof ArrayBuffer) {
|
|
@@ -792,6 +1736,38 @@ var normalizeOnTurn = (handler) => {
|
|
|
792
1736
|
}
|
|
793
1737
|
return handler;
|
|
794
1738
|
};
|
|
1739
|
+
/**
 * Derives the per-session options shared by every session the plugin creates.
 *
 * Explicit config values override the resolved runtime preset; reconnect
 * settings fall back to fixed defaults.
 *
 * @param {object} config - Plugin configuration.
 * @returns {object} Resolved audioConditioning, sttFallback, logger, reconnect,
 *   sttLifecycle and turnDetection settings.
 */
var resolveSessionOptions = (config) => {
  const preset = resolveVoiceRuntimePreset(config.preset);
  // Only an omitted (undefined) audioConditioning defers to the preset.
  const audioConditioning = config.audioConditioning === undefined
    ? preset.audioConditioning
    : resolveAudioConditioningConfig(config.audioConditioning);
  const sttFallback = resolveSTTFallbackConfig(config.sttFallback);
  const { maxAttempts, strategy, timeout } = config.reconnect ?? {};
  const reconnect = {
    maxAttempts: maxAttempts ?? 10,
    strategy: strategy ?? "resume-last-turn",
    timeout: timeout ?? 30000
  };
  const sttLifecycle = config.sttLifecycle ?? preset.sttLifecycle;
  // Config keys are spread last so they win over the preset's turn detection.
  const turnDetection = resolveTurnDetectionConfig({
    ...preset.turnDetection,
    ...config.turnDetection
  });
  return {
    audioConditioning,
    sttFallback,
    logger: config.logger,
    reconnect,
    sttLifecycle,
    turnDetection
  };
};
|
|
1757
|
+
/**
 * Cleans a list of phrase hints before they are handed to a session.
 *
 * Hints may come from a user-supplied callback, so malformed entries (not an
 * object, or without a string `text`) are dropped instead of raising a
 * TypeError. Surviving hints have `text` trimmed and `aliases` reduced to
 * non-blank strings; hints whose trimmed text is empty are removed.
 *
 * @param {Array<object> | null | undefined} hints - Raw hints; nullish means none.
 * @returns {Array<object>} Normalized hints. `aliases` is `undefined` when the
 *   input hint had no alias array.
 */
var normalizePhraseHints = (hints) => (hints ?? [])
  .filter((hint) => hint !== null && typeof hint === "object" && typeof hint.text === "string")
  .map((hint) => ({
    ...hint,
    aliases: Array.isArray(hint.aliases)
      ? hint.aliases.filter((value) => typeof value === "string" && value.trim().length > 0)
      : undefined,
    text: hint.text.trim()
  }))
  .filter((hint) => hint.text.length > 0);
|
|
1762
|
+
/**
 * Produces the normalized phrase hints for a session.
 *
 * `config.phraseHints` may be absent (no hints), a static list, or a function
 * that is invoked with `input` and may return the list asynchronously.
 *
 * @param {object} config - Plugin configuration.
 * @param {object} input - Argument forwarded to a `phraseHints` function.
 * @returns {Promise<Array<object>>} The normalized hints.
 */
var resolvePhraseHints = async (config, input) => {
  if (!config.phraseHints) {
    return [];
  }
  const isProvider = typeof config.phraseHints === "function";
  // Invoked as a method of `config`, so the provider keeps its `this` binding.
  const rawHints = isProvider ? await config.phraseHints(input) : config.phraseHints;
  return normalizePhraseHints(rawHints);
};
|
|
795
1771
|
var voice = (config) => {
|
|
796
1772
|
const runtime = {
|
|
797
1773
|
activeSessions: new Map,
|
|
@@ -799,11 +1775,42 @@ var voice = (config) => {
|
|
|
799
1775
|
socketSessions: new WeakMap
|
|
800
1776
|
};
|
|
801
1777
|
const onTurn = normalizeOnTurn(config.onTurn);
|
|
1778
|
+
const sessionOptions = resolveSessionOptions(config);
|
|
802
1779
|
const htmxOptions = config.htmx && typeof config.htmx === "object" ? config.htmx : undefined;
|
|
803
1780
|
const htmxRoute = htmxOptions?.route ?? `${config.path}/htmx/session`;
|
|
804
1781
|
const htmxBootstrapRoute = htmxOptions?.bootstrapRoute ?? `${config.path}/htmx/bootstrap.js`;
|
|
805
1782
|
const htmxRenderers = resolveVoiceHTMXRenderers(config.htmx && config.htmx !== true ? config.htmx : undefined);
|
|
806
1783
|
const htmxTargets = resolveVoiceHTMXTargets(htmxOptions?.targets);
|
|
1784
|
+
const createManagedSession = async (ws, sessionId, scenarioId) => {
|
|
1785
|
+
const context = ws.data;
|
|
1786
|
+
const phraseHints = await resolvePhraseHints(config, {
|
|
1787
|
+
context,
|
|
1788
|
+
scenarioId,
|
|
1789
|
+
sessionId
|
|
1790
|
+
});
|
|
1791
|
+
return createVoiceSession({
|
|
1792
|
+
audioConditioning: sessionOptions.audioConditioning,
|
|
1793
|
+
context,
|
|
1794
|
+
id: sessionId,
|
|
1795
|
+
logger: sessionOptions.logger,
|
|
1796
|
+
phraseHints,
|
|
1797
|
+
reconnect: sessionOptions.reconnect,
|
|
1798
|
+
route: {
|
|
1799
|
+
correctTurn: config.correctTurn,
|
|
1800
|
+
onComplete: config.onComplete,
|
|
1801
|
+
onError: config.onError,
|
|
1802
|
+
onSession: config.onSession,
|
|
1803
|
+
onTurn
|
|
1804
|
+
},
|
|
1805
|
+
scenarioId,
|
|
1806
|
+
socket: createSocketAdapter(ws),
|
|
1807
|
+
store: config.session,
|
|
1808
|
+
stt: config.stt,
|
|
1809
|
+
sttFallback: sessionOptions.sttFallback,
|
|
1810
|
+
sttLifecycle: sessionOptions.sttLifecycle,
|
|
1811
|
+
turnDetection: sessionOptions.turnDetection
|
|
1812
|
+
});
|
|
1813
|
+
};
|
|
807
1814
|
const htmxRoutes = () => {
|
|
808
1815
|
if (!config.htmx) {
|
|
809
1816
|
return new Elysia;
|
|
@@ -833,12 +1840,12 @@ var voice = (config) => {
|
|
|
833
1840
|
};
|
|
834
1841
|
return new Elysia({ name: "absolutejs-voice" }).ws(config.path, {
|
|
835
1842
|
close: async (ws, code, reason) => {
|
|
836
|
-
const
|
|
837
|
-
if (!
|
|
1843
|
+
const socketState = runtime.socketSessions.get(ws);
|
|
1844
|
+
if (!socketState) {
|
|
838
1845
|
return;
|
|
839
1846
|
}
|
|
840
|
-
const session = runtime.activeSessions.get(sessionId);
|
|
841
|
-
runtime.activeSessions.delete(sessionId);
|
|
1847
|
+
const session = runtime.activeSessions.get(socketState.sessionId);
|
|
1848
|
+
runtime.activeSessions.delete(socketState.sessionId);
|
|
842
1849
|
if (session) {
|
|
843
1850
|
await session.disconnect({
|
|
844
1851
|
code,
|
|
@@ -849,8 +1856,8 @@ var voice = (config) => {
|
|
|
849
1856
|
}
|
|
850
1857
|
},
|
|
851
1858
|
message: async (ws, raw) => {
|
|
852
|
-
const
|
|
853
|
-
const current = runtime.activeSessions.get(sessionId);
|
|
1859
|
+
const sessionState = resolveSessionId(runtime, ws);
|
|
1860
|
+
const current = runtime.activeSessions.get(sessionState.sessionId);
|
|
854
1861
|
const message = parseClientMessage(raw);
|
|
855
1862
|
if (message) {
|
|
856
1863
|
if (message.type === "ping") {
|
|
@@ -861,10 +1868,27 @@ var voice = (config) => {
|
|
|
861
1868
|
}
|
|
862
1869
|
if (message.type === "close" && current) {
|
|
863
1870
|
await current.close(message.reason);
|
|
864
|
-
runtime.activeSessions.delete(sessionId);
|
|
1871
|
+
runtime.activeSessions.delete(sessionState.sessionId);
|
|
1872
|
+
}
|
|
1873
|
+
if (message.type === "start" && message.sessionId && message.sessionId !== sessionState.sessionId) {
|
|
1874
|
+
const currentSession = runtime.activeSessions.get(sessionState.sessionId);
|
|
1875
|
+
if (currentSession) {
|
|
1876
|
+
await currentSession.close("session-switch");
|
|
1877
|
+
runtime.activeSessions.delete(sessionState.sessionId);
|
|
1878
|
+
}
|
|
1879
|
+
sessionState.sessionId = message.sessionId;
|
|
1880
|
+
runtime.socketSessions.set(ws, {
|
|
1881
|
+
...sessionState,
|
|
1882
|
+
sessionId: message.sessionId,
|
|
1883
|
+
scenarioId: sessionState.scenarioId
|
|
1884
|
+
});
|
|
865
1885
|
}
|
|
866
|
-
if (message.type === "start" && message.
|
|
867
|
-
|
|
1886
|
+
if (message.type === "start" && message.scenarioId) {
|
|
1887
|
+
sessionState.scenarioId = message.scenarioId;
|
|
1888
|
+
runtime.socketSessions.set(ws, {
|
|
1889
|
+
...sessionState,
|
|
1890
|
+
scenarioId: message.scenarioId
|
|
1891
|
+
});
|
|
868
1892
|
}
|
|
869
1893
|
return;
|
|
870
1894
|
}
|
|
@@ -872,66 +1896,22 @@ var voice = (config) => {
|
|
|
872
1896
|
if (!audio) {
|
|
873
1897
|
return;
|
|
874
1898
|
}
|
|
875
|
-
const session = current ??
|
|
876
|
-
context: ws.data,
|
|
877
|
-
id: sessionId,
|
|
878
|
-
logger: config.logger,
|
|
879
|
-
reconnect: {
|
|
880
|
-
maxAttempts: config.reconnect?.maxAttempts ?? 10,
|
|
881
|
-
strategy: config.reconnect?.strategy ?? "resume-last-turn",
|
|
882
|
-
timeout: config.reconnect?.timeout ?? 30000
|
|
883
|
-
},
|
|
884
|
-
route: {
|
|
885
|
-
onComplete: config.onComplete,
|
|
886
|
-
onError: config.onError,
|
|
887
|
-
onSession: config.onSession,
|
|
888
|
-
onTurn
|
|
889
|
-
},
|
|
890
|
-
socket: createSocketAdapter(ws),
|
|
891
|
-
store: config.session,
|
|
892
|
-
stt: config.stt,
|
|
893
|
-
turnDetection: {
|
|
894
|
-
silenceMs: config.turnDetection?.silenceMs ?? 700,
|
|
895
|
-
speechThreshold: config.turnDetection?.speechThreshold ?? 0.015
|
|
896
|
-
}
|
|
897
|
-
});
|
|
1899
|
+
const session = current ?? await createManagedSession(ws, sessionState.sessionId, sessionState.scenarioId ?? undefined);
|
|
898
1900
|
if (!current) {
|
|
899
|
-
runtime.activeSessions.set(sessionId, session);
|
|
1901
|
+
runtime.activeSessions.set(sessionState.sessionId, session);
|
|
900
1902
|
await session.connect(createSocketAdapter(ws));
|
|
901
1903
|
}
|
|
902
1904
|
await session.receiveAudio(audio);
|
|
903
1905
|
},
|
|
904
1906
|
open: async (ws) => {
|
|
905
|
-
const
|
|
906
|
-
const existing = runtime.activeSessions.get(sessionId);
|
|
1907
|
+
const sessionState = resolveSessionId(runtime, ws);
|
|
1908
|
+
const existing = runtime.activeSessions.get(sessionState.sessionId);
|
|
907
1909
|
if (existing) {
|
|
908
1910
|
await existing.close("superseded");
|
|
909
|
-
runtime.activeSessions.delete(sessionId);
|
|
1911
|
+
runtime.activeSessions.delete(sessionState.sessionId);
|
|
910
1912
|
}
|
|
911
|
-
const session =
|
|
912
|
-
|
|
913
|
-
id: sessionId,
|
|
914
|
-
logger: config.logger,
|
|
915
|
-
reconnect: {
|
|
916
|
-
maxAttempts: config.reconnect?.maxAttempts ?? 10,
|
|
917
|
-
strategy: config.reconnect?.strategy ?? "resume-last-turn",
|
|
918
|
-
timeout: config.reconnect?.timeout ?? 30000
|
|
919
|
-
},
|
|
920
|
-
route: {
|
|
921
|
-
onComplete: config.onComplete,
|
|
922
|
-
onError: config.onError,
|
|
923
|
-
onSession: config.onSession,
|
|
924
|
-
onTurn
|
|
925
|
-
},
|
|
926
|
-
socket: createSocketAdapter(ws),
|
|
927
|
-
store: config.session,
|
|
928
|
-
stt: config.stt,
|
|
929
|
-
turnDetection: {
|
|
930
|
-
silenceMs: config.turnDetection?.silenceMs ?? 700,
|
|
931
|
-
speechThreshold: config.turnDetection?.speechThreshold ?? 0.015
|
|
932
|
-
}
|
|
933
|
-
});
|
|
934
|
-
runtime.activeSessions.set(sessionId, session);
|
|
1913
|
+
const session = await createManagedSession(ws, sessionState.sessionId, sessionState.scenarioId ?? undefined);
|
|
1914
|
+
runtime.activeSessions.set(sessionState.sessionId, session);
|
|
935
1915
|
await session.connect(createSocketAdapter(ws));
|
|
936
1916
|
}
|
|
937
1917
|
}).use(htmxRoutes());
|
|
@@ -957,10 +1937,61 @@ var createVoiceMemoryStore = () => {
|
|
|
957
1937
|
};
|
|
958
1938
|
return { get, getOrCreate, list, remove, set };
|
|
959
1939
|
};
|
|
1940
|
+
// src/correction.ts
|
|
1941
|
+
// Backslash-escapes every regular-expression metacharacter so `value` can be embedded in a pattern literally.
var escapeRegExp = (value) => {
  const metacharacters = /[.*+?^${}()|[\]\\]/g;
  return value.replace(metacharacters, "\\$&");
};
|
|
1942
|
+
/**
 * Returns a hint's usable aliases, trimmed and ordered longest first.
 *
 * Non-string and blank aliases are skipped rather than raising, so unnormalized
 * hints are tolerated. Aliases that repeat an earlier one (ignoring case) are
 * skipped too, so the same correction is not attempted twice.
 *
 * @param {{ aliases?: unknown } | null | undefined} hint - Phrase hint to read aliases from.
 * @returns {string[]} Unique trimmed aliases sorted by descending length.
 */
var normalizeHintAliases = (hint) => {
  const seen = new Set();
  const aliases = [];
  for (const candidate of Array.isArray(hint?.aliases) ? hint.aliases : []) {
    if (typeof candidate !== "string") {
      continue;
    }
    const alias = candidate.trim();
    const key = alias.toLowerCase();
    if (alias.length === 0 || seen.has(key)) {
      continue;
    }
    seen.add(key);
    aliases.push(alias);
  }
  // Longest first so a longer alias is tried before any shorter alias it contains.
  return aliases.sort((left, right) => right.length - left.length);
};
|
|
1943
|
+
/**
 * Rewrites occurrences of each hint's aliases in `text` with the hint's canonical text.
 *
 * Matching is case-insensitive and limited to whole tokens: an alias is not
 * matched when the character directly before or after it is a word character
 * (`\w`). Lookarounds are used instead of `\b` because `\b` can never match
 * next to an alias that itself starts or ends with punctuation (e.g. "c++").
 *
 * @param {string} text - Transcript text to correct.
 * @param {Array<{ text: string, aliases?: string[] }> | null | undefined} phraseHints
 *   Hints to apply, in order; nullish means none.
 * @returns {{ changed: boolean, matches: Array<{ alias: string, hint: object }>, text: string }}
 *   The corrected text, whether it differs from the input, and one entry per alias that matched.
 */
var applyPhraseHintCorrections = (text, phraseHints) => {
  let corrected = text;
  const matches = [];
  for (const hint of phraseHints ?? []) {
    for (const alias of normalizeHintAliases(hint)) {
      const matcher = new RegExp(`(?<!\\w)${escapeRegExp(alias)}(?!\\w)`, "gi");
      let replaced = false;
      // A replacer function inserts hint.text verbatim; a replacement *string*
      // would interpret `$$`, `$&`, `$1`, ... inside the hint text.
      corrected = corrected.replace(matcher, () => {
        replaced = true;
        return hint.text;
      });
      if (replaced) {
        matches.push({
          alias,
          hint
        });
      }
    }
  }
  return {
    changed: corrected !== text,
    matches,
    text: corrected
  };
};
|
|
1965
|
+
/**
 * Builds a turn-correction handler backed by phrase-hint alias replacement.
 *
 * @param {{ provider?: string, reason?: string }} [options] - Labels attached to
 *   each correction; nullish values fall back to "@absolutejs/voice" and
 *   "phrase-hint-correction".
 * @returns {(input: { phraseHints: Array<object>, text: string }) => Promise<object | undefined>}
 *   Async handler resolving to `undefined` when the text is unchanged, otherwise
 *   to `{ metadata, provider, reason, text }`.
 */
var createPhraseHintCorrectionHandler = (options = {}) => {
  const correctionProvider = options.provider ?? "@absolutejs/voice";
  const correctionReason = options.reason ?? "phrase-hint-correction";
  // Summarizes which aliases and hints fired; undefined when nothing matched.
  const describeMatches = (matches) => {
    if (matches.length === 0) {
      return undefined;
    }
    const matchedAliases = [];
    const matchedHints = [];
    for (const { alias, hint } of matches) {
      matchedAliases.push(alias);
      matchedHints.push(hint.text);
    }
    return { matchedAliases, matchedHints };
  };
  return async ({ phraseHints, text }) => {
    const outcome = applyPhraseHintCorrections(text, phraseHints);
    if (outcome.changed) {
      return {
        metadata: describeMatches(outcome.matches),
        provider: correctionProvider,
        reason: correctionReason,
        text: outcome.text
      };
    }
    return undefined;
  };
};
|
|
960
1984
|
export {
|
|
961
1985
|
voice,
|
|
1986
|
+
resolveVoiceRuntimePreset,
|
|
1987
|
+
resolveTurnDetectionConfig,
|
|
1988
|
+
resolveAudioConditioningConfig,
|
|
962
1989
|
createVoiceSessionRecord,
|
|
963
1990
|
createVoiceSession,
|
|
964
1991
|
createVoiceMemoryStore,
|
|
965
|
-
|
|
1992
|
+
createPhraseHintCorrectionHandler,
|
|
1993
|
+
createId,
|
|
1994
|
+
conditionAudioChunk,
|
|
1995
|
+
applyPhraseHintCorrections,
|
|
1996
|
+
TURN_PROFILE_DEFAULTS
|
|
966
1997
|
};
|