@absolutejs/voice 0.0.19 → 0.0.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +387 -4
- package/dist/angular/index.d.ts +1 -0
- package/dist/angular/index.js +669 -3
- package/dist/angular/voice-controller.service.d.ts +21 -0
- package/dist/audioConditioning.d.ts +3 -0
- package/dist/client/actions.d.ts +7 -0
- package/dist/client/connection.d.ts +5 -0
- package/dist/client/controller.d.ts +2 -0
- package/dist/client/htmxBootstrap.js +576 -167
- package/dist/client/index.d.ts +1 -0
- package/dist/client/index.js +486 -3
- package/dist/client/microphone.d.ts +4 -2
- package/dist/correction.d.ts +16 -0
- package/dist/index.d.ts +4 -0
- package/dist/index.js +1314 -283
- package/dist/presets.d.ts +13 -0
- package/dist/react/index.d.ts +1 -0
- package/dist/react/index.js +642 -3
- package/dist/react/useVoiceController.d.ts +20 -0
- package/dist/react/useVoiceStream.d.ts +1 -0
- package/dist/store.d.ts +2 -2
- package/dist/svelte/index.d.ts +1 -0
- package/dist/svelte/index.js +607 -3
- package/dist/testing/benchmark.d.ts +36 -0
- package/dist/testing/fixtures.d.ts +1 -0
- package/dist/testing/index.d.ts +2 -0
- package/dist/testing/index.js +1975 -4
- package/dist/testing/resilience.d.ts +20 -0
- package/dist/testing/sessionBenchmark.d.ts +126 -0
- package/dist/testing/stt.d.ts +1 -0
- package/dist/turnDetection.d.ts +5 -1
- package/dist/turnProfiles.d.ts +6 -0
- package/dist/types.d.ts +198 -8
- package/dist/vue/index.d.ts +1 -0
- package/dist/vue/index.js +660 -3
- package/dist/vue/useVoiceController.d.ts +19 -0
- package/fixtures/README.md +24 -0
- package/fixtures/manifest.json +127 -0
- package/fixtures/pcm/dialogue-three-clean.pcm +0 -0
- package/fixtures/pcm/dialogue-three-mixed.pcm +0 -0
- package/fixtures/pcm/dialogue-two-clean.pcm +0 -0
- package/fixtures/pcm/dialogue-two-noisy.pcm +0 -0
- package/fixtures/pcm/multiturn-three-mixed.pcm +0 -0
- package/fixtures/pcm/multiturn-two-clean.pcm +0 -0
- package/fixtures/pcm/stella-bulgaria-bulgarian20.pcm +0 -0
- package/fixtures/pcm/stella-jamaica-jamaican-creole-english1.pcm +0 -0
- package/fixtures/pcm/stella-liberia-liberian-pidgin-english2.pcm +0 -0
- package/fixtures/pcm/stella-sierra-leone-krio5.pcm +0 -0
- package/package.json +25 -1
package/dist/testing/index.js
CHANGED
|
@@ -95,6 +95,61 @@ var measureAudioLevel = (audio) => {
|
|
|
95
95
|
return Math.sqrt(sumSquares / samples.length);
|
|
96
96
|
};
|
|
97
97
|
/**
 * Strip surrounding whitespace and collapse every internal whitespace run
 * into a single space.
 */
var normalizeText = function (value) {
  const tokens = value.split(/\s+/).filter(Boolean);
  return tokens.join(" ");
};
|
|
98
|
+
/**
 * Count the whitespace-delimited words in `value`.
 *
 * Splits on any whitespace run and drops empty tokens, so leading, trailing,
 * or repeated whitespace no longer inflates the count. Returns 0 for an empty
 * or whitespace-only string.
 */
var countWords = (value) => value.split(/\s+/).filter(Boolean).length;
|
|
99
|
+
/**
 * Pick one of two transcript texts to represent a turn.
 *
 * Both inputs are normalized first. An empty side loses to a non-empty side.
 * When one text contains the other (substring match), the containing text
 * wins. Otherwise the text with strictly more words wins, and ties keep the
 * current text.
 *
 * @param currentText text accumulated so far
 * @param nextText candidate replacement text
 * @returns the normalized text that was selected
 */
var selectPreferredTranscriptText = (currentText, nextText) => {
  const current = normalizeText(currentText);
  const next = normalizeText(nextText);
  if (!current) {
    return next;
  }
  if (!next) {
    return current;
  }
  // Equality is covered by `includes`, but is kept explicit for readability.
  if (current === next || current.includes(next)) {
    return current;
  }
  if (next.includes(current)) {
    return next;
  }
  return countWords(next) > countWords(current) ? next : current;
};
|
|
119
|
+
/**
 * Append `nextText` to `currentText`, removing a duplicated seam.
 *
 * Both inputs are normalized. The longest run of words that is both a suffix
 * of the current text and a prefix of the next text is emitted only once.
 * With no overlap the two texts are joined with a single space.
 *
 * @param currentText text that comes first
 * @param nextText text that follows
 * @returns the merged, normalized text
 */
var mergeSequentialTranscriptText = (currentText, nextText) => {
  const current = normalizeText(currentText);
  const next = normalizeText(nextText);
  if (!current) {
    return next;
  }
  if (!next) {
    return current;
  }
  const currentWords = current.split(" ");
  const nextWords = next.split(" ");
  const maxOverlap = Math.min(currentWords.length, nextWords.length);
  // Try the largest overlap first so the longest shared seam is collapsed.
  for (let overlap = maxOverlap; overlap > 0; overlap -= 1) {
    const suffixStart = currentWords.length - overlap;
    const seamMatches = nextWords
      .slice(0, overlap)
      .every((word, index) => word === currentWords[suffixStart + index]);
    if (seamMatches) {
      return [...currentWords, ...nextWords.slice(overlap)].join(" ");
    }
  }
  return `${current} ${next}`.trim();
};
|
|
140
|
+
/**
 * Count how many leading words two texts share.
 *
 * Both texts are normalized and split on spaces; comparison is exact
 * (case-sensitive) and stops at the first differing word.
 *
 * @returns number of identical leading words (0 when either text is empty)
 */
var countCommonPrefixWords = (currentText, nextText) => {
  const currentWords = normalizeText(currentText).split(" ").filter(Boolean);
  const nextWords = normalizeText(nextText).split(" ").filter(Boolean);
  const maxWords = Math.min(currentWords.length, nextWords.length);
  let count = 0;
  while (count < maxWords && currentWords[count] === nextWords[count]) {
    count += 1;
  }
  return count;
};
|
|
98
153
|
var mergeTranscriptTexts = (transcripts) => {
|
|
99
154
|
const merged = [];
|
|
100
155
|
for (const transcript of transcripts) {
|
|
@@ -118,12 +173,14 @@ var mergeTranscriptTexts = (transcripts) => {
|
|
|
118
173
|
}
|
|
119
174
|
return merged.join(" ").trim();
|
|
120
175
|
};
|
|
121
|
-
/**
 * Build the text of the current turn from its final transcripts and the
 * latest partial transcript.
 *
 * When the partial started at least 250 ms after the last final transcript
 * that carries a numeric `endedAtMs`, and shares no leading words with the
 * merged final text, it is treated as a continuation and appended.
 * In every other case the result is whichever of the two texts
 * `selectPreferredTranscriptText` picks.
 *
 * @param transcripts final transcripts of the turn
 * @param partialText latest partial transcript text
 * @param options optional timing info; only `partialStartedAtMs` is read here
 * @returns the turn text
 */
var buildTurnText = (transcripts, partialText, options = {}) => {
  // Minimum gap between the last final and the partial for the partial to
  // count as new speech rather than a revision of the final text.
  const minSequentialGapMs = 250;
  const finalText = mergeTranscriptTexts(transcripts);
  const nextPartial = normalizeText(partialText);
  const lastFinalEndedAtMs = [...transcripts]
    .reverse()
    .find((transcript) => typeof transcript.endedAtMs === "number")?.endedAtMs;
  const partialIsContinuation =
    finalText &&
    nextPartial &&
    typeof lastFinalEndedAtMs === "number" &&
    typeof options.partialStartedAtMs === "number" &&
    options.partialStartedAtMs - lastFinalEndedAtMs >= minSequentialGapMs &&
    countCommonPrefixWords(finalText, nextPartial) === 0;
  if (partialIsContinuation) {
    return mergeSequentialTranscriptText(finalText, nextPartial);
  }
  return selectPreferredTranscriptText(finalText, nextPartial);
};
|
|
128
185
|
|
|
129
186
|
// src/testing/accuracy.ts
|
|
@@ -204,6 +261,7 @@ var runSTTAdapterFixture = async (adapter, fixture, options = {}) => {
|
|
|
204
261
|
const settleMs = options.settleMs ?? 500;
|
|
205
262
|
const waitForRealtimeMs = options.waitForRealtimeMs ?? 0;
|
|
206
263
|
let lastActivityAt = Date.now();
|
|
264
|
+
let speechEndedAt = startedAt;
|
|
207
265
|
const markActive = () => {
|
|
208
266
|
lastActivityAt = Date.now();
|
|
209
267
|
};
|
|
@@ -240,12 +298,15 @@ var runSTTAdapterFixture = async (adapter, fixture, options = {}) => {
|
|
|
240
298
|
const realtimeDelayMs = waitForRealtimeMs > 0 ? waitForRealtimeMs : chunkDurationMs;
|
|
241
299
|
for (const chunk of chunks) {
|
|
242
300
|
await session.send(chunk);
|
|
301
|
+
markActive();
|
|
243
302
|
await Bun.sleep(realtimeDelayMs);
|
|
244
303
|
}
|
|
304
|
+
speechEndedAt = Date.now();
|
|
245
305
|
if (tailPaddingMs > 0) {
|
|
246
306
|
const tailBytes = Math.max(2, Math.floor(bytesPerMillisecond * tailPaddingMs));
|
|
247
307
|
for (const chunk of chunkAudio(createSilence(tailBytes), bytesPerChunk)) {
|
|
248
308
|
await session.send(chunk);
|
|
309
|
+
markActive();
|
|
249
310
|
await Bun.sleep(realtimeDelayMs);
|
|
250
311
|
}
|
|
251
312
|
}
|
|
@@ -265,11 +326,30 @@ var runSTTAdapterFixture = async (adapter, fixture, options = {}) => {
|
|
|
265
326
|
finalEvents,
|
|
266
327
|
finalText,
|
|
267
328
|
partialEvents,
|
|
329
|
+
speechEndedAt,
|
|
268
330
|
startedAt
|
|
269
331
|
};
|
|
270
332
|
};
|
|
271
333
|
|
|
272
334
|
// src/testing/benchmark.ts
|
|
335
|
+
/**
 * Classify a fixture into a benchmark group based on its tags.
 *
 * "accent" / "speech-accent-archive" mark an accented fixture;
 * "noisy" / "synthetic-noise" / "stress" mark a noisy one.
 *
 * @param fixture object with an optional `tags` array
 * @returns "accent-noisy", "accent", "noisy", "clean", or "other"
 */
var resolveFixtureEnvironment = (fixture) => {
  const tags = new Set(fixture.tags ?? []);
  const hasAny = (...names) => names.some((name) => tags.has(name));
  const hasAccent = hasAny("accent", "speech-accent-archive");
  const hasNoisy = hasAny("noisy", "synthetic-noise", "stress");
  if (hasAccent) {
    return hasNoisy ? "accent-noisy" : "accent";
  }
  if (hasNoisy) {
    return "noisy";
  }
  return tags.has("clean") ? "clean" : "other";
};
|
|
273
353
|
/**
 * Canonicalize text for benchmark comparison: lower-case it, turn every
 * character that is not a letter, digit, whitespace, or apostrophe into a
 * space, then collapse whitespace runs and trim.
 */
var normalizeBenchmarkText = (value) => {
  const lowered = value.toLowerCase();
  const withoutPunctuation = lowered.replace(/[^\p{L}\p{N}\s']/gu, " ");
  const collapsed = withoutPunctuation.replace(/\s+/g, " ");
  return collapsed.trim();
};
|
|
274
354
|
var scoreExpectedTerms = (actualText, expectedTerms) => {
|
|
275
355
|
const normalizedActual = normalizeBenchmarkText(actualText);
|
|
@@ -300,10 +380,46 @@ var roundMetric = (value, digits = 4) => {
|
|
|
300
380
|
const factor = 10 ** digits;
|
|
301
381
|
return Math.round(value * factor) / factor;
|
|
302
382
|
};
|
|
383
|
+
/**
 * Aggregate per-fixture benchmark results by their `group` field.
 *
 * For each group this reports fixture and pass counts, pass rate, average
 * word error rate, term recall and elapsed time, and how many fixtures had
 * errors or fragmented finals. Summaries are sorted by group name.
 *
 * @param fixtures per-fixture benchmark results
 * @returns one summary object per distinct group
 */
var calculateGroupSummary = (fixtures) => {
  const grouped = new Map();
  for (const fixture of fixtures) {
    const bucket = grouped.get(fixture.group);
    if (bucket) {
      bucket.push(fixture);
    } else {
      grouped.set(fixture.group, [fixture]);
    }
  }
  const summaries = [];
  for (const [group, results] of grouped) {
    const fixtureCount = results.length;
    const passCount = results.filter((fixture) => fixture.passes).length;
    const averageWordErrorRate = average(results.map((result) => result.accuracy.wordErrorRate)) ?? 0;
    const averageTermRecall = average(results.map((result) => result.expectedTerms.recall)) ?? 0;
    const averageElapsedMs = average(results.map((result) => result.elapsedMs));
    summaries.push({
      averageElapsedMs: roundMetric(averageElapsedMs, 2) ?? 0,
      averageTermRecall: roundMetric(averageTermRecall) ?? 0,
      averageWordErrorRate: roundMetric(averageWordErrorRate) ?? 0,
      fixturesWithErrors: results.filter((fixture) => fixture.errorCount > 0).length,
      fixturesWithFragments: results.filter((fixture) => fixture.fragmentationCount > 0).length,
      fixtureCount,
      group,
      passCount,
      passRate: fixtureCount > 0 ? roundMetric(passCount / fixtureCount) ?? 0 : 0,
      // Word accuracy is the complement of the average word error rate.
      wordAccuracyRate: roundMetric(1 - averageWordErrorRate) ?? 0
    });
  }
  return summaries.sort((a, b) => a.group.localeCompare(b.group));
};
|
|
303
411
|
var toFixtureBenchmarkResult = (fixture, result, elapsedMs) => {
|
|
412
|
+
const toPostSpeechLatency = (timestamp) => {
|
|
413
|
+
if (typeof timestamp !== "number") {
|
|
414
|
+
return;
|
|
415
|
+
}
|
|
416
|
+
return Math.max(0, timestamp - result.speechEndedAt);
|
|
417
|
+
};
|
|
304
418
|
const timeToFirstPartialMs = result.partialEvents[0] ? result.partialEvents[0].receivedAt - result.startedAt : undefined;
|
|
305
419
|
const timeToFirstFinalMs = result.finalEvents[0] ? result.finalEvents[0].receivedAt - result.startedAt : undefined;
|
|
306
420
|
const timeToEndOfTurnMs = result.endOfTurnEvents[0] ? result.endOfTurnEvents[0].receivedAt - result.startedAt : undefined;
|
|
421
|
+
const postSpeechTimeToFirstFinalMs = toPostSpeechLatency(result.finalEvents[0]?.receivedAt);
|
|
422
|
+
const postSpeechTimeToEndOfTurnMs = toPostSpeechLatency(result.endOfTurnEvents[0]?.receivedAt);
|
|
307
423
|
const expectedTerms = scoreExpectedTerms(result.finalText, fixture.expectedTerms);
|
|
308
424
|
return {
|
|
309
425
|
accuracy: result.accuracy,
|
|
@@ -317,8 +433,11 @@ var toFixtureBenchmarkResult = (fixture, result, elapsedMs) => {
|
|
|
317
433
|
finalText: result.finalText,
|
|
318
434
|
fixtureId: fixture.id,
|
|
319
435
|
fragmentationCount: Math.max(0, result.finalEvents.length - 1),
|
|
436
|
+
group: resolveFixtureEnvironment(fixture),
|
|
320
437
|
passes: result.errorEvents.length === 0 && result.finalText.trim().length > 0 && result.accuracy.passesThreshold,
|
|
321
438
|
partialCount: result.partialEvents.length,
|
|
439
|
+
postSpeechTimeToEndOfTurnMs,
|
|
440
|
+
postSpeechTimeToFirstFinalMs,
|
|
322
441
|
tags: fixture.tags ?? [],
|
|
323
442
|
timeToEndOfTurnMs,
|
|
324
443
|
timeToFirstFinalMs,
|
|
@@ -336,6 +455,8 @@ var summarizeSTTBenchmark = (adapterId, fixtures) => {
|
|
|
336
455
|
averageEndOfTurnCount: roundMetric(average(fixtures.map((fixture) => fixture.endOfTurnCount)), 2) ?? 0,
|
|
337
456
|
averageFinalCount: roundMetric(average(fixtures.map((fixture) => fixture.finalCount)), 2) ?? 0,
|
|
338
457
|
averageTermRecall: roundMetric(average(fixtures.map((fixture) => fixture.expectedTerms.recall))) ?? 0,
|
|
458
|
+
averagePostSpeechTimeToEndOfTurnMs: roundMetric(average(fixtures.map((fixture) => fixture.postSpeechTimeToEndOfTurnMs)), 2),
|
|
459
|
+
averagePostSpeechTimeToFirstFinalMs: roundMetric(average(fixtures.map((fixture) => fixture.postSpeechTimeToFirstFinalMs)), 2),
|
|
339
460
|
averageTimeToEndOfTurnMs: roundMetric(average(fixtures.map((fixture) => fixture.timeToEndOfTurnMs)), 2),
|
|
340
461
|
averageTimeToFirstFinalMs: roundMetric(average(fixtures.map((fixture) => fixture.timeToFirstFinalMs)), 2),
|
|
341
462
|
averageTimeToFirstPartialMs: roundMetric(average(fixtures.map((fixture) => fixture.timeToFirstPartialMs)), 2),
|
|
@@ -343,12 +464,51 @@ var summarizeSTTBenchmark = (adapterId, fixtures) => {
|
|
|
343
464
|
fixtureCount,
|
|
344
465
|
fixturesWithErrors: fixtures.filter((fixture) => fixture.errorCount > 0).length,
|
|
345
466
|
fixturesWithFragmentation: fixtures.filter((fixture) => fixture.fragmentationCount > 0).length,
|
|
467
|
+
groupSummaries: calculateGroupSummary(fixtures),
|
|
346
468
|
passCount,
|
|
347
469
|
passRate: fixtureCount > 0 ? roundMetric(passCount / fixtureCount) ?? 0 : 0,
|
|
348
470
|
totalErrorCount: fixtures.reduce((sum, fixture) => sum + fixture.errorCount, 0),
|
|
349
471
|
wordAccuracyRate: fixtureCount > 0 ? roundMetric(1 - (average(fixtures.map((fixture) => fixture.accuracy.wordErrorRate)) ?? 0)) ?? 0 : 0
|
|
350
472
|
};
|
|
351
473
|
};
|
|
474
|
+
/**
 * Check a benchmark report against optional acceptance thresholds.
 *
 * Supported thresholds: `overallPassRate`, `termRecall`, `wordAccuracyRate`
 * (all compared against `report.summary`) and `groupPassRate`, a map from
 * group name to `{ passRate?, wordAccuracyRate? }` compared against the
 * matching entry of `report.summary.groupSummaries`. Groups without a
 * configured threshold are ignored. A report without `groupSummaries` is
 * treated as having no groups instead of throwing.
 *
 * The score is a weighted blend: 45% pass rate, 35% word accuracy,
 * 20% term recall, rounded to three decimals.
 *
 * @param report benchmark report with `adapterId` and `summary`
 * @param thresholds acceptance thresholds; every field is optional
 * @returns `{ adapterId, failures, passed, score }`
 */
var evaluateSTTBenchmarkAcceptance = (report, thresholds = {}) => {
  const failures = [];
  const summary = report.summary;
  const toPercent = (rate) => (rate * 100).toFixed(2);

  const overallPassRate = thresholds.overallPassRate;
  if (overallPassRate !== undefined && summary.passRate < overallPassRate) {
    failures.push(`overall passRate ${toPercent(summary.passRate)}% below ${toPercent(overallPassRate)}%`);
  }
  const minTermRecall = thresholds.termRecall;
  if (minTermRecall !== undefined && summary.averageTermRecall < minTermRecall) {
    failures.push(`overall term recall ${summary.averageTermRecall.toFixed(4)} below ${minTermRecall.toFixed(4)}`);
  }
  const minWordAccuracy = thresholds.wordAccuracyRate;
  if (minWordAccuracy !== undefined && summary.wordAccuracyRate < minWordAccuracy) {
    failures.push(`overall word accuracy ${toPercent(summary.wordAccuracyRate)}% below ${toPercent(minWordAccuracy)}%`);
  }

  const groupThresholds = thresholds.groupPassRate;
  if (groupThresholds) {
    // Reports produced before group summaries existed have no such field.
    for (const groupSummary of summary.groupSummaries ?? []) {
      const threshold = groupThresholds[groupSummary.group];
      if (!threshold) {
        continue;
      }
      if (threshold.passRate !== undefined && groupSummary.passRate < threshold.passRate) {
        failures.push(`${groupSummary.group} passRate ${toPercent(groupSummary.passRate)}% below ${toPercent(threshold.passRate)}%`);
      }
      if (threshold.wordAccuracyRate !== undefined && groupSummary.wordAccuracyRate < threshold.wordAccuracyRate) {
        failures.push(`${groupSummary.group} wordAccuracy ${toPercent(groupSummary.wordAccuracyRate)}% below ${toPercent(threshold.wordAccuracyRate)}%`);
      }
    }
  }

  const score = roundMetric(summary.passRate * 0.45 + summary.wordAccuracyRate * 0.35 + summary.averageTermRecall * 0.2, 3) ?? 0;
  return {
    adapterId: report.adapterId,
    failures,
    passed: failures.length === 0,
    score
  };
};
|
|
352
512
|
var compareSTTBenchmarks = (reports) => {
|
|
353
513
|
const entries = reports.map((report) => ({
|
|
354
514
|
adapterId: report.adapterId,
|
|
@@ -434,13 +594,1824 @@ var loadVoiceTestFixtures = async (fixtureDirectory) => {
|
|
|
434
594
|
};
|
|
435
595
|
}));
|
|
436
596
|
};
|
|
597
|
+
// src/store.ts
|
|
598
|
+
var createId = () => crypto.randomUUID();
|
|
599
|
+
/**
 * Create a fresh voice session record.
 *
 * The record starts in status "active" with an empty current turn, no
 * committed turns or transcripts, zero reconnect attempts, and an empty
 * `lastCommittedTurn` marker. `createdAt` is the current wall-clock time.
 *
 * @param id session identifier stored on the record
 * @param scenarioId optional scenario identifier stored as-is
 * @returns a new record; every nested array/object is a fresh instance
 */
var createVoiceSessionRecord = (id, scenarioId) => {
  const currentTurn = {
    finalText: "",
    lastSpeechAt: undefined,
    lastTranscriptAt: undefined,
    partialEndedAt: undefined,
    partialStartedAt: undefined,
    partialText: "",
    silenceStartedAt: undefined,
    transcripts: []
  };
  const lastCommittedTurn = {
    committedAt: 0,
    signature: "",
    text: "",
    transcriptIds: []
  };
  return {
    committedTurnIds: [],
    createdAt: Date.now(),
    currentTurn,
    id,
    scenarioId,
    reconnect: { attempts: 0 },
    status: "active",
    transcripts: [],
    turns: [],
    lastCommittedTurn
  };
};
|
|
625
|
+
/**
 * Build a brand-new session record for `id`, carrying over only the
 * `metadata` of the previous record (if one is given).
 *
 * @param id session identifier
 * @param existing previous record, may be undefined
 * @param scenarioId optional scenario identifier for the new record
 */
var resetVoiceSessionRecord = (id, existing, scenarioId) => {
  const fresh = createVoiceSessionRecord(id, scenarioId);
  return {
    ...fresh,
    metadata: existing?.metadata
  };
};
|
|
629
|
+
/**
 * Project a session record onto its summary: creation time, id, last
 * activity time, status, and the number of turns.
 */
var toVoiceSessionSummary = (session) => {
  const { createdAt, id, lastActivityAt, status, turns } = session;
  return {
    createdAt,
    id,
    lastActivityAt,
    status,
    turnCount: turns.length
  };
};
|
|
636
|
+
|
|
637
|
+
// src/memoryStore.ts
|
|
638
|
+
/**
 * Create an in-memory session store backed by a `Map`.
 *
 * All methods are async and resolve immediately:
 * - `get(id)` resolves to the stored record or undefined
 * - `getOrCreate(id)` stores and returns a new record when none exists
 * - `set(id, value)` stores `value` under `id`
 * - `list()` resolves to session summaries, most recently active first
 *   (falling back to `createdAt` when `lastActivityAt` is unset)
 * - `remove(id)` deletes the record
 */
var createVoiceMemoryStore = () => {
  const sessions = new Map();
  // Timestamp used for ordering: last activity, else creation time.
  const recency = (summary) => summary.lastActivityAt ?? summary.createdAt;

  const get = async (id) => sessions.get(id);
  const getOrCreate = async (id) => {
    const existing = sessions.get(id);
    if (existing) {
      return existing;
    }
    const created = createVoiceSessionRecord(id);
    sessions.set(id, created);
    return created;
  };
  const set = async (id, value) => {
    sessions.set(id, value);
  };
  const list = async () =>
    Array.from(sessions.values())
      .map((session) => toVoiceSessionSummary(session))
      .sort((first, second) => recency(second) - recency(first));
  const remove = async (id) => {
    sessions.delete(id);
  };
  return { get, getOrCreate, list, remove, set };
};
|
|
658
|
+
|
|
659
|
+
// src/audioConditioning.ts
|
|
660
|
+
// Defaults used when an audio-conditioning config omits a field.
// Levels are compared against RMS values of samples scaled by 1/32768.
var DEFAULT_TARGET_LEVEL = 0.08;
// Upper bound on the gain derived from the target level.
var DEFAULT_MAX_GAIN = 3;
// Chunks whose RMS is below this threshold are attenuated.
var DEFAULT_NOISE_GATE_THRESHOLD = 0.006;
// Multiplier applied to chunks that fall below the noise-gate threshold.
var DEFAULT_NOISE_GATE_ATTENUATION = 0.15;
|
|
664
|
+
/**
 * View (or copy) raw audio bytes as 16-bit signed samples in platform byte
 * order. A trailing odd byte is ignored.
 *
 * For an `ArrayBuffer`, or a typed-array view whose `byteOffset` is even,
 * the result shares memory with the input. `Int16Array` cannot be
 * constructed at an odd byte offset (it throws a RangeError), so for such
 * views the relevant bytes are copied into a new, aligned buffer.
 *
 * @param audio `ArrayBuffer` or a typed-array view over audio bytes
 * @returns Int16Array of `floor(byteLength / 2)` samples
 */
var toInt16Array = (audio) => {
  const sampleCount = Math.floor(audio.byteLength / 2);
  if (audio instanceof ArrayBuffer) {
    return new Int16Array(audio, 0, sampleCount);
  }
  if (audio.byteOffset % 2 !== 0) {
    const start = audio.byteOffset;
    return new Int16Array(audio.buffer.slice(start, start + sampleCount * 2));
  }
  return new Int16Array(audio.buffer, audio.byteOffset, sampleCount);
};
|
|
670
|
+
/**
 * Root-mean-square level of 16-bit samples, with each sample scaled by
 * 1/32768 first. Returns 0 for an empty input.
 *
 * @param samples iterable of sample values with a `length` property
 */
var computeRms = (samples) => {
  if (samples.length === 0) {
    return 0;
  }
  let sumSquares = 0;
  for (const sample of samples) {
    const normalized = sample / 32768;
    sumSquares += normalized * normalized;
  }
  return Math.sqrt(sumSquares / samples.length);
};
|
|
681
|
+
/**
 * Resolve a user-supplied audio-conditioning config into a complete one.
 *
 * @param config partial config, or a falsy value
 * @returns undefined when `config` is missing or has `enabled === false`;
 *   otherwise a config with `enabled: true` and every omitted (nullish)
 *   field filled from the module defaults
 */
var resolveAudioConditioningConfig = (config) => {
  if (!config || config.enabled === false) {
    return;
  }
  const {
    maxGain = null,
    noiseGateAttenuation = null,
    noiseGateThreshold = null,
    targetLevel = null
  } = config;
  return {
    enabled: true,
    maxGain: maxGain ?? DEFAULT_MAX_GAIN,
    noiseGateAttenuation: noiseGateAttenuation ?? DEFAULT_NOISE_GATE_ATTENUATION,
    noiseGateThreshold: noiseGateThreshold ?? DEFAULT_NOISE_GATE_THRESHOLD,
    targetLevel: targetLevel ?? DEFAULT_TARGET_LEVEL
  };
};
|
|
693
|
+
/**
 * Apply gain normalization and a simple noise gate to one audio chunk.
 *
 * The chunk is read as 16-bit samples via `toInt16Array`. Its RMS decides
 * whether the gate applies (RMS below `noiseGateThreshold`). The gain aims
 * the (gated) level at `targetLevel`, is capped at `maxGain`, never drops
 * below 0.25, and is then multiplied by the gate factor. Scaled samples are
 * rounded and clamped to the int16 range.
 *
 * @param audio chunk of audio bytes
 * @param config resolved conditioning config, or a falsy value to disable
 * @returns `audio` itself when `config` is falsy or the chunk holds no full
 *   sample; otherwise a new Uint8Array over the conditioned samples
 */
var conditionAudioChunk = (audio, config) => {
  if (!config) {
    return audio;
  }
  const source = toInt16Array(audio);
  if (source.length === 0) {
    return audio;
  }
  const rms = computeRms(source);
  const gateFactor = rms < config.noiseGateThreshold ? config.noiseGateAttenuation : 1;
  // Floor the level so a silent chunk does not divide by zero.
  const baseLevel = Math.max(rms * gateFactor, 0.000001);
  const gain = Math.min(config.maxGain, config.targetLevel / baseLevel);
  const appliedGain = Math.max(0.25, gain) * gateFactor;
  const output = new Int16Array(source.length);
  for (let index = 0; index < source.length; index += 1) {
    const scaled = Math.round(source[index] * appliedGain);
    output[index] = Math.max(-32768, Math.min(32767, scaled));
  }
  return new Uint8Array(output.buffer);
};
|
|
713
|
+
|
|
714
|
+
// src/logger.ts
|
|
715
|
+
/** Function that does nothing; shared by every no-op logger level. */
var noop = () => {};

/**
 * Create a logger whose `debug`, `error`, `info` and `warn` methods all
 * discard their arguments.
 */
var createNoopLogger = () => ({
  debug: noop,
  error: noop,
  info: noop,
  warn: noop
});
|
|
722
|
+
/**
 * Complete a possibly partial logger with no-op methods.
 *
 * Own enumerable properties of `logger` override the no-op defaults, except
 * entries that are `undefined` or `null`: those keep the no-op so that a
 * partial logger such as `{ debug: undefined }` cannot make a later
 * `logger.debug(...)` call throw.
 *
 * @param logger partial logger object, or undefined
 * @returns a new object with at least `debug`, `error`, `info` and `warn`
 */
var resolveLogger = (logger) => {
  const resolved = createNoopLogger();
  for (const [key, value] of Object.entries(logger ?? {})) {
    if (value !== undefined && value !== null) {
      resolved[key] = value;
    }
  }
  return resolved;
};
|
|
726
|
+
|
|
727
|
+
// src/session.ts
|
|
728
|
+
// Session defaults, applied when the corresponding option is omitted.
var DEFAULT_RECONNECT_TIMEOUT = 30000;
var DEFAULT_MAX_RECONNECT_ATTEMPTS = 10;
var DEFAULT_TRANSCRIPT_STABILITY_MS = 450;
// STT fallback defaults.
var DEFAULT_FALLBACK_REPLAY_MS = 8000;
var DEFAULT_FALLBACK_SETTLE_MS = 220;
var DEFAULT_FALLBACK_COMPLETION_TIMEOUT_MS = 2500;
var DEFAULT_FALLBACK_CONFIDENCE_THRESHOLD = 0.6;
var DEFAULT_FALLBACK_MIN_TEXT_LENGTH = 2;
var DEFAULT_FALLBACK_MAX_ATTEMPTS_PER_TURN = 1;
var DEFAULT_DUPLICATE_TURN_WINDOW_MS = 5000;
// Margins used when choosing between the primary and the fallback turn text.
var FALLBACK_CONFIDENCE_SELECTION_DELTA = 0.05;
var FALLBACK_WORD_COUNT_SELECTION_MARGIN_RATIO = 0.12;
|
|
740
|
+
// Default audio format: mono, raw 16-bit little-endian PCM at 16 kHz.
var DEFAULT_FORMAT = {
  channels: 1,
  container: "raw",
  encoding: "pcm_s16le",
  sampleRateHz: 16000
};
|
|
746
|
+
/**
 * Coerce any thrown value into an Error: Error instances are returned
 * unchanged, anything else becomes `new Error(String(value))`.
 */
var toError = (value) => {
  if (value instanceof Error) {
    return value;
  }
  return new Error(String(value));
};
|
|
747
|
+
/**
 * Create the state of a turn that has not received any audio or transcript
 * yet: empty texts, no timestamps, and a fresh empty transcript list.
 */
var createEmptyCurrentTurn = () => {
  const turn = {
    finalText: "",
    lastSpeechAt: undefined,
    lastTranscriptAt: undefined,
    partialEndedAt: undefined,
    partialStartedAt: undefined,
    partialText: "",
    silenceStartedAt: undefined,
    transcripts: []
  };
  return turn;
};
|
|
757
|
+
/** Shallow-copy a transcript object (nested values stay shared). */
var cloneTranscript = function (transcript) {
  const copy = { ...transcript };
  return copy;
};
|
|
758
|
+
/** Count the runs of non-whitespace characters (words) in `text`. */
var countWords2 = (text) => {
  const words = text.match(/\S+/g) ?? [];
  return words.length;
};
|
|
759
|
+
/**
 * Trim `text` and replace every internal whitespace run with one space.
 */
var normalizeText2 = function (text) {
  const words = text.split(/\s+/).filter(Boolean);
  return words.join(" ");
};
|
|
760
|
+
/**
 * Duration in milliseconds of a chunk, computed from its `byteLength` and
 * the default format's sample rate and channel count at 2 bytes per sample.
 */
var getAudioChunkDurationMs = (chunk) => {
  const bytesPerSecond = DEFAULT_FORMAT.sampleRateHz * DEFAULT_FORMAT.channels * 2;
  return chunk.byteLength / bytesPerSecond * 1000;
};
|
|
761
|
+
/** Sum of `getAudioChunkDurationMs` over every chunk; 0 for an empty list. */
var getBufferedAudioDurationMs = (chunks) => {
  let total = 0;
  for (const chunk of chunks) {
    total = total + getAudioChunkDurationMs(chunk);
  }
  return total;
};
|
|
762
|
+
/**
 * Mean of the numeric `confidence` values among `transcripts`.
 * Transcripts without a numeric confidence are skipped.
 *
 * @returns the mean, or 0 when no transcript carries a numeric confidence
 */
var calculateMeanConfidence = (transcripts) => {
  let sum = 0;
  let total = 0;
  for (const { confidence } of transcripts) {
    if (typeof confidence !== "number") {
      continue;
    }
    sum += confidence;
    total += 1;
  }
  return total === 0 ? 0 : sum / total;
};
|
|
776
|
+
/**
 * Assemble the quality summary of a turn.
 *
 * @param transcripts transcripts selected for the turn
 * @param source value stored as `source`
 * @param fallbackUsed value stored as `fallbackUsed`
 * @param fallbackDiagnostics value stored as `fallback`
 * @param correctionDiagnostics value stored as `correction`
 * @returns summary with confidence statistics (`averageConfidence` is
 *   undefined when no transcript has a numeric confidence) and counts of
 *   final, partial and total transcripts
 */
var createTurnQuality = (transcripts, source, fallbackUsed, fallbackDiagnostics, correctionDiagnostics) => {
  const sampledTranscripts = transcripts.filter((transcript) => typeof transcript.confidence === "number");
  const confidenceSampleCount = sampledTranscripts.length;
  let averageConfidence;
  if (confidenceSampleCount > 0) {
    const confidenceSum = sampledTranscripts.reduce((sum, transcript) => sum + transcript.confidence, 0);
    averageConfidence = confidenceSum / confidenceSampleCount;
  }
  const finalTranscriptCount = transcripts.filter((transcript) => transcript.isFinal).length;
  const partialTranscriptCount = transcripts.filter((transcript) => !transcript.isFinal).length;
  return {
    averageConfidence,
    confidenceSampleCount,
    correction: correctionDiagnostics,
    fallback: fallbackDiagnostics,
    fallbackUsed,
    finalTranscriptCount,
    partialTranscriptCount,
    selectedTranscriptCount: transcripts.length,
    source
  };
};
|
|
791
|
+
/** Normalize correction text; delegates directly to `normalizeText2`. */
var normalizeCorrectionText = function (text) {
  return normalizeText2(text);
};
|
|
792
|
+
/**
 * Decide whether the fallback STT pass should run for a turn candidate.
 *
 * Behaviour per `config.trigger`:
 * - "always": always true
 * - "empty-turn": true when the word count is below `config.minTextLength`
 * - "low-confidence": true when the mean confidence is positive and below
 *   `config.confidenceThreshold`
 * - anything else: true when either of the two conditions above holds
 *
 * A mean confidence of 0 (which is also what `calculateMeanConfidence`
 * returns when no transcript has a confidence) never counts as low.
 * NOTE(review): `minTextLength` is compared against a word count, not a
 * character count - confirm that is the intended unit.
 *
 * @param candidate object with `text` and `transcripts`
 * @param config resolved fallback config
 */
var isFallbackNeeded = (candidate, config) => {
  const trimmed = normalizeText2(candidate.text);
  const wordCount = countWords2(trimmed);
  if (config.trigger === "always") {
    return true;
  }
  const tooShort = wordCount < config.minTextLength;
  if (config.trigger === "empty-turn") {
    return tooShort;
  }
  const averageConfidence = calculateMeanConfidence(candidate.transcripts);
  const lowConfidence = averageConfidence > 0 && averageConfidence < config.confidenceThreshold;
  if (config.trigger === "low-confidence") {
    return lowConfidence;
  }
  return lowConfidence || tooShort;
};
|
|
807
|
+
/**
 * Choose between the primary turn candidate and the fallback candidate.
 *
 * Rules, applied in order:
 * 1. empty fallback text -> keep primary ("fallback-empty")
 * 2. empty primary text -> use fallback ("primary-empty")
 * 3. word counts differ by at least the selection margin ratio relative to
 *    the larger count -> the longer text wins ("word-count-margin")
 * 4. fallback confidence exceeds primary by more than the confidence delta
 *    -> fallback ("confidence-margin")
 * 5. primary confidence exceeds fallback by more than the delta -> primary
 *    ("kept-primary")
 * 6. fallback has more words -> fallback ("word-count-tiebreak")
 * 7. otherwise primary ("kept-primary")
 *
 * @param candidate primary candidate with `text`, `wordCount`, `confidence`
 * @param fallback fallback candidate with the same fields
 * @returns `{ reason, winner }`
 */
var selectBetterTurnText = (candidate, fallback) => {
  const decide = (reason, winner) => ({ reason, winner });
  if (!fallback.text) {
    return decide("fallback-empty", candidate);
  }
  if (!candidate.text) {
    return decide("primary-empty", fallback);
  }
  const largestWordCount = Math.max(candidate.wordCount, fallback.wordCount, 1);
  const wordCountDelta = fallback.wordCount - candidate.wordCount;
  const wordCountDeltaRatio = Math.abs(wordCountDelta) / largestWordCount;
  if (wordCountDeltaRatio >= FALLBACK_WORD_COUNT_SELECTION_MARGIN_RATIO && wordCountDelta !== 0) {
    return decide("word-count-margin", wordCountDelta > 0 ? fallback : candidate);
  }
  if (fallback.confidence > candidate.confidence + FALLBACK_CONFIDENCE_SELECTION_DELTA) {
    return decide("confidence-margin", fallback);
  }
  if (candidate.confidence > fallback.confidence + FALLBACK_CONFIDENCE_SELECTION_DELTA) {
    return decide("kept-primary", candidate);
  }
  if (fallback.wordCount > candidate.wordCount) {
    return decide("word-count-tiebreak", fallback);
  }
  return decide("kept-primary", candidate);
};
|
|
852
|
+
/**
 * Update the turn with id `turnId` on `session` in place of the old entry.
 *
 * `session.turns` is reassigned to a new array; the matching turn is
 * replaced by a copy whose `assistantText` and `result` take the values
 * from `input` when those are not nullish, and keep the turn's existing
 * values otherwise. Other turns are carried over unchanged.
 *
 * @param session session record whose `turns` array is replaced
 * @param turnId id of the turn to update
 * @param input object with optional `assistantText` and `result`
 */
var setTurnResult = (session, turnId, input) => {
  const applyResult = (turn) => {
    if (turn.id !== turnId) {
      return turn;
    }
    return {
      ...turn,
      assistantText: input.assistantText ?? turn.assistantText,
      result: input.result ?? turn.result
    };
  };
  session.turns = session.turns.map(applyResult);
};
|
|
859
|
+
var createVoiceSession = (options) => {
|
|
860
|
+
const logger = resolveLogger(options.logger);
|
|
861
|
+
const reconnect = {
|
|
862
|
+
maxAttempts: options.reconnect.maxAttempts ?? DEFAULT_MAX_RECONNECT_ATTEMPTS,
|
|
863
|
+
strategy: options.reconnect.strategy ?? "resume-last-turn",
|
|
864
|
+
timeout: options.reconnect.timeout ?? DEFAULT_RECONNECT_TIMEOUT
|
|
865
|
+
};
|
|
866
|
+
const turnDetection = {
|
|
867
|
+
silenceMs: options.turnDetection.silenceMs ?? DEFAULT_SILENCE_MS,
|
|
868
|
+
speechThreshold: options.turnDetection.speechThreshold ?? DEFAULT_SPEECH_THRESHOLD,
|
|
869
|
+
transcriptStabilityMs: options.turnDetection.transcriptStabilityMs ?? DEFAULT_TRANSCRIPT_STABILITY_MS
|
|
870
|
+
};
|
|
871
|
+
const sttFallback = options.sttFallback ? {
|
|
872
|
+
adapter: options.sttFallback.adapter,
|
|
873
|
+
completionTimeoutMs: options.sttFallback.completionTimeoutMs ?? DEFAULT_FALLBACK_COMPLETION_TIMEOUT_MS,
|
|
874
|
+
confidenceThreshold: options.sttFallback.confidenceThreshold ?? DEFAULT_FALLBACK_CONFIDENCE_THRESHOLD,
|
|
875
|
+
maxAttemptsPerTurn: options.sttFallback.maxAttemptsPerTurn ?? DEFAULT_FALLBACK_MAX_ATTEMPTS_PER_TURN,
|
|
876
|
+
minTextLength: options.sttFallback.minTextLength ?? DEFAULT_FALLBACK_MIN_TEXT_LENGTH,
|
|
877
|
+
replayWindowMs: options.sttFallback.replayWindowMs ?? DEFAULT_FALLBACK_REPLAY_MS,
|
|
878
|
+
settleMs: options.sttFallback.settleMs ?? DEFAULT_FALLBACK_SETTLE_MS,
|
|
879
|
+
trigger: options.sttFallback.trigger ?? "empty-or-low-confidence"
|
|
880
|
+
} : undefined;
|
|
881
|
+
const phraseHints = options.phraseHints ?? [];
|
|
882
|
+
let socket = options.socket;
|
|
883
|
+
let sttSession = null;
|
|
884
|
+
let silenceTimer = null;
|
|
885
|
+
let speechDetected = false;
|
|
886
|
+
let operationQueue = Promise.resolve();
|
|
887
|
+
let adapterGenerationCounter = 0;
|
|
888
|
+
let activeAdapterGeneration = 0;
|
|
889
|
+
const currentTurnAudio = [];
|
|
890
|
+
let fallbackAttemptsForCurrentTurn = 0;
|
|
891
|
+
const pruneTurnAudio = () => {
|
|
892
|
+
const replayWindowMs = sttFallback?.replayWindowMs ?? DEFAULT_FALLBACK_REPLAY_MS;
|
|
893
|
+
const cutoffAt = Date.now() - replayWindowMs;
|
|
894
|
+
let index = 0;
|
|
895
|
+
while (index < currentTurnAudio.length && currentTurnAudio[index].recordedAt < cutoffAt) {
|
|
896
|
+
index += 1;
|
|
897
|
+
}
|
|
898
|
+
if (index > 0) {
|
|
899
|
+
currentTurnAudio.splice(0, index);
|
|
900
|
+
}
|
|
901
|
+
};
|
|
902
|
+
const pushTurnAudio = (audio) => {
|
|
903
|
+
const chunk = audio instanceof ArrayBuffer ? new Uint8Array(audio.slice(0)) : new Uint8Array(audio.buffer.slice(audio.byteOffset, audio.byteOffset + audio.byteLength));
|
|
904
|
+
currentTurnAudio.push({
|
|
905
|
+
chunk,
|
|
906
|
+
recordedAt: Date.now()
|
|
907
|
+
});
|
|
908
|
+
pruneTurnAudio();
|
|
909
|
+
};
|
|
910
|
+
const getFallbackWindowAudio = () => {
|
|
911
|
+
if (!sttFallback?.adapter) {
|
|
912
|
+
return [];
|
|
913
|
+
}
|
|
914
|
+
pruneTurnAudio();
|
|
915
|
+
return currentTurnAudio.map((audio) => audio.chunk);
|
|
916
|
+
};
|
|
917
|
+
const clearSilenceTimer = () => {
|
|
918
|
+
if (!silenceTimer) {
|
|
919
|
+
return;
|
|
920
|
+
}
|
|
921
|
+
clearTimeout(silenceTimer);
|
|
922
|
+
silenceTimer = null;
|
|
923
|
+
};
|
|
924
|
+
const send = async (message) => {
|
|
925
|
+
try {
|
|
926
|
+
await Promise.resolve(socket.send(JSON.stringify(message)));
|
|
927
|
+
} catch (error) {
|
|
928
|
+
logger.warn("voice socket send failed", {
|
|
929
|
+
error: toError(error).message,
|
|
930
|
+
sessionId: options.id,
|
|
931
|
+
type: message.type
|
|
932
|
+
});
|
|
933
|
+
}
|
|
934
|
+
};
|
|
935
|
+
const readSession = async () => options.store.getOrCreate(options.id);
|
|
936
|
+
const writeSession = async (mutate) => {
|
|
937
|
+
const session = await options.store.getOrCreate(options.id);
|
|
938
|
+
mutate(session);
|
|
939
|
+
await options.store.set(options.id, session);
|
|
940
|
+
return session;
|
|
941
|
+
};
|
|
942
|
+
/**
 * Appends an operation to the session's promise queue so operations run one
 * after another.
 *
 * @param phase Label written to the debug log when the operation starts.
 * @param operation Async function to run once earlier operations settle.
 * @returns A promise for the operation's own outcome (it may reject).
 */
const runSerial = (phase, operation) => {
  const task = async () => {
    logger.debug("voice session operation", { phase, sessionId: options.id });
    return await operation();
  };
  // The queue itself never rejects, so one failure cannot block later work.
  const settle = () => {};
  const pending = operationQueue.then(task);
  operationQueue = pending.then(settle, settle);
  return pending;
};
|
|
957
|
+
/**
 * Detaches and closes the active speech-to-text session, if any.
 * The session reference and active generation are cleared before the close
 * is awaited; a failing close is logged, not thrown.
 *
 * @param reason Passed through to the adapter session's `close`.
 */
const closeAdapter = async (reason) => {
  const closing = sttSession;
  if (!closing) {
    return;
  }
  sttSession = null;
  activeAdapterGeneration = 0;
  try {
    await closing.close(reason);
  } catch (cause) {
    logger.warn("voice stt close failed", {
      error: toError(cause).message,
      sessionId: options.id
    });
  }
};
|
|
973
|
+
/**
 * Arms the commit timer so that `api.commitTurn(reason)` runs after `delayMs`.
 *
 * @param delayMs Delay before the commit is attempted.
 * @param reason Commit reason forwarded to `api.commitTurn`.
 * @param reset When true (default) any pending timer is replaced; when false
 *   an already-armed timer is left untouched.
 */
const scheduleTurnCommit = (delayMs, reason, reset = true) => {
  if (!reset && silenceTimer) {
    return;
  }
  if (reset) {
    clearSilenceTimer();
  }
  silenceTimer = setTimeout(() => {
    silenceTimer = null;
    // Nothing awaits a timer callback, so a rejected commit must be handled
    // here or it surfaces as an unhandled promise rejection.
    Promise.resolve(api.commitTurn(reason)).catch((error) => {
      logger.warn("voice scheduled turn commit failed", {
        error: toError(error).message,
        reason,
        sessionId: options.id
      });
    });
  }, delayMs);
};
|
|
985
|
+
/**
 * Schedules a commit with reason "silence".
 *
 * @param delayMs Defaults to the configured `turnDetection.silenceMs`.
 * @param reset Forwarded to `scheduleTurnCommit`.
 */
const scheduleSilenceCommit = (delayMs = turnDetection.silenceMs, reset = true) => {
  return scheduleTurnCommit(delayMs, "silence", reset);
};
|
|
986
|
+
/**
 * Commits the current turn unless there is no text yet or the transcript
 * changed too recently; in the latter case the commit is rescheduled for when
 * the stability window has elapsed. Manual commits skip the stability wait.
 *
 * @param reason Commit reason.
 */
const requestTurnCommit = async (reason) => {
  const { currentTurn } = await readSession();
  const pendingText = buildTurnText(currentTurn.transcripts, currentTurn.partialText, {
    partialEndedAtMs: currentTurn.partialEndedAt,
    partialStartedAtMs: currentTurn.partialStartedAt
  });
  if (!pendingText) {
    return;
  }
  if (reason !== "manual" && currentTurn.lastTranscriptAt !== undefined) {
    const ageMs = Date.now() - currentTurn.lastTranscriptAt;
    if (ageMs < turnDetection.transcriptStabilityMs) {
      scheduleTurnCommit(turnDetection.transcriptStabilityMs - ageMs, reason);
      return;
    }
  }
  await commitTurnInternal(reason);
};
|
|
1002
|
+
/**
 * Moves the session to "failed": persists the status, sends a non-recoverable
 * error message, closes the STT adapter, resets per-turn state and finally
 * invokes the route's optional `onError` hook.
 *
 * @param error Failure cause; normalized through `toError`.
 */
const failInternal = async (error) => {
  clearSilenceTimer();
  const markFailed = (record) => {
    record.lastActivityAt = Date.now();
    record.status = "failed";
  };
  const session = await writeSession(markFailed);
  const resolvedError = toError(error);
  await send({ message: resolvedError.message, recoverable: false, type: "error" });
  await closeAdapter("failed");
  speechDetected = false;
  rewindFallbackTurnAudio();
  await options.route.onError?.({
    api,
    context: options.context,
    error: resolvedError,
    session,
    sessionId: options.id
  });
};
|
|
1025
|
+
/**
 * Marks the session "completed", optionally attaching `result` to the most
 * recent turn, then sends the completion message, closes the adapter, resets
 * per-turn state and calls the route's `onComplete` hook.
 *
 * @param result Optional value stored on the last turn when one exists.
 */
const completeInternal = async (result) => {
  clearSilenceTimer();
  const session = await writeSession((record) => {
    // An already-completed record is left untouched.
    if (record.status === "completed") {
      return;
    }
    record.lastActivityAt = Date.now();
    record.status = "completed";
    const hasTurns = record.turns.length > 0;
    const finalTurn = result !== undefined && hasTurns ? record.turns.at(-1) : undefined;
    if (finalTurn) {
      setTurnResult(record, finalTurn.id, { result });
    }
  });
  await send({ sessionId: options.id, type: "complete" });
  await closeAdapter("complete");
  speechDetected = false;
  rewindFallbackTurnAudio();
  await options.route.onComplete({ api, context: options.context, session });
};
|
|
1055
|
+
/**
 * Relays an adapter error event and fails the session when the event is not
 * recoverable.
 *
 * NOTE(review): for a non-recoverable event an "error" message is sent here
 * and again inside failInternal — confirm the duplicate is intended.
 *
 * @param event Adapter event carrying `error` and `recoverable`.
 */
const handleError = async (event) => {
  const { error, recoverable } = event;
  await send({ message: error.message, recoverable, type: "error" });
  if (recoverable) {
    return;
  }
  await failInternal(error);
};
|
|
1065
|
+
/**
 * Reacts to the adapter closing its stream. An explicit
 * `recoverable === false` fails the session; otherwise the adapter is closed
 * with the reported reason or a default one.
 *
 * @param event Adapter close event with optional `reason` and `recoverable`.
 */
const handleClose = async (event) => {
  const { reason, recoverable } = event;
  if (recoverable === false) {
    await failInternal(new Error(reason ?? "Speech-to-text session closed"));
    return;
  }
  await closeAdapter(reason || "provider stream closed");
};
|
|
1076
|
+
/** Empties the per-turn audio buffer and resets the fallback attempt counter. */
const rewindFallbackTurnAudio = () => {
  currentTurnAudio.length = 0;
  fallbackAttemptsForCurrentTurn = 0;
};
|
|
1080
|
+
/**
 * Replays the buffered turn audio through the fallback STT adapter and picks
 * the better of the primary and fallback transcriptions.
 *
 * An attempt is only counted once there is audio to replay, so an empty
 * buffer no longer uses up the per-turn attempt budget.
 *
 * @param primaryText Text assembled from the primary adapter.
 * @param primaryTranscripts Transcripts produced by the primary adapter.
 * @returns `null` when no fallback ran or it produced no transcripts;
 *   otherwise `{ diagnostics, fallbackUsed, source, text, transcripts }`.
 */
const runFallbackTranscription = async (primaryText, primaryTranscripts) => {
  if (!sttFallback?.adapter || fallbackAttemptsForCurrentTurn >= sttFallback.maxAttemptsPerTurn) {
    return null;
  }
  if (!isFallbackNeeded({ text: primaryText, transcripts: primaryTranscripts }, sttFallback)) {
    return null;
  }
  const replayAudio = getFallbackWindowAudio();
  if (replayAudio.length === 0) {
    return null;
  }
  fallbackAttemptsForCurrentTurn += 1;
  const fallbackTranscripts = [];
  let fallbackSession = null;
  let fallbackClosed = false;
  let fallbackEndOfTurnReceived = false;
  let fallbackFinalReceived = false;
  let lastFallbackTranscriptAt = 0;
  try {
    fallbackSession = await sttFallback.adapter.open({
      format: DEFAULT_FORMAT,
      phraseHints,
      sessionId: `${options.id}:fallback:${fallbackAttemptsForCurrentTurn}`
    });
  } catch (error) {
    logger.warn("voice stt fallback open failed", {
      error: toError(error).message,
      sessionId: options.id
    });
    return null;
  }
  // Both partial and final transcripts are collected and refresh the idle clock.
  const recordTranscript = (transcript) => {
    lastFallbackTranscriptAt = Date.now();
    fallbackTranscripts.push(cloneTranscript(transcript));
  };
  const unsubscribers = [
    fallbackSession.on("final", ({ transcript }) => {
      fallbackFinalReceived = true;
      recordTranscript(transcript);
    }),
    fallbackSession.on("partial", ({ transcript }) => {
      recordTranscript(transcript);
    }),
    fallbackSession.on("endOfTurn", () => {
      fallbackEndOfTurnReceived = true;
    }),
    fallbackSession.on("error", (event) => {
      logger.warn("voice stt fallback error", {
        error: toError(event.error).message,
        sessionId: options.id
      });
    }),
    fallbackSession.on("close", () => {
      fallbackClosed = true;
    })
  ];
  try {
    for (const chunk of replayAudio) {
      await fallbackSession.send(chunk);
    }
    const replayDurationMs = getBufferedAudioDurationMs(replayAudio);
    const scaledTimeoutMs = Math.max(sttFallback.settleMs * 4, Math.round(replayDurationMs * 0.18));
    const completionTimeoutMs = Math.max(sttFallback.completionTimeoutMs, Math.min(4000, scaledTimeoutMs));
    const waitStartedAt = Date.now();
    // Poll until the fallback has gone quiet after a final/endOfTurn/close
    // signal, or until the completion timeout expires.
    while (Date.now() - waitStartedAt < completionTimeoutMs) {
      const idleMs = lastFallbackTranscriptAt > 0 ? Date.now() - lastFallbackTranscriptAt : Date.now() - waitStartedAt;
      const settled = idleMs >= sttFallback.settleMs;
      if ((fallbackEndOfTurnReceived || fallbackFinalReceived) && settled) {
        break;
      }
      if (fallbackClosed && (lastFallbackTranscriptAt === 0 || settled)) {
        break;
      }
      await Bun.sleep(Math.min(75, Math.max(25, sttFallback.settleMs / 2)));
    }
  } catch (error) {
    logger.warn("voice stt fallback failed", {
      error: toError(error).message,
      sessionId: options.id
    });
  } finally {
    if (fallbackSession) {
      try {
        await fallbackSession.close("fallback-complete");
      } catch (error) {
        logger.warn("voice stt fallback close failed", {
          error: toError(error).message,
          sessionId: options.id
        });
      } finally {
        fallbackSession = null;
      }
    }
    for (const unsubscribe of unsubscribers) {
      unsubscribe();
    }
  }
  if (fallbackTranscripts.length === 0) {
    return null;
  }
  const fallbackText = buildTurnText(fallbackTranscripts, "", {});
  const fallbackCandidate = {
    confidence: calculateMeanConfidence(fallbackTranscripts),
    text: fallbackText,
    wordCount: countWords2(normalizeText2(fallbackText))
  };
  const primaryCandidate = {
    confidence: calculateMeanConfidence(primaryTranscripts),
    text: primaryText,
    wordCount: countWords2(normalizeText2(primaryText))
  };
  const selection = selectBetterTurnText(primaryCandidate, fallbackCandidate);
  const diagnostics = {
    attempted: true,
    fallbackConfidence: fallbackCandidate.confidence,
    fallbackText: fallbackCandidate.text,
    fallbackWordCount: fallbackCandidate.wordCount,
    primaryConfidence: primaryCandidate.confidence,
    primaryText,
    primaryWordCount: primaryCandidate.wordCount,
    selected: selection.winner.text === fallbackCandidate.text,
    selectionReason: selection.reason,
    trigger: sttFallback.trigger
  };
  const asFinal = (transcript) => ({ ...transcript, isFinal: true });
  if (selection.winner.text === primaryCandidate.text) {
    return {
      diagnostics,
      fallbackUsed: false,
      source: "primary",
      text: primaryText,
      transcripts: primaryTranscripts.map(asFinal)
    };
  }
  // fallbackTranscripts is non-empty here, so no synthetic transcript is needed.
  return {
    diagnostics,
    fallbackUsed: true,
    source: "fallback",
    text: selection.winner.text,
    transcripts: fallbackTranscripts.map(asFinal)
  };
};
|
|
1236
|
+
/**
 * Collects transcript ids, preferring final transcripts.
 *
 * @param transcripts Array of objects with `id` and `isFinal`.
 * @returns Ids of the final transcripts, or every id when none is final.
 */
const getFinalTranscriptIds = (transcripts) => {
  const ids = transcripts.reduce((acc, transcript) => {
    acc.all.push(transcript.id);
    if (transcript.isFinal) {
      acc.final.push(transcript.id);
    }
    return acc;
  }, { all: [], final: [] });
  return ids.final.length > 0 ? ids.final : ids.all;
};
|
|
1241
|
+
/**
 * Runs the route's optional `correctTurn` hook over the turn text.
 *
 * The hook may return a string, an object with `text` plus optional
 * `metadata` / `provider` / `reason`, or nothing. A `null` result is treated
 * like "no change" instead of throwing when the diagnostics are read.
 *
 * @param input `{ text, transcripts, session, fallbackDiagnostics, ... }`.
 * @returns `undefined` when no hook is configured; otherwise
 *   `{ diagnostics, text }` where `text` falls back to the original when the
 *   corrected text normalizes to an empty string.
 */
const runTurnCorrection = async (input) => {
  if (!options.route.correctTurn) {
    return;
  }
  const originalText = input.text;
  const result = await options.route.correctTurn({
    api,
    context: options.context,
    fallback: input.fallbackDiagnostics,
    phraseHints,
    session: input.session,
    text: originalText,
    transcripts: input.transcripts.map(cloneTranscript)
  });
  // `typeof null` is "object", so null has to be excluded explicitly.
  const details = result !== null && typeof result === "object" ? result : undefined;
  let nextText = originalText;
  if (typeof result === "string") {
    nextText = result;
  } else if (typeof result?.text === "string") {
    nextText = result.text;
  }
  const correctedText = normalizeCorrectionText(nextText);
  const normalizedOriginal = normalizeCorrectionText(originalText);
  const hasCorrection = correctedText.length > 0;
  return {
    diagnostics: {
      attempted: true,
      changed: hasCorrection && correctedText !== normalizedOriginal,
      correctedText: hasCorrection ? correctedText : normalizedOriginal,
      metadata: details?.metadata,
      originalText,
      provider: details?.provider,
      reason: details?.reason
    },
    text: hasCorrection ? correctedText : originalText
  };
};
|
|
1271
|
+
/**
 * Makes sure the session carries a `lastCommittedTurn` record, installing an
 * empty one when it is missing.
 *
 * @param session Session record, mutated in place.
 * @returns The same session object.
 */
const ensureCommittedTurnGuard = (session) => {
  session.lastCommittedTurn ||= {
    committedAt: 0,
    signature: "",
    text: "",
    transcriptIds: []
  };
  return session;
};
|
|
1282
|
+
/**
 * Builds the signature string used to recognise repeated commits:
 * normalized text, a `|`, then the comma-joined transcript ids.
 *
 * @param session Session whose current-turn transcripts supply the ids.
 * @param finalText Text about to be committed.
 * @param transcriptIdsOverride Optional ids used instead of the session's.
 */
const buildTurnSignature = (session, finalText, transcriptIdsOverride) => {
  const ids = transcriptIdsOverride ?? getFinalTranscriptIds(session.currentTurn.transcripts);
  const normalized = normalizeText2(finalText);
  const joinedIds = ids.join(",");
  return `${normalized}|${joinedIds}`;
};
|
|
1286
|
+
/**
 * Decides whether committing `finalText` would repeat the last commit.
 *
 * Only commits inside the duplicate window are considered. Within it, a
 * commit is a duplicate when the text matches and no audio arrived since the
 * last commit, or when the signature matches and every final transcript of
 * the current turn was already part of that commit.
 *
 * @param session Session record with `currentTurn` and `lastCommittedTurn`.
 * @param finalText Text about to be committed.
 * @returns `true` when the commit should be dropped.
 */
const isDuplicateTurnCommit = (session, finalText) => {
  const signature = buildTurnSignature(session, finalText);
  const previous = session.lastCommittedTurn;
  const committedAt = previous?.committedAt ?? 0;
  const withinWindow = Boolean(previous) && committedAt > 0 && Date.now() - committedAt < DEFAULT_DUPLICATE_TURN_WINDOW_MS;
  if (!withinWindow) {
    return false;
  }
  const sameText = normalizeText2(finalText) === normalizeText2(previous?.text ?? "");
  const noNewAudio = (session.currentTurn.lastAudioAt ?? 0) <= committedAt;
  if (sameText && noNewAudio) {
    return true;
  }
  if (signature !== (previous?.signature ?? "")) {
    return false;
  }
  const committedIds = new Set(previous?.transcriptIds ?? []);
  return session.currentTurn.transcripts.every((transcript) => !transcript.isFinal || committedIds.has(transcript.id));
};
|
|
1308
|
+
/**
 * Records the commit that just happened on `session.lastCommittedTurn` so a
 * later duplicate check can compare against it.
 *
 * @param session Session record, mutated in place.
 * @param finalText Committed text.
 * @param committedTranscripts Transcripts that made up the committed turn.
 */
const markTurnCommitted = (session, finalText, committedTranscripts) => {
  const previous = session.lastCommittedTurn ?? {};
  const committedAt = Date.now();
  const transcriptIds = getFinalTranscriptIds(committedTranscripts);
  const signature = buildTurnSignature(session, finalText, transcriptIds);
  session.lastCommittedTurn = {
    ...previous,
    committedAt,
    signature,
    text: normalizeText2(finalText),
    transcriptIds
  };
};
|
|
1317
|
+
/**
 * Applies a partial transcript to the current turn (timing, preferred partial
 * text, activity timestamps) and forwards it to the client.
 *
 * @param transcript Partial transcript from the STT adapter.
 */
const handlePartial = async (transcript) => {
  await writeSession((session) => {
    const turn = session.currentTurn;
    // Keep the previous timing when the adapter did not report one.
    const startedAt = transcript.startedAtMs ?? turn.partialStartedAt;
    const endedAt = transcript.endedAtMs ?? turn.partialEndedAt;
    const preferred = selectPreferredTranscriptText(turn.partialText, transcript.text);
    turn.lastTranscriptAt = Date.now();
    turn.partialStartedAt = startedAt;
    turn.partialEndedAt = endedAt;
    turn.partialText = buildTurnText(turn.transcripts, preferred, {
      partialEndedAtMs: endedAt,
      partialStartedAtMs: startedAt
    });
    session.lastActivityAt = Date.now();
    session.status = "active";
  });
  await send({ transcript, type: "partial" });
};
|
|
1337
|
+
/**
 * Appends a final transcript to the turn and session (skipping ids already
 * present), recomputes the turn's final text and forwards the transcript to
 * the client.
 *
 * @param transcript Final transcript from the STT adapter.
 */
const handleFinal = async (transcript) => {
  await writeSession((session) => {
    const turn = session.currentTurn;
    const isKnown = turn.transcripts.some(({ id }) => id === transcript.id);
    if (!isKnown) {
      turn.transcripts = [...turn.transcripts, cloneTranscript(transcript)];
      session.transcripts = [...session.transcripts, cloneTranscript(transcript)];
    }
    turn.finalText = buildTurnText(turn.transcripts, turn.partialText, {
      partialEndedAtMs: turn.partialEndedAt,
      partialStartedAtMs: turn.partialStartedAt
    });
    turn.lastTranscriptAt = Date.now();
    session.lastActivityAt = Date.now();
    session.status = "active";
  });
  await send({ transcript, type: "final" });
};
|
|
1363
|
+
/**
 * Re-arms the silence commit for a session that already holds uncommitted
 * turn text. The delay is whatever remains of the silence and
 * transcript-stability windows, never below zero.
 *
 * @param session Session record being resumed.
 */
const resumePendingTurnCommit = (session) => {
  const turn = session.currentTurn;
  const pendingText = buildTurnText(turn.transcripts, turn.partialText, {
    partialEndedAtMs: turn.partialEndedAt,
    partialStartedAtMs: turn.partialStartedAt
  });
  if (!pendingText) {
    speechDetected = false;
    return;
  }
  speechDetected = true;
  let audioAge = 0;
  if (turn.silenceStartedAt !== undefined) {
    audioAge = Date.now() - turn.silenceStartedAt;
  } else if (turn.lastSpeechAt !== undefined) {
    audioAge = Date.now() - turn.lastSpeechAt;
  }
  // Without a transcript timestamp the stability window counts as elapsed.
  const transcriptAge = turn.lastTranscriptAt === undefined ? turnDetection.transcriptStabilityMs : Date.now() - turn.lastTranscriptAt;
  const remainingSilence = turnDetection.silenceMs - audioAge;
  const remainingStability = turnDetection.transcriptStabilityMs - transcriptAge;
  scheduleSilenceCommit(Math.max(0, remainingSilence, remainingStability));
};
|
|
1378
|
+
/**
 * Returns the active STT session, opening and wiring a new one when none is
 * active.
 *
 * Adapter events are funnelled through `runSerial` and ignored when they come
 * from a session that is no longer the active generation. A failing event
 * handler is logged here, because nothing else awaits these promises.
 *
 * @returns The open STT session.
 */
const ensureAdapter = async () => {
  if (sttSession) {
    return sttSession;
  }
  const openedSession = await options.stt.open({
    format: DEFAULT_FORMAT,
    phraseHints,
    sessionId: options.id
  });
  const generation = ++adapterGenerationCounter;
  sttSession = openedSession;
  activeAdapterGeneration = generation;
  const runAdapterEvent = (phase, handler) => {
    runSerial(phase, async () => {
      // Drop events from a superseded adapter session.
      if (activeAdapterGeneration !== generation) {
        return;
      }
      await handler();
    }).catch((error) => {
      logger.warn("voice stt event handler failed", {
        error: toError(error).message,
        phase,
        sessionId: options.id
      });
    });
  };
  openedSession.on("partial", ({ transcript }) => {
    runAdapterEvent("adapter.partial", () => handlePartial(transcript));
  });
  openedSession.on("final", ({ transcript }) => {
    runAdapterEvent("adapter.final", () => handleFinal(transcript));
  });
  openedSession.on("endOfTurn", ({ reason }) => {
    runAdapterEvent("adapter.endOfTurn", async () => {
      clearSilenceTimer();
      await requestTurnCommit(reason);
    });
  });
  openedSession.on("error", (event) => {
    runAdapterEvent("adapter.error", () => handleError(event));
  });
  openedSession.on("close", (event) => {
    runAdapterEvent("adapter.close", () => handleClose(event));
  });
  return openedSession;
};
|
|
1418
|
+
/**
 * Hands a committed turn to the route's `onTurn` hook and applies its output:
 * stores and sends assistant text, stores a result, and completes the session
 * when the hook asks for it.
 *
 * @param session Session record as stored by the commit.
 * @param turn The committed turn.
 */
const completeTurn = async (session, turn) => {
  const output = await options.route.onTurn({
    api,
    context: options.context,
    session,
    turn
  });
  const persistTurnResult = (patch) => writeSession((record) => {
    setTurnResult(record, turn.id, patch);
  });
  if (output?.assistantText) {
    await persistTurnResult({ assistantText: output.assistantText });
    await send({ text: output.assistantText, turnId: turn.id, type: "assistant" });
  }
  if (output?.result !== undefined) {
    await persistTurnResult({ result: output.result });
  }
  if (output?.complete) {
    await completeInternal(output.result);
  }
};
|
|
1448
|
+
/**
 * Commits the current turn: picks the final text (primary, fallback,
 * corrected), drops duplicates, stores the turn, notifies the client and runs
 * the route's turn handler.
 *
 * The transcript-stability gate runs before fallback transcription and
 * correction. A commit that has to be deferred therefore neither uses up a
 * fallback attempt nor calls the correction hook for a result that would be
 * thrown away.
 *
 * @param reason Commit reason; "manual" bypasses the stability gate.
 */
const commitTurnInternal = async (reason = "manual") => {
  clearSilenceTimer();
  const session = await readSession();
  if (session.status === "completed" || session.status === "failed") {
    return;
  }
  const { currentTurn } = session;
  const transcriptStabilityAge = currentTurn.lastTranscriptAt !== undefined ? Date.now() - currentTurn.lastTranscriptAt : undefined;
  if (reason !== "manual" && typeof transcriptStabilityAge === "number" && transcriptStabilityAge < turnDetection.transcriptStabilityMs) {
    scheduleTurnCommit(turnDetection.transcriptStabilityMs - transcriptStabilityAge, reason, false);
    return;
  }
  const text = buildTurnText(currentTurn.transcripts, currentTurn.partialText, {
    partialEndedAtMs: currentTurn.partialEndedAt,
    partialStartedAtMs: currentTurn.partialStartedAt
  });
  let transcripts = currentTurn.transcripts.length ? currentTurn.transcripts.map(cloneTranscript) : [];
  let finalText = text;
  const fallbackSelection = await runFallbackTranscription(text, currentTurn.transcripts);
  const source = fallbackSelection?.source ?? "primary";
  const fallbackUsed = fallbackSelection?.fallbackUsed ?? false;
  const fallbackDiagnostics = fallbackSelection?.diagnostics;
  if (fallbackSelection) {
    finalText = fallbackSelection.text;
    if (fallbackSelection.transcripts.length) {
      transcripts = fallbackSelection.transcripts.map(cloneTranscript);
    } else if (!transcripts.length) {
      transcripts = [{ id: createId(), isFinal: false, text: finalText }];
    }
    if (fallbackSelection.fallbackUsed) {
      logger.info("voice fallback turn selected", {
        reason,
        sessionId: options.id,
        text: finalText
      });
    }
  }
  const correctionSelection = await runTurnCorrection({
    fallbackDiagnostics,
    fallbackUsed,
    session,
    source,
    text: finalText,
    transcripts
  });
  const correctionDiagnostics = correctionSelection?.diagnostics;
  if (correctionSelection) {
    finalText = correctionSelection.text;
  }
  if (!finalText) {
    return;
  }
  if (isDuplicateTurnCommit(session, finalText)) {
    logger.debug("voice turn commit deduped", {
      reason,
      sessionId: options.id
    });
    return;
  }
  const turn = {
    committedAt: Date.now(),
    id: createId(),
    text: finalText,
    quality: createTurnQuality(transcripts, source, fallbackUsed, fallbackDiagnostics, correctionDiagnostics),
    // A turn always carries at least one transcript entry.
    transcripts: transcripts.length > 0 ? transcripts : [{ id: createId(), isFinal: false, text: finalText }]
  };
  const updatedSession = await writeSession((currentSession) => {
    currentSession.committedTurnIds = [...currentSession.committedTurnIds, turn.id];
    currentSession.currentTurn = createEmptyCurrentTurn();
    currentSession.lastActivityAt = Date.now();
    currentSession.status = "active";
    currentSession.turns = [...currentSession.turns, turn];
    markTurnCommitted(currentSession, finalText, transcripts);
  });
  speechDetected = false;
  rewindFallbackTurnAudio();
  logger.info("voice turn committed", {
    reason,
    sessionId: options.id,
    turnId: turn.id
  });
  await send({ turn, type: "turn" });
  if (options.sttLifecycle === "turn-scoped") {
    await closeAdapter("turn-commit");
  }
  await completeTurn(updatedSession, turn);
};
|
|
1548
|
+
/**
 * Attaches a socket to the session and restores or (re)creates its record.
 *
 * The scenario change is detected before the stored record is touched.
 * Because `session` starts out as the same object as `existingSession`,
 * overwriting `scenarioId` first would make the comparison always fail and
 * the reset could never happen.
 *
 * Flow: apply the scenario reset or update, then the reconnect policy for a
 * "reconnecting" record (fail, restart, or resume). The record is then stored
 * and announced, `onSession` fires for new or reset sessions, and the STT
 * adapter is opened unless the session is already completed.
 *
 * @param nextSocket Socket that will receive this session's messages.
 */
const connectInternal = async (nextSocket) => {
  socket = nextSocket;
  const existingSession = await options.store.get(options.id);
  const scenarioChanged = Boolean(existingSession?.scenarioId && options.scenarioId && existingSession.scenarioId !== options.scenarioId);
  let session = existingSession ?? createVoiceSessionRecord(options.id, options.scenarioId);
  let shouldFireOnSession = !existingSession;
  if (scenarioChanged) {
    session = resetVoiceSessionRecord(options.id, existingSession, options.scenarioId);
    shouldFireOnSession = true;
  } else if (options.scenarioId && session.scenarioId !== options.scenarioId) {
    session.scenarioId = options.scenarioId;
  }
  ensureCommittedTurnGuard(session);
  rewindFallbackTurnAudio();
  if (existingSession?.status === "reconnecting") {
    const nextAttempts = existingSession.reconnect.attempts + 1;
    const reconnectExpired = existingSession.reconnect.lastDisconnectAt !== undefined && Date.now() - existingSession.reconnect.lastDisconnectAt > reconnect.timeout;
    const tooManyAttempts = nextAttempts > reconnect.maxAttempts;
    const policyExhausted = reconnectExpired || tooManyAttempts;
    if (reconnect.strategy === "fail" && policyExhausted) {
      await failInternal(new Error("Voice session reconnect policy exhausted"));
      return;
    }
    if (reconnect.strategy === "restart" && policyExhausted) {
      session = resetVoiceSessionRecord(options.id, existingSession, options.scenarioId);
      shouldFireOnSession = true;
    } else if (!scenarioChanged) {
      // Resume the previous record; a scenario reset above takes precedence.
      session = {
        ...existingSession,
        reconnect: {
          ...existingSession.reconnect,
          attempts: nextAttempts
        },
        status: "active"
      };
    }
  }
  await options.store.set(options.id, session);
  await send({
    sessionId: options.id,
    status: session.status,
    scenarioId: session.scenarioId,
    type: "session"
  });
  if (shouldFireOnSession) {
    await options.route.onSession?.({
      api,
      context: options.context,
      session
    });
  }
  if (session.status === "completed") {
    await send({
      sessionId: options.id,
      type: "complete"
    });
    return;
  }
  resumePendingTurnCommit(session);
  await ensureAdapter();
};
|
|
1608
|
+
/**
 * Handles the socket going away: stops the timer, closes the adapter and
 * clears per-turn audio. With the "fail" reconnect strategy the session is
 * failed; otherwise a non-terminal session is marked "reconnecting".
 *
 * @param event Optional disconnect event; its `reason` is used when present.
 */
const disconnectInternal = async (event) => {
  const reason = event?.reason;
  clearSilenceTimer();
  await closeAdapter(reason);
  rewindFallbackTurnAudio();
  if (reconnect.strategy === "fail") {
    await failInternal(new Error(reason ?? "Voice socket disconnected"));
    return;
  }
  await writeSession((session) => {
    const isTerminal = session.status === "completed" || session.status === "failed";
    if (isTerminal) {
      return;
    }
    session.lastActivityAt = Date.now();
    session.reconnect.lastDisconnectAt = Date.now();
    session.status = "reconnecting";
  });
  speechDetected = false;
};
|
|
1626
|
+
/**
 * Processes one incoming audio chunk: conditions it, updates speech/silence
 * bookkeeping on the session, buffers it for fallback replay while speech is
 * in progress, arms or clears the silence timer, and forwards it to the STT
 * adapter. Chunks for completed or failed sessions are ignored.
 *
 * @param audio Raw audio chunk from the client.
 */
const receiveAudioInternal = async (audio) => {
  const { status } = await readSession();
  if (status === "completed" || status === "failed") {
    return;
  }
  const adapter = await ensureAdapter();
  const conditionedAudio = conditionAudioChunk(audio, options.audioConditioning);
  const audioLevel = measureAudioLevel(conditionedAudio);
  const isSpeech = audioLevel >= turnDetection.speechThreshold;
  const shouldStoreAudio = speechDetected || isSpeech;
  await writeSession((record) => {
    const turn = record.currentTurn;
    turn.lastAudioAt = Date.now();
    record.lastActivityAt = Date.now();
    record.status = "active";
    if (isSpeech) {
      turn.lastSpeechAt = Date.now();
      turn.silenceStartedAt = undefined;
    } else if (speechDetected && turn.silenceStartedAt === undefined) {
      // First quiet chunk after speech marks the start of the silence.
      turn.silenceStartedAt = Date.now();
    }
  });
  if (shouldStoreAudio) {
    pushTurnAudio(conditionedAudio);
  }
  if (isSpeech) {
    speechDetected = true;
    clearSilenceTimer();
  } else if (speechDetected) {
    const { currentTurn } = await readSession();
    const pendingText = buildTurnText(currentTurn.transcripts, currentTurn.partialText, {
      partialEndedAtMs: currentTurn.partialEndedAt,
      partialStartedAtMs: currentTurn.partialStartedAt
    });
    if (pendingText) {
      // Do not push back a timer that is already counting down.
      scheduleSilenceCommit(turnDetection.silenceMs, false);
    }
  }
  await adapter.send(conditionedAudio);
};
|
|
1664
|
+
// Public session handle. Every operation is funnelled through `runSerial`
// under a label naming the operation, and delegates to the matching
// `*Internal` implementation defined in the enclosing scope.
const api = {
  id: options.id,

  // Stops the silence timer, closes the adapter, then closes the socket with
  // code 1000 and the supplied reason.
  async close(reason) {
    await runSerial("api.close", async () => {
      clearSilenceTimer();
      await closeAdapter(reason);
      await Promise.resolve(socket.close(1000, reason));
    });
  },

  // Commits the current turn; the reason defaults to "manual".
  async commitTurn(reason = "manual") {
    return runSerial("api.commitTurn", async () => {
      await commitTurnInternal(reason);
    });
  },

  async complete(result) {
    return runSerial("api.complete", async () => {
      await completeInternal(result);
    });
  },

  async connect(nextSocket) {
    return runSerial("api.connect", async () => {
      await connectInternal(nextSocket);
    });
  },

  async disconnect(event) {
    return runSerial("api.disconnect", async () => {
      await disconnectInternal(event);
    });
  },

  async fail(error) {
    return runSerial("api.fail", async () => {
      await failInternal(error);
    });
  },

  async receiveAudio(audio) {
    return runSerial("api.receiveAudio", async () => {
      await receiveAudioInternal(audio);
    });
  },

  // Returns whatever `readSession()` yields, read through the same serial path.
  async snapshot() {
    return runSerial("api.snapshot", async () => readSession());
  }
};
|
|
1693
|
+
return api;
|
|
1694
|
+
};
|
|
1695
|
+
|
|
1696
|
+
// src/turnProfiles.ts
|
|
1697
|
+
// Baseline turn-detection values for each named turn profile.
var TURN_PROFILE_DEFAULTS = {
  balanced: {
    qualityProfile: "general",
    silenceMs: 1400,
    speechThreshold: 0.012,
    transcriptStabilityMs: 1000
  },
  fast: {
    qualityProfile: "general",
    silenceMs: 700,
    speechThreshold: 0.015,
    transcriptStabilityMs: 450
  },
  "long-form": {
    qualityProfile: "general",
    silenceMs: 2200,
    speechThreshold: 0.01,
    transcriptStabilityMs: 1500
  }
};
// Per-quality-profile overrides; a value present here wins over the turn
// profile's baseline. "general" overrides nothing.
var QUALITY_PROFILE_DEFAULTS = {
  general: {},
  "accent-heavy": {
    silenceMs: 1200,
    speechThreshold: 0.01,
    transcriptStabilityMs: 1200
  },
  "noisy-room": {
    silenceMs: 2000,
    speechThreshold: 0.02,
    transcriptStabilityMs: 1600
  },
  "short-command": {
    silenceMs: 500,
    speechThreshold: 0.016,
    transcriptStabilityMs: 420
  }
};
var DEFAULT_TURN_PROFILE = "fast";
var DEFAULT_QUALITY_PROFILE = "general";
/**
 * Resolves a partial turn-detection config into concrete values.
 *
 * Each of `silenceMs`, `speechThreshold` and `transcriptStabilityMs` is taken
 * from, in order of precedence: the explicit value in `config`, the quality
 * profile override, the turn profile baseline.
 *
 * A `profile` or `qualityProfile` that is not an own key of the tables above
 * (including inherited names such as "constructor") falls back to the default
 * profile instead of failing with a TypeError on an undefined lookup. The
 * returned `profile`/`qualityProfile` always name the table entries that were
 * actually used.
 *
 * @param config Optional partial config; may be undefined or null.
 * @returns Object with `profile`, `qualityProfile`, `silenceMs`,
 *   `speechThreshold` and `transcriptStabilityMs`.
 */
var resolveTurnDetectionConfig = (config) => {
  const pickKnownKey = (table, requested, fallback) =>
    Object.prototype.hasOwnProperty.call(table, requested) ? requested : fallback;

  const profile = pickKnownKey(
    TURN_PROFILE_DEFAULTS,
    config?.profile ?? DEFAULT_TURN_PROFILE,
    DEFAULT_TURN_PROFILE
  );
  const qualityProfile = pickKnownKey(
    QUALITY_PROFILE_DEFAULTS,
    config?.qualityProfile ?? DEFAULT_QUALITY_PROFILE,
    DEFAULT_QUALITY_PROFILE
  );
  const preset = TURN_PROFILE_DEFAULTS[profile];
  const quality = QUALITY_PROFILE_DEFAULTS[qualityProfile];

  return {
    profile,
    qualityProfile,
    silenceMs: config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs,
    speechThreshold: config?.speechThreshold ?? quality.speechThreshold ?? preset.speechThreshold,
    transcriptStabilityMs: config?.transcriptStabilityMs ?? quality.transcriptStabilityMs ?? preset.transcriptStabilityMs
  };
};
|
|
1750
|
+
|
|
1751
|
+
// src/testing/resilience.ts
|
|
1752
|
+
/**
 * Rounds `value` to a fixed number of decimal places.
 *
 * @param value Number to round.
 * @param digits Decimal places to keep (defaults to 4).
 * @returns `value` scaled by 10^digits, rounded with Math.round, scaled back.
 */
var roundMetric2 = (value, digits = 4) => {
  const scale = Math.pow(10, digits);
  const scaledAndRounded = Math.round(value * scale);
  return scaledAndRounded / scale;
};
|
|
1756
|
+
/**
 * Builds a socket stand-in whose `close` and `send` are async no-ops.
 *
 * @returns A fresh object exposing `close()` and `send()`.
 */
var createMockSocket = () => {
  const socket = {
    async close() {},
    async send() {}
  };
  return socket;
};
|
|
1760
|
+
/**
 * Creates a 160-sample Int16Array in which every sample equals `sample`.
 *
 * @param sample Value written to each of the 160 slots.
 * @returns The filled Int16Array.
 */
var createSpeechChunk = (sample) => {
  const chunk = new Int16Array(160);
  chunk.fill(sample);
  return chunk;
};
|
|
1761
|
+
/**
 * Builds an in-memory STT adapter double.
 *
 * The returned `session` keeps one handler list per event name ("close",
 * "endOfTurn", "error", "final", "partial"). `on` registers a handler and
 * returns an unsubscribe function; `emit` awaits each registered handler in
 * registration order; `close` and `send` are async no-ops.
 *
 * @returns `{ adapter, session }` where `adapter.open()` hands back `session`.
 */
var createFakeAdapter = () => {
  const handlersByEvent = {
    close: [],
    endOfTurn: [],
    error: [],
    final: [],
    partial: []
  };

  const emit = async (event, payload) => {
    // Iterates the live list, so handlers added or removed while an emit is
    // in flight affect that emit.
    for (const handler of handlersByEvent[event]) {
      await handler(payload);
    }
  };

  const on = (event, handler) => {
    const handlers = handlersByEvent[event];
    handlers.push(handler);
    return () => {
      const position = handlers.indexOf(handler);
      if (position < 0) {
        return;
      }
      handlers.splice(position, 1);
    };
  };

  const session = {
    close: async () => {},
    emit,
    on,
    send: async (_audio) => {}
  };

  const adapter = {
    kind: "stt",
    open: () => session
  };

  return { adapter, session };
};
|
|
1795
|
+
/**
 * Runs one resilience scenario against a fresh voice session wired to an
 * in-memory store, a fake STT adapter and mock sockets.
 *
 * The scenario callback receives helpers to emit transcripts and end-of-turn
 * events, to disconnect and reconnect, plus the live `turns` array that
 * collects the text of every turn reported through `route.onTurn`. The
 * session is always closed afterwards, even when the callback throws (the
 * error then propagates to the caller).
 *
 * @param id Session id; also the prefix of generated transcript ids.
 * @param title Human-readable scenario title, echoed in the result.
 * @param run Async scenario body.
 * @returns `{ actualTurns, id, passes, replayedTurns, title }`; `passes` is
 *   true when no turn text repeats (compared case-insensitively).
 */
var runScenario = async (id, title, run) => {
  const store = createVoiceMemoryStore();
  const adapter = createFakeAdapter();
  const turns = [];

  const voice = createVoiceSession({
    context: {},
    id,
    logger: {},
    reconnect: {
      maxAttempts: 2,
      strategy: "resume-last-turn",
      timeout: 5000
    },
    route: {
      onComplete: async () => {},
      onTurn: async ({ turn }) => {
        turns.push(turn.text);
      }
    },
    socket: createMockSocket(),
    store,
    stt: adapter.adapter,
    sttLifecycle: "continuous",
    // Very short timings so scenarios settle within tens of milliseconds.
    turnDetection: resolveTurnDetectionConfig({
      silenceMs: 20,
      speechThreshold: 0.01,
      transcriptStabilityMs: 5
    })
  });

  // Default transcript id derives from how many turns have been seen so far.
  const nextTranscriptId = () => `${id}-${turns.length}`;

  const emitFinal = async (text, transcriptId = nextTranscriptId()) => {
    await adapter.session.emit("final", {
      receivedAt: Date.now(),
      transcript: {
        id: transcriptId,
        isFinal: true,
        text
      },
      type: "final"
    });
  };

  // Emits a final transcript, then feeds one loud and one silent chunk and
  // waits 60 ms.
  const commit = async (text, transcriptId = nextTranscriptId()) => {
    await emitFinal(text, transcriptId);
    await voice.receiveAudio(createSpeechChunk(16000));
    await voice.receiveAudio(createSpeechChunk(0));
    await Bun.sleep(60);
  };

  const connectNewSocket = async () => {
    await voice.connect(createMockSocket());
  };

  const disconnect = async () => {
    await voice.disconnect({
      recoverable: true,
      type: "close"
    });
  };

  const emitEndOfTurn = async () => {
    await adapter.session.emit("endOfTurn", {
      reason: "vendor",
      receivedAt: Date.now(),
      type: "endOfTurn"
    });
  };

  await voice.connect(createMockSocket());
  try {
    await run({
      adapter,
      commit,
      connectNewSocket,
      disconnect,
      emitEndOfTurn,
      emitFinal,
      turns
    });
  } finally {
    await voice.close("resilience-complete");
  }

  const distinctTurnCount = new Set(turns.map((turn) => turn.toLowerCase())).size;
  const replayedTurns = turns.length - distinctTurnCount;
  return {
    actualTurns: turns,
    id,
    passes: replayedTurns === 0,
    replayedTurns,
    title
  };
};
|
|
1884
|
+
/**
 * Runs the built-in resilience scenarios concurrently and summarises them.
 *
 * Each scenario is executed through `runScenario`. A scenario that throws
 * (for example because it observed an unexpected number of turns) rejects the
 * whole benchmark, since the scenarios are awaited with `Promise.all`.
 *
 * @returns `{ generatedAt, scenarios, summary }` where `summary` carries
 *   `duplicateTurnRate`, `passCount`, `passRate`, `replayFailureRate` and
 *   `scenarioCount`.
 */
var runVoiceResilienceBenchmark = async () => {
  // Waits each delay in turn, emitting one end-of-turn event after each wait.
  const emitEndOfTurnAfterEach = async (emitEndOfTurn, delaysMs) => {
    for (const delayMs of delaysMs) {
      await Bun.sleep(delayMs);
      await emitEndOfTurn();
    }
  };

  // Throws with `failureMessage` unless exactly one turn was recorded.
  const assertSingleTurn = (turns, failureMessage) => {
    if (turns.length !== 1) {
      throw new Error(failureMessage);
    }
  };

  const scenarioDefinitions = [
    {
      id: "resume-no-replay",
      title: "Reconnect after first turn does not replay committed text",
      run: async ({ commit, connectNewSocket, disconnect }) => {
        await commit("Reconnect should not duplicate prior turns");
        await disconnect();
        await connectNewSocket();
        await commit("A second turn should still commit after resume");
      }
    },
    {
      id: "duplicate-final-id",
      title: "Duplicate transcript ids do not create replayed turns",
      run: async ({ commit, connectNewSocket, disconnect, emitFinal, turns }) => {
        await commit("Duplicate final ids should still produce one turn", "same-id");
        await disconnect();
        await connectNewSocket();
        // Same id and text again after the reconnect.
        await emitFinal("Duplicate final ids should still produce one turn", "same-id");
        if (turns.length === 1) {
          await commit("Fresh transcripts should still commit later");
        }
      }
    },
    {
      id: "duplicate-end-of-turn",
      title: "Repeated end-of-turn events for the same turn stay deduped",
      run: async ({ emitFinal, emitEndOfTurn, turns }) => {
        await emitFinal("Repeated end-of-turn should only commit once", "dup-endofturn");
        await emitEndOfTurn();
        await emitEndOfTurn();
        await Bun.sleep(80);
        assertSingleTurn(turns, "Repeated end-of-turn events created duplicate turns");
      }
    },
    {
      id: "duplicate-end-of-turn-jitter",
      title: "End-of-turn jitter does not trigger extra commits",
      run: async ({ emitFinal, emitEndOfTurn, turns }) => {
        await emitFinal("Noisy end-of-turn signals should still commit once", "dup-endofturn-jitter");
        await emitEndOfTurnAfterEach(emitEndOfTurn, [40, 95, 180, 120]);
        await Bun.sleep(80);
        assertSingleTurn(turns, "Jittered end-of-turn signals created duplicate turns");
      }
    },
    {
      id: "reconnect-duplicate-text-no-new-audio",
      title: "Reconnect duplicate text with different ids and no audio does not replay turn",
      run: async ({ connectNewSocket, disconnect, emitEndOfTurn, emitFinal, turns }) => {
        await emitFinal("Reconnect duplicate text should be suppressed", "dup-text-reconnect-1");
        await emitEndOfTurn();
        await Bun.sleep(60);
        await disconnect();
        await connectNewSocket();
        // Same text under a different transcript id after the reconnect.
        await emitFinal("Reconnect duplicate text should be suppressed", "dup-text-reconnect-2");
        await emitEndOfTurnAfterEach(emitEndOfTurn, [40, 70, 110]);
        await Bun.sleep(60);
        assertSingleTurn(turns, "Reconnect duplicate text was committed twice");
      }
    },
    {
      id: "reconnect-end-of-turn-jitter",
      title: "End-of-turn jitter after reconnect does not replay committed turns",
      run: async ({ connectNewSocket, disconnect, emitEndOfTurn, emitFinal, turns }) => {
        await emitFinal("Reconnect duplicate end-of-turn should dedupe", "resume-jitter");
        await emitEndOfTurn();
        await Bun.sleep(60);
        await disconnect();
        await connectNewSocket();
        await emitFinal("Reconnect duplicate end-of-turn should dedupe", "resume-jitter");
        await emitEndOfTurnAfterEach(emitEndOfTurn, [50, 80, 120, 180]);
        await Bun.sleep(80);
        assertSingleTurn(turns, "Reconnected jittered end-of-turn signals replayed a committed turn");
      }
    }
  ];

  // Every scenario is started before any of them is awaited.
  const scenarios = await Promise.all(
    scenarioDefinitions.map(({ id, title, run }) => runScenario(id, title, run))
  );

  const scenarioCount = scenarios.length;
  const passCount = scenarios.filter((scenario) => scenario.passes).length;
  const replayFailures = scenarios.filter((scenario) => scenario.replayedTurns > 0).length;
  const shareOfScenarios = (count) => roundMetric2(scenarioCount > 0 ? count / scenarioCount : 0);

  return {
    generatedAt: Date.now(),
    scenarios,
    summary: {
      // Both rates below are the share of scenarios with at least one replayed turn.
      duplicateTurnRate: shareOfScenarios(replayFailures),
      passCount,
      passRate: shareOfScenarios(passCount),
      replayFailureRate: shareOfScenarios(replayFailures),
      scenarioCount
    }
  };
};
|
|
2006
|
+
// src/testing/sessionBenchmark.ts
|
|
2007
|
+
/**
 * Arithmetic mean of `values`.
 *
 * @param values Array of numbers.
 * @returns The mean, or 0 when the array has no elements.
 */
var average2 = (values) => {
  if (!(values.length > 0)) {
    return 0;
  }
  const total = values.reduce((runningSum, value) => runningSum + value, 0);
  return total / values.length;
};
|
|
2008
|
+
/**
 * Canonicalises turn text for comparison: lower-cases it, replaces every
 * character that is not a Unicode letter, a Unicode number, whitespace or an
 * apostrophe with a space, collapses whitespace runs to one space and trims.
 *
 * @param value Turn text.
 * @returns The normalised text.
 */
var normalizeTurnText = (value) => {
  const lowered = value.toLowerCase();
  const withoutPunctuation = lowered.replace(/[^\p{L}\p{N}\s']/gu, " ");
  const singleSpaced = withoutPunctuation.replace(/\s+/g, " ");
  return singleSpaced.trim();
};
|
|
2009
|
+
/**
 * Rounds a metric to `digits` decimal places (4 unless specified).
 *
 * @param value Number to round.
 * @param digits Decimal places to keep.
 * @returns The rounded number.
 */
var roundMetric3 = (value, digits = 4) => {
  const scale = Math.pow(10, digits);
  return Math.round(scale * value) / scale;
};
|
|
2013
|
+
/**
 * Fills in benchmark defaults for an STT fallback config.
 *
 * @param config Partial fallback config; any falsy value means "no fallback".
 * @returns undefined when `config` is falsy; otherwise a new object holding
 *   `adapter` as given plus every tunable, using the default below whenever
 *   the provided value is null or undefined.
 */
var resolveBenchmarkFallbackConfig = (config) => {
  if (!config) {
    return;
  }
  const fallbackDefaults = {
    completionTimeoutMs: 2500,
    confidenceThreshold: 0.6,
    maxAttemptsPerTurn: 1,
    minTextLength: 2,
    replayWindowMs: 8000,
    settleMs: 220,
    trigger: "empty-or-low-confidence"
  };
  const resolved = { adapter: config.adapter };
  for (const [key, defaultValue] of Object.entries(fallbackDefaults)) {
    resolved[key] = config[key] ?? defaultValue;
  }
  return resolved;
};
|
|
2028
|
+
/**
 * Splits `audio` into consecutive slices of at most `bytesPerChunk`.
 *
 * Offsets run from 0 up to `audio.byteLength` and each slice is produced with
 * `audio.slice(offset, offset + bytesPerChunk)`; the last slice may be shorter.
 * NOTE(review): offsets are byte counts while `slice` on a typed array counts
 * elements, so this presumably expects an ArrayBuffer or a Uint8Array -
 * confirm against callers.
 *
 * @param audio Buffer exposing `byteLength` and `slice(start, end)`.
 * @param bytesPerChunk Maximum size of each slice; must be greater than 0.
 * @returns Array of slices in order; empty when `audio.byteLength` is 0.
 * @throws {RangeError} When `bytesPerChunk` is zero, negative or NaN. A zero
 *   or negative step would otherwise never advance past the end of the
 *   buffer, and NaN would silently yield a single empty slice.
 */
var chunkAudio2 = (audio, bytesPerChunk) => {
  if (!(bytesPerChunk > 0)) {
    throw new RangeError(`bytesPerChunk must be a positive number, received ${bytesPerChunk}`);
  }
  const chunks = [];
  for (let offset = 0; offset < audio.byteLength; offset += bytesPerChunk) {
    chunks.push(audio.slice(offset, offset + bytesPerChunk));
  }
  return chunks;
};
|
|
2035
|
+
/**
 * Allocates a zero-filled Uint8Array to use as silent audio.
 *
 * @param byteLength Number of bytes to allocate.
 * @returns A new Uint8Array of that length.
 */
var createSilence2 = (byteLength) => {
  const silence = new Uint8Array(byteLength);
  return silence;
};
|
|
2036
|
+
/**
 * Counts committed turns that repeat more often than the fixture allows.
 *
 * Texts are compared after `normalizeTurnText`. Each distinct text may occur
 * as many times as it appears in `expectedTurns`, and at least once; every
 * occurrence beyond that counts as one duplicate.
 *
 * @param actualTurns Turn texts that were actually committed.
 * @param expectedTurns Turn texts the fixture expects.
 * @returns Total number of surplus occurrences.
 */
var countUnexpectedDuplicateTurns = (actualTurns, expectedTurns) => {
  // Maps normalised text -> number of occurrences.
  const tally = (turnTexts) => {
    const counts = new Map();
    for (const text of turnTexts) {
      const key = normalizeTurnText(text);
      counts.set(key, (counts.get(key) ?? 0) + 1);
    }
    return counts;
  };

  const expectedCounts = tally(expectedTurns);
  const actualCounts = tally(actualTurns);

  let duplicates = 0;
  actualCounts.forEach((actualCount, key) => {
    // An unexpected text is still allowed to appear once.
    const allowedOccurrences = Math.max(expectedCounts.get(key) ?? 0, 1);
    duplicates += Math.max(0, actualCount - allowedOccurrences);
  });
  return duplicates;
};
|
|
2057
|
+
/**
 * Converts an outgoing socket payload into a trace-friendly value.
 *
 * @param data Payload handed to the socket: a string, or a binary object
 *   exposing `byteLength`.
 * @returns For non-string payloads, `{ byteLength, kind: "binary" }`. For
 *   strings, the parsed JSON value when the string is valid JSON, otherwise
 *   the string itself.
 */
var normalizeSocketMessage = (data) => {
  if (typeof data !== "string") {
    // Both branches of the former `instanceof ArrayBuffer` conditional read
    // the same property, so a single read is equivalent.
    return {
      byteLength: data.byteLength,
      kind: "binary"
    };
  }
  try {
    return JSON.parse(data);
  } catch {
    // Deliberate best effort: a non-JSON text frame is traced verbatim.
    return data;
  }
};
|
|
2070
|
+
/**
 * Builds a mock socket that reports its activity to an optional observer.
 *
 * @param onEvent Optional callback. `close(code, reason)` reports
 *   `{ data: { code, reason }, phase: "socket.close" }`; `send(data)` reports
 *   `{ data: normalizeSocketMessage(data), phase: "socket.send" }`. Without a
 *   callback both methods do nothing (the payload is not even normalised).
 * @returns Object with async `close` and `send` methods.
 */
var createMockSocket2 = (onEvent) => {
  const close = async (code, reason) => {
    const closeDetails = { code, reason };
    onEvent?.({
      data: closeDetails,
      phase: "socket.close"
    });
  };

  const send = async (data) => {
    // Kept inside the optional call so normalisation only runs when someone
    // is listening.
    onEvent?.({
      data: normalizeSocketMessage(data),
      phase: "socket.send"
    });
  };

  return { close, send };
};
|
|
2087
|
+
/**
 * Polls a session until it looks idle or a timeout elapses.
 *
 * The session counts as idle when its current turn has neither `finalText`
 * nor `partialText` and at least `settleMs` have passed since
 * `lastActivityAt` (or `createdAt` when there is no activity timestamp).
 * Between polls it sleeps for `settleMs`, capped at 100 ms. When
 * `idleTimeoutMs` runs out the function simply returns; no error is raised.
 *
 * @param session Object exposing an async `snapshot()`.
 * @param settleMs Required quiet period in milliseconds.
 * @param idleTimeoutMs Maximum total time to keep polling, in milliseconds.
 */
var waitForSessionIdle = async (session, settleMs, idleTimeoutMs) => {
  const startedAt = Date.now();
  const pollIntervalMs = Math.min(100, settleMs);
  while (Date.now() - startedAt < idleTimeoutMs) {
    const { createdAt, currentTurn, lastActivityAt } = await session.snapshot();
    const hasPendingText = Boolean(currentTurn.finalText || currentTurn.partialText);
    const quietSince = lastActivityAt ?? createdAt;
    const quietForMs = Date.now() - quietSince;
    if (!hasPendingText && quietForMs >= settleMs) {
      return;
    }
    await Bun.sleep(pollIntervalMs);
  }
};
|
|
2099
|
+
/**
 * Replays one audio fixture through a fresh voice session and scores the
 * turns the session commits.
 *
 * The fixture audio is cut into chunks of `fixture.chunkDurationMs`
 * (default 100 ms) and fed in real time; an optional disconnect/reconnect is
 * performed once after the chunk at `fixture.reconnectAtChunkIndex`; silent
 * tail padding (default 1200 ms) follows; the function then waits for the
 * session to go idle and closes it. Committed turns are compared
 * position by position with `fixture.expectedTurnTexts`.
 *
 * @param adapter STT adapter under test.
 * @param fixture Scenario description (audio, format, expected turns, timing
 *   overrides, optional reconnect settings).
 * @param options Optional `{ correctTurn, sttFallback, trace }`. When `trace`
 *   is truthy, logger calls, socket events, turns and session snapshots are
 *   recorded with a millisecond offset from the start of the scenario.
 * @returns Scenario result: actual and expected turns, per-turn results,
 *   duplicate count, turn-count delta, elapsed time, overall `passes`, and
 *   `trace` (undefined unless tracing was requested).
 */
var runVoiceSessionBenchmarkScenario = async (adapter, fixture, options = {}) => {
  const store = createVoiceMemoryStore();
  const committedTurns = [];
  const traceStartedAt = Date.now();
  const trace = [];
  const sessionId = `session-bench-${fixture.id}`;

  // Appends a timestamped entry; a no-op unless tracing is enabled.
  const pushTrace = (entry) => {
    if (options.trace) {
      trace.push({
        ...entry,
        atMs: Date.now() - traceStartedAt
      });
    }
  };

  // Records a condensed copy of the stored session under the given phase.
  const captureSnapshot = async (phase) => {
    if (!options.trace) {
      return;
    }
    const { currentTurn, lastActivityAt, status, turns } = await store.getOrCreate(sessionId);
    pushTrace({
      data: {
        currentTurn: {
          finalText: currentTurn.finalText,
          lastAudioAt: currentTurn.lastAudioAt,
          lastSpeechAt: currentTurn.lastSpeechAt,
          lastTranscriptAt: currentTurn.lastTranscriptAt,
          partialText: currentTurn.partialText,
          silenceStartedAt: currentTurn.silenceStartedAt,
          transcriptCount: currentTurn.transcripts.length
        },
        lastActivityAt,
        status,
        turns: turns.map((turn) => turn.text)
      },
      phase
    });
  };

  // Logger whose every level is redirected into the trace.
  const traceLogger = (phasePrefix) => (message, meta) => {
    pushTrace({
      data: meta,
      phase: `${phasePrefix}${message}`
    });
  };
  const logger = {
    debug: traceLogger("logger.debug:"),
    error: traceLogger("logger.error:"),
    info: traceLogger("logger.info:"),
    warn: traceLogger("logger.warn:")
  };

  const onTurn = async ({ turn }) => {
    const { quality, text } = turn;
    committedTurns.push({ quality, text });
    pushTrace({
      data: {
        quality,
        text,
        transcriptCount: turn.transcripts.length,
        turnId: turn.id
      },
      phase: "route.onTurn"
    });
  };

  const session = createVoiceSession({
    audioConditioning: resolveAudioConditioningConfig(fixture.audioConditioning),
    context: {},
    id: sessionId,
    logger,
    reconnect: {
      maxAttempts: 2,
      strategy: "resume-last-turn",
      timeout: 5000
    },
    route: {
      correctTurn: options.correctTurn,
      onComplete: async () => {},
      onTurn
    },
    phraseHints: fixture.phraseHints,
    socket: createMockSocket2(pushTrace),
    store,
    stt: adapter,
    sttFallback: resolveBenchmarkFallbackConfig(options.sttFallback),
    sttLifecycle: fixture.sttLifecycle ?? "continuous",
    turnDetection: resolveTurnDetectionConfig({
      profile: fixture.turnProfile ?? "balanced",
      silenceMs: fixture.silenceMs ?? DEFAULT_SILENCE_MS,
      speechThreshold: fixture.speechThreshold ?? DEFAULT_SPEECH_THRESHOLD,
      transcriptStabilityMs: fixture.transcriptStabilityMs ?? 900
    })
  });

  const startedAt = Date.now();
  let reconnectTriggered = false;

  // Drops the connection, pauses, and attaches a new mock socket.
  const simulateReconnect = async (chunkIndex) => {
    pushTrace({
      data: {
        chunkIndex
      },
      phase: "reconnect.begin"
    });
    await captureSnapshot("reconnect.pre-disconnect");
    await session.disconnect({
      reason: "benchmark-reconnect",
      recoverable: true,
      type: "close"
    });
    await captureSnapshot("reconnect.post-disconnect");
    await Bun.sleep(fixture.reconnectPauseMs ?? 150);
    await session.connect(createMockSocket2(pushTrace));
    await captureSnapshot("reconnect.post-connect");
  };

  await session.connect(createMockSocket2(pushTrace));
  await captureSnapshot("session.connected");
  try {
    const chunkDurationMs = fixture.chunkDurationMs ?? 100;
    // 2 bytes per sample per channel.
    const bytesPerMillisecond = fixture.format.sampleRateHz * fixture.format.channels * 2 / 1000;
    const bytesPerChunk = Math.max(2, Math.floor(bytesPerMillisecond * chunkDurationMs));

    // Feeds one chunk, then waits the chunk's duration to mimic real time.
    const streamChunk = async (chunk) => {
      await session.receiveAudio(chunk);
      await Bun.sleep(chunkDurationMs);
    };

    const chunks = chunkAudio2(fixture.audio, bytesPerChunk);
    for (const [index, chunk] of chunks.entries()) {
      await streamChunk(chunk);
      const reconnectDue = fixture.reconnectAtChunkIndex !== undefined
        && index === fixture.reconnectAtChunkIndex
        && !reconnectTriggered;
      if (reconnectDue) {
        reconnectTriggered = true;
        await simulateReconnect(index);
      }
    }

    const tailPaddingMs = fixture.tailPaddingMs ?? 1200;
    if (tailPaddingMs > 0) {
      const tailBytes = Math.max(2, Math.floor(bytesPerMillisecond * tailPaddingMs));
      for (const chunk of chunkAudio2(createSilence2(tailBytes), bytesPerChunk)) {
        await streamChunk(chunk);
      }
    }

    const settleMs = Math.max(
      1200,
      (fixture.silenceMs ?? DEFAULT_SILENCE_MS) + (fixture.transcriptStabilityMs ?? 900)
    );
    await waitForSessionIdle(session, settleMs, 8000);
    await captureSnapshot("session.idle");
  } finally {
    await captureSnapshot("session.pre-close");
    await session.close("session-benchmark-complete");
  }

  const expectedTurns = fixture.expectedTurnTexts;
  const actualTurns = committedTurns.map((turn) => turn.text);
  const duplicateTurnCount = countUnexpectedDuplicateTurns(actualTurns, expectedTurns);

  // Expected turns are matched by position against committed turns.
  const turnResults = expectedTurns.map((expectedText, index) => {
    const actualTurn = committedTurns[index];
    const actualText = actualTurn?.text;
    if (!actualText) {
      return {
        actualText: "",
        expectedText,
        index,
        passes: false
      };
    }
    const accuracy = scoreTranscriptAccuracy(actualText, expectedText, fixture.transcriptThreshold ?? 0.35);
    return {
      actualText,
      accuracy,
      expectedText,
      index,
      passes: accuracy.passesThreshold,
      quality: actualTurn?.quality
    };
  });

  // Any committed turn beyond the expected count is reported as a failure.
  committedTurns.slice(expectedTurns.length).forEach((extraTurn, offset) => {
    turnResults.push({
      actualText: extraTurn?.text ?? "",
      expectedText: undefined,
      index: expectedTurns.length + offset,
      passes: false,
      quality: extraTurn?.quality
    });
  });

  const turnCountDelta = committedTurns.length - expectedTurns.length;
  const everyTurnPasses = () => turnResults.every((result) => result.passes);

  return {
    actualTurns,
    duplicateTurnCount,
    elapsedMs: Date.now() - startedAt,
    expectedTurns,
    fixtureId: fixture.id,
    passes: duplicateTurnCount === 0 && turnCountDelta === 0 && everyTurnPasses(),
    reconnectTriggered,
    tags: fixture.tags ?? [],
    title: fixture.title,
    turnCountDelta,
    turnResults,
    trace: options.trace ? trace : undefined
  };
};
|
|
2297
|
+
/**
 * Aggregates the scenario results of one benchmark run.
 *
 * @param adapterId Identifier echoed into the summary.
 * @param scenarios Scenario results as produced by the scenario runner.
 * @returns Summary with average elapsed time, average word error rate over
 *   all turns that carry an accuracy score, pass count and pass rate, the
 *   share and count of scenarios with duplicate turns, the count of scenarios
 *   with a turn-count mismatch, and the pass rate among scenarios that
 *   reconnected (1 when none did).
 */
var summarizeVoiceSessionBenchmark = (adapterId, scenarios) => {
  const scenarioCount = scenarios.length;
  const countWhere = (predicate) => scenarios.filter(predicate).length;

  const passCount = countWhere((scenario) => scenario.passes);
  const scenariosWithDuplicateTurns = countWhere((scenario) => scenario.duplicateTurnCount > 0);
  const scenariosWithTurnCountMismatch = countWhere((scenario) => scenario.turnCountDelta !== 0);

  const reconnectScenarios = scenarios.filter((scenario) => scenario.reconnectTriggered);
  const reconnectSuccessCount = reconnectScenarios.filter((scenario) => scenario.passes).length;

  // Only turns that were actually scored contribute a word error rate.
  const wordErrorRates = [];
  for (const scenario of scenarios) {
    for (const turn of scenario.turnResults) {
      const wordErrorRate = turn.accuracy?.wordErrorRate;
      if (typeof wordErrorRate === "number") {
        wordErrorRates.push(wordErrorRate);
      }
    }
  }

  const shareOfScenarios = (count) => roundMetric3(scenarioCount > 0 ? count / scenarioCount : 0);

  return {
    adapterId,
    averageElapsedMs: roundMetric3(average2(scenarios.map((scenario) => scenario.elapsedMs)), 2),
    averageWordErrorRate: roundMetric3(average2(wordErrorRates)),
    duplicateTurnRate: shareOfScenarios(scenariosWithDuplicateTurns),
    passCount,
    passRate: shareOfScenarios(passCount),
    reconnectSuccessRate: roundMetric3(
      reconnectScenarios.length > 0 ? reconnectSuccessCount / reconnectScenarios.length : 1
    ),
    scenarioCount,
    scenariosWithDuplicateTurns,
    scenariosWithTurnCountMismatch
  };
};
|
|
2315
|
+
/**
 * Combines several benchmark reports into per-fixture aggregates plus an
 * overall summary.
 *
 * Scenario results are grouped by `fixtureId` in first-seen order. Each group
 * yields averages, best/worst word error rate, pass count and rate, and the
 * pass rate among runs that reconnected (1 when none did). The overall
 * summary averages the per-fixture figures and counts stable fixtures (pass
 * rate exactly 1) and flaky ones (pass rate strictly between 0 and 1).
 *
 * @param input `{ adapterId, reports }`; each report exposes `scenarios` and
 *   `summary.passCount`.
 * @returns `{ adapterId, generatedAt, runCount, scenarios, summary }`.
 */
var summarizeVoiceSessionBenchmarkSeries = (input) => {
  const runsByFixture = new Map();
  for (const report of input.reports) {
    for (const scenario of report.scenarios) {
      const existingRuns = runsByFixture.get(scenario.fixtureId);
      if (existingRuns == null) {
        runsByFixture.set(scenario.fixtureId, [scenario]);
      } else {
        existingRuns.push(scenario);
      }
    }
  }

  const countPassing = (runs) => runs.filter((scenario) => scenario.passes).length;

  const scenarioAggregates = Array.from(runsByFixture, ([fixtureId, results]) => {
    const wordErrorRates = results
      .flatMap((scenario) => scenario.turnResults)
      .map((turn) => turn.accuracy?.wordErrorRate)
      .filter((value) => typeof value === "number");
    const hasWordErrorRates = wordErrorRates.length > 0;
    const reconnectRuns = results.filter((scenario) => scenario.reconnectTriggered);
    const passCount = countPassing(results);
    // Title and tags are taken from the first run seen for this fixture.
    const [sample] = results;
    return {
      averageElapsedMs: roundMetric3(average2(results.map((scenario) => scenario.elapsedMs)), 2),
      averageWordErrorRate: roundMetric3(average2(wordErrorRates)),
      bestWordErrorRate: roundMetric3(hasWordErrorRates ? Math.min(...wordErrorRates) : 0),
      fixtureId,
      passCount,
      passRate: roundMetric3(results.length > 0 ? passCount / results.length : 0),
      reconnectSuccessRate: roundMetric3(
        reconnectRuns.length > 0 ? countPassing(reconnectRuns) / reconnectRuns.length : 1
      ),
      runCount: results.length,
      tags: sample.tags,
      title: sample.title,
      worstWordErrorRate: roundMetric3(hasWordErrorRates ? Math.max(...wordErrorRates) : 0)
    };
  });

  let totalRunCount = 0;
  let totalPassCount = 0;
  for (const report of input.reports) {
    totalRunCount += report.scenarios.length;
    totalPassCount += report.summary.passCount;
  }

  const averageOf = (selectMetric) => average2(scenarioAggregates.map(selectMetric));
  const reconnectRates = scenarioAggregates
    .map((scenario) => scenario.reconnectSuccessRate)
    .filter((value) => Number.isFinite(value));

  return {
    adapterId: input.adapterId,
    generatedAt: Date.now(),
    runCount: input.reports.length,
    scenarios: scenarioAggregates,
    summary: {
      adapterId: input.adapterId,
      averageElapsedMs: roundMetric3(averageOf((scenario) => scenario.averageElapsedMs), 2),
      averagePassRate: roundMetric3(averageOf((scenario) => scenario.passRate)),
      averageWordErrorRate: roundMetric3(averageOf((scenario) => scenario.averageWordErrorRate)),
      flakyScenarioCount: scenarioAggregates.filter((scenario) => scenario.passRate > 0 && scenario.passRate < 1).length,
      generatedRunCount: input.reports.length,
      reconnectSuccessRate: roundMetric3(average2(reconnectRates)),
      scenarioCount: scenarioAggregates.length,
      stableScenarioCount: scenarioAggregates.filter((scenario) => scenario.passRate === 1).length,
      totalPassCount,
      totalRunCount
    }
  };
};
|
|
2366
|
+
/**
 * Runs every scenario in `input.scenarios` one after another against the
 * given adapter and summarises the results.
 *
 * Scenarios are awaited sequentially, never in parallel.
 *
 * @param input `{ adapter, adapterId, scenarios, correctTurn, sttFallback, trace }`.
 * @returns `{ adapterId, generatedAt, scenarios, summary }`.
 */
var runVoiceSessionBenchmark = async (input) => {
  const { adapter, adapterId, correctTurn, sttFallback, trace } = input;
  const scenarioResults = [];
  for (const scenario of input.scenarios) {
    const result = await runVoiceSessionBenchmarkScenario(adapter, scenario, {
      correctTurn,
      sttFallback,
      trace
    });
    scenarioResults.push(result);
  }
  const summary = summarizeVoiceSessionBenchmark(adapterId, scenarioResults);
  return {
    adapterId,
    generatedAt: Date.now(),
    scenarios: scenarioResults,
    summary
  };
};
|
|
2382
|
+
/**
 * Repeats the whole session benchmark `input.runs` times, sequentially, and
 * returns the combined series summary.
 *
 * The run count is floored and clamped to at least 1. A missing, non-numeric
 * or infinite `runs` also results in exactly one run: `Math.max(1, NaN)` is
 * NaN, which would otherwise skip the loop entirely, and an infinite count
 * would never finish.
 *
 * @param input `{ adapter, adapterId, runs, scenarios, correctTurn, sttFallback, trace }`.
 * @returns The value of `summarizeVoiceSessionBenchmarkSeries` for the
 *   collected reports.
 */
var runVoiceSessionBenchmarkSeries = async (input) => {
  const requestedRuns = Math.floor(input.runs);
  const runCount = Number.isFinite(requestedRuns) ? Math.max(1, requestedRuns) : 1;
  const reports = [];
  for (let runIndex = 0; runIndex < runCount; runIndex += 1) {
    reports.push(await runVoiceSessionBenchmark({
      adapter: input.adapter,
      adapterId: input.adapterId,
      correctTurn: input.correctTurn,
      scenarios: input.scenarios,
      sttFallback: input.sttFallback,
      trace: input.trace
    }));
  }
  return summarizeVoiceSessionBenchmarkSeries({
    adapterId: input.adapterId,
    reports
  });
};
|
|
437
2400
|
export {
|
|
2401
|
+
summarizeVoiceSessionBenchmarkSeries,
|
|
2402
|
+
summarizeVoiceSessionBenchmark,
|
|
438
2403
|
summarizeSTTBenchmark,
|
|
439
2404
|
scoreTranscriptAccuracy,
|
|
2405
|
+
runVoiceSessionBenchmarkSeries,
|
|
2406
|
+
runVoiceSessionBenchmarkScenario,
|
|
2407
|
+
runVoiceSessionBenchmark,
|
|
2408
|
+
runVoiceResilienceBenchmark,
|
|
440
2409
|
runSTTAdapterFixture,
|
|
441
2410
|
runSTTAdapterBenchmark,
|
|
2411
|
+
resolveFixtureEnvironment,
|
|
442
2412
|
mergeFinalTranscriptText,
|
|
443
2413
|
loadVoiceTestFixtures,
|
|
444
2414
|
getVoiceFixtureDirectory,
|
|
2415
|
+
evaluateSTTBenchmarkAcceptance,
|
|
445
2416
|
compareSTTBenchmarks
|
|
446
2417
|
};
|