@absolutejs/voice 0.0.20 → 0.0.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +387 -4
- package/dist/angular/index.d.ts +1 -0
- package/dist/angular/index.js +669 -3
- package/dist/angular/voice-controller.service.d.ts +21 -0
- package/dist/audioConditioning.d.ts +3 -0
- package/dist/client/actions.d.ts +7 -0
- package/dist/client/connection.d.ts +5 -0
- package/dist/client/controller.d.ts +2 -0
- package/dist/client/htmxBootstrap.js +576 -167
- package/dist/client/index.d.ts +1 -0
- package/dist/client/index.js +486 -3
- package/dist/client/microphone.d.ts +4 -2
- package/dist/correction.d.ts +16 -0
- package/dist/index.d.ts +4 -0
- package/dist/index.js +1314 -283
- package/dist/presets.d.ts +13 -0
- package/dist/react/index.d.ts +1 -0
- package/dist/react/index.js +642 -3
- package/dist/react/useVoiceController.d.ts +20 -0
- package/dist/react/useVoiceStream.d.ts +1 -0
- package/dist/store.d.ts +2 -2
- package/dist/svelte/index.d.ts +1 -0
- package/dist/svelte/index.js +607 -3
- package/dist/testing/benchmark.d.ts +36 -0
- package/dist/testing/index.js +1453 -241
- package/dist/testing/sessionBenchmark.d.ts +67 -2
- package/dist/testing/stt.d.ts +1 -0
- package/dist/turnDetection.d.ts +5 -1
- package/dist/turnProfiles.d.ts +6 -0
- package/dist/types.d.ts +198 -8
- package/dist/vue/index.d.ts +1 -0
- package/dist/vue/index.js +660 -3
- package/dist/vue/useVoiceController.d.ts +19 -0
- package/fixtures/README.md +9 -0
- package/fixtures/manifest.json +59 -1
- package/fixtures/pcm/dialogue-three-clean.pcm +0 -0
- package/fixtures/pcm/dialogue-three-mixed.pcm +0 -0
- package/fixtures/pcm/dialogue-two-clean.pcm +0 -0
- package/fixtures/pcm/dialogue-two-noisy.pcm +0 -0
- package/package.json +21 -1
package/dist/testing/index.js
CHANGED
|
@@ -95,6 +95,61 @@ var measureAudioLevel = (audio) => {
|
|
|
95
95
|
return Math.sqrt(sumSquares / samples.length);
|
|
96
96
|
};
|
|
97
97
|
var normalizeText = (value) => value.trim().replace(/\s+/g, " ");
|
|
98
|
+
var countWords = (value) => value.length > 0 ? value.split(" ").length : 0;
|
|
99
|
+
var selectPreferredTranscriptText = (currentText, nextText) => {
|
|
100
|
+
const current = normalizeText(currentText);
|
|
101
|
+
const next = normalizeText(nextText);
|
|
102
|
+
if (!current) {
|
|
103
|
+
return next;
|
|
104
|
+
}
|
|
105
|
+
if (!next) {
|
|
106
|
+
return current;
|
|
107
|
+
}
|
|
108
|
+
if (current === next || current.includes(next)) {
|
|
109
|
+
return current;
|
|
110
|
+
}
|
|
111
|
+
if (next.includes(current)) {
|
|
112
|
+
return next;
|
|
113
|
+
}
|
|
114
|
+
if (countWords(next) > countWords(current)) {
|
|
115
|
+
return next;
|
|
116
|
+
}
|
|
117
|
+
return current;
|
|
118
|
+
};
|
|
119
|
+
var mergeSequentialTranscriptText = (currentText, nextText) => {
|
|
120
|
+
const current = normalizeText(currentText);
|
|
121
|
+
const next = normalizeText(nextText);
|
|
122
|
+
if (!current) {
|
|
123
|
+
return next;
|
|
124
|
+
}
|
|
125
|
+
if (!next) {
|
|
126
|
+
return current;
|
|
127
|
+
}
|
|
128
|
+
const currentWords = current.split(" ");
|
|
129
|
+
const nextWords = next.split(" ");
|
|
130
|
+
const maxOverlap = Math.min(currentWords.length, nextWords.length);
|
|
131
|
+
for (let overlap = maxOverlap;overlap > 0; overlap -= 1) {
|
|
132
|
+
const currentSuffix = currentWords.slice(-overlap).join(" ");
|
|
133
|
+
const nextPrefix = nextWords.slice(0, overlap).join(" ");
|
|
134
|
+
if (currentSuffix === nextPrefix) {
|
|
135
|
+
return [...currentWords, ...nextWords.slice(overlap)].join(" ");
|
|
136
|
+
}
|
|
137
|
+
}
|
|
138
|
+
return `${current} ${next}`.trim();
|
|
139
|
+
};
|
|
140
|
+
var countCommonPrefixWords = (currentText, nextText) => {
|
|
141
|
+
const currentWords = normalizeText(currentText).split(" ").filter(Boolean);
|
|
142
|
+
const nextWords = normalizeText(nextText).split(" ").filter(Boolean);
|
|
143
|
+
const maxWords = Math.min(currentWords.length, nextWords.length);
|
|
144
|
+
let count = 0;
|
|
145
|
+
for (let index = 0;index < maxWords; index += 1) {
|
|
146
|
+
if (currentWords[index] !== nextWords[index]) {
|
|
147
|
+
break;
|
|
148
|
+
}
|
|
149
|
+
count += 1;
|
|
150
|
+
}
|
|
151
|
+
return count;
|
|
152
|
+
};
|
|
98
153
|
var mergeTranscriptTexts = (transcripts) => {
|
|
99
154
|
const merged = [];
|
|
100
155
|
for (const transcript of transcripts) {
|
|
@@ -118,12 +173,14 @@ var mergeTranscriptTexts = (transcripts) => {
|
|
|
118
173
|
}
|
|
119
174
|
return merged.join(" ").trim();
|
|
120
175
|
};
|
|
121
|
-
var buildTurnText = (transcripts, partialText) => {
|
|
176
|
+
var buildTurnText = (transcripts, partialText, options = {}) => {
|
|
122
177
|
const finalText = mergeTranscriptTexts(transcripts);
|
|
123
|
-
|
|
124
|
-
|
|
178
|
+
const nextPartial = normalizeText(partialText);
|
|
179
|
+
const lastFinalEndedAtMs = [...transcripts].reverse().find((transcript) => typeof transcript.endedAtMs === "number")?.endedAtMs;
|
|
180
|
+
if (finalText && nextPartial && typeof lastFinalEndedAtMs === "number" && typeof options.partialStartedAtMs === "number" && options.partialStartedAtMs - lastFinalEndedAtMs >= 250 && countCommonPrefixWords(finalText, nextPartial) === 0) {
|
|
181
|
+
return mergeSequentialTranscriptText(finalText, nextPartial);
|
|
125
182
|
}
|
|
126
|
-
return
|
|
183
|
+
return selectPreferredTranscriptText(finalText, nextPartial);
|
|
127
184
|
};
|
|
128
185
|
|
|
129
186
|
// src/testing/accuracy.ts
|
|
@@ -204,6 +261,7 @@ var runSTTAdapterFixture = async (adapter, fixture, options = {}) => {
|
|
|
204
261
|
const settleMs = options.settleMs ?? 500;
|
|
205
262
|
const waitForRealtimeMs = options.waitForRealtimeMs ?? 0;
|
|
206
263
|
let lastActivityAt = Date.now();
|
|
264
|
+
let speechEndedAt = startedAt;
|
|
207
265
|
const markActive = () => {
|
|
208
266
|
lastActivityAt = Date.now();
|
|
209
267
|
};
|
|
@@ -240,12 +298,15 @@ var runSTTAdapterFixture = async (adapter, fixture, options = {}) => {
|
|
|
240
298
|
const realtimeDelayMs = waitForRealtimeMs > 0 ? waitForRealtimeMs : chunkDurationMs;
|
|
241
299
|
for (const chunk of chunks) {
|
|
242
300
|
await session.send(chunk);
|
|
301
|
+
markActive();
|
|
243
302
|
await Bun.sleep(realtimeDelayMs);
|
|
244
303
|
}
|
|
304
|
+
speechEndedAt = Date.now();
|
|
245
305
|
if (tailPaddingMs > 0) {
|
|
246
306
|
const tailBytes = Math.max(2, Math.floor(bytesPerMillisecond * tailPaddingMs));
|
|
247
307
|
for (const chunk of chunkAudio(createSilence(tailBytes), bytesPerChunk)) {
|
|
248
308
|
await session.send(chunk);
|
|
309
|
+
markActive();
|
|
249
310
|
await Bun.sleep(realtimeDelayMs);
|
|
250
311
|
}
|
|
251
312
|
}
|
|
@@ -265,11 +326,30 @@ var runSTTAdapterFixture = async (adapter, fixture, options = {}) => {
|
|
|
265
326
|
finalEvents,
|
|
266
327
|
finalText,
|
|
267
328
|
partialEvents,
|
|
329
|
+
speechEndedAt,
|
|
268
330
|
startedAt
|
|
269
331
|
};
|
|
270
332
|
};
|
|
271
333
|
|
|
272
334
|
// src/testing/benchmark.ts
|
|
335
|
+
var resolveFixtureEnvironment = (fixture) => {
|
|
336
|
+
const tags = new Set(fixture.tags ?? []);
|
|
337
|
+
const hasAccent = tags.has("accent") || tags.has("speech-accent-archive");
|
|
338
|
+
const hasNoisy = tags.has("noisy") || tags.has("synthetic-noise") || tags.has("stress");
|
|
339
|
+
if (hasAccent && hasNoisy) {
|
|
340
|
+
return "accent-noisy";
|
|
341
|
+
}
|
|
342
|
+
if (hasAccent) {
|
|
343
|
+
return "accent";
|
|
344
|
+
}
|
|
345
|
+
if (hasNoisy) {
|
|
346
|
+
return "noisy";
|
|
347
|
+
}
|
|
348
|
+
if (tags.has("clean")) {
|
|
349
|
+
return "clean";
|
|
350
|
+
}
|
|
351
|
+
return "other";
|
|
352
|
+
};
|
|
273
353
|
var normalizeBenchmarkText = (value) => value.toLowerCase().replace(/[^\p{L}\p{N}\s']/gu, " ").replace(/\s+/g, " ").trim();
|
|
274
354
|
var scoreExpectedTerms = (actualText, expectedTerms) => {
|
|
275
355
|
const normalizedActual = normalizeBenchmarkText(actualText);
|
|
@@ -300,10 +380,46 @@ var roundMetric = (value, digits = 4) => {
|
|
|
300
380
|
const factor = 10 ** digits;
|
|
301
381
|
return Math.round(value * factor) / factor;
|
|
302
382
|
};
|
|
383
|
+
var calculateGroupSummary = (fixtures) => {
|
|
384
|
+
const grouped = new Map;
|
|
385
|
+
for (const fixture of fixtures) {
|
|
386
|
+
const existing = grouped.get(fixture.group) ?? [];
|
|
387
|
+
existing.push(fixture);
|
|
388
|
+
grouped.set(fixture.group, existing);
|
|
389
|
+
}
|
|
390
|
+
return Array.from(grouped.entries()).map(([group, results]) => {
|
|
391
|
+
const fixtureCount = results.length;
|
|
392
|
+
const passCount = results.filter((fixture) => fixture.passes).length;
|
|
393
|
+
const averageWordErrorRate = average(results.map((result) => result.accuracy.wordErrorRate)) ?? 0;
|
|
394
|
+
const averageTermRecall = average(results.map((result) => result.expectedTerms.recall)) ?? 0;
|
|
395
|
+
const averageElapsedMs = average(results.map((result) => result.elapsedMs));
|
|
396
|
+
const accuracy = 1 - averageWordErrorRate;
|
|
397
|
+
return {
|
|
398
|
+
averageElapsedMs: roundMetric(averageElapsedMs, 2) ?? 0,
|
|
399
|
+
averageTermRecall: roundMetric(averageTermRecall) ?? 0,
|
|
400
|
+
averageWordErrorRate: roundMetric(averageWordErrorRate) ?? 0,
|
|
401
|
+
fixturesWithErrors: results.filter((fixture) => fixture.errorCount > 0).length,
|
|
402
|
+
fixturesWithFragments: results.filter((fixture) => fixture.fragmentationCount > 0).length,
|
|
403
|
+
fixtureCount,
|
|
404
|
+
group,
|
|
405
|
+
passCount,
|
|
406
|
+
passRate: fixtureCount > 0 ? roundMetric(passCount / fixtureCount) ?? 0 : 0,
|
|
407
|
+
wordAccuracyRate: roundMetric(accuracy) ?? 0
|
|
408
|
+
};
|
|
409
|
+
}).sort((a, b) => a.group.localeCompare(b.group));
|
|
410
|
+
};
|
|
303
411
|
var toFixtureBenchmarkResult = (fixture, result, elapsedMs) => {
|
|
412
|
+
const toPostSpeechLatency = (timestamp) => {
|
|
413
|
+
if (typeof timestamp !== "number") {
|
|
414
|
+
return;
|
|
415
|
+
}
|
|
416
|
+
return Math.max(0, timestamp - result.speechEndedAt);
|
|
417
|
+
};
|
|
304
418
|
const timeToFirstPartialMs = result.partialEvents[0] ? result.partialEvents[0].receivedAt - result.startedAt : undefined;
|
|
305
419
|
const timeToFirstFinalMs = result.finalEvents[0] ? result.finalEvents[0].receivedAt - result.startedAt : undefined;
|
|
306
420
|
const timeToEndOfTurnMs = result.endOfTurnEvents[0] ? result.endOfTurnEvents[0].receivedAt - result.startedAt : undefined;
|
|
421
|
+
const postSpeechTimeToFirstFinalMs = toPostSpeechLatency(result.finalEvents[0]?.receivedAt);
|
|
422
|
+
const postSpeechTimeToEndOfTurnMs = toPostSpeechLatency(result.endOfTurnEvents[0]?.receivedAt);
|
|
307
423
|
const expectedTerms = scoreExpectedTerms(result.finalText, fixture.expectedTerms);
|
|
308
424
|
return {
|
|
309
425
|
accuracy: result.accuracy,
|
|
@@ -317,8 +433,11 @@ var toFixtureBenchmarkResult = (fixture, result, elapsedMs) => {
|
|
|
317
433
|
finalText: result.finalText,
|
|
318
434
|
fixtureId: fixture.id,
|
|
319
435
|
fragmentationCount: Math.max(0, result.finalEvents.length - 1),
|
|
436
|
+
group: resolveFixtureEnvironment(fixture),
|
|
320
437
|
passes: result.errorEvents.length === 0 && result.finalText.trim().length > 0 && result.accuracy.passesThreshold,
|
|
321
438
|
partialCount: result.partialEvents.length,
|
|
439
|
+
postSpeechTimeToEndOfTurnMs,
|
|
440
|
+
postSpeechTimeToFirstFinalMs,
|
|
322
441
|
tags: fixture.tags ?? [],
|
|
323
442
|
timeToEndOfTurnMs,
|
|
324
443
|
timeToFirstFinalMs,
|
|
@@ -336,6 +455,8 @@ var summarizeSTTBenchmark = (adapterId, fixtures) => {
|
|
|
336
455
|
averageEndOfTurnCount: roundMetric(average(fixtures.map((fixture) => fixture.endOfTurnCount)), 2) ?? 0,
|
|
337
456
|
averageFinalCount: roundMetric(average(fixtures.map((fixture) => fixture.finalCount)), 2) ?? 0,
|
|
338
457
|
averageTermRecall: roundMetric(average(fixtures.map((fixture) => fixture.expectedTerms.recall))) ?? 0,
|
|
458
|
+
averagePostSpeechTimeToEndOfTurnMs: roundMetric(average(fixtures.map((fixture) => fixture.postSpeechTimeToEndOfTurnMs)), 2),
|
|
459
|
+
averagePostSpeechTimeToFirstFinalMs: roundMetric(average(fixtures.map((fixture) => fixture.postSpeechTimeToFirstFinalMs)), 2),
|
|
339
460
|
averageTimeToEndOfTurnMs: roundMetric(average(fixtures.map((fixture) => fixture.timeToEndOfTurnMs)), 2),
|
|
340
461
|
averageTimeToFirstFinalMs: roundMetric(average(fixtures.map((fixture) => fixture.timeToFirstFinalMs)), 2),
|
|
341
462
|
averageTimeToFirstPartialMs: roundMetric(average(fixtures.map((fixture) => fixture.timeToFirstPartialMs)), 2),
|
|
@@ -343,12 +464,51 @@ var summarizeSTTBenchmark = (adapterId, fixtures) => {
|
|
|
343
464
|
fixtureCount,
|
|
344
465
|
fixturesWithErrors: fixtures.filter((fixture) => fixture.errorCount > 0).length,
|
|
345
466
|
fixturesWithFragmentation: fixtures.filter((fixture) => fixture.fragmentationCount > 0).length,
|
|
467
|
+
groupSummaries: calculateGroupSummary(fixtures),
|
|
346
468
|
passCount,
|
|
347
469
|
passRate: fixtureCount > 0 ? roundMetric(passCount / fixtureCount) ?? 0 : 0,
|
|
348
470
|
totalErrorCount: fixtures.reduce((sum, fixture) => sum + fixture.errorCount, 0),
|
|
349
471
|
wordAccuracyRate: fixtureCount > 0 ? roundMetric(1 - (average(fixtures.map((fixture) => fixture.accuracy.wordErrorRate)) ?? 0)) ?? 0 : 0
|
|
350
472
|
};
|
|
351
473
|
};
|
|
474
|
+
var evaluateSTTBenchmarkAcceptance = (report, thresholds = {}) => {
|
|
475
|
+
const failures = [];
|
|
476
|
+
const details = thresholds;
|
|
477
|
+
const overallPassRate = details.overallPassRate;
|
|
478
|
+
if (overallPassRate !== undefined && report.summary.passRate < overallPassRate) {
|
|
479
|
+
failures.push(`overall passRate ${(report.summary.passRate * 100).toFixed(2)}% below ${(overallPassRate * 100).toFixed(2)}%`);
|
|
480
|
+
}
|
|
481
|
+
const minTermRecall = details.termRecall;
|
|
482
|
+
if (minTermRecall !== undefined && report.summary.averageTermRecall < minTermRecall) {
|
|
483
|
+
failures.push(`overall term recall ${report.summary.averageTermRecall.toFixed(4)} below ${minTermRecall.toFixed(4)}`);
|
|
484
|
+
}
|
|
485
|
+
const minWordAccuracy = details.wordAccuracyRate;
|
|
486
|
+
if (minWordAccuracy !== undefined && report.summary.wordAccuracyRate < minWordAccuracy) {
|
|
487
|
+
failures.push(`overall word accuracy ${(report.summary.wordAccuracyRate * 100).toFixed(2)}% below ${(minWordAccuracy * 100).toFixed(2)}%`);
|
|
488
|
+
}
|
|
489
|
+
const groupThresholds = details.groupPassRate;
|
|
490
|
+
if (groupThresholds) {
|
|
491
|
+
for (const groupSummary of report.summary.groupSummaries) {
|
|
492
|
+
const threshold = groupThresholds[groupSummary.group];
|
|
493
|
+
if (!threshold) {
|
|
494
|
+
continue;
|
|
495
|
+
}
|
|
496
|
+
if (threshold.passRate !== undefined && groupSummary.passRate < threshold.passRate) {
|
|
497
|
+
failures.push(`${groupSummary.group} passRate ${(groupSummary.passRate * 100).toFixed(2)}% below ${(threshold.passRate * 100).toFixed(2)}%`);
|
|
498
|
+
}
|
|
499
|
+
if (threshold.wordAccuracyRate !== undefined && groupSummary.wordAccuracyRate < threshold.wordAccuracyRate) {
|
|
500
|
+
failures.push(`${groupSummary.group} wordAccuracy ${(groupSummary.wordAccuracyRate * 100).toFixed(2)}% below ${(threshold.wordAccuracyRate * 100).toFixed(2)}%`);
|
|
501
|
+
}
|
|
502
|
+
}
|
|
503
|
+
}
|
|
504
|
+
const score = roundMetric(report.summary.passRate * 0.45 + report.summary.wordAccuracyRate * 0.35 + report.summary.averageTermRecall * 0.2, 3) ?? 0;
|
|
505
|
+
return {
|
|
506
|
+
adapterId: report.adapterId,
|
|
507
|
+
failures,
|
|
508
|
+
passed: failures.length === 0,
|
|
509
|
+
score
|
|
510
|
+
};
|
|
511
|
+
};
|
|
352
512
|
var compareSTTBenchmarks = (reports) => {
|
|
353
513
|
const entries = reports.map((report) => ({
|
|
354
514
|
adapterId: report.adapterId,
|
|
@@ -436,22 +596,34 @@ var loadVoiceTestFixtures = async (fixtureDirectory) => {
|
|
|
436
596
|
};
|
|
437
597
|
// src/store.ts
|
|
438
598
|
var createId = () => crypto.randomUUID();
|
|
439
|
-
var createVoiceSessionRecord = (id) => ({
|
|
599
|
+
var createVoiceSessionRecord = (id, scenarioId) => ({
|
|
440
600
|
committedTurnIds: [],
|
|
441
601
|
createdAt: Date.now(),
|
|
442
602
|
currentTurn: {
|
|
443
603
|
finalText: "",
|
|
604
|
+
lastSpeechAt: undefined,
|
|
605
|
+
lastTranscriptAt: undefined,
|
|
606
|
+
partialEndedAt: undefined,
|
|
607
|
+
partialStartedAt: undefined,
|
|
444
608
|
partialText: "",
|
|
609
|
+
silenceStartedAt: undefined,
|
|
445
610
|
transcripts: []
|
|
446
611
|
},
|
|
447
612
|
id,
|
|
613
|
+
scenarioId,
|
|
448
614
|
reconnect: { attempts: 0 },
|
|
449
615
|
status: "active",
|
|
450
616
|
transcripts: [],
|
|
451
|
-
turns: []
|
|
617
|
+
turns: [],
|
|
618
|
+
lastCommittedTurn: {
|
|
619
|
+
committedAt: 0,
|
|
620
|
+
signature: "",
|
|
621
|
+
text: "",
|
|
622
|
+
transcriptIds: []
|
|
623
|
+
}
|
|
452
624
|
});
|
|
453
|
-
var resetVoiceSessionRecord = (id, existing) => ({
|
|
454
|
-
...createVoiceSessionRecord(id),
|
|
625
|
+
var resetVoiceSessionRecord = (id, existing, scenarioId) => ({
|
|
626
|
+
...createVoiceSessionRecord(id, scenarioId),
|
|
455
627
|
metadata: existing?.metadata
|
|
456
628
|
});
|
|
457
629
|
var toVoiceSessionSummary = (session) => ({
|
|
@@ -484,6 +656,61 @@ var createVoiceMemoryStore = () => {
|
|
|
484
656
|
return { get, getOrCreate, list, remove, set };
|
|
485
657
|
};
|
|
486
658
|
|
|
659
|
+
// src/audioConditioning.ts
|
|
660
|
+
var DEFAULT_TARGET_LEVEL = 0.08;
|
|
661
|
+
var DEFAULT_MAX_GAIN = 3;
|
|
662
|
+
var DEFAULT_NOISE_GATE_THRESHOLD = 0.006;
|
|
663
|
+
var DEFAULT_NOISE_GATE_ATTENUATION = 0.15;
|
|
664
|
+
var toInt16Array = (audio) => {
|
|
665
|
+
if (audio instanceof ArrayBuffer) {
|
|
666
|
+
return new Int16Array(audio, 0, Math.floor(audio.byteLength / 2));
|
|
667
|
+
}
|
|
668
|
+
return new Int16Array(audio.buffer, audio.byteOffset, Math.floor(audio.byteLength / 2));
|
|
669
|
+
};
|
|
670
|
+
var computeRms = (samples) => {
|
|
671
|
+
if (samples.length === 0) {
|
|
672
|
+
return 0;
|
|
673
|
+
}
|
|
674
|
+
let sumSquares = 0;
|
|
675
|
+
for (const sample of samples) {
|
|
676
|
+
const normalized = sample / 32768;
|
|
677
|
+
sumSquares += normalized * normalized;
|
|
678
|
+
}
|
|
679
|
+
return Math.sqrt(sumSquares / samples.length);
|
|
680
|
+
};
|
|
681
|
+
var resolveAudioConditioningConfig = (config) => {
|
|
682
|
+
if (!config || config.enabled === false) {
|
|
683
|
+
return;
|
|
684
|
+
}
|
|
685
|
+
return {
|
|
686
|
+
enabled: true,
|
|
687
|
+
maxGain: config.maxGain ?? DEFAULT_MAX_GAIN,
|
|
688
|
+
noiseGateAttenuation: config.noiseGateAttenuation ?? DEFAULT_NOISE_GATE_ATTENUATION,
|
|
689
|
+
noiseGateThreshold: config.noiseGateThreshold ?? DEFAULT_NOISE_GATE_THRESHOLD,
|
|
690
|
+
targetLevel: config.targetLevel ?? DEFAULT_TARGET_LEVEL
|
|
691
|
+
};
|
|
692
|
+
};
|
|
693
|
+
var conditionAudioChunk = (audio, config) => {
|
|
694
|
+
if (!config) {
|
|
695
|
+
return audio;
|
|
696
|
+
}
|
|
697
|
+
const source = toInt16Array(audio);
|
|
698
|
+
if (source.length === 0) {
|
|
699
|
+
return audio;
|
|
700
|
+
}
|
|
701
|
+
const rms = computeRms(source);
|
|
702
|
+
const output = new Int16Array(source.length);
|
|
703
|
+
const gateFactor = rms < config.noiseGateThreshold ? config.noiseGateAttenuation : 1;
|
|
704
|
+
const baseLevel = Math.max(rms * gateFactor, 0.000001);
|
|
705
|
+
const gain = Math.min(config.maxGain, config.targetLevel / baseLevel);
|
|
706
|
+
const appliedGain = Math.max(0.25, gain) * gateFactor;
|
|
707
|
+
for (let index = 0;index < source.length; index += 1) {
|
|
708
|
+
const next = Math.round(source[index] * appliedGain);
|
|
709
|
+
output[index] = Math.max(-32768, Math.min(32767, next));
|
|
710
|
+
}
|
|
711
|
+
return new Uint8Array(output.buffer);
|
|
712
|
+
};
|
|
713
|
+
|
|
487
714
|
// src/logger.ts
|
|
488
715
|
var noop = () => {};
|
|
489
716
|
var createNoopLogger = () => ({
|
|
@@ -500,13 +727,128 @@ var resolveLogger = (logger) => ({
|
|
|
500
727
|
// src/session.ts
|
|
501
728
|
var DEFAULT_RECONNECT_TIMEOUT = 30000;
|
|
502
729
|
var DEFAULT_MAX_RECONNECT_ATTEMPTS = 10;
|
|
730
|
+
var DEFAULT_TRANSCRIPT_STABILITY_MS = 450;
|
|
731
|
+
var DEFAULT_FALLBACK_REPLAY_MS = 8000;
|
|
732
|
+
var DEFAULT_FALLBACK_SETTLE_MS = 220;
|
|
733
|
+
var DEFAULT_FALLBACK_COMPLETION_TIMEOUT_MS = 2500;
|
|
734
|
+
var DEFAULT_FALLBACK_CONFIDENCE_THRESHOLD = 0.6;
|
|
735
|
+
var DEFAULT_FALLBACK_MIN_TEXT_LENGTH = 2;
|
|
736
|
+
var DEFAULT_FALLBACK_MAX_ATTEMPTS_PER_TURN = 1;
|
|
737
|
+
var DEFAULT_DUPLICATE_TURN_WINDOW_MS = 5000;
|
|
738
|
+
var FALLBACK_CONFIDENCE_SELECTION_DELTA = 0.05;
|
|
739
|
+
var FALLBACK_WORD_COUNT_SELECTION_MARGIN_RATIO = 0.12;
|
|
740
|
+
var DEFAULT_FORMAT = {
|
|
741
|
+
channels: 1,
|
|
742
|
+
container: "raw",
|
|
743
|
+
encoding: "pcm_s16le",
|
|
744
|
+
sampleRateHz: 16000
|
|
745
|
+
};
|
|
503
746
|
var toError = (value) => value instanceof Error ? value : new Error(String(value));
|
|
504
747
|
var createEmptyCurrentTurn = () => ({
|
|
505
748
|
finalText: "",
|
|
749
|
+
lastSpeechAt: undefined,
|
|
750
|
+
lastTranscriptAt: undefined,
|
|
751
|
+
partialEndedAt: undefined,
|
|
752
|
+
partialStartedAt: undefined,
|
|
506
753
|
partialText: "",
|
|
754
|
+
silenceStartedAt: undefined,
|
|
507
755
|
transcripts: []
|
|
508
756
|
});
|
|
509
757
|
var cloneTranscript = (transcript) => ({ ...transcript });
|
|
758
|
+
var countWords2 = (text) => text.trim().split(/\s+/).filter(Boolean).length;
|
|
759
|
+
var normalizeText2 = (text) => text.trim().replace(/\s+/g, " ");
|
|
760
|
+
var getAudioChunkDurationMs = (chunk) => chunk.byteLength / (DEFAULT_FORMAT.sampleRateHz * DEFAULT_FORMAT.channels * 2) * 1000;
|
|
761
|
+
var getBufferedAudioDurationMs = (chunks) => chunks.reduce((total, chunk) => total + getAudioChunkDurationMs(chunk), 0);
|
|
762
|
+
var calculateMeanConfidence = (transcripts) => {
|
|
763
|
+
let sum = 0;
|
|
764
|
+
let total = 0;
|
|
765
|
+
for (const transcript of transcripts) {
|
|
766
|
+
if (typeof transcript.confidence === "number") {
|
|
767
|
+
sum += transcript.confidence;
|
|
768
|
+
total += 1;
|
|
769
|
+
}
|
|
770
|
+
}
|
|
771
|
+
if (total === 0) {
|
|
772
|
+
return 0;
|
|
773
|
+
}
|
|
774
|
+
return sum / total;
|
|
775
|
+
};
|
|
776
|
+
var createTurnQuality = (transcripts, source, fallbackUsed, fallbackDiagnostics, correctionDiagnostics) => {
|
|
777
|
+
const sampledTranscripts = transcripts.filter((transcript) => typeof transcript.confidence === "number");
|
|
778
|
+
const confidenceSampleCount = sampledTranscripts.length;
|
|
779
|
+
return {
|
|
780
|
+
averageConfidence: confidenceSampleCount > 0 ? sampledTranscripts.reduce((sum, transcript) => sum + transcript.confidence, 0) / confidenceSampleCount : undefined,
|
|
781
|
+
confidenceSampleCount,
|
|
782
|
+
correction: correctionDiagnostics,
|
|
783
|
+
fallback: fallbackDiagnostics,
|
|
784
|
+
fallbackUsed,
|
|
785
|
+
finalTranscriptCount: transcripts.filter((transcript) => transcript.isFinal).length,
|
|
786
|
+
partialTranscriptCount: transcripts.filter((transcript) => !transcript.isFinal).length,
|
|
787
|
+
selectedTranscriptCount: transcripts.length,
|
|
788
|
+
source
|
|
789
|
+
};
|
|
790
|
+
};
|
|
791
|
+
var normalizeCorrectionText = (text) => normalizeText2(text);
|
|
792
|
+
var isFallbackNeeded = (candidate, config) => {
|
|
793
|
+
const trimmed = normalizeText2(candidate.text);
|
|
794
|
+
const wordCount = countWords2(trimmed);
|
|
795
|
+
if (config.trigger === "always") {
|
|
796
|
+
return true;
|
|
797
|
+
}
|
|
798
|
+
if (config.trigger === "empty-turn") {
|
|
799
|
+
return wordCount < config.minTextLength;
|
|
800
|
+
}
|
|
801
|
+
const averageConfidence = calculateMeanConfidence(candidate.transcripts);
|
|
802
|
+
if (config.trigger === "low-confidence") {
|
|
803
|
+
return averageConfidence > 0 && averageConfidence < config.confidenceThreshold;
|
|
804
|
+
}
|
|
805
|
+
return averageConfidence > 0 && averageConfidence < config.confidenceThreshold || wordCount < config.minTextLength;
|
|
806
|
+
};
|
|
807
|
+
var selectBetterTurnText = (candidate, fallback) => {
|
|
808
|
+
if (!fallback.text) {
|
|
809
|
+
return {
|
|
810
|
+
reason: "fallback-empty",
|
|
811
|
+
winner: candidate
|
|
812
|
+
};
|
|
813
|
+
}
|
|
814
|
+
if (!candidate.text) {
|
|
815
|
+
return {
|
|
816
|
+
reason: "primary-empty",
|
|
817
|
+
winner: fallback
|
|
818
|
+
};
|
|
819
|
+
}
|
|
820
|
+
const largestWordCount = Math.max(candidate.wordCount, fallback.wordCount, 1);
|
|
821
|
+
const wordCountDelta = fallback.wordCount - candidate.wordCount;
|
|
822
|
+
const wordCountDeltaRatio = Math.abs(wordCountDelta) / largestWordCount;
|
|
823
|
+
if (wordCountDeltaRatio >= FALLBACK_WORD_COUNT_SELECTION_MARGIN_RATIO && wordCountDelta !== 0) {
|
|
824
|
+
return {
|
|
825
|
+
reason: "word-count-margin",
|
|
826
|
+
winner: wordCountDelta > 0 ? fallback : candidate
|
|
827
|
+
};
|
|
828
|
+
}
|
|
829
|
+
if (fallback.confidence > candidate.confidence + FALLBACK_CONFIDENCE_SELECTION_DELTA) {
|
|
830
|
+
return {
|
|
831
|
+
reason: "confidence-margin",
|
|
832
|
+
winner: fallback
|
|
833
|
+
};
|
|
834
|
+
}
|
|
835
|
+
if (candidate.confidence > fallback.confidence + FALLBACK_CONFIDENCE_SELECTION_DELTA) {
|
|
836
|
+
return {
|
|
837
|
+
reason: "kept-primary",
|
|
838
|
+
winner: candidate
|
|
839
|
+
};
|
|
840
|
+
}
|
|
841
|
+
if (fallback.wordCount > candidate.wordCount) {
|
|
842
|
+
return {
|
|
843
|
+
reason: "word-count-tiebreak",
|
|
844
|
+
winner: fallback
|
|
845
|
+
};
|
|
846
|
+
}
|
|
847
|
+
return {
|
|
848
|
+
reason: "kept-primary",
|
|
849
|
+
winner: candidate
|
|
850
|
+
};
|
|
851
|
+
};
|
|
510
852
|
var setTurnResult = (session, turnId, input) => {
|
|
511
853
|
session.turns = session.turns.map((turn) => turn.id === turnId ? {
|
|
512
854
|
...turn,
|
|
@@ -523,12 +865,55 @@ var createVoiceSession = (options) => {
|
|
|
523
865
|
};
|
|
524
866
|
const turnDetection = {
|
|
525
867
|
silenceMs: options.turnDetection.silenceMs ?? DEFAULT_SILENCE_MS,
|
|
526
|
-
speechThreshold: options.turnDetection.speechThreshold ?? DEFAULT_SPEECH_THRESHOLD
|
|
868
|
+
speechThreshold: options.turnDetection.speechThreshold ?? DEFAULT_SPEECH_THRESHOLD,
|
|
869
|
+
transcriptStabilityMs: options.turnDetection.transcriptStabilityMs ?? DEFAULT_TRANSCRIPT_STABILITY_MS
|
|
527
870
|
};
|
|
871
|
+
const sttFallback = options.sttFallback ? {
|
|
872
|
+
adapter: options.sttFallback.adapter,
|
|
873
|
+
completionTimeoutMs: options.sttFallback.completionTimeoutMs ?? DEFAULT_FALLBACK_COMPLETION_TIMEOUT_MS,
|
|
874
|
+
confidenceThreshold: options.sttFallback.confidenceThreshold ?? DEFAULT_FALLBACK_CONFIDENCE_THRESHOLD,
|
|
875
|
+
maxAttemptsPerTurn: options.sttFallback.maxAttemptsPerTurn ?? DEFAULT_FALLBACK_MAX_ATTEMPTS_PER_TURN,
|
|
876
|
+
minTextLength: options.sttFallback.minTextLength ?? DEFAULT_FALLBACK_MIN_TEXT_LENGTH,
|
|
877
|
+
replayWindowMs: options.sttFallback.replayWindowMs ?? DEFAULT_FALLBACK_REPLAY_MS,
|
|
878
|
+
settleMs: options.sttFallback.settleMs ?? DEFAULT_FALLBACK_SETTLE_MS,
|
|
879
|
+
trigger: options.sttFallback.trigger ?? "empty-or-low-confidence"
|
|
880
|
+
} : undefined;
|
|
881
|
+
const phraseHints = options.phraseHints ?? [];
|
|
528
882
|
let socket = options.socket;
|
|
529
883
|
let sttSession = null;
|
|
530
884
|
let silenceTimer = null;
|
|
531
885
|
let speechDetected = false;
|
|
886
|
+
let operationQueue = Promise.resolve();
|
|
887
|
+
let adapterGenerationCounter = 0;
|
|
888
|
+
let activeAdapterGeneration = 0;
|
|
889
|
+
const currentTurnAudio = [];
|
|
890
|
+
let fallbackAttemptsForCurrentTurn = 0;
|
|
891
|
+
const pruneTurnAudio = () => {
|
|
892
|
+
const replayWindowMs = sttFallback?.replayWindowMs ?? DEFAULT_FALLBACK_REPLAY_MS;
|
|
893
|
+
const cutoffAt = Date.now() - replayWindowMs;
|
|
894
|
+
let index = 0;
|
|
895
|
+
while (index < currentTurnAudio.length && currentTurnAudio[index].recordedAt < cutoffAt) {
|
|
896
|
+
index += 1;
|
|
897
|
+
}
|
|
898
|
+
if (index > 0) {
|
|
899
|
+
currentTurnAudio.splice(0, index);
|
|
900
|
+
}
|
|
901
|
+
};
|
|
902
|
+
const pushTurnAudio = (audio) => {
|
|
903
|
+
const chunk = audio instanceof ArrayBuffer ? new Uint8Array(audio.slice(0)) : new Uint8Array(audio.buffer.slice(audio.byteOffset, audio.byteOffset + audio.byteLength));
|
|
904
|
+
currentTurnAudio.push({
|
|
905
|
+
chunk,
|
|
906
|
+
recordedAt: Date.now()
|
|
907
|
+
});
|
|
908
|
+
pruneTurnAudio();
|
|
909
|
+
};
|
|
910
|
+
const getFallbackWindowAudio = () => {
|
|
911
|
+
if (!sttFallback?.adapter) {
|
|
912
|
+
return [];
|
|
913
|
+
}
|
|
914
|
+
pruneTurnAudio();
|
|
915
|
+
return currentTurnAudio.map((audio) => audio.chunk);
|
|
916
|
+
};
|
|
532
917
|
const clearSilenceTimer = () => {
|
|
533
918
|
if (!silenceTimer) {
|
|
534
919
|
return;
|
|
@@ -554,12 +939,28 @@ var createVoiceSession = (options) => {
|
|
|
554
939
|
await options.store.set(options.id, session);
|
|
555
940
|
return session;
|
|
556
941
|
};
|
|
942
|
+
const runSerial = (phase, operation) => {
|
|
943
|
+
const result = operationQueue.then(async () => {
|
|
944
|
+
logger.debug("voice session operation", {
|
|
945
|
+
phase,
|
|
946
|
+
sessionId: options.id
|
|
947
|
+
});
|
|
948
|
+
return await operation();
|
|
949
|
+
});
|
|
950
|
+
operationQueue = result.then(() => {
|
|
951
|
+
return;
|
|
952
|
+
}, () => {
|
|
953
|
+
return;
|
|
954
|
+
});
|
|
955
|
+
return result;
|
|
956
|
+
};
|
|
557
957
|
const closeAdapter = async (reason) => {
|
|
558
958
|
if (!sttSession) {
|
|
559
959
|
return;
|
|
560
960
|
}
|
|
561
961
|
const activeSession = sttSession;
|
|
562
962
|
sttSession = null;
|
|
963
|
+
activeAdapterGeneration = 0;
|
|
563
964
|
try {
|
|
564
965
|
await activeSession.close(reason);
|
|
565
966
|
} catch (error) {
|
|
@@ -569,13 +970,87 @@ var createVoiceSession = (options) => {
|
|
|
569
970
|
});
|
|
570
971
|
}
|
|
571
972
|
};
|
|
572
|
-
const
|
|
573
|
-
if (silenceTimer) {
|
|
973
|
+
const scheduleTurnCommit = (delayMs, reason, reset = true) => {
|
|
974
|
+
if (!reset && silenceTimer) {
|
|
574
975
|
return;
|
|
575
976
|
}
|
|
977
|
+
if (reset) {
|
|
978
|
+
clearSilenceTimer();
|
|
979
|
+
}
|
|
576
980
|
silenceTimer = setTimeout(() => {
|
|
577
|
-
|
|
578
|
-
|
|
981
|
+
silenceTimer = null;
|
|
982
|
+
api.commitTurn(reason);
|
|
983
|
+
}, delayMs);
|
|
984
|
+
};
|
|
985
|
+
const scheduleSilenceCommit = (delayMs = turnDetection.silenceMs, reset = true) => scheduleTurnCommit(delayMs, "silence", reset);
|
|
986
|
+
const requestTurnCommit = async (reason) => {
|
|
987
|
+
const session = await readSession();
|
|
988
|
+
const text = buildTurnText(session.currentTurn.transcripts, session.currentTurn.partialText, {
|
|
989
|
+
partialEndedAtMs: session.currentTurn.partialEndedAt,
|
|
990
|
+
partialStartedAtMs: session.currentTurn.partialStartedAt
|
|
991
|
+
});
|
|
992
|
+
if (!text) {
|
|
993
|
+
return;
|
|
994
|
+
}
|
|
995
|
+
const transcriptStabilityAge = session.currentTurn.lastTranscriptAt !== undefined ? Date.now() - session.currentTurn.lastTranscriptAt : undefined;
|
|
996
|
+
if (reason !== "manual" && typeof transcriptStabilityAge === "number" && transcriptStabilityAge < turnDetection.transcriptStabilityMs) {
|
|
997
|
+
scheduleTurnCommit(turnDetection.transcriptStabilityMs - transcriptStabilityAge, reason);
|
|
998
|
+
return;
|
|
999
|
+
}
|
|
1000
|
+
await commitTurnInternal(reason);
|
|
1001
|
+
};
|
|
1002
|
+
const failInternal = async (error) => {
|
|
1003
|
+
clearSilenceTimer();
|
|
1004
|
+
const session = await writeSession((currentSession) => {
|
|
1005
|
+
currentSession.lastActivityAt = Date.now();
|
|
1006
|
+
currentSession.status = "failed";
|
|
1007
|
+
});
|
|
1008
|
+
const resolvedError = toError(error);
|
|
1009
|
+
await send({
|
|
1010
|
+
message: resolvedError.message,
|
|
1011
|
+
recoverable: false,
|
|
1012
|
+
type: "error"
|
|
1013
|
+
});
|
|
1014
|
+
await closeAdapter("failed");
|
|
1015
|
+
speechDetected = false;
|
|
1016
|
+
rewindFallbackTurnAudio();
|
|
1017
|
+
await options.route.onError?.({
|
|
1018
|
+
api,
|
|
1019
|
+
context: options.context,
|
|
1020
|
+
error: resolvedError,
|
|
1021
|
+
session,
|
|
1022
|
+
sessionId: options.id
|
|
1023
|
+
});
|
|
1024
|
+
};
|
|
1025
|
+
const completeInternal = async (result) => {
|
|
1026
|
+
clearSilenceTimer();
|
|
1027
|
+
const session = await writeSession((currentSession) => {
|
|
1028
|
+
if (currentSession.status === "completed") {
|
|
1029
|
+
return;
|
|
1030
|
+
}
|
|
1031
|
+
currentSession.lastActivityAt = Date.now();
|
|
1032
|
+
currentSession.status = "completed";
|
|
1033
|
+
if (result !== undefined && currentSession.turns.length > 0) {
|
|
1034
|
+
const lastTurn = currentSession.turns.at(-1);
|
|
1035
|
+
if (lastTurn) {
|
|
1036
|
+
setTurnResult(currentSession, lastTurn.id, {
|
|
1037
|
+
result
|
|
1038
|
+
});
|
|
1039
|
+
}
|
|
1040
|
+
}
|
|
1041
|
+
});
|
|
1042
|
+
await send({
|
|
1043
|
+
sessionId: options.id,
|
|
1044
|
+
type: "complete"
|
|
1045
|
+
});
|
|
1046
|
+
await closeAdapter("complete");
|
|
1047
|
+
speechDetected = false;
|
|
1048
|
+
rewindFallbackTurnAudio();
|
|
1049
|
+
await options.route.onComplete({
|
|
1050
|
+
api,
|
|
1051
|
+
context: options.context,
|
|
1052
|
+
session
|
|
1053
|
+
});
|
|
579
1054
|
};
|
|
580
1055
|
const handleError = async (event) => {
|
|
581
1056
|
await send({
|
|
@@ -584,18 +1059,273 @@ var createVoiceSession = (options) => {
|
|
|
584
1059
|
type: "error"
|
|
585
1060
|
});
|
|
586
1061
|
if (!event.recoverable) {
|
|
587
|
-
await
|
|
1062
|
+
await failInternal(event.error);
|
|
588
1063
|
}
|
|
589
1064
|
};
|
|
590
1065
|
const handleClose = async (event) => {
|
|
591
1066
|
if (event.recoverable === false) {
|
|
592
|
-
await
|
|
1067
|
+
await failInternal(new Error(event.reason ?? "Speech-to-text session closed"));
|
|
1068
|
+
return;
|
|
1069
|
+
}
|
|
1070
|
+
if (!event.reason) {
|
|
1071
|
+
await closeAdapter("provider stream closed");
|
|
1072
|
+
return;
|
|
1073
|
+
}
|
|
1074
|
+
await closeAdapter(event.reason);
|
|
1075
|
+
};
|
|
1076
|
+
const rewindFallbackTurnAudio = () => {
|
|
1077
|
+
fallbackAttemptsForCurrentTurn = 0;
|
|
1078
|
+
currentTurnAudio.length = 0;
|
|
1079
|
+
};
|
|
1080
|
+
const runFallbackTranscription = async (primaryText, primaryTranscripts) => {
|
|
1081
|
+
if (!sttFallback?.adapter || fallbackAttemptsForCurrentTurn >= sttFallback.maxAttemptsPerTurn) {
|
|
1082
|
+
return null;
|
|
1083
|
+
}
|
|
1084
|
+
const candidate = {
|
|
1085
|
+
text: primaryText,
|
|
1086
|
+
transcripts: primaryTranscripts
|
|
1087
|
+
};
|
|
1088
|
+
if (!isFallbackNeeded(candidate, sttFallback)) {
|
|
1089
|
+
return null;
|
|
1090
|
+
}
|
|
1091
|
+
fallbackAttemptsForCurrentTurn += 1;
|
|
1092
|
+
const replayAudio = getFallbackWindowAudio();
|
|
1093
|
+
if (replayAudio.length === 0) {
|
|
1094
|
+
return null;
|
|
1095
|
+
}
|
|
1096
|
+
let fallbackSession = null;
|
|
1097
|
+
const fallbackTranscripts = [];
|
|
1098
|
+
let fallbackClosed = false;
|
|
1099
|
+
let fallbackEndOfTurnReceived = false;
|
|
1100
|
+
let fallbackFinalReceived = false;
|
|
1101
|
+
let lastFallbackTranscriptAt = 0;
|
|
1102
|
+
try {
|
|
1103
|
+
fallbackSession = await sttFallback.adapter.open({
|
|
1104
|
+
format: DEFAULT_FORMAT,
|
|
1105
|
+
phraseHints,
|
|
1106
|
+
sessionId: `${options.id}:fallback:${fallbackAttemptsForCurrentTurn}`
|
|
1107
|
+
});
|
|
1108
|
+
} catch (error) {
|
|
1109
|
+
logger.warn("voice stt fallback open failed", {
|
|
1110
|
+
error: toError(error).message,
|
|
1111
|
+
sessionId: options.id
|
|
1112
|
+
});
|
|
1113
|
+
return null;
|
|
1114
|
+
}
|
|
1115
|
+
const unsubscribers = [
|
|
1116
|
+
fallbackSession.on("final", ({ transcript }) => {
|
|
1117
|
+
fallbackFinalReceived = true;
|
|
1118
|
+
lastFallbackTranscriptAt = Date.now();
|
|
1119
|
+
fallbackTranscripts.push(cloneTranscript(transcript));
|
|
1120
|
+
}),
|
|
1121
|
+
fallbackSession.on("partial", ({ transcript }) => {
|
|
1122
|
+
lastFallbackTranscriptAt = Date.now();
|
|
1123
|
+
fallbackTranscripts.push(cloneTranscript(transcript));
|
|
1124
|
+
}),
|
|
1125
|
+
fallbackSession.on("endOfTurn", () => {
|
|
1126
|
+
fallbackEndOfTurnReceived = true;
|
|
1127
|
+
}),
|
|
1128
|
+
fallbackSession.on("error", (event) => {
|
|
1129
|
+
logger.warn("voice stt fallback error", {
|
|
1130
|
+
error: toError(event.error).message,
|
|
1131
|
+
sessionId: options.id
|
|
1132
|
+
});
|
|
1133
|
+
}),
|
|
1134
|
+
fallbackSession.on("close", () => {
|
|
1135
|
+
fallbackClosed = true;
|
|
1136
|
+
})
|
|
1137
|
+
];
|
|
1138
|
+
const closeFallback = async (reason) => {
|
|
1139
|
+
if (!fallbackSession) {
|
|
1140
|
+
return;
|
|
1141
|
+
}
|
|
1142
|
+
try {
|
|
1143
|
+
await fallbackSession.close(reason);
|
|
1144
|
+
} catch (error) {
|
|
1145
|
+
logger.warn("voice stt fallback close failed", {
|
|
1146
|
+
error: toError(error).message,
|
|
1147
|
+
sessionId: options.id
|
|
1148
|
+
});
|
|
1149
|
+
} finally {
|
|
1150
|
+
fallbackSession = null;
|
|
1151
|
+
}
|
|
1152
|
+
};
|
|
1153
|
+
try {
|
|
1154
|
+
for (const chunk of replayAudio) {
|
|
1155
|
+
await fallbackSession.send(chunk);
|
|
1156
|
+
}
|
|
1157
|
+
const replayDurationMs = getBufferedAudioDurationMs(replayAudio);
|
|
1158
|
+
const completionTimeoutMs = Math.max(sttFallback.completionTimeoutMs, Math.min(4000, Math.max(sttFallback.settleMs * 4, Math.round(replayDurationMs * 0.18))));
|
|
1159
|
+
const waitStartedAt = Date.now();
|
|
1160
|
+
while (Date.now() - waitStartedAt < completionTimeoutMs) {
|
|
1161
|
+
const idleMs = lastFallbackTranscriptAt > 0 ? Date.now() - lastFallbackTranscriptAt : Date.now() - waitStartedAt;
|
|
1162
|
+
if (fallbackEndOfTurnReceived && idleMs >= sttFallback.settleMs) {
|
|
1163
|
+
break;
|
|
1164
|
+
}
|
|
1165
|
+
if (fallbackFinalReceived && idleMs >= sttFallback.settleMs) {
|
|
1166
|
+
break;
|
|
1167
|
+
}
|
|
1168
|
+
if (fallbackClosed && (lastFallbackTranscriptAt === 0 || idleMs >= sttFallback.settleMs)) {
|
|
1169
|
+
break;
|
|
1170
|
+
}
|
|
1171
|
+
await Bun.sleep(Math.min(75, Math.max(25, sttFallback.settleMs / 2)));
|
|
1172
|
+
}
|
|
1173
|
+
} catch (error) {
|
|
1174
|
+
logger.warn("voice stt fallback failed", {
|
|
1175
|
+
error: toError(error).message,
|
|
1176
|
+
sessionId: options.id
|
|
1177
|
+
});
|
|
1178
|
+
} finally {
|
|
1179
|
+
await closeFallback("fallback-complete");
|
|
1180
|
+
for (const unsubscribe of unsubscribers) {
|
|
1181
|
+
unsubscribe();
|
|
1182
|
+
}
|
|
1183
|
+
}
|
|
1184
|
+
if (fallbackTranscripts.length === 0) {
|
|
1185
|
+
return null;
|
|
1186
|
+
}
|
|
1187
|
+
const fallbackText = buildTurnText(fallbackTranscripts, "", {});
|
|
1188
|
+
const fallbackConfidence = calculateMeanConfidence(fallbackTranscripts);
|
|
1189
|
+
const fallbackCandidate = {
|
|
1190
|
+
confidence: fallbackConfidence,
|
|
1191
|
+
text: fallbackText,
|
|
1192
|
+
wordCount: countWords2(normalizeText2(fallbackText))
|
|
1193
|
+
};
|
|
1194
|
+
const primaryCandidate = {
|
|
1195
|
+
confidence: calculateMeanConfidence(primaryTranscripts),
|
|
1196
|
+
text: primaryText,
|
|
1197
|
+
wordCount: countWords2(normalizeText2(primaryText))
|
|
1198
|
+
};
|
|
1199
|
+
const selection = selectBetterTurnText(primaryCandidate, fallbackCandidate);
|
|
1200
|
+
const diagnostics = {
|
|
1201
|
+
attempted: true,
|
|
1202
|
+
fallbackConfidence: fallbackCandidate.confidence,
|
|
1203
|
+
fallbackText: fallbackCandidate.text,
|
|
1204
|
+
fallbackWordCount: fallbackCandidate.wordCount,
|
|
1205
|
+
primaryConfidence: primaryCandidate.confidence,
|
|
1206
|
+
primaryText,
|
|
1207
|
+
primaryWordCount: primaryCandidate.wordCount,
|
|
1208
|
+
selected: selection.winner.text === fallbackCandidate.text,
|
|
1209
|
+
selectionReason: selection.reason,
|
|
1210
|
+
trigger: sttFallback.trigger
|
|
1211
|
+
};
|
|
1212
|
+
if (selection.winner.text === primaryCandidate.text) {
|
|
1213
|
+
return {
|
|
1214
|
+
diagnostics,
|
|
1215
|
+
fallbackUsed: false,
|
|
1216
|
+
source: "primary",
|
|
1217
|
+
text: primaryText,
|
|
1218
|
+
transcripts: primaryTranscripts.map((transcript) => ({
|
|
1219
|
+
...transcript,
|
|
1220
|
+
isFinal: true
|
|
1221
|
+
}))
|
|
1222
|
+
};
|
|
593
1223
|
}
|
|
1224
|
+
const candidateTranscripts = fallbackText === fallbackCandidate.text ? fallbackTranscripts : [];
|
|
1225
|
+
return {
|
|
1226
|
+
diagnostics,
|
|
1227
|
+
fallbackUsed: true,
|
|
1228
|
+
source: "fallback",
|
|
1229
|
+
text: selection.winner.text,
|
|
1230
|
+
transcripts: candidateTranscripts.length > 0 ? candidateTranscripts.map((transcript) => ({
|
|
1231
|
+
...transcript,
|
|
1232
|
+
isFinal: true
|
|
1233
|
+
})) : [{ id: createId(), isFinal: false, text: selection.winner.text }]
|
|
1234
|
+
};
|
|
1235
|
+
};
|
|
1236
|
+
const getFinalTranscriptIds = (transcripts) => {
|
|
1237
|
+
const finalTranscriptIds = transcripts.filter((transcript) => transcript.isFinal).map((transcript) => transcript.id);
|
|
1238
|
+
const fallbackIds = transcripts.map((transcript) => transcript.id);
|
|
1239
|
+
return finalTranscriptIds.length > 0 ? finalTranscriptIds : fallbackIds;
|
|
1240
|
+
};
|
|
1241
|
+
const runTurnCorrection = async (input) => {
|
|
1242
|
+
if (!options.route.correctTurn) {
|
|
1243
|
+
return;
|
|
1244
|
+
}
|
|
1245
|
+
const originalText = input.text;
|
|
1246
|
+
const result = await options.route.correctTurn({
|
|
1247
|
+
api,
|
|
1248
|
+
context: options.context,
|
|
1249
|
+
fallback: input.fallbackDiagnostics,
|
|
1250
|
+
phraseHints,
|
|
1251
|
+
session: input.session,
|
|
1252
|
+
text: originalText,
|
|
1253
|
+
transcripts: input.transcripts.map(cloneTranscript)
|
|
1254
|
+
});
|
|
1255
|
+
const nextText = typeof result === "string" ? result : typeof result?.text === "string" ? result.text : originalText;
|
|
1256
|
+
const correctedText = normalizeCorrectionText(nextText);
|
|
1257
|
+
const normalizedOriginal = normalizeCorrectionText(originalText);
|
|
1258
|
+
return {
|
|
1259
|
+
diagnostics: {
|
|
1260
|
+
attempted: true,
|
|
1261
|
+
changed: correctedText.length > 0 && correctedText !== normalizedOriginal,
|
|
1262
|
+
correctedText: correctedText.length > 0 ? correctedText : normalizedOriginal,
|
|
1263
|
+
metadata: typeof result === "object" ? result.metadata : undefined,
|
|
1264
|
+
originalText,
|
|
1265
|
+
provider: typeof result === "object" ? result.provider : undefined,
|
|
1266
|
+
reason: typeof result === "object" ? result.reason : undefined
|
|
1267
|
+
},
|
|
1268
|
+
text: correctedText.length > 0 ? correctedText : originalText
|
|
1269
|
+
};
|
|
1270
|
+
};
|
|
1271
|
+
const ensureCommittedTurnGuard = (session) => {
|
|
1272
|
+
if (!session.lastCommittedTurn) {
|
|
1273
|
+
session.lastCommittedTurn = {
|
|
1274
|
+
committedAt: 0,
|
|
1275
|
+
signature: "",
|
|
1276
|
+
text: "",
|
|
1277
|
+
transcriptIds: []
|
|
1278
|
+
};
|
|
1279
|
+
}
|
|
1280
|
+
return session;
|
|
1281
|
+
};
|
|
1282
|
+
const buildTurnSignature = (session, finalText, transcriptIdsOverride) => {
|
|
1283
|
+
const finalTranscriptIds = transcriptIdsOverride ?? getFinalTranscriptIds(session.currentTurn.transcripts);
|
|
1284
|
+
return `${normalizeText2(finalText)}|${finalTranscriptIds.join(",")}`;
|
|
1285
|
+
};
|
|
1286
|
+
const isDuplicateTurnCommit = (session, finalText) => {
|
|
1287
|
+
const signature = buildTurnSignature(session, finalText);
|
|
1288
|
+
const committedTurn = session.lastCommittedTurn;
|
|
1289
|
+
const isRecent = committedTurn && committedTurn.committedAt > 0 && Date.now() - committedTurn.committedAt < DEFAULT_DUPLICATE_TURN_WINDOW_MS;
|
|
1290
|
+
const committedSignature = committedTurn?.signature ?? "";
|
|
1291
|
+
const committedTranscriptIds = committedTurn?.transcriptIds ?? [];
|
|
1292
|
+
const committedText = normalizeText2(committedTurn?.text ?? "");
|
|
1293
|
+
const isSameText = normalizeText2(finalText) === committedText;
|
|
1294
|
+
const hasNoNewAudioSinceCommit = (session.currentTurn.lastAudioAt ?? 0) <= (committedTurn?.committedAt ?? 0);
|
|
1295
|
+
if (!isRecent) {
|
|
1296
|
+
return false;
|
|
1297
|
+
}
|
|
1298
|
+
if (isSameText && hasNoNewAudioSinceCommit) {
|
|
1299
|
+
return true;
|
|
1300
|
+
}
|
|
1301
|
+
if (signature !== committedSignature) {
|
|
1302
|
+
return false;
|
|
1303
|
+
}
|
|
1304
|
+
const lastSignatureIds = new Set(committedTranscriptIds);
|
|
1305
|
+
const hasNoNewFinalIds = session.currentTurn.transcripts.every((transcript) => !transcript.isFinal || lastSignatureIds.has(transcript.id));
|
|
1306
|
+
return isRecent && hasNoNewFinalIds;
|
|
1307
|
+
};
|
|
1308
|
+
const markTurnCommitted = (session, finalText, committedTranscripts) => {
|
|
1309
|
+
session.lastCommittedTurn = {
|
|
1310
|
+
...session.lastCommittedTurn ?? {},
|
|
1311
|
+
committedAt: Date.now(),
|
|
1312
|
+
signature: buildTurnSignature(session, finalText, getFinalTranscriptIds(committedTranscripts)),
|
|
1313
|
+
text: normalizeText2(finalText),
|
|
1314
|
+
transcriptIds: getFinalTranscriptIds(committedTranscripts)
|
|
1315
|
+
};
|
|
594
1316
|
};
|
|
595
1317
|
const handlePartial = async (transcript) => {
|
|
596
1318
|
await writeSession((session) => {
|
|
597
|
-
session.currentTurn.
|
|
598
|
-
|
|
1319
|
+
const nextPartialStartedAt = transcript.startedAtMs ?? session.currentTurn.partialStartedAt;
|
|
1320
|
+
const nextPartialEndedAt = transcript.endedAtMs ?? session.currentTurn.partialEndedAt;
|
|
1321
|
+
const preferredPartial = selectPreferredTranscriptText(session.currentTurn.partialText, transcript.text);
|
|
1322
|
+
session.currentTurn.lastTranscriptAt = Date.now();
|
|
1323
|
+
session.currentTurn.partialStartedAt = nextPartialStartedAt;
|
|
1324
|
+
session.currentTurn.partialEndedAt = nextPartialEndedAt;
|
|
1325
|
+
session.currentTurn.partialText = buildTurnText(session.currentTurn.transcripts, preferredPartial, {
|
|
1326
|
+
partialEndedAtMs: nextPartialEndedAt,
|
|
1327
|
+
partialStartedAtMs: nextPartialStartedAt
|
|
1328
|
+
});
|
|
599
1329
|
session.lastActivityAt = Date.now();
|
|
600
1330
|
session.status = "active";
|
|
601
1331
|
});
|
|
@@ -617,8 +1347,11 @@ var createVoiceSession = (options) => {
|
|
|
617
1347
|
cloneTranscript(transcript)
|
|
618
1348
|
];
|
|
619
1349
|
}
|
|
620
|
-
session.currentTurn.finalText = buildTurnText(session.currentTurn.transcripts, session.currentTurn.partialText
|
|
621
|
-
|
|
1350
|
+
session.currentTurn.finalText = buildTurnText(session.currentTurn.transcripts, session.currentTurn.partialText, {
|
|
1351
|
+
partialEndedAtMs: session.currentTurn.partialEndedAt,
|
|
1352
|
+
partialStartedAtMs: session.currentTurn.partialStartedAt
|
|
1353
|
+
});
|
|
1354
|
+
session.currentTurn.lastTranscriptAt = Date.now();
|
|
622
1355
|
session.lastActivityAt = Date.now();
|
|
623
1356
|
session.status = "active";
|
|
624
1357
|
});
|
|
@@ -627,36 +1360,60 @@ var createVoiceSession = (options) => {
|
|
|
627
1360
|
type: "final"
|
|
628
1361
|
});
|
|
629
1362
|
};
|
|
1363
|
+
const resumePendingTurnCommit = (session) => {
|
|
1364
|
+
const pendingText = buildTurnText(session.currentTurn.transcripts, session.currentTurn.partialText, {
|
|
1365
|
+
partialEndedAtMs: session.currentTurn.partialEndedAt,
|
|
1366
|
+
partialStartedAtMs: session.currentTurn.partialStartedAt
|
|
1367
|
+
});
|
|
1368
|
+
if (!pendingText) {
|
|
1369
|
+
speechDetected = false;
|
|
1370
|
+
return;
|
|
1371
|
+
}
|
|
1372
|
+
speechDetected = true;
|
|
1373
|
+
const audioAge = session.currentTurn.silenceStartedAt !== undefined ? Date.now() - session.currentTurn.silenceStartedAt : session.currentTurn.lastSpeechAt !== undefined ? Date.now() - session.currentTurn.lastSpeechAt : 0;
|
|
1374
|
+
const transcriptAge = session.currentTurn.lastTranscriptAt !== undefined ? Date.now() - session.currentTurn.lastTranscriptAt : turnDetection.transcriptStabilityMs;
|
|
1375
|
+
const delayMs = Math.max(0, turnDetection.silenceMs - audioAge, turnDetection.transcriptStabilityMs - transcriptAge);
|
|
1376
|
+
scheduleSilenceCommit(delayMs);
|
|
1377
|
+
};
|
|
630
1378
|
const ensureAdapter = async () => {
|
|
631
1379
|
if (sttSession) {
|
|
632
1380
|
return sttSession;
|
|
633
1381
|
}
|
|
634
|
-
|
|
635
|
-
format:
|
|
636
|
-
|
|
637
|
-
container: "raw",
|
|
638
|
-
encoding: "pcm_s16le",
|
|
639
|
-
sampleRateHz: 16000
|
|
640
|
-
},
|
|
1382
|
+
const openedSession = await options.stt.open({
|
|
1383
|
+
format: DEFAULT_FORMAT,
|
|
1384
|
+
phraseHints,
|
|
641
1385
|
sessionId: options.id
|
|
642
1386
|
});
|
|
643
|
-
|
|
644
|
-
|
|
1387
|
+
const generation = ++adapterGenerationCounter;
|
|
1388
|
+
sttSession = openedSession;
|
|
1389
|
+
activeAdapterGeneration = generation;
|
|
1390
|
+
const runAdapterEvent = (phase, handler) => {
|
|
1391
|
+
runSerial(phase, async () => {
|
|
1392
|
+
if (activeAdapterGeneration !== generation) {
|
|
1393
|
+
return;
|
|
1394
|
+
}
|
|
1395
|
+
await handler();
|
|
1396
|
+
});
|
|
1397
|
+
};
|
|
1398
|
+
openedSession.on("partial", ({ transcript }) => {
|
|
1399
|
+
runAdapterEvent("adapter.partial", () => handlePartial(transcript));
|
|
645
1400
|
});
|
|
646
|
-
|
|
647
|
-
handleFinal(transcript);
|
|
1401
|
+
openedSession.on("final", ({ transcript }) => {
|
|
1402
|
+
runAdapterEvent("adapter.final", () => handleFinal(transcript));
|
|
648
1403
|
});
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
1404
|
+
openedSession.on("endOfTurn", ({ reason }) => {
|
|
1405
|
+
runAdapterEvent("adapter.endOfTurn", async () => {
|
|
1406
|
+
clearSilenceTimer();
|
|
1407
|
+
await requestTurnCommit(reason);
|
|
1408
|
+
});
|
|
652
1409
|
});
|
|
653
|
-
|
|
654
|
-
handleError(event);
|
|
1410
|
+
openedSession.on("error", (event) => {
|
|
1411
|
+
runAdapterEvent("adapter.error", () => handleError(event));
|
|
655
1412
|
});
|
|
656
|
-
|
|
657
|
-
handleClose(event);
|
|
1413
|
+
openedSession.on("close", (event) => {
|
|
1414
|
+
runAdapterEvent("adapter.close", () => handleClose(event));
|
|
658
1415
|
});
|
|
659
|
-
return
|
|
1416
|
+
return openedSession;
|
|
660
1417
|
};
|
|
661
1418
|
const completeTurn = async (session, turn) => {
|
|
662
1419
|
const output = await options.route.onTurn({
|
|
@@ -685,206 +1442,312 @@ var createVoiceSession = (options) => {
|
|
|
685
1442
|
});
|
|
686
1443
|
}
|
|
687
1444
|
if (output?.complete) {
|
|
688
|
-
await
|
|
1445
|
+
await completeInternal(output.result);
|
|
689
1446
|
}
|
|
690
1447
|
};
|
|
691
|
-
const
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
|
|
1448
|
+
const commitTurnInternal = async (reason = "manual") => {
|
|
1449
|
+
clearSilenceTimer();
|
|
1450
|
+
const session = await readSession();
|
|
1451
|
+
if (session.status === "completed" || session.status === "failed") {
|
|
1452
|
+
return;
|
|
1453
|
+
}
|
|
1454
|
+
const text = buildTurnText(session.currentTurn.transcripts, session.currentTurn.partialText, {
|
|
1455
|
+
partialEndedAtMs: session.currentTurn.partialEndedAt,
|
|
1456
|
+
partialStartedAtMs: session.currentTurn.partialStartedAt
|
|
1457
|
+
});
|
|
1458
|
+
let transcripts = session.currentTurn.transcripts.length ? session.currentTurn.transcripts.map(cloneTranscript) : [];
|
|
1459
|
+
let finalText = text;
|
|
1460
|
+
const transcriptStabilityAge = session.currentTurn.lastTranscriptAt !== undefined ? Date.now() - session.currentTurn.lastTranscriptAt : undefined;
|
|
1461
|
+
const fallbackSelection = await runFallbackTranscription(text, session.currentTurn.transcripts);
|
|
1462
|
+
const source = fallbackSelection?.source ?? "primary";
|
|
1463
|
+
const fallbackUsed = fallbackSelection?.fallbackUsed ?? false;
|
|
1464
|
+
const fallbackDiagnostics = fallbackSelection?.diagnostics;
|
|
1465
|
+
if (fallbackSelection) {
|
|
1466
|
+
finalText = fallbackSelection.text;
|
|
1467
|
+
transcripts = fallbackSelection.transcripts.length ? fallbackSelection.transcripts.map(cloneTranscript) : transcripts.length ? transcripts : [
|
|
1468
|
+
{
|
|
1469
|
+
id: createId(),
|
|
1470
|
+
isFinal: false,
|
|
1471
|
+
text: finalText
|
|
1472
|
+
}
|
|
1473
|
+
];
|
|
1474
|
+
if (fallbackSelection.fallbackUsed) {
|
|
1475
|
+
logger.info("voice fallback turn selected", {
|
|
1476
|
+
reason,
|
|
1477
|
+
sessionId: options.id,
|
|
1478
|
+
text: finalText
|
|
1479
|
+
});
|
|
707
1480
|
}
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
currentSession.lastActivityAt = Date.now();
|
|
727
|
-
currentSession.status = "active";
|
|
728
|
-
currentSession.turns = [...currentSession.turns, turn];
|
|
729
|
-
});
|
|
730
|
-
speechDetected = false;
|
|
731
|
-
logger.info("voice turn committed", {
|
|
1481
|
+
}
|
|
1482
|
+
const correctionSelection = await runTurnCorrection({
|
|
1483
|
+
fallbackDiagnostics,
|
|
1484
|
+
fallbackUsed,
|
|
1485
|
+
session,
|
|
1486
|
+
source,
|
|
1487
|
+
text: finalText,
|
|
1488
|
+
transcripts
|
|
1489
|
+
});
|
|
1490
|
+
const correctionDiagnostics = correctionSelection?.diagnostics;
|
|
1491
|
+
if (correctionSelection) {
|
|
1492
|
+
finalText = correctionSelection.text;
|
|
1493
|
+
}
|
|
1494
|
+
if (!finalText) {
|
|
1495
|
+
return;
|
|
1496
|
+
}
|
|
1497
|
+
if (isDuplicateTurnCommit(session, finalText)) {
|
|
1498
|
+
logger.debug("voice turn commit deduped", {
|
|
732
1499
|
reason,
|
|
733
|
-
sessionId: options.id
|
|
734
|
-
turnId: turn.id
|
|
735
|
-
});
|
|
736
|
-
await send({
|
|
737
|
-
turn,
|
|
738
|
-
type: "turn"
|
|
1500
|
+
sessionId: options.id
|
|
739
1501
|
});
|
|
740
|
-
|
|
741
|
-
}
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
|
|
752
|
-
|
|
753
|
-
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
}
|
|
1502
|
+
return;
|
|
1503
|
+
}
|
|
1504
|
+
if (typeof transcriptStabilityAge === "number" && transcriptStabilityAge < turnDetection.transcriptStabilityMs && reason !== "manual") {
|
|
1505
|
+
scheduleTurnCommit(turnDetection.transcriptStabilityMs - transcriptStabilityAge, reason, false);
|
|
1506
|
+
return;
|
|
1507
|
+
}
|
|
1508
|
+
const turn = {
|
|
1509
|
+
committedAt: Date.now(),
|
|
1510
|
+
id: createId(),
|
|
1511
|
+
text: finalText,
|
|
1512
|
+
quality: createTurnQuality(transcripts, source, fallbackUsed, fallbackDiagnostics, correctionDiagnostics),
|
|
1513
|
+
transcripts: transcripts.length > 0 ? transcripts : [
|
|
1514
|
+
{
|
|
1515
|
+
id: createId(),
|
|
1516
|
+
isFinal: false,
|
|
1517
|
+
text: finalText
|
|
757
1518
|
}
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
1519
|
+
]
|
|
1520
|
+
};
|
|
1521
|
+
const updatedSession = await writeSession((currentSession) => {
|
|
1522
|
+
currentSession.committedTurnIds = [
|
|
1523
|
+
...currentSession.committedTurnIds,
|
|
1524
|
+
turn.id
|
|
1525
|
+
];
|
|
1526
|
+
currentSession.currentTurn = createEmptyCurrentTurn();
|
|
1527
|
+
currentSession.lastActivityAt = Date.now();
|
|
1528
|
+
currentSession.status = "active";
|
|
1529
|
+
currentSession.turns = [...currentSession.turns, turn];
|
|
1530
|
+
markTurnCommitted(currentSession, finalText, transcripts);
|
|
1531
|
+
});
|
|
1532
|
+
speechDetected = false;
|
|
1533
|
+
rewindFallbackTurnAudio();
|
|
1534
|
+
logger.info("voice turn committed", {
|
|
1535
|
+
reason,
|
|
1536
|
+
sessionId: options.id,
|
|
1537
|
+
turnId: turn.id
|
|
1538
|
+
});
|
|
1539
|
+
await send({
|
|
1540
|
+
turn,
|
|
1541
|
+
type: "turn"
|
|
1542
|
+
});
|
|
1543
|
+
if (options.sttLifecycle === "turn-scoped") {
|
|
1544
|
+
await closeAdapter("turn-commit");
|
|
1545
|
+
}
|
|
1546
|
+
await completeTurn(updatedSession, turn);
|
|
1547
|
+
};
|
|
1548
|
+
const connectInternal = async (nextSocket) => {
|
|
1549
|
+
socket = nextSocket;
|
|
1550
|
+
const existingSession = await options.store.get(options.id);
|
|
1551
|
+
let session = existingSession ?? createVoiceSessionRecord(options.id, options.scenarioId);
|
|
1552
|
+
if (options.scenarioId && session.scenarioId !== options.scenarioId) {
|
|
1553
|
+
session.scenarioId = options.scenarioId;
|
|
1554
|
+
}
|
|
1555
|
+
ensureCommittedTurnGuard(session);
|
|
1556
|
+
let shouldFireOnSession = !existingSession;
|
|
1557
|
+
if (existingSession?.scenarioId && options.scenarioId && existingSession.scenarioId !== options.scenarioId) {
|
|
1558
|
+
session = resetVoiceSessionRecord(options.id, existingSession, options.scenarioId);
|
|
1559
|
+
shouldFireOnSession = true;
|
|
1560
|
+
}
|
|
1561
|
+
rewindFallbackTurnAudio();
|
|
1562
|
+
if (existingSession?.status === "reconnecting") {
|
|
1563
|
+
const nextAttempts = existingSession.reconnect.attempts + 1;
|
|
1564
|
+
const reconnectExpired = existingSession.reconnect.lastDisconnectAt !== undefined && Date.now() - existingSession.reconnect.lastDisconnectAt > reconnect.timeout;
|
|
1565
|
+
const tooManyAttempts = nextAttempts > reconnect.maxAttempts;
|
|
1566
|
+
if (reconnect.strategy === "fail" && (reconnectExpired || tooManyAttempts)) {
|
|
1567
|
+
await failInternal(new Error("Voice session reconnect policy exhausted"));
|
|
1568
|
+
return;
|
|
1569
|
+
}
|
|
1570
|
+
if (reconnect.strategy === "restart" && (reconnectExpired || tooManyAttempts)) {
|
|
1571
|
+
session = resetVoiceSessionRecord(options.id, existingSession, options.scenarioId);
|
|
1572
|
+
shouldFireOnSession = true;
|
|
1573
|
+
} else {
|
|
1574
|
+
session = {
|
|
1575
|
+
...existingSession,
|
|
1576
|
+
reconnect: {
|
|
1577
|
+
...existingSession.reconnect,
|
|
1578
|
+
attempts: nextAttempts
|
|
1579
|
+
},
|
|
1580
|
+
status: "active"
|
|
1581
|
+
};
|
|
1582
|
+
}
|
|
1583
|
+
}
|
|
1584
|
+
await options.store.set(options.id, session);
|
|
1585
|
+
await send({
|
|
1586
|
+
sessionId: options.id,
|
|
1587
|
+
status: session.status,
|
|
1588
|
+
scenarioId: session.scenarioId,
|
|
1589
|
+
type: "session"
|
|
1590
|
+
});
|
|
1591
|
+
if (shouldFireOnSession) {
|
|
1592
|
+
await options.route.onSession?.({
|
|
766
1593
|
api,
|
|
767
1594
|
context: options.context,
|
|
768
1595
|
session
|
|
769
1596
|
});
|
|
770
|
-
}
|
|
771
|
-
|
|
772
|
-
socket = nextSocket;
|
|
773
|
-
const existingSession = await options.store.get(options.id);
|
|
774
|
-
let session = existingSession ?? createVoiceSessionRecord(options.id);
|
|
775
|
-
let shouldFireOnSession = !existingSession;
|
|
776
|
-
if (existingSession?.status === "reconnecting") {
|
|
777
|
-
const nextAttempts = existingSession.reconnect.attempts + 1;
|
|
778
|
-
const reconnectExpired = existingSession.reconnect.lastDisconnectAt !== undefined && Date.now() - existingSession.reconnect.lastDisconnectAt > reconnect.timeout;
|
|
779
|
-
const tooManyAttempts = nextAttempts > reconnect.maxAttempts;
|
|
780
|
-
if (reconnect.strategy === "fail" && (reconnectExpired || tooManyAttempts)) {
|
|
781
|
-
await api.fail(new Error("Voice session reconnect policy exhausted"));
|
|
782
|
-
return;
|
|
783
|
-
}
|
|
784
|
-
if (reconnect.strategy === "restart" && (reconnectExpired || tooManyAttempts)) {
|
|
785
|
-
session = resetVoiceSessionRecord(options.id, existingSession);
|
|
786
|
-
shouldFireOnSession = true;
|
|
787
|
-
} else {
|
|
788
|
-
session = {
|
|
789
|
-
...existingSession,
|
|
790
|
-
reconnect: {
|
|
791
|
-
...existingSession.reconnect,
|
|
792
|
-
attempts: nextAttempts
|
|
793
|
-
},
|
|
794
|
-
status: "active"
|
|
795
|
-
};
|
|
796
|
-
}
|
|
797
|
-
}
|
|
798
|
-
await options.store.set(options.id, session);
|
|
1597
|
+
}
|
|
1598
|
+
if (session.status === "completed") {
|
|
799
1599
|
await send({
|
|
800
1600
|
sessionId: options.id,
|
|
801
|
-
|
|
802
|
-
type: "session"
|
|
1601
|
+
type: "complete"
|
|
803
1602
|
});
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
|
|
809
|
-
|
|
810
|
-
|
|
811
|
-
|
|
812
|
-
|
|
813
|
-
|
|
814
|
-
|
|
815
|
-
|
|
1603
|
+
return;
|
|
1604
|
+
}
|
|
1605
|
+
resumePendingTurnCommit(session);
|
|
1606
|
+
await ensureAdapter();
|
|
1607
|
+
};
|
|
1608
|
+
const disconnectInternal = async (event) => {
|
|
1609
|
+
clearSilenceTimer();
|
|
1610
|
+
await closeAdapter(event?.reason);
|
|
1611
|
+
rewindFallbackTurnAudio();
|
|
1612
|
+
if (reconnect.strategy === "fail") {
|
|
1613
|
+
await failInternal(new Error(event?.reason ?? "Voice socket disconnected"));
|
|
1614
|
+
return;
|
|
1615
|
+
}
|
|
1616
|
+
await writeSession((session) => {
|
|
1617
|
+
if (session.status === "completed" || session.status === "failed") {
|
|
816
1618
|
return;
|
|
817
1619
|
}
|
|
818
|
-
|
|
819
|
-
|
|
820
|
-
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
|
|
825
|
-
|
|
1620
|
+
session.lastActivityAt = Date.now();
|
|
1621
|
+
session.reconnect.lastDisconnectAt = Date.now();
|
|
1622
|
+
session.status = "reconnecting";
|
|
1623
|
+
});
|
|
1624
|
+
speechDetected = false;
|
|
1625
|
+
};
|
|
1626
|
+
const receiveAudioInternal = async (audio) => {
|
|
1627
|
+
const session = await readSession();
|
|
1628
|
+
if (session.status === "completed" || session.status === "failed") {
|
|
1629
|
+
return;
|
|
1630
|
+
}
|
|
1631
|
+
const adapter = await ensureAdapter();
|
|
1632
|
+
const conditionedAudio = conditionAudioChunk(audio, options.audioConditioning);
|
|
1633
|
+
const audioLevel = measureAudioLevel(conditionedAudio);
|
|
1634
|
+
const shouldStoreAudio = speechDetected || audioLevel >= turnDetection.speechThreshold;
|
|
1635
|
+
await writeSession((currentSession) => {
|
|
1636
|
+
currentSession.currentTurn.lastAudioAt = Date.now();
|
|
1637
|
+
currentSession.lastActivityAt = Date.now();
|
|
1638
|
+
currentSession.status = "active";
|
|
1639
|
+
if (audioLevel >= turnDetection.speechThreshold) {
|
|
1640
|
+
currentSession.currentTurn.lastSpeechAt = Date.now();
|
|
1641
|
+
currentSession.currentTurn.silenceStartedAt = undefined;
|
|
1642
|
+
} else if (speechDetected && currentSession.currentTurn.silenceStartedAt === undefined) {
|
|
1643
|
+
currentSession.currentTurn.silenceStartedAt = Date.now();
|
|
826
1644
|
}
|
|
827
|
-
|
|
828
|
-
|
|
829
|
-
|
|
830
|
-
|
|
831
|
-
|
|
832
|
-
|
|
833
|
-
session.status = "reconnecting";
|
|
834
|
-
});
|
|
835
|
-
speechDetected = false;
|
|
836
|
-
},
|
|
837
|
-
fail: async (error) => {
|
|
1645
|
+
});
|
|
1646
|
+
if (shouldStoreAudio) {
|
|
1647
|
+
pushTurnAudio(conditionedAudio);
|
|
1648
|
+
}
|
|
1649
|
+
if (audioLevel >= turnDetection.speechThreshold) {
|
|
1650
|
+
speechDetected = true;
|
|
838
1651
|
clearSilenceTimer();
|
|
839
|
-
|
|
840
|
-
|
|
841
|
-
|
|
842
|
-
|
|
843
|
-
|
|
844
|
-
|
|
845
|
-
|
|
846
|
-
|
|
847
|
-
type: "error"
|
|
848
|
-
});
|
|
849
|
-
await closeAdapter("failed");
|
|
850
|
-
speechDetected = false;
|
|
851
|
-
await options.route.onError?.({
|
|
852
|
-
api,
|
|
853
|
-
context: options.context,
|
|
854
|
-
error: resolvedError,
|
|
855
|
-
session,
|
|
856
|
-
sessionId: options.id
|
|
857
|
-
});
|
|
858
|
-
},
|
|
859
|
-
receiveAudio: async (audio) => {
|
|
860
|
-
const session = await readSession();
|
|
861
|
-
if (session.status === "completed" || session.status === "failed") {
|
|
862
|
-
return;
|
|
1652
|
+
} else if (speechDetected) {
|
|
1653
|
+
const currentSession = await readSession();
|
|
1654
|
+
const hasTurnText = Boolean(buildTurnText(currentSession.currentTurn.transcripts, currentSession.currentTurn.partialText, {
|
|
1655
|
+
partialEndedAtMs: currentSession.currentTurn.partialEndedAt,
|
|
1656
|
+
partialStartedAtMs: currentSession.currentTurn.partialStartedAt
|
|
1657
|
+
}));
|
|
1658
|
+
if (hasTurnText) {
|
|
1659
|
+
scheduleSilenceCommit(turnDetection.silenceMs, false);
|
|
863
1660
|
}
|
|
864
|
-
|
|
865
|
-
|
|
866
|
-
|
|
867
|
-
|
|
868
|
-
|
|
869
|
-
|
|
870
|
-
|
|
871
|
-
if (audioLevel >= turnDetection.speechThreshold) {
|
|
872
|
-
speechDetected = true;
|
|
1661
|
+
}
|
|
1662
|
+
await adapter.send(conditionedAudio);
|
|
1663
|
+
};
|
|
1664
|
+
const api = {
|
|
1665
|
+
id: options.id,
|
|
1666
|
+
close: async (reason) => {
|
|
1667
|
+
await runSerial("api.close", async () => {
|
|
873
1668
|
clearSilenceTimer();
|
|
874
|
-
|
|
875
|
-
|
|
876
|
-
|
|
877
|
-
if (hasTurnText) {
|
|
878
|
-
scheduleSilenceCommit();
|
|
879
|
-
}
|
|
880
|
-
}
|
|
881
|
-
await adapter.send(audio);
|
|
1669
|
+
await closeAdapter(reason);
|
|
1670
|
+
await Promise.resolve(socket.close(1000, reason));
|
|
1671
|
+
});
|
|
882
1672
|
},
|
|
883
|
-
|
|
1673
|
+
commitTurn: async (reason = "manual") => runSerial("api.commitTurn", async () => {
|
|
1674
|
+
await commitTurnInternal(reason);
|
|
1675
|
+
}),
|
|
1676
|
+
complete: async (result) => runSerial("api.complete", async () => {
|
|
1677
|
+
await completeInternal(result);
|
|
1678
|
+
}),
|
|
1679
|
+
connect: async (nextSocket) => runSerial("api.connect", async () => {
|
|
1680
|
+
await connectInternal(nextSocket);
|
|
1681
|
+
}),
|
|
1682
|
+
disconnect: async (event) => runSerial("api.disconnect", async () => {
|
|
1683
|
+
await disconnectInternal(event);
|
|
1684
|
+
}),
|
|
1685
|
+
fail: async (error) => runSerial("api.fail", async () => {
|
|
1686
|
+
await failInternal(error);
|
|
1687
|
+
}),
|
|
1688
|
+
receiveAudio: async (audio) => runSerial("api.receiveAudio", async () => {
|
|
1689
|
+
await receiveAudioInternal(audio);
|
|
1690
|
+
}),
|
|
1691
|
+
snapshot: async () => runSerial("api.snapshot", async () => readSession())
|
|
884
1692
|
};
|
|
885
1693
|
return api;
|
|
886
1694
|
};
|
|
887
1695
|
|
|
1696
|
+
// src/turnProfiles.ts
|
|
1697
|
+
/**
 * Turn-detection presets keyed by turn profile name. Each preset supplies the
 * base `silenceMs`, `speechThreshold` and `transcriptStabilityMs` values.
 */
var TURN_PROFILE_DEFAULTS = {
  balanced: {
    qualityProfile: "general",
    silenceMs: 1400,
    speechThreshold: 0.012,
    transcriptStabilityMs: 1000
  },
  fast: {
    qualityProfile: "general",
    silenceMs: 700,
    speechThreshold: 0.015,
    transcriptStabilityMs: 450
  },
  "long-form": {
    qualityProfile: "general",
    silenceMs: 2200,
    speechThreshold: 0.01,
    transcriptStabilityMs: 1500
  }
};
/**
 * Per-quality-profile overrides. A value present here wins over the turn
 * profile preset; "general" overrides nothing.
 */
var QUALITY_PROFILE_DEFAULTS = {
  general: {},
  "accent-heavy": {
    silenceMs: 1200,
    speechThreshold: 0.01,
    transcriptStabilityMs: 1200
  },
  "noisy-room": {
    silenceMs: 2000,
    speechThreshold: 0.02,
    transcriptStabilityMs: 1600
  },
  "short-command": {
    silenceMs: 500,
    speechThreshold: 0.016,
    transcriptStabilityMs: 420
  }
};
var DEFAULT_TURN_PROFILE = "fast";
var DEFAULT_QUALITY_PROFILE = "general";
/**
 * Resolves a partial turn-detection config into a complete one.
 *
 * Precedence for each numeric field: explicit value in `config`, then the
 * quality profile override, then the turn profile preset.
 *
 * Profile names that are not own keys of the preset tables (typos, or
 * inherited names such as "toString") fall back to the default profile
 * instead of throwing an opaque TypeError or yielding undefined thresholds.
 * The returned `profile` / `qualityProfile` are the names actually applied.
 *
 * @param {object} [config] - Optional overrides: `profile`, `qualityProfile`,
 *   `silenceMs`, `speechThreshold`, `transcriptStabilityMs`.
 * @returns {{profile: string, qualityProfile: string, silenceMs: number,
 *   speechThreshold: number, transcriptStabilityMs: number}}
 */
var resolveTurnDetectionConfig = (config) => {
  const isOwnKey = (table, key) => Object.prototype.hasOwnProperty.call(table, key);
  const requestedProfile = config?.profile ?? DEFAULT_TURN_PROFILE;
  const requestedQuality = config?.qualityProfile ?? DEFAULT_QUALITY_PROFILE;
  const profile = isOwnKey(TURN_PROFILE_DEFAULTS, requestedProfile) ? requestedProfile : DEFAULT_TURN_PROFILE;
  const qualityProfile = isOwnKey(QUALITY_PROFILE_DEFAULTS, requestedQuality) ? requestedQuality : DEFAULT_QUALITY_PROFILE;
  const preset = TURN_PROFILE_DEFAULTS[profile];
  const quality = QUALITY_PROFILE_DEFAULTS[qualityProfile];
  return {
    profile,
    qualityProfile,
    silenceMs: config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs,
    speechThreshold: config?.speechThreshold ?? quality.speechThreshold ?? preset.speechThreshold,
    transcriptStabilityMs: config?.transcriptStabilityMs ?? quality.transcriptStabilityMs ?? preset.transcriptStabilityMs
  };
};
|
|
1750
|
+
|
|
888
1751
|
// src/testing/resilience.ts
|
|
889
1752
|
var roundMetric2 = (value, digits = 4) => {
|
|
890
1753
|
const factor = 10 ** digits;
|
|
@@ -951,10 +1814,12 @@ var runScenario = async (id, title, run) => {
|
|
|
951
1814
|
socket: createMockSocket(),
|
|
952
1815
|
store,
|
|
953
1816
|
stt: adapter.adapter,
|
|
954
|
-
|
|
1817
|
+
sttLifecycle: "continuous",
|
|
1818
|
+
turnDetection: resolveTurnDetectionConfig({
|
|
955
1819
|
silenceMs: 20,
|
|
956
|
-
speechThreshold: 0.01
|
|
957
|
-
|
|
1820
|
+
speechThreshold: 0.01,
|
|
1821
|
+
transcriptStabilityMs: 5
|
|
1822
|
+
})
|
|
958
1823
|
});
|
|
959
1824
|
await voice.connect(createMockSocket());
|
|
960
1825
|
try {
|
|
@@ -983,6 +1848,24 @@ var runScenario = async (id, title, run) => {
|
|
|
983
1848
|
type: "close"
|
|
984
1849
|
});
|
|
985
1850
|
},
|
|
1851
|
+
emitEndOfTurn: async () => {
|
|
1852
|
+
await adapter.session.emit("endOfTurn", {
|
|
1853
|
+
reason: "vendor",
|
|
1854
|
+
receivedAt: Date.now(),
|
|
1855
|
+
type: "endOfTurn"
|
|
1856
|
+
});
|
|
1857
|
+
},
|
|
1858
|
+
emitFinal: async (text, transcriptId = `${id}-${turns.length}`) => {
|
|
1859
|
+
await adapter.session.emit("final", {
|
|
1860
|
+
receivedAt: Date.now(),
|
|
1861
|
+
transcript: {
|
|
1862
|
+
id: transcriptId,
|
|
1863
|
+
isFinal: true,
|
|
1864
|
+
text
|
|
1865
|
+
},
|
|
1866
|
+
type: "final"
|
|
1867
|
+
});
|
|
1868
|
+
},
|
|
986
1869
|
turns
|
|
987
1870
|
});
|
|
988
1871
|
} finally {
|
|
@@ -1022,6 +1905,88 @@ var runVoiceResilienceBenchmark = async () => {
|
|
|
1022
1905
|
if (turns.length === 1) {
|
|
1023
1906
|
await commit("Fresh transcripts should still commit later");
|
|
1024
1907
|
}
|
|
1908
|
+
}),
|
|
1909
|
+
runScenario("duplicate-end-of-turn", "Repeated end-of-turn events for the same turn stay deduped", async ({ emitFinal, emitEndOfTurn, turns }) => {
|
|
1910
|
+
await emitFinal("Repeated end-of-turn should only commit once", "dup-endofturn");
|
|
1911
|
+
await emitEndOfTurn();
|
|
1912
|
+
await emitEndOfTurn();
|
|
1913
|
+
await Bun.sleep(80);
|
|
1914
|
+
if (turns.length !== 1) {
|
|
1915
|
+
throw new Error("Repeated end-of-turn events created duplicate turns");
|
|
1916
|
+
}
|
|
1917
|
+
}),
|
|
1918
|
+
runScenario("duplicate-end-of-turn-jitter", "End-of-turn jitter does not trigger extra commits", async ({ emitFinal, emitEndOfTurn, turns }) => {
|
|
1919
|
+
await emitFinal("Noisy end-of-turn signals should still commit once", "dup-endofturn-jitter");
|
|
1920
|
+
for (const delayMs of [40, 95, 180, 120]) {
|
|
1921
|
+
await Bun.sleep(delayMs);
|
|
1922
|
+
await emitEndOfTurn();
|
|
1923
|
+
}
|
|
1924
|
+
await Bun.sleep(80);
|
|
1925
|
+
if (turns.length !== 1) {
|
|
1926
|
+
throw new Error("Jittered end-of-turn signals created duplicate turns");
|
|
1927
|
+
}
|
|
1928
|
+
}),
|
|
1929
|
+
runScenario("reconnect-duplicate-text-no-new-audio", "Reconnect duplicate text with different ids and no audio does not replay turn", async ({
|
|
1930
|
+
adapter,
|
|
1931
|
+
connectNewSocket,
|
|
1932
|
+
disconnect,
|
|
1933
|
+
emitEndOfTurn,
|
|
1934
|
+
emitFinal,
|
|
1935
|
+
turns
|
|
1936
|
+
}) => {
|
|
1937
|
+
await emitFinal("Reconnect duplicate text should be suppressed", "dup-text-reconnect-1");
|
|
1938
|
+
await emitEndOfTurn();
|
|
1939
|
+
await Bun.sleep(60);
|
|
1940
|
+
await disconnect();
|
|
1941
|
+
await connectNewSocket();
|
|
1942
|
+
await adapter.session.emit("final", {
|
|
1943
|
+
receivedAt: Date.now(),
|
|
1944
|
+
transcript: {
|
|
1945
|
+
id: "dup-text-reconnect-2",
|
|
1946
|
+
isFinal: true,
|
|
1947
|
+
text: "Reconnect duplicate text should be suppressed"
|
|
1948
|
+
},
|
|
1949
|
+
type: "final"
|
|
1950
|
+
});
|
|
1951
|
+
for (const delayMs of [40, 70, 110]) {
|
|
1952
|
+
await Bun.sleep(delayMs);
|
|
1953
|
+
await emitEndOfTurn();
|
|
1954
|
+
}
|
|
1955
|
+
await Bun.sleep(60);
|
|
1956
|
+
if (turns.length !== 1) {
|
|
1957
|
+
throw new Error("Reconnect duplicate text was committed twice");
|
|
1958
|
+
}
|
|
1959
|
+
}),
|
|
1960
|
+
runScenario("reconnect-end-of-turn-jitter", "End-of-turn jitter after reconnect does not replay committed turns", async ({
|
|
1961
|
+
adapter,
|
|
1962
|
+
connectNewSocket,
|
|
1963
|
+
disconnect,
|
|
1964
|
+
emitEndOfTurn,
|
|
1965
|
+
emitFinal,
|
|
1966
|
+
turns
|
|
1967
|
+
}) => {
|
|
1968
|
+
await emitFinal("Reconnect duplicate end-of-turn should dedupe", "resume-jitter");
|
|
1969
|
+
await emitEndOfTurn();
|
|
1970
|
+
await Bun.sleep(60);
|
|
1971
|
+
await disconnect();
|
|
1972
|
+
await connectNewSocket();
|
|
1973
|
+
await adapter.session.emit("final", {
|
|
1974
|
+
receivedAt: Date.now(),
|
|
1975
|
+
transcript: {
|
|
1976
|
+
id: "resume-jitter",
|
|
1977
|
+
isFinal: true,
|
|
1978
|
+
text: "Reconnect duplicate end-of-turn should dedupe"
|
|
1979
|
+
},
|
|
1980
|
+
type: "final"
|
|
1981
|
+
});
|
|
1982
|
+
for (const delayMs of [50, 80, 120, 180]) {
|
|
1983
|
+
await Bun.sleep(delayMs);
|
|
1984
|
+
await emitEndOfTurn();
|
|
1985
|
+
}
|
|
1986
|
+
await Bun.sleep(80);
|
|
1987
|
+
if (turns.length !== 1) {
|
|
1988
|
+
throw new Error("Reconnected jittered end-of-turn signals replayed a committed turn");
|
|
1989
|
+
}
|
|
1025
1990
|
})
|
|
1026
1991
|
]);
|
|
1027
1992
|
const passCount = scenarios.filter((scenario) => scenario.passes).length;
|
|
@@ -1040,10 +2005,26 @@ var runVoiceResilienceBenchmark = async () => {
|
|
|
1040
2005
|
};
|
|
1041
2006
|
// src/testing/sessionBenchmark.ts
|
|
1042
2007
|
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {number[]} values - Numbers to average.
 * @returns {number} The mean, or 0 when the list is empty.
 */
var average2 = (values) => {
  if (!(values.length > 0)) {
    return 0;
  }
  let total = 0;
  values.forEach((entry) => {
    total = total + entry;
  });
  return total / values.length;
};
|
|
2008
|
+
/**
 * Canonicalizes turn text for comparison: lower-cases it, replaces every
 * character that is not a Unicode letter, a Unicode number, whitespace or an
 * apostrophe with a space, collapses whitespace runs and trims the ends.
 *
 * @param {string} value - Raw turn text.
 * @returns {string} The normalized text.
 */
var normalizeTurnText = (value) => {
  const lowered = value.toLowerCase();
  const withoutPunctuation = lowered.replace(/[^\p{L}\p{N}\s']/gu, " ");
  const singleSpaced = withoutPunctuation.replace(/\s+/g, " ");
  return singleSpaced.trim();
};
|
|
1043
2009
|
/**
 * Rounds a metric to a fixed number of decimal places.
 *
 * @param {number} value - Number to round.
 * @param {number} [digits=4] - Decimal places to keep.
 * @returns {number} `value` scaled by 10^digits, rounded with Math.round and scaled back.
 */
var roundMetric3 = (value, digits = 4) => {
  const scale = Math.pow(10, digits);
  const scaled = Math.round(value * scale);
  return scaled / scale;
};
|
|
2013
|
+
/**
 * Fills in benchmark defaults for an STT fallback configuration.
 *
 * @param {object} [config] - Partial fallback options; `adapter` is copied as-is.
 * @returns {object|undefined} A config with every option populated (nullish
 *   options replaced by their defaults), or undefined when `config` is falsy.
 */
var resolveBenchmarkFallbackConfig = (config) => {
  if (!config) {
    return;
  }
  // Listed in output key order; a default applies only to null/undefined options.
  const optionDefaults = [
    ["completionTimeoutMs", 2500],
    ["confidenceThreshold", 0.6],
    ["maxAttemptsPerTurn", 1],
    ["minTextLength", 2],
    ["replayWindowMs", 8000],
    ["settleMs", 220],
    ["trigger", "empty-or-low-confidence"]
  ];
  const resolved = { adapter: config.adapter };
  for (const [option, fallback] of optionDefaults) {
    resolved[option] = config[option] ?? fallback;
  }
  return resolved;
};
|
|
1047
2028
|
var chunkAudio2 = (audio, bytesPerChunk) => {
|
|
1048
2029
|
const chunks = [];
|
|
1049
2030
|
for (let offset = 0;offset < audio.byteLength; offset += bytesPerChunk) {
|
|
@@ -1052,39 +2033,178 @@ var chunkAudio2 = (audio, bytesPerChunk) => {
|
|
|
1052
2033
|
return chunks;
|
|
1053
2034
|
};
|
|
1054
2035
|
/**
 * Allocates a zero-filled byte buffer.
 *
 * @param {number} byteLength - Number of bytes to allocate.
 * @returns {Uint8Array} A new array of `byteLength` zero bytes.
 */
var createSilence2 = (byteLength) => {
  const silence = new Uint8Array(byteLength);
  return silence;
};
|
|
1055
|
-
var
|
|
1056
|
-
|
|
1057
|
-
|
|
2036
|
+
/**
 * Counts committed turns that repeat more often than the fixture expects.
 *
 * Texts are compared after `normalizeTurnText`. Each distinct text may occur
 * as many times as it appears in `expectedTurns`, and at least once even when
 * it is not expected at all; every occurrence beyond that counts as a duplicate.
 *
 * @param {string[]} actualTurns - Texts of the turns that were committed.
 * @param {string[]} expectedTurns - Texts the fixture expects.
 * @returns {number} Total number of surplus occurrences.
 */
var countUnexpectedDuplicateTurns = (actualTurns, expectedTurns) => {
  const tallyByNormalizedText = (turns) => {
    const counts = new Map;
    for (const text of turns) {
      const normalized = normalizeTurnText(text);
      const seenSoFar = counts.get(normalized) ?? 0;
      counts.set(normalized, seenSoFar + 1);
    }
    return counts;
  };
  const expectedCounts = tallyByNormalizedText(expectedTurns);
  const actualCounts = tallyByNormalizedText(actualTurns);
  let surplus = 0;
  for (const [normalized, occurrences] of actualCounts) {
    // An unexpected text is still allowed once; only repeats are duplicates.
    const allowed = Math.max(expectedCounts.get(normalized) ?? 0, 1);
    if (occurrences > allowed) {
      surplus += occurrences - allowed;
    }
  }
  return surplus;
};
|
|
2057
|
+
/**
 * Converts an outgoing socket payload into a trace-friendly value.
 *
 * - Non-string payloads are summarized as `{ byteLength, kind: "binary" }`
 *   using the payload's own `byteLength` property.
 * - String payloads are parsed as JSON; text that is not valid JSON is
 *   returned unchanged.
 *
 * @param {*} data - Payload handed to the mock socket's `send`.
 * @returns {*} The binary summary, the parsed JSON value, or the raw string.
 */
var normalizeSocketMessage = (data) => {
  if (typeof data !== "string") {
    return {
      // The original ternary on `instanceof ArrayBuffer` read `byteLength` in
      // both branches, so a single property read is equivalent.
      byteLength: data.byteLength,
      kind: "binary"
    };
  }
  try {
    return JSON.parse(data);
  } catch {
    // Not JSON: keep the raw text so the trace still shows what was sent.
    return data;
  }
};
|
|
2070
|
+
/**
 * Builds a mock socket whose `close` and `send` only report what they were
 * asked to do.
 *
 * @param {Function} [onEvent] - Optional listener called with
 *   `{ data, phase }`; `phase` is "socket.close" or "socket.send".
 * @returns {{close: Function, send: Function}} Async no-op socket methods.
 */
var createMockSocket2 = (onEvent) => {
  // The payload is built lazily so nothing is computed when no listener is set.
  const report = (phase, buildData) => {
    if (onEvent === undefined || onEvent === null) {
      return;
    }
    onEvent({
      data: buildData(),
      phase
    });
  };
  return {
    close: async (code, reason) => {
      report("socket.close", () => ({
        code,
        reason
      }));
    },
    send: async (data) => {
      report("socket.send", () => normalizeSocketMessage(data));
    }
  };
};
|
|
1059
|
-
var
|
|
2087
|
+
/**
 * Polls a session until it looks idle or the time budget runs out.
 *
 * The session counts as idle when its current turn has neither final nor
 * partial text and its last activity (or creation time, when no activity is
 * recorded) is at least `settleMs` in the past. Resolves with undefined in
 * both the idle and the timed-out case.
 *
 * @param {object} session - Object exposing an async `snapshot()`.
 * @param {number} settleMs - Required quiet period in milliseconds.
 * @param {number} idleTimeoutMs - Maximum time to keep polling, in milliseconds.
 * @returns {Promise<void>}
 */
var waitForSessionIdle = async (session, settleMs, idleTimeoutMs) => {
  const startedAt = Date.now();
  const withinBudget = () => Date.now() - startedAt < idleTimeoutMs;
  while (withinBudget()) {
    const state = await session.snapshot();
    const hasPendingText = Boolean(state.currentTurn.finalText || state.currentTurn.partialText);
    const quietSince = state.lastActivityAt ?? state.createdAt;
    const settled = !hasPendingText && Date.now() - quietSince >= settleMs;
    if (settled) {
      return;
    }
    // Poll at most every 100 ms, faster when the settle window is shorter.
    await Bun.sleep(Math.min(100, settleMs));
  }
};
|
|
2099
|
+
var runVoiceSessionBenchmarkScenario = async (adapter, fixture, options = {}) => {
|
|
1060
2100
|
const store = createVoiceMemoryStore();
|
|
1061
|
-
const
|
|
2101
|
+
const committedTurns = [];
|
|
2102
|
+
const traceStartedAt = Date.now();
|
|
2103
|
+
const trace = [];
|
|
2104
|
+
const pushTrace = (entry) => {
|
|
2105
|
+
if (!options.trace) {
|
|
2106
|
+
return;
|
|
2107
|
+
}
|
|
2108
|
+
trace.push({
|
|
2109
|
+
...entry,
|
|
2110
|
+
atMs: Date.now() - traceStartedAt
|
|
2111
|
+
});
|
|
2112
|
+
};
|
|
2113
|
+
const captureSnapshot = async (phase) => {
|
|
2114
|
+
if (!options.trace) {
|
|
2115
|
+
return;
|
|
2116
|
+
}
|
|
2117
|
+
const snapshot = await store.getOrCreate(`session-bench-${fixture.id}`);
|
|
2118
|
+
pushTrace({
|
|
2119
|
+
data: {
|
|
2120
|
+
currentTurn: {
|
|
2121
|
+
finalText: snapshot.currentTurn.finalText,
|
|
2122
|
+
lastAudioAt: snapshot.currentTurn.lastAudioAt,
|
|
2123
|
+
lastSpeechAt: snapshot.currentTurn.lastSpeechAt,
|
|
2124
|
+
lastTranscriptAt: snapshot.currentTurn.lastTranscriptAt,
|
|
2125
|
+
partialText: snapshot.currentTurn.partialText,
|
|
2126
|
+
silenceStartedAt: snapshot.currentTurn.silenceStartedAt,
|
|
2127
|
+
transcriptCount: snapshot.currentTurn.transcripts.length
|
|
2128
|
+
},
|
|
2129
|
+
lastActivityAt: snapshot.lastActivityAt,
|
|
2130
|
+
status: snapshot.status,
|
|
2131
|
+
turns: snapshot.turns.map((turn) => turn.text)
|
|
2132
|
+
},
|
|
2133
|
+
phase
|
|
2134
|
+
});
|
|
2135
|
+
};
|
|
2136
|
+
const logger = {
|
|
2137
|
+
debug: (message, meta) => {
|
|
2138
|
+
pushTrace({
|
|
2139
|
+
data: meta,
|
|
2140
|
+
phase: `logger.debug:${message}`
|
|
2141
|
+
});
|
|
2142
|
+
},
|
|
2143
|
+
error: (message, meta) => {
|
|
2144
|
+
pushTrace({
|
|
2145
|
+
data: meta,
|
|
2146
|
+
phase: `logger.error:${message}`
|
|
2147
|
+
});
|
|
2148
|
+
},
|
|
2149
|
+
info: (message, meta) => {
|
|
2150
|
+
pushTrace({
|
|
2151
|
+
data: meta,
|
|
2152
|
+
phase: `logger.info:${message}`
|
|
2153
|
+
});
|
|
2154
|
+
},
|
|
2155
|
+
warn: (message, meta) => {
|
|
2156
|
+
pushTrace({
|
|
2157
|
+
data: meta,
|
|
2158
|
+
phase: `logger.warn:${message}`
|
|
2159
|
+
});
|
|
2160
|
+
}
|
|
2161
|
+
};
|
|
1062
2162
|
const session = createVoiceSession({
|
|
2163
|
+
audioConditioning: resolveAudioConditioningConfig(fixture.audioConditioning),
|
|
1063
2164
|
context: {},
|
|
1064
2165
|
id: `session-bench-${fixture.id}`,
|
|
1065
|
-
logger
|
|
2166
|
+
logger,
|
|
1066
2167
|
reconnect: {
|
|
1067
2168
|
maxAttempts: 2,
|
|
1068
2169
|
strategy: "resume-last-turn",
|
|
1069
2170
|
timeout: 5000
|
|
1070
2171
|
},
|
|
1071
2172
|
route: {
|
|
2173
|
+
correctTurn: options.correctTurn,
|
|
1072
2174
|
onComplete: async () => {},
|
|
1073
2175
|
onTurn: async ({ turn }) => {
|
|
1074
|
-
|
|
2176
|
+
committedTurns.push({
|
|
2177
|
+
quality: turn.quality,
|
|
2178
|
+
text: turn.text
|
|
2179
|
+
});
|
|
2180
|
+
pushTrace({
|
|
2181
|
+
data: {
|
|
2182
|
+
quality: turn.quality,
|
|
2183
|
+
text: turn.text,
|
|
2184
|
+
transcriptCount: turn.transcripts.length,
|
|
2185
|
+
turnId: turn.id
|
|
2186
|
+
},
|
|
2187
|
+
phase: "route.onTurn"
|
|
2188
|
+
});
|
|
1075
2189
|
}
|
|
1076
2190
|
},
|
|
1077
|
-
|
|
2191
|
+
phraseHints: fixture.phraseHints,
|
|
2192
|
+
socket: createMockSocket2(pushTrace),
|
|
1078
2193
|
store,
|
|
1079
2194
|
stt: adapter,
|
|
1080
|
-
|
|
2195
|
+
sttFallback: resolveBenchmarkFallbackConfig(options.sttFallback),
|
|
2196
|
+
sttLifecycle: fixture.sttLifecycle ?? "continuous",
|
|
2197
|
+
turnDetection: resolveTurnDetectionConfig({
|
|
2198
|
+
profile: fixture.turnProfile ?? "balanced",
|
|
1081
2199
|
silenceMs: fixture.silenceMs ?? DEFAULT_SILENCE_MS,
|
|
1082
|
-
speechThreshold: fixture.speechThreshold ?? DEFAULT_SPEECH_THRESHOLD
|
|
1083
|
-
|
|
2200
|
+
speechThreshold: fixture.speechThreshold ?? DEFAULT_SPEECH_THRESHOLD,
|
|
2201
|
+
transcriptStabilityMs: fixture.transcriptStabilityMs ?? 900
|
|
2202
|
+
})
|
|
1084
2203
|
});
|
|
1085
2204
|
const startedAt = Date.now();
|
|
1086
2205
|
let reconnectTriggered = false;
|
|
1087
|
-
await session.connect(createMockSocket2());
|
|
2206
|
+
await session.connect(createMockSocket2(pushTrace));
|
|
2207
|
+
await captureSnapshot("session.connected");
|
|
1088
2208
|
try {
|
|
1089
2209
|
const chunkDurationMs = fixture.chunkDurationMs ?? 100;
|
|
1090
2210
|
const bytesPerMillisecond = fixture.format.sampleRateHz * fixture.format.channels * 2 / 1000;
|
|
@@ -1095,13 +2215,22 @@ var runVoiceSessionBenchmarkScenario = async (adapter, fixture) => {
|
|
|
1095
2215
|
await Bun.sleep(chunkDurationMs);
|
|
1096
2216
|
if (fixture.reconnectAtChunkIndex !== undefined && index === fixture.reconnectAtChunkIndex && !reconnectTriggered) {
|
|
1097
2217
|
reconnectTriggered = true;
|
|
2218
|
+
pushTrace({
|
|
2219
|
+
data: {
|
|
2220
|
+
chunkIndex: index
|
|
2221
|
+
},
|
|
2222
|
+
phase: "reconnect.begin"
|
|
2223
|
+
});
|
|
2224
|
+
await captureSnapshot("reconnect.pre-disconnect");
|
|
1098
2225
|
await session.disconnect({
|
|
1099
2226
|
reason: "benchmark-reconnect",
|
|
1100
2227
|
recoverable: true,
|
|
1101
2228
|
type: "close"
|
|
1102
2229
|
});
|
|
2230
|
+
await captureSnapshot("reconnect.post-disconnect");
|
|
1103
2231
|
await Bun.sleep(fixture.reconnectPauseMs ?? 150);
|
|
1104
|
-
await session.connect(createMockSocket2());
|
|
2232
|
+
await session.connect(createMockSocket2(pushTrace));
|
|
2233
|
+
await captureSnapshot("reconnect.post-connect");
|
|
1105
2234
|
}
|
|
1106
2235
|
}
|
|
1107
2236
|
const tailPaddingMs = fixture.tailPaddingMs ?? 1200;
|
|
@@ -1112,13 +2241,16 @@ var runVoiceSessionBenchmarkScenario = async (adapter, fixture) => {
|
|
|
1112
2241
|
await Bun.sleep(chunkDurationMs);
|
|
1113
2242
|
}
|
|
1114
2243
|
}
|
|
1115
|
-
await
|
|
2244
|
+
await waitForSessionIdle(session, Math.max(1200, (fixture.silenceMs ?? DEFAULT_SILENCE_MS) + (fixture.transcriptStabilityMs ?? 900)), 8000);
|
|
2245
|
+
await captureSnapshot("session.idle");
|
|
1116
2246
|
} finally {
|
|
2247
|
+
await captureSnapshot("session.pre-close");
|
|
1117
2248
|
await session.close("session-benchmark-complete");
|
|
1118
2249
|
}
|
|
1119
|
-
const duplicateTurnCount =
|
|
2250
|
+
const duplicateTurnCount = countUnexpectedDuplicateTurns(committedTurns.map((turn) => turn.text), fixture.expectedTurnTexts);
|
|
1120
2251
|
const turnResults = fixture.expectedTurnTexts.map((expectedText, index) => {
|
|
1121
|
-
const
|
|
2252
|
+
const actualTurn = committedTurns[index];
|
|
2253
|
+
const actualText = actualTurn?.text;
|
|
1122
2254
|
if (!actualText) {
|
|
1123
2255
|
return {
|
|
1124
2256
|
actualText: "",
|
|
@@ -1133,20 +2265,22 @@ var runVoiceSessionBenchmarkScenario = async (adapter, fixture) => {
|
|
|
1133
2265
|
accuracy,
|
|
1134
2266
|
expectedText,
|
|
1135
2267
|
index,
|
|
1136
|
-
passes: accuracy.passesThreshold
|
|
2268
|
+
passes: accuracy.passesThreshold,
|
|
2269
|
+
quality: actualTurn?.quality
|
|
1137
2270
|
};
|
|
1138
2271
|
});
|
|
1139
|
-
for (let index = fixture.expectedTurnTexts.length;index <
|
|
2272
|
+
for (let index = fixture.expectedTurnTexts.length;index < committedTurns.length; index += 1) {
|
|
1140
2273
|
turnResults.push({
|
|
1141
|
-
actualText:
|
|
2274
|
+
actualText: committedTurns[index]?.text ?? "",
|
|
1142
2275
|
expectedText: undefined,
|
|
1143
2276
|
index,
|
|
1144
|
-
passes: false
|
|
2277
|
+
passes: false,
|
|
2278
|
+
quality: committedTurns[index]?.quality
|
|
1145
2279
|
});
|
|
1146
2280
|
}
|
|
1147
|
-
const turnCountDelta =
|
|
2281
|
+
const turnCountDelta = committedTurns.length - fixture.expectedTurnTexts.length;
|
|
1148
2282
|
return {
|
|
1149
|
-
actualTurns:
|
|
2283
|
+
actualTurns: committedTurns.map((turn) => turn.text),
|
|
1150
2284
|
duplicateTurnCount,
|
|
1151
2285
|
elapsedMs: Date.now() - startedAt,
|
|
1152
2286
|
expectedTurns: fixture.expectedTurnTexts,
|
|
@@ -1156,7 +2290,8 @@ var runVoiceSessionBenchmarkScenario = async (adapter, fixture) => {
|
|
|
1156
2290
|
tags: fixture.tags ?? [],
|
|
1157
2291
|
title: fixture.title,
|
|
1158
2292
|
turnCountDelta,
|
|
1159
|
-
turnResults
|
|
2293
|
+
turnResults,
|
|
2294
|
+
trace: options.trace ? trace : undefined
|
|
1160
2295
|
};
|
|
1161
2296
|
};
|
|
1162
2297
|
var summarizeVoiceSessionBenchmark = (adapterId, scenarios) => {
|
|
@@ -1177,10 +2312,65 @@ var summarizeVoiceSessionBenchmark = (adapterId, scenarios) => {
|
|
|
1177
2312
|
scenariosWithTurnCountMismatch: scenarios.filter((scenario) => scenario.turnCountDelta !== 0).length
|
|
1178
2313
|
};
|
|
1179
2314
|
};
|
|
2315
|
+
/**
 * Aggregates several session benchmark reports into per-fixture statistics
 * plus an overall summary.
 *
 * Scenario results are grouped by `fixtureId` across all reports. For each
 * group the function reports average elapsed time, average/best/worst word
 * error rate (over turns that carry a numeric `accuracy.wordErrorRate`), pass
 * count and rate, and the pass rate among runs that triggered a reconnect
 * (1 when no run did). The summary averages those per-fixture figures and
 * counts stable (pass rate 1) and flaky (pass rate strictly between 0 and 1)
 * fixtures.
 *
 * @param {{adapterId: *, reports: Array}} input - Adapter id and the reports to merge.
 * @returns {object} `{ adapterId, generatedAt, runCount, scenarios, summary }`.
 */
var summarizeVoiceSessionBenchmarkSeries = (input) => {
  const runsByFixture = new Map;
  for (const report of input.reports) {
    for (const run of report.scenarios) {
      const bucket = runsByFixture.get(run.fixtureId) ?? [];
      bucket.push(run);
      runsByFixture.set(run.fixtureId, bucket);
    }
  }
  const scenarioAggregates = Array.from(runsByFixture, ([fixtureId, runs]) => {
    const wordErrorRates = [];
    for (const run of runs) {
      for (const turn of run.turnResults) {
        const rate = turn.accuracy?.wordErrorRate;
        if (typeof rate === "number") {
          wordErrorRates.push(rate);
        }
      }
    }
    const hasRates = wordErrorRates.length > 0;
    const reconnectRuns = runs.filter((run) => run.reconnectTriggered);
    const reconnectPasses = reconnectRuns.filter((run) => run.passes);
    const passCount = runs.filter((run) => run.passes).length;
    const [firstRun] = runs;
    const elapsed = runs.map((run) => run.elapsedMs);
    return {
      averageElapsedMs: roundMetric3(average2(elapsed), 2),
      averageWordErrorRate: roundMetric3(average2(wordErrorRates)),
      bestWordErrorRate: roundMetric3(hasRates ? Math.min(...wordErrorRates) : 0),
      fixtureId,
      passCount,
      passRate: roundMetric3(runs.length > 0 ? passCount / runs.length : 0),
      // Fixtures that never reconnected count as fully successful.
      reconnectSuccessRate: roundMetric3(reconnectRuns.length > 0 ? reconnectPasses.length / reconnectRuns.length : 1),
      runCount: runs.length,
      tags: firstRun.tags,
      title: firstRun.title,
      worstWordErrorRate: roundMetric3(hasRates ? Math.max(...wordErrorRates) : 0)
    };
  });
  let totalRunCount = 0;
  let totalPassCount = 0;
  for (const report of input.reports) {
    totalRunCount += report.scenarios.length;
    totalPassCount += report.summary.passCount;
  }
  const pluck = (field) => scenarioAggregates.map((aggregate) => aggregate[field]);
  const reconnectRates = pluck("reconnectSuccessRate").filter((rate) => Number.isFinite(rate));
  const isFlaky = (aggregate) => aggregate.passRate > 0 && aggregate.passRate < 1;
  const isStable = (aggregate) => aggregate.passRate === 1;
  return {
    adapterId: input.adapterId,
    generatedAt: Date.now(),
    runCount: input.reports.length,
    scenarios: scenarioAggregates,
    summary: {
      adapterId: input.adapterId,
      averageElapsedMs: roundMetric3(average2(pluck("averageElapsedMs")), 2),
      averagePassRate: roundMetric3(average2(pluck("passRate"))),
      averageWordErrorRate: roundMetric3(average2(pluck("averageWordErrorRate"))),
      flakyScenarioCount: scenarioAggregates.filter(isFlaky).length,
      generatedRunCount: input.reports.length,
      reconnectSuccessRate: roundMetric3(average2(reconnectRates)),
      scenarioCount: scenarioAggregates.length,
      stableScenarioCount: scenarioAggregates.filter(isStable).length,
      totalPassCount,
      totalRunCount
    }
  };
};
|
|
1180
2366
|
var runVoiceSessionBenchmark = async (input) => {
|
|
1181
2367
|
const scenarioResults = [];
|
|
1182
2368
|
for (const scenario of input.scenarios) {
|
|
1183
|
-
scenarioResults.push(await runVoiceSessionBenchmarkScenario(input.adapter, scenario
|
|
2369
|
+
scenarioResults.push(await runVoiceSessionBenchmarkScenario(input.adapter, scenario, {
|
|
2370
|
+
correctTurn: input.correctTurn,
|
|
2371
|
+
sttFallback: input.sttFallback,
|
|
2372
|
+
trace: input.trace
|
|
2373
|
+
}));
|
|
1184
2374
|
}
|
|
1185
2375
|
return {
|
|
1186
2376
|
adapterId: input.adapterId,
|
|
@@ -1189,17 +2379,39 @@ var runVoiceSessionBenchmark = async (input) => {
|
|
|
1189
2379
|
summary: summarizeVoiceSessionBenchmark(input.adapterId, scenarioResults)
|
|
1190
2380
|
};
|
|
1191
2381
|
};
|
|
2382
|
+
/**
 * Runs the session benchmark several times in sequence and summarizes the
 * resulting reports with `summarizeVoiceSessionBenchmarkSeries`.
 *
 * `input.runs` is floored and clamped to at least 1. A missing, non-numeric
 * or infinite value also means 1 run: `Math.max(1, NaN)` is NaN, which
 * previously made the loop run zero times and summarize an empty series, and
 * `Infinity` would never terminate.
 *
 * @param {object} input - `adapter`, `adapterId`, `scenarios`, `runs`, plus the
 *   optional `correctTurn`, `sttFallback` and `trace` settings forwarded to
 *   every run.
 * @returns {Promise<object>} The aggregated series summary.
 */
var runVoiceSessionBenchmarkSeries = async (input) => {
  const requestedRuns = Math.floor(input.runs);
  const runCount = Number.isFinite(requestedRuns) ? Math.max(1, requestedRuns) : 1;
  const reports = [];
  // Runs stay sequential so benchmark sessions never overlap in time.
  for (let runIndex = 0; runIndex < runCount; runIndex += 1) {
    reports.push(await runVoiceSessionBenchmark({
      adapter: input.adapter,
      adapterId: input.adapterId,
      correctTurn: input.correctTurn,
      scenarios: input.scenarios,
      sttFallback: input.sttFallback,
      trace: input.trace
    }));
  }
  return summarizeVoiceSessionBenchmarkSeries({
    adapterId: input.adapterId,
    reports
  });
};
|
|
1192
2400
|
export {
|
|
2401
|
+
summarizeVoiceSessionBenchmarkSeries,
|
|
1193
2402
|
summarizeVoiceSessionBenchmark,
|
|
1194
2403
|
summarizeSTTBenchmark,
|
|
1195
2404
|
scoreTranscriptAccuracy,
|
|
2405
|
+
runVoiceSessionBenchmarkSeries,
|
|
1196
2406
|
runVoiceSessionBenchmarkScenario,
|
|
1197
2407
|
runVoiceSessionBenchmark,
|
|
1198
2408
|
runVoiceResilienceBenchmark,
|
|
1199
2409
|
runSTTAdapterFixture,
|
|
1200
2410
|
runSTTAdapterBenchmark,
|
|
2411
|
+
resolveFixtureEnvironment,
|
|
1201
2412
|
mergeFinalTranscriptText,
|
|
1202
2413
|
loadVoiceTestFixtures,
|
|
1203
2414
|
getVoiceFixtureDirectory,
|
|
2415
|
+
evaluateSTTBenchmarkAcceptance,
|
|
1204
2416
|
compareSTTBenchmarks
|
|
1205
2417
|
};
|