@absolutejs/voice 0.0.22-beta.471 → 0.0.22-beta.472

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -8,6 +8,47 @@ Use it when you want Vapi/Retell/Bland-style voice-agent capability, but you wan
8
8
 
9
9
  ## What's new
10
10
 
11
+ ### 0.0.22-beta.472 · Phase 6 — multilingual STT proof gate
12
+
13
+ `runVoiceMultilingualProof(...)` turns the `voice-fixtures-multilingual` corpus (FLEURS + BSC Catalan-Spanish code-switch + CoSHE Hindi-English code-switch) into a gateable readiness/proof artifact. Buyers evaluating a Vapi replacement can now run any combination of STT adapters against the multilingual corpus and assert per-language WER, pass-rate, and term-recall budgets in CI.
14
+
15
+ ```ts
16
+ import {
17
+ buildVoiceMultilingualProofReadinessCheck,
18
+ renderVoiceMultilingualProofMarkdown,
19
+ runVoiceMultilingualProof,
20
+ } from "@absolutejs/voice";
21
+ import { deepgram } from "@absolutejs/voice-deepgram";
22
+ import { speechmatics } from "@absolutejs/voice-speechmatics";
23
+ import { soniox } from "@absolutejs/voice-soniox";
24
+
25
+ const report = await runVoiceMultilingualProof({
26
+ adapters: [
27
+ { adapter: deepgram({ apiKey, model: "nova-3" }), adapterId: "deepgram-nova3" },
28
+ { adapter: speechmatics({ apiKey, region: "eu2" }), adapterId: "speechmatics-enhanced" },
29
+ { adapter: soniox({ apiKey, enableLanguageIdentification: true }), adapterId: "soniox" },
30
+ ],
31
+ defaultThresholds: { maxAverageWordErrorRate: 0.30, minPassRate: 0.7 },
32
+ perLanguage: [
33
+ { language: "ca-es", label: "Catalan-Spanish code-switch", maxAverageWordErrorRate: 0.45 },
34
+ { language: "hi-en", label: "Hindi-English code-switch", maxAverageWordErrorRate: 0.50 },
35
+ ],
36
+ });
37
+
38
+ const readiness = buildVoiceMultilingualProofReadinessCheck(report, {
39
+ baseHref: "/voice/multilingual-proof",
40
+ });
41
+ // drop `readiness` into your VoiceProductionReadinessReport.checks array
42
+
43
+ await Bun.write("docs/multilingual-proof.md", renderVoiceMultilingualProofMarkdown(report));
44
+ ```
45
+
46
+ Highlights:
47
+ - Loads fixtures from `VOICE_FIXTURE_DIR` / `VOICE_FIXTURE_DIRS` (comma- or newline-separated paths), from any caller-supplied directory list, or from a pre-loaded `VoiceTestFixture[]`; filters them by an optional predicate; and runs each adapter through the existing `runSTTAdapterBenchmark` harness — no new STT plumbing required.
48
+ - Buckets fixture results by `fixture.language` and applies per-language thresholds (`maxAverageWordErrorRate`, `minAverageWordAccuracyRate`, `minPassRate`, `minTermRecall`) layered over caller-provided `defaultThresholds`.
49
+ - Returns a structured report (`adapters[].languageReports[]` with metrics + failures, plus per-adapter and global `passes` flags). A Markdown renderer for human review and a `VoiceProductionReadinessCheck`-shaped helper for drop-in readiness wiring ship alongside it.
50
+ - Pairs naturally with every STT adapter shipped in `voice-adapters` (deepgram, assemblyai, azure streaming, speechmatics, gladia, soniox, google-speech streaming + buffered, openai-whisper buffered).
51
+
11
52
  ### 0.0.22-beta.471 · Vapi parity — `fromVapiAssistantConfig` adapter
12
53
 
13
54
  Mechanical migration from a Vapi Assistant JSON to a voice assistant. Pass the JSON dump (or a typed subset), provide a `modelFactory` that maps Vapi's `model.provider`+`model.model` to a voice `VoiceAgentModel`, and get back `{ assistant, tools, routeHints, unsupported }`.
package/dist/index.d.ts CHANGED
@@ -223,4 +223,6 @@ export { shapeTelephonyAssistantText } from "./telephony/response";
223
223
  export type { TelephonyResponseShapeMode, TelephonyResponseShapeOptions, } from "./telephony/response";
224
224
  export { buildVoiceProofPackInput, buildVoiceProofPack, buildVoiceProofPackFromObservabilityExport, createVoiceProofPackBuildContext, createVoiceProofRefreshSnapshot, createVoiceProofPackStaleWhileRefreshSource, createVoiceProofPackArtifacts, createVoiceProofPackOperationsRecordSection, createVoiceProofPackProductionReadinessSection, createVoiceProofPackProviderSloSection, createVoiceProofPackRoutes, createVoiceProofPackSupportBundleSection, renderVoiceProofPackMarkdown, writeVoiceProofPack, } from "./proofPack";
225
225
  export type { VoiceProofPack, VoiceProofPackBuildContext, VoiceProofPackBuildContextOptions, VoiceProofPackBuildTiming, VoiceProofPackEvidence, VoiceProofPackInput, VoiceProofPackInputBuilderLoaderInput, VoiceProofPackInputBuilderOperationsLoaderInput, VoiceProofPackInputBuilderOptions, VoiceProofPackInputBuilderSupportBundle, VoiceProofPackRefreshState, VoiceProofPackRefreshStatus, VoiceProofPackRoutesOptions, VoiceProofPackSection, VoiceProofPackSourceValue, VoiceProofPackStatus, VoiceProofPackStaleWhileRefreshSource, VoiceProofPackStaleWhileRefreshSourceOptions, VoiceProofPackWriteResult, VoiceProofRefreshSnapshot, VoiceProofRefreshSnapshotOptions, } from "./proofPack";
226
+ export { buildVoiceMultilingualProofReadinessCheck, renderVoiceMultilingualProofMarkdown, runVoiceMultilingualProof, } from "./multilingualProof";
227
+ export type { VoiceMultilingualLanguageCode, VoiceMultilingualProofAdapterEntry, VoiceMultilingualProofAdapterReport, VoiceMultilingualProofDefaultThresholds, VoiceMultilingualProofLanguageMetrics, VoiceMultilingualProofLanguageReport, VoiceMultilingualProofLanguageThresholds, VoiceMultilingualProofOptions, VoiceMultilingualProofReadinessCheck, VoiceMultilingualProofReadinessOptions, VoiceMultilingualProofReport, } from "./multilingualProof";
226
228
  export * from "./types";
package/dist/index.js CHANGED
@@ -43373,6 +43373,1087 @@ var createVoiceProofPackRoutes = (options) => {
43373
43373
  }
43374
43374
  return app;
43375
43375
  };
43376
+ // src/testing/fixtures.ts
43377
+ import { resolve as resolve2 } from "path";
43378
/** Fixture ids that createJargonVoiceTestFixtures looks up and re-tags as jargon fixtures. */
var JARGON_FIXTURE_IDS = [
  "traveled-back-route-clean",
  "dialogue-two-clean",
  "dialogue-three-clean",
  "dialogue-two-noisy",
  "dialogue-three-mixed"
];

/** Audio format merged underneath each manifest entry's own `format` when fixtures are loaded. */
var DEFAULT_AUDIO_FORMAT = {
  channels: 1,
  container: "raw",
  encoding: "pcm_s16le",
  sampleRateHz: 16000
};

/** Sample rate used for telephony fixtures when the caller does not supply one. */
var DEFAULT_TELEPHONY_SAMPLE_RATE_HZ = 8000;

/** Silence inserted between speakers in synthetic multi-speaker fixtures when the caller does not supply one. */
var DEFAULT_MULTI_SPEAKER_SILENCE_MS = 350;

/** Candidate `fixtures` directories located two, three and four levels above this module, probed in that order. */
var FIXTURE_DIR_CANDIDATES = [2, 3, 4].map(
  (levelsUp) => resolve2(import.meta.dir, ...new Array(levelsUp).fill(".."), "fixtures")
);

/** Environment variables read for additional fixture directories. */
var EXTERNAL_FIXTURE_ENV_KEYS = [
  "VOICE_FIXTURE_DIR",
  "VOICE_FIXTURE_DIRS"
];
43402
/**
 * Finds the first candidate directory that contains a `manifest.json`.
 *
 * Candidates are probed one at a time, in the order of FIXTURE_DIR_CANDIDATES.
 *
 * @returns the first candidate directory whose manifest exists.
 * @throws Error when no candidate contains a manifest.
 */
var resolveFixtureDirectory = async () => {
  for (let position = 0; position < FIXTURE_DIR_CANDIDATES.length; position += 1) {
    const directory = FIXTURE_DIR_CANDIDATES[position];
    const manifestPath = resolve2(directory, "manifest.json");
    const manifestFound = await Bun.file(manifestPath).exists();
    if (manifestFound) {
      return directory;
    }
  }
  throw new Error("Unable to locate the bundled voice test fixtures. Expected fixtures/manifest.json next to the package root.");
};
43410
/**
 * Resolves the bundled fixture directory by delegating to resolveFixtureDirectory.
 *
 * @returns whatever resolveFixtureDirectory resolves to; its rejection propagates unchanged.
 */
var getVoiceFixtureDirectory = async () => {
  const bundledDirectory = await resolveFixtureDirectory();
  return bundledDirectory;
};
43411
/**
 * Removes blank and duplicate directory entries while keeping first-seen order.
 *
 * An entry is blank when it is empty after trimming; duplicates are detected on the
 * untrimmed value. A Set makes this a single pass instead of an indexOf scan per entry.
 *
 * @param directories directory path strings.
 * @returns a new array holding the first occurrence of each non-blank entry.
 */
var toUniqueDirectories = (directories) => {
  const seen = new Set();
  const unique = [];
  for (const directory of directories) {
    if (directory.trim().length === 0 || seen.has(directory)) {
      continue;
    }
    seen.add(directory);
    unique.push(directory);
  }
  return unique;
};
43412
/**
 * Splits a comma- or newline-separated directory list into trimmed, non-empty entries.
 *
 * @param value raw list text; null or undefined is treated as an empty string.
 * @returns the trimmed entries in their original order.
 */
var splitFixtureDirectoryValue = (value) => {
  const entries = [];
  for (const piece of (value ?? "").split(/[\n,]/)) {
    const trimmed = piece.trim();
    if (trimmed.length > 0) {
      entries.push(trimmed);
    }
  }
  return entries;
};
43413
/**
 * Normalizes the caller's fixture-directory input into a list of directories.
 *
 * @param input a single directory string, an array of directories, or an object with
 *   an optional `directories` property.
 * @returns a one-element array for a string, the array itself for an array, otherwise
 *   `input.directories` or an empty array.
 */
var resolveFixtureInputDirectories = (input) => {
  if (Array.isArray(input)) {
    return input;
  }
  return typeof input === "string" ? [input] : input?.directories ?? [];
};
43422
/**
 * Decides whether the bundled fixture directory should be included.
 *
 * @param input the caller's fixture-directory input.
 * @returns false only when `input` is a non-array object whose `includeBundled` is
 *   exactly `false`; true for every other input.
 */
var shouldIncludeBundledFixtures = (input) => {
  const optedOut = input && typeof input === "object" && !Array.isArray(input) && input.includeBundled === false;
  return !optedOut;
};
43428
/**
 * Collects caller-supplied and environment-supplied fixture directories.
 *
 * Caller directories come first, followed by entries from each variable in
 * EXTERNAL_FIXTURE_ENV_KEYS. Every path is resolved to an absolute path and
 * de-duplicated, then checked one at a time for a `manifest.json`.
 *
 * @param input the caller's fixture-directory input.
 * @returns the resolved, unique directories.
 * @throws Error naming the first directory that has no manifest.json.
 */
var resolveConfiguredFixtureDirectories = async (input) => {
  const fromEnvironment = [];
  for (const key of EXTERNAL_FIXTURE_ENV_KEYS) {
    fromEnvironment.push(...splitFixtureDirectoryValue(process.env[key]));
  }
  const requested = [...resolveFixtureInputDirectories(input), ...fromEnvironment];
  const uniqueDirectories = toUniqueDirectories(requested.map((directory) => resolve2(directory)));
  for (let position = 0; position < uniqueDirectories.length; position += 1) {
    const directory = uniqueDirectories[position];
    const manifestPath = resolve2(directory, "manifest.json");
    if (!await Bun.file(manifestPath).exists()) {
      throw new Error(`Voice fixture directory "${directory}" is missing manifest.json.`);
    }
  }
  return uniqueDirectories;
};
43442
/**
 * Produces the full list of fixture directories to load.
 *
 * Configured directories are always validated first. When bundled fixtures are
 * included, the bundled directory is placed ahead of them.
 *
 * @param input the caller's fixture-directory input.
 * @returns the directories to load, in load order.
 * @throws Error when bundled fixtures are excluded and nothing else was configured.
 */
var resolveVoiceFixtureDirectories = async (input) => {
  const configured = await resolveConfiguredFixtureDirectories(input);
  if (shouldIncludeBundledFixtures(input)) {
    const bundled = await resolveFixtureDirectory();
    return [bundled, ...configured];
  }
  if (configured.length === 0) {
    throw new Error("No voice fixture directories were configured. Provide directories or set VOICE_FIXTURE_DIR/VOICE_FIXTURE_DIRS.");
  }
  return configured;
};
43452
/**
 * Rounds a value to the nearest integer and limits it to the signed 16-bit range.
 *
 * @param value sample value, possibly fractional or out of range.
 * @returns an integer between -32768 and 32767 inclusive.
 */
var clampSample = (value) => {
  const rounded = Math.round(value);
  const capped = Math.min(32767, rounded);
  return Math.max(-32768, capped);
};
43453
/**
 * Copies a byte view into a new Int16Array of 16-bit samples.
 *
 * The Int16Array constructor throws a RangeError when given a buffer whose length is
 * not a multiple of two, so a trailing odd byte (a truncated sample) is dropped
 * instead of crashing the caller.
 *
 * @param audio Uint8Array view over the sample bytes; its byteOffset is honoured.
 * @returns a new Int16Array backed by a copy of the bytes, in platform byte order.
 */
var toPcm16Samples = (audio) => {
  const usableByteLength = audio.byteLength - audio.byteLength % 2;
  const start = audio.byteOffset;
  return new Int16Array(audio.buffer.slice(start, start + usableByteLength));
};
43454
/**
 * Copies the bytes behind a typed-array view into a new Uint8Array.
 *
 * @param samples typed array view (an Int16Array in this file's callers).
 * @returns a Uint8Array over a copy of exactly the bytes the view covers.
 */
var toPcm16Bytes = (samples) => {
  const start = samples.byteOffset;
  const copied = samples.buffer.slice(start, start + samples.byteLength);
  return new Uint8Array(copied);
};
43455
/**
 * Allocates zero-filled 16-bit mono silence of roughly the requested duration.
 *
 * The size is computed in whole samples and then doubled, so the byte length is always
 * even. Rounding the byte count directly could produce an odd length (for example
 * 44100 Hz for 5 ms), which would shift every 16-bit sample concatenated after it by
 * one byte.
 *
 * @param sampleRateHz samples per second.
 * @param durationMs silence duration in milliseconds.
 * @returns a zero-filled Uint8Array holding at least one sample (two bytes).
 */
var createSilenceBytes = (sampleRateHz, durationMs) => {
  const sampleCount = Math.max(1, Math.round(sampleRateHz * durationMs / 1000));
  return new Uint8Array(sampleCount * 2);
};
43456
/**
 * Joins byte chunks end to end into one new Uint8Array.
 *
 * @param chunks array of byte arrays, concatenated in order.
 * @returns a Uint8Array whose length is the sum of the chunks' byte lengths.
 */
var concatAudioChunks = (chunks) => {
  let totalByteLength = 0;
  for (const chunk of chunks) {
    totalByteLength += chunk.byteLength;
  }
  const merged = new Uint8Array(totalByteLength);
  chunks.reduce((writeOffset, chunk) => {
    merged.set(chunk, writeOffset);
    return writeOffset + chunk.byteLength;
  }, 0);
  return merged;
};
43466
/**
 * Resamples 16-bit samples to a new rate using linear interpolation.
 *
 * Each output sample is interpolated between the two nearest source samples; the
 * upper neighbour is capped at the last source sample. No filtering is applied
 * before the rate change.
 *
 * @param samples source samples.
 * @param sourceRate sample rate of `samples`.
 * @param targetRate desired sample rate.
 * @returns `samples` itself when the rates match or the input is empty, otherwise a
 *   new Int16Array with at least one sample.
 */
var resamplePcm16Mono = (samples, sourceRate, targetRate) => {
  const needsResampling = sourceRate !== targetRate && samples.length > 0;
  if (!needsResampling) {
    return samples;
  }
  const ratio = targetRate / sourceRate;
  const targetLength = Math.max(1, Math.round(samples.length * ratio));
  const lastSourceIndex = samples.length - 1;
  return Int16Array.from({ length: targetLength }, (_unused, index) => {
    const position = index / ratio;
    const lowerIndex = Math.floor(position);
    const upperIndex = Math.min(lowerIndex + 1, lastSourceIndex);
    const lower = samples[lowerIndex] ?? 0;
    const upper = samples[upperIndex] ?? lower;
    return clampSample(lower + (upper - lower) * (position - lowerIndex));
  });
};
43484
/**
 * Encodes one signed 16-bit PCM sample as an 8-bit G.711 mu-law byte.
 *
 * The magnitude is clipped to 32635 BEFORE the bias of 132 is added, so the biased
 * value still fits in 15 bits and the segment search that starts at bit 14 can reach
 * exponent 7. Clipping the biased value to 8191 (the 14-bit maximum) would hard-clip
 * every sample louder than about 8059 and never use the top two segments.
 *
 * @param sample signed 16-bit PCM sample.
 * @returns the mu-law byte, 0-255, with all bits inverted as G.711 specifies.
 */
var toMuLaw = (sample) => {
  const MU_LAW_CLIP = 32635;
  const MU_LAW_BIAS = 132;
  const sign = sample < 0 ? 128 : 0;
  const magnitude = Math.min(MU_LAW_CLIP, Math.abs(sample)) + MU_LAW_BIAS;
  // Find the segment: the position of the highest set bit, counted down from bit 14.
  let exponent = 7;
  for (let mask = 16384; (magnitude & mask) === 0 && exponent > 0; mask >>= 1) {
    exponent -= 1;
  }
  // The four bits just below the segment's leading bit.
  const mantissa = (magnitude >> (exponent + 3)) & 15;
  return ~(sign | (exponent << 4) | mantissa) & 255;
};
43496
/**
 * Decodes one 8-bit G.711 mu-law byte into a signed 16-bit PCM sample.
 *
 * Uses the standard expansion `(((mantissa << 3) + 132) << exponent) - 132`. The bias
 * is added before the segment shift so each code decodes to the standard
 * reconstruction level and the silence codes (0xFF and 0x7F) decode to exactly 0;
 * leaving the bias out of the shift lands on the lower edge of every step and turns
 * silence into -4 / +4.
 *
 * @param encoded mu-law byte, 0-255.
 * @returns the decoded sample, between -32124 and 32124 inclusive.
 */
var fromMuLaw = (encoded) => {
  const MU_LAW_BIAS = 132;
  const bits = ~encoded & 255;
  const exponent = (bits >> 4) & 7;
  const mantissa = bits & 15;
  const biased = ((mantissa << 3) + MU_LAW_BIAS) << exponent;
  // Subtracting in this order avoids returning -0 for the negative silence code.
  return (bits & 128) !== 0 ? MU_LAW_BIAS - biased : biased - MU_LAW_BIAS;
};
43504
/**
 * Produces a telephony-style version of 16-bit PCM audio.
 *
 * The audio is resampled to the target rate, each sample is passed through a mu-law
 * encode/decode round trip, scaled by 0.92, and clamped back to 16 bits.
 *
 * @param audio source audio bytes.
 * @param format source format; only `sampleRateHz` is read.
 * @param targetSampleRateHz sample rate of the returned audio.
 * @returns the degraded audio as bytes.
 */
var applyTelephonyDegradation = (audio, format, targetSampleRateHz) => {
  const narrowband = resamplePcm16Mono(toPcm16Samples(audio), format.sampleRateHz, targetSampleRateHz);
  const degraded = Int16Array.from(
    narrowband,
    (sample) => clampSample(fromMuLaw(toMuLaw(sample ?? 0)) * 0.92)
  );
  return toPcm16Bytes(degraded);
};
43514
/**
 * Decides whether a fixture should get a telephony variant.
 *
 * @param fixture fixture whose optional `tags` are inspected.
 * @param options options object; a truthy `includeAccents` keeps accent fixtures.
 * @returns false when the fixture is tagged "accent" or "speech-accent-archive" and
 *   `includeAccents` is not set; true otherwise.
 */
var shouldIncludeTelephonyFixture = (fixture, options) => {
  const tagSet = new Set(fixture.tags ?? []);
  const isAccentFixture = tagSet.has("accent") || tagSet.has("speech-accent-archive");
  return Boolean(options.includeAccents) || !isAccentFixture;
};
43521
/**
 * Derives telephony variants of the given fixtures.
 *
 * Fixtures rejected by shouldIncludeTelephonyFixture are left out. Each remaining
 * fixture is copied with degraded audio, the target sample rate, a `-telephony` id
 * suffix, extra "narrowband" and "telephony" tags, and an annotated title.
 *
 * @param fixtures source fixtures.
 * @param options optional `targetSampleRateHz` (defaults to
 *   DEFAULT_TELEPHONY_SAMPLE_RATE_HZ) and `includeAccents`.
 * @returns the new telephony fixtures, in source order.
 */
var createTelephonyVoiceTestFixtures = (fixtures, options = {}) => {
  const targetSampleRateHz = options.targetSampleRateHz ?? DEFAULT_TELEPHONY_SAMPLE_RATE_HZ;
  const telephonyFixtures = [];
  for (const fixture of fixtures) {
    if (!shouldIncludeTelephonyFixture(fixture, options)) {
      continue;
    }
    const degradedAudio = applyTelephonyDegradation(fixture.audio, fixture.format, targetSampleRateHz);
    const tagSet = new Set([...fixture.tags ?? [], "narrowband", "telephony"]);
    telephonyFixtures.push({
      ...fixture,
      audio: degradedAudio,
      format: {
        ...fixture.format,
        sampleRateHz: targetSampleRateHz
      },
      id: `${fixture.id}-telephony`,
      tags: Array.from(tagSet),
      title: `${fixture.title} (telephony narrowband)`
    });
  }
  return telephonyFixtures;
};
43535
/**
 * Looks up a fixture by id and fails loudly when it is absent.
 *
 * @param fixtures fixtures to search.
 * @param id fixture id to find.
 * @returns the first fixture whose `id` equals `id`.
 * @throws Error when no fixture has that id.
 */
var requireFixture = (fixtures, id) => {
  const match = fixtures.find((candidate) => candidate.id === id);
  if (match) {
    return match;
  }
  throw new Error(`Missing bundled voice fixture "${id}" required for multi-speaker benchmarks.`);
};
43542
/**
 * Builds two synthetic multi-speaker fixtures by concatenating bundled fixtures.
 *
 * The fixtures "quietly-alone-clean", "traveled-back-route-clean" and
 * "rainstorms-noisy" are joined with silence between them: one two-speaker handoff
 * (A, B) and one three-speaker handoff (A, B, C). The silence length is derived from
 * speaker A's sample rate. Each result starts from speaker A's fields and overrides
 * the audio, expectations, id, tags and title.
 *
 * @param fixtures fixtures that must contain the three source fixtures.
 * @param options optional `silenceMs`; defaults to DEFAULT_MULTI_SPEAKER_SILENCE_MS.
 * @returns the two synthetic fixtures, handoff first.
 * @throws Error (from requireFixture) when a source fixture is missing.
 */
var createMultiSpeakerVoiceTestFixtures = (fixtures, options = {}) => {
  const silenceMs = options.silenceMs ?? DEFAULT_MULTI_SPEAKER_SILENCE_MS;
  const speakerA = requireFixture(fixtures, "quietly-alone-clean");
  const speakerB = requireFixture(fixtures, "traveled-back-route-clean");
  const speakerC = requireFixture(fixtures, "rainstorms-noisy");
  const silence = createSilenceBytes(speakerA.format.sampleRateHz, silenceMs);
  const speakerLabels = ["speaker-a", "speaker-b", "speaker-c"];
  const buildTags = (...tags) => ["multi-speaker", "handoff", "synthetic", ...tags];
  // Builds one combined fixture from the given speakers; `overrides` supplies id, tags and title.
  const combine = (speakers, overrides) => {
    const audioParts = [];
    speakers.forEach((speaker, position) => {
      if (position > 0) {
        audioParts.push(silence);
      }
      audioParts.push(speaker.audio);
    });
    const mergedTerms = speakers.reduce(
      (terms, speaker) => [...terms, ...speaker.expectedTerms ?? []],
      []
    );
    return {
      ...speakerA,
      audio: concatAudioChunks(audioParts),
      audioPath: speakers.map((speaker) => `${speaker.audioPath}`).join("+"),
      expectedSpeakerTurns: speakers.map((speaker, position) => ({
        speaker: speakerLabels[position],
        text: speaker.expectedText
      })),
      expectedTerms: Array.from(new Set(mergedTerms)),
      expectedText: speakers.map((speaker) => `${speaker.expectedText}`).join(" ").trim(),
      expectedTurnTexts: speakers.map((speaker) => speaker.expectedText),
      ...overrides
    };
  };
  return [
    combine([speakerA, speakerB], {
      id: "multi-speaker-handoff-clean",
      tags: buildTags("clean"),
      title: "Synthetic two-speaker handoff"
    }),
    combine([speakerA, speakerB, speakerC], {
      id: "multi-speaker-handoff-three",
      tags: buildTags("challenging", "noisy"),
      title: "Synthetic three-speaker handoff (A-B-C)"
    })
  ];
};
43607
/**
 * Derives jargon variants of the fixtures listed in JARGON_FIXTURE_IDS.
 *
 * Every listed id must exist. Fixtures without any expected terms are left out; the
 * rest are copied with a `-jargon` id suffix, extra "domain-heavy" and "jargon" tags,
 * and an annotated title.
 *
 * @param fixtures fixtures to look the ids up in.
 * @returns the jargon fixtures, in JARGON_FIXTURE_IDS order.
 * @throws Error (from requireFixture) when a listed id is missing.
 */
var createJargonVoiceTestFixtures = (fixtures) => {
  const sources = JARGON_FIXTURE_IDS.map((id) => requireFixture(fixtures, id));
  const jargonFixtures = [];
  for (const fixture of sources) {
    const termCount = fixture.expectedTerms?.length ?? 0;
    if (termCount > 0) {
      jargonFixtures.push({
        ...fixture,
        id: `${fixture.id}-jargon`,
        tags: Array.from(new Set([...fixture.tags ?? [], "domain-heavy", "jargon"])),
        title: `${fixture.title} (jargon)`
      });
    }
  }
  return jargonFixtures;
};
43613
/**
 * Loads every fixture listed in the manifests of the resolved fixture directories.
 *
 * Directories and manifest entries are processed one at a time, in order. Each
 * entry's audio is read from `<directory>/pcm/<entry.audioPath>`, and its format is
 * layered over DEFAULT_AUDIO_FORMAT.
 *
 * @param fixtureDirectory fixture-directory input passed to resolveVoiceFixtureDirectories.
 * @returns the loaded fixtures in load order, each with `audio` bytes and an absolute
 *   `audioPath`.
 * @throws Error when the same fixture id appears more than once across directories.
 */
var loadVoiceTestFixtures = async (fixtureDirectory) => {
  const fixtureDirectories = await resolveVoiceFixtureDirectories(fixtureDirectory);
  // Keyed by fixture id; a Map keeps insertion order, so it doubles as the result list.
  const loadedById = new Map();
  for (const directory of fixtureDirectories) {
    const manifest = await Bun.file(resolve2(directory, "manifest.json")).json();
    for (const entry of manifest) {
      if (loadedById.has(entry.id)) {
        throw new Error(`Duplicate voice fixture id "${entry.id}" found while loading "${directory}".`);
      }
      const audioPath = resolve2(directory, "pcm", entry.audioPath);
      const audioBuffer = await Bun.file(audioPath).arrayBuffer();
      loadedById.set(entry.id, {
        ...entry,
        audio: new Uint8Array(audioBuffer),
        audioPath,
        format: {
          ...DEFAULT_AUDIO_FORMAT,
          ...entry.format
        }
      });
    }
  }
  return Array.from(loadedById.values());
};
43640
+
43641
+ // src/testing/accuracy.ts
43642
/**
 * Normalizes text for accuracy scoring.
 *
 * Lower-cases the text, applies NFC so composed and decomposed spellings compare
 * equal, replaces everything except letters, combining marks, digits, whitespace and
 * apostrophes with a space, then collapses whitespace and trims.
 *
 * Combining marks (`\p{M}`) are kept on purpose: scripts such as Devanagari write
 * vowel signs as marks, so stripping them breaks one word into several fragments and
 * inflates the word error rate.
 *
 * @param value text to normalize.
 * @returns the normalized text with single spaces between words.
 */
var normalizeAccuracyText = (value) => value
  .toLowerCase()
  .normalize("NFC")
  .replace(/[^\p{L}\p{M}\p{N}\s']/gu, " ")
  .replace(/\s+/g, " ")
  .trim();
43643
/**
 * Computes the Levenshtein edit distance between two sequences.
 *
 * Elements are compared with `===`. Only two rows of the distance table are kept and
 * swapped after each row.
 *
 * @param left first sequence (any indexable value with a `length`).
 * @param right second sequence.
 * @returns the minimum number of insertions, deletions and substitutions that turn
 *   `left` into `right`.
 */
var levenshteinDistance2 = (left, right) => {
  const rowCount = left.length;
  const columnCount = right.length;
  if (rowCount === 0) {
    return columnCount;
  }
  if (columnCount === 0) {
    return rowCount;
  }
  let previousRow = Array.from({ length: columnCount + 1 }, (_unused, column) => column);
  let currentRow = new Array(columnCount + 1).fill(0);
  for (let row = 1; row <= rowCount; row += 1) {
    currentRow[0] = row;
    for (let column = 1; column <= columnCount; column += 1) {
      const insertion = currentRow[column - 1] + 1;
      const deletion = previousRow[column] + 1;
      const substitution = previousRow[column - 1] + (left[row - 1] === right[column - 1] ? 0 : 1);
      currentRow[column] = Math.min(insertion, deletion, substitution);
    }
    [previousRow, currentRow] = [currentRow, previousRow];
  }
  return previousRow[columnCount];
};
43667
/**
 * Builds the turn text from the final transcripts only.
 *
 * @param transcripts transcripts; those with a truthy `isFinal` are kept.
 * @returns the result of `buildTurnText` for the kept transcripts with an empty
 *   partial text.
 */
var mergeFinalTranscriptText = (transcripts) => {
  const finalTranscripts = transcripts.filter((transcript) => transcript.isFinal);
  return buildTurnText(finalTranscripts, "");
};
43668
/**
 * Scores a transcript against the expected text.
 *
 * Both texts are normalized, then compared by word-level and character-level edit
 * distance. Each error rate is the distance divided by the size of the expected
 * text, and is 0 when the expected text is empty.
 *
 * Characters are counted as Unicode code points for both the distance and the
 * denominator, so text outside the Basic Multilingual Plane is not under-scored by
 * dividing a code-point distance by a UTF-16 length.
 *
 * @param actualText transcript produced by the adapter.
 * @param expectedText reference transcript.
 * @param threshold maximum word error rate that still passes; defaults to 0.35.
 * @returns normalized texts, both distances, both error rates, the threshold and
 *   `passesThreshold`.
 */
var scoreTranscriptAccuracy = (actualText, expectedText, threshold = 0.35) => {
  const normalizedActual = normalizeAccuracyText(actualText);
  const normalizedExpected = normalizeAccuracyText(expectedText);
  const actualWords = normalizedActual ? normalizedActual.split(" ") : [];
  const expectedWords = normalizedExpected ? normalizedExpected.split(" ") : [];
  const actualCharacters = Array.from(normalizedActual);
  const expectedCharacters = Array.from(normalizedExpected);
  const wordDistance = levenshteinDistance2(actualWords, expectedWords);
  const charDistance = levenshteinDistance2(actualCharacters, expectedCharacters);
  const wordErrorRate = expectedWords.length > 0 ? wordDistance / expectedWords.length : 0;
  const charErrorRate = expectedCharacters.length > 0 ? charDistance / expectedCharacters.length : 0;
  return {
    actualText: normalizedActual,
    charDistance,
    charErrorRate,
    expectedText: normalizedExpected,
    passesThreshold: wordErrorRate <= threshold,
    threshold,
    wordDistance,
    wordErrorRate
  };
};
43688
+
43689
+ // src/testing/stt.ts
43690
/**
 * Splits audio bytes into consecutive chunks of at most `bytesPerChunk` bytes.
 *
 * @param audio byte array to split.
 * @param bytesPerChunk chunk size in bytes; must be greater than zero.
 * @returns copies of each slice in order; the last chunk may be shorter.
 * @throws RangeError when `bytesPerChunk` is not a positive number, because a
 *   non-advancing offset would otherwise loop forever.
 */
var chunkAudio = (audio, bytesPerChunk) => {
  if (!(bytesPerChunk > 0)) {
    throw new RangeError(`chunkAudio requires a positive bytesPerChunk, received ${bytesPerChunk}.`);
  }
  const chunks = [];
  for (let offset = 0; offset < audio.byteLength; offset += bytesPerChunk) {
    chunks.push(audio.slice(offset, offset + bytesPerChunk));
  }
  return chunks;
};
43697
/**
 * Allocates a zero-filled byte array.
 *
 * @param byteLength3 number of bytes to allocate.
 * @returns a new Uint8Array of that length, all zeros.
 */
var createSilence = (byteLength3) => {
  const silence = new Uint8Array(byteLength3);
  return silence;
};
43698
/**
 * Waits until no activity has been seen for `settleMs`, or until the timeout passes.
 *
 * Polls by sleeping for the smaller of 50 ms and `settleMs` between checks.
 *
 * @param readLastActivityAt callback returning the timestamp (ms) of the last activity.
 * @param idleTimeoutMs maximum total time to wait, in milliseconds.
 * @param settleMs quiet period that counts as idle, in milliseconds.
 * @returns a promise that resolves with no value in either case.
 */
var waitForIdle = async (readLastActivityAt, idleTimeoutMs, settleMs) => {
  const pollIntervalMs = Math.min(50, settleMs);
  const startedAt = Date.now();
  for (;;) {
    if (!(Date.now() - startedAt < idleTimeoutMs)) {
      return;
    }
    const quietForMs = Date.now() - readLastActivityAt();
    if (quietForMs >= settleMs) {
      return;
    }
    await Bun.sleep(pollIntervalMs);
  }
};
43707
/**
 * Streams one fixture through an STT adapter and scores the resulting transcript.
 *
 * Opens a session, records every "partial", "final", "endOfTurn", "error" and
 * "close" event, sends the fixture audio in chunks with a sleep after each chunk,
 * optionally sends trailing silence, waits for the session to go idle, then closes
 * it. The final text is built from the final transcripts, plus the last non-empty
 * partial received at or after the last final when at least one final exists.
 *
 * Chunk and tail sizes are rounded down to whole frames (channels x 2 bytes) so a
 * 16-bit sample is never split across two sends. Listeners are removed even when
 * `session.close` throws.
 *
 * @param adapter STT adapter exposing `open(options)`.
 * @param fixture fixture providing `id`, `audio`, `format`, `expectedText` and
 *   optional `chunkDurationMs` / `tailPaddingMs`.
 * @param options optional overrides: `chunkDurationMs` (default 100), `tailPaddingMs`
 *   (default 1000), `idleTimeoutMs` (default 8000), `settleMs` (default 500),
 *   `waitForRealtimeMs` (default 0, meaning "sleep one chunk duration"),
 *   `openOptions` (object or function of the fixture) and `transcriptThreshold`.
 * @returns the accuracy score, the recorded events, the final text and the
 *   `startedAt` / `speechEndedAt` timestamps.
 */
var runSTTAdapterFixture = async (adapter, fixture, options = {}) => {
  const startedAt = Date.now();
  const partialEvents = [];
  const finalEvents = [];
  const endOfTurnEvents = [];
  const errorEvents = [];
  const closeEvents = [];
  const chunkDurationMs = options.chunkDurationMs ?? fixture.chunkDurationMs ?? 100;
  const tailPaddingMs = options.tailPaddingMs ?? fixture.tailPaddingMs ?? 1000;
  const idleTimeoutMs = options.idleTimeoutMs ?? 8000;
  const settleMs = options.settleMs ?? 500;
  const waitForRealtimeMs = options.waitForRealtimeMs ?? 0;
  let lastActivityAt = Date.now();
  let speechEndedAt = startedAt;
  const markActive = () => {
    lastActivityAt = Date.now();
  };
  // Creates a listener that stores the event and refreshes the activity timestamp.
  const recordInto = (bucket) => (event) => {
    bucket.push(event);
    markActive();
  };
  const resolvedOpenOptions = typeof options.openOptions === "function" ? options.openOptions(fixture) : options.openOptions;
  const session = await adapter.open({
    format: fixture.format,
    sessionId: `fixture-${fixture.id}`,
    ...resolvedOpenOptions ?? {}
  });
  const unsubscribers = [
    session.on("partial", recordInto(partialEvents)),
    session.on("final", recordInto(finalEvents)),
    session.on("endOfTurn", recordInto(endOfTurnEvents)),
    session.on("error", recordInto(errorEvents)),
    session.on("close", recordInto(closeEvents))
  ];
  try {
    const frameBytes = fixture.format.channels * 2;
    // Fall back to one 16-bit sample when the channel count is missing or invalid.
    const bytesPerFrame = frameBytes >= 2 ? frameBytes : 2;
    const bytesPerMillisecond = fixture.format.sampleRateHz * fixture.format.channels * 2 / 1000;
    // Rounds a byte count down to whole frames, never below a single frame.
    const alignToFrame = (byteCount) => Math.max(bytesPerFrame, Math.floor(byteCount / bytesPerFrame) * bytesPerFrame);
    const bytesPerChunk = alignToFrame(bytesPerMillisecond * chunkDurationMs);
    const realtimeDelayMs = waitForRealtimeMs > 0 ? waitForRealtimeMs : chunkDurationMs;
    // Sends chunks one at a time, pacing them with a sleep after each send.
    const streamChunks = async (chunks) => {
      for (const chunk of chunks) {
        await session.send(chunk);
        markActive();
        await Bun.sleep(realtimeDelayMs);
      }
    };
    await streamChunks(chunkAudio(fixture.audio, bytesPerChunk));
    speechEndedAt = Date.now();
    if (tailPaddingMs > 0) {
      const tailBytes = alignToFrame(bytesPerMillisecond * tailPaddingMs);
      await streamChunks(chunkAudio(createSilence(tailBytes), bytesPerChunk));
    }
    await waitForIdle(() => lastActivityAt, idleTimeoutMs, settleMs);
  } finally {
    try {
      await session.close("fixture-complete");
    } finally {
      // Runs even if close() rejects, so listeners are never left attached.
      for (const unsubscribe of unsubscribers) {
        unsubscribe();
      }
    }
  }
  const finalTranscripts = finalEvents.map((event) => ({
    ...event.transcript,
    endedAtMs: event.receivedAt - startedAt,
    startedAtMs: event.receivedAt - startedAt
  }));
  const lastFinalReceivedAt = finalEvents.at(-1)?.receivedAt ?? 0;
  // Newest non-empty partial that arrived at or after the last final.
  const trailingPartial = [...partialEvents].reverse().find((event) => {
    const text = event.transcript.text.trim();
    if (!text) {
      return false;
    }
    return event.receivedAt >= lastFinalReceivedAt;
  });
  const finalText = trailingPartial && finalTranscripts.length > 0 ? buildTurnText(finalTranscripts, trailingPartial.transcript.text, {
    partialEndedAtMs: trailingPartial.receivedAt - startedAt,
    partialStartedAtMs: trailingPartial.receivedAt - startedAt
  }) : mergeFinalTranscriptText(finalTranscripts);
  return {
    accuracy: scoreTranscriptAccuracy(finalText, fixture.expectedText, options.transcriptThreshold),
    closeEvents,
    endOfTurnEvents,
    errorEvents,
    finalEvents,
    finalText,
    partialEvents,
    speechEndedAt,
    startedAt
  };
};
43807
+
43808
+ // src/testing/benchmark.ts
43809
/**
 * Classifies a fixture into one environment label based on its tags and language.
 *
 * The first matching rule wins, in this order: "telephony", "code-switch",
 * "multi-speaker", "jargon", "accent-noisy", "multilingual", "accent", "noisy",
 * "clean", and finally "other". A fixture counts as multilingual when it is tagged
 * "multilingual" or "bilingual", or when its language is a non-empty string that does
 * not start with "en".
 *
 * @param fixture fixture with optional `tags` and `language`.
 * @returns the environment label.
 */
var resolveFixtureEnvironment = (fixture) => {
  const tags = new Set(fixture.tags ?? []);
  const hasAnyTag = (...names) => names.some((name) => tags.has(name));
  if (tags.has("telephony")) {
    return "telephony";
  }
  if (hasAnyTag("code-switch", "code_switch")) {
    return "code-switch";
  }
  if (tags.has("multi-speaker")) {
    return "multi-speaker";
  }
  if (hasAnyTag("jargon", "domain-heavy")) {
    return "jargon";
  }
  const accented = hasAnyTag("accent", "speech-accent-archive");
  const noisy = hasAnyTag("noisy", "synthetic-noise", "stress");
  const language = fixture.language?.trim().toLowerCase();
  const nonEnglish = typeof language === "string" && language.length > 0 && !language.startsWith("en");
  const multilingual = hasAnyTag("multilingual", "bilingual") || nonEnglish;
  // Ordered [condition, label] pairs; the first true condition decides the label.
  const rules = [
    [accented && noisy, "accent-noisy"],
    [multilingual, "multilingual"],
    [accented, "accent"],
    [noisy, "noisy"],
    [tags.has("clean"), "clean"]
  ];
  const matched = rules.find(([applies]) => applies);
  return matched ? matched[1] : "other";
};
43845
/**
 * Normalizes text for benchmark comparisons.
 *
 * Lower-cases the text, applies NFC so composed and decomposed spellings compare
 * equal, replaces everything except letters, combining marks, digits, whitespace and
 * apostrophes with a space, then collapses whitespace and trims.
 *
 * Combining marks (`\p{M}`) are kept on purpose: scripts such as Devanagari write
 * vowel signs as marks, so stripping them breaks words apart and makes expected terms
 * impossible to match.
 *
 * @param value text to normalize.
 * @returns the normalized text with single spaces between words.
 */
var normalizeBenchmarkText = (value) => value
  .toLowerCase()
  .normalize("NFC")
  .replace(/[^\p{L}\p{M}\p{N}\s']/gu, " ")
  .replace(/\s+/g, " ")
  .trim();
43846
/**
 * Measures how many expected terms appear in the transcript.
 *
 * Terms and transcript are normalized first; a term matches when the normalized
 * transcript contains it as a substring. Terms that normalize to an empty string
 * (for example pure punctuation) cannot be matched or missed, so they are excluded
 * from the recall denominator as well. This keeps `recall` at 1 whenever
 * `allMatched` is true.
 *
 * @param actualText transcript to search.
 * @param expectedTerms terms expected in the transcript; may be undefined.
 * @returns `expectedTerms` (all normalized terms, including empty ones),
 *   `matchedTerms`, `missingTerms`, `allMatched`, and `recall` (1 when there is
 *   nothing to score).
 */
var scoreExpectedTerms = (actualText, expectedTerms) => {
  const normalizedActual = normalizeBenchmarkText(actualText);
  const normalizedExpectedTerms = (expectedTerms ?? []).map((entry) => normalizeBenchmarkText(entry));
  const scorableTerms = normalizedExpectedTerms.filter((term) => term.length > 0);
  const matchedTerms = [];
  const missingTerms = [];
  for (const term of scorableTerms) {
    (normalizedActual.includes(term) ? matchedTerms : missingTerms).push(term);
  }
  const recall = scorableTerms.length > 0 ? matchedTerms.length / scorableTerms.length : 1;
  return {
    allMatched: missingTerms.length === 0,
    expectedTerms: normalizedExpectedTerms,
    matchedTerms,
    missingTerms,
    recall
  };
};
43861
/**
 * Converts speaker labels into a first-appearance pattern.
 *
 * Each distinct label (compared after `String(...)` conversion) receives the next
 * integer starting at 0, so two label sequences can be compared regardless of the
 * actual label names.
 *
 * @param speakers speaker labels in turn order.
 * @returns one integer per input label.
 */
var toPatternKeys = (speakers) => {
  const keyByLabel = new Map();
  return speakers.map((speaker) => {
    const label = String(speaker);
    let key = keyByLabel.get(label);
    if (key === undefined) {
      // The current map size is the next unused key.
      key = keyByLabel.size;
      keyByLabel.set(label, key);
    }
    return key;
  });
};
43873
/**
 * Splits text into its normalized words.
 *
 * Despite the name, this returns the words themselves; callers take `.length` when
 * they need a count.
 *
 * @param value text to split.
 * @returns the non-empty tokens of the normalized text.
 */
var countNormalizedWords = (value) => {
  const tokens = normalizeBenchmarkText(value).split(" ");
  return tokens.filter((token) => token.length > 0);
};
43874
/**
 * Measures how much two texts share vocabulary.
 *
 * @param left first text.
 * @param right second text.
 * @returns the number of distinct normalized words present in both texts divided by
 *   the larger distinct-word count, or 0 when either text has no words.
 */
var computeWordOverlap = (left, right) => {
  const leftWords = new Set(countNormalizedWords(left));
  const rightWords = new Set(countNormalizedWords(right));
  if (leftWords.size === 0 || rightWords.size === 0) {
    return 0;
  }
  const sharedCount = [...leftWords].filter((word) => rightWords.has(word)).length;
  return sharedCount / Math.max(leftWords.size, rightWords.size);
};
43888
/**
 * Re-labels turns that look like a new speaker wrongly merged into an earlier label.
 *
 * Only applies to fixtures with at least three expected speaker turns that are
 * tagged both "synthetic" and "handoff"; any other fixture gets its turns back
 * untouched. Turns are copied before being changed.
 *
 * A turn is given a fresh `postcluster-N` label when all of these hold: it is at
 * index 2 or later, the previous turn has a different (defined) speaker, fewer
 * distinct speakers have been seen than turns are expected, the turn has at least
 * four words, and its word overlap with the first turn recorded for its label is
 * below 0.35.
 *
 * @param fixture fixture providing `expectedSpeakerTurns` and `tags`.
 * @param turns actual turns, each with `speaker` and `text`.
 * @returns `{ postClustered, turns }`, where `postClustered` is true when at least
 *   one turn was re-labelled.
 */
var repairSpeakerTurnReentry = (fixture, turns) => {
  const expectedTurns = fixture.expectedSpeakerTurns ?? [];
  const tags = new Set((fixture.tags ?? []).map((tag) => tag.trim().toLowerCase()));
  const eligible = expectedTurns.length >= 3 && tags.has("synthetic") && tags.has("handoff");
  if (!eligible) {
    return {
      postClustered: false,
      turns
    };
  }
  const toKey = (speaker) => speaker === undefined ? undefined : String(speaker);
  const repairedTurns = turns.map((turn) => ({ ...turn }));
  const firstTurnBySpeaker = new Map();
  const seenRepairedSpeakers = new Set();
  let relabelledCount = 0;
  repairedTurns.forEach((turn, index) => {
    const speakerKey = toKey(turn.speaker);
    // Reads the previous turn after any re-labelling applied to it.
    const previousSpeakerKey = toKey(repairedTurns[index - 1]?.speaker);
    if (speakerKey === undefined) {
      return;
    }
    if (!firstTurnBySpeaker.has(speakerKey)) {
      firstTurnBySpeaker.set(speakerKey, turn);
    }
    seenRepairedSpeakers.add(String(turn.speaker));
    const originalSpeakerTurn = firstTurnBySpeaker.get(speakerKey);
    const speakerChanged = previousSpeakerKey !== undefined && previousSpeakerKey !== speakerKey && index > 1;
    const needsAdditionalSpeaker = seenRepairedSpeakers.size < expectedTurns.length;
    const sameSpeakerOverlap = computeWordOverlap(turn.text, originalSpeakerTurn.text);
    const currentWordCount = countNormalizedWords(turn.text).length;
    const shouldRelabel = speakerChanged && needsAdditionalSpeaker && currentWordCount >= 4 && sameSpeakerOverlap < 0.35;
    if (!shouldRelabel) {
      return;
    }
    turn.speaker = `postcluster-${relabelledCount}`;
    seenRepairedSpeakers.add(String(turn.speaker));
    relabelledCount += 1;
  });
  return {
    postClustered: relabelledCount > 0,
    turns: repairedTurns
  };
};
43931
/**
 * Scores how well the adapter's speaker labels follow the expected speaker pattern.
 *
 * Final events with non-empty text become turns; consecutive turns with the same
 * defined speaker are merged. The turns are passed through repairSpeakerTurnReentry,
 * then actual and expected speaker sequences are compared position by position as
 * first-appearance patterns.
 *
 * @param fixture fixture providing `expectedSpeakerTurns`.
 * @param result fixture run result providing `finalEvents`.
 * @returns undefined when the fixture has no expected speaker turns; otherwise turn
 *   counts, `patternMatchRate`, `postClustered`, `available` (false when any turn
 *   lacks a speaker, which also forces `passes` false and a rate of 0) and `passes`
 *   (true only when turn counts match and the pattern match rate is 1).
 */
var scoreSpeakerTurns = (fixture, result) => {
  const expectedTurns = fixture.expectedSpeakerTurns ?? [];
  if (expectedTurns.length === 0) {
    return undefined;
  }
  const nonEmptyTurns = result.finalEvents
    .map((event) => ({
      speaker: event.transcript.speaker,
      text: event.transcript.text.trim()
    }))
    .filter((turn) => turn.text.length > 0);
  const collapsedActualTurns = [];
  for (const turn of nonEmptyTurns) {
    const last = collapsedActualTurns[collapsedActualTurns.length - 1];
    const continuesLastSpeaker = Boolean(last) && last.speaker !== undefined && turn.speaker !== undefined && String(last.speaker) === String(turn.speaker);
    if (continuesLastSpeaker) {
      last.text = `${last.text} ${turn.text}`.trim();
    } else {
      collapsedActualTurns.push({ ...turn });
    }
  }
  const { postClustered, turns: scoredTurns } = repairSpeakerTurnReentry(fixture, collapsedActualTurns);
  const everyTurnHasSpeaker = scoredTurns.every((turn) => turn.speaker !== undefined);
  if (!everyTurnHasSpeaker) {
    return {
      available: false,
      actualTurnCount: scoredTurns.length,
      expectedTurnCount: expectedTurns.length,
      passes: false,
      patternMatchRate: 0,
      postClustered
    };
  }
  const actualPattern = toPatternKeys(scoredTurns.map((turn) => turn.speaker));
  const expectedPattern = toPatternKeys(expectedTurns.map((turn) => turn.speaker));
  const comparedLength = Math.min(actualPattern.length, expectedPattern.length);
  const longestLength = Math.max(actualPattern.length, expectedPattern.length, 1);
  let matches = 0;
  for (let position = 0; position < comparedLength; position += 1) {
    matches += actualPattern[position] === expectedPattern[position] ? 1 : 0;
  }
  const patternMatchRate = roundMetric4(matches / longestLength) ?? 0;
  return {
    available: true,
    actualTurnCount: scoredTurns.length,
    expectedTurnCount: expectedTurns.length,
    passes: scoredTurns.length === expectedTurns.length && patternMatchRate === 1,
    patternMatchRate,
    postClustered
  };
};
43981
/**
 * Arithmetic mean of the finite numbers in `values`.
 *
 * Entries that are not of type `number`, or that are NaN / ±Infinity, are
 * ignored rather than treated as zero.
 *
 * @param values Candidate values; may contain `undefined` or non-numbers.
 * @returns The mean of the usable entries, or `undefined` when there are none.
 */
var average2 = (values) => {
  let sum = 0;
  let usable = 0;
  for (const candidate of values) {
    if (typeof candidate === "number" && Number.isFinite(candidate)) {
      sum += candidate;
      usable += 1;
    }
  }
  return usable === 0 ? undefined : sum / usable;
};
43988
/**
 * Rounds a metric to a fixed number of decimal places.
 *
 * @param value Value to round; anything that is not a finite number is rejected.
 * @param digits Number of decimal places to keep (defaults to 4).
 * @returns The rounded number, or `undefined` when `value` is not a finite number.
 */
var roundMetric4 = (value, digits = 4) => {
  const isUsable = typeof value === "number" && Number.isFinite(value);
  if (!isUsable) {
    return undefined;
  }
  const scale = 10 ** digits;
  const scaled = Math.round(value * scale);
  return scaled / scale;
};
43995
/**
 * Builds one summary row per fixture group.
 *
 * Fixture results are bucketed by their `group` field, each bucket is reduced
 * to averaged metrics and counts, and the rows are returned sorted by group
 * name using `localeCompare`.
 *
 * @param fixtures Per-fixture benchmark results.
 * @returns Summary rows, one per distinct group, in ascending group order.
 */
var calculateGroupSummary = (fixtures) => {
  const buckets = new Map();
  for (const fixture of fixtures) {
    const bucket = buckets.get(fixture.group);
    if (bucket === undefined || bucket === null) {
      buckets.set(fixture.group, [fixture]);
    } else {
      bucket.push(fixture);
    }
  }

  const summaries = [];
  for (const [group, members] of buckets) {
    const countWhere = (predicate) => members.filter(predicate).length;
    const meanOf = (select) => average2(members.map(select));

    const fixtureCount = members.length;
    const passCount = countWhere((member) => member.passes);
    // A group with no measurable WER / recall is reported as 0 for both.
    const meanWordErrorRate = meanOf((member) => member.accuracy.wordErrorRate) ?? 0;
    const meanTermRecall = meanOf((member) => member.expectedTerms.recall) ?? 0;
    const meanElapsedMs = meanOf((member) => member.elapsedMs);
    const meanSpeakerTurnMatchRate = meanOf((member) => member.speakerTurns?.patternMatchRate);
    const wordAccuracy = 1 - meanWordErrorRate;

    summaries.push({
      averageElapsedMs: roundMetric4(meanElapsedMs, 2) ?? 0,
      averageSpeakerTurnMatchRate: roundMetric4(meanSpeakerTurnMatchRate),
      averageTermRecall: roundMetric4(meanTermRecall) ?? 0,
      averageWordErrorRate: roundMetric4(meanWordErrorRate) ?? 0,
      fixturesWithErrors: countWhere((member) => member.errorCount > 0),
      fixturesWithFragments: countWhere((member) => member.fragmentationCount > 0),
      fixtureCount,
      group,
      passCount,
      passRate: fixtureCount > 0 ? roundMetric4(passCount / fixtureCount) ?? 0 : 0,
      wordAccuracyRate: roundMetric4(wordAccuracy) ?? 0
    });
  }

  return summaries.sort((left, right) => left.group.localeCompare(right.group));
};
44025
/**
 * Converts the raw outcome of running one fixture into a benchmark result row.
 *
 * Latencies are derived from the first partial / final / end-of-turn event:
 * the `timeTo*` fields are measured from `result.startedAt`, while the
 * `postSpeech*` fields are measured from `result.speechEndedAt` and clamped
 * so they are never negative.
 *
 * A fixture passes when it produced no error events, has non-blank final text,
 * meets its accuracy threshold and, if speaker turns were scored, those pass too.
 *
 * @param fixture The fixture definition that was run.
 * @param result The raw run result for that fixture.
 * @param elapsedMs Wall-clock duration of the run, supplied by the caller.
 * @returns The flattened benchmark result for this fixture.
 */
var toFixtureBenchmarkResult = (fixture, result, elapsedMs) => {
  const [firstPartial] = [result.partialEvents[0]];
  const [firstFinal] = [result.finalEvents[0]];
  const [firstEndOfTurn] = [result.endOfTurnEvents[0]];

  // Latency measured from the start of the run; undefined when the event never arrived.
  const sinceStart = (event) => (event ? event.receivedAt - result.startedAt : undefined);
  // Latency measured from the end of speech; only defined for numeric timestamps.
  const sinceSpeechEnd = (event) => {
    const receivedAt = event?.receivedAt;
    return typeof receivedAt === "number"
      ? Math.max(0, receivedAt - result.speechEndedAt)
      : undefined;
  };

  const timeToFirstPartialMs = sinceStart(firstPartial);
  const timeToFirstFinalMs = sinceStart(firstFinal);
  const timeToEndOfTurnMs = sinceStart(firstEndOfTurn);
  const postSpeechTimeToFirstFinalMs = sinceSpeechEnd(firstFinal);
  const postSpeechTimeToEndOfTurnMs = sinceSpeechEnd(firstEndOfTurn);
  const expectedTerms = scoreExpectedTerms(result.finalText, fixture.expectedTerms);
  const speakerTurns = scoreSpeakerTurns(fixture, result);

  return {
    accuracy: result.accuracy,
    closeCount: result.closeEvents.length,
    difficulty: fixture.difficulty,
    elapsedMs,
    endOfTurnCount: result.endOfTurnEvents.length,
    errorCount: result.errorEvents.length,
    expectedTerms,
    finalCount: result.finalEvents.length,
    finalText: result.finalText,
    fixtureId: fixture.id,
    // Every final beyond the first counts as one fragment.
    fragmentationCount: Math.max(0, result.finalEvents.length - 1),
    group: resolveFixtureEnvironment(fixture),
    passes:
      result.errorEvents.length === 0 &&
      result.finalText.trim().length > 0 &&
      result.accuracy.passesThreshold &&
      (speakerTurns ? speakerTurns.passes : true),
    partialCount: result.partialEvents.length,
    speakerTurns,
    postSpeechTimeToEndOfTurnMs,
    postSpeechTimeToFirstFinalMs,
    tags: fixture.tags ?? [],
    timeToEndOfTurnMs,
    timeToFirstFinalMs,
    timeToFirstPartialMs,
    title: fixture.title
  };
};
44064
/**
 * Aggregates per-fixture benchmark results into a single adapter summary.
 *
 * Averages skip fixtures where the metric is missing (see `average2`). Rates,
 * counts and recall fall back to 0 when nothing could be averaged; the latency
 * and speaker-turn averages are left `undefined` in that case.
 *
 * @param adapterId Identifier of the adapter the results belong to.
 * @param fixtures Per-fixture benchmark results.
 * @returns The adapter-level summary, including per-group summaries.
 */
var summarizeSTTBenchmark = (adapterId, fixtures) => {
  // Rounded mean of one selected metric; `digits` undefined means the default precision.
  const roundedMean = (select, digits) => roundMetric4(average2(fixtures.map(select)), digits);
  const countWhere = (predicate) => fixtures.filter(predicate).length;

  const fixtureCount = fixtures.length;
  const passCount = countWhere((fixture) => fixture.passes);
  const meanWordErrorRate = average2(fixtures.map((fixture) => fixture.accuracy.wordErrorRate));

  let totalErrorCount = 0;
  for (const fixture of fixtures) {
    totalErrorCount += fixture.errorCount;
  }

  return {
    adapterId,
    averageCharErrorRate: roundedMean((fixture) => fixture.accuracy.charErrorRate) ?? 0,
    averageElapsedMs: roundedMean((fixture) => fixture.elapsedMs, 2) ?? 0,
    averageEndOfTurnCount: roundedMean((fixture) => fixture.endOfTurnCount, 2) ?? 0,
    averageFinalCount: roundedMean((fixture) => fixture.finalCount, 2) ?? 0,
    averageSpeakerTurnMatchRate: roundedMean((fixture) => fixture.speakerTurns?.patternMatchRate),
    averageTermRecall: roundedMean((fixture) => fixture.expectedTerms.recall) ?? 0,
    averagePostSpeechTimeToEndOfTurnMs: roundedMean((fixture) => fixture.postSpeechTimeToEndOfTurnMs, 2),
    averagePostSpeechTimeToFirstFinalMs: roundedMean((fixture) => fixture.postSpeechTimeToFirstFinalMs, 2),
    averageTimeToEndOfTurnMs: roundedMean((fixture) => fixture.timeToEndOfTurnMs, 2),
    averageTimeToFirstFinalMs: roundedMean((fixture) => fixture.timeToFirstFinalMs, 2),
    averageTimeToFirstPartialMs: roundedMean((fixture) => fixture.timeToFirstPartialMs, 2),
    averageWordErrorRate: roundMetric4(meanWordErrorRate) ?? 0,
    fixtureCount,
    fixturesWithErrors: countWhere((fixture) => fixture.errorCount > 0),
    fixturesWithFragmentation: countWhere((fixture) => fixture.fragmentationCount > 0),
    groupSummaries: calculateGroupSummary(fixtures),
    passCount,
    passRate: fixtureCount > 0 ? roundMetric4(passCount / fixtureCount) ?? 0 : 0,
    totalErrorCount,
    // With no fixtures there is no accuracy to report, so this is 0 rather than 1.
    wordAccuracyRate: fixtureCount > 0 ? roundMetric4(1 - (meanWordErrorRate ?? 0)) ?? 0 : 0
  };
};
44091
/**
 * Checks a benchmark report against optional acceptance thresholds.
 *
 * Supported threshold keys: `overallPassRate`, `termRecall`,
 * `wordAccuracyRate`, and `groupPassRate` (a map from group name to
 * `{ passRate?, wordAccuracyRate? }`). Each threshold that is set and not met
 * adds one human-readable message to `failures`.
 *
 * The returned `score` is a weighted blend of pass rate (0.45), word accuracy
 * (0.35) and term recall (0.2), rounded to three decimals. It is computed
 * regardless of the thresholds.
 *
 * @param report Benchmark report to evaluate.
 * @param thresholds Acceptance thresholds; every key is optional.
 * @returns `{ adapterId, failures, passed, score }`.
 */
var evaluateSTTBenchmarkAcceptance = (report, thresholds = {}) => {
  const asPercent = (rate) => `${(rate * 100).toFixed(2)}%`;
  const problems = [];

  const requiredPassRate = thresholds.overallPassRate;
  if (requiredPassRate !== undefined && report.summary.passRate < requiredPassRate) {
    problems.push(`overall passRate ${asPercent(report.summary.passRate)} below ${asPercent(requiredPassRate)}`);
  }

  const requiredTermRecall = thresholds.termRecall;
  if (requiredTermRecall !== undefined && report.summary.averageTermRecall < requiredTermRecall) {
    problems.push(`overall term recall ${report.summary.averageTermRecall.toFixed(4)} below ${requiredTermRecall.toFixed(4)}`);
  }

  const requiredWordAccuracy = thresholds.wordAccuracyRate;
  if (requiredWordAccuracy !== undefined && report.summary.wordAccuracyRate < requiredWordAccuracy) {
    problems.push(`overall word accuracy ${asPercent(report.summary.wordAccuracyRate)} below ${asPercent(requiredWordAccuracy)}`);
  }

  const perGroup = thresholds.groupPassRate;
  if (perGroup) {
    for (const groupSummary of report.summary.groupSummaries) {
      const groupThreshold = perGroup[groupSummary.group];
      // Groups without a configured threshold are not gated.
      if (groupThreshold) {
        if (groupThreshold.passRate !== undefined && groupSummary.passRate < groupThreshold.passRate) {
          problems.push(`${groupSummary.group} passRate ${asPercent(groupSummary.passRate)} below ${asPercent(groupThreshold.passRate)}`);
        }
        if (groupThreshold.wordAccuracyRate !== undefined && groupSummary.wordAccuracyRate < groupThreshold.wordAccuracyRate) {
          problems.push(`${groupSummary.group} wordAccuracy ${asPercent(groupSummary.wordAccuracyRate)} below ${asPercent(groupThreshold.wordAccuracyRate)}`);
        }
      }
    }
  }

  const weighted =
    report.summary.passRate * 0.45 +
    report.summary.wordAccuracyRate * 0.35 +
    report.summary.averageTermRecall * 0.2;

  return {
    adapterId: report.adapterId,
    failures: problems,
    passed: problems.length === 0,
    score: roundMetric4(weighted, 3) ?? 0
  };
};
44129
/**
 * Picks the best adapter per headline metric across several benchmark reports.
 *
 * Ties are resolved in favour of the report that appears first, because a
 * later entry only replaces the current winner when it is strictly better.
 *
 * @param reports Benchmark reports, each with `adapterId` and `summary`.
 * @returns The `{ adapterId, summary }` entries plus the winners by pass rate
 *   (highest), term recall (highest) and word error rate (lowest). Winners are
 *   `undefined` when `reports` is empty.
 */
var compareSTTBenchmarks = (reports) => {
  const entries = reports.map((report) => {
    return { adapterId: report.adapterId, summary: report.summary };
  });

  const pickBest = (metricOf, direction) => {
    let winner = undefined;
    for (const candidate of entries) {
      if (!winner) {
        winner = candidate;
        continue;
      }
      const challenger = metricOf(candidate);
      const incumbent = metricOf(winner);
      const isBetter = direction === "max" ? challenger > incumbent : challenger < incumbent;
      if (isBetter) {
        winner = candidate;
      }
    }
    return winner;
  };

  return {
    bestByPassRate: pickBest((entry) => entry.summary.passRate, "max"),
    bestByTermRecall: pickBest((entry) => entry.summary.averageTermRecall, "max"),
    bestByWordErrorRate: pickBest((entry) => entry.summary.averageWordErrorRate, "min"),
    entries
  };
};
44152
/**
 * Runs every fixture against one STT adapter, one at a time, and summarises
 * the results.
 *
 * Each fixture is run with the shared `options` overlaid by any entry found
 * under `options.fixtureOptions[fixture.id]`. Elapsed time per fixture is
 * measured with `Date.now()` around the run.
 *
 * @param adapter The STT adapter under test.
 * @param adapterId Identifier recorded in the report and summary.
 * @param fixtures Fixtures to run, in order.
 * @param options Shared run options; defaults to an empty object.
 * @returns A report with the per-fixture results, a timestamp and the summary.
 */
var runSTTAdapterBenchmark = async ({
  adapter,
  adapterId,
  fixtures,
  options = {}
}) => {
  const collected = [];
  for (const fixture of fixtures) {
    const began = Date.now();
    const overrides = options.fixtureOptions?.[fixture.id] ?? {};
    const outcome = await runSTTAdapterFixture(adapter, fixture, { ...options, ...overrides });
    const elapsedMs = Date.now() - began;
    collected.push(toFixtureBenchmarkResult(fixture, outcome, elapsedMs));
  }
  return {
    adapterId,
    fixtures: collected,
    generatedAt: Date.now(),
    summary: summarizeSTTBenchmark(adapterId, collected)
  };
};
44174
/**
 * Aggregates several benchmark runs of the same adapter into per-fixture
 * stability statistics and a series-level summary.
 *
 * Results are grouped by `fixtureId` across all reports. For each fixture the
 * average / best / worst word error rate, the pass rate across runs and the
 * average elapsed time are reported. `group`, `tags` and `title` are taken
 * from the first result seen for that fixture.
 *
 * Best and worst word error rates are computed over finite numeric values
 * only, matching how the average is computed. Previously a single missing or
 * non-finite value turned `Math.min` / `Math.max` into NaN, which was then
 * reported as 0.
 *
 * @param input `{ adapterId, reports }` where each report has `fixtures` and
 *   `summary.passCount`.
 * @returns The per-fixture aggregates and the series summary. A fixture counts
 *   as "flaky" when its pass rate is strictly between 0 and 1, and as "stable"
 *   when it is exactly 1.
 */
var summarizeSTTBenchmarkSeries = (input) => {
  const isFiniteNumber = (value) => typeof value === "number" && Number.isFinite(value);

  const fixtureMap = new Map();
  for (const report of input.reports) {
    for (const fixture of report.fixtures) {
      const entries = fixtureMap.get(fixture.fixtureId) ?? [];
      entries.push(fixture);
      fixtureMap.set(fixture.fixtureId, entries);
    }
  }

  const fixtureAggregates = [...fixtureMap.entries()].map(([fixtureId, results]) => {
    const wordErrorRates = results.map((result) => result.accuracy.wordErrorRate);
    // Only measured values take part in best/worst; reduce avoids spreading
    // a potentially long array into the argument list.
    const measuredRates = wordErrorRates.filter(isFiniteNumber);
    const bestWordErrorRate = measuredRates.length > 0
      ? measuredRates.reduce((lowest, rate) => Math.min(lowest, rate))
      : 0;
    const worstWordErrorRate = measuredRates.length > 0
      ? measuredRates.reduce((highest, rate) => Math.max(highest, rate))
      : 0;
    const passCount = results.filter((result) => result.passes).length;
    const sample = results[0];
    return {
      averageElapsedMs: roundMetric4(average2(results.map((result) => result.elapsedMs)), 2) ?? 0,
      averagePassRate: roundMetric4(results.length > 0 ? passCount / results.length : 0) ?? 0,
      averageWordErrorRate: roundMetric4(average2(wordErrorRates)) ?? 0,
      bestWordErrorRate: roundMetric4(bestWordErrorRate) ?? 0,
      fixtureId,
      group: sample.group,
      passCount,
      runCount: results.length,
      tags: sample.tags,
      title: sample.title,
      worstWordErrorRate: roundMetric4(worstWordErrorRate) ?? 0
    };
  });

  const totalRunCount = input.reports.reduce((sum, report) => sum + report.fixtures.length, 0);
  const totalPassCount = input.reports.reduce((sum, report) => sum + report.summary.passCount, 0);

  return {
    adapterId: input.adapterId,
    fixtures: fixtureAggregates,
    generatedAt: Date.now(),
    runCount: input.reports.length,
    summary: {
      adapterId: input.adapterId,
      averageElapsedMs: roundMetric4(average2(fixtureAggregates.map((fixture) => fixture.averageElapsedMs)), 2) ?? 0,
      averagePassRate: roundMetric4(average2(fixtureAggregates.map((fixture) => fixture.averagePassRate))) ?? 0,
      averageWordErrorRate: roundMetric4(average2(fixtureAggregates.map((fixture) => fixture.averageWordErrorRate))) ?? 0,
      fixtureCount: fixtureAggregates.length,
      flakyFixtureCount: fixtureAggregates.filter((fixture) => fixture.averagePassRate > 0 && fixture.averagePassRate < 1).length,
      generatedRunCount: input.reports.length,
      stableFixtureCount: fixtureAggregates.filter((fixture) => fixture.averagePassRate === 1).length,
      totalPassCount,
      totalRunCount
    }
  };
};
44222
/**
 * Runs the same benchmark several times in sequence and summarises the series.
 *
 * `runs` is floored and clamped to a minimum of one. A missing, non-numeric or
 * non-finite `runs` also falls back to a single run. Previously such a value
 * produced NaN, which skipped the loop entirely and returned an empty series,
 * and `Infinity` never terminated.
 *
 * @param adapter The STT adapter under test.
 * @param adapterId Identifier recorded in the reports and the series summary.
 * @param fixtures Fixtures to run on every iteration.
 * @param options Options forwarded unchanged to every benchmark run.
 * @param runs Requested number of benchmark runs.
 * @returns The series summary produced by `summarizeSTTBenchmarkSeries`.
 */
var runSTTAdapterBenchmarkSeries = async ({
  adapter,
  adapterId,
  fixtures,
  options = {},
  runs
}) => {
  const reports = [];
  const requestedRuns = Math.floor(runs);
  // NaN / Infinity cannot drive the loop safely, so they mean "one run".
  const runCount = Number.isFinite(requestedRuns) ? Math.max(1, requestedRuns) : 1;
  for (let runIndex = 0; runIndex < runCount; runIndex += 1) {
    reports.push(await runSTTAdapterBenchmark({
      adapter,
      adapterId,
      fixtures,
      options
    }));
  }
  return summarizeSTTBenchmarkSeries({
    adapterId,
    reports
  });
};
44244
+
44245
+ // src/multilingualProof.ts
44246
/**
 * Plain arithmetic mean of `values`.
 *
 * Unlike `average2`, every entry is summed as-is, so callers are expected to
 * pass numbers only.
 *
 * @param values Numbers to average.
 * @returns The mean, or 0 for an empty array.
 */
var average3 = (values) => {
  const count = values.length;
  if (count === 0) {
    return 0;
  }
  let running = 0;
  for (let index = 0; index < count; index += 1) {
    running += values[index];
  }
  return running / count;
};
44254
/**
 * Reduces a set of fixture benchmark results to the metrics used by the
 * multilingual proof gate.
 *
 * A missing word error rate or term recall on a result counts as 0. Word
 * accuracy is averaged per fixture as `1 - wordErrorRate`.
 *
 * @param results Fixture benchmark results (for one language, or for a whole adapter).
 * @returns Averages, counts and pass rate; all zeros when `results` is empty.
 */
var computeMetrics = (results) => {
  const fixtureCount = results.length;
  if (fixtureCount === 0) {
    return {
      averageTermRecall: 0,
      averageWordAccuracyRate: 0,
      averageWordErrorRate: 0,
      fixtureCount: 0,
      passCount: 0,
      passRate: 0
    };
  }

  const errorRates = [];
  const accuracyRates = [];
  const recalls = [];
  let passCount = 0;
  for (const result of results) {
    const wordErrorRate = result.accuracy.wordErrorRate ?? 0;
    errorRates.push(wordErrorRate);
    accuracyRates.push(1 - wordErrorRate);
    recalls.push(result.expectedTerms.recall ?? 0);
    if (result.passes) {
      passCount += 1;
    }
  }

  return {
    averageTermRecall: average3(recalls),
    averageWordAccuracyRate: average3(accuracyRates),
    averageWordErrorRate: average3(errorRates),
    fixtureCount,
    passCount,
    passRate: passCount / fixtureCount
  };
};
44278
/**
 * Resolves the thresholds that apply to one language.
 *
 * The first `perLanguage` entry whose `language` matches case-insensitively
 * wins. Any threshold it leaves unset falls back to the corresponding value
 * in `defaults`. The label comes only from the per-language entry.
 *
 * @param language Language code to resolve thresholds for.
 * @param defaults Optional fallback thresholds.
 * @param perLanguage Optional per-language overrides.
 * @returns The effective thresholds, with `language` echoed back as given.
 */
var resolveLanguageThreshold = (language, defaults, perLanguage) => {
  let override = undefined;
  for (const candidate of perLanguage ?? []) {
    if (candidate.language.toLowerCase() === language.toLowerCase()) {
      override = candidate;
      break;
    }
  }
  const effective = (key) => override?.[key] ?? defaults?.[key];
  return {
    label: override?.label,
    language,
    maxAverageWordErrorRate: effective("maxAverageWordErrorRate"),
    minAverageWordAccuracyRate: effective("minAverageWordAccuracyRate"),
    minPassRate: effective("minPassRate"),
    minTermRecall: effective("minTermRecall")
  };
};
44289
/**
 * Evaluates one language's fixture results against its thresholds.
 *
 * Only thresholds that are set are checked. The word error rate is an upper
 * bound (budget); word accuracy, pass rate and term recall are lower bounds
 * (floors). Each violation adds one message prefixed with the language code.
 *
 * @param language Language code being evaluated.
 * @param fixtureResults Benchmark results for the fixtures of that language.
 * @param thresholds Effective thresholds for the language.
 * @returns The language report: applied thresholds, failures, fixture ids,
 *   label, metrics and an overall `passes` flag.
 */
var evaluateLanguage = (language, fixtureResults, thresholds) => {
  const metrics = computeMetrics(fixtureResults);
  const fixed3 = (value) => value.toFixed(3);

  // Checks are listed in the order their messages must appear.
  const checks = [
    {
      limit: thresholds.maxAverageWordErrorRate,
      actual: metrics.averageWordErrorRate,
      violated: (actual, limit) => actual > limit,
      message: (actual, limit) => `${language}: avg WER ${fixed3(actual)} exceeds budget ${fixed3(limit)}.`
    },
    {
      limit: thresholds.minAverageWordAccuracyRate,
      actual: metrics.averageWordAccuracyRate,
      violated: (actual, limit) => actual < limit,
      message: (actual, limit) => `${language}: avg WAR ${fixed3(actual)} below floor ${fixed3(limit)}.`
    },
    {
      limit: thresholds.minPassRate,
      actual: metrics.passRate,
      violated: (actual, limit) => actual < limit,
      message: (actual, limit) => `${language}: pass rate ${fixed3(actual)} below floor ${fixed3(limit)}.`
    },
    {
      limit: thresholds.minTermRecall,
      actual: metrics.averageTermRecall,
      violated: (actual, limit) => actual < limit,
      message: (actual, limit) => `${language}: term recall ${fixed3(actual)} below floor ${fixed3(limit)}.`
    }
  ];

  const failures = [];
  for (const check of checks) {
    if (check.limit !== undefined && check.violated(check.actual, check.limit)) {
      failures.push(check.message(check.actual, check.limit));
    }
  }

  return {
    applied: thresholds,
    failures,
    fixtureIds: fixtureResults.map((result) => result.fixtureId),
    label: thresholds.label,
    language,
    metrics,
    passes: failures.length === 0
  };
};
44314
/**
 * Resolves the fixture set for a multilingual proof run.
 *
 * Explicit `options.fixtures` take precedence and are copied so the caller's
 * array is never handed out; otherwise fixtures are loaded with
 * `loadVoiceTestFixtures(options.fixtureDirectories)`.
 *
 * `options.filter`, when provided, is applied to whichever source was used.
 * Previously the filter was silently ignored whenever `fixtures` was passed
 * explicitly.
 *
 * @param options Proof options; reads `fixtures`, `fixtureDirectories` and `filter`.
 * @returns The fixtures to benchmark.
 */
var collectFixtures = async (options) => {
  const candidates = options.fixtures !== undefined
    ? options.fixtures.slice()
    : await loadVoiceTestFixtures(options.fixtureDirectories);
  return options.filter ? candidates.filter(options.filter) : candidates;
};
44321
/**
 * Buckets fixture benchmark results by the language of their fixture.
 *
 * The language is looked up through the fixture id. Fixtures without a
 * `language`, and results whose fixture id is not among `fixtures`, land in
 * the "unknown" bucket.
 *
 * @param results Fixture benchmark results (each with `fixtureId`).
 * @param fixtures Fixture definitions (each with `id` and optional `language`).
 * @returns A Map from language code to the results for that language, with
 *   keys in order of first appearance in `results`.
 */
var groupByLanguage = (results, fixtures) => {
  const languageById = new Map();
  fixtures.forEach((fixture) => {
    languageById.set(fixture.id, fixture.language ?? "unknown");
  });

  const buckets = new Map();
  results.forEach((result) => {
    const language = languageById.get(result.fixtureId) ?? "unknown";
    const existing = buckets.get(language);
    if (existing === undefined || existing === null) {
      buckets.set(language, [result]);
    } else {
      existing.push(result);
    }
  });
  return buckets;
};
44335
/**
 * Runs every configured STT adapter against the multilingual fixture set and
 * gates the results per language.
 *
 * Adapters are benchmarked one after another on the same fixtures. For each
 * adapter the results are grouped by fixture language and every language that
 * has at least one result is evaluated against its resolved thresholds. An
 * adapter passes when none of its languages reported a failure, and the proof
 * passes when every adapter passes.
 *
 * @param options Proof options (adapters, thresholds and fixture source).
 * @returns The proof report with per-adapter reports and a summary.
 * @throws Error when no adapters are configured or no fixtures are found.
 */
var runVoiceMultilingualProof = async (options) => {
  if (options.adapters.length === 0) {
    throw new Error("runVoiceMultilingualProof requires at least one adapter entry.");
  }
  const fixtures = await collectFixtures(options);
  if (fixtures.length === 0) {
    throw new Error("runVoiceMultilingualProof found zero fixtures. Did you set VOICE_FIXTURE_DIR or pass fixtures/fixtureDirectories?");
  }

  // Fixtures without a language are tracked under the "unknown" code.
  const languageCodes = new Set(fixtures.map((fixture) => fixture.language ?? "unknown"));

  const adapterReports = [];
  for (const entry of options.adapters) {
    const benchmark = await runSTTAdapterBenchmark({
      adapter: entry.adapter,
      adapterId: entry.adapterId,
      fixtures,
      options: entry.benchmarkOptions
    });
    const resultsByLanguage = groupByLanguage(benchmark.fixtures, fixtures);

    const languageReports = [];
    languageCodes.forEach((language) => {
      const languageResults = resultsByLanguage.get(language) ?? [];
      if (languageResults.length > 0) {
        const thresholds = resolveLanguageThreshold(language, options.defaultThresholds, options.perLanguage);
        languageReports.push(evaluateLanguage(language, languageResults, thresholds));
      }
    });

    const overall = computeMetrics(benchmark.fixtures);
    const failures = [];
    for (const languageReport of languageReports) {
      failures.push(...languageReport.failures);
    }

    adapterReports.push({
      adapterId: entry.adapterId,
      benchmark,
      failures,
      fixtureCount: benchmark.fixtures.length,
      languageReports,
      overall,
      passes: failures.length === 0
    });
  }

  const failedAdapters = [];
  for (const adapterReport of adapterReports) {
    if (!adapterReport.passes) {
      failedAdapters.push(adapterReport.adapterId);
    }
  }

  return {
    adapters: adapterReports,
    generatedAt: Date.now(),
    passes: failedAdapters.length === 0,
    summary: {
      adapterCount: adapterReports.length,
      failedAdapters,
      fixtureCount: fixtures.length,
      languageCount: languageCodes.size
    }
  };
};
44386
/**
 * Renders a multilingual proof report as a Markdown document.
 *
 * The document has a header (timestamp, overall status, counts), an optional
 * list of failed adapters, and one section per adapter containing its overall
 * metrics, a per-language table and any failure messages.
 *
 * @param report Report produced by `runVoiceMultilingualProof`.
 * @returns The Markdown text, with lines joined by a newline.
 */
var renderVoiceMultilingualProofMarkdown = (report) => {
  const fixed3 = (value) => value.toFixed(3);
  const asPercent = (rate) => `${(rate * 100).toFixed(1)}%`;
  const verdict = (passes) => (passes ? "pass" : "fail");

  // Compact description of the thresholds that were applied to a language.
  const describeThresholds = (applied) => {
    const parts = [];
    if (applied.maxAverageWordErrorRate !== undefined) {
      parts.push(`WER<=${fixed3(applied.maxAverageWordErrorRate)}`);
    }
    if (applied.minAverageWordAccuracyRate !== undefined) {
      parts.push(`WAR>=${fixed3(applied.minAverageWordAccuracyRate)}`);
    }
    if (applied.minPassRate !== undefined) {
      parts.push(`pass>=${fixed3(applied.minPassRate)}`);
    }
    if (applied.minTermRecall !== undefined) {
      parts.push(`recall>=${fixed3(applied.minTermRecall)}`);
    }
    return parts.join(", ") || "\u2014";
  };

  const { summary } = report;
  const out = [];
  out.push(`# Voice Multilingual STT Proof`);
  out.push("");
  out.push(`Generated: ${new Date(report.generatedAt).toISOString()}`);
  out.push(`Status: ${report.passes ? "**PASS**" : "**FAIL**"}`);
  out.push(`Adapters: ${String(summary.adapterCount)}; Fixtures: ${String(summary.fixtureCount)}; Languages: ${String(summary.languageCount)}.`);
  out.push("");

  if (report.summary.failedAdapters.length > 0) {
    out.push(`Failed adapters: ${report.summary.failedAdapters.join(", ")}.`);
    out.push("");
  }

  for (const adapter of report.adapters) {
    out.push(`## ${adapter.adapterId} \u2014 ${verdict(adapter.passes)}`);
    out.push("");
    out.push(`- Fixtures: ${String(adapter.fixtureCount)}`);
    out.push(`- Avg WER: ${fixed3(adapter.overall.averageWordErrorRate)}`);
    out.push(`- Avg WAR: ${fixed3(adapter.overall.averageWordAccuracyRate)}`);
    out.push(`- Pass rate: ${asPercent(adapter.overall.passRate)}`);
    out.push("");
    out.push(`| Language | Fixtures | Avg WER | Avg WAR | Pass rate | Threshold | Status |`);
    out.push(`| --- | ---: | ---: | ---: | ---: | --- | --- |`);

    for (const language of adapter.languageReports) {
      const thresholdCell = describeThresholds(language.applied);
      const nameCell = `${language.language}${language.label ? ` (${language.label})` : ""}`;
      out.push(`| ${nameCell} | ${String(language.metrics.fixtureCount)} | ${fixed3(language.metrics.averageWordErrorRate)} | ${fixed3(language.metrics.averageWordAccuracyRate)} | ${asPercent(language.metrics.passRate)} | ${thresholdCell} | ${verdict(language.passes)} |`);
    }

    if (adapter.failures.length > 0) {
      out.push("");
      out.push("Failures:");
      adapter.failures.forEach((failure) => {
        out.push(`- ${failure}`);
      });
    }
    out.push("");
  }

  return out.join("\n");
};
44427
/**
 * Condenses a multilingual proof report into a single readiness check entry.
 *
 * - No adapters in the report: status "warn", value 0.
 * - No failed adapters: status "pass", value is the adapter count, and the
 *   detail lists each adapter's average WER and fixture count.
 * - Otherwise: status "fail", value is the number of failed adapters, and the
 *   detail names them followed by up to three failure messages per failing
 *   adapter.
 *
 * @param report Report produced by `runVoiceMultilingualProof`.
 * @param options Optional `label` (defaults to "Multilingual STT proof") and
 *   `baseHref`, which is passed through as `href`.
 * @returns The readiness check entry.
 */
var buildVoiceMultilingualProofReadinessCheck = (report, options = {}) => {
  const label = options.label ?? "Multilingual STT proof";
  const href = options.baseHref;

  if (report.adapters.length === 0) {
    return {
      detail: "No STT adapters were exercised against the multilingual corpus.",
      href,
      label,
      status: "warn",
      value: 0
    };
  }

  const failedAdapters = report.summary.failedAdapters;
  if (failedAdapters.length > 0) {
    const leadingFailures = [];
    for (const adapter of report.adapters) {
      if (!adapter.passes) {
        // Keep the detail short: at most three messages per failing adapter.
        leadingFailures.push(...adapter.failures.slice(0, 3));
      }
    }
    return {
      detail: `Failed adapters: ${failedAdapters.join(", ")}. ${leadingFailures.join(" ")}`,
      href,
      label,
      status: "fail",
      value: failedAdapters.length
    };
  }

  const perAdapter = [];
  for (const adapter of report.adapters) {
    perAdapter.push(`${adapter.adapterId}: WER ${adapter.overall.averageWordErrorRate.toFixed(3)} across ${String(adapter.fixtureCount)} fixtures`);
  }
  return {
    detail: perAdapter.join("; "),
    href,
    label,
    status: "pass",
    value: report.summary.adapterCount
  };
};
43376
44457
  export {
43377
44458
  writeVoiceProofPack,
43378
44459
  writeVoiceMediaPipelineArtifacts,
@@ -43447,6 +44528,7 @@ export {
43447
44528
  runVoiceProfileSwitchPolicyProof,
43448
44529
  runVoicePhoneAgentProductionSmokeContract,
43449
44530
  runVoiceOutcomeContractSuite,
44531
+ runVoiceMultilingualProof,
43450
44532
  runVoiceCommandProofTargets,
43451
44533
  runVoiceCommandProofTarget,
43452
44534
  runVoiceCampaignReadinessProof,
@@ -43535,6 +44617,7 @@ export {
43535
44617
  renderVoiceOperationalStatusHTML,
43536
44618
  renderVoiceObservabilityExportReplayHTML,
43537
44619
  renderVoiceObservabilityExportMarkdown,
44620
+ renderVoiceMultilingualProofMarkdown,
43538
44621
  renderVoiceMonitorMarkdown,
43539
44622
  renderVoiceMonitorHTML,
43540
44623
  renderVoiceMediaPipelineMarkdown,
@@ -44068,6 +45151,7 @@ export {
44068
45151
  buildVoiceObservabilityExportDeliveryHistory,
44069
45152
  buildVoiceObservabilityExport,
44070
45153
  buildVoiceObservabilityArtifactIndex,
45154
+ buildVoiceMultilingualProofReadinessCheck,
44071
45155
  buildVoiceMonitorRunReport,
44072
45156
  buildVoiceMediaPipelineReport,
44073
45157
  buildVoiceMediaPipelineReadinessChecks,
@@ -0,0 +1,77 @@
1
+ import type { STTAdapter } from "./types";
2
+ import { type VoiceTestFixture } from "./testing/fixtures";
3
+ import { type VoiceSTTBenchmarkOptions, type VoiceSTTBenchmarkReport } from "./testing/benchmark";
4
/**
 * Language code used to bucket fixtures and look up thresholds.
 * Threshold lookup compares codes case-insensitively; fixtures without a
 * language are reported under the code "unknown".
 */
export type VoiceMultilingualLanguageCode = string;
5
/**
 * Thresholds for a single language. Every threshold is optional; an unset
 * threshold is not checked for that language unless a default supplies it.
 */
export type VoiceMultilingualProofLanguageThresholds = {
    /** Display name shown next to the language code in the Markdown report. */
    label?: string;
    /** Language code these thresholds apply to. */
    language: VoiceMultilingualLanguageCode;
    /** Upper bound: the language fails when its average WER exceeds this value. */
    maxAverageWordErrorRate?: number;
    /** Lower bound: the language fails when its average word accuracy is below this value. */
    minAverageWordAccuracyRate?: number;
    /** Lower bound on the fraction of passing fixtures (0-1). */
    minPassRate?: number;
    /** Lower bound on the average expected-term recall. */
    minTermRecall?: number;
};
13
/**
 * Fallback thresholds, used for any threshold a language's own entry leaves
 * unset (or for languages with no entry at all).
 */
export type VoiceMultilingualProofDefaultThresholds = Omit<VoiceMultilingualProofLanguageThresholds, "label" | "language">;
14
/** One STT adapter to exercise in a multilingual proof run. */
export type VoiceMultilingualProofAdapterEntry = {
    /** The adapter instance under test. */
    adapter: STTAdapter;
    /** Identifier used for this adapter throughout the report. */
    adapterId: string;
    /** Options forwarded to the benchmark run for this adapter. */
    benchmarkOptions?: VoiceSTTBenchmarkOptions;
};
19
/** Options accepted by `runVoiceMultilingualProof`. */
export type VoiceMultilingualProofOptions = {
    /** Adapters to benchmark; the run throws when this list is empty. */
    adapters: readonly VoiceMultilingualProofAdapterEntry[];
    /** Thresholds applied where a language has no explicit value. */
    defaultThresholds?: VoiceMultilingualProofDefaultThresholds;
    /** Predicate used to narrow the fixture set. */
    filter?: (fixture: VoiceTestFixture) => boolean;
    /** Passed to the fixture loader when `fixtures` is not provided. */
    fixtureDirectories?: string | readonly string[];
    /** Explicit fixtures; when provided, nothing is loaded from disk. */
    fixtures?: readonly VoiceTestFixture[];
    /** Per-language threshold overrides; the first matching entry wins. */
    perLanguage?: readonly VoiceMultilingualProofLanguageThresholds[];
};
27
/**
 * Metrics aggregated over a set of fixture results (one language, or a whole
 * adapter). All values are 0 when the set is empty.
 */
export type VoiceMultilingualProofLanguageMetrics = {
    /** Mean expected-term recall; a missing recall counts as 0. */
    averageTermRecall: number;
    /** Mean of `1 - wordErrorRate` per fixture. */
    averageWordAccuracyRate: number;
    /** Mean word error rate; a missing rate counts as 0. */
    averageWordErrorRate: number;
    /** Number of fixture results aggregated. */
    fixtureCount: number;
    /** Number of fixture results that passed. */
    passCount: number;
    /** `passCount / fixtureCount`. */
    passRate: number;
};
35
/** Outcome of gating one language for one adapter. */
export type VoiceMultilingualProofLanguageReport = {
    /** The effective thresholds that were checked. */
    applied: VoiceMultilingualProofLanguageThresholds;
    /** One message per violated threshold; empty when the language passes. */
    failures: readonly string[];
    /** Ids of the fixtures that contributed to this language. */
    fixtureIds: readonly string[];
    /** Display label taken from the applied thresholds, if any. */
    label?: string;
    /** Language code this report covers. */
    language: VoiceMultilingualLanguageCode;
    /** Aggregated metrics for the language. */
    metrics: VoiceMultilingualProofLanguageMetrics;
    /** True when `failures` is empty. */
    passes: boolean;
};
44
/** Outcome of the proof for a single adapter. */
export type VoiceMultilingualProofAdapterReport = {
    /** Identifier of the adapter. */
    adapterId: string;
    /** The full benchmark report the proof was derived from. */
    benchmark: VoiceSTTBenchmarkReport;
    /** All failure messages from this adapter's language reports. */
    failures: readonly string[];
    /** Number of fixture results in the benchmark. */
    fixtureCount: number;
    /** One report per language that had at least one fixture result. */
    languageReports: readonly VoiceMultilingualProofLanguageReport[];
    /** Metrics aggregated over all of the adapter's fixture results. */
    overall: VoiceMultilingualProofLanguageMetrics;
    /** True when `failures` is empty. */
    passes: boolean;
};
53
/** Result of `runVoiceMultilingualProof`. */
export type VoiceMultilingualProofReport = {
    /** One report per adapter, in the order the adapters were configured. */
    adapters: readonly VoiceMultilingualProofAdapterReport[];
    /** Creation time as returned by `Date.now()` (epoch milliseconds). */
    generatedAt: number;
    /** True when no adapter failed. */
    passes: boolean;
    summary: {
        /** Number of adapters that were benchmarked. */
        adapterCount: number;
        /** Ids of the adapters that did not pass. */
        failedAdapters: readonly string[];
        /** Number of fixtures each adapter was run against. */
        fixtureCount: number;
        /** Number of distinct fixture languages, counting "unknown". */
        languageCount: number;
    };
};
64
/**
 * Benchmarks every configured adapter against the fixture set and gates the
 * results per language.
 *
 * @throws Error when `options.adapters` is empty or no fixtures are found.
 */
export declare const runVoiceMultilingualProof: (options: VoiceMultilingualProofOptions) => Promise<VoiceMultilingualProofReport>;
65
/**
 * Renders a proof report as Markdown: a header, then one section per adapter
 * with overall metrics, a per-language table and any failure messages.
 */
export declare const renderVoiceMultilingualProofMarkdown: (report: VoiceMultilingualProofReport) => string;
66
/** Options for `buildVoiceMultilingualProofReadinessCheck`. */
export type VoiceMultilingualProofReadinessOptions = {
    /** Passed through unchanged as the check's `href`. */
    baseHref?: string;
    /** Label for the check; defaults to "Multilingual STT proof". */
    label?: string;
};
70
/** A single readiness check entry summarising a multilingual proof report. */
export type VoiceMultilingualProofReadinessCheck = {
    /** Human-readable explanation of the status. */
    detail: string;
    /** Optional link, taken from the `baseHref` option. */
    href?: string;
    /** Display label of the check. */
    label: string;
    /** "warn" when no adapters ran, "pass" when none failed, otherwise "fail". */
    status: "fail" | "pass" | "warn";
    /** 0 for "warn", the adapter count for "pass", the failed-adapter count for "fail". */
    value?: number | string;
};
77
/**
 * Condenses a proof report into one readiness check entry whose status is
 * "warn" (no adapters), "pass" (no failed adapters) or "fail".
 */
export declare const buildVoiceMultilingualProofReadinessCheck: (report: VoiceMultilingualProofReport, options?: VoiceMultilingualProofReadinessOptions) => VoiceMultilingualProofReadinessCheck;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@absolutejs/voice",
3
- "version": "0.0.22-beta.471",
3
+ "version": "0.0.22-beta.472",
4
4
  "description": "Voice primitives and Elysia plugin for AbsoluteJS",
5
5
  "repository": {
6
6
  "type": "git",