@absolutejs/voice 0.0.22-beta.471 → 0.0.22-beta.472
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +41 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +1084 -0
- package/dist/multilingualProof.d.ts +77 -0
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -8,6 +8,47 @@ Use it when you want Vapi/Retell/Bland-style voice-agent capability, but you wan
|
|
|
8
8
|
|
|
9
9
|
## What's new
|
|
10
10
|
|
|
11
|
+
### 0.0.22-beta.472 · Phase 6 — multilingual STT proof gate
|
|
12
|
+
|
|
13
|
+
`runVoiceMultilingualProof(...)` turns the `voice-fixtures-multilingual` corpus (FLEURS + BSC Catalan-Spanish code-switch + CoSHE Hindi-English code-switch) into a gateable readiness/proof artifact. Buyers evaluating a Vapi replacement can now run any combination of STT adapters against the multilingual corpus and assert per-language WER / pass-rate / term-recall budgets in CI.
|
|
14
|
+
|
|
15
|
+
```ts
|
|
16
|
+
import {
|
|
17
|
+
buildVoiceMultilingualProofReadinessCheck,
|
|
18
|
+
renderVoiceMultilingualProofMarkdown,
|
|
19
|
+
runVoiceMultilingualProof,
|
|
20
|
+
} from "@absolutejs/voice";
|
|
21
|
+
import { deepgram } from "@absolutejs/voice-deepgram";
|
|
22
|
+
import { speechmatics } from "@absolutejs/voice-speechmatics";
|
|
23
|
+
import { soniox } from "@absolutejs/voice-soniox";
|
|
24
|
+
|
|
25
|
+
const report = await runVoiceMultilingualProof({
|
|
26
|
+
adapters: [
|
|
27
|
+
{ adapter: deepgram({ apiKey, model: "nova-3" }), adapterId: "deepgram-nova3" },
|
|
28
|
+
{ adapter: speechmatics({ apiKey, region: "eu2" }), adapterId: "speechmatics-enhanced" },
|
|
29
|
+
{ adapter: soniox({ apiKey, enableLanguageIdentification: true }), adapterId: "soniox" },
|
|
30
|
+
],
|
|
31
|
+
defaultThresholds: { maxAverageWordErrorRate: 0.30, minPassRate: 0.7 },
|
|
32
|
+
perLanguage: [
|
|
33
|
+
{ language: "ca-es", label: "Catalan-Spanish code-switch", maxAverageWordErrorRate: 0.45 },
|
|
34
|
+
{ language: "hi-en", label: "Hindi-English code-switch", maxAverageWordErrorRate: 0.50 },
|
|
35
|
+
],
|
|
36
|
+
});
|
|
37
|
+
|
|
38
|
+
const readiness = buildVoiceMultilingualProofReadinessCheck(report, {
|
|
39
|
+
baseHref: "/voice/multilingual-proof",
|
|
40
|
+
});
|
|
41
|
+
// drop `readiness` into your VoiceProductionReadinessReport.checks array
|
|
42
|
+
|
|
43
|
+
await Bun.write("docs/multilingual-proof.md", renderVoiceMultilingualProofMarkdown(report));
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
Highlights:
|
|
47
|
+
- Loads fixtures from `VOICE_FIXTURE_DIR` (or any caller-supplied directory list / pre-loaded `VoiceTestFixture[]`), filters by an optional predicate, and runs each adapter through the existing `runSTTAdapterBenchmark` harness — no new STT plumbing required.
|
|
48
|
+
- Buckets fixture results by `fixture.language` and applies per-language thresholds (`maxAverageWordErrorRate`, `minAverageWordAccuracyRate`, `minPassRate`, `minTermRecall`) layered over caller-provided `defaultThresholds`.
|
|
49
|
+
- Returns a structured report (`adapters[].languageReports[]` with metrics + failures, plus per-adapter and global `passes` flags) plus a Markdown renderer for human review and a `VoiceProductionReadinessCheck`-shaped helper for drop-in readiness wiring.
|
|
50
|
+
- Pairs naturally with every STT adapter shipped in `voice-adapters` (deepgram, assemblyai, azure streaming, speechmatics, gladia, soniox, google-speech streaming + buffered, openai-whisper buffered).
|
|
51
|
+
|
|
11
52
|
### 0.0.22-beta.471 · Vapi parity — `fromVapiAssistantConfig` adapter
|
|
12
53
|
|
|
13
54
|
Mechanical migration from a Vapi Assistant JSON to a voice assistant. Pass the JSON dump (or a typed subset), provide a `modelFactory` that maps Vapi's `model.provider`+`model.model` to a voice `VoiceAgentModel`, and get back `{ assistant, tools, routeHints, unsupported }`.
|
package/dist/index.d.ts
CHANGED
|
@@ -223,4 +223,6 @@ export { shapeTelephonyAssistantText } from "./telephony/response";
|
|
|
223
223
|
export type { TelephonyResponseShapeMode, TelephonyResponseShapeOptions, } from "./telephony/response";
|
|
224
224
|
export { buildVoiceProofPackInput, buildVoiceProofPack, buildVoiceProofPackFromObservabilityExport, createVoiceProofPackBuildContext, createVoiceProofRefreshSnapshot, createVoiceProofPackStaleWhileRefreshSource, createVoiceProofPackArtifacts, createVoiceProofPackOperationsRecordSection, createVoiceProofPackProductionReadinessSection, createVoiceProofPackProviderSloSection, createVoiceProofPackRoutes, createVoiceProofPackSupportBundleSection, renderVoiceProofPackMarkdown, writeVoiceProofPack, } from "./proofPack";
|
|
225
225
|
export type { VoiceProofPack, VoiceProofPackBuildContext, VoiceProofPackBuildContextOptions, VoiceProofPackBuildTiming, VoiceProofPackEvidence, VoiceProofPackInput, VoiceProofPackInputBuilderLoaderInput, VoiceProofPackInputBuilderOperationsLoaderInput, VoiceProofPackInputBuilderOptions, VoiceProofPackInputBuilderSupportBundle, VoiceProofPackRefreshState, VoiceProofPackRefreshStatus, VoiceProofPackRoutesOptions, VoiceProofPackSection, VoiceProofPackSourceValue, VoiceProofPackStatus, VoiceProofPackStaleWhileRefreshSource, VoiceProofPackStaleWhileRefreshSourceOptions, VoiceProofPackWriteResult, VoiceProofRefreshSnapshot, VoiceProofRefreshSnapshotOptions, } from "./proofPack";
|
|
226
|
+
export { buildVoiceMultilingualProofReadinessCheck, renderVoiceMultilingualProofMarkdown, runVoiceMultilingualProof, } from "./multilingualProof";
|
|
227
|
+
export type { VoiceMultilingualLanguageCode, VoiceMultilingualProofAdapterEntry, VoiceMultilingualProofAdapterReport, VoiceMultilingualProofDefaultThresholds, VoiceMultilingualProofLanguageMetrics, VoiceMultilingualProofLanguageReport, VoiceMultilingualProofLanguageThresholds, VoiceMultilingualProofOptions, VoiceMultilingualProofReadinessCheck, VoiceMultilingualProofReadinessOptions, VoiceMultilingualProofReport, } from "./multilingualProof";
|
|
226
228
|
export * from "./types";
|
package/dist/index.js
CHANGED
|
@@ -43373,6 +43373,1087 @@ var createVoiceProofPackRoutes = (options) => {
|
|
|
43373
43373
|
}
|
|
43374
43374
|
return app;
|
|
43375
43375
|
};
|
|
43376
|
+
// src/testing/fixtures.ts
|
|
43377
|
+
import { resolve as resolve2 } from "path";
|
|
43378
|
+
var JARGON_FIXTURE_IDS = [
|
|
43379
|
+
"traveled-back-route-clean",
|
|
43380
|
+
"dialogue-two-clean",
|
|
43381
|
+
"dialogue-three-clean",
|
|
43382
|
+
"dialogue-two-noisy",
|
|
43383
|
+
"dialogue-three-mixed"
|
|
43384
|
+
];
|
|
43385
|
+
var DEFAULT_AUDIO_FORMAT = {
|
|
43386
|
+
channels: 1,
|
|
43387
|
+
container: "raw",
|
|
43388
|
+
encoding: "pcm_s16le",
|
|
43389
|
+
sampleRateHz: 16000
|
|
43390
|
+
};
|
|
43391
|
+
var DEFAULT_TELEPHONY_SAMPLE_RATE_HZ = 8000;
|
|
43392
|
+
var DEFAULT_MULTI_SPEAKER_SILENCE_MS = 350;
|
|
43393
|
+
var FIXTURE_DIR_CANDIDATES = [
|
|
43394
|
+
resolve2(import.meta.dir, "..", "..", "fixtures"),
|
|
43395
|
+
resolve2(import.meta.dir, "..", "..", "..", "fixtures"),
|
|
43396
|
+
resolve2(import.meta.dir, "..", "..", "..", "..", "fixtures")
|
|
43397
|
+
];
|
|
43398
|
+
var EXTERNAL_FIXTURE_ENV_KEYS = [
|
|
43399
|
+
"VOICE_FIXTURE_DIR",
|
|
43400
|
+
"VOICE_FIXTURE_DIRS"
|
|
43401
|
+
];
|
|
43402
|
+
var resolveFixtureDirectory = async () => {
|
|
43403
|
+
for (const candidate of FIXTURE_DIR_CANDIDATES) {
|
|
43404
|
+
if (await Bun.file(resolve2(candidate, "manifest.json")).exists()) {
|
|
43405
|
+
return candidate;
|
|
43406
|
+
}
|
|
43407
|
+
}
|
|
43408
|
+
throw new Error("Unable to locate the bundled voice test fixtures. Expected fixtures/manifest.json next to the package root.");
|
|
43409
|
+
};
|
|
43410
|
+
/**
 * Resolves the directory that holds the bundled voice fixtures.
 *
 * Thin public wrapper: it awaits `resolveFixtureDirectory()` and hands back
 * whatever that resolves to (and rejects when it rejects).
 */
var getVoiceFixtureDirectory = async () => {
  const bundledDirectory = await resolveFixtureDirectory();
  return bundledDirectory;
};
|
|
43411
|
+
/**
 * Drops blank entries and repeated entries from a list of directory strings.
 *
 * An entry is blank when it is empty after trimming. Entries are compared
 * exactly as given (no trimming), and the first occurrence wins, so the
 * original order is preserved.
 *
 * @param directories list of directory strings
 * @returns a new array with blanks and later duplicates removed
 */
var toUniqueDirectories = (directories) => {
  const alreadyKept = new Set();
  const unique = [];
  for (const directory of directories) {
    if (directory.trim().length === 0 || alreadyKept.has(directory)) {
      continue;
    }
    alreadyKept.add(directory);
    unique.push(directory);
  }
  return unique;
};
|
|
43412
|
+
/**
 * Splits a newline- or comma-separated directory list into its entries.
 *
 * Each entry is trimmed and empty entries are discarded. A `null` or
 * `undefined` value is treated as an empty string and yields `[]`.
 *
 * @param value raw list text, or null/undefined
 * @returns the trimmed, non-empty entries in their original order
 */
var splitFixtureDirectoryValue = (value) => {
  const entries = [];
  for (const piece of (value ?? "").split(/[\n,]/)) {
    const trimmed = piece.trim();
    if (trimmed.length > 0) {
      entries.push(trimmed);
    }
  }
  return entries;
};
|
|
43413
|
+
/**
 * Normalises the caller-supplied fixture input into an array of directories.
 *
 * - a string becomes a one-element array;
 * - an array is returned as-is (same reference);
 * - anything else yields its `directories` property, or `[]` when the input
 *   or that property is null/undefined.
 */
var resolveFixtureInputDirectories = (input) =>
  typeof input === "string"
    ? [input]
    : Array.isArray(input)
      ? input
      : input?.directories ?? [];
|
|
43422
|
+
/**
 * Decides whether the bundled fixtures should be loaded alongside any
 * caller-supplied directories.
 *
 * The answer is `false` only when `input` is a non-array object whose
 * `includeBundled` property is strictly `false`; every other input
 * (undefined, string, array, object without the flag) yields `true`.
 */
var shouldIncludeBundledFixtures = (input) => {
  const isOptionsObject = Boolean(input) && typeof input === "object" && !Array.isArray(input);
  return !(isOptionsObject && input.includeBundled === false);
};
|
|
43428
|
+
var resolveConfiguredFixtureDirectories = async (input) => {
|
|
43429
|
+
const directories = [
|
|
43430
|
+
...resolveFixtureInputDirectories(input),
|
|
43431
|
+
...EXTERNAL_FIXTURE_ENV_KEYS.flatMap((key) => splitFixtureDirectoryValue(process.env[key]))
|
|
43432
|
+
];
|
|
43433
|
+
const uniqueDirectories = toUniqueDirectories(directories.map((directory) => resolve2(directory)));
|
|
43434
|
+
for (const directory of uniqueDirectories) {
|
|
43435
|
+
const manifestExists = await Bun.file(resolve2(directory, "manifest.json")).exists();
|
|
43436
|
+
if (!manifestExists) {
|
|
43437
|
+
throw new Error(`Voice fixture directory "${directory}" is missing manifest.json.`);
|
|
43438
|
+
}
|
|
43439
|
+
}
|
|
43440
|
+
return uniqueDirectories;
|
|
43441
|
+
};
|
|
43442
|
+
/**
 * Produces the ordered list of fixture directories to load.
 *
 * Configured directories come from `resolveConfiguredFixtureDirectories`
 * (caller input plus the VOICE_FIXTURE_DIR / VOICE_FIXTURE_DIRS environment
 * variables). Unless the caller opted out via `includeBundled: false`, the
 * bundled fixture directory is placed first.
 *
 * The bundled directory is removed from the configured list before it is
 * prepended. Without that, a configured path that resolves to the bundled
 * directory would appear twice in the result.
 *
 * @param input string, string[], or options object accepted by the helpers
 * @returns resolved directories, bundled directory first when included
 * @throws Error when bundled fixtures are excluded and nothing is configured
 */
var resolveVoiceFixtureDirectories = async (input) => {
  const directories = await resolveConfiguredFixtureDirectories(input);
  if (!shouldIncludeBundledFixtures(input)) {
    if (directories.length === 0) {
      throw new Error("No voice fixture directories were configured. Provide directories or set VOICE_FIXTURE_DIR/VOICE_FIXTURE_DIRS.");
    }
    return directories;
  }
  const bundledDirectory = await resolveFixtureDirectory();
  // Both sides are produced by the same path `resolve`, so plain equality is
  // enough to spot the bundled directory among the configured ones.
  const externalDirectories = directories.filter((directory) => directory !== bundledDirectory);
  return [bundledDirectory, ...externalDirectories];
};
|
|
43452
|
+
/**
 * Rounds a numeric sample to the nearest integer and limits it to the
 * signed 16-bit range [-32768, 32767].
 *
 * @param value sample value, possibly fractional or out of range
 * @returns the rounded value, saturated at the 16-bit bounds
 */
var clampSample = (value) => {
  const rounded = Math.round(value);
  if (rounded > 32767) {
    return 32767;
  }
  if (rounded < -32768) {
    return -32768;
  }
  return rounded;
};
|
|
43453
|
+
/**
 * Copies the bytes viewed by `audio` into a fresh `Int16Array` of samples.
 *
 * The view's `byteOffset`/`byteLength` are honoured, and the result owns its
 * own buffer (it is built from `buffer.slice`). Samples are read in the
 * platform's native byte order, as with any `Int16Array`.
 *
 * A trailing odd byte cannot form a 16-bit sample and would make the
 * `Int16Array` constructor throw a RangeError, so it is dropped.
 *
 * @param audio byte view (e.g. Uint8Array) over 16-bit PCM data
 * @returns the samples as an Int16Array backed by a copied buffer
 */
var toPcm16Samples = (audio) => {
  const wholeSampleBytes = audio.byteLength - (audio.byteLength % 2);
  return new Int16Array(audio.buffer.slice(audio.byteOffset, audio.byteOffset + wholeSampleBytes));
};
|
|
43454
|
+
/**
 * Copies the bytes behind a typed-array view of samples into a new
 * `Uint8Array`.
 *
 * Only the region described by the view's `byteOffset`/`byteLength` is
 * copied; the result does not share memory with `samples`.
 *
 * @param samples typed array (e.g. Int16Array) of PCM samples
 * @returns the raw bytes of those samples
 */
var toPcm16Bytes = (samples) => {
  const { buffer, byteOffset, byteLength } = samples;
  const copiedRegion = buffer.slice(byteOffset, byteOffset + byteLength);
  return new Uint8Array(copiedRegion);
};
|
|
43455
|
+
/**
 * Builds a zero-filled byte buffer representing silence at two bytes per
 * sample frame (the same `sampleRateHz * 2` sizing the original used).
 *
 * The length is computed in whole samples and then doubled, so the result
 * always has an even byte count and at least one sample (2 bytes). Rounding
 * the byte count directly could produce an odd length (44100 Hz for 5 ms
 * gives 441), which shifts every 16-bit sample concatenated after it by one
 * byte.
 *
 * @param sampleRateHz sample rate of the surrounding audio
 * @param durationMs length of the silence in milliseconds
 * @returns a zero-filled Uint8Array with an even byte length
 */
var createSilenceBytes = (sampleRateHz, durationMs) => {
  const sampleCount = Math.max(1, Math.round(sampleRateHz * durationMs / 1000));
  return new Uint8Array(sampleCount * 2);
};
|
|
43456
|
+
/**
 * Joins several byte chunks into one contiguous `Uint8Array`.
 *
 * The output length is the sum of every chunk's `byteLength`; chunks are
 * written back to back in the order given.
 *
 * @param chunks array of byte views to concatenate
 * @returns a new Uint8Array holding all chunks in sequence
 */
var concatAudioChunks = (chunks) => {
  let totalByteLength = 0;
  for (const chunk of chunks) {
    totalByteLength += chunk.byteLength;
  }
  const merged = new Uint8Array(totalByteLength);
  chunks.reduce((writeOffset, chunk) => {
    merged.set(chunk, writeOffset);
    return writeOffset + chunk.byteLength;
  }, 0);
  return merged;
};
|
|
43466
|
+
var resamplePcm16Mono = (samples, sourceRate, targetRate) => {
|
|
43467
|
+
if (sourceRate === targetRate || samples.length === 0) {
|
|
43468
|
+
return samples;
|
|
43469
|
+
}
|
|
43470
|
+
const ratio = targetRate / sourceRate;
|
|
43471
|
+
const targetLength = Math.max(1, Math.round(samples.length * ratio));
|
|
43472
|
+
const output = new Int16Array(targetLength);
|
|
43473
|
+
for (let index = 0;index < targetLength; index += 1) {
|
|
43474
|
+
const sourceIndex = index / ratio;
|
|
43475
|
+
const previousIndex = Math.floor(sourceIndex);
|
|
43476
|
+
const nextIndex = Math.min(previousIndex + 1, samples.length - 1);
|
|
43477
|
+
const fraction = sourceIndex - previousIndex;
|
|
43478
|
+
const previous = samples[previousIndex] ?? 0;
|
|
43479
|
+
const next = samples[nextIndex] ?? previous;
|
|
43480
|
+
output[index] = clampSample(previous + (next - previous) * fraction);
|
|
43481
|
+
}
|
|
43482
|
+
return output;
|
|
43483
|
+
};
|
|
43484
|
+
/**
 * Encodes one signed 16-bit PCM sample as an 8-bit mu-law code.
 *
 * Layout of the (bit-inverted) result: 1 sign bit, 3 exponent bits,
 * 4 mantissa bits.
 *
 * The magnitude is clipped at 32635 before the 132 bias is added, so the
 * biased value still fits the 15-bit range scanned by the 0x4000 mask. The
 * earlier clip at 8191 (a 14-bit constant) was inconsistent with that bias
 * and mask and flattened every sample above roughly a quarter of full scale.
 *
 * @param sample signed 16-bit PCM sample
 * @returns mu-law byte in the range 0..255
 */
var toMuLaw = (sample) => {
  const MU_LAW_CLIP = 32635;
  const MU_LAW_BIAS = 132;
  const sign = sample < 0 ? 128 : 0;
  const magnitude = Math.min(MU_LAW_CLIP, Math.abs(sample)) + MU_LAW_BIAS;
  // Find the segment: position of the highest set bit, scanning down from bit 14.
  let exponent = 7;
  for (let mask = 16384; (magnitude & mask) === 0 && exponent > 0; mask >>= 1) {
    exponent -= 1;
  }
  const mantissa = (magnitude >> (exponent + 3)) & 15;
  // mu-law bytes are transmitted bit-inverted.
  return ~(sign | (exponent << 4) | mantissa) & 255;
};
|
|
43496
|
+
/**
 * Decodes an 8-bit mu-law code back to a signed linear PCM value.
 *
 * The byte is un-inverted, then split into sign (bit 7), exponent (bits 4-6)
 * and mantissa (bits 0-3). The magnitude is rebuilt as
 * `(((mantissa << 3) + 132) << exponent) - 132`, which places the output at
 * the centre of the quantisation step. The previous expression omitted the
 * half-step part of the bias, so code 0xFF (silence) decoded to -4 and every
 * level came out slightly low.
 *
 * @param encoded mu-law byte (0..255)
 * @returns linear sample in the range -32124..32124
 */
var fromMuLaw = (encoded) => {
  const MU_LAW_BIAS = 132;
  const normalized = ~encoded & 255;
  const isNegative = (normalized & 128) !== 0;
  const exponent = (normalized >> 4) & 7;
  const mantissa = normalized & 15;
  const magnitude = (((mantissa << 3) + MU_LAW_BIAS) << exponent) - MU_LAW_BIAS;
  // Skip the negation for zero so "negative zero" codes yield a plain 0.
  return isNegative && magnitude !== 0 ? -magnitude : magnitude;
};
|
|
43504
|
+
var applyTelephonyDegradation = (audio, format, targetSampleRateHz) => {
|
|
43505
|
+
const sourceSamples = toPcm16Samples(audio);
|
|
43506
|
+
const narrowbandSamples = resamplePcm16Mono(sourceSamples, format.sampleRateHz, targetSampleRateHz);
|
|
43507
|
+
const degradedSamples = new Int16Array(narrowbandSamples.length);
|
|
43508
|
+
for (let index = 0;index < narrowbandSamples.length; index += 1) {
|
|
43509
|
+
const compressed = toMuLaw(narrowbandSamples[index] ?? 0);
|
|
43510
|
+
degradedSamples[index] = clampSample(fromMuLaw(compressed) * 0.92);
|
|
43511
|
+
}
|
|
43512
|
+
return toPcm16Bytes(degradedSamples);
|
|
43513
|
+
};
|
|
43514
|
+
/**
 * Tells whether a fixture should get a telephony variant.
 *
 * Fixtures tagged "accent" or "speech-accent-archive" are left out unless
 * `options.includeAccents` is truthy; every other fixture is kept.
 *
 * @param fixture fixture whose optional `tags` list is inspected
 * @param options object that may carry `includeAccents`
 * @returns true when the fixture should be included
 */
var shouldIncludeTelephonyFixture = (fixture, options) => {
  const fixtureTags = new Set(fixture.tags ?? []);
  const isAccentFixture = ["accent", "speech-accent-archive"].some((tag) => fixtureTags.has(tag));
  return !(!options.includeAccents && isAccentFixture);
};
|
|
43521
|
+
var createTelephonyVoiceTestFixtures = (fixtures, options = {}) => {
|
|
43522
|
+
const targetSampleRateHz = options.targetSampleRateHz ?? DEFAULT_TELEPHONY_SAMPLE_RATE_HZ;
|
|
43523
|
+
return fixtures.filter((fixture) => shouldIncludeTelephonyFixture(fixture, options)).map((fixture) => ({
|
|
43524
|
+
...fixture,
|
|
43525
|
+
audio: applyTelephonyDegradation(fixture.audio, fixture.format, targetSampleRateHz),
|
|
43526
|
+
format: {
|
|
43527
|
+
...fixture.format,
|
|
43528
|
+
sampleRateHz: targetSampleRateHz
|
|
43529
|
+
},
|
|
43530
|
+
id: `${fixture.id}-telephony`,
|
|
43531
|
+
tags: Array.from(new Set([...fixture.tags ?? [], "narrowband", "telephony"])),
|
|
43532
|
+
title: `${fixture.title} (telephony narrowband)`
|
|
43533
|
+
}));
|
|
43534
|
+
};
|
|
43535
|
+
/**
 * Looks up a fixture by id and insists that it exists.
 *
 * @param fixtures list of fixture objects, each carrying an `id`
 * @param id identifier to look for
 * @returns the first fixture whose `id` strictly equals `id`
 * @throws Error when no fixture has that id
 */
var requireFixture = (fixtures, id) => {
  for (const candidate of fixtures) {
    if (candidate.id === id) {
      return candidate;
    }
  }
  throw new Error(`Missing bundled voice fixture "${id}" required for multi-speaker benchmarks.`);
};
|
|
43542
|
+
var createMultiSpeakerVoiceTestFixtures = (fixtures, options = {}) => {
|
|
43543
|
+
const silenceMs = options.silenceMs ?? DEFAULT_MULTI_SPEAKER_SILENCE_MS;
|
|
43544
|
+
const speakerA = requireFixture(fixtures, "quietly-alone-clean");
|
|
43545
|
+
const speakerB = requireFixture(fixtures, "traveled-back-route-clean");
|
|
43546
|
+
const speakerC = requireFixture(fixtures, "rainstorms-noisy");
|
|
43547
|
+
const silence = createSilenceBytes(speakerA.format.sampleRateHz, silenceMs);
|
|
43548
|
+
const handoff = concatAudioChunks([speakerA.audio, silence, speakerB.audio]);
|
|
43549
|
+
const threeTurn = concatAudioChunks([
|
|
43550
|
+
speakerA.audio,
|
|
43551
|
+
silence,
|
|
43552
|
+
speakerB.audio,
|
|
43553
|
+
silence,
|
|
43554
|
+
speakerC.audio
|
|
43555
|
+
]);
|
|
43556
|
+
const buildTags = (...tags) => [
|
|
43557
|
+
"multi-speaker",
|
|
43558
|
+
"handoff",
|
|
43559
|
+
"synthetic",
|
|
43560
|
+
...tags
|
|
43561
|
+
];
|
|
43562
|
+
return [
|
|
43563
|
+
{
|
|
43564
|
+
...speakerA,
|
|
43565
|
+
audio: handoff,
|
|
43566
|
+
audioPath: `${speakerA.audioPath}+${speakerB.audioPath}`,
|
|
43567
|
+
expectedSpeakerTurns: [
|
|
43568
|
+
{ speaker: "speaker-a", text: speakerA.expectedText },
|
|
43569
|
+
{ speaker: "speaker-b", text: speakerB.expectedText }
|
|
43570
|
+
],
|
|
43571
|
+
expectedTerms: Array.from(new Set([
|
|
43572
|
+
...speakerA.expectedTerms ?? [],
|
|
43573
|
+
...speakerB.expectedTerms ?? []
|
|
43574
|
+
])),
|
|
43575
|
+
expectedText: `${speakerA.expectedText} ${speakerB.expectedText}`.trim(),
|
|
43576
|
+
expectedTurnTexts: [speakerA.expectedText, speakerB.expectedText],
|
|
43577
|
+
id: "multi-speaker-handoff-clean",
|
|
43578
|
+
tags: buildTags("clean"),
|
|
43579
|
+
title: "Synthetic two-speaker handoff"
|
|
43580
|
+
},
|
|
43581
|
+
{
|
|
43582
|
+
...speakerA,
|
|
43583
|
+
audio: threeTurn,
|
|
43584
|
+
audioPath: `${speakerA.audioPath}+${speakerB.audioPath}+${speakerC.audioPath}`,
|
|
43585
|
+
expectedSpeakerTurns: [
|
|
43586
|
+
{ speaker: "speaker-a", text: speakerA.expectedText },
|
|
43587
|
+
{ speaker: "speaker-b", text: speakerB.expectedText },
|
|
43588
|
+
{ speaker: "speaker-c", text: speakerC.expectedText }
|
|
43589
|
+
],
|
|
43590
|
+
expectedTerms: Array.from(new Set([
|
|
43591
|
+
...speakerA.expectedTerms ?? [],
|
|
43592
|
+
...speakerB.expectedTerms ?? [],
|
|
43593
|
+
...speakerC.expectedTerms ?? []
|
|
43594
|
+
])),
|
|
43595
|
+
expectedText: `${speakerA.expectedText} ${speakerB.expectedText} ${speakerC.expectedText}`.trim(),
|
|
43596
|
+
expectedTurnTexts: [
|
|
43597
|
+
speakerA.expectedText,
|
|
43598
|
+
speakerB.expectedText,
|
|
43599
|
+
speakerC.expectedText
|
|
43600
|
+
],
|
|
43601
|
+
id: "multi-speaker-handoff-three",
|
|
43602
|
+
tags: buildTags("challenging", "noisy"),
|
|
43603
|
+
title: "Synthetic three-speaker handoff (A-B-C)"
|
|
43604
|
+
}
|
|
43605
|
+
];
|
|
43606
|
+
};
|
|
43607
|
+
var createJargonVoiceTestFixtures = (fixtures) => JARGON_FIXTURE_IDS.map((id) => requireFixture(fixtures, id)).filter((fixture) => (fixture.expectedTerms?.length ?? 0) > 0).map((fixture) => ({
|
|
43608
|
+
...fixture,
|
|
43609
|
+
id: `${fixture.id}-jargon`,
|
|
43610
|
+
tags: Array.from(new Set([...fixture.tags ?? [], "domain-heavy", "jargon"])),
|
|
43611
|
+
title: `${fixture.title} (jargon)`
|
|
43612
|
+
}));
|
|
43613
|
+
var loadVoiceTestFixtures = async (fixtureDirectory) => {
|
|
43614
|
+
const fixtureDirectories = await resolveVoiceFixtureDirectories(fixtureDirectory);
|
|
43615
|
+
const fixtures = [];
|
|
43616
|
+
const seenFixtureIds = new Set;
|
|
43617
|
+
for (const directory of fixtureDirectories) {
|
|
43618
|
+
const manifestFile = Bun.file(resolve2(directory, "manifest.json"));
|
|
43619
|
+
const manifest = await manifestFile.json();
|
|
43620
|
+
for (const entry of manifest) {
|
|
43621
|
+
if (seenFixtureIds.has(entry.id)) {
|
|
43622
|
+
throw new Error(`Duplicate voice fixture id "${entry.id}" found while loading "${directory}".`);
|
|
43623
|
+
}
|
|
43624
|
+
const audioPath = resolve2(directory, "pcm", entry.audioPath);
|
|
43625
|
+
const audio = new Uint8Array(await Bun.file(audioPath).arrayBuffer());
|
|
43626
|
+
fixtures.push({
|
|
43627
|
+
...entry,
|
|
43628
|
+
audio,
|
|
43629
|
+
audioPath,
|
|
43630
|
+
format: {
|
|
43631
|
+
...DEFAULT_AUDIO_FORMAT,
|
|
43632
|
+
...entry.format
|
|
43633
|
+
}
|
|
43634
|
+
});
|
|
43635
|
+
seenFixtureIds.add(entry.id);
|
|
43636
|
+
}
|
|
43637
|
+
}
|
|
43638
|
+
return fixtures;
|
|
43639
|
+
};
|
|
43640
|
+
|
|
43641
|
+
// src/testing/accuracy.ts
|
|
43642
|
+
/**
 * Canonicalises transcript text before accuracy scoring.
 *
 * Steps: lower-case, compose to NFC, replace every character that is not a
 * letter, combining mark, digit, whitespace or apostrophe with a space,
 * collapse whitespace runs, and trim.
 *
 * Combining marks (`\p{M}`) are kept on purpose. Scripts such as Devanagari
 * write vowel signs as marks, and decomposed Latin text carries accents as
 * marks; turning them into spaces splits one word into several fragments.
 * NFC makes composed and decomposed spellings of the same text compare equal.
 *
 * @param value raw transcript text
 * @returns normalised text with single spaces between words
 */
var normalizeAccuracyText = (value) =>
  value
    .toLowerCase()
    .normalize("NFC")
    .replace(/[^\p{L}\p{M}\p{N}\s']/gu, " ")
    .replace(/\s+/g, " ")
    .trim();
|
|
43643
|
+
/**
 * Computes the Levenshtein edit distance between two sequences.
 *
 * Elements are compared with `===`, so the inputs can be arrays of words or
 * arrays of characters. Insertions, deletions and substitutions each cost 1.
 * Only two rows of the dynamic-programming table are kept; after each row
 * the two buffers are swapped rather than copied element by element.
 *
 * @param left first sequence
 * @param right second sequence
 * @returns the minimum number of single-element edits turning one into the other
 */
var levenshteinDistance2 = (left, right) => {
  if (left.length === 0) {
    return right.length;
  }
  if (right.length === 0) {
    return left.length;
  }
  const width = right.length + 1;
  // Row 0: turning an empty prefix into the first `column` elements of `right`.
  let previous = Array.from({ length: width }, (_, column) => column);
  let current = new Array(width).fill(0);
  for (let row = 1; row <= left.length; row += 1) {
    current[0] = row;
    for (let column = 1; column <= right.length; column += 1) {
      const substitutionCost = left[row - 1] === right[column - 1] ? 0 : 1;
      current[column] = Math.min(
        current[column - 1] + 1,
        previous[column] + 1,
        previous[column - 1] + substitutionCost
      );
    }
    // The finished row becomes `previous`; the old one is reused as scratch.
    [previous, current] = [current, previous];
  }
  return previous[right.length];
};
|
|
43667
|
+
/**
 * Builds the turn text from the final transcripts only.
 *
 * Entries whose `isFinal` is falsy are discarded; the rest are passed to
 * `buildTurnText` together with an empty partial text.
 *
 * @param transcripts transcript objects carrying an `isFinal` flag
 * @returns whatever `buildTurnText` produces for the final transcripts
 */
var mergeFinalTranscriptText = (transcripts) => {
  const finalsOnly = transcripts.filter(({ isFinal }) => isFinal);
  return buildTurnText(finalsOnly, "");
};
|
|
43668
|
+
/**
 * Scores a transcript against its reference text.
 *
 * Both texts go through `normalizeAccuracyText`; word-level and
 * character-level edit distances are then computed with
 * `levenshteinDistance2` and divided by the reference length to give the
 * word and character error rates.
 *
 * Two edge cases are handled explicitly:
 * - With an empty reference the rate cannot be a ratio. It is reported as 1
 *   when the hypothesis contains anything, and 0 only when both are empty.
 *   Previously this case always scored 0, so stray output passed.
 * - The character error rate is divided by the number of code points in the
 *   reference, the same unit the character distance is measured in.
 *
 * @param actualText text produced by the recogniser
 * @param expectedText reference text
 * @param threshold maximum word error rate that still passes (default 0.35)
 * @returns normalised texts, distances, error rates, the threshold and
 *          `passesThreshold`
 */
var scoreTranscriptAccuracy = (actualText, expectedText, threshold = 0.35) => {
  const normalizedActual = normalizeAccuracyText(actualText);
  const normalizedExpected = normalizeAccuracyText(expectedText);
  const actualWords = normalizedActual ? normalizedActual.split(" ") : [];
  const expectedWords = normalizedExpected ? normalizedExpected.split(" ") : [];
  const actualChars = Array.from(normalizedActual);
  const expectedChars = Array.from(normalizedExpected);
  const wordDistance = levenshteinDistance2(actualWords, expectedWords);
  const charDistance = levenshteinDistance2(actualChars, expectedChars);
  const toErrorRate = (distance, referenceLength) => {
    if (referenceLength > 0) {
      return distance / referenceLength;
    }
    return distance > 0 ? 1 : 0;
  };
  const wordErrorRate = toErrorRate(wordDistance, expectedWords.length);
  const charErrorRate = toErrorRate(charDistance, expectedChars.length);
  return {
    actualText: normalizedActual,
    charDistance,
    charErrorRate,
    expectedText: normalizedExpected,
    passesThreshold: wordErrorRate <= threshold,
    threshold,
    wordDistance,
    wordErrorRate
  };
};
|
|
43688
|
+
|
|
43689
|
+
// src/testing/stt.ts
|
|
43690
|
+
/**
 * Splits a byte buffer into consecutive chunks of at most `bytesPerChunk`
 * bytes. The last chunk may be shorter. Each chunk is produced with
 * `audio.slice`.
 *
 * @param audio byte view to split
 * @param bytesPerChunk maximum chunk size in bytes; must be greater than 0
 * @returns the chunks in order, or an empty array for empty input
 * @throws RangeError when `bytesPerChunk` is not a positive number. A zero
 *         or negative size would never advance the offset and loop forever.
 */
var chunkAudio = (audio, bytesPerChunk) => {
  if (!(bytesPerChunk > 0)) {
    throw new RangeError(`bytesPerChunk must be a positive number, received ${bytesPerChunk}.`);
  }
  const chunks = [];
  for (let offset = 0; offset < audio.byteLength; offset += bytesPerChunk) {
    chunks.push(audio.slice(offset, offset + bytesPerChunk));
  }
  return chunks;
};
|
|
43697
|
+
/**
 * Allocates a zero-filled byte buffer to be sent as silence.
 *
 * @param byteLength3 number of bytes to allocate
 * @returns a new Uint8Array of that length, all zeros
 */
var createSilence = (byteLength3) => {
  const silence = new Uint8Array(byteLength3);
  return silence;
};
|
|
43698
|
+
var waitForIdle = async (readLastActivityAt, idleTimeoutMs, settleMs) => {
|
|
43699
|
+
const startedAt = Date.now();
|
|
43700
|
+
while (Date.now() - startedAt < idleTimeoutMs) {
|
|
43701
|
+
if (Date.now() - readLastActivityAt() >= settleMs) {
|
|
43702
|
+
return;
|
|
43703
|
+
}
|
|
43704
|
+
await Bun.sleep(Math.min(50, settleMs));
|
|
43705
|
+
}
|
|
43706
|
+
};
|
|
43707
|
+
/**
 * Streams one fixture through an STT adapter and scores the transcript.
 *
 * Flow:
 * 1. Open a session with the fixture's format, a `fixture-<id>` session id
 *    and any caller-supplied open options (a function is called with the
 *    fixture).
 * 2. Record "partial", "final", "endOfTurn", "error" and "close" events;
 *    each one also refreshes the last-activity timestamp.
 * 3. Send the audio in `chunkDurationMs` chunks, sleeping after each chunk
 *    (for `waitForRealtimeMs` when positive, otherwise `chunkDurationMs`),
 *    then send `tailPaddingMs` of silence the same way.
 * 4. Wait until the session has been quiet for `settleMs`, or until
 *    `idleTimeoutMs` has elapsed.
 * 5. Close the session and remove the listeners. Listeners stay attached
 *    during `close` and are removed in a nested `finally`, so they are
 *    released even when `close` rejects.
 * 6. Build the final text from the final transcripts, adding the most
 *    recent non-empty partial that arrived at or after the last final when
 *    at least one final exists, and score it against `fixture.expectedText`.
 *
 * @param adapter STT adapter exposing `open(options)`
 * @param fixture fixture with `audio`, `format`, `id`, `expectedText` and
 *                optional `chunkDurationMs` / `tailPaddingMs`
 * @param options optional overrides: `chunkDurationMs`, `tailPaddingMs`,
 *                `idleTimeoutMs`, `settleMs`, `waitForRealtimeMs`,
 *                `openOptions`, `transcriptThreshold`
 * @returns accuracy score, collected events, final text and timing marks
 */
var runSTTAdapterFixture = async (adapter, fixture, options = {}) => {
  const startedAt = Date.now();
  const partialEvents = [];
  const finalEvents = [];
  const endOfTurnEvents = [];
  const errorEvents = [];
  const closeEvents = [];
  const chunkDurationMs = options.chunkDurationMs ?? fixture.chunkDurationMs ?? 100;
  const tailPaddingMs = options.tailPaddingMs ?? fixture.tailPaddingMs ?? 1000;
  const idleTimeoutMs = options.idleTimeoutMs ?? 8000;
  const settleMs = options.settleMs ?? 500;
  const waitForRealtimeMs = options.waitForRealtimeMs ?? 0;
  let lastActivityAt = Date.now();
  let speechEndedAt = startedAt;
  const markActive = () => {
    lastActivityAt = Date.now();
  };
  const resolvedOpenOptions = typeof options.openOptions === "function" ? options.openOptions(fixture) : options.openOptions;
  const session = await adapter.open({
    format: fixture.format,
    sessionId: `fixture-${fixture.id}`,
    ...resolvedOpenOptions ?? {}
  });
  // Every listener stores its event and counts as activity for the idle wait.
  const recordInto = (bucket) => (event) => {
    bucket.push(event);
    markActive();
  };
  const unsubscribers = [
    session.on("partial", recordInto(partialEvents)),
    session.on("final", recordInto(finalEvents)),
    session.on("endOfTurn", recordInto(endOfTurnEvents)),
    session.on("error", recordInto(errorEvents)),
    session.on("close", recordInto(closeEvents))
  ];
  try {
    const bytesPerMillisecond = fixture.format.sampleRateHz * fixture.format.channels * 2 / 1000;
    const bytesPerChunk = Math.max(2, Math.floor(bytesPerMillisecond * chunkDurationMs));
    const chunks = chunkAudio(fixture.audio, bytesPerChunk);
    const realtimeDelayMs = waitForRealtimeMs > 0 ? waitForRealtimeMs : chunkDurationMs;
    for (const chunk of chunks) {
      await session.send(chunk);
      markActive();
      await Bun.sleep(realtimeDelayMs);
    }
    speechEndedAt = Date.now();
    if (tailPaddingMs > 0) {
      const tailBytes = Math.max(2, Math.floor(bytesPerMillisecond * tailPaddingMs));
      for (const chunk of chunkAudio(createSilence(tailBytes), bytesPerChunk)) {
        await session.send(chunk);
        markActive();
        await Bun.sleep(realtimeDelayMs);
      }
    }
    await waitForIdle(() => lastActivityAt, idleTimeoutMs, settleMs);
  } finally {
    try {
      // Listeners are still attached here so events emitted while closing are recorded.
      await session.close("fixture-complete");
    } finally {
      for (const unsubscribe of unsubscribers) {
        unsubscribe();
      }
    }
  }
  const finalTranscripts = finalEvents.map((event) => ({
    ...event.transcript,
    endedAtMs: event.receivedAt - startedAt,
    startedAtMs: event.receivedAt - startedAt
  }));
  // Computed once: the event lists no longer change after the listeners are removed.
  const lastFinalReceivedAt = finalEvents.at(-1)?.receivedAt ?? 0;
  const trailingPartial = [...partialEvents].reverse().find((event) => {
    const text = event.transcript.text.trim();
    if (!text) {
      return false;
    }
    return event.receivedAt >= lastFinalReceivedAt;
  });
  const finalText = trailingPartial && finalTranscripts.length > 0 ? buildTurnText(finalTranscripts, trailingPartial.transcript.text, {
    partialEndedAtMs: trailingPartial.receivedAt - startedAt,
    partialStartedAtMs: trailingPartial.receivedAt - startedAt
  }) : mergeFinalTranscriptText(finalTranscripts);
  return {
    accuracy: scoreTranscriptAccuracy(finalText, fixture.expectedText, options.transcriptThreshold),
    closeEvents,
    endOfTurnEvents,
    errorEvents,
    finalEvents,
    finalText,
    partialEvents,
    speechEndedAt,
    startedAt
  };
};
|
|
43807
|
+
|
|
43808
|
+
// src/testing/benchmark.ts
|
|
43809
|
+
var resolveFixtureEnvironment = (fixture) => {
|
|
43810
|
+
const tags = new Set(fixture.tags ?? []);
|
|
43811
|
+
if (tags.has("telephony")) {
|
|
43812
|
+
return "telephony";
|
|
43813
|
+
}
|
|
43814
|
+
if (tags.has("code-switch") || tags.has("code_switch")) {
|
|
43815
|
+
return "code-switch";
|
|
43816
|
+
}
|
|
43817
|
+
if (tags.has("multi-speaker")) {
|
|
43818
|
+
return "multi-speaker";
|
|
43819
|
+
}
|
|
43820
|
+
if (tags.has("jargon") || tags.has("domain-heavy")) {
|
|
43821
|
+
return "jargon";
|
|
43822
|
+
}
|
|
43823
|
+
const hasAccent = tags.has("accent") || tags.has("speech-accent-archive");
|
|
43824
|
+
const hasNoisy = tags.has("noisy") || tags.has("synthetic-noise") || tags.has("stress");
|
|
43825
|
+
const language = fixture.language?.trim().toLowerCase();
|
|
43826
|
+
const hasNonEnglishLanguage = typeof language === "string" && language.length > 0 && !language.startsWith("en");
|
|
43827
|
+
const isMultilingual = tags.has("multilingual") || tags.has("bilingual") || hasNonEnglishLanguage;
|
|
43828
|
+
if (hasAccent && hasNoisy) {
|
|
43829
|
+
return "accent-noisy";
|
|
43830
|
+
}
|
|
43831
|
+
if (isMultilingual) {
|
|
43832
|
+
return "multilingual";
|
|
43833
|
+
}
|
|
43834
|
+
if (hasAccent) {
|
|
43835
|
+
return "accent";
|
|
43836
|
+
}
|
|
43837
|
+
if (hasNoisy) {
|
|
43838
|
+
return "noisy";
|
|
43839
|
+
}
|
|
43840
|
+
if (tags.has("clean")) {
|
|
43841
|
+
return "clean";
|
|
43842
|
+
}
|
|
43843
|
+
return "other";
|
|
43844
|
+
};
|
|
43845
|
+
/**
 * Canonicalises text before benchmark comparisons (term recall, word
 * overlap).
 *
 * Steps: lower-case, compose to NFC, replace every character that is not a
 * letter, combining mark, digit, whitespace or apostrophe with a space,
 * collapse whitespace runs, and trim.
 *
 * Combining marks (`\p{M}`) are kept so that words written with them (for
 * example Devanagari vowel signs, or decomposed Latin accents) are not split
 * into fragments. NFC makes composed and decomposed spellings compare equal.
 *
 * @param value raw text
 * @returns normalised text with single spaces between words
 */
var normalizeBenchmarkText = (value) =>
  value
    .toLowerCase()
    .normalize("NFC")
    .replace(/[^\p{L}\p{M}\p{N}\s']/gu, " ")
    .replace(/\s+/g, " ")
    .trim();
|
|
43846
|
+
/**
 * Measures how many expected terms appear in a transcript.
 *
 * The transcript and each term are normalised with `normalizeBenchmarkText`;
 * a term counts as matched when the normalised transcript contains it as a
 * substring. Terms that normalise to an empty string cannot be matched or
 * missed, so they are left out of the recall denominator as well. That keeps
 * `recall` at 1 whenever `allMatched` is true.
 *
 * @param actualText transcript text to search
 * @param expectedTerms terms to look for; null/undefined means none
 * @returns `expectedTerms` (all normalised terms), `matchedTerms`,
 *          `missingTerms`, `allMatched`, and `recall` (1 when there is
 *          nothing to match)
 */
var scoreExpectedTerms = (actualText, expectedTerms) => {
  const normalizedActual = normalizeBenchmarkText(actualText);
  const normalizedExpectedTerms = (expectedTerms ?? []).map((entry) => normalizeBenchmarkText(entry));
  const scorableTerms = normalizedExpectedTerms.filter((term) => term.length > 0);
  const matchedTerms = scorableTerms.filter((term) => normalizedActual.includes(term));
  const matchedLookup = new Set(matchedTerms);
  const missingTerms = scorableTerms.filter((term) => !matchedLookup.has(term));
  const recall = scorableTerms.length > 0 ? matchedTerms.length / scorableTerms.length : 1;
  return {
    allMatched: missingTerms.length === 0,
    expectedTerms: normalizedExpectedTerms,
    matchedTerms,
    missingTerms,
    recall
  };
};
|
|
43861
|
+
/**
 * Rewrites a sequence of speaker labels as a pattern of small integers.
 *
 * Labels are compared by their `String(...)` form. The first distinct label
 * becomes 0, the next new one 1, and so on; repeats reuse the number they
 * were first given.
 *
 * @param speakers speaker labels in turn order
 * @returns an array of the same length holding the assigned indices
 */
var toPatternKeys = (speakers) => {
  const indexByLabel = new Map();
  return speakers.map((speaker) => {
    const label = String(speaker);
    const known = indexByLabel.get(label);
    if (known !== undefined) {
      return known;
    }
    // The map only ever grows by one per new label, so its size is the next index.
    const assigned = indexByLabel.size;
    indexByLabel.set(label, assigned);
    return assigned;
  });
};
|
|
43873
|
+
/**
 * Splits text into its normalised words.
 *
 * Despite the name, this returns the word list rather than a number: the
 * text goes through `normalizeBenchmarkText`, is split on single spaces,
 * and empty tokens are removed.
 *
 * @param value raw text
 * @returns array of non-empty normalised tokens
 */
var countNormalizedWords = (value) => {
  const tokens = normalizeBenchmarkText(value).split(" ");
  return tokens.filter((token) => token !== "");
};
|
|
43874
|
+
/**
 * Computes a 0..1 overlap score between the distinct words of two texts.
 *
 * Each text is reduced to its set of normalised words via
 * `countNormalizedWords`. The score is the number of words present in both
 * sets divided by the size of the larger set; it is 0 when either set is
 * empty.
 *
 * @param left first text
 * @param right second text
 * @returns shared distinct words / max(distinct words left, right)
 */
var computeWordOverlap = (left, right) => {
  const leftVocabulary = new Set(countNormalizedWords(left));
  const rightVocabulary = new Set(countNormalizedWords(right));
  if (leftVocabulary.size === 0 || rightVocabulary.size === 0) {
    return 0;
  }
  const sharedCount = [...leftVocabulary].filter((word) => rightVocabulary.has(word)).length;
  return sharedCount / Math.max(leftVocabulary.size, rightVocabulary.size);
};
|
|
43888
|
+
/**
 * Relabels turns that look like a new speaker wrongly merged into an earlier one.
 *
 * Only applies to fixtures with at least three expected speaker turns that are
 * tagged both "synthetic" and "handoff" (tags are trimmed and lower-cased first);
 * otherwise the input turns are returned untouched with `postClustered: false`.
 *
 * A turn is given a synthetic `postcluster-N` speaker when all of these hold:
 * - it is at index 2 or later and the previous turn has a different, defined speaker;
 * - fewer distinct speakers have been seen than there are expected turns;
 * - it contains at least 4 normalized words;
 * - its word overlap with the first turn of the same speaker is below 0.35.
 *
 * @returns `{ postClustered, turns }` where `turns` is a shallow copy of each input turn.
 */
var repairSpeakerTurnReentry = (fixture, turns) => {
  const expectedTurns = fixture.expectedSpeakerTurns ?? [];
  const normalizedTags = new Set((fixture.tags ?? []).map((tag) => tag.trim().toLowerCase()));
  const eligible = expectedTurns.length >= 3 && normalizedTags.has("synthetic") && normalizedTags.has("handoff");
  if (!eligible) {
    return { postClustered: false, turns };
  }
  const keyOf = (turn) => (turn?.speaker === undefined ? undefined : String(turn.speaker));
  const repairedTurns = turns.map((turn) => ({ ...turn }));
  const firstTurnBySpeaker = new Map();
  const knownSpeakers = new Set();
  let syntheticCount = 0;
  repairedTurns.forEach((turn, index) => {
    const speakerKey = keyOf(turn);
    if (speakerKey === undefined) {
      return;
    }
    // The previous turn is read after any relabeling applied to it.
    const previousKey = keyOf(repairedTurns[index - 1]);
    if (!firstTurnBySpeaker.has(speakerKey)) {
      firstTurnBySpeaker.set(speakerKey, turn);
    }
    knownSpeakers.add(speakerKey);
    const firstTurn = firstTurnBySpeaker.get(speakerKey);
    const reentered = index > 1 && previousKey !== undefined && previousKey !== speakerKey;
    const speakersMissing = knownSpeakers.size < expectedTurns.length;
    const overlapWithFirst = computeWordOverlap(turn.text, firstTurn.text);
    const wordCount = countNormalizedWords(turn.text).length;
    if (!(reentered && speakersMissing && wordCount >= 4 && overlapWithFirst < 0.35)) {
      return;
    }
    turn.speaker = `postcluster-${syntheticCount}`;
    knownSpeakers.add(String(turn.speaker));
    syntheticCount += 1;
  });
  return {
    postClustered: syntheticCount > 0,
    turns: repairedTurns
  };
};
|
|
43931
|
+
/**
 * Scores how well the speaker sequence of the final transcript events matches the
 * fixture's expected speaker turns.
 *
 * Steps: drop events with blank text, merge consecutive turns from the same
 * defined speaker, run repairSpeakerTurnReentry, then compare the canonical
 * speaker patterns position by position.
 *
 * @returns undefined when the fixture declares no expected turns; otherwise an
 *   object with `available` (false when any scored turn lacks a speaker),
 *   turn counts, `patternMatchRate` (matches / longer pattern length, rounded),
 *   `passes` (same turn count and a match rate of exactly 1) and `postClustered`.
 */
var scoreSpeakerTurns = (fixture, result) => {
  const expectedTurns = fixture.expectedSpeakerTurns ?? [];
  if (expectedTurns.length === 0) {
    return undefined;
  }
  const spokenTurns = [];
  for (const event of result.finalEvents) {
    const speaker = event.transcript.speaker;
    const text = event.transcript.text.trim();
    if (text.length > 0) {
      spokenTurns.push({ speaker, text });
    }
  }
  const collapsedTurns = [];
  for (const turn of spokenTurns) {
    const last = collapsedTurns[collapsedTurns.length - 1];
    const continuesLast = Boolean(last) && last.speaker !== undefined && turn.speaker !== undefined && String(last.speaker) === String(turn.speaker);
    if (continuesLast) {
      last.text = `${last.text} ${turn.text}`.trim();
    } else {
      collapsedTurns.push({ ...turn });
    }
  }
  const { postClustered, turns: scoredTurns } = repairSpeakerTurnReentry(fixture, collapsedTurns);
  const outcome = (available, passes, patternMatchRate) => ({
    available,
    actualTurnCount: scoredTurns.length,
    expectedTurnCount: expectedTurns.length,
    passes,
    patternMatchRate,
    postClustered
  });
  if (scoredTurns.some((turn) => turn.speaker === undefined)) {
    return outcome(false, false, 0);
  }
  const actualPattern = toPatternKeys(scoredTurns.map((turn) => turn.speaker));
  const expectedPattern = toPatternKeys(expectedTurns.map((turn) => turn.speaker));
  const comparableLength = Math.min(actualPattern.length, expectedPattern.length);
  const matches = actualPattern.slice(0, comparableLength).filter((key, position) => key === expectedPattern[position]).length;
  // Dividing by the longer pattern penalizes missing or extra turns.
  const longestLength = Math.max(actualPattern.length, expectedPattern.length, 1);
  const patternMatchRate = roundMetric4(matches / longestLength) ?? 0;
  const passes = scoredTurns.length === expectedTurns.length && patternMatchRate === 1;
  return outcome(true, passes, patternMatchRate);
};
|
|
43981
|
+
var average2 = (values) => {
|
|
43982
|
+
const filtered = values.filter((value) => typeof value === "number" && Number.isFinite(value));
|
|
43983
|
+
if (filtered.length === 0) {
|
|
43984
|
+
return;
|
|
43985
|
+
}
|
|
43986
|
+
return filtered.reduce((sum, value) => sum + value, 0) / filtered.length;
|
|
43987
|
+
};
|
|
43988
|
+
var roundMetric4 = (value, digits = 4) => {
|
|
43989
|
+
if (typeof value !== "number" || !Number.isFinite(value)) {
|
|
43990
|
+
return;
|
|
43991
|
+
}
|
|
43992
|
+
const factor = 10 ** digits;
|
|
43993
|
+
return Math.round(value * factor) / factor;
|
|
43994
|
+
};
|
|
43995
|
+
/**
 * Buckets per-fixture benchmark results by their `group` field and produces one
 * summary row per group, sorted by group name via localeCompare.
 *
 * Averages ignore non-finite values (see average2). Word error rate and term
 * recall fall back to 0 when no value is available; the speaker-turn match rate
 * stays undefined in that case.
 */
var calculateGroupSummary = (fixtures) => {
  const membersByGroup = new Map();
  for (const fixture of fixtures) {
    const members = membersByGroup.get(fixture.group) ?? [];
    members.push(fixture);
    membersByGroup.set(fixture.group, members);
  }
  const summarizeGroup = ([group, members]) => {
    const fixtureCount = members.length;
    const passCount = members.filter((member) => member.passes).length;
    const meanWordErrorRate = average2(members.map((member) => member.accuracy.wordErrorRate)) ?? 0;
    const meanTermRecall = average2(members.map((member) => member.expectedTerms.recall)) ?? 0;
    const meanElapsedMs = average2(members.map((member) => member.elapsedMs));
    const meanSpeakerTurnMatchRate = average2(members.map((member) => member.speakerTurns?.patternMatchRate));
    return {
      averageElapsedMs: roundMetric4(meanElapsedMs, 2) ?? 0,
      averageSpeakerTurnMatchRate: roundMetric4(meanSpeakerTurnMatchRate),
      averageTermRecall: roundMetric4(meanTermRecall) ?? 0,
      averageWordErrorRate: roundMetric4(meanWordErrorRate) ?? 0,
      fixturesWithErrors: members.filter((member) => member.errorCount > 0).length,
      fixturesWithFragments: members.filter((member) => member.fragmentationCount > 0).length,
      fixtureCount,
      group,
      passCount,
      passRate: fixtureCount > 0 ? roundMetric4(passCount / fixtureCount) ?? 0 : 0,
      wordAccuracyRate: roundMetric4(1 - meanWordErrorRate) ?? 0
    };
  };
  return [...membersByGroup.entries()].map(summarizeGroup).sort((left, right) => left.group.localeCompare(right.group));
};
|
|
44025
|
+
/**
 * Converts one raw adapter fixture run into a flat benchmark result row.
 *
 * Latencies are measured from the first event of each kind: `timeTo*` values are
 * relative to `result.startedAt`, `postSpeech*` values are relative to
 * `result.speechEndedAt` and clamped at 0. A latency is undefined when the
 * corresponding event is missing.
 *
 * `passes` requires no error events, a non-blank final text, the accuracy
 * threshold to pass and, when speaker turns were scored, those to pass too.
 *
 * @param fixture The fixture definition that was run.
 * @param result The raw run result (events, final text, accuracy, timestamps).
 * @param elapsedMs Wall-clock duration of the run, passed through unchanged.
 */
var toFixtureBenchmarkResult = (fixture, result, elapsedMs) => {
  const sinceStart = (events) => (events[0] ? events[0].receivedAt - result.startedAt : undefined);
  const sinceSpeechEnd = (events) => {
    const receivedAt = events[0]?.receivedAt;
    if (typeof receivedAt !== "number") {
      return undefined;
    }
    return Math.max(0, receivedAt - result.speechEndedAt);
  };
  const timeToFirstPartialMs = sinceStart(result.partialEvents);
  const timeToFirstFinalMs = sinceStart(result.finalEvents);
  const timeToEndOfTurnMs = sinceStart(result.endOfTurnEvents);
  const postSpeechTimeToFirstFinalMs = sinceSpeechEnd(result.finalEvents);
  const postSpeechTimeToEndOfTurnMs = sinceSpeechEnd(result.endOfTurnEvents);
  const expectedTerms = scoreExpectedTerms(result.finalText, fixture.expectedTerms);
  const speakerTurns = scoreSpeakerTurns(fixture, result);
  const finalCount = result.finalEvents.length;
  const errorCount = result.errorEvents.length;
  return {
    accuracy: result.accuracy,
    closeCount: result.closeEvents.length,
    difficulty: fixture.difficulty,
    elapsedMs,
    endOfTurnCount: result.endOfTurnEvents.length,
    errorCount,
    expectedTerms,
    finalCount,
    finalText: result.finalText,
    fixtureId: fixture.id,
    // Every final event beyond the first counts as one fragmentation.
    fragmentationCount: Math.max(0, finalCount - 1),
    group: resolveFixtureEnvironment(fixture),
    passes: errorCount === 0 && result.finalText.trim().length > 0 && result.accuracy.passesThreshold && (speakerTurns ? speakerTurns.passes : true),
    partialCount: result.partialEvents.length,
    speakerTurns,
    postSpeechTimeToEndOfTurnMs,
    postSpeechTimeToFirstFinalMs,
    tags: fixture.tags ?? [],
    timeToEndOfTurnMs,
    timeToFirstFinalMs,
    timeToFirstPartialMs,
    title: fixture.title
  };
};
|
|
44064
|
+
/**
 * Aggregates per-fixture benchmark results for one adapter into a summary.
 *
 * Every `average*` field is the mean of the finite per-fixture values, rounded to
 * 4 decimals for rates and 2 decimals for counts and millisecond latencies.
 * Fields that end in `?? 0` below default to 0 when no value is available; the
 * remaining averages stay undefined in that case.
 *
 * @param adapterId Identifier copied into the summary.
 * @param fixtures Per-fixture results as produced by toFixtureBenchmarkResult.
 */
var summarizeSTTBenchmark = (adapterId, fixtures) => {
  const fixtureCount = fixtures.length;
  const passCount = fixtures.filter((fixture) => fixture.passes).length;
  const mean = (select, digits) => roundMetric4(average2(fixtures.map((fixture) => select(fixture))), digits);
  const wordErrorRateOf = (fixture) => fixture.accuracy.wordErrorRate;
  return {
    adapterId,
    averageCharErrorRate: mean((fixture) => fixture.accuracy.charErrorRate) ?? 0,
    averageElapsedMs: mean((fixture) => fixture.elapsedMs, 2) ?? 0,
    averageEndOfTurnCount: mean((fixture) => fixture.endOfTurnCount, 2) ?? 0,
    averageFinalCount: mean((fixture) => fixture.finalCount, 2) ?? 0,
    averageSpeakerTurnMatchRate: mean((fixture) => fixture.speakerTurns?.patternMatchRate),
    averageTermRecall: mean((fixture) => fixture.expectedTerms.recall) ?? 0,
    averagePostSpeechTimeToEndOfTurnMs: mean((fixture) => fixture.postSpeechTimeToEndOfTurnMs, 2),
    averagePostSpeechTimeToFirstFinalMs: mean((fixture) => fixture.postSpeechTimeToFirstFinalMs, 2),
    averageTimeToEndOfTurnMs: mean((fixture) => fixture.timeToEndOfTurnMs, 2),
    averageTimeToFirstFinalMs: mean((fixture) => fixture.timeToFirstFinalMs, 2),
    averageTimeToFirstPartialMs: mean((fixture) => fixture.timeToFirstPartialMs, 2),
    averageWordErrorRate: mean(wordErrorRateOf) ?? 0,
    fixtureCount,
    fixturesWithErrors: fixtures.filter((fixture) => fixture.errorCount > 0).length,
    fixturesWithFragmentation: fixtures.filter((fixture) => fixture.fragmentationCount > 0).length,
    groupSummaries: calculateGroupSummary(fixtures),
    passCount,
    passRate: fixtureCount > 0 ? roundMetric4(passCount / fixtureCount) ?? 0 : 0,
    totalErrorCount: fixtures.reduce((total, fixture) => total + fixture.errorCount, 0),
    wordAccuracyRate: fixtureCount > 0 ? roundMetric4(1 - (average2(fixtures.map(wordErrorRateOf)) ?? 0)) ?? 0 : 0
  };
};
|
|
44091
|
+
/**
 * Checks a benchmark report against optional acceptance thresholds.
 *
 * Supported thresholds: `overallPassRate`, `termRecall`, `wordAccuracyRate`
 * (all compared against `report.summary`) and `groupPassRate`, a map from group
 * name to `{ passRate?, wordAccuracyRate? }`. Group thresholds are only checked
 * for groups that appear in `report.summary.groupSummaries`.
 *
 * @returns `{ adapterId, failures, passed, score }`, where `score` is
 *   0.45 * passRate + 0.35 * wordAccuracyRate + 0.2 * averageTermRecall,
 *   rounded to 3 decimals.
 */
var evaluateSTTBenchmarkAcceptance = (report, thresholds = {}) => {
  const asPercent = (ratio) => `${(ratio * 100).toFixed(2)}%`;
  const failures = [];
  const {
    overallPassRate,
    termRecall: minTermRecall,
    wordAccuracyRate: minWordAccuracy,
    groupPassRate: groupThresholds
  } = thresholds;
  if (overallPassRate !== undefined && report.summary.passRate < overallPassRate) {
    failures.push(`overall passRate ${asPercent(report.summary.passRate)} below ${asPercent(overallPassRate)}`);
  }
  if (minTermRecall !== undefined && report.summary.averageTermRecall < minTermRecall) {
    failures.push(`overall term recall ${report.summary.averageTermRecall.toFixed(4)} below ${minTermRecall.toFixed(4)}`);
  }
  if (minWordAccuracy !== undefined && report.summary.wordAccuracyRate < minWordAccuracy) {
    failures.push(`overall word accuracy ${asPercent(report.summary.wordAccuracyRate)} below ${asPercent(minWordAccuracy)}`);
  }
  for (const groupSummary of groupThresholds ? report.summary.groupSummaries : []) {
    const groupLimit = groupThresholds[groupSummary.group];
    if (!groupLimit) {
      continue;
    }
    if (groupLimit.passRate !== undefined && groupSummary.passRate < groupLimit.passRate) {
      failures.push(`${groupSummary.group} passRate ${asPercent(groupSummary.passRate)} below ${asPercent(groupLimit.passRate)}`);
    }
    if (groupLimit.wordAccuracyRate !== undefined && groupSummary.wordAccuracyRate < groupLimit.wordAccuracyRate) {
      failures.push(`${groupSummary.group} wordAccuracy ${asPercent(groupSummary.wordAccuracyRate)} below ${asPercent(groupLimit.wordAccuracyRate)}`);
    }
  }
  const weightedScore = report.summary.passRate * 0.45 + report.summary.wordAccuracyRate * 0.35 + report.summary.averageTermRecall * 0.2;
  const score = roundMetric4(weightedScore, 3) ?? 0;
  return {
    adapterId: report.adapterId,
    failures,
    passed: failures.length === 0,
    score
  };
};
|
|
44129
|
+
var compareSTTBenchmarks = (reports) => {
|
|
44130
|
+
const entries = reports.map((report) => ({
|
|
44131
|
+
adapterId: report.adapterId,
|
|
44132
|
+
summary: report.summary
|
|
44133
|
+
}));
|
|
44134
|
+
const bestByMetric = (selectMetric, direction) => entries.reduce((best, entry) => {
|
|
44135
|
+
if (!best) {
|
|
44136
|
+
return entry;
|
|
44137
|
+
}
|
|
44138
|
+
const next = selectMetric(entry);
|
|
44139
|
+
const current = selectMetric(best);
|
|
44140
|
+
if (direction === "max" ? next > current : next < current) {
|
|
44141
|
+
return entry;
|
|
44142
|
+
}
|
|
44143
|
+
return best;
|
|
44144
|
+
}, undefined);
|
|
44145
|
+
return {
|
|
44146
|
+
bestByPassRate: bestByMetric((entry) => entry.summary.passRate, "max"),
|
|
44147
|
+
bestByTermRecall: bestByMetric((entry) => entry.summary.averageTermRecall, "max"),
|
|
44148
|
+
bestByWordErrorRate: bestByMetric((entry) => entry.summary.averageWordErrorRate, "min"),
|
|
44149
|
+
entries
|
|
44150
|
+
};
|
|
44151
|
+
};
|
|
44152
|
+
/**
 * Runs one STT adapter against every fixture, one fixture at a time, and returns
 * the per-fixture results together with their summary.
 *
 * Options passed to each fixture run are the shared `options` overlaid with
 * `options.fixtureOptions[fixture.id]` when such an entry exists.
 *
 * @returns `{ adapterId, fixtures, generatedAt, summary }`; `generatedAt` is the
 *   Date.now() timestamp taken after all fixtures have run.
 */
var runSTTAdapterBenchmark = async ({
  adapter,
  adapterId,
  fixtures,
  options = {}
}) => {
  const fixtureResults = [];
  for (const fixture of fixtures) {
    const perFixtureOverrides = options.fixtureOptions?.[fixture.id] ?? {};
    const runOptions = { ...options, ...perFixtureOverrides };
    const runStartedAt = Date.now();
    const rawResult = await runSTTAdapterFixture(adapter, fixture, runOptions);
    const elapsedMs = Date.now() - runStartedAt;
    fixtureResults.push(toFixtureBenchmarkResult(fixture, rawResult, elapsedMs));
  }
  return {
    adapterId,
    fixtures: fixtureResults,
    generatedAt: Date.now(),
    summary: summarizeSTTBenchmark(adapterId, fixtureResults)
  };
};
|
|
44174
|
+
/**
 * Aggregates several benchmark reports of the same adapter into per-fixture
 * stability statistics.
 *
 * Results are grouped by `fixtureId` across all reports. Each fixture aggregate
 * carries its average pass rate, average/best/worst word error rate, average
 * elapsed time, and the group/tags/title of its first recorded result. In the
 * series summary a fixture counts as "stable" when its average pass rate is
 * exactly 1 and as "flaky" when it is strictly between 0 and 1.
 *
 * @param input `{ adapterId, reports }`.
 */
var summarizeSTTBenchmarkSeries = (input) => {
  const runsByFixture = new Map();
  for (const fixtureResult of input.reports.flatMap((report) => report.fixtures)) {
    const runs = runsByFixture.get(fixtureResult.fixtureId) ?? [];
    runs.push(fixtureResult);
    runsByFixture.set(fixtureResult.fixtureId, runs);
  }
  const fixtureAggregates = Array.from(runsByFixture.entries(), ([fixtureId, runs]) => {
    const wordErrorRates = runs.map((run) => run.accuracy.wordErrorRate);
    const passCount = runs.filter((run) => run.passes).length;
    const [firstRun] = runs;
    const hasRates = wordErrorRates.length > 0;
    return {
      averageElapsedMs: roundMetric4(average2(runs.map((run) => run.elapsedMs)), 2) ?? 0,
      averagePassRate: roundMetric4(runs.length > 0 ? passCount / runs.length : 0) ?? 0,
      averageWordErrorRate: roundMetric4(average2(wordErrorRates)) ?? 0,
      bestWordErrorRate: roundMetric4(hasRates ? Math.min(...wordErrorRates) : 0) ?? 0,
      fixtureId,
      group: firstRun.group,
      passCount,
      runCount: runs.length,
      tags: firstRun.tags,
      title: firstRun.title,
      worstWordErrorRate: roundMetric4(hasRates ? Math.max(...wordErrorRates) : 0) ?? 0
    };
  });
  let totalRunCount = 0;
  for (const report of input.reports) {
    totalRunCount += report.fixtures.length;
  }
  let totalPassCount = 0;
  for (const report of input.reports) {
    totalPassCount += report.summary.passCount;
  }
  const meanOfAggregates = (select, digits) => roundMetric4(average2(fixtureAggregates.map((aggregate) => select(aggregate))), digits) ?? 0;
  return {
    adapterId: input.adapterId,
    fixtures: fixtureAggregates,
    generatedAt: Date.now(),
    runCount: input.reports.length,
    summary: {
      adapterId: input.adapterId,
      averageElapsedMs: meanOfAggregates((aggregate) => aggregate.averageElapsedMs, 2),
      averagePassRate: meanOfAggregates((aggregate) => aggregate.averagePassRate),
      averageWordErrorRate: meanOfAggregates((aggregate) => aggregate.averageWordErrorRate),
      fixtureCount: fixtureAggregates.length,
      flakyFixtureCount: fixtureAggregates.filter((aggregate) => aggregate.averagePassRate > 0 && aggregate.averagePassRate < 1).length,
      generatedRunCount: input.reports.length,
      stableFixtureCount: fixtureAggregates.filter((aggregate) => aggregate.averagePassRate === 1).length,
      totalPassCount,
      totalRunCount
    }
  };
};
|
|
44222
|
+
/**
 * Runs the same adapter/fixture benchmark several times, one run after another,
 * and aggregates the per-run reports with summarizeSTTBenchmarkSeries.
 *
 * `runs` is floored and raised to at least 1. A missing or non-finite value
 * (undefined, NaN, Infinity) falls back to a single run. Without that guard
 * `Math.max(1, NaN)` evaluates to NaN, which made the loop condition false and
 * silently summarized an empty series, while Infinity never terminated.
 *
 * @param adapter The STT adapter under test.
 * @param adapterId Identifier copied into every report.
 * @param fixtures Fixtures to run on each iteration.
 * @param options Benchmark options forwarded unchanged to every run.
 * @param runs Requested number of benchmark runs.
 */
var runSTTAdapterBenchmarkSeries = async ({
  adapter,
  adapterId,
  fixtures,
  options = {},
  runs
}) => {
  const requestedRuns = Math.floor(runs);
  const runCount = Number.isFinite(requestedRuns) ? Math.max(1, requestedRuns) : 1;
  const reports = [];
  for (let runIndex = 0; runIndex < runCount; runIndex += 1) {
    reports.push(await runSTTAdapterBenchmark({
      adapter,
      adapterId,
      fixtures,
      options
    }));
  }
  return summarizeSTTBenchmarkSeries({
    adapterId,
    reports
  });
};
|
|
44244
|
+
|
|
44245
|
+
// src/multilingualProof.ts
|
|
44246
|
+
/**
 * Plain arithmetic mean of `values`; returns 0 for an empty list.
 *
 * Unlike average2, no entries are filtered out, so every value is expected to
 * already be a number.
 */
var average3 = (values) => {
  const count = values.length;
  if (count === 0) {
    return 0;
  }
  const total = values.reduce((runningTotal, value) => runningTotal + value, 0);
  return total / count;
};
|
|
44254
|
+
/**
 * Computes unrounded aggregate metrics for a set of per-fixture benchmark results.
 *
 * A missing word error rate or term recall on a result is counted as 0. Word
 * accuracy is derived per result as `1 - wordErrorRate`. An empty input yields
 * all-zero metrics.
 *
 * @returns `{ averageTermRecall, averageWordAccuracyRate, averageWordErrorRate,
 *   fixtureCount, passCount, passRate }`.
 */
var computeMetrics = (results) => {
  const fixtureCount = results.length;
  if (fixtureCount === 0) {
    return {
      averageTermRecall: 0,
      averageWordAccuracyRate: 0,
      averageWordErrorRate: 0,
      fixtureCount: 0,
      passCount: 0,
      passRate: 0
    };
  }
  const wordErrorRateOf = (result) => result.accuracy.wordErrorRate ?? 0;
  const wordErrorRates = results.map((result) => wordErrorRateOf(result));
  const wordAccuracyRates = results.map((result) => 1 - wordErrorRateOf(result));
  const termRecalls = results.map((result) => result.expectedTerms.recall ?? 0);
  let passCount = 0;
  for (const result of results) {
    if (result.passes) {
      passCount += 1;
    }
  }
  return {
    averageTermRecall: average3(termRecalls),
    averageWordAccuracyRate: average3(wordAccuracyRates),
    averageWordErrorRate: average3(wordErrorRates),
    fixtureCount,
    passCount,
    passRate: passCount / fixtureCount
  };
};
|
|
44278
|
+
var resolveLanguageThreshold = (language, defaults, perLanguage) => {
|
|
44279
|
+
const explicit = perLanguage?.find((entry) => entry.language.toLowerCase() === language.toLowerCase());
|
|
44280
|
+
return {
|
|
44281
|
+
label: explicit?.label,
|
|
44282
|
+
language,
|
|
44283
|
+
maxAverageWordErrorRate: explicit?.maxAverageWordErrorRate ?? defaults?.maxAverageWordErrorRate,
|
|
44284
|
+
minAverageWordAccuracyRate: explicit?.minAverageWordAccuracyRate ?? defaults?.minAverageWordAccuracyRate,
|
|
44285
|
+
minPassRate: explicit?.minPassRate ?? defaults?.minPassRate,
|
|
44286
|
+
minTermRecall: explicit?.minTermRecall ?? defaults?.minTermRecall
|
|
44287
|
+
};
|
|
44288
|
+
};
|
|
44289
|
+
/**
 * Evaluates one language bucket of fixture results against its thresholds.
 *
 * Only thresholds that are defined are checked, in this order: max average WER,
 * min average word accuracy, min pass rate, min term recall. Each breach adds one
 * human-readable failure message prefixed with the language code.
 *
 * @returns `{ applied, failures, fixtureIds, label, language, metrics, passes }`.
 */
var evaluateLanguage = (language, fixtureResults, thresholds) => {
  const metrics = computeMetrics(fixtureResults);
  const fixed3 = (value) => value.toFixed(3);
  const checks = [
    {
      limit: thresholds.maxAverageWordErrorRate,
      breached: (limit) => metrics.averageWordErrorRate > limit,
      describe: (limit) => `${language}: avg WER ${fixed3(metrics.averageWordErrorRate)} exceeds budget ${fixed3(limit)}.`
    },
    {
      limit: thresholds.minAverageWordAccuracyRate,
      breached: (limit) => metrics.averageWordAccuracyRate < limit,
      describe: (limit) => `${language}: avg WAR ${fixed3(metrics.averageWordAccuracyRate)} below floor ${fixed3(limit)}.`
    },
    {
      limit: thresholds.minPassRate,
      breached: (limit) => metrics.passRate < limit,
      describe: (limit) => `${language}: pass rate ${fixed3(metrics.passRate)} below floor ${fixed3(limit)}.`
    },
    {
      limit: thresholds.minTermRecall,
      breached: (limit) => metrics.averageTermRecall < limit,
      describe: (limit) => `${language}: term recall ${fixed3(metrics.averageTermRecall)} below floor ${fixed3(limit)}.`
    }
  ];
  const failures = [];
  for (const check of checks) {
    if (check.limit !== undefined && check.breached(check.limit)) {
      failures.push(check.describe(check.limit));
    }
  }
  return {
    applied: thresholds,
    failures,
    fixtureIds: fixtureResults.map((result) => result.fixtureId),
    label: thresholds.label,
    language,
    metrics,
    passes: failures.length === 0
  };
};
|
|
44314
|
+
var collectFixtures = async (options) => {
|
|
44315
|
+
if (options.fixtures !== undefined) {
|
|
44316
|
+
return options.fixtures.slice();
|
|
44317
|
+
}
|
|
44318
|
+
const loaded = await loadVoiceTestFixtures(options.fixtureDirectories);
|
|
44319
|
+
return options.filter ? loaded.filter(options.filter) : loaded;
|
|
44320
|
+
};
|
|
44321
|
+
var groupByLanguage = (results, fixtures) => {
|
|
44322
|
+
const lookup = new Map;
|
|
44323
|
+
for (const fixture of fixtures) {
|
|
44324
|
+
lookup.set(fixture.id, fixture.language ?? "unknown");
|
|
44325
|
+
}
|
|
44326
|
+
const grouped = new Map;
|
|
44327
|
+
for (const result of results) {
|
|
44328
|
+
const language = lookup.get(result.fixtureId) ?? "unknown";
|
|
44329
|
+
const bucket = grouped.get(language) ?? [];
|
|
44330
|
+
bucket.push(result);
|
|
44331
|
+
grouped.set(language, bucket);
|
|
44332
|
+
}
|
|
44333
|
+
return grouped;
|
|
44334
|
+
};
|
|
44335
|
+
/**
 * Runs every configured STT adapter against the fixture corpus and evaluates the
 * results per language against the configured thresholds.
 *
 * Adapters are benchmarked one after another. For each adapter, results are
 * grouped by fixture language; languages with no results are skipped. An adapter
 * passes when none of its language reports produced a failure, and the overall
 * report passes when every adapter passes.
 *
 * @throws Error when `options.adapters` is empty or no fixtures were found.
 * @returns `{ adapters, generatedAt, passes, summary }`.
 */
var runVoiceMultilingualProof = async (options) => {
  if (options.adapters.length === 0) {
    throw new Error("runVoiceMultilingualProof requires at least one adapter entry.");
  }
  const fixtures = await collectFixtures(options);
  if (fixtures.length === 0) {
    throw new Error("runVoiceMultilingualProof found zero fixtures. Did you set VOICE_FIXTURE_DIR or pass fixtures/fixtureDirectories?");
  }
  const languageCodes = new Set(fixtures.map((fixture) => fixture.language ?? "unknown"));
  const adapterReports = [];
  for (const { adapter, adapterId, benchmarkOptions } of options.adapters) {
    const benchmark = await runSTTAdapterBenchmark({
      adapter,
      adapterId,
      fixtures,
      options: benchmarkOptions
    });
    const resultsByLanguage = groupByLanguage(benchmark.fixtures, fixtures);
    const languageReports = [];
    languageCodes.forEach((language) => {
      const languageResults = resultsByLanguage.get(language) ?? [];
      if (languageResults.length > 0) {
        const thresholds = resolveLanguageThreshold(language, options.defaultThresholds, options.perLanguage);
        languageReports.push(evaluateLanguage(language, languageResults, thresholds));
      }
    });
    const overall = computeMetrics(benchmark.fixtures);
    const failures = languageReports.flatMap((languageReport) => languageReport.failures);
    adapterReports.push({
      adapterId,
      benchmark,
      failures,
      fixtureCount: benchmark.fixtures.length,
      languageReports,
      overall,
      passes: failures.length === 0
    });
  }
  const failedAdapters = [];
  for (const adapterReport of adapterReports) {
    if (!adapterReport.passes) {
      failedAdapters.push(adapterReport.adapterId);
    }
  }
  return {
    adapters: adapterReports,
    generatedAt: Date.now(),
    passes: failedAdapters.length === 0,
    summary: {
      adapterCount: adapterReports.length,
      failedAdapters,
      fixtureCount: fixtures.length,
      languageCount: languageCodes.size
    }
  };
};
|
|
44386
|
+
var renderVoiceMultilingualProofMarkdown = (report) => {
|
|
44387
|
+
const lines = [
|
|
44388
|
+
`# Voice Multilingual STT Proof`,
|
|
44389
|
+
"",
|
|
44390
|
+
`Generated: ${new Date(report.generatedAt).toISOString()}`,
|
|
44391
|
+
`Status: ${report.passes ? "**PASS**" : "**FAIL**"}`,
|
|
44392
|
+
`Adapters: ${String(report.summary.adapterCount)}; Fixtures: ${String(report.summary.fixtureCount)}; Languages: ${String(report.summary.languageCount)}.`,
|
|
44393
|
+
""
|
|
44394
|
+
];
|
|
44395
|
+
if (report.summary.failedAdapters.length > 0) {
|
|
44396
|
+
lines.push(`Failed adapters: ${report.summary.failedAdapters.join(", ")}.`, "");
|
|
44397
|
+
}
|
|
44398
|
+
for (const adapter of report.adapters) {
|
|
44399
|
+
lines.push(`## ${adapter.adapterId} \u2014 ${adapter.passes ? "pass" : "fail"}`, "", `- Fixtures: ${String(adapter.fixtureCount)}`, `- Avg WER: ${adapter.overall.averageWordErrorRate.toFixed(3)}`, `- Avg WAR: ${adapter.overall.averageWordAccuracyRate.toFixed(3)}`, `- Pass rate: ${(adapter.overall.passRate * 100).toFixed(1)}%`, "", `| Language | Fixtures | Avg WER | Avg WAR | Pass rate | Threshold | Status |`, `| --- | ---: | ---: | ---: | ---: | --- | --- |`);
|
|
44400
|
+
for (const language of adapter.languageReports) {
|
|
44401
|
+
const threshold = [];
|
|
44402
|
+
if (language.applied.maxAverageWordErrorRate !== undefined) {
|
|
44403
|
+
threshold.push(`WER<=${language.applied.maxAverageWordErrorRate.toFixed(3)}`);
|
|
44404
|
+
}
|
|
44405
|
+
if (language.applied.minAverageWordAccuracyRate !== undefined) {
|
|
44406
|
+
threshold.push(`WAR>=${language.applied.minAverageWordAccuracyRate.toFixed(3)}`);
|
|
44407
|
+
}
|
|
44408
|
+
if (language.applied.minPassRate !== undefined) {
|
|
44409
|
+
threshold.push(`pass>=${language.applied.minPassRate.toFixed(3)}`);
|
|
44410
|
+
}
|
|
44411
|
+
if (language.applied.minTermRecall !== undefined) {
|
|
44412
|
+
threshold.push(`recall>=${language.applied.minTermRecall.toFixed(3)}`);
|
|
44413
|
+
}
|
|
44414
|
+
lines.push(`| ${language.language}${language.label ? ` (${language.label})` : ""} | ${String(language.metrics.fixtureCount)} | ${language.metrics.averageWordErrorRate.toFixed(3)} | ${language.metrics.averageWordAccuracyRate.toFixed(3)} | ${(language.metrics.passRate * 100).toFixed(1)}% | ${threshold.join(", ") || "\u2014"} | ${language.passes ? "pass" : "fail"} |`);
|
|
44415
|
+
}
|
|
44416
|
+
if (adapter.failures.length > 0) {
|
|
44417
|
+
lines.push("", "Failures:");
|
|
44418
|
+
for (const failure of adapter.failures) {
|
|
44419
|
+
lines.push(`- ${failure}`);
|
|
44420
|
+
}
|
|
44421
|
+
}
|
|
44422
|
+
lines.push("");
|
|
44423
|
+
}
|
|
44424
|
+
return lines.join(`
|
|
44425
|
+
`);
|
|
44426
|
+
};
|
|
44427
|
+
var buildVoiceMultilingualProofReadinessCheck = (report, options = {}) => {
|
|
44428
|
+
const label = options.label ?? "Multilingual STT proof";
|
|
44429
|
+
if (report.adapters.length === 0) {
|
|
44430
|
+
return {
|
|
44431
|
+
detail: "No STT adapters were exercised against the multilingual corpus.",
|
|
44432
|
+
href: options.baseHref,
|
|
44433
|
+
label,
|
|
44434
|
+
status: "warn",
|
|
44435
|
+
value: 0
|
|
44436
|
+
};
|
|
44437
|
+
}
|
|
44438
|
+
const failedAdapters = report.summary.failedAdapters;
|
|
44439
|
+
if (failedAdapters.length === 0) {
|
|
44440
|
+
const passingDetail = report.adapters.map((adapter) => `${adapter.adapterId}: WER ${adapter.overall.averageWordErrorRate.toFixed(3)} across ${String(adapter.fixtureCount)} fixtures`).join("; ");
|
|
44441
|
+
return {
|
|
44442
|
+
detail: passingDetail,
|
|
44443
|
+
href: options.baseHref,
|
|
44444
|
+
label,
|
|
44445
|
+
status: "pass",
|
|
44446
|
+
value: report.summary.adapterCount
|
|
44447
|
+
};
|
|
44448
|
+
}
|
|
44449
|
+
return {
|
|
44450
|
+
detail: `Failed adapters: ${failedAdapters.join(", ")}. ${report.adapters.filter((adapter) => !adapter.passes).flatMap((adapter) => adapter.failures.slice(0, 3)).join(" ")}`,
|
|
44451
|
+
href: options.baseHref,
|
|
44452
|
+
label,
|
|
44453
|
+
status: "fail",
|
|
44454
|
+
value: failedAdapters.length
|
|
44455
|
+
};
|
|
44456
|
+
};
|
|
43376
44457
|
export {
|
|
43377
44458
|
writeVoiceProofPack,
|
|
43378
44459
|
writeVoiceMediaPipelineArtifacts,
|
|
@@ -43447,6 +44528,7 @@ export {
|
|
|
43447
44528
|
runVoiceProfileSwitchPolicyProof,
|
|
43448
44529
|
runVoicePhoneAgentProductionSmokeContract,
|
|
43449
44530
|
runVoiceOutcomeContractSuite,
|
|
44531
|
+
runVoiceMultilingualProof,
|
|
43450
44532
|
runVoiceCommandProofTargets,
|
|
43451
44533
|
runVoiceCommandProofTarget,
|
|
43452
44534
|
runVoiceCampaignReadinessProof,
|
|
@@ -43535,6 +44617,7 @@ export {
|
|
|
43535
44617
|
renderVoiceOperationalStatusHTML,
|
|
43536
44618
|
renderVoiceObservabilityExportReplayHTML,
|
|
43537
44619
|
renderVoiceObservabilityExportMarkdown,
|
|
44620
|
+
renderVoiceMultilingualProofMarkdown,
|
|
43538
44621
|
renderVoiceMonitorMarkdown,
|
|
43539
44622
|
renderVoiceMonitorHTML,
|
|
43540
44623
|
renderVoiceMediaPipelineMarkdown,
|
|
@@ -44068,6 +45151,7 @@ export {
|
|
|
44068
45151
|
buildVoiceObservabilityExportDeliveryHistory,
|
|
44069
45152
|
buildVoiceObservabilityExport,
|
|
44070
45153
|
buildVoiceObservabilityArtifactIndex,
|
|
45154
|
+
buildVoiceMultilingualProofReadinessCheck,
|
|
44071
45155
|
buildVoiceMonitorRunReport,
|
|
44072
45156
|
buildVoiceMediaPipelineReport,
|
|
44073
45157
|
buildVoiceMediaPipelineReadinessChecks,
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
import type { STTAdapter } from "./types";
|
|
2
|
+
import { type VoiceTestFixture } from "./testing/fixtures";
|
|
3
|
+
import { type VoiceSTTBenchmarkOptions, type VoiceSTTBenchmarkReport } from "./testing/benchmark";
|
|
4
|
+
/**
 * Language identifier used by the `language` field of per-language thresholds
 * and per-language reports. It is a plain `string` alias; the type itself does
 * not enforce any particular tag format.
 */
export type VoiceMultilingualLanguageCode = string;
|
|
5
|
+
/**
 * Budgets applied to a single language.
 *
 * `language` is the only required field; `label` and all four numeric budgets
 * are optional. Going by the field names, `maxAverageWordErrorRate` is an upper
 * bound while the three `min*` fields are lower bounds.
 * NOTE(review): the numeric scale (fraction vs. percent) and how an omitted
 * budget is treated are not visible in this declaration — confirm against the
 * implementation.
 */
export type VoiceMultilingualProofLanguageThresholds = {
|
|
6
|
+
label?: string;
|
|
7
|
+
language: VoiceMultilingualLanguageCode;
|
|
8
|
+
maxAverageWordErrorRate?: number;
|
|
9
|
+
minAverageWordAccuracyRate?: number;
|
|
10
|
+
minPassRate?: number;
|
|
11
|
+
minTermRecall?: number;
|
|
12
|
+
};
|
|
13
|
+
/**
 * The budget fields of `VoiceMultilingualProofLanguageThresholds` with `label`
 * and `language` removed. This is the type of
 * `VoiceMultilingualProofOptions.defaultThresholds`.
 */
export type VoiceMultilingualProofDefaultThresholds = Omit<VoiceMultilingualProofLanguageThresholds, "label" | "language">;
|
|
14
|
+
/**
 * One STT adapter to include in a multilingual proof run.
 *
 * `adapter` is the adapter instance and `adapterId` is a caller-chosen string
 * identifier for it. `benchmarkOptions` optionally carries
 * `VoiceSTTBenchmarkOptions` for this entry.
 * NOTE(review): presumably `adapterId` is what appears as `adapterId` in the
 * resulting adapter report — confirm against the implementation.
 */
export type VoiceMultilingualProofAdapterEntry = {
|
|
15
|
+
adapter: STTAdapter;
|
|
16
|
+
adapterId: string;
|
|
17
|
+
benchmarkOptions?: VoiceSTTBenchmarkOptions;
|
|
18
|
+
};
|
|
19
|
+
/**
 * Input to `runVoiceMultilingualProof`.
 *
 * Only `adapters` is required. Fixtures can be described either as in-memory
 * `fixtures` or as one or more `fixtureDirectories`, and `filter` is a
 * predicate over a single fixture. Thresholds can be given per language via
 * `perLanguage` and/or as `defaultThresholds`.
 * NOTE(review): precedence between `fixtures` and `fixtureDirectories`, and
 * between `perLanguage` and `defaultThresholds`, is not visible here — confirm
 * against the implementation.
 */
export type VoiceMultilingualProofOptions = {
|
|
20
|
+
adapters: readonly VoiceMultilingualProofAdapterEntry[];
|
|
21
|
+
defaultThresholds?: VoiceMultilingualProofDefaultThresholds;
|
|
22
|
+
filter?: (fixture: VoiceTestFixture) => boolean;
|
|
23
|
+
fixtureDirectories?: string | readonly string[];
|
|
24
|
+
fixtures?: readonly VoiceTestFixture[];
|
|
25
|
+
perLanguage?: readonly VoiceMultilingualProofLanguageThresholds[];
|
|
26
|
+
};
|
|
27
|
+
/**
 * Aggregate metrics over a set of fixtures: three averages, the fixture count,
 * the pass count and the pass rate.
 *
 * The same shape is used for one language
 * (`VoiceMultilingualProofLanguageReport.metrics`) and for a whole adapter
 * (`VoiceMultilingualProofAdapterReport.overall`).
 * NOTE(review): presumably `passRate` is `passCount / fixtureCount` — confirm
 * against the implementation.
 */
export type VoiceMultilingualProofLanguageMetrics = {
|
|
28
|
+
averageTermRecall: number;
|
|
29
|
+
averageWordAccuracyRate: number;
|
|
30
|
+
averageWordErrorRate: number;
|
|
31
|
+
fixtureCount: number;
|
|
32
|
+
passCount: number;
|
|
33
|
+
passRate: number;
|
|
34
|
+
};
|
|
35
|
+
/**
 * Result for one language within one adapter's report.
 *
 * It carries the thresholds recorded as `applied`, the measured `metrics`, the
 * ids of the fixtures involved, a list of failure messages and an overall
 * `passes` flag.
 * NOTE(review): presumably `passes` is true exactly when `failures` is empty —
 * confirm against the implementation.
 */
export type VoiceMultilingualProofLanguageReport = {
|
|
36
|
+
applied: VoiceMultilingualProofLanguageThresholds;
|
|
37
|
+
failures: readonly string[];
|
|
38
|
+
fixtureIds: readonly string[];
|
|
39
|
+
label?: string;
|
|
40
|
+
language: VoiceMultilingualLanguageCode;
|
|
41
|
+
metrics: VoiceMultilingualProofLanguageMetrics;
|
|
42
|
+
passes: boolean;
|
|
43
|
+
};
|
|
44
|
+
/**
 * Result for one adapter.
 *
 * It holds the underlying `VoiceSTTBenchmarkReport`, the per-language reports,
 * the `overall` metrics, a list of failure messages and a `passes` flag.
 * `buildVoiceMultilingualProofReadinessCheck` reads `adapterId`, `fixtureCount`,
 * `overall.averageWordErrorRate`, `passes` and `failures` from this shape.
 */
export type VoiceMultilingualProofAdapterReport = {
|
|
45
|
+
adapterId: string;
|
|
46
|
+
benchmark: VoiceSTTBenchmarkReport;
|
|
47
|
+
failures: readonly string[];
|
|
48
|
+
fixtureCount: number;
|
|
49
|
+
languageReports: readonly VoiceMultilingualProofLanguageReport[];
|
|
50
|
+
overall: VoiceMultilingualProofLanguageMetrics;
|
|
51
|
+
passes: boolean;
|
|
52
|
+
};
|
|
53
|
+
/**
 * Top-level result returned by `runVoiceMultilingualProof`.
 *
 * It contains one report per adapter, an overall `passes` flag, and a `summary`
 * with the adapter, fixture and language counts plus the ids of the adapters
 * that failed.
 * NOTE(review): `generatedAt` is a number; presumably an epoch timestamp in
 * milliseconds — confirm against the implementation.
 */
export type VoiceMultilingualProofReport = {
|
|
54
|
+
adapters: readonly VoiceMultilingualProofAdapterReport[];
|
|
55
|
+
generatedAt: number;
|
|
56
|
+
passes: boolean;
|
|
57
|
+
summary: {
|
|
58
|
+
adapterCount: number;
|
|
59
|
+
failedAdapters: readonly string[];
|
|
60
|
+
fixtureCount: number;
|
|
61
|
+
languageCount: number;
|
|
62
|
+
};
|
|
63
|
+
};
|
|
64
|
+
/**
 * Runs the configured STT adapters against multilingual fixtures and resolves
 * with a `VoiceMultilingualProofReport`.
 *
 * @param options - Adapters, fixture sources and thresholds for the run.
 * @returns A promise for the report covering every adapter.
 */
export declare const runVoiceMultilingualProof: (options: VoiceMultilingualProofOptions) => Promise<VoiceMultilingualProofReport>;
|
|
65
|
+
/**
 * Renders a `VoiceMultilingualProofReport` as a string (Markdown, going by the
 * function name).
 *
 * @param report - The report to render.
 * @returns The rendered text.
 */
export declare const renderVoiceMultilingualProofMarkdown: (report: VoiceMultilingualProofReport) => string;
|
|
66
|
+
/**
 * Optional settings for `buildVoiceMultilingualProofReadinessCheck`.
 *
 * `baseHref` is copied to the `href` of the returned check.
 * NOTE(review): `label` presumably overrides the check's label; the default
 * used when it is omitted is not visible here — confirm against the
 * implementation.
 */
export type VoiceMultilingualProofReadinessOptions = {
|
|
67
|
+
baseHref?: string;
|
|
68
|
+
label?: string;
|
|
69
|
+
};
|
|
70
|
+
/**
 * A single readiness-check row derived from a multilingual proof report.
 *
 * `status` is one of `"fail"`, `"pass"` or `"warn"`, and `detail` is a
 * human-readable explanation. `href` and `value` are optional; `value` may be
 * a number or a string.
 */
export type VoiceMultilingualProofReadinessCheck = {
|
|
71
|
+
detail: string;
|
|
72
|
+
href?: string;
|
|
73
|
+
label: string;
|
|
74
|
+
status: "fail" | "pass" | "warn";
|
|
75
|
+
value?: number | string;
|
|
76
|
+
};
|
|
77
|
+
/**
 * Collapses a `VoiceMultilingualProofReport` into one readiness check.
 *
 * As implemented in the bundled JavaScript:
 * - `"pass"` when `report.summary.failedAdapters` is empty; `value` is the
 *   adapter count and `detail` lists each adapter's average WER and fixture
 *   count.
 * - `"fail"` otherwise; `value` is the number of failed adapters and `detail`
 *   names them, followed by up to three failure messages per failing adapter.
 * - `"warn"` with `value` 0 and a "No STT adapters were exercised" detail.
 *   NOTE(review): the exact condition for this branch should be confirmed
 *   against the implementation.
 *
 * @param report - The proof report to summarise.
 * @param options - Optional label and link for the check.
 * @returns The readiness check.
 */
export declare const buildVoiceMultilingualProofReadinessCheck: (report: VoiceMultilingualProofReport, options?: VoiceMultilingualProofReadinessOptions) => VoiceMultilingualProofReadinessCheck;
|