@absolutejs/voice 0.0.16 → 0.0.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/client/htmxBootstrap.d.ts +1 -0
- package/dist/client/htmxBootstrap.js +888 -0
- package/dist/index.js +52 -1
- package/dist/plugin.d.ts +12 -0
- package/dist/testing/accuracy.d.ts +13 -0
- package/dist/testing/benchmark.d.ts +76 -0
- package/dist/testing/fixtures.d.ts +21 -0
- package/dist/testing/index.d.ts +4 -0
- package/dist/testing/index.js +446 -0
- package/dist/testing/stt.d.ts +22 -0
- package/dist/types.d.ts +1 -0
- package/fixtures/manifest.json +32 -0
- package/fixtures/pcm/quietly-alone-clean.pcm +0 -0
- package/fixtures/pcm/rainstorms-noisy.pcm +0 -0
- package/fixtures/pcm/traveled-back-route-clean.pcm +0 -0
- package/package.json +19 -3
package/dist/index.js
CHANGED
|
@@ -71,6 +71,7 @@ var __decorateElement = (array, flags, name, decorators, target, extra) => {
|
|
|
71
71
|
|
|
72
72
|
// src/plugin.ts
|
|
73
73
|
import { Elysia } from "elysia";
|
|
74
|
+
import { resolve } from "path";
|
|
74
75
|
|
|
75
76
|
// src/htmx.ts
|
|
76
77
|
var DEFAULT_HTMX_TARGETS = {
|
|
@@ -680,6 +681,51 @@ var createVoiceSession = (options) => {
|
|
|
680
681
|
};
|
|
681
682
|
|
|
682
683
|
// src/plugin.ts
|
|
684
|
+
var HTMX_BOOTSTRAP_DIST_CANDIDATES = [
|
|
685
|
+
resolve(import.meta.dir, "client", "htmxBootstrap.js"),
|
|
686
|
+
resolve(import.meta.dir, "..", "dist", "client", "htmxBootstrap.js")
|
|
687
|
+
];
|
|
688
|
+
var HTMX_BOOTSTRAP_SOURCE_CANDIDATES = [
|
|
689
|
+
resolve(import.meta.dir, "client", "htmxBootstrap.ts"),
|
|
690
|
+
resolve(import.meta.dir, "..", "src", "client", "htmxBootstrap.ts")
|
|
691
|
+
];
|
|
692
|
+
/**
 * Lazily loads the browser bootstrap script served on the HTMX bootstrap route.
 *
 * Resolution order:
 *  1. the first existing prebuilt file in HTMX_BOOTSTRAP_DIST_CANDIDATES, returned as-is;
 *  2. the first existing source file in HTMX_BOOTSTRAP_SOURCE_CANDIDATES, bundled on
 *     the fly with `Bun.build` (ESM, minified, browser target).
 *
 * The in-flight / successful promise is memoized so the lookup runs at most once.
 * A failed attempt is NOT kept: the cache is cleared on rejection so that a later
 * call can retry instead of replaying the same error for the rest of the process.
 *
 * @returns a promise for the bootstrap script's JavaScript text.
 * @throws Error when the on-the-fly build fails or when no candidate file exists.
 */
var loadHTMXBootstrap = (() => {
  let cached = null;

  const load = async () => {
    for (const candidate of HTMX_BOOTSTRAP_DIST_CANDIDATES) {
      const asset = Bun.file(candidate);
      if (await asset.exists()) {
        return await asset.text();
      }
    }
    for (const candidate of HTMX_BOOTSTRAP_SOURCE_CANDIDATES) {
      const asset = Bun.file(candidate);
      if (!await asset.exists()) {
        continue;
      }
      const build = await Bun.build({
        entrypoints: [candidate],
        format: "esm",
        minify: true,
        target: "browser"
      });
      if (!build.success || build.outputs.length === 0) {
        const log = build.logs.map((entry) => entry.message).join("\n");
        throw new Error(`Failed to build the voice HTMX bootstrap bundle.${log ? `\n${log}` : ""}`);
      }
      // Only the first existing source candidate is built; its first output is the bundle.
      return await build.outputs[0].text();
    }
    throw new Error("Unable to locate the voice HTMX bootstrap client.");
  };

  return () => {
    if (cached) {
      return cached;
    }
    const pending = load();
    cached = pending;
    // Drop a rejected attempt from the cache. The caller still receives the
    // rejection through `pending`; this handler only resets the memo.
    pending.catch(() => {
      if (cached === pending) {
        cached = null;
      }
    });
    return pending;
  };
})();
|
|
683
729
|
// True only for typed arrays and DataView instances; primitives, null and plain
// ArrayBuffers are rejected.
var isArrayBufferView = (value) => {
  if (value === null || typeof value !== "object") {
    return false;
  }
  return ArrayBuffer.isView(value);
};
|
|
684
730
|
var isVoiceClientMessage = (value) => {
|
|
685
731
|
if (!value || typeof value !== "object" || !("type" in value)) {
|
|
@@ -755,6 +801,7 @@ var voice = (config) => {
|
|
|
755
801
|
const onTurn = normalizeOnTurn(config.onTurn);
|
|
756
802
|
const htmxOptions = config.htmx && typeof config.htmx === "object" ? config.htmx : undefined;
|
|
757
803
|
const htmxRoute = htmxOptions?.route ?? `${config.path}/htmx/session`;
|
|
804
|
+
const htmxBootstrapRoute = htmxOptions?.bootstrapRoute ?? `${config.path}/htmx/bootstrap.js`;
|
|
758
805
|
const htmxRenderers = resolveVoiceHTMXRenderers(config.htmx && config.htmx !== true ? config.htmx : undefined);
|
|
759
806
|
const htmxTargets = resolveVoiceHTMXTargets(htmxOptions?.targets);
|
|
760
807
|
const htmxRoutes = () => {
|
|
@@ -778,7 +825,11 @@ var voice = (config) => {
|
|
|
778
825
|
}, htmxRenderers, htmxTargets), {
|
|
779
826
|
headers: { "Content-Type": "text/html; charset=utf-8" }
|
|
780
827
|
});
|
|
781
|
-
})
|
|
828
|
+
}).get(htmxBootstrapRoute, async () => new Response(await loadHTMXBootstrap(), {
|
|
829
|
+
headers: {
|
|
830
|
+
"Content-Type": "application/javascript; charset=utf-8"
|
|
831
|
+
}
|
|
832
|
+
}));
|
|
782
833
|
};
|
|
783
834
|
return new Elysia({ name: "absolutejs-voice" }).ws(config.path, {
|
|
784
835
|
close: async (ws, code, reason) => {
|
package/dist/plugin.d.ts
CHANGED
|
@@ -47,6 +47,18 @@ export declare const voice: <TContext = unknown, TSession extends VoiceSessionRe
|
|
|
47
47
|
};
|
|
48
48
|
};
|
|
49
49
|
};
|
|
50
|
+
} & {
|
|
51
|
+
[x: string]: {
|
|
52
|
+
get: {
|
|
53
|
+
body: unknown;
|
|
54
|
+
params: {};
|
|
55
|
+
query: unknown;
|
|
56
|
+
headers: unknown;
|
|
57
|
+
response: {
|
|
58
|
+
200: Response;
|
|
59
|
+
};
|
|
60
|
+
};
|
|
61
|
+
};
|
|
50
62
|
}), {
|
|
51
63
|
derive: {};
|
|
52
64
|
resolve: {};
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import type { Transcript } from '../types';
|
|
2
|
+
export type VoiceTranscriptAccuracy = {
|
|
3
|
+
actualText: string;
|
|
4
|
+
charDistance: number;
|
|
5
|
+
charErrorRate: number;
|
|
6
|
+
expectedText: string;
|
|
7
|
+
passesThreshold: boolean;
|
|
8
|
+
threshold: number;
|
|
9
|
+
wordDistance: number;
|
|
10
|
+
wordErrorRate: number;
|
|
11
|
+
};
|
|
12
|
+
/**
 * Merges the final transcripts of a turn into a single string.
 *
 * Only transcripts whose `isFinal` flag is truthy are considered. Each text is
 * trimmed and whitespace-collapsed, empty texts are skipped, a text equal to or
 * contained in the previously kept text is dropped, and a text that contains the
 * previously kept text replaces it. The kept texts are joined with single spaces.
 *
 * @param transcripts transcripts in arrival order (partials are ignored).
 * @returns the merged text, or an empty string when there is no final text.
 */
export declare const mergeFinalTranscriptText: (transcripts: Transcript[]) => string;
|
|
13
|
+
/**
 * Scores a recognized transcript against the expected text.
 *
 * Both texts are normalized first: lowercased, every character that is not a
 * letter, number, whitespace or apostrophe becomes a space, and whitespace is
 * collapsed. Word-level and character-level Levenshtein distances are then
 * computed, and each error rate is the distance divided by the size of the
 * normalized expected text (0 when the expected text is empty).
 *
 * @param actualText text produced by the recognizer.
 * @param expectedText reference text.
 * @param threshold maximum word error rate that still passes; the implementation defaults to 0.35.
 * @returns distances, error rates, the normalized texts, and `passesThreshold`
 *          (`wordErrorRate <= threshold`).
 */
export declare const scoreTranscriptAccuracy: (actualText: string, expectedText: string, threshold?: number) => VoiceTranscriptAccuracy;
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
import type { STTAdapter } from '../types';
|
|
2
|
+
import { type VoiceSTTAdapterHarnessOptions, type VoiceSTTAdapterHarnessResult } from './stt';
|
|
3
|
+
import type { VoiceTestFixture } from './fixtures';
|
|
4
|
+
export type VoiceExpectedTermAccuracy = {
|
|
5
|
+
allMatched: boolean;
|
|
6
|
+
expectedTerms: string[];
|
|
7
|
+
matchedTerms: string[];
|
|
8
|
+
missingTerms: string[];
|
|
9
|
+
recall: number;
|
|
10
|
+
};
|
|
11
|
+
export type VoiceSTTBenchmarkFixtureResult = {
|
|
12
|
+
accuracy: VoiceSTTAdapterHarnessResult['accuracy'];
|
|
13
|
+
closeCount: number;
|
|
14
|
+
difficulty?: VoiceTestFixture['difficulty'];
|
|
15
|
+
elapsedMs: number;
|
|
16
|
+
endOfTurnCount: number;
|
|
17
|
+
errorCount: number;
|
|
18
|
+
expectedTerms: VoiceExpectedTermAccuracy;
|
|
19
|
+
finalCount: number;
|
|
20
|
+
finalText: string;
|
|
21
|
+
fixtureId: string;
|
|
22
|
+
fragmentationCount: number;
|
|
23
|
+
passes: boolean;
|
|
24
|
+
partialCount: number;
|
|
25
|
+
tags: string[];
|
|
26
|
+
timeToEndOfTurnMs?: number;
|
|
27
|
+
timeToFirstFinalMs?: number;
|
|
28
|
+
timeToFirstPartialMs?: number;
|
|
29
|
+
title: string;
|
|
30
|
+
};
|
|
31
|
+
export type VoiceSTTBenchmarkSummary = {
|
|
32
|
+
adapterId: string;
|
|
33
|
+
averageCharErrorRate: number;
|
|
34
|
+
averageElapsedMs: number;
|
|
35
|
+
averageEndOfTurnCount: number;
|
|
36
|
+
averageFinalCount: number;
|
|
37
|
+
averageTermRecall: number;
|
|
38
|
+
averageTimeToEndOfTurnMs?: number;
|
|
39
|
+
averageTimeToFirstFinalMs?: number;
|
|
40
|
+
averageTimeToFirstPartialMs?: number;
|
|
41
|
+
averageWordErrorRate: number;
|
|
42
|
+
fixtureCount: number;
|
|
43
|
+
fixturesWithErrors: number;
|
|
44
|
+
fixturesWithFragmentation: number;
|
|
45
|
+
passCount: number;
|
|
46
|
+
passRate: number;
|
|
47
|
+
totalErrorCount: number;
|
|
48
|
+
wordAccuracyRate: number;
|
|
49
|
+
};
|
|
50
|
+
export type VoiceSTTBenchmarkReport = {
|
|
51
|
+
adapterId: string;
|
|
52
|
+
fixtures: VoiceSTTBenchmarkFixtureResult[];
|
|
53
|
+
generatedAt: number;
|
|
54
|
+
summary: VoiceSTTBenchmarkSummary;
|
|
55
|
+
};
|
|
56
|
+
export type VoiceSTTBenchmarkComparisonEntry = {
|
|
57
|
+
adapterId: string;
|
|
58
|
+
summary: VoiceSTTBenchmarkSummary;
|
|
59
|
+
};
|
|
60
|
+
export type VoiceSTTBenchmarkComparison = {
|
|
61
|
+
bestByPassRate?: VoiceSTTBenchmarkComparisonEntry;
|
|
62
|
+
bestByTermRecall?: VoiceSTTBenchmarkComparisonEntry;
|
|
63
|
+
bestByWordErrorRate?: VoiceSTTBenchmarkComparisonEntry;
|
|
64
|
+
entries: VoiceSTTBenchmarkComparisonEntry[];
|
|
65
|
+
};
|
|
66
|
+
export type VoiceSTTBenchmarkOptions = VoiceSTTAdapterHarnessOptions & {
|
|
67
|
+
fixtureOptions?: Record<string, Omit<VoiceSTTAdapterHarnessOptions, 'fixtureOptions'>>;
|
|
68
|
+
};
|
|
69
|
+
/**
 * Aggregates per-fixture benchmark results into one summary for an adapter.
 *
 * Averages ignore non-finite / missing values. Error rates and recall are
 * rounded to 4 decimals; elapsed time, event counts and timings to 2 decimals.
 * The optional timing averages are `undefined` when no fixture reported that
 * timing. With an empty `fixtures` list the required averages, `passRate` and
 * `wordAccuracyRate` are 0. `wordAccuracyRate` is `1 - average word error rate`.
 *
 * @param adapterId identifier copied into the summary.
 * @param fixtures per-fixture results to aggregate.
 */
export declare const summarizeSTTBenchmark: (adapterId: string, fixtures: VoiceSTTBenchmarkFixtureResult[]) => VoiceSTTBenchmarkSummary;
|
|
70
|
+
/**
 * Compares several adapter reports by their summaries.
 *
 * Picks the entry with the highest `passRate`, the highest `averageTermRecall`
 * and the lowest `averageWordErrorRate`. Comparisons are strict, so on a tie the
 * earlier report wins. All three "best" fields are `undefined` when `reports`
 * is empty.
 *
 * @param reports benchmark reports, one per adapter.
 * @returns the winners per metric plus one `{ adapterId, summary }` entry per report.
 */
export declare const compareSTTBenchmarks: (reports: VoiceSTTBenchmarkReport[]) => VoiceSTTBenchmarkComparison;
|
|
71
|
+
export declare const runSTTAdapterBenchmark: ({ adapter, adapterId, fixtures, options }: {
|
|
72
|
+
adapter: STTAdapter;
|
|
73
|
+
adapterId: string;
|
|
74
|
+
fixtures: VoiceTestFixture[];
|
|
75
|
+
options?: VoiceSTTBenchmarkOptions;
|
|
76
|
+
}) => Promise<VoiceSTTBenchmarkReport>;
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import type { AudioFormat } from '../types';
|
|
2
|
+
export type VoiceTestFixtureManifestEntry = {
|
|
3
|
+
id: string;
|
|
4
|
+
title: string;
|
|
5
|
+
audioPath: string;
|
|
6
|
+
expectedText: string;
|
|
7
|
+
expectedTerms?: string[];
|
|
8
|
+
chunkDurationMs?: number;
|
|
9
|
+
language?: string;
|
|
10
|
+
difficulty?: 'clean' | 'noisy' | 'challenging';
|
|
11
|
+
tags?: string[];
|
|
12
|
+
tailPaddingMs?: number;
|
|
13
|
+
format?: Partial<AudioFormat>;
|
|
14
|
+
};
|
|
15
|
+
export type VoiceTestFixture = Omit<VoiceTestFixtureManifestEntry, 'audioPath'> & {
|
|
16
|
+
audio: Uint8Array;
|
|
17
|
+
audioPath: string;
|
|
18
|
+
format: AudioFormat;
|
|
19
|
+
};
|
|
20
|
+
/**
 * Locates the directory holding the bundled voice test fixtures.
 *
 * The implementation probes `fixtures` directories two, three and four levels
 * above the module's own directory and returns the first one that contains a
 * `manifest.json`.
 *
 * @returns the path of the fixture directory.
 * @throws Error when none of the candidate directories contains `manifest.json`.
 */
export declare const getVoiceFixtureDirectory: () => Promise<string>;
|
|
21
|
+
/**
 * Loads every fixture listed in a fixture directory's `manifest.json`.
 *
 * Each entry's audio is read from `<directory>/pcm/<entry.audioPath>` into a
 * `Uint8Array`. In the returned fixture, `audioPath` is that resolved path and
 * `format` is the entry's partial format layered over the defaults (1 channel,
 * raw container, `pcm_s16le`, 16000 Hz).
 *
 * @param fixtureDirectory directory to read; when omitted the bundled fixture
 *        directory is located automatically.
 * @returns the loaded fixtures, in manifest order.
 */
export declare const loadVoiceTestFixtures: (fixtureDirectory?: string) => Promise<VoiceTestFixture[]>;
|
|
@@ -0,0 +1,446 @@
|
|
|
1
|
+
// @bun
|
|
2
|
+
var __create = Object.create;
|
|
3
|
+
var __defProp = Object.defineProperty;
|
|
4
|
+
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
5
|
+
var __name = (target, name) => {
|
|
6
|
+
Object.defineProperty(target, "name", {
|
|
7
|
+
value: name,
|
|
8
|
+
enumerable: false,
|
|
9
|
+
configurable: true
|
|
10
|
+
});
|
|
11
|
+
return target;
|
|
12
|
+
};
|
|
13
|
+
var __knownSymbol = (name, symbol) => (symbol = Symbol[name]) ? symbol : Symbol.for("Symbol." + name);
|
|
14
|
+
var __typeError = (msg) => {
|
|
15
|
+
throw TypeError(msg);
|
|
16
|
+
};
|
|
17
|
+
var __defNormalProp = (obj, key, value) => (key in obj) ? __defProp(obj, key, { enumerable: true, configurable: true, writable: true, value }) : obj[key] = value;
|
|
18
|
+
var __accessCheck = (obj, member, msg) => member.has(obj) || __typeError("Cannot " + msg);
|
|
19
|
+
var __privateIn = (member, obj) => Object(obj) !== obj ? __typeError('Cannot use the "in" operator on this value') : member.has(obj);
|
|
20
|
+
var __privateGet = (obj, member, getter) => (__accessCheck(obj, member, "read from private field"), getter ? getter.call(obj) : member.get(obj));
|
|
21
|
+
var __privateSet = (obj, member, value, setter) => (__accessCheck(obj, member, "write to private field"), setter ? setter.call(obj, value) : member.set(obj, value), value);
|
|
22
|
+
var __privateMethod = (obj, member, method) => (__accessCheck(obj, member, "access private method"), method);
|
|
23
|
+
var __decoratorStart = (base) => [, , , __create(base?.[__knownSymbol("metadata")] ?? null)];
|
|
24
|
+
var __decoratorStrings = ["class", "method", "getter", "setter", "accessor", "field", "value", "get", "set"];
|
|
25
|
+
var __expectFn = (fn) => fn !== undefined && typeof fn !== "function" ? __typeError("Function expected") : fn;
|
|
26
|
+
var __decoratorContext = (kind, name, done, metadata, fns) => ({
|
|
27
|
+
kind: __decoratorStrings[kind],
|
|
28
|
+
name,
|
|
29
|
+
metadata,
|
|
30
|
+
addInitializer: (fn) => done._ ? __typeError("Already initialized") : fns.push(__expectFn(fn || null))
|
|
31
|
+
});
|
|
32
|
+
var __decoratorMetadata = (array, target) => __defNormalProp(target, __knownSymbol("metadata"), array[3]);
|
|
33
|
+
var __runInitializers = (array, flags, self, value) => {
|
|
34
|
+
for (var i = 0, fns = array[flags >> 1], n = fns && fns.length;i < n; i++)
|
|
35
|
+
flags & 1 ? fns[i].call(self) : value = fns[i].call(self, value);
|
|
36
|
+
return value;
|
|
37
|
+
};
|
|
38
|
+
var __decorateElement = (array, flags, name, decorators, target, extra) => {
|
|
39
|
+
var fn, it, done, ctx, access, k = flags & 7, s = !!(flags & 8), p = !!(flags & 16);
|
|
40
|
+
var j = k > 3 ? array.length + 1 : k ? s ? 1 : 2 : 0, key = __decoratorStrings[k + 5];
|
|
41
|
+
var initializers = k > 3 && (array[j - 1] = []), extraInitializers = array[j] || (array[j] = []);
|
|
42
|
+
var desc = k && (!p && !s && (target = target.prototype), k < 5 && (k > 3 || !p) && __getOwnPropDesc(k < 4 ? target : {
|
|
43
|
+
get [name]() {
|
|
44
|
+
return __privateGet(this, extra);
|
|
45
|
+
},
|
|
46
|
+
set [name](x) {
|
|
47
|
+
__privateSet(this, extra, x);
|
|
48
|
+
}
|
|
49
|
+
}, name));
|
|
50
|
+
k ? p && k < 4 && __name(extra, (k > 2 ? "set " : k > 1 ? "get " : "") + name) : __name(target, name);
|
|
51
|
+
for (var i = decorators.length - 1;i >= 0; i--) {
|
|
52
|
+
ctx = __decoratorContext(k, name, done = {}, array[3], extraInitializers);
|
|
53
|
+
if (k) {
|
|
54
|
+
ctx.static = s, ctx.private = p, access = ctx.access = { has: p ? (x) => __privateIn(target, x) : (x) => (name in x) };
|
|
55
|
+
if (k ^ 3)
|
|
56
|
+
access.get = p ? (x) => (k ^ 1 ? __privateGet : __privateMethod)(x, target, k ^ 4 ? extra : desc.get) : (x) => x[name];
|
|
57
|
+
if (k > 2)
|
|
58
|
+
access.set = p ? (x, y) => __privateSet(x, target, y, k ^ 4 ? extra : desc.set) : (x, y) => x[name] = y;
|
|
59
|
+
}
|
|
60
|
+
it = (0, decorators[i])(k ? k < 4 ? p ? extra : desc[key] : k > 4 ? undefined : { get: desc.get, set: desc.set } : target, ctx);
|
|
61
|
+
done._ = 1;
|
|
62
|
+
if (k ^ 4 || it === undefined)
|
|
63
|
+
__expectFn(it) && (k > 4 ? initializers.unshift(it) : k ? p ? extra = it : desc[key] = it : target = it);
|
|
64
|
+
else if (typeof it !== "object" || it === null)
|
|
65
|
+
__typeError("Object expected");
|
|
66
|
+
else
|
|
67
|
+
__expectFn(fn = it.get) && (desc.get = fn), __expectFn(fn = it.set) && (desc.set = fn), __expectFn(fn = it.init) && initializers.unshift(fn);
|
|
68
|
+
}
|
|
69
|
+
return k || __decoratorMetadata(array, target), desc && __defProp(target, name, desc), p ? k ^ 4 ? extra : desc : target;
|
|
70
|
+
};
|
|
71
|
+
|
|
72
|
+
// src/turnDetection.ts
|
|
73
|
+
var DEFAULT_SILENCE_MS = 700;
|
|
74
|
+
var DEFAULT_SPEECH_THRESHOLD = 0.015;
|
|
75
|
+
/**
 * Returns a byte view over `audio` without copying.
 *
 * @param audio an ArrayBuffer, or any view exposing `buffer`, `byteOffset` and `byteLength`.
 */
var toUint8Array = (audio) => {
  if (audio instanceof ArrayBuffer) {
    return new Uint8Array(audio);
  }
  return new Uint8Array(audio.buffer, audio.byteOffset, audio.byteLength);
};

/**
 * Computes the RMS level of 16-bit little-endian PCM audio, normalized to [0, 1].
 *
 * Samples are read through a DataView rather than an Int16Array: an Int16Array
 * requires an even byte offset (and throws a RangeError otherwise) and decodes in
 * platform byte order, whereas a DataView accepts any offset and lets the byte
 * order be stated explicitly.
 *
 * @param audio an ArrayBuffer or view holding the PCM bytes; a trailing odd byte is ignored.
 * @returns the root-mean-square of the samples divided by 32768, or 0 when
 *          there is not at least one full sample.
 */
var measureAudioLevel = (audio) => {
  const bytes = toUint8Array(audio);
  const sampleCount = Math.floor(bytes.byteLength / 2);
  if (sampleCount === 0) {
    return 0;
  }
  const view = new DataView(bytes.buffer, bytes.byteOffset, sampleCount * 2);
  let sumSquares = 0;
  for (let index = 0; index < sampleCount; index += 1) {
    const normalized = view.getInt16(index * 2, true) / 32768;
    sumSquares += normalized * normalized;
  }
  return Math.sqrt(sumSquares / sampleCount);
};
|
|
97
|
+
// Strips leading/trailing whitespace and collapses every inner whitespace run to one space.
var normalizeText = (value) => {
  const trimmed = value.trim();
  return trimmed.replace(/\s+/g, " ");
};
|
|
98
|
+
var mergeTranscriptTexts = (transcripts) => {
|
|
99
|
+
const merged = [];
|
|
100
|
+
for (const transcript of transcripts) {
|
|
101
|
+
const nextText = normalizeText(transcript.text);
|
|
102
|
+
if (!nextText) {
|
|
103
|
+
continue;
|
|
104
|
+
}
|
|
105
|
+
const previous = merged.at(-1);
|
|
106
|
+
if (!previous) {
|
|
107
|
+
merged.push(nextText);
|
|
108
|
+
continue;
|
|
109
|
+
}
|
|
110
|
+
if (nextText === previous || previous.includes(nextText)) {
|
|
111
|
+
continue;
|
|
112
|
+
}
|
|
113
|
+
if (nextText.includes(previous)) {
|
|
114
|
+
merged[merged.length - 1] = nextText;
|
|
115
|
+
continue;
|
|
116
|
+
}
|
|
117
|
+
merged.push(nextText);
|
|
118
|
+
}
|
|
119
|
+
return merged.join(" ").trim();
|
|
120
|
+
};
|
|
121
|
+
var buildTurnText = (transcripts, partialText) => {
|
|
122
|
+
const finalText = mergeTranscriptTexts(transcripts);
|
|
123
|
+
if (finalText) {
|
|
124
|
+
return finalText;
|
|
125
|
+
}
|
|
126
|
+
return normalizeText(partialText);
|
|
127
|
+
};
|
|
128
|
+
|
|
129
|
+
// src/testing/accuracy.ts
|
|
130
|
+
// Canonical form used for accuracy scoring: lowercase, anything that is not a
// letter, number, whitespace or apostrophe becomes a space, whitespace collapsed.
var normalizeAccuracyText = (value) => {
  const lowered = value.toLowerCase();
  const withoutPunctuation = lowered.replace(/[^\p{L}\p{N}\s']/gu, " ");
  const collapsed = withoutPunctuation.replace(/\s+/g, " ");
  return collapsed.trim();
};
|
|
131
|
+
/**
 * Levenshtein edit distance between two indexable sequences (arrays of words,
 * arrays of characters, or strings), compared element-wise with `===`.
 *
 * Uses two rolling rows of the DP table. After each row the buffers are swapped
 * instead of copied element by element, which removes a redundant pass per row.
 *
 * @param left first sequence.
 * @param right second sequence.
 * @returns the minimum number of insertions, deletions and substitutions
 *          turning `left` into `right`.
 */
var levenshteinDistance = (left, right) => {
  if (left.length === 0) {
    return right.length;
  }
  if (right.length === 0) {
    return left.length;
  }
  const width = right.length + 1;
  // Row 0: distance from the empty prefix of `left` to each prefix of `right`.
  let previous = Array.from({ length: width }, (_, column) => column);
  let current = new Array(width).fill(0);
  for (let row = 1; row <= left.length; row += 1) {
    current[0] = row;
    for (let column = 1; column < width; column += 1) {
      const substitutionCost = left[row - 1] === right[column - 1] ? 0 : 1;
      current[column] = Math.min(
        current[column - 1] + 1,
        previous[column] + 1,
        previous[column - 1] + substitutionCost
      );
    }
    // The finished row becomes `previous`; the old one is reused as scratch space.
    [previous, current] = [current, previous];
  }
  return previous[right.length];
};
|
|
155
|
+
// Keeps only the final transcripts and merges them into one turn text; the empty
// string is passed as the partial-text fallback.
var mergeFinalTranscriptText = (transcripts) => {
  const finalTranscripts = transcripts.filter(({ isFinal }) => isFinal);
  return buildTurnText(finalTranscripts, "");
};
|
|
156
|
+
/**
 * Scores a recognized transcript against the expected text.
 *
 * Both texts are normalized with normalizeAccuracyText, then word-level and
 * character-level Levenshtein distances are computed. Each error rate is the
 * distance divided by the size of the normalized expected text, or 0 when the
 * expected text is empty.
 *
 * Characters are counted as Unicode code points for both the distance and the
 * denominator, so characters outside the Basic Multilingual Plane are not
 * counted twice in the character error rate.
 *
 * @param actualText text produced by the recognizer.
 * @param expectedText reference text.
 * @param threshold maximum word error rate that still passes (default 0.35).
 * @returns distances, error rates, the normalized texts, and `passesThreshold`.
 */
var scoreTranscriptAccuracy = (actualText, expectedText, threshold = 0.35) => {
  const normalizedActual = normalizeAccuracyText(actualText);
  const normalizedExpected = normalizeAccuracyText(expectedText);
  const actualWords = normalizedActual ? normalizedActual.split(" ") : [];
  const expectedWords = normalizedExpected ? normalizedExpected.split(" ") : [];
  const actualChars = Array.from(normalizedActual);
  const expectedChars = Array.from(normalizedExpected);
  const wordDistance = levenshteinDistance(actualWords, expectedWords);
  const charDistance = levenshteinDistance(actualChars, expectedChars);
  const wordErrorRate = expectedWords.length > 0 ? wordDistance / expectedWords.length : 0;
  // Divide by the code-point count, the same unit the distance was measured in.
  const charErrorRate = expectedChars.length > 0 ? charDistance / expectedChars.length : 0;
  return {
    actualText: normalizedActual,
    charDistance,
    charErrorRate,
    expectedText: normalizedExpected,
    passesThreshold: wordErrorRate <= threshold,
    threshold,
    wordDistance,
    wordErrorRate
  };
};
|
|
176
|
+
// src/testing/stt.ts
|
|
177
|
+
/**
 * Splits an audio buffer into consecutive chunks of at most `bytesPerChunk` bytes.
 * Each chunk is a copy (`slice`); the last chunk may be shorter.
 *
 * @param audio byte buffer to split.
 * @param bytesPerChunk chunk size in bytes; must be greater than zero.
 * @returns the chunks in order (empty array for empty audio).
 * @throws RangeError when `bytesPerChunk` is zero, negative or NaN. Zero and
 *         negative sizes would otherwise never advance the offset and loop forever.
 */
var chunkAudio = (audio, bytesPerChunk) => {
  if (!(bytesPerChunk > 0)) {
    throw new RangeError(`bytesPerChunk must be greater than zero, received ${bytesPerChunk}`);
  }
  const chunks = [];
  for (let offset = 0; offset < audio.byteLength; offset += bytesPerChunk) {
    chunks.push(audio.slice(offset, offset + bytesPerChunk));
  }
  return chunks;
};
|
|
184
|
+
// A freshly allocated Uint8Array is zero-filled, i.e. digital silence for signed PCM.
var createSilence = (byteLength) => {
  const silence = new Uint8Array(byteLength);
  return silence;
};
|
|
185
|
+
var waitForIdle = async (readLastActivityAt, idleTimeoutMs, settleMs) => {
|
|
186
|
+
const startedAt = Date.now();
|
|
187
|
+
while (Date.now() - startedAt < idleTimeoutMs) {
|
|
188
|
+
if (Date.now() - readLastActivityAt() >= settleMs) {
|
|
189
|
+
return;
|
|
190
|
+
}
|
|
191
|
+
await Bun.sleep(Math.min(50, settleMs));
|
|
192
|
+
}
|
|
193
|
+
};
|
|
194
|
+
/**
 * Streams one audio fixture through an STT adapter and records everything the
 * session emits.
 *
 * Flow: open a session (`sessionId` is `fixture-<id>`), subscribe to `partial`,
 * `final`, `endOfTurn`, `error` and `close`, send the audio in paced chunks,
 * send trailing silence, wait until the session has been quiet for `settleMs`
 * (or `idleTimeoutMs` has elapsed), then close the session.
 *
 * Option defaults: `chunkDurationMs` option → fixture → 100; `tailPaddingMs`
 * option → fixture → 1000; `idleTimeoutMs` 8000; `settleMs` 500. The delay
 * between chunks is `waitForRealtimeMs` when positive, otherwise `chunkDurationMs`.
 *
 * Chunk and tail sizes assume 2 bytes per sample and are rounded down to a whole
 * number of frames (2 bytes × channels) so a sample is never split across chunks.
 *
 * @param adapter STT adapter under test.
 * @param fixture audio fixture with `audio`, `format`, `id` and `expectedText`.
 * @param options harness options (see defaults above); `transcriptThreshold` is
 *        forwarded to scoreTranscriptAccuracy.
 * @returns the captured events, the merged final text, its accuracy score and
 *          the start timestamp (`Date.now()` before the session was opened).
 */
var runSTTAdapterFixture = async (adapter, fixture, options = {}) => {
  const startedAt = Date.now();
  const partialEvents = [];
  const finalEvents = [];
  const endOfTurnEvents = [];
  const errorEvents = [];
  const closeEvents = [];
  const chunkDurationMs = options.chunkDurationMs ?? fixture.chunkDurationMs ?? 100;
  const tailPaddingMs = options.tailPaddingMs ?? fixture.tailPaddingMs ?? 1000;
  const idleTimeoutMs = options.idleTimeoutMs ?? 8000;
  const settleMs = options.settleMs ?? 500;
  const waitForRealtimeMs = options.waitForRealtimeMs ?? 0;
  let lastActivityAt = Date.now();
  const markActive = () => {
    lastActivityAt = Date.now();
  };
  // Every event both gets recorded and resets the idle timer.
  const record = (bucket) => (event) => {
    bucket.push(event);
    markActive();
  };
  const session = await adapter.open({
    format: fixture.format,
    sessionId: `fixture-${fixture.id}`
  });
  const unsubscribers = [
    session.on("partial", record(partialEvents)),
    session.on("final", record(finalEvents)),
    session.on("endOfTurn", record(endOfTurnEvents)),
    session.on("error", record(errorEvents)),
    session.on("close", record(closeEvents))
  ];
  try {
    const bytesPerMillisecond = fixture.format.sampleRateHz * fixture.format.channels * 2 / 1000;
    // One frame = one 16-bit sample per channel; fall back to mono for a non-positive channel count.
    const bytesPerFrame = (fixture.format.channels > 0 ? fixture.format.channels : 1) * 2;
    const alignToFrame = (byteCount) => Math.max(bytesPerFrame, Math.floor(byteCount / bytesPerFrame) * bytesPerFrame);
    const bytesPerChunk = alignToFrame(bytesPerMillisecond * chunkDurationMs);
    const realtimeDelayMs = waitForRealtimeMs > 0 ? waitForRealtimeMs : chunkDurationMs;
    const sendPaced = async (chunks) => {
      for (const chunk of chunks) {
        await session.send(chunk);
        await Bun.sleep(realtimeDelayMs);
      }
    };
    await sendPaced(chunkAudio(fixture.audio, bytesPerChunk));
    if (tailPaddingMs > 0) {
      const tailBytes = alignToFrame(bytesPerMillisecond * tailPaddingMs);
      await sendPaced(chunkAudio(createSilence(tailBytes), bytesPerChunk));
    }
    await waitForIdle(() => lastActivityAt, idleTimeoutMs, settleMs);
  } finally {
    try {
      // Listeners stay attached during close so the resulting `close` event is captured.
      await session.close("fixture-complete");
    } finally {
      // Always detach, even when close() rejects.
      for (const unsubscribe of unsubscribers) {
        unsubscribe();
      }
    }
  }
  const finalText = mergeFinalTranscriptText(finalEvents.map((event) => event.transcript));
  return {
    accuracy: scoreTranscriptAccuracy(finalText, fixture.expectedText, options.transcriptThreshold),
    closeEvents,
    endOfTurnEvents,
    errorEvents,
    finalEvents,
    finalText,
    partialEvents,
    startedAt
  };
};
|
|
271
|
+
|
|
272
|
+
// src/testing/benchmark.ts
|
|
273
|
+
// Normal form used when matching expected terms: lowercase, punctuation (except
// apostrophes) replaced by spaces, whitespace collapsed and trimmed.
var normalizeBenchmarkText = (value) => {
  let text = value.toLowerCase();
  text = text.replace(/[^\p{L}\p{N}\s']/gu, " ");
  text = text.replace(/\s+/g, " ");
  return text.trim();
};
|
|
274
|
+
/**
 * Measures how many of a fixture's expected terms appear in the recognized text.
 *
 * The text and every term are normalized with normalizeBenchmarkText; a term
 * matches when the normalized text contains it as a substring. Terms that
 * normalize to an empty string cannot be matched or missed, so they are left
 * out of the recall denominator as well. This keeps `recall` consistent with
 * `allMatched` (recall is 1 exactly when nothing is missing).
 *
 * @param actualText recognized text.
 * @param expectedTerms terms that should appear; may be undefined.
 * @returns `expectedTerms` (all normalized terms, including empty ones),
 *          `matchedTerms`, `missingTerms`, `allMatched`, and `recall`
 *          (1 when there is no scorable term).
 */
var scoreExpectedTerms = (actualText, expectedTerms) => {
  const normalizedActual = normalizeBenchmarkText(actualText);
  const normalizedExpectedTerms = (expectedTerms ?? []).map((entry) => normalizeBenchmarkText(entry));
  const scorableTerms = normalizedExpectedTerms.filter((term) => term.length > 0);
  const matchedTerms = scorableTerms.filter((term) => normalizedActual.includes(term));
  const missingTerms = scorableTerms.filter((term) => !normalizedActual.includes(term));
  const denominator = scorableTerms.length;
  const recall = denominator > 0 ? matchedTerms.length / denominator : 1;
  return {
    allMatched: missingTerms.length === 0,
    expectedTerms: normalizedExpectedTerms,
    matchedTerms,
    missingTerms,
    recall
  };
};
|
|
289
|
+
var average = (values) => {
|
|
290
|
+
const filtered = values.filter((value) => typeof value === "number" && Number.isFinite(value));
|
|
291
|
+
if (filtered.length === 0) {
|
|
292
|
+
return;
|
|
293
|
+
}
|
|
294
|
+
return filtered.reduce((sum, value) => sum + value, 0) / filtered.length;
|
|
295
|
+
};
|
|
296
|
+
var roundMetric = (value, digits = 4) => {
|
|
297
|
+
if (typeof value !== "number" || !Number.isFinite(value)) {
|
|
298
|
+
return;
|
|
299
|
+
}
|
|
300
|
+
const factor = 10 ** digits;
|
|
301
|
+
return Math.round(value * factor) / factor;
|
|
302
|
+
};
|
|
303
|
+
var toFixtureBenchmarkResult = (fixture, result, elapsedMs) => {
|
|
304
|
+
const timeToFirstPartialMs = result.partialEvents[0] ? result.partialEvents[0].receivedAt - result.startedAt : undefined;
|
|
305
|
+
const timeToFirstFinalMs = result.finalEvents[0] ? result.finalEvents[0].receivedAt - result.startedAt : undefined;
|
|
306
|
+
const timeToEndOfTurnMs = result.endOfTurnEvents[0] ? result.endOfTurnEvents[0].receivedAt - result.startedAt : undefined;
|
|
307
|
+
const expectedTerms = scoreExpectedTerms(result.finalText, fixture.expectedTerms);
|
|
308
|
+
return {
|
|
309
|
+
accuracy: result.accuracy,
|
|
310
|
+
closeCount: result.closeEvents.length,
|
|
311
|
+
difficulty: fixture.difficulty,
|
|
312
|
+
elapsedMs,
|
|
313
|
+
endOfTurnCount: result.endOfTurnEvents.length,
|
|
314
|
+
errorCount: result.errorEvents.length,
|
|
315
|
+
expectedTerms,
|
|
316
|
+
finalCount: result.finalEvents.length,
|
|
317
|
+
finalText: result.finalText,
|
|
318
|
+
fixtureId: fixture.id,
|
|
319
|
+
fragmentationCount: Math.max(0, result.finalEvents.length - 1),
|
|
320
|
+
passes: result.errorEvents.length === 0 && result.finalText.trim().length > 0 && result.accuracy.passesThreshold,
|
|
321
|
+
partialCount: result.partialEvents.length,
|
|
322
|
+
tags: fixture.tags ?? [],
|
|
323
|
+
timeToEndOfTurnMs,
|
|
324
|
+
timeToFirstFinalMs,
|
|
325
|
+
timeToFirstPartialMs,
|
|
326
|
+
title: fixture.title
|
|
327
|
+
};
|
|
328
|
+
};
|
|
329
|
+
var summarizeSTTBenchmark = (adapterId, fixtures) => {
|
|
330
|
+
const fixtureCount = fixtures.length;
|
|
331
|
+
const passCount = fixtures.filter((fixture) => fixture.passes).length;
|
|
332
|
+
return {
|
|
333
|
+
adapterId,
|
|
334
|
+
averageCharErrorRate: roundMetric(average(fixtures.map((fixture) => fixture.accuracy.charErrorRate))) ?? 0,
|
|
335
|
+
averageElapsedMs: roundMetric(average(fixtures.map((fixture) => fixture.elapsedMs)), 2) ?? 0,
|
|
336
|
+
averageEndOfTurnCount: roundMetric(average(fixtures.map((fixture) => fixture.endOfTurnCount)), 2) ?? 0,
|
|
337
|
+
averageFinalCount: roundMetric(average(fixtures.map((fixture) => fixture.finalCount)), 2) ?? 0,
|
|
338
|
+
averageTermRecall: roundMetric(average(fixtures.map((fixture) => fixture.expectedTerms.recall))) ?? 0,
|
|
339
|
+
averageTimeToEndOfTurnMs: roundMetric(average(fixtures.map((fixture) => fixture.timeToEndOfTurnMs)), 2),
|
|
340
|
+
averageTimeToFirstFinalMs: roundMetric(average(fixtures.map((fixture) => fixture.timeToFirstFinalMs)), 2),
|
|
341
|
+
averageTimeToFirstPartialMs: roundMetric(average(fixtures.map((fixture) => fixture.timeToFirstPartialMs)), 2),
|
|
342
|
+
averageWordErrorRate: roundMetric(average(fixtures.map((fixture) => fixture.accuracy.wordErrorRate))) ?? 0,
|
|
343
|
+
fixtureCount,
|
|
344
|
+
fixturesWithErrors: fixtures.filter((fixture) => fixture.errorCount > 0).length,
|
|
345
|
+
fixturesWithFragmentation: fixtures.filter((fixture) => fixture.fragmentationCount > 0).length,
|
|
346
|
+
passCount,
|
|
347
|
+
passRate: fixtureCount > 0 ? roundMetric(passCount / fixtureCount) ?? 0 : 0,
|
|
348
|
+
totalErrorCount: fixtures.reduce((sum, fixture) => sum + fixture.errorCount, 0),
|
|
349
|
+
wordAccuracyRate: fixtureCount > 0 ? roundMetric(1 - (average(fixtures.map((fixture) => fixture.accuracy.wordErrorRate)) ?? 0)) ?? 0 : 0
|
|
350
|
+
};
|
|
351
|
+
};
|
|
352
|
+
var compareSTTBenchmarks = (reports) => {
|
|
353
|
+
const entries = reports.map((report) => ({
|
|
354
|
+
adapterId: report.adapterId,
|
|
355
|
+
summary: report.summary
|
|
356
|
+
}));
|
|
357
|
+
const bestByMetric = (selectMetric, direction) => entries.reduce((best, entry) => {
|
|
358
|
+
if (!best) {
|
|
359
|
+
return entry;
|
|
360
|
+
}
|
|
361
|
+
const next = selectMetric(entry);
|
|
362
|
+
const current = selectMetric(best);
|
|
363
|
+
if (direction === "max" ? next > current : next < current) {
|
|
364
|
+
return entry;
|
|
365
|
+
}
|
|
366
|
+
return best;
|
|
367
|
+
}, undefined);
|
|
368
|
+
return {
|
|
369
|
+
bestByPassRate: bestByMetric((entry) => entry.summary.passRate, "max"),
|
|
370
|
+
bestByTermRecall: bestByMetric((entry) => entry.summary.averageTermRecall, "max"),
|
|
371
|
+
bestByWordErrorRate: bestByMetric((entry) => entry.summary.averageWordErrorRate, "min"),
|
|
372
|
+
entries
|
|
373
|
+
};
|
|
374
|
+
};
|
|
375
|
+
var runSTTAdapterBenchmark = async ({
|
|
376
|
+
adapter,
|
|
377
|
+
adapterId,
|
|
378
|
+
fixtures,
|
|
379
|
+
options = {}
|
|
380
|
+
}) => {
|
|
381
|
+
const results = [];
|
|
382
|
+
for (const fixture of fixtures) {
|
|
383
|
+
const startedAt = Date.now();
|
|
384
|
+
const fixtureResult = await runSTTAdapterFixture(adapter, fixture, {
|
|
385
|
+
...options,
|
|
386
|
+
...options.fixtureOptions?.[fixture.id] ?? {}
|
|
387
|
+
});
|
|
388
|
+
results.push(toFixtureBenchmarkResult(fixture, fixtureResult, Date.now() - startedAt));
|
|
389
|
+
}
|
|
390
|
+
return {
|
|
391
|
+
adapterId,
|
|
392
|
+
fixtures: results,
|
|
393
|
+
generatedAt: Date.now(),
|
|
394
|
+
summary: summarizeSTTBenchmark(adapterId, results)
|
|
395
|
+
};
|
|
396
|
+
};
|
|
397
|
+
// src/testing/fixtures.ts
|
|
398
|
+
import { resolve } from "path";
|
|
399
|
+
var DEFAULT_AUDIO_FORMAT = {
|
|
400
|
+
channels: 1,
|
|
401
|
+
container: "raw",
|
|
402
|
+
encoding: "pcm_s16le",
|
|
403
|
+
sampleRateHz: 16000
|
|
404
|
+
};
|
|
405
|
+
var FIXTURE_DIR_CANDIDATES = [
|
|
406
|
+
resolve(import.meta.dir, "..", "..", "fixtures"),
|
|
407
|
+
resolve(import.meta.dir, "..", "..", "..", "fixtures"),
|
|
408
|
+
resolve(import.meta.dir, "..", "..", "..", "..", "fixtures")
|
|
409
|
+
];
|
|
410
|
+
var resolveFixtureDirectory = async () => {
|
|
411
|
+
for (const candidate of FIXTURE_DIR_CANDIDATES) {
|
|
412
|
+
if (await Bun.file(resolve(candidate, "manifest.json")).exists()) {
|
|
413
|
+
return candidate;
|
|
414
|
+
}
|
|
415
|
+
}
|
|
416
|
+
throw new Error("Unable to locate the bundled voice test fixtures. Expected fixtures/manifest.json next to the package root.");
|
|
417
|
+
};
|
|
418
|
+
// Public entry point for locating the bundled fixture directory; defers entirely
// to resolveFixtureDirectory.
var getVoiceFixtureDirectory = async () => {
  return resolveFixtureDirectory();
};
|
|
419
|
+
var loadVoiceTestFixtures = async (fixtureDirectory) => {
|
|
420
|
+
const resolvedFixtureDirectory = fixtureDirectory ?? await resolveFixtureDirectory();
|
|
421
|
+
const manifestFile = Bun.file(resolve(resolvedFixtureDirectory, "manifest.json"));
|
|
422
|
+
const manifest = await manifestFile.json();
|
|
423
|
+
return await Promise.all(manifest.map(async (entry) => {
|
|
424
|
+
const audioPath = resolve(resolvedFixtureDirectory, "pcm", entry.audioPath);
|
|
425
|
+
const audio = new Uint8Array(await Bun.file(audioPath).arrayBuffer());
|
|
426
|
+
return {
|
|
427
|
+
...entry,
|
|
428
|
+
audio,
|
|
429
|
+
audioPath,
|
|
430
|
+
format: {
|
|
431
|
+
...DEFAULT_AUDIO_FORMAT,
|
|
432
|
+
...entry.format
|
|
433
|
+
}
|
|
434
|
+
};
|
|
435
|
+
}));
|
|
436
|
+
};
|
|
437
|
+
export {
|
|
438
|
+
summarizeSTTBenchmark,
|
|
439
|
+
scoreTranscriptAccuracy,
|
|
440
|
+
runSTTAdapterFixture,
|
|
441
|
+
runSTTAdapterBenchmark,
|
|
442
|
+
mergeFinalTranscriptText,
|
|
443
|
+
loadVoiceTestFixtures,
|
|
444
|
+
getVoiceFixtureDirectory,
|
|
445
|
+
compareSTTBenchmarks
|
|
446
|
+
};
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import { type VoiceTranscriptAccuracy } from './accuracy';
|
|
2
|
+
import type { STTAdapter, VoiceCloseEvent, VoiceEndOfTurnEvent, VoiceErrorEvent, VoiceFinalEvent, VoicePartialEvent } from '../types';
|
|
3
|
+
import type { VoiceTestFixture } from './fixtures';
|
|
4
|
+
export type VoiceSTTAdapterHarnessOptions = {
|
|
5
|
+
chunkDurationMs?: number;
|
|
6
|
+
idleTimeoutMs?: number;
|
|
7
|
+
settleMs?: number;
|
|
8
|
+
tailPaddingMs?: number;
|
|
9
|
+
transcriptThreshold?: number;
|
|
10
|
+
waitForRealtimeMs?: number;
|
|
11
|
+
};
|
|
12
|
+
export type VoiceSTTAdapterHarnessResult = {
|
|
13
|
+
accuracy: VoiceTranscriptAccuracy;
|
|
14
|
+
closeEvents: VoiceCloseEvent[];
|
|
15
|
+
endOfTurnEvents: VoiceEndOfTurnEvent[];
|
|
16
|
+
errorEvents: VoiceErrorEvent[];
|
|
17
|
+
finalEvents: VoiceFinalEvent[];
|
|
18
|
+
finalText: string;
|
|
19
|
+
partialEvents: VoicePartialEvent[];
|
|
20
|
+
startedAt: number;
|
|
21
|
+
};
|
|
22
|
+
/**
 * Streams one audio fixture through an STT adapter and records what it emits.
 *
 * The implementation opens a session with the fixture's format, subscribes to
 * `partial`, `final`, `endOfTurn`, `error` and `close`, sends the audio in
 * chunks with a pause after each chunk, sends trailing silence, waits until no
 * event has arrived for `settleMs` (bounded by `idleTimeoutMs`), and finally
 * closes the session with the reason `"fixture-complete"`.
 *
 * Defaults applied by the implementation: `chunkDurationMs` 100 and
 * `tailPaddingMs` 1000 (each overridable by the fixture, then by `options`),
 * `idleTimeoutMs` 8000, `settleMs` 500. Pacing uses `Bun.sleep`.
 *
 * @param adapter STT adapter under test.
 * @param fixture loaded audio fixture.
 * @param options pacing, timeout and scoring overrides.
 * @returns captured events, the merged final text, its accuracy against
 *          `fixture.expectedText`, and the run's start timestamp.
 */
export declare const runSTTAdapterFixture: (adapter: STTAdapter, fixture: VoiceTestFixture, options?: VoiceSTTAdapterHarnessOptions) => Promise<VoiceSTTAdapterHarnessResult>;
|