@mcut/transcription-local 0.1.0-alpha.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +187 -0
- package/README.md +11 -0
- package/dist/index.d.ts +174 -0
- package/dist/index.js +357 -0
- package/dist/whisper-worker.js +29053 -0
- package/package.json +58 -0
package/dist/index.js
ADDED
|
@@ -0,0 +1,357 @@
|
|
|
1
|
+
//#region src/wav.ts
|
|
2
|
+
/**
|
|
3
|
+
* Minimal WAV reader → 16kHz mono Float32Array (Whisper's input format).
|
|
4
|
+
* The editor extracts clip audio as 16kHz WAV already (`extractAudioToWav`),
|
|
5
|
+
* so this covers the hot path without any AudioContext — it runs in workers
|
|
6
|
+
* and tests alike. Non-WAV input falls back to decodeAudioData upstream.
|
|
7
|
+
*/
|
|
8
|
+
/** Sample rate, in Hz, that decoded audio is resampled to before transcription. */
const WHISPER_SAMPLE_RATE = 16_000;
|
|
9
|
+
/**
 * Parse a PCM/float WAV file.
 *
 * Walks the RIFF chunk list that follows the 12-byte header, remembers the
 * most recent `fmt ` chunk and hands the first `data` chunk found after it
 * to `decodeData`.
 *
 * @param {ArrayBuffer} buffer Raw file bytes.
 * @returns {{ samples: Float32Array, sampleRate: number } | null} The result
 *   of `decodeData`, or null when the buffer is not a RIFF/WAVE file, its
 *   `fmt ` chunk is truncated, or no `data` chunk follows a `fmt ` chunk.
 */
function parseWav(buffer) {
  // Chunk ids are four ASCII characters; they are compared as big-endian words.
  const RIFF_ID = 0x52494646; // "RIFF"
  const WAVE_ID = 0x57415645; // "WAVE"
  const FMT_ID = 0x666d7420; // "fmt "
  const DATA_ID = 0x64617461; // "data"
  // The fields read below occupy the first 16 bytes of the fmt chunk body.
  const FMT_FIELD_BYTES = 16;

  const view = new DataView(buffer);
  const total = buffer.byteLength;
  if (total < 44) return null;
  if (view.getUint32(0) !== RIFF_ID) return null;
  if (view.getUint32(8) !== WAVE_ID) return null;

  let format = null;
  let offset = 12;
  while (offset + 8 <= total) {
    const id = view.getUint32(offset);
    const size = view.getUint32(offset + 4, true);
    const body = offset + 8;
    if (id === FMT_ID) {
      // A short or truncated fmt chunk means "not a usable WAV": return null
      // instead of letting DataView throw a RangeError past the buffer end.
      if (size < FMT_FIELD_BYTES || body + FMT_FIELD_BYTES > total) return null;
      format = {
        audioFormat: view.getUint16(body, true),
        channels: view.getUint16(body + 2, true),
        sampleRate: view.getUint32(body + 4, true),
        bitsPerSample: view.getUint16(body + 14, true),
      };
    } else if (id === DATA_ID && format) {
      // Clamp to the buffer: the declared size may exceed the bytes present.
      return decodeData(view, body, Math.min(total, body + size), format);
    }
    // Chunk bodies are padded to an even number of bytes.
    offset = body + size + (size % 2);
  }
  return null;
}
|
|
32
|
+
/**
 * Decode interleaved little-endian sample frames into mono float samples.
 *
 * Supported encodings: format 1 (integer PCM) at 8, 16, 24 or 32 bits and
 * format 3 (float) at 32 bits. Channels are averaged into one.
 *
 * @param {DataView} view View over the bytes that contain the sample data.
 * @param {number} start Byte offset of the first frame.
 * @param {number} end Byte offset just past the last byte to read.
 * @param {{ audioFormat: number, channels: number, sampleRate: number, bitsPerSample: number }} format
 *   Fields taken from the `fmt ` chunk.
 * @returns {{ samples: Float32Array, sampleRate: number } | null} Decoded
 *   audio, or null when the channel count, sample rate or encoding is not
 *   supported.
 */
function decodeData(view, start, end, format) {
  const { audioFormat, channels, sampleRate, bitsPerSample } = format;
  if (channels < 1 || sampleRate <= 0) return null;

  // Choose the sample reader once instead of re-checking the format per sample.
  let read = null;
  if (audioFormat === 3 && bitsPerSample === 32) {
    read = (at) => view.getFloat32(at, true);
  } else if (audioFormat === 1) {
    switch (bitsPerSample) {
      case 8:
        // 8-bit PCM is unsigned with its midpoint at 128.
        read = (at) => (view.getUint8(at) - 128) / 128;
        break;
      case 16:
        read = (at) => view.getInt16(at, true) / 32768;
        break;
      case 24:
        // No 24-bit accessor: assemble it, sign-extending via the top byte.
        read = (at) => (view.getUint8(at) | view.getUint8(at + 1) << 8 | view.getInt8(at + 2) << 16) / 8388608;
        break;
      case 32:
        read = (at) => view.getInt32(at, true) / 2147483648;
        break;
      default:
        break;
    }
  }
  // Unsupported encodings (including bitsPerSample of 0, which would otherwise
  // produce an infinite frame count) are rejected before any allocation.
  if (read === null) return null;

  const bytesPerSample = bitsPerSample / 8;
  const frameBytes = bytesPerSample * channels;
  const frames = Math.max(0, Math.floor((end - start) / frameBytes));
  const samples = new Float32Array(frames);
  for (let frame = 0; frame < frames; frame++) {
    const at = start + frame * frameBytes;
    let sum = 0;
    for (let c = 0; c < channels; c++) sum += read(at + c * bytesPerSample);
    samples[frame] = sum / channels;
  }
  return { samples, sampleRate };
}
|
|
59
|
+
/**
 * Linear-interpolation resample (fine for speech models).
 *
 * @param {{ samples: Float32Array, sampleRate: number }} audio Mono samples
 *   and the rate they were recorded at.
 * @param {number} targetRate Desired sample rate.
 * @returns {Float32Array} `audio.samples` itself when the rates already
 *   match, otherwise a new array at `targetRate`. Empty input yields an
 *   empty array.
 */
function resampleTo(audio, targetRate) {
  const { samples, sampleRate } = audio;
  if (sampleRate === targetRate) return samples;
  // Without this guard the loop below would emit a single NaN sample,
  // because it indexes samples[-1] when there is nothing to interpolate.
  if (samples.length === 0) return new Float32Array(0);

  const ratio = sampleRate / targetRate;
  const lastIndex = samples.length - 1;
  const length = Math.max(1, Math.round(samples.length / ratio));
  const out = new Float32Array(length);
  for (let i = 0; i < length; i++) {
    const position = i * ratio;
    const index = Math.floor(position);
    const fraction = position - index;
    // Clamp both taps so the tail repeats the final sample rather than reading past it.
    const a = samples[Math.min(index, lastIndex)];
    const b = samples[Math.min(index + 1, lastIndex)];
    out[i] = a + (b - a) * fraction;
  }
  return out;
}
|
|
74
|
+
//#endregion
|
|
75
|
+
//#region src/chunking.ts
|
|
76
|
+
const CHUNK_WINDOW_S = 30;
|
|
77
|
+
const CHUNK_OVERLAP_S = 5;
|
|
78
|
+
/**
 * Split a duration into overlapping windows (last window may be shorter).
 *
 * Consecutive windows start `windowS - overlapS` apart.
 *
 * @param {number} durationS Total duration to cover.
 * @param {number} [windowS=30] Length of each window.
 * @param {number} [overlapS=5] How much each window overlaps the previous one.
 * @returns {{ startS: number, endS: number }[]} Windows in ascending order;
 *   empty for a non-positive duration, a single window when the duration
 *   fits in one.
 * @throws {RangeError} When more than one window is needed and the
 *   parameters cannot make progress (non-finite duration, non-positive
 *   window, or overlap outside `[0, windowS)`).
 */
function planChunks(durationS, windowS = 30, overlapS = 5) {
  if (durationS <= 0) return [];
  if (durationS <= windowS) return [{ startS: 0, endS: durationS }];

  // Validated only once several windows are needed, so single-window calls
  // keep working with any overlap value. Without these checks a step <= 0
  // (or an infinite duration) loops forever while growing the array.
  if (durationS === Infinity) {
    throw new RangeError("planChunks: durationS must be finite");
  }
  if (!(windowS > 0) || !(overlapS >= 0) || !(overlapS < windowS)) {
    throw new RangeError("planChunks: expected windowS > 0 and 0 <= overlapS < windowS");
  }

  const step = windowS - overlapS;
  const chunks = [];
  // Stop once the remaining tail is already covered by the previous window's overlap.
  for (let start = 0; start < durationS - overlapS; start += step) {
    chunks.push({ startS: start, endS: Math.min(durationS, start + windowS) });
  }
  return chunks;
}
|
|
93
|
+
/**
 * Merge consecutive chunk transcripts on word timestamps rather than
 * concatenating: inside each overlap the cut lands on the largest silence
 * between the incoming chunk's words (falling back to the overlap midpoint),
 * the outgoing chunk keeps words before the cut, the incoming one after.
 * This absorbs the timestamp drift Whisper accumulates near window edges.
 *
 * @param {{ chunk: { startS: number, endS: number }, words: { startMs: number, endMs: number }[] }[]} results
 *   Per-chunk results in chunk order; chunks without words are ignored.
 * @returns {object[]} All surviving words, sorted by `startMs`.
 */
function mergeChunkWords(results) {
  return mergeChunkItems(results, (result) => result.words);
}

/**
 * Same overlap-cut strategy as {@link mergeChunkWords}, for segment-timed models.
 *
 * @param {{ chunk: { startS: number, endS: number }, segments: { startMs: number, endMs: number }[] }[]} results
 *   Per-chunk results in chunk order; chunks without segments are ignored.
 * @returns {object[]} All surviving segments, sorted by `startMs`.
 */
function mergeChunkSegments(results) {
  return mergeChunkItems(results, (result) => result.segments);
}

/**
 * Shared implementation behind the word and segment merges.
 *
 * @param {object[]} results Per-chunk results, each carrying `chunk.startS` / `chunk.endS`.
 * @param {(result: object) => { startMs: number, endMs: number }[]} itemsOf
 *   Picks the timed items (words or segments) out of one result.
 * @returns {object[]} Merged items sorted by `startMs`.
 */
function mergeChunkItems(results, itemsOf) {
  const present = results.filter((result) => itemsOf(result).length > 0);
  if (present.length === 0) return [];

  let merged = [...itemsOf(present[0])];
  for (let i = 1; i < present.length; i++) {
    const incoming = itemsOf(present[i]);
    // The overlap runs from the incoming chunk's start to whichever chunk ends first.
    const overlapStartMs = present[i].chunk.startS * 1e3;
    const prevEndMs = present[i - 1].chunk.endS * 1e3;
    const overlapEndMs = Math.min(prevEndMs, present[i].chunk.endS * 1e3);
    if (overlapEndMs <= overlapStartMs) {
      // No shared audio between the two chunks: nothing to de-duplicate.
      merged = [...merged, ...incoming];
      continue;
    }
    const cutMs = pickCut(incoming, overlapStartMs, overlapEndMs);
    merged = [
      ...merged.filter((item) => item.startMs < cutMs),
      ...incoming.filter((item) => item.startMs >= cutMs),
    ];
  }
  return merged.sort((a, b) => a.startMs - b.startMs);
}

/**
 * The middle of the largest inter-word gap inside the overlap, else the
 * overlap midpoint. Cutting mid-gap (not at a word edge) keeps the two
 * chunks' slightly-drifted copies of the same word from both surviving.
 *
 * @param {{ startMs: number, endMs: number }[]} words Timed items of the
 *   incoming chunk; gaps are measured between neighbours in the given order.
 * @param {number} overlapStartMs Start of the overlap.
 * @param {number} overlapEndMs End of the overlap.
 * @param {number} [minGapMs=120] Smallest gap that is accepted as a cut point.
 * @returns {number} Timestamp (ms) at which to switch from the outgoing to
 *   the incoming chunk.
 */
function pickCut(words, overlapStartMs, overlapEndMs, minGapMs = 120) {
  const midpoint = (overlapStartMs + overlapEndMs) / 2;
  // Only items that touch the overlap window can bound a usable gap.
  const inWindow = words.filter((w) => w.endMs > overlapStartMs && w.startMs < overlapEndMs);
  let bestGap = 0;
  let bestCut = midpoint;
  for (let i = 1; i < inWindow.length; i++) {
    const gap = inWindow[i].startMs - inWindow[i - 1].endMs;
    if (gap > bestGap) {
      bestGap = gap;
      bestCut = (inWindow[i - 1].endMs + inWindow[i].startMs) / 2;
    }
  }
  return bestGap >= minGapMs ? bestCut : midpoint;
}
|
|
155
|
+
//#endregion
|
|
156
|
+
//#region src/repetition.ts
|
|
157
|
+
/**
 * True when the token stream ends up looping the same n-gram.
 *
 * Tokens are lower-cased and stripped of everything except letters, digits
 * and apostrophes before comparison; tokens that become empty are dropped.
 * For each n-gram size from 1 to `maxNgram` the stream is scanned for the
 * same n tokens repeated back-to-back at least `minRepeats` times, starting
 * at any position.
 *
 * @param {string[]} tokens Tokens in output order.
 * @param {{ maxNgram?: number, minRepeats?: number }} [options]
 *   `maxNgram` defaults to 4. `minRepeats` defaults to 6 for single tokens,
 *   4 for bigrams and 3 for longer n-grams; values below 2 are not meaningful.
 * @returns {boolean} Whether a repetition loop was found.
 */
function hasRepetitionLoop(tokens, options = {}) {
  const maxNgram = options.maxNgram ?? 4;
  const normalized = tokens
    .map((token) => token.toLowerCase().replace(/[^\p{L}\p{N}']+/gu, ""))
    .filter(Boolean);

  for (let n = 1; n <= maxNgram; n++) {
    const needed = options.minRepeats ?? (n === 1 ? 6 : n === 2 ? 4 : 3);
    if (normalized.length < n * needed) continue;

    // `needed` back-to-back copies of an n-gram are exactly a run of
    // n * (needed - 1) positions where each token equals the one n places
    // earlier. Counting that run finds the loop at any offset, whereas
    // comparing n-sized blocks aligned to token 0 misses loops that start
    // mid-block.
    const requiredRun = n * (needed - 1);
    let run = 0;
    for (let i = n; i < normalized.length; i++) {
      run = normalized[i] === normalized[i - n] ? run + 1 : 0;
      if (run >= requiredRun) return true;
    }
  }
  return false;
}
|
|
177
|
+
/**
 * Plain-text front end for `hasRepetitionLoop`: the text is split on runs of
 * whitespace and the pieces are checked as tokens.
 *
 * @param {string} text Transcript text.
 * @param {object} [options] Passed through unchanged to `hasRepetitionLoop`.
 * @returns {boolean} Result of `hasRepetitionLoop` for the split tokens.
 */
function textHasRepetitionLoop(text, options) {
  const pieces = text.split(/\s+/);
  return hasRepetitionLoop(pieces, options);
}
|
|
181
|
+
//#endregion
|
|
182
|
+
//#region src/vad.ts
|
|
183
|
+
/**
|
|
184
|
+
* Lightweight energy-based voice activity pre-pass: windows that are
|
|
185
|
+
* essentially silent never reach the model, which is where Whisper
|
|
186
|
+
* hallucinates text. Deliberately conservative — it only skips clear
|
|
187
|
+
* silence, anything ambiguous (music, low speech) still gets transcribed.
|
|
188
|
+
* The seam is shaped so a silero-vad WASM pass can replace it later.
|
|
189
|
+
*/
|
|
190
|
+
/** Analysis frame length used by `measureActivity` (0.03, multiplied by the sample rate). */
const FRAME_S = 0.03;

/**
 * Measure frame-level energy of a sample buffer.
 *
 * The buffer is cut into consecutive frames of `round(sampleRate * FRAME_S)`
 * samples (at least one sample; the last frame may be shorter). Each frame's
 * RMS is computed; a frame counts as active when its RMS exceeds 0.004.
 *
 * @param {ArrayLike<number>} samples Audio samples.
 * @param {number} sampleRate Sample rate of `samples`.
 * @returns {{ activeFraction: number, peakRms: number }} Share of active
 *   frames (0 when there are no frames) and the largest frame RMS seen.
 */
function measureActivity(samples, sampleRate) {
  const hop = Math.max(1, Math.round(sampleRate * FRAME_S));
  const total = samples.length;
  let frameCount = 0;
  let activeCount = 0;
  let loudest = 0;
  let from = 0;
  while (from < total) {
    const to = Math.min(total, from + hop);
    let energy = 0;
    for (let k = from; k < to; k += 1) {
      const value = samples[k];
      energy += value * value;
    }
    const level = Math.sqrt(energy / Math.max(1, to - from));
    loudest = Math.max(loudest, level);
    if (level > 0.004) activeCount += 1;
    frameCount += 1;
    from += hop;
  }
  const activeFraction = frameCount > 0 ? activeCount / frameCount : 0;
  return { activeFraction, peakRms: loudest };
}
|
|
210
|
+
/**
 * Whether a window plausibly contains any speech at all.
 *
 * @param {ArrayLike<number>} samples Audio samples of the window.
 * @param {number} sampleRate Sample rate of `samples`.
 * @returns {boolean} True when `measureActivity` reports a peak frame RMS of
 *   at least 0.006 and at least 1% active frames.
 */
function hasSpeech(samples, sampleRate) {
  const activity = measureActivity(samples, sampleRate);
  const loudEnough = activity.peakRms >= 0.006;
  const activeEnough = activity.activeFraction >= 0.01;
  return loudEnough && activeEnough;
}
|
|
215
|
+
//#endregion
|
|
216
|
+
//#region src/index.ts
|
|
217
|
+
/**
|
|
218
|
+
* On-device Whisper provider (Transformers.js in a dedicated worker).
|
|
219
|
+
* Reliability over flash: capability-gated hard (WebGPU + enough memory),
|
|
220
|
+
* chunked with per-window progress, VAD + repetition guards in the worker.
|
|
221
|
+
* On-device is OFFERED, never forced — keep a server provider as the
|
|
222
|
+
* default and let users opt in (the model is a 40–150MB one-time download,
|
|
223
|
+
* cached by the browser after that).
|
|
224
|
+
*/
|
|
225
|
+
/**
 * Built-in model choices (ONNX community builds of OpenAI Whisper), keyed by
 * the short name accepted as `options.model`.
 */
const WHISPER_MODELS = {
  // Multilingual, ~145MB at q8 — the WebGPU default.
  "base": "onnx-community/whisper-base",
  // English-only, ~40MB — the low-memory default.
  "tiny.en": "onnx-community/whisper-tiny.en",
};
|
|
232
|
+
/**
 * The object capability checks read from: the global `navigator` when one
 * exists, otherwise an empty object so property reads yield `undefined`.
 *
 * @returns {object} `navigator` or `{}`.
 */
function capabilities() {
  if (typeof navigator !== "undefined") return navigator;
  return {};
}
|
|
235
|
+
/**
 * Hard capability gate: WebGPU plus ≥4GB device memory. Browsers that don't
 * report `deviceMemory` (Safari/Firefox) pass on WebGPU alone — the spec
 * caps reported values at 8 anyway.
 *
 * @returns {boolean} False when `Worker` is not defined or the capabilities
 *   object has no truthy `gpu`; otherwise true unless a reported
 *   `deviceMemory` is below 4.
 */
function isLocalTranscriptionSupported() {
  const workersAvailable = typeof Worker !== "undefined";
  if (!workersAvailable) return false;

  const { gpu: webgpu, deviceMemory: reportedMemory } = capabilities();
  if (!webgpu) return false;
  if (reportedMemory === undefined) return true;
  return reportedMemory >= 4;
}
|
|
246
|
+
/**
 * whisper-base on roomy machines, whisper-tiny.en when memory is tight.
 *
 * @returns {string} `WHISPER_MODELS["tiny.en"]` when `deviceMemory` is
 *   reported and below 8, otherwise `WHISPER_MODELS.base`.
 */
function pickDefaultModel() {
  const reportedMemory = capabilities().deviceMemory;
  const memoryIsTight = reportedMemory !== undefined && reportedMemory < 8;
  if (memoryIsTight) return WHISPER_MODELS["tiny.en"];
  return WHISPER_MODELS.base;
}
|
|
251
|
+
/**
 * Create a transcription provider that runs Whisper in a worker.
 *
 * The worker is created lazily on the first `transcribe` call and reused by
 * later calls. Each call posts one `"transcribe"` request tagged with a
 * numeric id and settles on the worker's `"result"` / `"error"` message with
 * that id; `"progress"` messages are forwarded to `options.onProgress`.
 *
 * @param {object} [options]
 * @param {string} [options.model] A key of `WHISPER_MODELS`, or any other
 *   model identifier (passed through as-is). Defaults to `pickDefaultModel()`.
 * @param {string} [options.device="webgpu"] Forwarded to the worker.
 * @param {string} [options.dtype="q8"] Forwarded to the worker.
 * @param {string} [options.id="whisper-local"] Provider id.
 * @param {() => Worker} [options.createWorker] Factory that replaces the
 *   built-in `./whisper-worker.js` module worker.
 * @param {(update: { phase: unknown, progress: unknown }) => void} [options.onProgress]
 *   Receives progress updates of the running request.
 * @returns {{ id: string, transcribe: (input: object, transcribeOptions?: { signal?: AbortSignal, language?: string }) => Promise<unknown> }}
 *   The provider. `transcribe` rejects with the abort reason when
 *   `signal` fires, and with an `Error` when the worker reports or raises one.
 */
function createLocalWhisperProvider(options = {}) {
  const requestedModel = options.model;
  // Own-property lookup only: `in` would also match inherited keys such as
  // "constructor" and turn the model into a function.
  const model = typeof requestedModel === "string" && Object.hasOwn(WHISPER_MODELS, requestedModel)
    ? WHISPER_MODELS[requestedModel]
    : requestedModel ?? pickDefaultModel();
  const device = options.device ?? "webgpu";
  const dtype = options.dtype ?? "q8";
  let worker = null;
  let requestId = 0;
  // Requests still waiting on a worker, so tearing a worker down can settle
  // every request that was using it instead of leaving some pending forever.
  const inFlight = new Set();

  const ensureWorker = () => {
    worker ??= options.createWorker
      ? options.createWorker()
      : new Worker(new URL("./whisper-worker.js", import.meta.url), { type: "module" });
    return worker;
  };

  // Stop `target`, forget it if it is still the shared worker (a stale
  // handler must not drop a newer one), and fail whatever else was using it.
  const retireWorker = (target, reasonForOthers) => {
    target.terminate();
    if (worker === target) worker = null;
    for (const pending of [...inFlight]) {
      if (pending.target === target) pending.fail(reasonForOthers);
    }
  };

  return {
    id: options.id ?? "whisper-local",
    async transcribe(input, transcribeOptions) {
      const signal = transcribeOptions?.signal;
      signal?.throwIfAborted();
      const audio = await decodeToWhisperInput(input);
      // Decoding is asynchronous; re-check before touching the worker.
      signal?.throwIfAborted();
      const target = ensureWorker();
      const id = requestId++;
      return new Promise((resolve, reject) => {
        const entry = { target, fail: null };
        const cleanup = () => {
          inFlight.delete(entry);
          target.removeEventListener("message", onMessage);
          target.removeEventListener("error", onError);
          signal?.removeEventListener("abort", onAbort);
        };
        const fail = (error) => {
          cleanup();
          reject(error);
        };
        entry.fail = fail;

        const onAbort = () => {
          fail(signal?.reason ?? new DOMException("Transcription aborted", "AbortError"));
          // Terminating is the only way to stop the running inference; other
          // requests on the same worker can no longer complete, so reject them.
          retireWorker(target, new Error("Whisper worker was terminated because another transcription was aborted"));
        };
        const onError = (event) => {
          const error = event.error instanceof Error ? event.error : new Error(event.message || "Whisper worker crashed");
          fail(error);
          // An uncaught worker error does not stop the worker by itself;
          // terminate it so it is not leaked once the reference is dropped.
          retireWorker(target, error);
        };
        const onMessage = (event) => {
          const message = event.data;
          // Messages for other request ids belong to other listeners.
          if (message.id !== id) return;
          if (message.type === "progress") {
            options.onProgress?.({
              phase: message.phase,
              progress: message.progress,
            });
          } else if (message.type === "result") {
            cleanup();
            resolve(message.result);
          } else if (message.type === "error") {
            fail(new Error(message.message));
          }
        };

        signal?.addEventListener("abort", onAbort, { once: true });
        target.addEventListener("message", onMessage);
        target.addEventListener("error", onError);
        inFlight.add(entry);

        const request = {
          type: "transcribe",
          id,
          config: {
            model,
            device,
            dtype,
          },
          audio,
          ...transcribeOptions?.language ? { language: transcribeOptions.language } : {},
        };
        try {
          // The sample buffer is transferred, not copied.
          target.postMessage(request, [audio.buffer]);
        } catch (error) {
          // Reject and detach the listeners instead of leaving them registered.
          fail(error);
        }
      });
    },
  };
}
|
|
320
|
+
/**
 * Normalize any {@link TranscribeInput} into 16kHz mono PCM.
 *
 * WAV input is parsed directly by `parseWav`; anything else is decoded with
 * an `AudioContext` when one is available, and its channels are averaged.
 *
 * @param {{ audio: unknown }} input Input whose `audio` is accepted by `toArrayBuffer`.
 * @returns {Promise<Float32Array>} Mono samples at `WHISPER_SAMPLE_RATE`.
 * @throws {Error} When the input is not a parsable WAV and no `AudioContext`
 *   exists. Errors from fetching or decoding the audio propagate as well.
 */
async function decodeToWhisperInput(input) {
  const buffer = await toArrayBuffer(input.audio);
  const wav = parseWav(buffer);
  if (wav) return resampleTo(wav, WHISPER_SAMPLE_RATE);

  if (typeof AudioContext === "undefined") {
    throw new Error("Unsupported audio: expected WAV (use extractAudioToWav) or a browser context");
  }

  const context = new AudioContext({ sampleRate: WHISPER_SAMPLE_RATE });
  try {
    // decodeAudioData takes ownership of the buffer it is given, so pass a copy.
    const decoded = await context.decodeAudioData(buffer.slice(0));
    const channelCount = decoded.numberOfChannels;
    const mono = new Float32Array(decoded.length);
    for (let c = 0; c < channelCount; c++) {
      const channel = decoded.getChannelData(c);
      for (let i = 0; i < channel.length; i++) mono[i] += channel[i] / channelCount;
    }
    return resampleTo({
      samples: mono,
      sampleRate: decoded.sampleRate,
    }, WHISPER_SAMPLE_RATE);
  } finally {
    // close() is asynchronous. Await it so its promise is not left floating,
    // and deliberately ignore a failure: releasing the context is best-effort
    // and must neither mask the decode outcome nor surface as an unhandled
    // rejection.
    try {
      await context.close();
    } catch {
      // Best-effort cleanup; nothing useful to do here.
    }
  }
}
|
|
344
|
+
/**
 * Turn the supported audio inputs into an ArrayBuffer of the raw bytes.
 *
 * @param {string | Blob | ArrayBuffer | ArrayBufferView | ArrayLike<number>} audio
 *   A URL to fetch, a Blob, an ArrayBuffer (returned as-is), or a view /
 *   byte sequence (copied).
 * @returns {Promise<ArrayBuffer>} The audio bytes.
 * @throws {Error} When a URL responds with a non-OK status.
 * @throws {TypeError} When `audio` is none of the supported kinds.
 */
async function toArrayBuffer(audio) {
  if (typeof audio === "string") {
    const response = await fetch(audio);
    if (!response.ok) throw new Error(`Could not fetch audio (${response.status})`);
    return response.arrayBuffer();
  }
  if (audio instanceof Blob) return audio.arrayBuffer();
  if (audio instanceof ArrayBuffer) return audio;
  if (audio === null || typeof audio !== "object") {
    throw new TypeError("Unsupported audio input: expected a URL, Blob, ArrayBuffer or typed array");
  }
  // Copy the underlying bytes. Copying element by element (`set(audio)`)
  // only works for Uint8Array: wider typed arrays would have each element
  // truncated to a byte, and DataViews or ArrayBuffers from another realm
  // have no elements at all and would come out as zeros.
  const bytes = ArrayBuffer.isView(audio)
    ? new Uint8Array(audio.buffer, audio.byteOffset, audio.byteLength)
    : new Uint8Array(audio);
  return bytes.slice().buffer;
}
|
|
356
|
+
//#endregion
|
|
357
|
+
export { CHUNK_OVERLAP_S, CHUNK_WINDOW_S, WHISPER_MODELS, WHISPER_SAMPLE_RATE, createLocalWhisperProvider, hasRepetitionLoop, hasSpeech, isLocalTranscriptionSupported, measureActivity, mergeChunkSegments, mergeChunkWords, parseWav, pickDefaultModel, planChunks, resampleTo, textHasRepetitionLoop };
|