@mcut/transcription-local 0.1.0-alpha.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js ADDED
@@ -0,0 +1,357 @@
1
+ //#region src/wav.ts
2
+ /**
3
+ * Minimal WAV reader → 16kHz mono Float32Array (Whisper's input format).
4
+ * The editor extracts clip audio as 16kHz WAV already (`extractAudioToWav`),
5
+ * so this covers the hot path without any AudioContext — it runs in workers
6
+ * and tests alike. Non-WAV input falls back to decodeAudioData upstream.
7
+ */
8
/** Sample rate, in Hz, that audio is resampled to before it is handed to Whisper. */
const WHISPER_SAMPLE_RATE = 16000;
9
/** Chunk tags as read big-endian by `DataView.getUint32(offset)`. */
const FOURCC_RIFF = 0x52494646; // "RIFF"
const FOURCC_WAVE = 0x57415645; // "WAVE"
const FOURCC_FMT = 0x666d7420; // "fmt "
const FOURCC_DATA = 0x64617461; // "data"

/** `wFormatTag` values understood by this reader. */
const WAVE_FORMAT_PCM = 1;
const WAVE_FORMAT_IEEE_FLOAT = 3;
const WAVE_FORMAT_EXTENSIBLE = 0xfffe;

/**
 * Parse a PCM/float WAV file and mix it down to mono.
 *
 * Chunks are walked in file order; the first `data` chunk that follows a
 * `fmt ` chunk is decoded. Malformed or unsupported files yield `null`
 * rather than throwing.
 *
 * @param {ArrayBuffer} buffer Raw file bytes.
 * @returns {{ samples: Float32Array, sampleRate: number } | null} Mono samples
 *   at the file's own sample rate, or `null` when the buffer is not a WAV file
 *   this reader can decode.
 */
function parseWav(buffer) {
  const view = new DataView(buffer);
  if (buffer.byteLength < 44) return null;
  if (view.getUint32(0) !== FOURCC_RIFF) return null;
  if (view.getUint32(8) !== FOURCC_WAVE) return null;

  let format = null;
  let offset = 12;
  while (offset + 8 <= buffer.byteLength) {
    const id = view.getUint32(offset);
    const size = view.getUint32(offset + 4, true);
    const body = offset + 8;
    if (id === FOURCC_FMT) {
      // Only trust bytes that are both declared by the chunk and present in the buffer.
      format = readFormat(view, body, Math.min(size, buffer.byteLength - body));
      if (!format) return null;
    } else if (id === FOURCC_DATA && format) {
      // The declared size may overshoot a truncated file; clamp it to the buffer.
      return decodeData(view, body, Math.min(buffer.byteLength, body + size), format);
    }
    // Chunk bodies are padded to an even number of bytes.
    offset = body + size + (size % 2);
  }
  return null;
}

/**
 * Read the fields this decoder needs from a `fmt ` chunk body.
 *
 * @param {DataView} view View over the whole file.
 * @param {number} body Byte offset of the chunk body.
 * @param {number} available Number of readable bytes in the chunk body.
 * @returns {{ audioFormat: number, channels: number, sampleRate: number, bitsPerSample: number } | null}
 *   `null` when the chunk is too short to hold the fields.
 */
function readFormat(view, body, available) {
  if (available < 16) return null;
  let audioFormat = view.getUint16(body, true);
  if (audioFormat === WAVE_FORMAT_EXTENSIBLE) {
    // The real format code is the first two bytes of the SubFormat GUID at offset 24.
    if (available < 26) return null;
    audioFormat = view.getUint16(body + 24, true);
  }
  return {
    audioFormat,
    channels: view.getUint16(body + 2, true),
    sampleRate: view.getUint32(body + 4, true),
    bitsPerSample: view.getUint16(body + 14, true),
  };
}

/**
 * Choose the little-endian sample decoder for a format, normalised to [-1, 1).
 *
 * @returns {((at: number) => number) | null} `null` for unsupported encodings.
 */
function pickSampleReader(view, audioFormat, bitsPerSample) {
  if (audioFormat === WAVE_FORMAT_IEEE_FLOAT) {
    return bitsPerSample === 32 ? (at) => view.getFloat32(at, true) : null;
  }
  if (audioFormat !== WAVE_FORMAT_PCM) return null;
  switch (bitsPerSample) {
    case 8:
      // 8-bit PCM is unsigned with a 128 bias.
      return (at) => (view.getUint8(at) - 128) / 128;
    case 16:
      return (at) => view.getInt16(at, true) / 32768;
    case 24:
      // Assemble three bytes; the signed top byte carries the sign.
      return (at) => (view.getUint8(at) | (view.getUint8(at + 1) << 8) | (view.getInt8(at + 2) << 16)) / 8388608;
    case 32:
      return (at) => view.getInt32(at, true) / 2147483648;
    default:
      return null;
  }
}

/**
 * Decode the interleaved samples in `[start, end)` and average the channels.
 *
 * The encoding is validated before anything is allocated, so an unsupported
 * or nonsensical header returns `null` instead of throwing.
 *
 * @param {DataView} view View over the whole file.
 * @param {number} start Byte offset of the first sample.
 * @param {number} end Byte offset just past the last readable sample byte.
 * @param {{ audioFormat: number, channels: number, sampleRate: number, bitsPerSample: number }} format
 * @returns {{ samples: Float32Array, sampleRate: number } | null}
 */
function decodeData(view, start, end, format) {
  const { audioFormat, channels, sampleRate, bitsPerSample } = format;
  if (channels < 1 || sampleRate <= 0) return null;
  const read = pickSampleReader(view, audioFormat, bitsPerSample);
  if (!read) return null;

  const bytesPerSample = bitsPerSample / 8;
  const frameBytes = bytesPerSample * channels;
  // A trailing partial frame is ignored.
  const frames = Math.floor((end - start) / frameBytes);
  const samples = new Float32Array(frames);
  for (let frame = 0; frame < frames; frame++) {
    const at = start + frame * frameBytes;
    let sum = 0;
    for (let c = 0; c < channels; c++) sum += read(at + c * bytesPerSample);
    samples[frame] = sum / channels;
  }
  return { samples, sampleRate };
}
59
/**
 * Linear-interpolation resample (fine for speech models).
 *
 * @param {{ samples: Float32Array, sampleRate: number }} audio Mono input.
 * @param {number} targetRate Desired sample rate in Hz.
 * @returns {Float32Array} `audio.samples` itself when the rates already match,
 *   otherwise a new array. Empty input yields an empty array.
 */
function resampleTo(audio, targetRate) {
  const { samples, sampleRate } = audio;
  if (sampleRate === targetRate) return samples;
  // Without this guard the loop below would read samples[-1] and emit a single NaN.
  if (samples.length === 0) return new Float32Array(0);

  const ratio = sampleRate / targetRate;
  const length = Math.max(1, Math.round(samples.length / ratio));
  const last = samples.length - 1;
  const out = new Float32Array(length);
  for (let i = 0; i < length; i++) {
    const position = i * ratio;
    const index = Math.floor(position);
    const fraction = position - index;
    // Clamp both taps so the tail repeats the final sample instead of reading past it.
    const a = samples[Math.min(index, last)];
    const b = samples[Math.min(index + 1, last)];
    out[i] = a + (b - a) * fraction;
  }
  return out;
}
74
+ //#endregion
75
+ //#region src/chunking.ts
76
/** Default length of one transcription window, in seconds. */
const CHUNK_WINDOW_S = 30;
/** Default overlap between consecutive windows, in seconds. */
const CHUNK_OVERLAP_S = 5;

/**
 * Split a duration into overlapping windows (last window may be shorter).
 *
 * @param {number} durationS Total duration in seconds.
 * @param {number} [windowS=CHUNK_WINDOW_S] Window length in seconds.
 * @param {number} [overlapS=CHUNK_OVERLAP_S] Overlap between neighbours in seconds.
 * @returns {{ startS: number, endS: number }[]} Windows in ascending order;
 *   empty for a non-positive duration, a single window when the whole
 *   duration fits in one.
 * @throws {RangeError} When more than one window is needed and `overlapS` is
 *   not smaller than `windowS`, since the plan could never advance.
 */
function planChunks(durationS, windowS = CHUNK_WINDOW_S, overlapS = CHUNK_OVERLAP_S) {
  if (durationS <= 0) return [];
  if (durationS <= windowS) return [{ startS: 0, endS: durationS }];

  const step = windowS - overlapS;
  if (step <= 0) {
    throw new RangeError(`overlapS (${overlapS}) must be smaller than windowS (${windowS})`);
  }

  const chunks = [];
  // Stop once the remaining tail is already covered by the previous window's overlap.
  for (let start = 0; start < durationS - overlapS; start += step) {
    chunks.push({
      startS: start,
      endS: Math.min(durationS, start + windowS),
    });
  }
  return chunks;
}
93
+ /**
94
+ * Merge consecutive chunk transcripts on word timestamps rather than
95
+ * concatenating: inside each overlap the cut lands on the largest silence
96
+ * between the incoming chunk's words (falling back to the overlap midpoint),
97
+ * the outgoing chunk keeps words before the cut, the incoming one after.
98
+ * This absorbs the timestamp drift Whisper accumulates near window edges.
99
+ */
100
+ function mergeChunkWords(results) {
101
+ const present = results.filter((r) => r.words.length > 0);
102
+ if (present.length === 0) return [];
103
+ let merged = [...present[0].words];
104
+ for (let i = 1; i < present.length; i++) {
105
+ const next = present[i];
106
+ const overlapStartMs = next.chunk.startS * 1e3;
107
+ const prevEndMs = present[i - 1].chunk.endS * 1e3;
108
+ const overlapEndMs = Math.min(prevEndMs, next.chunk.endS * 1e3);
109
+ if (overlapEndMs <= overlapStartMs) {
110
+ merged = [...merged, ...next.words];
111
+ continue;
112
+ }
113
+ const cutMs = pickCut(next.words, overlapStartMs, overlapEndMs);
114
+ merged = [...merged.filter((w) => w.startMs < cutMs), ...next.words.filter((w) => w.startMs >= cutMs)];
115
+ }
116
+ return merged.sort((a, b) => a.startMs - b.startMs);
117
+ }
118
+ /** Same overlap-cut strategy as {@link mergeChunkWords}, for segment-timed models. */
119
+ function mergeChunkSegments(results) {
120
+ const present = results.filter((r) => r.segments.length > 0);
121
+ if (present.length === 0) return [];
122
+ let merged = [...present[0].segments];
123
+ for (let i = 1; i < present.length; i++) {
124
+ const next = present[i];
125
+ const overlapStartMs = next.chunk.startS * 1e3;
126
+ const prevEndMs = present[i - 1].chunk.endS * 1e3;
127
+ const overlapEndMs = Math.min(prevEndMs, next.chunk.endS * 1e3);
128
+ if (overlapEndMs <= overlapStartMs) {
129
+ merged = [...merged, ...next.segments];
130
+ continue;
131
+ }
132
+ const cutMs = pickCut(next.segments, overlapStartMs, overlapEndMs);
133
+ merged = [...merged.filter((s) => s.startMs < cutMs), ...next.segments.filter((s) => s.startMs >= cutMs)];
134
+ }
135
+ return merged.sort((a, b) => a.startMs - b.startMs);
136
+ }
137
+ /**
138
+ * The middle of the largest inter-word gap inside the overlap, else the
139
+ * overlap midpoint. Cutting mid-gap (not at a word edge) keeps the two
140
+ * chunks' slightly-drifted copies of the same word from both surviving.
141
+ */
142
+ function pickCut(words, overlapStartMs, overlapEndMs) {
143
+ let bestGap = 0;
144
+ let bestCut = (overlapStartMs + overlapEndMs) / 2;
145
+ const inWindow = words.filter((w) => w.endMs > overlapStartMs && w.startMs < overlapEndMs);
146
+ for (let i = 1; i < inWindow.length; i++) {
147
+ const gap = inWindow[i].startMs - inWindow[i - 1].endMs;
148
+ if (gap > bestGap) {
149
+ bestGap = gap;
150
+ bestCut = (inWindow[i - 1].endMs + inWindow[i].startMs) / 2;
151
+ }
152
+ }
153
+ return bestGap >= 120 ? bestCut : (overlapStartMs + overlapEndMs) / 2;
154
+ }
155
+ //#endregion
156
+ //#region src/repetition.ts
157
/**
 * True when the token stream loops the same n-gram.
 *
 * Tokens are lower-cased and stripped of everything except letters, digits
 * and apostrophes; tokens that become empty are dropped. For each n-gram
 * size the function looks for a stretch in which every token equals the
 * token n positions earlier. A stretch of `(repeats - 1) * n` such tokens
 * means the n-gram occurred `repeats` times back to back, wherever in the
 * stream the loop starts.
 *
 * @param {string[]} tokens Tokens in output order.
 * @param {{ maxNgram?: number, minRepeats?: number }} [options]
 *   `maxNgram` is the largest n-gram size tried (default 4). `minRepeats`
 *   overrides the per-size default of 6 repeats for single tokens, 4 for
 *   pairs and 3 for longer n-grams.
 * @returns {boolean}
 */
function hasRepetitionLoop(tokens, options = {}) {
  const maxNgram = options.maxNgram ?? 4;
  const normalized = tokens.map((t) => t.toLowerCase().replace(/[^\p{L}\p{N}']+/gu, "")).filter(Boolean);
  for (let n = 1; n <= maxNgram; n++) {
    const needed = options.minRepeats ?? (n === 1 ? 6 : n === 2 ? 4 : 3);
    if (normalized.length < n * needed) continue;
    // Comparing at lag n token by token makes detection independent of where
    // the loop starts relative to multiples of n.
    const requiredRun = (needed - 1) * n;
    let run = 0;
    for (let i = n; i < normalized.length; i++) {
      run = normalized[i] === normalized[i - n] ? run + 1 : 0;
      if (run >= requiredRun) return true;
    }
  }
  return false;
}
177
/**
 * Plain-text front end for {@link hasRepetitionLoop}: the text is split on
 * runs of whitespace and the pieces are checked as tokens.
 *
 * @param {string} text Transcript text.
 * @param {object} [options] Forwarded unchanged to `hasRepetitionLoop`.
 * @returns {boolean}
 */
function textHasRepetitionLoop(text, options) {
  const tokens = text.split(/\s+/);
  return hasRepetitionLoop(tokens, options);
}
181
+ //#endregion
182
+ //#region src/vad.ts
183
+ /**
184
+ * Lightweight energy-based voice activity pre-pass: windows that are
185
+ * essentially silent never reach the model, which is where Whisper
186
+ * hallucinates text. Deliberately conservative — it only skips clear
187
+ * silence, anything ambiguous (music, low speech) still gets transcribed.
188
+ * The seam is shaped so a silero-vad WASM pass can replace it later.
189
+ */
190
/** Length of one analysis frame, in seconds. */
const FRAME_S = 0.03;

/**
 * Measure frame-level energy of a sample buffer.
 *
 * The buffer is cut into consecutive frames of `FRAME_S` seconds (at least
 * one sample each; the last frame may be shorter). A frame counts as active
 * when its RMS exceeds 0.004.
 *
 * @param {ArrayLike<number>} samples Audio samples.
 * @param {number} sampleRate Sample rate in Hz.
 * @returns {{ activeFraction: number, peakRms: number }} Share of active
 *   frames (0 when there are no frames) and the highest frame RMS seen.
 */
function measureActivity(samples, sampleRate) {
  const frameLength = Math.max(1, Math.round(sampleRate * FRAME_S));
  const total = samples.length;
  let frameCount = 0;
  let activeCount = 0;
  let peakRms = 0;

  let from = 0;
  while (from < total) {
    const to = Math.min(total, from + frameLength);
    let energy = 0;
    for (let k = from; k < to; k++) {
      energy += samples[k] * samples[k];
    }
    const rms = Math.sqrt(energy / Math.max(1, to - from));
    peakRms = Math.max(peakRms, rms);
    if (rms > 0.004) activeCount += 1;
    frameCount += 1;
    from += frameLength;
  }

  const activeFraction = frameCount > 0 ? activeCount / frameCount : 0;
  return { activeFraction, peakRms };
}
210
/**
 * Whether a window plausibly contains any speech at all.
 *
 * @param {ArrayLike<number>} samples Audio samples of the window.
 * @param {number} sampleRate Sample rate in Hz.
 * @returns {boolean} True when the loudest frame reaches an RMS of 0.006 and
 *   at least 1% of the frames are active, as reported by `measureActivity`.
 */
function hasSpeech(samples, sampleRate) {
  const activity = measureActivity(samples, sampleRate);
  const isLoudEnough = activity.peakRms >= 0.006;
  const isSustained = activity.activeFraction >= 0.01;
  return isLoudEnough && isSustained;
}
215
+ //#endregion
216
+ //#region src/index.ts
217
+ /**
218
+ * On-device Whisper provider (Transformers.js in a dedicated worker).
219
+ * Reliability over flash: capability-gated hard (WebGPU + enough memory),
220
+ * chunked with per-window progress, VAD + repetition guards in the worker.
221
+ * On-device is OFFERED, never forced — keep a server provider as the
222
+ * default and let users opt in (the model is a 40–150MB one-time download,
223
+ * cached by the browser after that).
224
+ */
225
/**
 * Built-in model choices (ONNX community builds of OpenAI Whisper), keyed by
 * the short alias callers may pass instead of a full model id.
 */
const WHISPER_MODELS = {
  // Multilingual, ~145MB at q8 — the WebGPU default.
  "base": "onnx-community/whisper-base",
  // English-only, ~40MB — the low-memory default.
  "tiny.en": "onnx-community/whisper-tiny.en",
};
232
/**
 * The object capability checks read from: the global `navigator` when one
 * exists, otherwise an empty object so property reads come back undefined.
 *
 * @returns {object}
 */
function capabilities() {
  if (typeof navigator !== "undefined") return navigator;
  return {};
}
235
/**
 * Hard capability gate: Web Workers, WebGPU, and ≥4GB device memory.
 * Environments that don't report `deviceMemory` pass on the first two alone.
 *
 * @returns {boolean}
 */
function isLocalTranscriptionSupported() {
  if (typeof Worker === "undefined") return false;
  const caps = capabilities();
  const gpu = caps.gpu;
  const memory = caps.deviceMemory;
  if (!gpu) return false;
  if (memory === undefined) return true;
  return memory >= 4;
}
246
/**
 * Choose the default model id: `tiny.en` when the reported `deviceMemory` is
 * below 8, `base` when it is 8 or more or not reported at all.
 *
 * @returns {string} A value from `WHISPER_MODELS`.
 */
function pickDefaultModel() {
  const memory = capabilities().deviceMemory;
  const isMemoryTight = memory !== undefined && memory < 8;
  return isMemoryTight ? WHISPER_MODELS["tiny.en"] : WHISPER_MODELS.base;
}
251
/**
 * Create a transcription provider that runs Whisper in a worker.
 *
 * The worker is created lazily on the first `transcribe` call and shared by
 * later calls. It is discarded, and a fresh one created on demand, after an
 * abort or a worker error.
 *
 * @param {object} [options]
 * @param {string} [options.model] A `WHISPER_MODELS` alias or a full model id;
 *   defaults to `pickDefaultModel()`.
 * @param {string} [options.device="webgpu"] Passed to the worker as `config.device`.
 * @param {string} [options.dtype="q8"] Passed to the worker as `config.dtype`.
 * @param {string} [options.id="whisper-local"] Provider id.
 * @param {() => Worker} [options.createWorker] Factory replacing the built-in
 *   `./whisper-worker.js` module worker.
 * @param {(update: { phase: unknown, progress: unknown }) => void} [options.onProgress]
 *   Called for every progress message of a running request.
 * @returns {{ id: string, transcribe: (input: object, transcribeOptions?: { signal?: AbortSignal, language?: string }) => Promise<unknown> }}
 *   `transcribe` resolves with the worker's `result` payload. It rejects with
 *   the abort reason, the worker's error, or an error stating that the worker
 *   was shut down on behalf of another request.
 */
function createLocalWhisperProvider(options = {}) {
  // Own-property lookup only: `in` would also match "constructor", "toString", ...
  const model = options.model && Object.hasOwn(WHISPER_MODELS, options.model)
    ? WHISPER_MODELS[options.model]
    : options.model ?? pickDefaultModel();
  const device = options.device ?? "webgpu";
  const dtype = options.dtype ?? "q8";
  let worker = null;
  let requestId = 0;
  /** Requests awaiting an answer, so none is left hanging when their worker goes away. */
  const inFlight = new Set();

  const ensureWorker = () => {
    worker ??= options.createWorker
      ? options.createWorker()
      : new Worker(new URL("./whisper-worker.js", import.meta.url), { type: "module" });
    return worker;
  };

  /** Shut `target` down and fail every request still waiting on it. */
  const retireWorker = (target, error) => {
    // A newer worker may already have replaced this one; leave that one alone.
    if (worker === target) worker = null;
    target.terminate();
    for (const request of [...inFlight]) {
      if (request.target === target) request.fail(error);
    }
  };

  return {
    id: options.id ?? "whisper-local",
    async transcribe(input, transcribeOptions) {
      const signal = transcribeOptions?.signal;
      signal?.throwIfAborted();
      const audio = await decodeToWhisperInput(input);
      signal?.throwIfAborted();
      const target = ensureWorker();
      const id = requestId++;
      return new Promise((resolve, reject) => {
        const cleanup = () => {
          inFlight.delete(entry);
          target.removeEventListener("message", onMessage);
          target.removeEventListener("error", onError);
          signal?.removeEventListener("abort", onAbort);
        };
        const entry = {
          target,
          fail(error) {
            cleanup();
            reject(error);
          },
        };
        const onAbort = () => {
          entry.fail(signal?.reason ?? new DOMException("Transcription aborted", "AbortError"));
          // Terminating is the only way to stop the running job; other requests
          // on this worker would never hear back, so they are rejected too.
          retireWorker(target, new Error("Whisper worker was terminated because another transcription was aborted"));
        };
        const onError = (event) => {
          const error = event.error instanceof Error
            ? event.error
            : new Error(event.message || "Whisper worker crashed");
          entry.fail(error);
          retireWorker(target, error);
        };
        const onMessage = (event) => {
          const message = event.data;
          if (message.id !== id) return;
          if (message.type === "progress") {
            options.onProgress?.({
              phase: message.phase,
              progress: message.progress,
            });
          } else if (message.type === "result") {
            cleanup();
            resolve(message.result);
          } else if (message.type === "error") {
            cleanup();
            reject(new Error(message.message));
          }
        };

        inFlight.add(entry);
        signal?.addEventListener("abort", onAbort, { once: true });
        target.addEventListener("message", onMessage);
        target.addEventListener("error", onError);
        const request = {
          type: "transcribe",
          id,
          config: { model, device, dtype },
          audio,
          ...(transcribeOptions?.language ? { language: transcribeOptions.language } : {}),
        };
        try {
          // The sample buffer is transferred, not copied.
          target.postMessage(request, [audio.buffer]);
        } catch (error) {
          // Detach the listeners instead of leaking them on a request that never started.
          entry.fail(error);
        }
      });
    },
  };
}
320
/**
 * Normalize any {@link TranscribeInput} into mono PCM at `WHISPER_SAMPLE_RATE`.
 *
 * WAV input is decoded directly by `parseWav`. Anything else is handed to an
 * `AudioContext` when one exists, and its channels are averaged.
 *
 * @param {{ audio: unknown }} input Input whose `audio` is accepted by `toArrayBuffer`.
 * @returns {Promise<Float32Array>}
 * @throws {Error} When the input is not a decodable WAV and no `AudioContext` is available.
 */
async function decodeToWhisperInput(input) {
  const bytes = await toArrayBuffer(input.audio);
  const parsed = parseWav(bytes);
  if (parsed) return resampleTo(parsed, WHISPER_SAMPLE_RATE);

  if (typeof AudioContext === "undefined") {
    throw new Error("Unsupported audio: expected WAV (use extractAudioToWav) or a browser context");
  }

  const context = new AudioContext({ sampleRate: WHISPER_SAMPLE_RATE });
  try {
    // decodeAudioData is given a copy so the caller's buffer stays intact.
    const decoded = await context.decodeAudioData(bytes.slice(0));
    const channelCount = decoded.numberOfChannels;
    const mono = new Float32Array(decoded.length);
    for (let c = 0; c < channelCount; c++) {
      decoded.getChannelData(c).forEach((value, i) => {
        mono[i] += value / channelCount;
      });
    }
    return resampleTo({ samples: mono, sampleRate: decoded.sampleRate }, WHISPER_SAMPLE_RATE);
  } finally {
    context.close();
  }
}
344
/**
 * Resolve the supported audio sources to an `ArrayBuffer`.
 *
 * @param {string | Blob | ArrayBuffer | ArrayBufferView} audio A URL to fetch,
 *   a Blob, an ArrayBuffer (returned as is), or a view whose bytes are copied.
 * @returns {Promise<ArrayBuffer>}
 * @throws {Error} When a URL is given and the response status is not OK.
 */
async function toArrayBuffer(audio) {
  if (typeof audio === "string") {
    const response = await fetch(audio);
    if (!response.ok) throw new Error(`Could not fetch audio (${response.status})`);
    return response.arrayBuffer();
  }
  if (audio instanceof Blob) return audio.arrayBuffer();
  if (audio instanceof ArrayBuffer) return audio;
  // Copy the view's raw bytes. An element-wise copy would truncate each value
  // of a non-Uint8 view to one byte, and would skip a DataView entirely.
  const bytes = ArrayBuffer.isView(audio)
    ? new Uint8Array(audio.buffer, audio.byteOffset, audio.byteLength)
    : new Uint8Array(audio);
  return bytes.slice().buffer;
}
356
+ //#endregion
357
+ export { CHUNK_OVERLAP_S, CHUNK_WINDOW_S, WHISPER_MODELS, WHISPER_SAMPLE_RATE, createLocalWhisperProvider, hasRepetitionLoop, hasSpeech, isLocalTranscriptionSupported, measureActivity, mergeChunkSegments, mergeChunkWords, parseWav, pickDefaultModel, planChunks, resampleTo, textHasRepetitionLoop };