voxtral-transcribe-ts 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +231 -0
- package/dist/audio-core-CYgIOxAr.d.ts +50 -0
- package/dist/chunk-J6JHGURB.js +161 -0
- package/dist/index.browser.d.ts +111 -0
- package/dist/index.browser.js +247 -0
- package/dist/index.node.d.ts +124 -0
- package/dist/index.node.js +328 -0
- package/package.json +65 -0
package/README.md
ADDED
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
# voxtral-transcribe-ts
|
|
2
|
+
|
|
3
|
+
Minimal TypeScript wrapper for local transcription with `Voxtral Mini 4B Realtime` in Node.js.
|
|
4
|
+
|
|
5
|
+
This package targets the ONNX checkpoint:
|
|
6
|
+
|
|
7
|
+
- `onnx-community/Voxtral-Mini-4B-Realtime-2602-ONNX`
|
|
8
|
+
|
|
9
|
+
It is intentionally small:
|
|
10
|
+
|
|
11
|
+
- Node/TS only, no Python
|
|
12
|
+
- thin wrapper around `@huggingface/transformers` + ONNX Runtime
|
|
13
|
+
- zero external audio-decoder dependencies
|
|
14
|
+
|
|
15
|
+
The built-in file loader only supports `.wav` input so the package can stay lightweight. If you already have PCM samples in memory, use `transcribeAudio()`.
|
|
16
|
+
|
|
17
|
+
Architecture and multi-target rollout plan: [PLAN.md](./PLAN.md)
|
|
18
|
+
|
|
19
|
+
## Install
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
npm install voxtral-transcribe-ts
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## Quick Start
|
|
26
|
+
|
|
27
|
+
```ts
|
|
28
|
+
import { VoxtralTranscriber } from "voxtral-transcribe-ts";
|
|
29
|
+
|
|
30
|
+
const transcriber = new VoxtralTranscriber({
|
|
31
|
+
device: "cpu",
|
|
32
|
+
dtype: "q4",
|
|
33
|
+
});
|
|
34
|
+
|
|
35
|
+
const result = await transcriber.transcribeFile("./sample.wav");
|
|
36
|
+
console.log(result.text);
|
|
37
|
+
|
|
38
|
+
await transcriber.dispose();
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
By default, the package now auto-selects the audio decoder backend:
|
|
42
|
+
|
|
43
|
+
- Node/local: `InternalWavDecoder`
|
|
44
|
+
- Browser: `BrowserNativeAudioDecoder`
|
|
45
|
+
|
|
46
|
+
The package now ships conditional entries:
|
|
47
|
+
|
|
48
|
+
- package root in Node -> `dist/index.node.js`
|
|
49
|
+
- package root in browser-aware bundlers -> `dist/index.browser.js`
|
|
50
|
+
- explicit subpaths:
|
|
51
|
+
- `voxtral-transcribe-ts/node`
|
|
52
|
+
- `voxtral-transcribe-ts/browser`
|
|
53
|
+
|
|
54
|
+
## Environment Matrix
|
|
55
|
+
|
|
56
|
+
| Environment | Package entry | Inference runtime | Default decoder | File input strategy |
|
|
57
|
+
|---|---|---|---|---|
|
|
58
|
+
| Node / local | `voxtral-transcribe-ts` or `voxtral-transcribe-ts/node` | `@huggingface/transformers` + `onnxruntime-node` | `InternalWavDecoder` | `wav` by default, multiformat via `FfmpegDecoder` |
|
|
59
|
+
| Browser | `voxtral-transcribe-ts` in browser-aware bundlers or `voxtral-transcribe-ts/browser` | browser-safe package entry | `BrowserNativeAudioDecoder` | URL, `Blob`, `File`, browser codec support dependent on runtime |
|
|
60
|
+
| Server high-perf | `voxtral-transcribe-ts/node` | `@huggingface/transformers` + `onnxruntime-node` | `FfmpegDecoder` recommended | multiformat through `ffmpeg` |
|
|
61
|
+
|
|
62
|
+
## Decoder Matrix
|
|
63
|
+
|
|
64
|
+
| Decoder | Environment | Purpose | Notes |
|
|
65
|
+
|---|---|---|---|
|
|
66
|
+
| `InternalWavDecoder` | Node, browser | Minimal fallback | `wav` only |
|
|
67
|
+
| `FfmpegDecoder` | Node / server | Best multiformat local path | Not available in browser builds |
|
|
68
|
+
| `BrowserNativeAudioDecoder` | Browser | Native client-side decoding | Depends on browser codec support |
|
|
69
|
+
|
|
70
|
+
You can override this with:
|
|
71
|
+
|
|
72
|
+
- `target: "auto" | "node" | "browser"`
|
|
73
|
+
- `audioDecoderBackend`
|
|
74
|
+
- `inferenceBackend`
|
|
75
|
+
|
|
76
|
+
## Raw Audio
|
|
77
|
+
|
|
78
|
+
```ts
|
|
79
|
+
import { transcribeAudio } from "voxtral-transcribe-ts";
|
|
80
|
+
|
|
81
|
+
const samples = new Float32Array([/* mono PCM samples */]);
|
|
82
|
+
|
|
83
|
+
const result = await transcribeAudio(samples, {
|
|
84
|
+
sampleRate: 16_000,
|
|
85
|
+
});
|
|
86
|
+
|
|
87
|
+
console.log(result.text);
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
## API
|
|
91
|
+
|
|
92
|
+
### `new VoxtralTranscriber(options?)`
|
|
93
|
+
|
|
94
|
+
Options:
|
|
95
|
+
|
|
96
|
+
- `model`: defaults to `onnx-community/Voxtral-Mini-4B-Realtime-2602-ONNX`
|
|
97
|
+
- `device`: defaults to `cpu`
|
|
98
|
+
- `dtype`: defaults to `q4`
|
|
99
|
+
- `cacheDir`
|
|
100
|
+
- `localFilesOnly`
|
|
101
|
+
- `revision`
|
|
102
|
+
- `progressCallback`
|
|
103
|
+
- `target`: defaults to `auto`
|
|
104
|
+
|
|
105
|
+
### `await transcriber.load()`
|
|
106
|
+
|
|
107
|
+
Preloads the processor and model.
|
|
108
|
+
|
|
109
|
+
### `await transcriber.transcribeFile(path, options?)`
|
|
110
|
+
|
|
111
|
+
Reads a WAV file, downmixes it to mono, resamples it to the model sample rate, and returns:
|
|
112
|
+
|
|
113
|
+
```ts
|
|
114
|
+
type VoxtralTranscriptionResult = {
|
|
115
|
+
decoder: string;
|
|
116
|
+
durationMs: number;
|
|
117
|
+
model: string;
|
|
118
|
+
sampleRate: number;
|
|
119
|
+
text: string;
|
|
120
|
+
};
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
### `await transcriber.transcribeAudio(samples, options?)`
|
|
124
|
+
|
|
125
|
+
Transcribes mono PCM samples already loaded in memory.
|
|
126
|
+
|
|
127
|
+
Options:
|
|
128
|
+
|
|
129
|
+
- `sampleRate`: defaults to `16000`
|
|
130
|
+
- `maxNewTokens`
|
|
131
|
+
- `skipSpecialTokens`: defaults to `true`
|
|
132
|
+
|
|
133
|
+
## Advanced
|
|
134
|
+
|
|
135
|
+
The transcriber now separates:
|
|
136
|
+
|
|
137
|
+
- inference backend
|
|
138
|
+
- audio decoder backend
|
|
139
|
+
|
|
140
|
+
The current default pair is:
|
|
141
|
+
|
|
142
|
+
- `TransformersInferenceBackend`
|
|
143
|
+
- `InternalWavDecoder` in Node
|
|
144
|
+
- `BrowserNativeAudioDecoder` in browsers
|
|
145
|
+
|
|
146
|
+
For multiformat local/server decoding, use `FfmpegDecoder`.
|
|
147
|
+
|
|
148
|
+
```ts
|
|
149
|
+
import { FfmpegDecoder, VoxtralTranscriber } from "voxtral-transcribe-ts";
|
|
150
|
+
|
|
151
|
+
const transcriber = new VoxtralTranscriber({
|
|
152
|
+
audioDecoderBackend: new FfmpegDecoder(),
|
|
153
|
+
});
|
|
154
|
+
|
|
155
|
+
const result = await transcriber.transcribeFile("./sample.mp3");
|
|
156
|
+
console.log(result.text);
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
Browser inputs can be passed as URLs or `Blob` / `File` objects when using `BrowserNativeAudioDecoder` or the default browser auto-selection.
|
|
160
|
+
|
|
161
|
+
You can also create an instance through `createTranscriber(options)`, which uses the same defaults and target rules as `new VoxtralTranscriber(options)`.
|
|
162
|
+
|
|
163
|
+
## Browser Entry
|
|
164
|
+
|
|
165
|
+
```ts
|
|
166
|
+
import { createTranscriber } from "voxtral-transcribe-ts/browser";
|
|
167
|
+
|
|
168
|
+
const transcriber = createTranscriber({
|
|
169
|
+
target: "browser",
|
|
170
|
+
});
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
## Node Entry
|
|
174
|
+
|
|
175
|
+
```ts
|
|
176
|
+
import { createTranscriber, FfmpegDecoder } from "voxtral-transcribe-ts/node";
|
|
177
|
+
|
|
178
|
+
const transcriber = createTranscriber({
|
|
179
|
+
target: "node",
|
|
180
|
+
audioDecoderBackend: new FfmpegDecoder(),
|
|
181
|
+
});
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
## WAV Support
|
|
185
|
+
|
|
186
|
+
The internal WAV decoder supports:
|
|
187
|
+
|
|
188
|
+
- PCM 8/16/24/32-bit
|
|
189
|
+
- IEEE float 32-bit
|
|
190
|
+
- mono or multi-channel input, mixed down to mono
|
|
191
|
+
|
|
192
|
+
For `mp3`, `m4a`, `ogg`, or `flac`, decode audio yourself and call `transcribeAudio()`.
|
|
193
|
+
|
|
194
|
+
If you want the package to decode those formats for you on local/server, instantiate the transcriber with `FfmpegDecoder`.
|
|
195
|
+
|
|
196
|
+
## Validation
|
|
197
|
+
|
|
198
|
+
```bash
|
|
199
|
+
npm run validate
|
|
200
|
+
npm run test:smoke
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
## CI / Release
|
|
204
|
+
|
|
205
|
+
The repository now ships a GitHub Actions workflow in `.github/workflows/typescript-ci.yml` modeled after `graphify`.
|
|
206
|
+
|
|
207
|
+
It does four things:
|
|
208
|
+
|
|
209
|
+
- runs `npm run validate` on Node `20` and `22`
|
|
210
|
+
- builds a tarball and installs it in a clean directory
|
|
211
|
+
- verifies the published root, `node`, and `browser` exports
|
|
212
|
+
- publishes to npm on tags matching `v*`
|
|
213
|
+
|
|
214
|
+
Publish strategy:
|
|
215
|
+
|
|
216
|
+
- default: GitHub Actions trusted publishing with `id-token: write`
|
|
217
|
+
- fallback: if `NPM_TOKEN` is configured as a repository secret, the workflow uses that token instead
|
|
218
|
+
|
|
219
|
+
Local pre-publish check:
|
|
220
|
+
|
|
221
|
+
```bash
|
|
222
|
+
npm run test:smoke
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
Typical release flow:
|
|
226
|
+
|
|
227
|
+
```bash
|
|
228
|
+
npm version patch
|
|
229
|
+
git push
|
|
230
|
+
git push --tags
|
|
231
|
+
```
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
import { DeviceType, DataType } from '@huggingface/transformers';
|
|
2
|
+
|
|
3
|
+
interface InferenceBackendLoadOptions {
|
|
4
|
+
cacheDir?: string;
|
|
5
|
+
device: DeviceType;
|
|
6
|
+
dtype: DataType;
|
|
7
|
+
localFilesOnly?: boolean;
|
|
8
|
+
model: string;
|
|
9
|
+
progressCallback?: (progress: unknown) => void;
|
|
10
|
+
revision?: string;
|
|
11
|
+
}
|
|
12
|
+
type ProcessorLike = {
|
|
13
|
+
feature_extractor: {
|
|
14
|
+
config: {
|
|
15
|
+
sampling_rate: number;
|
|
16
|
+
};
|
|
17
|
+
};
|
|
18
|
+
batch_decode(output: unknown, options?: {
|
|
19
|
+
skip_special_tokens?: boolean;
|
|
20
|
+
}): string[];
|
|
21
|
+
} & ((audio: Float32Array) => Promise<Record<string, unknown>>);
|
|
22
|
+
interface ModelLike {
|
|
23
|
+
dispose(): Promise<unknown>;
|
|
24
|
+
generate(input: Record<string, unknown>): Promise<unknown>;
|
|
25
|
+
}
|
|
26
|
+
interface InferenceBackend {
|
|
27
|
+
load(options: InferenceBackendLoadOptions): Promise<{
|
|
28
|
+
model: ModelLike;
|
|
29
|
+
processor: ProcessorLike;
|
|
30
|
+
}>;
|
|
31
|
+
}
|
|
32
|
+
declare class TransformersInferenceBackend implements InferenceBackend {
|
|
33
|
+
load(options: InferenceBackendLoadOptions): Promise<{
|
|
34
|
+
model: ModelLike;
|
|
35
|
+
processor: ProcessorLike;
|
|
36
|
+
}>;
|
|
37
|
+
}
|
|
38
|
+
type VoxtralRuntime = InferenceBackend;
|
|
39
|
+
declare class TransformersRuntime extends TransformersInferenceBackend {
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
interface DecodedWav {
|
|
43
|
+
channels: number;
|
|
44
|
+
sampleRate: number;
|
|
45
|
+
samples: Float32Array;
|
|
46
|
+
}
|
|
47
|
+
declare function decodeWav(buffer: Uint8Array | ArrayBuffer): DecodedWav;
|
|
48
|
+
declare function resampleAudio(samples: Float32Array | readonly number[], fromSampleRate: number, toSampleRate: number): Float32Array;
|
|
49
|
+
|
|
50
|
+
export { type DecodedWav as D, type InferenceBackend as I, type ModelLike as M, type ProcessorLike as P, TransformersInferenceBackend as T, type VoxtralRuntime as V, type InferenceBackendLoadOptions as a, TransformersRuntime as b, decodeWav as d, resampleAudio as r };
|
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
// src/audio-core.ts
|
|
2
|
+
var RIFF_HEADER = "RIFF";
|
|
3
|
+
var WAVE_HEADER = "WAVE";
|
|
4
|
+
var FMT_CHUNK = "fmt ";
|
|
5
|
+
var DATA_CHUNK = "data";
|
|
6
|
+
var WAVE_FORMAT_PCM = 1;
|
|
7
|
+
var WAVE_FORMAT_IEEE_FLOAT = 3;
|
|
8
|
+
var WAVE_FORMAT_EXTENSIBLE = 65534;
|
|
9
|
+
function readFourCC(view, offset) {
|
|
10
|
+
return String.fromCharCode(
|
|
11
|
+
view.getUint8(offset),
|
|
12
|
+
view.getUint8(offset + 1),
|
|
13
|
+
view.getUint8(offset + 2),
|
|
14
|
+
view.getUint8(offset + 3)
|
|
15
|
+
);
|
|
16
|
+
}
|
|
17
|
+
function clampChunkSize(buffer, offset, size) {
|
|
18
|
+
return Math.max(0, Math.min(size, buffer.byteLength - offset));
|
|
19
|
+
}
|
|
20
|
+
function decodeSample(view, offset, audioFormat, bitsPerSample) {
|
|
21
|
+
if (audioFormat === WAVE_FORMAT_IEEE_FLOAT) {
|
|
22
|
+
if (bitsPerSample !== 32) {
|
|
23
|
+
throw new Error(`Unsupported float WAV bit depth: ${bitsPerSample}.`);
|
|
24
|
+
}
|
|
25
|
+
return view.getFloat32(offset, true);
|
|
26
|
+
}
|
|
27
|
+
switch (bitsPerSample) {
|
|
28
|
+
case 8:
|
|
29
|
+
return (view.getUint8(offset) - 128) / 128;
|
|
30
|
+
case 16:
|
|
31
|
+
return view.getInt16(offset, true) / 32768;
|
|
32
|
+
case 24: {
|
|
33
|
+
const value = view.getUint8(offset) | view.getUint8(offset + 1) << 8 | view.getUint8(offset + 2) << 16;
|
|
34
|
+
const signed = value & 8388608 ? value | ~16777215 : value;
|
|
35
|
+
return signed / 8388608;
|
|
36
|
+
}
|
|
37
|
+
case 32:
|
|
38
|
+
return view.getInt32(offset, true) / 2147483648;
|
|
39
|
+
default:
|
|
40
|
+
throw new Error(`Unsupported PCM WAV bit depth: ${bitsPerSample}.`);
|
|
41
|
+
}
|
|
42
|
+
}
|
|
43
|
+
function decodeWav(buffer) {
|
|
44
|
+
const bytes = buffer instanceof Uint8Array ? buffer : new Uint8Array(buffer);
|
|
45
|
+
const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
|
|
46
|
+
if (bytes.byteLength < 44) {
|
|
47
|
+
throw new Error("Invalid WAV file: file is too small.");
|
|
48
|
+
}
|
|
49
|
+
if (readFourCC(view, 0) !== RIFF_HEADER || readFourCC(view, 8) !== WAVE_HEADER) {
|
|
50
|
+
throw new Error("Invalid WAV file: missing RIFF/WAVE header.");
|
|
51
|
+
}
|
|
52
|
+
let offset = 12;
|
|
53
|
+
let audioFormat = 0;
|
|
54
|
+
let channels = 0;
|
|
55
|
+
let sampleRate = 0;
|
|
56
|
+
let bitsPerSample = 0;
|
|
57
|
+
let blockAlign = 0;
|
|
58
|
+
let dataOffset = -1;
|
|
59
|
+
let dataSize = 0;
|
|
60
|
+
while (offset + 8 <= bytes.byteLength) {
|
|
61
|
+
const chunkId = readFourCC(view, offset);
|
|
62
|
+
const chunkSize = clampChunkSize(bytes, offset + 8, view.getUint32(offset + 4, true));
|
|
63
|
+
const chunkDataOffset = offset + 8;
|
|
64
|
+
if (chunkId === FMT_CHUNK) {
|
|
65
|
+
if (chunkSize < 16) {
|
|
66
|
+
throw new Error("Invalid WAV file: incomplete fmt chunk.");
|
|
67
|
+
}
|
|
68
|
+
let format = view.getUint16(chunkDataOffset, true);
|
|
69
|
+
channels = view.getUint16(chunkDataOffset + 2, true);
|
|
70
|
+
sampleRate = view.getUint32(chunkDataOffset + 4, true);
|
|
71
|
+
blockAlign = view.getUint16(chunkDataOffset + 12, true);
|
|
72
|
+
bitsPerSample = view.getUint16(chunkDataOffset + 14, true);
|
|
73
|
+
if (format === WAVE_FORMAT_EXTENSIBLE) {
|
|
74
|
+
if (chunkSize < 40) {
|
|
75
|
+
throw new Error("Unsupported WAV extensible format: fmt chunk is too small.");
|
|
76
|
+
}
|
|
77
|
+
const subFormat = view.getUint16(chunkDataOffset + 24, true);
|
|
78
|
+
format = subFormat;
|
|
79
|
+
}
|
|
80
|
+
audioFormat = format;
|
|
81
|
+
} else if (chunkId === DATA_CHUNK) {
|
|
82
|
+
dataOffset = chunkDataOffset;
|
|
83
|
+
dataSize = chunkSize;
|
|
84
|
+
}
|
|
85
|
+
offset = chunkDataOffset + chunkSize + chunkSize % 2;
|
|
86
|
+
}
|
|
87
|
+
if (!audioFormat || !channels || !sampleRate || !bitsPerSample || !blockAlign) {
|
|
88
|
+
throw new Error("Invalid WAV file: missing audio format metadata.");
|
|
89
|
+
}
|
|
90
|
+
if (dataOffset < 0 || dataSize <= 0) {
|
|
91
|
+
throw new Error("Invalid WAV file: missing data chunk.");
|
|
92
|
+
}
|
|
93
|
+
if (audioFormat !== WAVE_FORMAT_PCM && audioFormat !== WAVE_FORMAT_IEEE_FLOAT) {
|
|
94
|
+
throw new Error(`Unsupported WAV format: ${audioFormat}.`);
|
|
95
|
+
}
|
|
96
|
+
const frameCount = Math.floor(dataSize / blockAlign);
|
|
97
|
+
const samples = new Float32Array(frameCount);
|
|
98
|
+
const bytesPerSample = bitsPerSample / 8;
|
|
99
|
+
for (let frame = 0; frame < frameCount; frame += 1) {
|
|
100
|
+
let mono = 0;
|
|
101
|
+
const frameOffset = dataOffset + frame * blockAlign;
|
|
102
|
+
for (let channel = 0; channel < channels; channel += 1) {
|
|
103
|
+
const sampleOffset = frameOffset + channel * bytesPerSample;
|
|
104
|
+
mono += decodeSample(view, sampleOffset, audioFormat, bitsPerSample);
|
|
105
|
+
}
|
|
106
|
+
samples[frame] = mono / channels;
|
|
107
|
+
}
|
|
108
|
+
return { channels, sampleRate, samples };
|
|
109
|
+
}
|
|
110
|
+
function resampleAudio(samples, fromSampleRate, toSampleRate) {
|
|
111
|
+
if (fromSampleRate <= 0 || toSampleRate <= 0) {
|
|
112
|
+
throw new Error("Sample rates must be strictly positive.");
|
|
113
|
+
}
|
|
114
|
+
const input = samples instanceof Float32Array ? samples : Float32Array.from(samples);
|
|
115
|
+
if (input.length === 0) {
|
|
116
|
+
return new Float32Array();
|
|
117
|
+
}
|
|
118
|
+
if (fromSampleRate === toSampleRate) {
|
|
119
|
+
return input.slice();
|
|
120
|
+
}
|
|
121
|
+
const targetLength = Math.max(1, Math.round(input.length * toSampleRate / fromSampleRate));
|
|
122
|
+
const output = new Float32Array(targetLength);
|
|
123
|
+
const ratio = fromSampleRate / toSampleRate;
|
|
124
|
+
for (let index = 0; index < targetLength; index += 1) {
|
|
125
|
+
const position = index * ratio;
|
|
126
|
+
const leftIndex = Math.floor(position);
|
|
127
|
+
const rightIndex = Math.min(leftIndex + 1, input.length - 1);
|
|
128
|
+
const interpolation = position - leftIndex;
|
|
129
|
+
output[index] = input[leftIndex] * (1 - interpolation) + input[rightIndex] * interpolation;
|
|
130
|
+
}
|
|
131
|
+
return output;
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
// src/runtime.ts
|
|
135
|
+
var TransformersInferenceBackend = class {
|
|
136
|
+
async load(options) {
|
|
137
|
+
const { AutoProcessor, PreTrainedModel } = await import("@huggingface/transformers");
|
|
138
|
+
const shared = {
|
|
139
|
+
cache_dir: options.cacheDir,
|
|
140
|
+
local_files_only: options.localFilesOnly,
|
|
141
|
+
progress_callback: options.progressCallback,
|
|
142
|
+
revision: options.revision
|
|
143
|
+
};
|
|
144
|
+
const processor = await AutoProcessor.from_pretrained(options.model, shared);
|
|
145
|
+
const model = await PreTrainedModel.from_pretrained(options.model, {
|
|
146
|
+
...shared,
|
|
147
|
+
device: options.device,
|
|
148
|
+
dtype: options.dtype
|
|
149
|
+
});
|
|
150
|
+
return { model, processor };
|
|
151
|
+
}
|
|
152
|
+
};
|
|
153
|
+
var TransformersRuntime = class extends TransformersInferenceBackend {
|
|
154
|
+
};
|
|
155
|
+
|
|
156
|
+
export {
|
|
157
|
+
decodeWav,
|
|
158
|
+
resampleAudio,
|
|
159
|
+
TransformersInferenceBackend,
|
|
160
|
+
TransformersRuntime
|
|
161
|
+
};
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
import { DeviceType, DataType } from '@huggingface/transformers';
|
|
2
|
+
import { D as DecodedWav, I as InferenceBackend } from './audio-core-CYgIOxAr.js';
|
|
3
|
+
export { a as InferenceBackendLoadOptions, M as ModelLike, P as ProcessorLike, T as TransformersInferenceBackend, b as TransformersRuntime, V as VoxtralRuntime, d as decodeWav, r as resampleAudio } from './audio-core-CYgIOxAr.js';
|
|
4
|
+
|
|
5
|
+
interface DecodedAudio {
|
|
6
|
+
channels?: number;
|
|
7
|
+
sampleRate: number;
|
|
8
|
+
samples: Float32Array;
|
|
9
|
+
}
|
|
10
|
+
type VoxtralTarget = "auto" | "browser" | "node";
|
|
11
|
+
type AudioDecoderInput = Blob | string | URL;
|
|
12
|
+
interface DecodeFileOptions {
|
|
13
|
+
channels?: number;
|
|
14
|
+
sampleRate?: number;
|
|
15
|
+
}
|
|
16
|
+
interface AudioDecoderBackend {
|
|
17
|
+
readonly name: string;
|
|
18
|
+
decodeFile(input: AudioDecoderInput, options?: DecodeFileOptions): Promise<DecodedAudio>;
|
|
19
|
+
}
|
|
20
|
+
type AudioContextLike = {
|
|
21
|
+
close?: () => Promise<void>;
|
|
22
|
+
decodeAudioData(audioData: ArrayBuffer): Promise<{
|
|
23
|
+
length: number;
|
|
24
|
+
numberOfChannels: number;
|
|
25
|
+
sampleRate: number;
|
|
26
|
+
getChannelData(channel: number): Float32Array;
|
|
27
|
+
}>;
|
|
28
|
+
};
|
|
29
|
+
declare class InternalWavDecoder implements AudioDecoderBackend {
|
|
30
|
+
readonly name = "internal-wav";
|
|
31
|
+
decodeFile(input: AudioDecoderInput): Promise<DecodedAudio>;
|
|
32
|
+
}
|
|
33
|
+
interface FfmpegDecoderOptions {
|
|
34
|
+
channels?: number;
|
|
35
|
+
ffmpegPath?: string;
|
|
36
|
+
sampleRate?: number;
|
|
37
|
+
}
|
|
38
|
+
declare class FfmpegDecoder implements AudioDecoderBackend {
|
|
39
|
+
readonly name = "ffmpeg";
|
|
40
|
+
constructor(_options?: FfmpegDecoderOptions);
|
|
41
|
+
decodeFile(): Promise<DecodedAudio>;
|
|
42
|
+
}
|
|
43
|
+
interface BrowserNativeAudioDecoderOptions {
|
|
44
|
+
audioContextFactory?: (options?: {
|
|
45
|
+
sampleRate?: number;
|
|
46
|
+
}) => AudioContextLike;
|
|
47
|
+
fetcher?: typeof fetch;
|
|
48
|
+
}
|
|
49
|
+
declare class BrowserNativeAudioDecoder implements AudioDecoderBackend {
|
|
50
|
+
readonly name = "browser-native";
|
|
51
|
+
private readonly audioContextFactory?;
|
|
52
|
+
private readonly fetcher?;
|
|
53
|
+
constructor(options?: BrowserNativeAudioDecoderOptions);
|
|
54
|
+
decodeFile(input: AudioDecoderInput, options?: DecodeFileOptions): Promise<DecodedAudio>;
|
|
55
|
+
private createAudioContext;
|
|
56
|
+
private readInputAsArrayBuffer;
|
|
57
|
+
}
|
|
58
|
+
declare function createDefaultAudioDecoderBackend(target?: VoxtralTarget): AudioDecoderBackend;
|
|
59
|
+
|
|
60
|
+
declare function readWavFile(input: Blob | string | URL): Promise<DecodedWav>;
|
|
61
|
+
|
|
62
|
+
type VoxtralDevice = DeviceType;
|
|
63
|
+
type VoxtralDtype = DataType;
|
|
64
|
+
declare const DEFAULT_MODEL = "onnx-community/Voxtral-Mini-4B-Realtime-2602-ONNX";
|
|
65
|
+
declare const DEFAULT_DEVICE: VoxtralDevice;
|
|
66
|
+
declare const DEFAULT_DTYPE: VoxtralDtype;
|
|
67
|
+
declare const DEFAULT_SAMPLE_RATE = 16000;
|
|
68
|
+
declare const DEFAULT_TARGET: VoxtralTarget;
|
|
69
|
+
interface VoxtralTranscriberOptions {
|
|
70
|
+
audioDecoderBackend?: AudioDecoderBackend;
|
|
71
|
+
cacheDir?: string;
|
|
72
|
+
device?: VoxtralDevice;
|
|
73
|
+
dtype?: VoxtralDtype;
|
|
74
|
+
inferenceBackend?: InferenceBackend;
|
|
75
|
+
localFilesOnly?: boolean;
|
|
76
|
+
model?: string;
|
|
77
|
+
progressCallback?: (progress: unknown) => void;
|
|
78
|
+
revision?: string;
|
|
79
|
+
target?: VoxtralTarget;
|
|
80
|
+
}
|
|
81
|
+
interface VoxtralTranscribeOptions {
|
|
82
|
+
maxNewTokens?: number;
|
|
83
|
+
sampleRate?: number;
|
|
84
|
+
skipSpecialTokens?: boolean;
|
|
85
|
+
}
|
|
86
|
+
interface VoxtralTranscriptionResult {
|
|
87
|
+
decoder: string;
|
|
88
|
+
durationMs: number;
|
|
89
|
+
model: string;
|
|
90
|
+
sampleRate: number;
|
|
91
|
+
text: string;
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
declare function createDefaultInferenceBackend(): InferenceBackend;
|
|
95
|
+
declare class VoxtralTranscriber {
|
|
96
|
+
private readonly audioDecoderBackend;
|
|
97
|
+
private readonly inferenceBackend;
|
|
98
|
+
private readonly options;
|
|
99
|
+
private runtimePromise?;
|
|
100
|
+
constructor(options?: VoxtralTranscriberOptions, inferenceBackend?: InferenceBackend, audioDecoderBackend?: AudioDecoderBackend);
|
|
101
|
+
load(): Promise<void>;
|
|
102
|
+
transcribeAudio(audio: Float32Array | readonly number[], options?: VoxtralTranscribeOptions): Promise<VoxtralTranscriptionResult>;
|
|
103
|
+
transcribeFile(path: AudioDecoderInput, options?: Omit<VoxtralTranscribeOptions, "sampleRate">): Promise<VoxtralTranscriptionResult>;
|
|
104
|
+
dispose(): Promise<void>;
|
|
105
|
+
private ensureRuntime;
|
|
106
|
+
}
|
|
107
|
+
declare function createTranscriber(options?: VoxtralTranscriberOptions): VoxtralTranscriber;
|
|
108
|
+
declare function transcribeAudio(audio: Float32Array | readonly number[], options?: VoxtralTranscriberOptions & VoxtralTranscribeOptions): Promise<VoxtralTranscriptionResult>;
|
|
109
|
+
declare function transcribeFile(path: AudioDecoderInput, options?: VoxtralTranscriberOptions & Omit<VoxtralTranscribeOptions, "sampleRate">): Promise<VoxtralTranscriptionResult>;
|
|
110
|
+
|
|
111
|
+
export { type AudioDecoderBackend, type AudioDecoderInput, BrowserNativeAudioDecoder, type BrowserNativeAudioDecoderOptions, DEFAULT_DEVICE, DEFAULT_DTYPE, DEFAULT_MODEL, DEFAULT_SAMPLE_RATE, DEFAULT_TARGET, type DecodeFileOptions, type DecodedAudio, DecodedWav, FfmpegDecoder, type FfmpegDecoderOptions, InferenceBackend, InternalWavDecoder, type VoxtralDevice, type VoxtralDtype, type VoxtralTarget, type VoxtralTranscribeOptions, VoxtralTranscriber, type VoxtralTranscriberOptions, type VoxtralTranscriptionResult, createDefaultAudioDecoderBackend, createDefaultInferenceBackend, createTranscriber, readWavFile, transcribeAudio, transcribeFile };
|
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
import {
|
|
2
|
+
TransformersInferenceBackend,
|
|
3
|
+
TransformersRuntime,
|
|
4
|
+
decodeWav,
|
|
5
|
+
resampleAudio
|
|
6
|
+
} from "./chunk-J6JHGURB.js";
|
|
7
|
+
|
|
8
|
+
// src/audio.browser.ts
|
|
9
|
+
function isBlobLike(value) {
|
|
10
|
+
return typeof Blob !== "undefined" && value instanceof Blob;
|
|
11
|
+
}
|
|
12
|
+
async function readWavFile(input) {
|
|
13
|
+
if (isBlobLike(input)) {
|
|
14
|
+
return decodeWav(new Uint8Array(await input.arrayBuffer()));
|
|
15
|
+
}
|
|
16
|
+
if (typeof fetch !== "function") {
|
|
17
|
+
throw new Error("readWavFile requires fetch in browser environments.");
|
|
18
|
+
}
|
|
19
|
+
const response = await fetch(input.toString());
|
|
20
|
+
if (!response.ok) {
|
|
21
|
+
throw new Error(`Failed to fetch WAV input "${input.toString()}": ${response.status} ${response.statusText}`);
|
|
22
|
+
}
|
|
23
|
+
return decodeWav(new Uint8Array(await response.arrayBuffer()));
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
// src/decoder.browser.ts
|
|
27
|
+
function isBlobLike2(value) {
|
|
28
|
+
return typeof Blob !== "undefined" && value instanceof Blob;
|
|
29
|
+
}
|
|
30
|
+
function downmixToMono(channelData) {
|
|
31
|
+
if (channelData.length === 0) {
|
|
32
|
+
return new Float32Array();
|
|
33
|
+
}
|
|
34
|
+
if (channelData.length === 1) {
|
|
35
|
+
return channelData[0].slice();
|
|
36
|
+
}
|
|
37
|
+
const samples = new Float32Array(channelData[0].length);
|
|
38
|
+
for (let sampleIndex = 0; sampleIndex < samples.length; sampleIndex += 1) {
|
|
39
|
+
let sum = 0;
|
|
40
|
+
for (const channel of channelData) {
|
|
41
|
+
sum += channel[sampleIndex] ?? 0;
|
|
42
|
+
}
|
|
43
|
+
samples[sampleIndex] = sum / channelData.length;
|
|
44
|
+
}
|
|
45
|
+
return samples;
|
|
46
|
+
}
|
|
47
|
+
var InternalWavDecoder = class {
|
|
48
|
+
name = "internal-wav";
|
|
49
|
+
async decodeFile(input) {
|
|
50
|
+
if (isBlobLike2(input)) {
|
|
51
|
+
return decodeWav(new Uint8Array(await input.arrayBuffer()));
|
|
52
|
+
}
|
|
53
|
+
return await readWavFile(input);
|
|
54
|
+
}
|
|
55
|
+
};
|
|
56
|
+
var FfmpegDecoder = class {
|
|
57
|
+
name = "ffmpeg";
|
|
58
|
+
constructor(_options = {}) {
|
|
59
|
+
}
|
|
60
|
+
async decodeFile() {
|
|
61
|
+
throw new Error("FfmpegDecoder is not available in browser builds.");
|
|
62
|
+
}
|
|
63
|
+
};
|
|
64
|
+
var BrowserNativeAudioDecoder = class {
|
|
65
|
+
name = "browser-native";
|
|
66
|
+
audioContextFactory;
|
|
67
|
+
fetcher;
|
|
68
|
+
constructor(options = {}) {
|
|
69
|
+
this.audioContextFactory = options.audioContextFactory;
|
|
70
|
+
this.fetcher = options.fetcher;
|
|
71
|
+
}
|
|
72
|
+
async decodeFile(input, options = {}) {
|
|
73
|
+
const channels = options.channels ?? 1;
|
|
74
|
+
if (channels !== 1) {
|
|
75
|
+
throw new Error(`BrowserNativeAudioDecoder currently supports only mono output. Received channels=${channels}.`);
|
|
76
|
+
}
|
|
77
|
+
const fetcher = this.fetcher ?? globalThis.fetch;
|
|
78
|
+
const audioContext = this.createAudioContext(options.sampleRate);
|
|
79
|
+
const arrayBuffer = await this.readInputAsArrayBuffer(input, fetcher);
|
|
80
|
+
try {
|
|
81
|
+
const decoded = await audioContext.decodeAudioData(arrayBuffer.slice(0));
|
|
82
|
+
const channelData = Array.from({ length: decoded.numberOfChannels }, (_, channel) => decoded.getChannelData(channel));
|
|
83
|
+
return {
|
|
84
|
+
channels,
|
|
85
|
+
sampleRate: decoded.sampleRate,
|
|
86
|
+
samples: downmixToMono(channelData)
|
|
87
|
+
};
|
|
88
|
+
} finally {
|
|
89
|
+
await audioContext.close?.();
|
|
90
|
+
}
|
|
91
|
+
}
|
|
92
|
+
createAudioContext(sampleRate) {
|
|
93
|
+
if (this.audioContextFactory) {
|
|
94
|
+
return this.audioContextFactory(sampleRate ? { sampleRate } : void 0);
|
|
95
|
+
}
|
|
96
|
+
const AudioContextCtor = globalThis.AudioContext ?? globalThis.webkitAudioContext;
|
|
97
|
+
if (typeof AudioContextCtor !== "function") {
|
|
98
|
+
throw new Error("BrowserNativeAudioDecoder requires AudioContext or webkitAudioContext.");
|
|
99
|
+
}
|
|
100
|
+
return new AudioContextCtor(sampleRate ? { sampleRate } : void 0);
|
|
101
|
+
}
|
|
102
|
+
async readInputAsArrayBuffer(input, fetcher) {
|
|
103
|
+
if (isBlobLike2(input)) {
|
|
104
|
+
return await input.arrayBuffer();
|
|
105
|
+
}
|
|
106
|
+
if (typeof fetcher !== "function") {
|
|
107
|
+
throw new Error("BrowserNativeAudioDecoder requires fetch to load URL inputs.");
|
|
108
|
+
}
|
|
109
|
+
const response = await fetcher(input.toString());
|
|
110
|
+
if (!response.ok) {
|
|
111
|
+
throw new Error(`BrowserNativeAudioDecoder failed to fetch "${input.toString()}": ${response.status} ${response.statusText}`);
|
|
112
|
+
}
|
|
113
|
+
return await response.arrayBuffer();
|
|
114
|
+
}
|
|
115
|
+
};
|
|
116
|
+
function createDefaultAudioDecoderBackend(target = "auto") {
|
|
117
|
+
if (target === "node") {
|
|
118
|
+
return new InternalWavDecoder();
|
|
119
|
+
}
|
|
120
|
+
return new BrowserNativeAudioDecoder();
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
// src/index.browser.ts
|
|
124
|
+
var DEFAULT_MODEL = "onnx-community/Voxtral-Mini-4B-Realtime-2602-ONNX";
|
|
125
|
+
var DEFAULT_DEVICE = "cpu";
|
|
126
|
+
var DEFAULT_DTYPE = "q4";
|
|
127
|
+
var DEFAULT_SAMPLE_RATE = 16e3;
|
|
128
|
+
var DEFAULT_TARGET = "auto";
|
|
129
|
+
function createDefaultInferenceBackend() {
|
|
130
|
+
return new TransformersInferenceBackend();
|
|
131
|
+
}
|
|
132
|
+
var VoxtralTranscriber = class {
  audioDecoderBackend;
  inferenceBackend;
  options;
  // Promise for the lazily-loaded { model, processor } pair; undefined until first use.
  runtimePromise;
  /**
   * @param {object} [options] - model/device/dtype/backends; unset fields fall back to DEFAULT_*.
   * @param {object} [inferenceBackend] - defaults to options.inferenceBackend or the transformers backend.
   * @param {object} [audioDecoderBackend] - defaults to options.audioDecoderBackend or the target's default decoder.
   */
  constructor(options = {}, inferenceBackend = options.inferenceBackend ?? createDefaultInferenceBackend(), audioDecoderBackend = options.audioDecoderBackend ?? createDefaultAudioDecoderBackend(options.target ?? DEFAULT_TARGET)) {
    this.options = {
      cacheDir: options.cacheDir,
      device: options.device ?? DEFAULT_DEVICE,
      dtype: options.dtype ?? DEFAULT_DTYPE,
      localFilesOnly: options.localFilesOnly,
      model: options.model ?? DEFAULT_MODEL,
      progressCallback: options.progressCallback,
      revision: options.revision
    };
    this.inferenceBackend = inferenceBackend;
    this.audioDecoderBackend = audioDecoderBackend;
  }
  // Eagerly loads model + processor so the first transcription call is fast.
  async load() {
    await this.ensureRuntime();
  }
  /**
   * Transcribes raw PCM samples.
   * @param {Float32Array|readonly number[]} audio - mono samples.
   * @param {object} [options] - sampleRate (defaults to DEFAULT_SAMPLE_RATE), maxNewTokens, skipSpecialTokens.
   * @returns {Promise<{decoder: string, durationMs: number, model: string, sampleRate: number, text: string}>}
   */
  async transcribeAudio(audio, options = {}) {
    const { model, processor } = await this.ensureRuntime();
    const sourceSampleRate = options.sampleRate ?? DEFAULT_SAMPLE_RATE;
    // The processor's feature extractor dictates the rate the model expects.
    const targetSampleRate = processor.feature_extractor.config.sampling_rate ?? DEFAULT_SAMPLE_RATE;
    const preparedAudio = resampleAudio(audio, sourceSampleRate, targetSampleRate);
    // Timed span covers feature extraction + generation + decode (not resampling).
    const startedAt = performance.now();
    const inputs = await processor(preparedAudio);
    const outputs = await model.generate({
      ...inputs,
      max_new_tokens: options.maxNewTokens
    });
    const text = processor.batch_decode(outputs, { skip_special_tokens: options.skipSpecialTokens ?? true })[0]?.trim() ?? "";
    return {
      decoder: this.audioDecoderBackend.name,
      durationMs: performance.now() - startedAt,
      model: this.options.model,
      sampleRate: targetSampleRate,
      text
    };
  }
  /**
   * Decodes an audio input with the configured decoder backend, then transcribes it.
   * Passes the decoder's actual output rate on so transcribeAudio can resample if needed.
   */
  async transcribeFile(path, options = {}) {
    const { processor } = await this.ensureRuntime();
    const targetSampleRate = processor.feature_extractor.config.sampling_rate ?? DEFAULT_SAMPLE_RATE;
    const decoded = await this.audioDecoderBackend.decodeFile(path, {
      channels: 1,
      sampleRate: targetSampleRate
    });
    return await this.transcribeAudio(decoded.samples, {
      ...options,
      sampleRate: decoded.sampleRate
    });
  }
  // Releases the loaded model; safe to call repeatedly or before anything was loaded.
  async dispose() {
    const runtime = this.runtimePromise ? await this.runtimePromise : void 0;
    if (!runtime) {
      return;
    }
    await runtime.model.dispose();
    this.runtimePromise = void 0;
  }
  /**
   * Loads the runtime once and caches the in-flight promise so concurrent
   * callers share a single load.
   * FIX: a rejected load is no longer cached forever — the cache is cleared on
   * failure so a later call can retry (previously one transient failure made
   * every subsequent call reject with the same stale error).
   */
  async ensureRuntime() {
    if (!this.runtimePromise) {
      this.runtimePromise = this.inferenceBackend.load({
        cacheDir: this.options.cacheDir,
        device: this.options.device,
        dtype: this.options.dtype,
        localFilesOnly: this.options.localFilesOnly,
        model: this.options.model,
        progressCallback: this.options.progressCallback,
        revision: this.options.revision
      });
    }
    try {
      return await this.runtimePromise;
    } catch (error) {
      // Drop the cached rejection so the next call retries the load.
      this.runtimePromise = void 0;
      throw error;
    }
  }
};
|
|
208
|
+
// Convenience factory equivalent to `new VoxtralTranscriber(options)`.
function createTranscriber(options = {}) {
  const transcriber = new VoxtralTranscriber(options);
  return transcriber;
}
|
|
211
|
+
// One-shot helper: builds a transcriber, transcribes the PCM, always disposes.
async function transcribeAudio(audio, options = {}) {
  const oneShot = new VoxtralTranscriber(options);
  try {
    const result = await oneShot.transcribeAudio(audio, options);
    return result;
  } finally {
    await oneShot.dispose();
  }
}
|
|
219
|
+
// One-shot helper: decodes + transcribes a file, then releases the model.
async function transcribeFile(path, options = {}) {
  const oneShot = new VoxtralTranscriber(options);
  try {
    const result = await oneShot.transcribeFile(path, options);
    return result;
  } finally {
    await oneShot.dispose();
  }
}
|
|
227
|
+
export {
|
|
228
|
+
BrowserNativeAudioDecoder,
|
|
229
|
+
DEFAULT_DEVICE,
|
|
230
|
+
DEFAULT_DTYPE,
|
|
231
|
+
DEFAULT_MODEL,
|
|
232
|
+
DEFAULT_SAMPLE_RATE,
|
|
233
|
+
DEFAULT_TARGET,
|
|
234
|
+
FfmpegDecoder,
|
|
235
|
+
InternalWavDecoder,
|
|
236
|
+
TransformersInferenceBackend,
|
|
237
|
+
TransformersRuntime,
|
|
238
|
+
VoxtralTranscriber,
|
|
239
|
+
createDefaultAudioDecoderBackend,
|
|
240
|
+
createDefaultInferenceBackend,
|
|
241
|
+
createTranscriber,
|
|
242
|
+
decodeWav,
|
|
243
|
+
readWavFile,
|
|
244
|
+
resampleAudio,
|
|
245
|
+
transcribeAudio,
|
|
246
|
+
transcribeFile
|
|
247
|
+
};
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
import { DeviceType, DataType } from '@huggingface/transformers';
import { SpawnOptionsWithoutStdio } from 'node:child_process';
import { Readable } from 'node:stream';
import { D as DecodedWav, I as InferenceBackend } from './audio-core-CYgIOxAr.js';
export { a as InferenceBackendLoadOptions, M as ModelLike, P as ProcessorLike, T as TransformersInferenceBackend, b as TransformersRuntime, V as VoxtralRuntime, d as decodeWav, r as resampleAudio } from './audio-core-CYgIOxAr.js';

/** PCM audio produced by an audio decoder backend. */
interface DecodedAudio {
    /** Channel count of the decoded output; the built-in decoders emit mono. */
    channels?: number;
    /** Sample rate of `samples`, in Hz. */
    sampleRate: number;
    /** Float PCM samples, nominally in [-1, 1]. */
    samples: Float32Array;
}
/** Runtime the default audio decoder backend should be chosen for. */
type VoxtralTarget = "auto" | "browser" | "node";
/** Accepted audio source: an in-memory Blob/File, a filesystem path, or a URL. */
type AudioDecoderInput = Blob | string | URL;
/** Per-call decoding overrides. */
interface DecodeFileOptions {
    channels?: number;
    sampleRate?: number;
}
/** Pluggable strategy that turns an audio input into PCM samples. */
interface AudioDecoderBackend {
    /** Identifier surfaced in VoxtralTranscriptionResult.decoder. */
    readonly name: string;
    decodeFile(input: AudioDecoderInput, options?: DecodeFileOptions): Promise<DecodedAudio>;
}
/** Injection point for spawning the ffmpeg child process (useful in tests). */
type SpawnFactory = (command: string, args: readonly string[], options: SpawnOptionsWithoutStdio) => {
    stdout: Readable;
    stderr: Readable;
    once(event: "error", listener: (error: Error) => void): unknown;
    once(event: "close", listener: (code: number | null) => void): unknown;
};
/** Minimal structural subset of the Web Audio AudioContext used for decoding. */
type AudioContextLike = {
    close?: () => Promise<void>;
    decodeAudioData(audioData: ArrayBuffer): Promise<{
        length: number;
        numberOfChannels: number;
        sampleRate: number;
        getChannelData(channel: number): Float32Array;
    }>;
};
/** Dependency-free decoder that only understands WAV input. */
declare class InternalWavDecoder implements AudioDecoderBackend {
    readonly name = "internal-wav";
    decodeFile(input: AudioDecoderInput): Promise<DecodedAudio>;
}
/** Configuration for FfmpegDecoder; every field has a default. */
interface FfmpegDecoderOptions {
    channels?: number;
    /** Path to the ffmpeg binary; defaults to "ffmpeg" resolved on PATH. */
    ffmpegPath?: string;
    sampleRate?: number;
    spawnFactory?: SpawnFactory;
}
/** Decoder that shells out to ffmpeg and reads raw float32 PCM from stdout. */
declare class FfmpegDecoder implements AudioDecoderBackend {
    readonly name = "ffmpeg";
    private readonly channels;
    private readonly ffmpegPath;
    private readonly sampleRate;
    private readonly spawnFactory;
    constructor(options?: FfmpegDecoderOptions);
    decodeFile(input: AudioDecoderInput, options?: DecodeFileOptions): Promise<DecodedAudio>;
}
/** Configuration for BrowserNativeAudioDecoder; both fields are injectable for tests. */
interface BrowserNativeAudioDecoderOptions {
    audioContextFactory?: (options?: {
        sampleRate?: number;
    }) => AudioContextLike;
    fetcher?: typeof fetch;
}
/** Decoder built on the Web Audio API (decodeAudioData); mono output only. */
declare class BrowserNativeAudioDecoder implements AudioDecoderBackend {
    readonly name = "browser-native";
    private readonly audioContextFactory?;
    private readonly fetcher?;
    constructor(options?: BrowserNativeAudioDecoderOptions);
    decodeFile(input: AudioDecoderInput, options?: DecodeFileOptions): Promise<DecodedAudio>;
    private createAudioContext;
    private readInputAsArrayBuffer;
}
/** Picks the default decoder for `target`; "browser" selects the Web Audio decoder. */
declare function createDefaultAudioDecoderBackend(target?: VoxtralTarget): AudioDecoderBackend;

/** Reads a WAV file from disk and decodes it into PCM samples. */
declare function readWavFile(path: string | URL): Promise<DecodedWav>;

type VoxtralDevice = DeviceType;
type VoxtralDtype = DataType;
declare const DEFAULT_MODEL = "onnx-community/Voxtral-Mini-4B-Realtime-2602-ONNX";
declare const DEFAULT_DEVICE: VoxtralDevice;
declare const DEFAULT_DTYPE: VoxtralDtype;
declare const DEFAULT_SAMPLE_RATE = 16000;
declare const DEFAULT_TARGET: VoxtralTarget;
/** Construction-time configuration for VoxtralTranscriber. */
interface VoxtralTranscriberOptions {
    audioDecoderBackend?: AudioDecoderBackend;
    cacheDir?: string;
    device?: VoxtralDevice;
    dtype?: VoxtralDtype;
    inferenceBackend?: InferenceBackend;
    localFilesOnly?: boolean;
    model?: string;
    progressCallback?: (progress: unknown) => void;
    revision?: string;
    target?: VoxtralTarget;
}
/** Per-call transcription options. */
interface VoxtralTranscribeOptions {
    maxNewTokens?: number;
    /** Sample rate of the provided PCM; defaults to DEFAULT_SAMPLE_RATE. */
    sampleRate?: number;
    skipSpecialTokens?: boolean;
}
/** Result of one transcription run. */
interface VoxtralTranscriptionResult {
    /** Name of the configured audio decoder backend. */
    decoder: string;
    /** Wall-clock inference time in milliseconds. */
    durationMs: number;
    model: string;
    sampleRate: number;
    text: string;
}

/** Builds the default (transformers.js) inference backend. */
declare function createDefaultInferenceBackend(): InferenceBackend;
/** Lazily-loading transcriber around the Voxtral ONNX checkpoint. */
declare class VoxtralTranscriber {
    private readonly audioDecoderBackend;
    private readonly inferenceBackend;
    private readonly options;
    private runtimePromise?;
    constructor(options?: VoxtralTranscriberOptions, inferenceBackend?: InferenceBackend, audioDecoderBackend?: AudioDecoderBackend);
    /** Eagerly loads model and processor. */
    load(): Promise<void>;
    transcribeAudio(audio: Float32Array | readonly number[], options?: VoxtralTranscribeOptions): Promise<VoxtralTranscriptionResult>;
    transcribeFile(path: AudioDecoderInput, options?: Omit<VoxtralTranscribeOptions, "sampleRate">): Promise<VoxtralTranscriptionResult>;
    /** Releases the loaded model; safe to call when nothing was loaded. */
    dispose(): Promise<void>;
    private ensureRuntime;
}
/** Convenience factory equivalent to `new VoxtralTranscriber(options)`. */
declare function createTranscriber(options?: VoxtralTranscriberOptions): VoxtralTranscriber;
/** One-shot helper: transcribe PCM and dispose the transcriber afterwards. */
declare function transcribeAudio(audio: Float32Array | readonly number[], options?: VoxtralTranscriberOptions & VoxtralTranscribeOptions): Promise<VoxtralTranscriptionResult>;
/** One-shot helper: decode + transcribe a file and dispose afterwards. */
declare function transcribeFile(path: AudioDecoderInput, options?: VoxtralTranscriberOptions & Omit<VoxtralTranscribeOptions, "sampleRate">): Promise<VoxtralTranscriptionResult>;

export { type AudioDecoderBackend, type AudioDecoderInput, BrowserNativeAudioDecoder, type BrowserNativeAudioDecoderOptions, DEFAULT_DEVICE, DEFAULT_DTYPE, DEFAULT_MODEL, DEFAULT_SAMPLE_RATE, DEFAULT_TARGET, type DecodeFileOptions, type DecodedAudio, DecodedWav, FfmpegDecoder, type FfmpegDecoderOptions, InferenceBackend, InternalWavDecoder, type VoxtralDevice, type VoxtralDtype, type VoxtralTarget, type VoxtralTranscribeOptions, VoxtralTranscriber, type VoxtralTranscriberOptions, type VoxtralTranscriptionResult, createDefaultAudioDecoderBackend, createDefaultInferenceBackend, createTranscriber, readWavFile, transcribeAudio, transcribeFile };
|
|
@@ -0,0 +1,328 @@
|
|
|
1
|
+
import {
|
|
2
|
+
TransformersInferenceBackend,
|
|
3
|
+
TransformersRuntime,
|
|
4
|
+
decodeWav,
|
|
5
|
+
resampleAudio
|
|
6
|
+
} from "./chunk-J6JHGURB.js";
|
|
7
|
+
|
|
8
|
+
// src/audio.node.ts
|
|
9
|
+
import { readFile } from "fs/promises";
|
|
10
|
+
// Reads a WAV file from disk and decodes it into PCM samples.
async function readWavFile(path) {
  return decodeWav(await readFile(path));
}
|
|
14
|
+
|
|
15
|
+
// src/decoder.node.ts
|
|
16
|
+
import { spawn } from "child_process";
|
|
17
|
+
import { fileURLToPath } from "url";
|
|
18
|
+
// True when `value` is a Blob (guarded for runtimes without the global).
function isBlobLike(value) {
  if (typeof Blob === "undefined") {
    return false;
  }
  return value instanceof Blob;
}
|
|
21
|
+
// Averages per-channel PCM into a single mono channel.
// Returns a fresh Float32Array (a copy even for single-channel input).
function downmixToMono(channelData) {
  const channelCount = channelData.length;
  if (channelCount === 0) {
    return new Float32Array();
  }
  if (channelCount === 1) {
    return channelData[0].slice();
  }
  const frameCount = channelData[0].length;
  const mono = new Float32Array(frameCount);
  for (let frame = 0; frame < frameCount; frame += 1) {
    let total = 0;
    for (let channel = 0; channel < channelCount; channel += 1) {
      // Channels shorter than channel 0 contribute silence for missing frames.
      total += channelData[channel][frame] ?? 0;
    }
    mono[frame] = total / channelCount;
  }
  return mono;
}
|
|
38
|
+
// Joins byte chunks into one contiguous Uint8Array.
function concatenateUint8Arrays(chunks) {
  let total = 0;
  for (const chunk of chunks) {
    total += chunk.byteLength;
  }
  const joined = new Uint8Array(total);
  let cursor = 0;
  for (const chunk of chunks) {
    joined.set(chunk, cursor);
    cursor += chunk.byteLength;
  }
  return joined;
}
|
|
48
|
+
// Reinterprets little-endian f32 PCM bytes as a Float32Array.
// Uses DataView so the result is correct regardless of host endianness.
function float32FromBytes(bytes) {
  const byteLength = bytes.byteLength;
  if (byteLength % 4 !== 0) {
    throw new Error(`Invalid ffmpeg PCM output size: expected a multiple of 4 bytes, got ${byteLength}.`);
  }
  const view = new DataView(bytes.buffer, bytes.byteOffset, byteLength);
  const sampleCount = byteLength / 4;
  const samples = new Float32Array(sampleCount);
  for (let i = 0; i < sampleCount; i += 1) {
    samples[i] = view.getFloat32(i * 4, true);
  }
  return samples;
}
|
|
59
|
+
// Converts an accepted input into a string argument for the ffmpeg CLI.
// Blobs are rejected (ffmpeg needs a path/URL); file: URLs become plain paths.
async function normalizeFfmpegInput(input) {
  if (isBlobLike(input)) {
    throw new Error("FfmpegDecoder does not accept Blob/File inputs. Use BrowserNativeAudioDecoder or transcribeAudio().");
  }
  const isFileUrl = input instanceof URL && input.protocol === "file:";
  return isFileUrl ? fileURLToPath(input) : input.toString();
}
|
|
68
|
+
var InternalWavDecoder = class {
  name = "internal-wav";
  // Decodes a WAV payload; Blob inputs are read in memory, everything else from disk.
  async decodeFile(input) {
    if (!isBlobLike(input)) {
      return await readWavFile(input);
    }
    const bytes = new Uint8Array(await input.arrayBuffer());
    return decodeWav(bytes);
  }
};
|
|
77
|
+
var FfmpegDecoder = class {
  name = "ffmpeg";
  channels;
  ffmpegPath;
  sampleRate;
  spawnFactory;
  /**
   * @param {object} [options]
   * @param {number} [options.channels=1] - output channel count requested from ffmpeg.
   * @param {string} [options.ffmpegPath="ffmpeg"] - binary to invoke (resolved on PATH by default).
   * @param {number} [options.sampleRate=16000] - output sample rate requested from ffmpeg.
   * @param {Function} [options.spawnFactory] - child-process factory, injectable for tests.
   */
  constructor(options = {}) {
    this.channels = options.channels ?? 1;
    this.ffmpegPath = options.ffmpegPath ?? "ffmpeg";
    this.sampleRate = options.sampleRate ?? 16e3;
    this.spawnFactory = options.spawnFactory ?? ((command, args, spawnOptions) => spawn(command, args, spawnOptions));
  }
  /**
   * Decodes any ffmpeg-readable input to raw float32 PCM by piping
   * `pcm_f32le` from ffmpeg's stdout.
   * @returns {Promise<{channels: number, sampleRate: number, samples: Float32Array}>}
   */
  async decodeFile(input, options = {}) {
    const channels = options.channels ?? this.channels;
    const sampleRate = options.sampleRate ?? this.sampleRate;
    const normalizedInput = await normalizeFfmpegInput(input);
    const child = this.spawnFactory(
      this.ffmpegPath,
      [
        "-hide_banner",
        "-loglevel",
        "error",
        "-nostdin",
        "-i",
        normalizedInput,
        "-ac",
        String(channels),
        "-ar",
        String(sampleRate),
        "-f",
        "f32le",
        "-acodec",
        "pcm_f32le",
        "pipe:1"
      ],
      { stdio: ["ignore", "pipe", "pipe"] }
    );
    return await new Promise((resolve, reject) => {
      const stdoutChunks = [];
      const stderrChunks = [];
      child.stdout.on("data", (chunk) => {
        stdoutChunks.push(chunk instanceof Uint8Array ? chunk : new Uint8Array(chunk));
      });
      child.stderr.on("data", (chunk) => {
        stderrChunks.push(chunk instanceof Uint8Array ? chunk : new Uint8Array(chunk));
      });
      child.once("error", (error) => {
        reject(new Error(`Failed to start ffmpeg decoder with "${this.ffmpegPath}": ${error.message}`));
      });
      child.once("close", (code) => {
        if (code !== 0) {
          // Surface ffmpeg's own stderr output when it exited non-zero.
          const details = new TextDecoder().decode(concatenateUint8Arrays(stderrChunks)).trim();
          reject(
            new Error(
              details ? `ffmpeg decoder failed (${code}): ${details}` : `ffmpeg decoder failed with exit code ${code}.`
            )
          );
          return;
        }
        // FIX: float32FromBytes throws on truncated PCM; previously that
        // exception escaped this event handler (outside the executor's
        // synchronous scope), crashing with an uncaught error while leaving
        // the promise pending forever. Route it into reject() instead.
        try {
          resolve({
            channels,
            sampleRate,
            samples: float32FromBytes(concatenateUint8Arrays(stdoutChunks))
          });
        } catch (error) {
          reject(error);
        }
      });
    });
  }
};
|
|
145
|
+
var BrowserNativeAudioDecoder = class {
  name = "browser-native";
  audioContextFactory;
  fetcher;
  constructor(options = {}) {
    this.audioContextFactory = options.audioContextFactory;
    this.fetcher = options.fetcher;
  }
  // Decodes any browser-supported audio format via AudioContext.decodeAudioData
  // and downmixes the result to mono. Only channels=1 output is supported.
  async decodeFile(input, options = {}) {
    const channels = options.channels ?? 1;
    if (channels !== 1) {
      throw new Error(`BrowserNativeAudioDecoder currently supports only mono output. Received channels=${channels}.`);
    }
    const fetcher = this.fetcher ?? globalThis.fetch;
    const audioContext = this.createAudioContext(options.sampleRate);
    const arrayBuffer = await this.readInputAsArrayBuffer(input, fetcher);
    try {
      // decodeAudioData may detach its argument, so hand it a copy.
      const decoded = await audioContext.decodeAudioData(arrayBuffer.slice(0));
      const channelData = [];
      for (let channel = 0; channel < decoded.numberOfChannels; channel += 1) {
        channelData.push(decoded.getChannelData(channel));
      }
      return {
        channels,
        sampleRate: decoded.sampleRate,
        samples: downmixToMono(channelData)
      };
    } finally {
      await audioContext.close?.();
    }
  }
  createAudioContext(sampleRate) {
    const contextOptions = sampleRate ? { sampleRate } : void 0;
    if (this.audioContextFactory) {
      return this.audioContextFactory(contextOptions);
    }
    const AudioContextCtor = globalThis.AudioContext ?? globalThis.webkitAudioContext;
    if (typeof AudioContextCtor !== "function") {
      throw new Error("BrowserNativeAudioDecoder requires AudioContext or webkitAudioContext.");
    }
    return new AudioContextCtor(contextOptions);
  }
  async readInputAsArrayBuffer(input, fetcher) {
    if (isBlobLike(input)) {
      return await input.arrayBuffer();
    }
    if (typeof fetcher !== "function") {
      throw new Error("BrowserNativeAudioDecoder requires fetch to load URL inputs.");
    }
    const response = await fetcher(input.toString());
    if (!response.ok) {
      throw new Error(`BrowserNativeAudioDecoder failed to fetch "${input.toString()}": ${response.status} ${response.statusText}`);
    }
    return await response.arrayBuffer();
  }
};
|
|
197
|
+
// Selects the default decoder for the requested target.
// Only "browser" gets the Web Audio decoder; "auto" and "node" use the WAV decoder.
function createDefaultAudioDecoderBackend(target = "auto") {
  return target === "browser" ? new BrowserNativeAudioDecoder() : new InternalWavDecoder();
}
|
|
203
|
+
|
|
204
|
+
// src/index.node.ts
// Model and runtime defaults shared by the public API surface.
var DEFAULT_MODEL = "onnx-community/Voxtral-Mini-4B-Realtime-2602-ONNX";
var DEFAULT_DEVICE = "cpu";
var DEFAULT_DTYPE = "q4";
var DEFAULT_SAMPLE_RATE = 16000;
var DEFAULT_TARGET = "auto";
// Builds the inference backend used when the caller does not inject one.
function createDefaultInferenceBackend() {
  const backend = new TransformersInferenceBackend();
  return backend;
}
|
|
213
|
+
var VoxtralTranscriber = class {
  audioDecoderBackend;
  inferenceBackend;
  options;
  // Promise for the lazily-loaded { model, processor } pair; undefined until first use.
  runtimePromise;
  /**
   * @param {object} [options] - model/device/dtype/backends; unset fields fall back to DEFAULT_*.
   * @param {object} [inferenceBackend] - defaults to options.inferenceBackend or the transformers backend.
   * @param {object} [audioDecoderBackend] - defaults to options.audioDecoderBackend or the target's default decoder.
   */
  constructor(options = {}, inferenceBackend = options.inferenceBackend ?? createDefaultInferenceBackend(), audioDecoderBackend = options.audioDecoderBackend ?? createDefaultAudioDecoderBackend(options.target ?? DEFAULT_TARGET)) {
    this.options = {
      cacheDir: options.cacheDir,
      device: options.device ?? DEFAULT_DEVICE,
      dtype: options.dtype ?? DEFAULT_DTYPE,
      localFilesOnly: options.localFilesOnly,
      model: options.model ?? DEFAULT_MODEL,
      progressCallback: options.progressCallback,
      revision: options.revision
    };
    this.inferenceBackend = inferenceBackend;
    this.audioDecoderBackend = audioDecoderBackend;
  }
  // Eagerly loads model + processor so the first transcription call is fast.
  async load() {
    await this.ensureRuntime();
  }
  /**
   * Transcribes raw PCM samples.
   * @param {Float32Array|readonly number[]} audio - mono samples.
   * @param {object} [options] - sampleRate (defaults to DEFAULT_SAMPLE_RATE), maxNewTokens, skipSpecialTokens.
   * @returns {Promise<{decoder: string, durationMs: number, model: string, sampleRate: number, text: string}>}
   */
  async transcribeAudio(audio, options = {}) {
    const { model, processor } = await this.ensureRuntime();
    const sourceSampleRate = options.sampleRate ?? DEFAULT_SAMPLE_RATE;
    // The processor's feature extractor dictates the rate the model expects.
    const targetSampleRate = processor.feature_extractor.config.sampling_rate ?? DEFAULT_SAMPLE_RATE;
    const preparedAudio = resampleAudio(audio, sourceSampleRate, targetSampleRate);
    // Timed span covers feature extraction + generation + decode (not resampling).
    const startedAt = performance.now();
    const inputs = await processor(preparedAudio);
    const outputs = await model.generate({
      ...inputs,
      max_new_tokens: options.maxNewTokens
    });
    const text = processor.batch_decode(outputs, { skip_special_tokens: options.skipSpecialTokens ?? true })[0]?.trim() ?? "";
    return {
      decoder: this.audioDecoderBackend.name,
      durationMs: performance.now() - startedAt,
      model: this.options.model,
      sampleRate: targetSampleRate,
      text
    };
  }
  /**
   * Decodes an audio input with the configured decoder backend, then transcribes it.
   * Passes the decoder's actual output rate on so transcribeAudio can resample if needed.
   */
  async transcribeFile(path, options = {}) {
    const { processor } = await this.ensureRuntime();
    const targetSampleRate = processor.feature_extractor.config.sampling_rate ?? DEFAULT_SAMPLE_RATE;
    const decoded = await this.audioDecoderBackend.decodeFile(path, {
      channels: 1,
      sampleRate: targetSampleRate
    });
    return await this.transcribeAudio(decoded.samples, {
      ...options,
      sampleRate: decoded.sampleRate
    });
  }
  // Releases the loaded model; safe to call repeatedly or before anything was loaded.
  async dispose() {
    const runtime = this.runtimePromise ? await this.runtimePromise : void 0;
    if (!runtime) {
      return;
    }
    await runtime.model.dispose();
    this.runtimePromise = void 0;
  }
  /**
   * Loads the runtime once and caches the in-flight promise so concurrent
   * callers share a single load.
   * FIX: a rejected load is no longer cached forever — the cache is cleared on
   * failure so a later call can retry (previously one transient failure made
   * every subsequent call reject with the same stale error).
   */
  async ensureRuntime() {
    if (!this.runtimePromise) {
      this.runtimePromise = this.inferenceBackend.load({
        cacheDir: this.options.cacheDir,
        device: this.options.device,
        dtype: this.options.dtype,
        localFilesOnly: this.options.localFilesOnly,
        model: this.options.model,
        progressCallback: this.options.progressCallback,
        revision: this.options.revision
      });
    }
    try {
      return await this.runtimePromise;
    } catch (error) {
      // Drop the cached rejection so the next call retries the load.
      this.runtimePromise = void 0;
      throw error;
    }
  }
};
|
|
289
|
+
// Convenience factory equivalent to `new VoxtralTranscriber(options)`.
function createTranscriber(options = {}) {
  const transcriber = new VoxtralTranscriber(options);
  return transcriber;
}
|
|
292
|
+
// One-shot helper: builds a transcriber, transcribes the PCM, always disposes.
async function transcribeAudio(audio, options = {}) {
  const oneShot = new VoxtralTranscriber(options);
  try {
    const result = await oneShot.transcribeAudio(audio, options);
    return result;
  } finally {
    await oneShot.dispose();
  }
}
|
|
300
|
+
// One-shot helper: decodes + transcribes a file, then releases the model.
async function transcribeFile(path, options = {}) {
  const oneShot = new VoxtralTranscriber(options);
  try {
    const result = await oneShot.transcribeFile(path, options);
    return result;
  } finally {
    await oneShot.dispose();
  }
}
|
|
308
|
+
export {
|
|
309
|
+
BrowserNativeAudioDecoder,
|
|
310
|
+
DEFAULT_DEVICE,
|
|
311
|
+
DEFAULT_DTYPE,
|
|
312
|
+
DEFAULT_MODEL,
|
|
313
|
+
DEFAULT_SAMPLE_RATE,
|
|
314
|
+
DEFAULT_TARGET,
|
|
315
|
+
FfmpegDecoder,
|
|
316
|
+
InternalWavDecoder,
|
|
317
|
+
TransformersInferenceBackend,
|
|
318
|
+
TransformersRuntime,
|
|
319
|
+
VoxtralTranscriber,
|
|
320
|
+
createDefaultAudioDecoderBackend,
|
|
321
|
+
createDefaultInferenceBackend,
|
|
322
|
+
createTranscriber,
|
|
323
|
+
decodeWav,
|
|
324
|
+
readWavFile,
|
|
325
|
+
resampleAudio,
|
|
326
|
+
transcribeAudio,
|
|
327
|
+
transcribeFile
|
|
328
|
+
};
|
package/package.json
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "voxtral-transcribe-ts",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Minimal TypeScript wrapper for local Voxtral Mini 4B Realtime transcription in Node.js.",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "./dist/index.node.js",
|
|
7
|
+
"types": "./dist/index.node.d.ts",
|
|
8
|
+
"browser": "./dist/index.browser.js",
|
|
9
|
+
"exports": {
|
|
10
|
+
".": {
|
|
11
|
+
"types": "./dist/index.node.d.ts",
|
|
12
|
+
"browser": "./dist/index.browser.js",
|
|
13
|
+
"import": "./dist/index.node.js",
|
|
14
|
+
"default": "./dist/index.node.js"
|
|
15
|
+
},
|
|
16
|
+
"./browser": {
|
|
17
|
+
"types": "./dist/index.browser.d.ts",
|
|
18
|
+
"import": "./dist/index.browser.js",
|
|
19
|
+
"default": "./dist/index.browser.js"
|
|
20
|
+
},
|
|
21
|
+
"./node": {
|
|
22
|
+
"types": "./dist/index.node.d.ts",
|
|
23
|
+
"import": "./dist/index.node.js",
|
|
24
|
+
"default": "./dist/index.node.js"
|
|
25
|
+
}
|
|
26
|
+
},
|
|
27
|
+
"files": [
|
|
28
|
+
"dist",
|
|
29
|
+
"README.md"
|
|
30
|
+
],
|
|
31
|
+
"publishConfig": {
|
|
32
|
+
"access": "public"
|
|
33
|
+
},
|
|
34
|
+
"scripts": {
|
|
35
|
+
"build": "tsup src/index.node.ts src/index.browser.ts --format esm --dts --clean",
|
|
36
|
+
"check:browser-bundle": "esbuild src/index.browser.ts --bundle --platform=browser --format=esm --outfile=/tmp/voxtral-browser-check.js",
|
|
37
|
+
"check": "tsc --noEmit",
|
|
38
|
+
"test:smoke": "bash ./scripts/smoke-test.sh",
|
|
39
|
+
"test": "npm run build && node --test test/*.test.js",
|
|
40
|
+
"validate": "npm run check && npm run test && npm run check:browser-bundle",
|
|
41
|
+
"prepublishOnly": "npm run validate && npm run test:smoke"
|
|
42
|
+
},
|
|
43
|
+
"keywords": [
|
|
44
|
+
"voxtral",
|
|
45
|
+
"transcription",
|
|
46
|
+
"speech-to-text",
|
|
47
|
+
"mistral",
|
|
48
|
+
"onnx",
|
|
49
|
+
"transformers"
|
|
50
|
+
],
|
|
51
|
+
"license": "MIT",
|
|
52
|
+
"engines": {
|
|
53
|
+
"node": ">=20.11.0"
|
|
54
|
+
},
|
|
55
|
+
"dependencies": {
|
|
56
|
+
"@huggingface/transformers": "^3.8.1",
|
|
57
|
+
"onnxruntime-common": "1.21.0",
|
|
58
|
+
"onnxruntime-node": "1.21.0"
|
|
59
|
+
},
|
|
60
|
+
"devDependencies": {
|
|
61
|
+
"@types/node": "^22.15.17",
|
|
62
|
+
"tsup": "^8.5.0",
|
|
63
|
+
"typescript": "^5.8.3"
|
|
64
|
+
}
|
|
65
|
+
}
|