parakeet.js 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +210 -0
- package/examples/react-demo/index.html +12 -0
- package/examples/react-demo/package.json +20 -0
- package/examples/react-demo/src/App.css +134 -0
- package/examples/react-demo/src/App.jsx +327 -0
- package/examples/react-demo/src/main.jsx +6 -0
- package/examples/react-demo/vite.config.js +41 -0
- package/package.json +30 -0
- package/src/backend.js +99 -0
- package/src/hub.js +242 -0
- package/src/index.js +29 -0
- package/src/parakeet.js +481 -0
- package/src/preprocessor.js +69 -0
- package/src/tokenizer.js +54 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Yunus Seyhan Dede
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
# Parakeet.js
|
|
2
|
+
|
|
3
|
+
Client-side ONNX inference of NVIDIA *Parakeet* speech-to-text models.
|
|
4
|
+
Runs entirely in the browser on **WebGPU** or **WASM** via
|
|
5
|
+
[ONNX Runtime Web](https://onnxruntime.ai/).
|
|
6
|
+
|
|
7
|
+
> **Parakeet.js** offers a high-performance, browser-first implementation for NVIDIA's Parakeet-TDT speech-to-text models, running entirely client-side via WebGPU and WASM. Powered by ONNX Runtime Web, this library makes it simple to integrate state-of-the-art transcription into any web application.
|
|
8
|
+
|
|
9
|
+
> **Status:** Early preview – API is subject to change while things stabilise.
|
|
10
|
+
> **Note:** Currently only supports the Parakeet-TDT model architecture.
|
|
11
|
+
|
|
12
|
+
---
|
|
13
|
+
|
|
14
|
+
## Installation
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
# npm
|
|
18
|
+
npm i parakeet.js onnxruntime-web
|
|
19
|
+
|
|
20
|
+
# yarn
|
|
21
|
+
yarn add parakeet.js onnxruntime-web
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
`onnxruntime-web` is a peer-dependency that supplies the runtime back-ends (WebGPU, WASM).
|
|
25
|
+
|
|
26
|
+
---
|
|
27
|
+
|
|
28
|
+
## Model assets
|
|
29
|
+
|
|
30
|
+
We host ready-to-use ONNX exports on the HuggingFace Hub:
|
|
31
|
+
|
|
32
|
+
```
|
|
33
|
+
ysdede/parakeet-tdt-0.6b-v2-onnx
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
The helper `getParakeetModel()` downloads all required files and caches them in **IndexedDB**:
|
|
37
|
+
|
|
38
|
+
```js
|
|
39
|
+
import { getParakeetModel } from 'parakeet.js';
|
|
40
|
+
|
|
41
|
+
const repoId = 'ysdede/parakeet-tdt-0.6b-v2-onnx';
|
|
42
|
+
const { urls, filenames } = await getParakeetModel(repoId, {
|
|
43
|
+
backend: 'webgpu-hybrid', // webgpu-hybrid | wasm
|
|
44
|
+
quantization: 'fp32', // fp32 | int8
|
|
45
|
+
decoderInt8: true, // load INT8 decoder even when encoder fp32
|
|
46
|
+
preprocessor: 'nemo128', // nemo80 | nemo128
|
|
47
|
+
progress: ({file,loaded,total}) => console.log(file, loaded/total)
|
|
48
|
+
});
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
Returned structure:
|
|
52
|
+
|
|
53
|
+
```ts
|
|
54
|
+
{
|
|
55
|
+
urls: {
|
|
56
|
+
encoderUrl: string,
|
|
57
|
+
decoderUrl: string,
|
|
58
|
+
encoderDataUrl?: string | null,
|
|
59
|
+
decoderDataUrl?: string | null,
|
|
60
|
+
tokenizerUrl: string,
|
|
61
|
+
preprocessorUrl: string
|
|
62
|
+
},
|
|
63
|
+
filenames: { encoder: string; decoder: string },
|
|
64
|
+
quantisation: { encoder: 'fp32' | 'int8'; decoder: 'fp32' | 'int8' }
|
|
65
|
+
}
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
---
|
|
69
|
+
|
|
70
|
+
## Creating a model instance
|
|
71
|
+
|
|
72
|
+
```js
|
|
73
|
+
import { ParakeetModel } from 'parakeet.js';
|
|
74
|
+
|
|
75
|
+
const model = await ParakeetModel.fromUrls({
|
|
76
|
+
...urls, // spread the URLs returned above
|
|
77
|
+
filenames, // needed for external .data mapping
|
|
78
|
+
backend: 'webgpu-hybrid',
|
|
79
|
+
decoderOnWasm: true, // force decoder to CPU/WASM for micro-kernels
|
|
80
|
+
decoderInt8: true, // decoder uses INT8 weights
|
|
81
|
+
cpuThreads: 6, // WASM threads (defaults to cores-2)
|
|
82
|
+
verbose: false // ORT verbose log
|
|
83
|
+
});
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
### Back-end presets
|
|
87
|
+
|
|
88
|
+
| Backend string | Encoder EP | Decoder EP | Typical use-case |
|
|
89
|
+
|---------------------|------------|------------|------------------|
|
|
90
|
+
| `webgpu-hybrid` (default) | WebGPU (fp32) | WASM (fp32/int8) | Modern desktop browsers |
|
|
91
|
+
| `webgpu-strict` | WebGPU (fp32) | **fail** if op unsupported | Benchmarking kernels |
|
|
92
|
+
| `wasm` | WASM (int8/fp32) | WASM | Low-end devices, Node.js |
|
|
93
|
+
|
|
94
|
+
---
|
|
95
|
+
|
|
96
|
+
## Transcribing audio
|
|
97
|
+
|
|
98
|
+
```js
|
|
99
|
+
// 16-kHz mono PCM Float32Array
|
|
100
|
+
await model.transcribe(pcmFloat32, 16_000, {
|
|
101
|
+
returnTimestamps: true,
|
|
102
|
+
returnConfidences: true,
|
|
103
|
+
frameStride: 2, // 1 (default) = highest accuracy; 2-4 = faster but less accurate
|
|
104
|
+
});
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
Extra options:
|
|
108
|
+
|
|
109
|
+
| Option | Default | Description |
|
|
110
|
+
|--------|---------|-------------|
|
|
111
|
+
| `temperature` | 1.2 | Softmax temperature for decoding |
|
|
112
|
+
| `frameStride` | 1 | Advance decoder by *n* encoder frames per step |
|
|
113
|
+
|
|
114
|
+
### Result schema
|
|
115
|
+
|
|
116
|
+
```ts
|
|
117
|
+
{
|
|
118
|
+
utterance_text: string,
|
|
119
|
+
words: Array<{text,start_time,end_time,confidence}>,
|
|
120
|
+
tokens: Array<{token,start_time,end_time,confidence}>,
|
|
121
|
+
confidence_scores: { overall_log_prob, word_avg, token_avg },
|
|
122
|
+
metrics: {
|
|
123
|
+
rtf: number,
|
|
124
|
+
total_ms: number,
|
|
125
|
+
preprocess_ms: number,
|
|
126
|
+
encode_ms: number,
|
|
127
|
+
decode_ms: number,
|
|
128
|
+
tokenize_ms: number
|
|
129
|
+
},
|
|
130
|
+
is_final: true
|
|
131
|
+
}
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
---
|
|
135
|
+
|
|
136
|
+
## Warm-up & Verification (Recommended)
|
|
137
|
+
|
|
138
|
+
The first time you run inference after loading a model, the underlying runtime needs to compile the execution graph. This makes the first run significantly slower. To ensure a smooth user experience, it's best practice to perform a "warm-up" run with a dummy or known audio sample immediately after model creation.
|
|
139
|
+
|
|
140
|
+
Our React demo does this and also verifies the output to ensure the model loaded correctly.
|
|
141
|
+
|
|
142
|
+
```js
|
|
143
|
+
// In your app, after `ParakeetModel.fromUrls()` succeeds:
|
|
144
|
+
setStatus('Warming up & verifying…');
|
|
145
|
+
|
|
146
|
+
const audioRes = await fetch('/assets/known_audio.wav');
|
|
147
|
+
const pcm = await decodeAudio(audioRes); // Your audio decoding logic
|
|
148
|
+
const { utterance_text } = await model.transcribe(pcm, 16000);
|
|
149
|
+
|
|
150
|
+
const expected = 'the known transcript for your audio';
|
|
151
|
+
if (utterance_text.toLowerCase().includes(expected)) {
|
|
152
|
+
setStatus('Model ready ✔');
|
|
153
|
+
} else {
|
|
154
|
+
setStatus('Model verification failed!');
|
|
155
|
+
}
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
---
|
|
159
|
+
|
|
160
|
+
## Runtime tuning knobs
|
|
161
|
+
|
|
162
|
+
| Property | Where | Effect |
|
|
163
|
+
|----------|-------|--------|
|
|
164
|
+
| `cpuThreads` | `fromUrls()` | Sets `ort.env.wasm.numThreads`; pick *cores-2* for best balance |
|
|
165
|
+
| `decoderOnWasm` | `fromUrls()` | Forces decoder session to WASM even in hybrid mode |
|
|
166
|
+
| `decoderInt8` | `getParakeetModel()` + `fromUrls()` | Load INT8 weights for decoder only |
|
|
167
|
+
| `frameStride` | `transcribe()` | Trades accuracy for speed: 1 is most accurate, larger strides decode faster |
|
|
168
|
+
| `enableProfiling` | `fromUrls()` | Enables ORT profiler (JSON written to `/tmp/profile_*.json`) |
|
|
169
|
+
|
|
170
|
+
---
|
|
171
|
+
|
|
172
|
+
## Using the React demo as a template
|
|
173
|
+
|
|
174
|
+
Located at `examples/react-demo`.
|
|
175
|
+
|
|
176
|
+
Quick start:
|
|
177
|
+
|
|
178
|
+
```bash
|
|
179
|
+
cd examples/react-demo
|
|
180
|
+
npm i
|
|
181
|
+
npm run dev # Vite => http://localhost:5173
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
Key components:
|
|
185
|
+
|
|
186
|
+
| File | Purpose |
|
|
187
|
+
|------|---------|
|
|
188
|
+
| `App.jsx` | Complete end-to-end reference UI. Shows how to load a model with progress bars, perform a warm-up/verification step, display performance metrics (RTF, timings), and manage transcription history. |
|
|
189
|
+
| `parakeet.js` | Library entry; houses the model wrapper and performance instrumentation. |
|
|
190
|
+
| `hub.js` | Lightweight HuggingFace Hub helper – downloads and caches model binaries. |
|
|
191
|
+
|
|
192
|
+
Copy-paste the `loadModel()` and `transcribeFile()` functions into your app, adjust UI bindings, and you are ready to go.
|
|
193
|
+
|
|
194
|
+
---
|
|
195
|
+
|
|
196
|
+
## Troubleshooting
|
|
197
|
+
|
|
198
|
+
| Symptom | Cause | Fix |
|
|
199
|
+
|---------|-------|-----|
|
|
200
|
+
| `multiple calls to initWasm()` | Two WASM sessions initialised in parallel | In hybrid mode we create encoder session first, then decoder. Keep this order. |
|
|
201
|
+
| GPU memory still ~2.4 GB with INT8 selected | WebGPU kernels don't support INT8 yet – weights are automatically converted to FP32 | Use `decoderInt8:true` (CPU) or wait for upcoming WebGPU INT8 kernels. |
|
|
202
|
+
| `Graph capture feature not available` error | Mixed EPs prevent GPU graph capture | We auto-retry without capture; nothing to do. |
|
|
203
|
+
|
|
204
|
+
---
|
|
205
|
+
|
|
206
|
+
## Changelog
|
|
207
|
+
|
|
208
|
+
See `OPTIMIZATION_PLAN.md` for a timeline of performance tweaks and planned features.
|
|
209
|
+
|
|
210
|
+
Happy hacking! 🎉
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
<!DOCTYPE html>
|
|
2
|
+
<html lang="en">
|
|
3
|
+
<head>
|
|
4
|
+
<meta charset="UTF-8" />
|
|
5
|
+
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
|
6
|
+
<title>Parakeet React Demo</title>
|
|
7
|
+
</head>
|
|
8
|
+
<body>
|
|
9
|
+
<div id="root"></div>
|
|
10
|
+
<script type="module" src="/src/main.jsx"></script>
|
|
11
|
+
</body>
|
|
12
|
+
</html>
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "parakeet-react-demo",
|
|
3
|
+
"private": true,
|
|
4
|
+
"type": "module",
|
|
5
|
+
"scripts": {
|
|
6
|
+
"dev": "vite",
|
|
7
|
+
"build": "vite build",
|
|
8
|
+
"preview": "vite preview"
|
|
9
|
+
},
|
|
10
|
+
"dependencies": {
|
|
11
|
+
"react": "^18.2.0",
|
|
12
|
+
"react-dom": "^18.2.0",
|
|
13
|
+
"parakeet.js": "file:../..",
|
|
14
|
+
"onnxruntime-web": "1.22.0-dev.20250409-89f8206ba4"
|
|
15
|
+
},
|
|
16
|
+
"devDependencies": {
|
|
17
|
+
"vite": "^4.5.0",
|
|
18
|
+
"@vitejs/plugin-react": "^4.0.0"
|
|
19
|
+
}
|
|
20
|
+
}
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
:root {
|
|
2
|
+
font-family: Inter, system-ui, sans-serif;
|
|
3
|
+
line-height: 1.4;
|
|
4
|
+
color: #222;
|
|
5
|
+
background: #f3f6f8;
|
|
6
|
+
}
|
|
7
|
+
|
|
8
|
+
.app {
|
|
9
|
+
max-width: 760px;
|
|
10
|
+
margin: 2rem auto;
|
|
11
|
+
background: #ffffff;
|
|
12
|
+
border-radius: 8px;
|
|
13
|
+
padding: 1.5rem 2rem;
|
|
14
|
+
box-shadow: 0 4px 14px rgba(0, 0, 0, 0.06);
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
.controls {
|
|
18
|
+
display: flex;
|
|
19
|
+
flex-wrap: wrap;
|
|
20
|
+
gap: 0.75rem;
|
|
21
|
+
align-items: center;
|
|
22
|
+
margin-bottom: 1rem;
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
.controls label {
|
|
26
|
+
font-size: 0.9rem;
|
|
27
|
+
display: flex;
|
|
28
|
+
align-items: center;
|
|
29
|
+
gap: 0.35rem;
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
.controls select,
|
|
33
|
+
.controls input[type="number"] {
|
|
34
|
+
padding: 0.25rem 0.5rem;
|
|
35
|
+
border: 1px solid #d1d5db;
|
|
36
|
+
border-radius: 4px;
|
|
37
|
+
background: #fff;
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
button.primary {
|
|
41
|
+
padding: 0.4rem 0.9rem;
|
|
42
|
+
background: #3b82f6;
|
|
43
|
+
color: #ffffff;
|
|
44
|
+
border: none;
|
|
45
|
+
border-radius: 4px;
|
|
46
|
+
cursor: pointer;
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
button.primary:hover {
|
|
50
|
+
background: #2563eb;
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
.status {
|
|
54
|
+
margin-top: 0.5rem;
|
|
55
|
+
font-weight: 500;
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
.progress-wrapper {
|
|
59
|
+
margin: 0.5rem 0;
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
.progress-bar {
|
|
63
|
+
height: 8px;
|
|
64
|
+
background: #e2e8f0;
|
|
65
|
+
border-radius: 4px;
|
|
66
|
+
overflow: hidden;
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
.progress-bar > div {
|
|
70
|
+
height: 100%;
|
|
71
|
+
background: #10b981;
|
|
72
|
+
transition: width 0.2s;
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
.progress-text {
|
|
76
|
+
font-size: 0.8rem;
|
|
77
|
+
color: #555;
|
|
78
|
+
margin-top: 0.25rem;
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
.textarea {
|
|
82
|
+
width: 100%;
|
|
83
|
+
height: 6rem;
|
|
84
|
+
resize: vertical;
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
.performance {
|
|
88
|
+
font-size: 0.85rem;
|
|
89
|
+
background: #ecfdf5;
|
|
90
|
+
padding: 0.5rem 0.75rem;
|
|
91
|
+
border-radius: 6px;
|
|
92
|
+
border: 1px solid #d1fae5;
|
|
93
|
+
margin-bottom: 1rem;
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
.history {
|
|
97
|
+
max-height: 400px;
|
|
98
|
+
overflow-y: auto;
|
|
99
|
+
border: 1px solid #e2e8f0;
|
|
100
|
+
border-radius: 6px;
|
|
101
|
+
background: #ffffff;
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
.history-item {
|
|
105
|
+
padding: 1rem;
|
|
106
|
+
border-bottom: 1px solid #f1f5f9;
|
|
107
|
+
background: #ffffff;
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
.history-item:last-child {
|
|
111
|
+
border-bottom: none;
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
.history-meta {
|
|
115
|
+
display: flex;
|
|
116
|
+
justify-content: space-between;
|
|
117
|
+
font-size: 0.9rem;
|
|
118
|
+
color: #666;
|
|
119
|
+
margin-bottom: 0.5rem;
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
.history-stats {
|
|
123
|
+
font-size: 0.75rem;
|
|
124
|
+
color: #666;
|
|
125
|
+
margin-bottom: 0.5rem;
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
.history-text {
|
|
129
|
+
background: #f9fafb;
|
|
130
|
+
padding: 0.5rem 0.75rem;
|
|
131
|
+
border-radius: 4px;
|
|
132
|
+
border: 1px solid #e5e7eb;
|
|
133
|
+
font-size: 0.9rem;
|
|
134
|
+
}
|