parakeet.js 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (160) hide show
  1. package/.gitmodules +3 -0
  2. package/README.md +240 -239
  3. package/examples/hf-spaces-demo/README.md +6 -9
  4. package/examples/hf-spaces-demo/package.json +1 -1
  5. package/examples/hf-spaces-demo/src/App.js +307 -316
  6. package/examples/react-demo/package.json +19 -19
  7. package/examples/react-demo/src/App.jsx +324 -326
  8. package/examples/react-demo-dev/src/App.jsx +23 -24
  9. package/package.json +1 -1
  10. package/publish.ps1 +65 -0
  11. package/src/hub.js +235 -241
  12. package/src/parakeet.js +15 -8
  13. package/src/preprocessor.js +75 -68
  14. package/docs/parakeet-transformers-js/.gitattributes +0 -2
  15. package/docs/parakeet-transformers-js/.prettierignore +0 -8
  16. package/docs/parakeet-transformers-js/.prettierrc +0 -10
  17. package/docs/parakeet-transformers-js/.tmp_features.json +0 -1
  18. package/docs/parakeet-transformers-js/LICENSE +0 -202
  19. package/docs/parakeet-transformers-js/README.md +0 -448
  20. package/docs/parakeet-transformers-js/assets/nemo128.onnx +0 -0
  21. package/docs/parakeet-transformers-js/assets/nemo80.onnx +0 -0
  22. package/docs/parakeet-transformers-js/debug_test.js +0 -84
  23. package/docs/parakeet-transformers-js/dev/inspect_decoder.cjs +0 -9
  24. package/docs/parakeet-transformers-js/dev/inspect_joiner.cjs +0 -9
  25. package/docs/parakeet-transformers-js/dev/js_step_by_step.js +0 -249
  26. package/docs/parakeet-transformers-js/dev/parakeet_cli.js +0 -91
  27. package/docs/parakeet-transformers-js/jest.config.mjs +0 -194
  28. package/docs/parakeet-transformers-js/js_preprocessing.json +0 -225
  29. package/docs/parakeet-transformers-js/js_step_by_step.json +0 -837
  30. package/docs/parakeet-transformers-js/js_step_by_step_v2.json +0 -450
  31. package/docs/parakeet-transformers-js/js_step_by_step_v3.json +0 -450
  32. package/docs/parakeet-transformers-js/js_steps.json +0 -821
  33. package/docs/parakeet-transformers-js/package-lock.json +0 -12251
  34. package/docs/parakeet-transformers-js/package.json +0 -96
  35. package/docs/parakeet-transformers-js/src/audio_features.js +0 -178
  36. package/docs/parakeet-transformers-js/src/backends/onnx.js +0 -210
  37. package/docs/parakeet-transformers-js/src/base/feature_extraction_utils.js +0 -54
  38. package/docs/parakeet-transformers-js/src/base/image_processors_utils.js +0 -1105
  39. package/docs/parakeet-transformers-js/src/base/processing_utils.js +0 -173
  40. package/docs/parakeet-transformers-js/src/configs.js +0 -455
  41. package/docs/parakeet-transformers-js/src/env.js +0 -167
  42. package/docs/parakeet-transformers-js/src/generation/configuration_utils.js +0 -388
  43. package/docs/parakeet-transformers-js/src/generation/logits_process.js +0 -727
  44. package/docs/parakeet-transformers-js/src/generation/logits_sampler.js +0 -204
  45. package/docs/parakeet-transformers-js/src/generation/parameters.js +0 -35
  46. package/docs/parakeet-transformers-js/src/generation/stopping_criteria.js +0 -156
  47. package/docs/parakeet-transformers-js/src/generation/streamers.js +0 -225
  48. package/docs/parakeet-transformers-js/src/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js +0 -85
  49. package/docs/parakeet-transformers-js/src/models/auto/feature_extraction_auto.js +0 -25
  50. package/docs/parakeet-transformers-js/src/models/auto/image_processing_auto.js +0 -29
  51. package/docs/parakeet-transformers-js/src/models/auto/processing_auto.js +0 -85
  52. package/docs/parakeet-transformers-js/src/models/beit/image_processing_beit.js +0 -5
  53. package/docs/parakeet-transformers-js/src/models/bit/image_processing_bit.js +0 -5
  54. package/docs/parakeet-transformers-js/src/models/chinese_clip/image_processing_chinese_clip.js +0 -5
  55. package/docs/parakeet-transformers-js/src/models/clap/feature_extraction_clap.js +0 -159
  56. package/docs/parakeet-transformers-js/src/models/clip/image_processing_clip.js +0 -6
  57. package/docs/parakeet-transformers-js/src/models/convnext/image_processing_convnext.js +0 -46
  58. package/docs/parakeet-transformers-js/src/models/dac/feature_extraction_dac.js +0 -3
  59. package/docs/parakeet-transformers-js/src/models/deit/image_processing_deit.js +0 -6
  60. package/docs/parakeet-transformers-js/src/models/detr/image_processing_detr.js +0 -52
  61. package/docs/parakeet-transformers-js/src/models/donut/image_processing_donut.js +0 -31
  62. package/docs/parakeet-transformers-js/src/models/dpt/image_processing_dpt.js +0 -6
  63. package/docs/parakeet-transformers-js/src/models/efficientnet/image_processing_efficientnet.js +0 -14
  64. package/docs/parakeet-transformers-js/src/models/encodec/feature_extraction_encodec.js +0 -32
  65. package/docs/parakeet-transformers-js/src/models/feature_extractors.js +0 -17
  66. package/docs/parakeet-transformers-js/src/models/florence2/processing_florence2.js +0 -131
  67. package/docs/parakeet-transformers-js/src/models/gemma3n/feature_extraction_gemma3n.js +0 -97
  68. package/docs/parakeet-transformers-js/src/models/gemma3n/processing_gemma3n.js +0 -74
  69. package/docs/parakeet-transformers-js/src/models/glpn/image_processing_glpn.js +0 -5
  70. package/docs/parakeet-transformers-js/src/models/grounding_dino/image_processing_grounding_dino.js +0 -29
  71. package/docs/parakeet-transformers-js/src/models/grounding_dino/processing_grounding_dino.js +0 -101
  72. package/docs/parakeet-transformers-js/src/models/idefics3/image_processing_idefics3.js +0 -232
  73. package/docs/parakeet-transformers-js/src/models/idefics3/processing_idefics3.js +0 -136
  74. package/docs/parakeet-transformers-js/src/models/image_processors.js +0 -40
  75. package/docs/parakeet-transformers-js/src/models/janus/image_processing_janus.js +0 -27
  76. package/docs/parakeet-transformers-js/src/models/janus/processing_janus.js +0 -123
  77. package/docs/parakeet-transformers-js/src/models/jina_clip/image_processing_jina_clip.js +0 -26
  78. package/docs/parakeet-transformers-js/src/models/jina_clip/processing_jina_clip.js +0 -24
  79. package/docs/parakeet-transformers-js/src/models/llava/processing_llava.js +0 -44
  80. package/docs/parakeet-transformers-js/src/models/llava_onevision/image_processing_llava_onevision.js +0 -5
  81. package/docs/parakeet-transformers-js/src/models/mask2former/image_processing_mask2former.js +0 -5
  82. package/docs/parakeet-transformers-js/src/models/maskformer/image_processing_maskformer.js +0 -18
  83. package/docs/parakeet-transformers-js/src/models/mgp_str/processing_mgp_str.js +0 -172
  84. package/docs/parakeet-transformers-js/src/models/mobilenet_v1/image_processing_mobilenet_v1.js +0 -7
  85. package/docs/parakeet-transformers-js/src/models/mobilenet_v2/image_processing_mobilenet_v2.js +0 -7
  86. package/docs/parakeet-transformers-js/src/models/mobilenet_v3/image_processing_mobilenet_v3.js +0 -7
  87. package/docs/parakeet-transformers-js/src/models/mobilenet_v4/image_processing_mobilenet_v4.js +0 -7
  88. package/docs/parakeet-transformers-js/src/models/mobilevit/image_processing_mobilevit.js +0 -6
  89. package/docs/parakeet-transformers-js/src/models/moonshine/feature_extraction_moonshine.js +0 -26
  90. package/docs/parakeet-transformers-js/src/models/moonshine/processing_moonshine.js +0 -20
  91. package/docs/parakeet-transformers-js/src/models/nougat/image_processing_nougat.js +0 -5
  92. package/docs/parakeet-transformers-js/src/models/owlv2/image_processing_owlv2.js +0 -5
  93. package/docs/parakeet-transformers-js/src/models/owlvit/image_processing_owlvit.js +0 -12
  94. package/docs/parakeet-transformers-js/src/models/owlvit/processing_owlvit.js +0 -7
  95. package/docs/parakeet-transformers-js/src/models/paligemma/processing_paligemma.js +0 -83
  96. package/docs/parakeet-transformers-js/src/models/parakeet/feature_extraction_parakeet.js +0 -3
  97. package/docs/parakeet-transformers-js/src/models/parakeet/modeling_parakeet.js +0 -3
  98. package/docs/parakeet-transformers-js/src/models/parakeet/processing_parakeet.js +0 -3
  99. package/docs/parakeet-transformers-js/src/models/parakeet/tokenization_parakeet.js +0 -3
  100. package/docs/parakeet-transformers-js/src/models/phi3_v/image_processing_phi3_v.js +0 -163
  101. package/docs/parakeet-transformers-js/src/models/phi3_v/processing_phi3_v.js +0 -53
  102. package/docs/parakeet-transformers-js/src/models/processors.js +0 -22
  103. package/docs/parakeet-transformers-js/src/models/pvt/image_processing_pvt.js +0 -5
  104. package/docs/parakeet-transformers-js/src/models/pyannote/feature_extraction_pyannote.js +0 -85
  105. package/docs/parakeet-transformers-js/src/models/pyannote/processing_pyannote.js +0 -24
  106. package/docs/parakeet-transformers-js/src/models/qwen2_vl/image_processing_qwen2_vl.js +0 -52
  107. package/docs/parakeet-transformers-js/src/models/qwen2_vl/processing_qwen2_vl.js +0 -53
  108. package/docs/parakeet-transformers-js/src/models/rt_detr/image_processing_rt_detr.js +0 -12
  109. package/docs/parakeet-transformers-js/src/models/sam/image_processing_sam.js +0 -242
  110. package/docs/parakeet-transformers-js/src/models/sam/processing_sam.js +0 -20
  111. package/docs/parakeet-transformers-js/src/models/sapiens/image_processing_sapiens.js +0 -13
  112. package/docs/parakeet-transformers-js/src/models/seamless_m4t/feature_extraction_seamless_m4t.js +0 -175
  113. package/docs/parakeet-transformers-js/src/models/segformer/image_processing_segformer.js +0 -13
  114. package/docs/parakeet-transformers-js/src/models/siglip/image_processing_siglip.js +0 -5
  115. package/docs/parakeet-transformers-js/src/models/smolvlm/image_processing_smolvlm.js +0 -2
  116. package/docs/parakeet-transformers-js/src/models/smolvlm/processing_smolvlm.js +0 -2
  117. package/docs/parakeet-transformers-js/src/models/snac/feature_extraction_snac.js +0 -3
  118. package/docs/parakeet-transformers-js/src/models/speecht5/feature_extraction_speecht5.js +0 -4
  119. package/docs/parakeet-transformers-js/src/models/speecht5/processing_speecht5.js +0 -17
  120. package/docs/parakeet-transformers-js/src/models/swin2sr/image_processing_swin2sr.js +0 -24
  121. package/docs/parakeet-transformers-js/src/models/ultravox/processing_ultravox.js +0 -54
  122. package/docs/parakeet-transformers-js/src/models/vit/image_processing_vit.js +0 -7
  123. package/docs/parakeet-transformers-js/src/models/vitmatte/image_processing_vitmatte.js +0 -50
  124. package/docs/parakeet-transformers-js/src/models/vitpose/image_processing_vitpose.js +0 -89
  125. package/docs/parakeet-transformers-js/src/models/wav2vec2/feature_extraction_wav2vec2.js +0 -44
  126. package/docs/parakeet-transformers-js/src/models/wav2vec2/processing_wav2vec2.js +0 -17
  127. package/docs/parakeet-transformers-js/src/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.js +0 -17
  128. package/docs/parakeet-transformers-js/src/models/wespeaker/feature_extraction_wespeaker.js +0 -95
  129. package/docs/parakeet-transformers-js/src/models/whisper/common_whisper.js +0 -157
  130. package/docs/parakeet-transformers-js/src/models/whisper/feature_extraction_whisper.js +0 -92
  131. package/docs/parakeet-transformers-js/src/models/whisper/generation_whisper.js +0 -89
  132. package/docs/parakeet-transformers-js/src/models/whisper/processing_whisper.js +0 -21
  133. package/docs/parakeet-transformers-js/src/models/yolos/image_processing_yolos.js +0 -12
  134. package/docs/parakeet-transformers-js/src/models.js +0 -8644
  135. package/docs/parakeet-transformers-js/src/ops/registry.js +0 -133
  136. package/docs/parakeet-transformers-js/src/ort_env.js +0 -8
  137. package/docs/parakeet-transformers-js/src/parakeet.js +0 -792
  138. package/docs/parakeet-transformers-js/src/pipelines.js +0 -3540
  139. package/docs/parakeet-transformers-js/src/processors.js +0 -16
  140. package/docs/parakeet-transformers-js/src/tokenizers.js +0 -4432
  141. package/docs/parakeet-transformers-js/src/transformers.js +0 -50
  142. package/docs/parakeet-transformers-js/src/utils/audio.js +0 -893
  143. package/docs/parakeet-transformers-js/src/utils/constants.js +0 -9
  144. package/docs/parakeet-transformers-js/src/utils/core.js +0 -259
  145. package/docs/parakeet-transformers-js/src/utils/data-structures.js +0 -574
  146. package/docs/parakeet-transformers-js/src/utils/devices.js +0 -22
  147. package/docs/parakeet-transformers-js/src/utils/dtypes.js +0 -63
  148. package/docs/parakeet-transformers-js/src/utils/generic.js +0 -35
  149. package/docs/parakeet-transformers-js/src/utils/hub.js +0 -780
  150. package/docs/parakeet-transformers-js/src/utils/image.js +0 -834
  151. package/docs/parakeet-transformers-js/src/utils/maths.js +0 -1061
  152. package/docs/parakeet-transformers-js/src/utils/tensor.js +0 -1539
  153. package/docs/parakeet-transformers-js/src/utils/video.js +0 -128
  154. package/docs/parakeet-transformers-js/test/decoder.test.js +0 -114
  155. package/docs/parakeet-transformers-js/test/encoder.test.js +0 -108
  156. package/docs/parakeet-transformers-js/test/preprocessor.test.js +0 -85
  157. package/docs/parakeet-transformers-js/test/tokenizer.test.js +0 -24
  158. package/docs/parakeet-transformers-js/test/transcribe.js +0 -89
  159. package/docs/parakeet-transformers-js/tsconfig.json +0 -21
  160. package/docs/parakeet-transformers-js/webpack.config.js +0 -223
package/.gitmodules ADDED
@@ -0,0 +1,3 @@
1
+ [submodule "examples/hf-spaces-demo"]
2
+ path = examples/hf-spaces-demo
3
+ url = https://huggingface.co/spaces/ysdede/parakeet.js-demo
package/README.md CHANGED
@@ -1,240 +1,241 @@
1
- # Parakeet.js
2
-
3
- Client-side ONNX inference of NVIDIA *Parakeet* speech-to-text models.
4
- Runs entirely in the browser on **WebGPU** or **WASM** via
5
- [ONNX Runtime Web](https://onnxruntime.ai/).
6
-
7
- > **Parakeet.js** offers a high-performance, browser-first implementation for NVIDIA's Parakeet-TDT speech-to-text models, running entirely client-side via WebGPU and WASM. Powered by ONNX Runtime Web, this library makes it simple to integrate state-of-the-art transcription into any web application.
8
-
9
- > **Status:** Early preview – API is subject to change while things stabilise.
10
- > **Note:** Currently only supports the Parakeet-TDT model architecture.
11
-
12
- ---
13
-
14
- ## Installation
15
-
16
- ```bash
17
- # npm
18
- npm i parakeet.js onnxruntime-web
19
-
20
- # yarn
21
- yarn add parakeet.js onnxruntime-web
22
- ```
23
-
24
- `onnxruntime-web` is a peer-dependency that supplies the runtime back-ends (WebGPU, WASM).
25
-
26
- ---
27
-
28
- ## Model assets
29
-
30
- We host ready-to-use ONNX exports on the HuggingFace Hub:
31
-
32
- ```
33
- ysdede/parakeet-tdt-0.6b-v2-onnx
34
- ```
35
-
36
- The helper `getParakeetModel()` downloads all required files and caches them in **IndexedDB**:
37
-
38
- ```js
39
- import { getParakeetModel } from 'parakeet.js';
40
-
41
- const repoId = 'ysdede/parakeet-tdt-0.6b-v2-onnx';
42
- const { urls, filenames } = await getParakeetModel(repoId, {
43
- backend: 'webgpu-hybrid', // webgpu-hybrid | wasm
44
- quantization: 'fp32', // fp32 | int8
45
- decoderInt8: true, // load INT8 decoder even when encoder fp32
46
- preprocessor: 'nemo128', // nemo80 | nemo128
47
- progress: ({file,loaded,total}) => console.log(file, loaded/total)
48
- });
49
- ```
50
-
51
- Returned structure:
52
-
53
- ```ts
54
- {
55
- urls: {
56
- encoderUrl: string,
57
- decoderUrl: string,
58
- encoderDataUrl?: string | null,
59
- decoderDataUrl?: string | null,
60
- tokenizerUrl: string,
61
- preprocessorUrl: string
62
- },
63
- filenames: { encoder: string; decoder: string },
64
- quantisation: { encoder: 'fp32' | 'int8'; decoder: 'fp32' | 'int8' }
65
- }
66
- ```
67
-
68
- ---
69
-
70
- ## Creating a model instance
71
-
72
- ```js
73
- import { ParakeetModel } from 'parakeet.js';
74
-
75
- const model = await ParakeetModel.fromUrls({
76
- ...urls, // spread the URLs returned above
77
- filenames, // needed for external .data mapping
78
- backend: 'webgpu-hybrid',
79
- decoderOnWasm: true, // force decoder to CPU/WASM for micro-kernels
80
- decoderInt8: true, // decoder uses INT8 weights
81
- cpuThreads: 6, // WASM threads (defaults to cores-2)
82
- verbose: false // ORT verbose log
83
- });
84
- ```
85
-
86
- ### Back-end presets
87
-
88
- | Backend string | Encoder EP | Decoder EP | Typical use-case |
89
- |---------------------|------------|------------|------------------|
90
- | `webgpu-hybrid` (default) | WebGPU (fp32) | WASM (fp32/int8) | Modern desktop browsers |
91
- | `webgpu-strict` | WebGPU (fp32) | **fail** if op unsupported | Benchmarking kernels |
92
- | `wasm` | WASM (int8/fp32) | WASM | Low-end devices, Node.js |
93
-
94
- ---
95
-
96
- ## Transcribing audio
97
-
98
- ```js
99
- // 16-kHz mono PCM Float32Array
100
- await model.transcribe(pcmFloat32, 16_000, {
101
- returnTimestamps: true,
102
- returnConfidences: true,
103
- frameStride: 2, // 1 (default) = highest accuracy / 2-4 faster
104
- });
105
- ```
106
-
107
- Extra options:
108
-
109
- | Option | Default | Description |
110
- |--------|---------|-------------|
111
- | `temperature` | 1.2 | Softmax temperature for decoding |
112
- | `frameStride` | 1 | Advance decoder by *n* encoder frames per step |
113
-
114
- ### Result schema
115
-
116
- ```ts
117
- {
118
- utterance_text: string,
119
- words: Array<{text,start_time,end_time,confidence}>,
120
- tokens: Array<{token,start_time,end_time,confidence}>,
121
- confidence_scores: { overall_log_prob, word_avg, token_avg },
122
- metrics: {
123
- rtf: number,
124
- total_ms: number,
125
- preprocess_ms: number,
126
- encode_ms: number,
127
- decode_ms: number,
128
- tokenize_ms: number
129
- },
130
- is_final: true
131
- }
132
- ```
133
-
134
- ---
135
-
136
- ## Warm-up & Verification (Recommended)
137
-
138
- The first time you run inference after loading a model, the underlying runtime needs to compile the execution graph. This makes the first run significantly slower. To ensure a smooth user experience, it's best practice to perform a "warm-up" run with a dummy or known audio sample immediately after model creation.
139
-
140
- Our React demo does this and also verifies the output to ensure the model loaded correctly.
141
-
142
- ```js
143
- // In your app, after `ParakeetModel.fromUrls()` succeeds:
144
- setStatus('Warming up & verifying…');
145
-
146
- const audioRes = await fetch('/assets/known_audio.wav');
147
- const pcm = await decodeAudio(audioRes); // Your audio decoding logic
148
- const { utterance_text } = await model.transcribe(pcm, 16000);
149
-
150
- const expected = 'the known transcript for your audio';
151
- if (utterance_text.toLowerCase().includes(expected)) {
152
- setStatus('Model ready ✔');
153
- } else {
154
- setStatus('Model verification failed!');
155
- }
156
- ```
157
-
158
- ---
159
-
160
- ## Runtime tuning knobs
161
-
162
- | Property | Where | Effect |
163
- |----------|-------|--------|
164
- | `cpuThreads` | `fromUrls()` | Sets `ort.env.wasm.numThreads`; pick *cores-2* for best balance |
165
- | `decoderOnWasm` | `fromUrls()` | Forces decoder session to WASM even in hybrid mode |
166
- | `decoderInt8` | `getParakeetModel()` + `fromUrls()` | Load INT8 weights for decoder only |
167
- | `frameStride` | `transcribe()` | Trade-off latency vs accuracy |
168
- | `enableProfiling` | `fromUrls()` | Enables ORT profiler (JSON written to `/tmp/profile_*.json`) |
169
-
170
- ---
171
-
172
- ## Using the React demo as a template
173
-
174
- Located at `examples/react-demo`.
175
-
176
- Quick start:
177
-
178
- ```bash
179
- cd examples/react-demo
180
- npm i
181
- npm run dev # Vite => http://localhost:5173
182
- ```
183
-
184
- Key components:
185
-
186
- | File | Purpose |
187
- |------|---------|
188
- | `App.jsx` | Complete end-to-end reference UI. Shows how to load a model with progress bars, perform a warm-up/verification step, display performance metrics (RTF, timings), and manage transcription history. |
189
- | `parakeet.js` | Library entry; houses the model wrapper and performance instrumentation. |
190
- | `hub.js` | Lightweight HuggingFace Hub helper downloads and caches model binaries. |
191
-
192
- Copy-paste the `loadModel()` and `transcribeFile()` functions into your app, adjust UI bindings, and you are ready to go.
193
-
194
- ---
195
-
196
- ## 🚀 Live Demo on Hugging Face Spaces
197
-
198
- Try the library instantly in your browser without any setup:
199
-
200
- **🦜 [Parakeet.js Demo on HF Spaces](https://huggingface.co/spaces/ysdede/parakeet.js-demo)**
201
-
202
- This demo showcases:
203
- - **WebGPU/WASM backend selection** - Choose the best performance for your device
204
- - **Real-time transcription** - Upload audio files and see instant results
205
- - **Performance metrics** - View detailed timing information and RTF scores
206
- - **Multi-threaded WASM** - Optimized for maximum performance
207
- - **Complete feature set** - All library capabilities in one place
208
-
209
- The demo is also available locally at `examples/hf-spaces-demo` and can be deployed to your own HF Space.
210
-
211
- ---
212
-
213
- ## Troubleshooting
214
-
215
- | Symptom | Cause | Fix |
216
- |---------|-------|-----|
217
- | `multiple calls to initWasm()` | Two WASM sessions initialised in parallel | In hybrid mode we create encoder session first, then decoder. Keep this order. |
218
- | GPU memory still ~2.4 GB with INT8 selected | WebGPU kernels don't support INT8 yet weights are automatically converted to FP32 | Use `decoderInt8:true` (CPU) or wait for upcoming WebGPU INT8 kernels. |
219
- | `Graph capture feature not available` error | Mixed EPs prevent GPU graph capture | We auto-retry without capture; nothing to do. |
220
-
221
- ---
222
-
223
- ## Changelog
224
-
225
- See `OPTIMIZATION_PLAN.md` for a timeline of performance tweaks and planned features.
226
-
227
- ---
228
-
229
- ## Credits
230
-
231
- This project builds upon the excellent work of:
232
-
233
- - **[istupakov](https://github.com/istupakov)** - For providing the [ONNX-ASR](https://github.com/istupakov/onnx-asr) repository, which served as the foundation and starting point for this JavaScript implementation
234
- - **[istupakov/parakeet-tdt-0.6b-v2-onnx](https://huggingface.co/istupakov/parakeet-tdt-0.6b-v2-onnx)** - For the original ONNX model exports and preprocessor implementations that made browser deployment possible
235
- - **ONNX Runtime Web** - For powering the browser-based inference engine
236
- - **ONNX Runtime Node** - For enabling high-performance server-side inference
237
-
238
- The Python-based ONNX-ASR project provided crucial insights into model handling, preprocessing pipelines, and served as a reference implementation during the development of this browser-compatible version.
239
-
1
+ # Parakeet.js
2
+
3
+ Client-side ONNX inference of NVIDIA *Parakeet* speech-to-text models.
4
+ Runs entirely in the browser on **WebGPU** or **WASM** via
5
+ [ONNX Runtime Web](https://onnxruntime.ai/).
6
+
7
+ > **Parakeet.js** offers a high-performance, browser-first implementation for NVIDIA's Parakeet-TDT speech-to-text models, running entirely client-side via WebGPU and WASM. Powered by ONNX Runtime Web, this library makes it simple to integrate state-of-the-art transcription into any web application.
8
+
9
+ > **Status:** Early preview – API is subject to change while things stabilise.
10
+ > **Note:** Currently only supports the Parakeet-TDT model architecture.
11
+
12
+ ---
13
+
14
+ ## Installation
15
+
16
+ ```bash
17
+ # npm
18
+ npm i parakeet.js onnxruntime-web
19
+
20
+ # yarn
21
+ yarn add parakeet.js onnxruntime-web
22
+ ```
23
+
24
+ `onnxruntime-web` is a peer-dependency that supplies the runtime back-ends (WebGPU, WASM).
25
+
26
+ ---
27
+
28
+ ## Model assets
29
+
30
+ Ready-to-use ONNX exports are available on the HuggingFace Hub:
31
+
32
+ ```
33
+ istupakov/parakeet-tdt-0.6b-v2-onnx
34
+ ```
35
+
36
+ The helper `getParakeetModel()` downloads all required files and caches them in **IndexedDB**:
37
+
38
+ ```js
39
+ import { getParakeetModel } from 'parakeet.js';
40
+
41
+ const repoId = 'istupakov/parakeet-tdt-0.6b-v2-onnx';
42
+ const { urls, filenames } = await getParakeetModel(repoId, {
43
+ backend: 'webgpu', // 'webgpu' or 'wasm'
44
+ encoderQuant: 'fp32', // 'fp32' or 'int8'
45
+ decoderQuant: 'int8', // 'fp32' or 'int8'
46
+ preprocessor: 'nemo128',
47
+ progress: ({file,loaded,total}) => console.log(file, loaded/total)
48
+ });
49
+ ```
50
+
51
+ Returned structure:
52
+
53
+ ```ts
54
+ {
55
+ urls: {
56
+ encoderUrl: string,
57
+ decoderUrl: string,
58
+ encoderDataUrl?: string | null,
59
+ decoderDataUrl?: string | null,
60
+ tokenizerUrl: string,
61
+ preprocessorUrl: string
62
+ },
63
+ filenames: { encoder: string; decoder: string }
64
+ }
65
+ ```
66
+
67
+ ---
68
+
69
+ ## Creating a model instance
70
+
71
+ ```js
72
+ import { ParakeetModel } from 'parakeet.js';
73
+
74
+ const model = await ParakeetModel.fromUrls({
75
+ ...urls, // spread the URLs returned above
76
+ filenames, // needed for external .data mapping
77
+ backend: 'webgpu', // 'webgpu' or 'wasm'
78
+ cpuThreads: 6, // For WASM backend
79
+ verbose: false, // ORT verbose logging
80
+ });
81
+ ```
82
+
83
+ ### Back-end presets
84
+
85
+ The library supports two primary backends: `webgpu` and `wasm`.
86
+
87
+ - **`webgpu` (Default):** This is the fastest option for modern desktop browsers. It runs in a hybrid configuration:
88
+ - The heavy **encoder** model runs on the **GPU** (WebGPU) for maximum throughput.
89
+ - The **decoder** model runs on the **CPU** (WASM). The decoder's architecture contains operations not fully supported by the ONNX Runtime WebGPU backend, causing it to fall back to WASM anyway. This configuration makes the behavior explicit and stable, avoiding performance issues and warnings.
90
+ - In this mode, the encoder must be `fp32`, but you can choose `fp32` or `int8` for the decoder.
91
+
92
+ - **`wasm`:** Both encoder and decoder run on the CPU. This is best for compatibility with older devices or environments without WebGPU support. Both models can be `fp32` or `int8`.
93
+
94
+
95
+ ---
96
+
97
+ ## Transcribing audio
98
+
99
+ ```js
100
+ // 16-kHz mono PCM Float32Array
101
+ await model.transcribe(pcmFloat32, 16_000, {
102
+ returnTimestamps: true,
103
+ returnConfidences: true,
104
+ frameStride: 2, // 1 (default) = highest accuracy / 2-4 faster
105
+ });
106
+ ```
107
+
108
+ Extra options:
109
+
110
+ | Option | Default | Description |
111
+ |--------|---------|-------------|
112
+ | `temperature` | 1.2 | Softmax temperature for decoding |
113
+ | `frameStride` | 1 | Advance decoder by *n* encoder frames per step |
114
+
115
+ ### Result schema
116
+
117
+ ```ts
118
+ {
119
+ utterance_text: string,
120
+ words: Array<{text,start_time,end_time,confidence}>,
121
+ tokens: Array<{token,start_time,end_time,confidence}>,
122
+ confidence_scores: { overall_log_prob, word_avg, token_avg },
123
+ metrics: {
124
+ rtf: number,
125
+ total_ms: number,
126
+ preprocess_ms: number,
127
+ encode_ms: number,
128
+ decode_ms: number,
129
+ tokenize_ms: number
130
+ },
131
+ is_final: true
132
+ }
133
+ ```
134
+
135
+ ---
136
+
137
+ ## Warm-up & Verification (Recommended)
138
+
139
+ The first time you run inference after loading a model, the underlying runtime needs to compile the execution graph. This makes the first run significantly slower. To ensure a smooth user experience, it's best practice to perform a "warm-up" run with a dummy or known audio sample immediately after model creation.
140
+
141
+ Our React demo does this and also verifies the output to ensure the model loaded correctly.
142
+
143
+ ```js
144
+ // In your app, after `ParakeetModel.fromUrls()` succeeds:
145
+ setStatus('Warming up & verifying…');
146
+
147
+ const audioRes = await fetch('/assets/known_audio.wav');
148
+ const pcm = await decodeAudio(audioRes); // Your audio decoding logic
149
+ const { utterance_text } = await model.transcribe(pcm, 16000);
150
+
151
+ const expected = 'the known transcript for your audio';
152
+ if (utterance_text.toLowerCase().includes(expected)) {
153
+ setStatus('Model ready ✔');
154
+ } else {
155
+ setStatus('Model verification failed!');
156
+ }
157
+ ```
158
+
159
+ ---
160
+
161
+ ## Runtime tuning knobs
162
+
163
+ | Property | Where | Effect |
164
+ |----------|-------|--------|
165
+ | `cpuThreads` | `fromUrls()` | Sets `ort.env.wasm.numThreads`; pick *cores-2* for best balance |
166
+ | `encoderQuant` | `getParakeetModel()` | Selects `fp32` or `int8` model for the encoder. |
167
+ | `decoderQuant` | `getParakeetModel()` | Selects `fp32` or `int8` model for the decoder. |
168
+ | `frameStride` | `transcribe()` | Trade-off latency vs accuracy |
169
+ | `enableProfiling` | `fromUrls()` | Enables ORT profiler (JSON written to `/tmp/profile_*.json`) |
170
+
171
+ ---
172
+
173
+ ## Using the React demo as a template
174
+
175
+ Located at `examples/react-demo`.
176
+
177
+ Quick start:
178
+
179
+ ```bash
180
+ cd examples/react-demo
181
+ npm i
182
+ npm run dev # Vite => http://localhost:5173
183
+ ```
184
+
185
+ Key components:
186
+
187
+ | File | Purpose |
188
+ |------|---------|
189
+ | `App.jsx` | Complete end-to-end reference UI. Shows how to load a model with progress bars, perform a warm-up/verification step, display performance metrics (RTF, timings), and manage transcription history. |
190
+ | `parakeet.js` | Library entry; houses the model wrapper and performance instrumentation. |
191
+ | `hub.js` | Lightweight HuggingFace Hub helper – downloads and caches model binaries. |
192
+
193
+ Copy-paste the `loadModel()` and `transcribeFile()` functions into your app, adjust UI bindings, and you are ready to go.
194
+
195
+ ---
196
+
197
+ ## 🚀 Live Demo on Hugging Face Spaces
198
+
199
+ Try the library instantly in your browser without any setup:
200
+
201
+ **🦜 [Parakeet.js Demo on HF Spaces](https://huggingface.co/spaces/ysdede/parakeet.js-demo)**
202
+
203
+ This demo showcases:
204
+ - **WebGPU/WASM backend selection** - Choose the best performance for your device
205
+ - **Real-time transcription** - Upload audio files and see instant results
206
+ - **Performance metrics** - View detailed timing information and RTF scores
207
+ - **Multi-threaded WASM** - Optimized for maximum performance
208
+ - **Complete feature set** - All library capabilities in one place
209
+
210
+ The demo is also available locally at `examples/hf-spaces-demo` and can be deployed to your own HF Space.
211
+
212
+ ---
213
+
214
+ ## Troubleshooting
215
+
216
+ | Symptom | Cause | Fix |
217
+ |---------|-------|-----|
218
+ | `Some nodes were not assigned...` warning | When using the `webgpu` backend, ORT assigns minor operations (`Shape`, `Gather`, etc.) in the encoder to the CPU for efficiency. | This is expected and harmless. The heavy lifting is still done on the GPU. |
219
+ | GPU memory still ~2.4 GB with INT8 selected | In WebGPU mode, the encoder must be `fp32`. The `int8` option only applies to the WASM backend or the decoder in hybrid mode. | This is the expected behavior for the `webgpu` backend. |
220
+ | `Graph capture feature not available` error | Mixed EPs (CPU/GPU) or unsupported ops prevent GPU graph capture. | The library automatically retries without capture; safe to ignore. |
221
+
222
+ ---
223
+
224
+ ## Changelog
225
+
226
+ See `OPTIMIZATION_PLAN.md` for a timeline of performance tweaks and planned features.
227
+
228
+ ---
229
+
230
+ ## Credits
231
+
232
+ This project builds upon the excellent work of:
233
+
234
+ - **[istupakov](https://github.com/istupakov)** - For providing the [ONNX-ASR](https://github.com/istupakov/onnx-asr) repository, which served as the foundation and starting point for this JavaScript implementation
235
+ - **[istupakov/parakeet-tdt-0.6b-v2-onnx](https://huggingface.co/istupakov/parakeet-tdt-0.6b-v2-onnx)** - For the ONNX model exports and preprocessor implementations that made this library possible.
236
+ - **ONNX Runtime Web** - For powering the browser-based inference engine
237
+ - **ONNX Runtime Node** - For enabling high-performance server-side inference
238
+
239
+ The Python-based ONNX-ASR project provided crucial insights into model handling and preprocessing pipelines, and served as a reference implementation during the development of this browser-compatible version.
240
+
240
241
  Happy hacking! 🎉
@@ -10,17 +10,14 @@ app_file: build/index.html
10
10
  license: mit
11
11
  short_description: NVIDIA Parakeet speech recognition for the browser
12
12
  models:
13
- - ysdede/parakeet-tdt-0.6b-v2-onnx
13
+ - istupakov/parakeet-tdt-0.6b-v2-onnx
14
14
  tags:
15
+ - parakeet-js
15
16
  - parakeet
16
- - speech
17
17
  - onnx
18
18
  - webgpu
19
- - wasm
20
- - transcription
21
- - nvidia
22
- - speech-recognition
23
- - browser
19
+ - asr
20
+ - istupakov/parakeet-tdt-0.6b-v2-onnx
24
21
  custom_headers:
25
22
  cross-origin-embedder-policy: require-corp
26
23
  cross-origin-opener-policy: same-origin
@@ -65,7 +62,7 @@ npm install parakeet.js onnxruntime-web
65
62
  import { ParakeetModel, getParakeetModel } from 'parakeet.js';
66
63
 
67
64
  // Load model from HuggingFace Hub
68
- const modelUrls = await getParakeetModel('ysdede/parakeet-tdt-0.6b-v2-onnx');
65
+ const modelUrls = await getParakeetModel('istupakov/parakeet-tdt-0.6b-v2-onnx');
69
66
  const model = await ParakeetModel.fromUrls(modelUrls);
70
67
 
71
68
  // Transcribe audio
@@ -80,7 +77,7 @@ console.log(result.utterance_text);
80
77
 
81
78
  ## 🧠 Model Information
82
79
 
83
- This demo uses the **ysdede/parakeet-tdt-0.6b-v2-onnx** model, which is an ONNX-converted version of NVIDIA's Parakeet speech recognition model optimized for browser deployment.
80
+ This demo uses the **istupakov/parakeet-tdt-0.6b-v2-onnx** model, which is an ONNX-converted version of NVIDIA's Parakeet speech recognition model optimized for browser deployment.
84
81
 
85
82
  ## 💡 Technical Details
86
83
 
@@ -7,7 +7,7 @@
7
7
  "@testing-library/jest-dom": "^6.6.3",
8
8
  "@testing-library/react": "^16.3.0",
9
9
  "@testing-library/user-event": "^13.5.0",
10
- "parakeet.js": "^0.0.1",
10
+ "parakeet.js": "^0.0.3",
11
11
  "onnxruntime-web": "1.22.0-dev.20250409-89f8206ba4",
12
12
  "react": "^19.1.0",
13
13
  "react-dom": "^19.1.0",