parakeet.js 0.0.2 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (188) hide show
  1. package/README.md +337 -239
  2. package/package.json +9 -2
  3. package/src/backend.js +106 -98
  4. package/src/hub.js +245 -241
  5. package/src/index.js +36 -28
  6. package/src/models.js +138 -0
  7. package/src/parakeet.js +172 -39
  8. package/src/preprocessor.js +85 -68
  9. package/src/tokenizer.js +24 -6
  10. package/docs/parakeet-transformers-js/.gitattributes +0 -2
  11. package/docs/parakeet-transformers-js/.prettierignore +0 -8
  12. package/docs/parakeet-transformers-js/.prettierrc +0 -10
  13. package/docs/parakeet-transformers-js/.tmp_features.json +0 -1
  14. package/docs/parakeet-transformers-js/LICENSE +0 -202
  15. package/docs/parakeet-transformers-js/README.md +0 -448
  16. package/docs/parakeet-transformers-js/assets/nemo128.onnx +0 -0
  17. package/docs/parakeet-transformers-js/assets/nemo80.onnx +0 -0
  18. package/docs/parakeet-transformers-js/debug_test.js +0 -84
  19. package/docs/parakeet-transformers-js/dev/inspect_decoder.cjs +0 -9
  20. package/docs/parakeet-transformers-js/dev/inspect_joiner.cjs +0 -9
  21. package/docs/parakeet-transformers-js/dev/js_step_by_step.js +0 -249
  22. package/docs/parakeet-transformers-js/dev/parakeet_cli.js +0 -91
  23. package/docs/parakeet-transformers-js/jest.config.mjs +0 -194
  24. package/docs/parakeet-transformers-js/js_preprocessing.json +0 -225
  25. package/docs/parakeet-transformers-js/js_step_by_step.json +0 -837
  26. package/docs/parakeet-transformers-js/js_step_by_step_v2.json +0 -450
  27. package/docs/parakeet-transformers-js/js_step_by_step_v3.json +0 -450
  28. package/docs/parakeet-transformers-js/js_steps.json +0 -821
  29. package/docs/parakeet-transformers-js/package-lock.json +0 -12251
  30. package/docs/parakeet-transformers-js/package.json +0 -96
  31. package/docs/parakeet-transformers-js/src/audio_features.js +0 -178
  32. package/docs/parakeet-transformers-js/src/backends/onnx.js +0 -210
  33. package/docs/parakeet-transformers-js/src/base/feature_extraction_utils.js +0 -54
  34. package/docs/parakeet-transformers-js/src/base/image_processors_utils.js +0 -1105
  35. package/docs/parakeet-transformers-js/src/base/processing_utils.js +0 -173
  36. package/docs/parakeet-transformers-js/src/configs.js +0 -455
  37. package/docs/parakeet-transformers-js/src/env.js +0 -167
  38. package/docs/parakeet-transformers-js/src/generation/configuration_utils.js +0 -388
  39. package/docs/parakeet-transformers-js/src/generation/logits_process.js +0 -727
  40. package/docs/parakeet-transformers-js/src/generation/logits_sampler.js +0 -204
  41. package/docs/parakeet-transformers-js/src/generation/parameters.js +0 -35
  42. package/docs/parakeet-transformers-js/src/generation/stopping_criteria.js +0 -156
  43. package/docs/parakeet-transformers-js/src/generation/streamers.js +0 -225
  44. package/docs/parakeet-transformers-js/src/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js +0 -85
  45. package/docs/parakeet-transformers-js/src/models/auto/feature_extraction_auto.js +0 -25
  46. package/docs/parakeet-transformers-js/src/models/auto/image_processing_auto.js +0 -29
  47. package/docs/parakeet-transformers-js/src/models/auto/processing_auto.js +0 -85
  48. package/docs/parakeet-transformers-js/src/models/beit/image_processing_beit.js +0 -5
  49. package/docs/parakeet-transformers-js/src/models/bit/image_processing_bit.js +0 -5
  50. package/docs/parakeet-transformers-js/src/models/chinese_clip/image_processing_chinese_clip.js +0 -5
  51. package/docs/parakeet-transformers-js/src/models/clap/feature_extraction_clap.js +0 -159
  52. package/docs/parakeet-transformers-js/src/models/clip/image_processing_clip.js +0 -6
  53. package/docs/parakeet-transformers-js/src/models/convnext/image_processing_convnext.js +0 -46
  54. package/docs/parakeet-transformers-js/src/models/dac/feature_extraction_dac.js +0 -3
  55. package/docs/parakeet-transformers-js/src/models/deit/image_processing_deit.js +0 -6
  56. package/docs/parakeet-transformers-js/src/models/detr/image_processing_detr.js +0 -52
  57. package/docs/parakeet-transformers-js/src/models/donut/image_processing_donut.js +0 -31
  58. package/docs/parakeet-transformers-js/src/models/dpt/image_processing_dpt.js +0 -6
  59. package/docs/parakeet-transformers-js/src/models/efficientnet/image_processing_efficientnet.js +0 -14
  60. package/docs/parakeet-transformers-js/src/models/encodec/feature_extraction_encodec.js +0 -32
  61. package/docs/parakeet-transformers-js/src/models/feature_extractors.js +0 -17
  62. package/docs/parakeet-transformers-js/src/models/florence2/processing_florence2.js +0 -131
  63. package/docs/parakeet-transformers-js/src/models/gemma3n/feature_extraction_gemma3n.js +0 -97
  64. package/docs/parakeet-transformers-js/src/models/gemma3n/processing_gemma3n.js +0 -74
  65. package/docs/parakeet-transformers-js/src/models/glpn/image_processing_glpn.js +0 -5
  66. package/docs/parakeet-transformers-js/src/models/grounding_dino/image_processing_grounding_dino.js +0 -29
  67. package/docs/parakeet-transformers-js/src/models/grounding_dino/processing_grounding_dino.js +0 -101
  68. package/docs/parakeet-transformers-js/src/models/idefics3/image_processing_idefics3.js +0 -232
  69. package/docs/parakeet-transformers-js/src/models/idefics3/processing_idefics3.js +0 -136
  70. package/docs/parakeet-transformers-js/src/models/image_processors.js +0 -40
  71. package/docs/parakeet-transformers-js/src/models/janus/image_processing_janus.js +0 -27
  72. package/docs/parakeet-transformers-js/src/models/janus/processing_janus.js +0 -123
  73. package/docs/parakeet-transformers-js/src/models/jina_clip/image_processing_jina_clip.js +0 -26
  74. package/docs/parakeet-transformers-js/src/models/jina_clip/processing_jina_clip.js +0 -24
  75. package/docs/parakeet-transformers-js/src/models/llava/processing_llava.js +0 -44
  76. package/docs/parakeet-transformers-js/src/models/llava_onevision/image_processing_llava_onevision.js +0 -5
  77. package/docs/parakeet-transformers-js/src/models/mask2former/image_processing_mask2former.js +0 -5
  78. package/docs/parakeet-transformers-js/src/models/maskformer/image_processing_maskformer.js +0 -18
  79. package/docs/parakeet-transformers-js/src/models/mgp_str/processing_mgp_str.js +0 -172
  80. package/docs/parakeet-transformers-js/src/models/mobilenet_v1/image_processing_mobilenet_v1.js +0 -7
  81. package/docs/parakeet-transformers-js/src/models/mobilenet_v2/image_processing_mobilenet_v2.js +0 -7
  82. package/docs/parakeet-transformers-js/src/models/mobilenet_v3/image_processing_mobilenet_v3.js +0 -7
  83. package/docs/parakeet-transformers-js/src/models/mobilenet_v4/image_processing_mobilenet_v4.js +0 -7
  84. package/docs/parakeet-transformers-js/src/models/mobilevit/image_processing_mobilevit.js +0 -6
  85. package/docs/parakeet-transformers-js/src/models/moonshine/feature_extraction_moonshine.js +0 -26
  86. package/docs/parakeet-transformers-js/src/models/moonshine/processing_moonshine.js +0 -20
  87. package/docs/parakeet-transformers-js/src/models/nougat/image_processing_nougat.js +0 -5
  88. package/docs/parakeet-transformers-js/src/models/owlv2/image_processing_owlv2.js +0 -5
  89. package/docs/parakeet-transformers-js/src/models/owlvit/image_processing_owlvit.js +0 -12
  90. package/docs/parakeet-transformers-js/src/models/owlvit/processing_owlvit.js +0 -7
  91. package/docs/parakeet-transformers-js/src/models/paligemma/processing_paligemma.js +0 -83
  92. package/docs/parakeet-transformers-js/src/models/parakeet/feature_extraction_parakeet.js +0 -3
  93. package/docs/parakeet-transformers-js/src/models/parakeet/modeling_parakeet.js +0 -3
  94. package/docs/parakeet-transformers-js/src/models/parakeet/processing_parakeet.js +0 -3
  95. package/docs/parakeet-transformers-js/src/models/parakeet/tokenization_parakeet.js +0 -3
  96. package/docs/parakeet-transformers-js/src/models/phi3_v/image_processing_phi3_v.js +0 -163
  97. package/docs/parakeet-transformers-js/src/models/phi3_v/processing_phi3_v.js +0 -53
  98. package/docs/parakeet-transformers-js/src/models/processors.js +0 -22
  99. package/docs/parakeet-transformers-js/src/models/pvt/image_processing_pvt.js +0 -5
  100. package/docs/parakeet-transformers-js/src/models/pyannote/feature_extraction_pyannote.js +0 -85
  101. package/docs/parakeet-transformers-js/src/models/pyannote/processing_pyannote.js +0 -24
  102. package/docs/parakeet-transformers-js/src/models/qwen2_vl/image_processing_qwen2_vl.js +0 -52
  103. package/docs/parakeet-transformers-js/src/models/qwen2_vl/processing_qwen2_vl.js +0 -53
  104. package/docs/parakeet-transformers-js/src/models/rt_detr/image_processing_rt_detr.js +0 -12
  105. package/docs/parakeet-transformers-js/src/models/sam/image_processing_sam.js +0 -242
  106. package/docs/parakeet-transformers-js/src/models/sam/processing_sam.js +0 -20
  107. package/docs/parakeet-transformers-js/src/models/sapiens/image_processing_sapiens.js +0 -13
  108. package/docs/parakeet-transformers-js/src/models/seamless_m4t/feature_extraction_seamless_m4t.js +0 -175
  109. package/docs/parakeet-transformers-js/src/models/segformer/image_processing_segformer.js +0 -13
  110. package/docs/parakeet-transformers-js/src/models/siglip/image_processing_siglip.js +0 -5
  111. package/docs/parakeet-transformers-js/src/models/smolvlm/image_processing_smolvlm.js +0 -2
  112. package/docs/parakeet-transformers-js/src/models/smolvlm/processing_smolvlm.js +0 -2
  113. package/docs/parakeet-transformers-js/src/models/snac/feature_extraction_snac.js +0 -3
  114. package/docs/parakeet-transformers-js/src/models/speecht5/feature_extraction_speecht5.js +0 -4
  115. package/docs/parakeet-transformers-js/src/models/speecht5/processing_speecht5.js +0 -17
  116. package/docs/parakeet-transformers-js/src/models/swin2sr/image_processing_swin2sr.js +0 -24
  117. package/docs/parakeet-transformers-js/src/models/ultravox/processing_ultravox.js +0 -54
  118. package/docs/parakeet-transformers-js/src/models/vit/image_processing_vit.js +0 -7
  119. package/docs/parakeet-transformers-js/src/models/vitmatte/image_processing_vitmatte.js +0 -50
  120. package/docs/parakeet-transformers-js/src/models/vitpose/image_processing_vitpose.js +0 -89
  121. package/docs/parakeet-transformers-js/src/models/wav2vec2/feature_extraction_wav2vec2.js +0 -44
  122. package/docs/parakeet-transformers-js/src/models/wav2vec2/processing_wav2vec2.js +0 -17
  123. package/docs/parakeet-transformers-js/src/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.js +0 -17
  124. package/docs/parakeet-transformers-js/src/models/wespeaker/feature_extraction_wespeaker.js +0 -95
  125. package/docs/parakeet-transformers-js/src/models/whisper/common_whisper.js +0 -157
  126. package/docs/parakeet-transformers-js/src/models/whisper/feature_extraction_whisper.js +0 -92
  127. package/docs/parakeet-transformers-js/src/models/whisper/generation_whisper.js +0 -89
  128. package/docs/parakeet-transformers-js/src/models/whisper/processing_whisper.js +0 -21
  129. package/docs/parakeet-transformers-js/src/models/yolos/image_processing_yolos.js +0 -12
  130. package/docs/parakeet-transformers-js/src/models.js +0 -8644
  131. package/docs/parakeet-transformers-js/src/ops/registry.js +0 -133
  132. package/docs/parakeet-transformers-js/src/ort_env.js +0 -8
  133. package/docs/parakeet-transformers-js/src/parakeet.js +0 -792
  134. package/docs/parakeet-transformers-js/src/pipelines.js +0 -3540
  135. package/docs/parakeet-transformers-js/src/processors.js +0 -16
  136. package/docs/parakeet-transformers-js/src/tokenizers.js +0 -4432
  137. package/docs/parakeet-transformers-js/src/transformers.js +0 -50
  138. package/docs/parakeet-transformers-js/src/utils/audio.js +0 -893
  139. package/docs/parakeet-transformers-js/src/utils/constants.js +0 -9
  140. package/docs/parakeet-transformers-js/src/utils/core.js +0 -259
  141. package/docs/parakeet-transformers-js/src/utils/data-structures.js +0 -574
  142. package/docs/parakeet-transformers-js/src/utils/devices.js +0 -22
  143. package/docs/parakeet-transformers-js/src/utils/dtypes.js +0 -63
  144. package/docs/parakeet-transformers-js/src/utils/generic.js +0 -35
  145. package/docs/parakeet-transformers-js/src/utils/hub.js +0 -780
  146. package/docs/parakeet-transformers-js/src/utils/image.js +0 -834
  147. package/docs/parakeet-transformers-js/src/utils/maths.js +0 -1061
  148. package/docs/parakeet-transformers-js/src/utils/tensor.js +0 -1539
  149. package/docs/parakeet-transformers-js/src/utils/video.js +0 -128
  150. package/docs/parakeet-transformers-js/test/decoder.test.js +0 -114
  151. package/docs/parakeet-transformers-js/test/encoder.test.js +0 -108
  152. package/docs/parakeet-transformers-js/test/preprocessor.test.js +0 -85
  153. package/docs/parakeet-transformers-js/test/tokenizer.test.js +0 -24
  154. package/docs/parakeet-transformers-js/test/transcribe.js +0 -89
  155. package/docs/parakeet-transformers-js/tsconfig.json +0 -21
  156. package/docs/parakeet-transformers-js/webpack.config.js +0 -223
  157. package/examples/hf-spaces-demo/.gitattributes +0 -35
  158. package/examples/hf-spaces-demo/README.md +0 -95
  159. package/examples/hf-spaces-demo/package-lock.json +0 -17690
  160. package/examples/hf-spaces-demo/package.json +0 -41
  161. package/examples/hf-spaces-demo/public/favicon.ico +0 -0
  162. package/examples/hf-spaces-demo/public/index.html +0 -43
  163. package/examples/hf-spaces-demo/public/logo192.png +0 -0
  164. package/examples/hf-spaces-demo/public/logo512.png +0 -0
  165. package/examples/hf-spaces-demo/public/manifest.json +0 -25
  166. package/examples/hf-spaces-demo/public/robots.txt +0 -3
  167. package/examples/hf-spaces-demo/src/App.css +0 -170
  168. package/examples/hf-spaces-demo/src/App.js +0 -316
  169. package/examples/hf-spaces-demo/src/App.test.js +0 -8
  170. package/examples/hf-spaces-demo/src/index.css +0 -13
  171. package/examples/hf-spaces-demo/src/index.js +0 -17
  172. package/examples/hf-spaces-demo/src/logo.svg +0 -1
  173. package/examples/hf-spaces-demo/src/reportWebVitals.js +0 -13
  174. package/examples/hf-spaces-demo/src/setupTests.js +0 -5
  175. package/examples/react-demo/index.html +0 -12
  176. package/examples/react-demo/package.json +0 -20
  177. package/examples/react-demo/src/App.css +0 -134
  178. package/examples/react-demo/src/App.jsx +0 -327
  179. package/examples/react-demo/src/main.jsx +0 -6
  180. package/examples/react-demo/vite.config.js +0 -41
  181. package/examples/react-demo-dev/index.html +0 -12
  182. package/examples/react-demo-dev/package-lock.json +0 -1417
  183. package/examples/react-demo-dev/package.json +0 -20
  184. package/examples/react-demo-dev/public/assets/life_Jim.wav +0 -0
  185. package/examples/react-demo-dev/src/App.css +0 -134
  186. package/examples/react-demo-dev/src/App.jsx +0 -327
  187. package/examples/react-demo-dev/src/main.jsx +0 -6
  188. package/examples/react-demo-dev/vite.config.js +0 -41
package/README.md CHANGED
@@ -1,240 +1,338 @@
1
- # Parakeet.js
2
-
3
- Client-side ONNX inference of NVIDIA *Parakeet* speech-to-text models.
4
- Runs entirely in the browser on **WebGPU** or **WASM** via
5
- [ONNX Runtime Web](https://onnxruntime.ai/).
6
-
7
- > **Parakeet.js** offers a high-performance, browser-first implementation for NVIDIA's Parakeet-TDT speech-to-text models, running entirely client-side via WebGPU and WASM. Powered by ONNX Runtime Web, this library makes it simple to integrate state-of-the-art transcription into any web application.
8
-
9
- > **Status:** Early preview – API is subject to change while things stabilise.
10
- > **Note:** Currently only supports the Parakeet-TDT model architecture.
11
-
12
- ---
13
-
14
- ## Installation
15
-
16
- ```bash
17
- # npm
18
- npm i parakeet.js onnxruntime-web
19
-
20
- # yarn
21
- yarn add parakeet.js onnxruntime-web
22
- ```
23
-
24
- `onnxruntime-web` is a peer-dependency that supplies the runtime back-ends (WebGPU, WASM).
25
-
26
- ---
27
-
28
- ## Model assets
29
-
30
- We host ready-to-use ONNX exports on the HuggingFace Hub:
31
-
32
- ```
33
- ysdede/parakeet-tdt-0.6b-v2-onnx
34
- ```
35
-
36
- The helper `getParakeetModel()` downloads all required files and caches them in **IndexedDB**:
37
-
38
- ```js
39
- import { getParakeetModel } from 'parakeet.js';
40
-
41
- const repoId = 'ysdede/parakeet-tdt-0.6b-v2-onnx';
42
- const { urls, filenames } = await getParakeetModel(repoId, {
43
- backend: 'webgpu-hybrid', // webgpu-hybrid | wasm
44
- quantization: 'fp32', // fp32 | int8
45
- decoderInt8: true, // load INT8 decoder even when encoder fp32
46
- preprocessor: 'nemo128', // nemo80 | nemo128
47
- progress: ({file,loaded,total}) => console.log(file, loaded/total)
48
- });
49
- ```
50
-
51
- Returned structure:
52
-
53
- ```ts
54
- {
55
- urls: {
56
- encoderUrl: string,
57
- decoderUrl: string,
58
- encoderDataUrl?: string | null,
59
- decoderDataUrl?: string | null,
60
- tokenizerUrl: string,
61
- preprocessorUrl: string
62
- },
63
- filenames: { encoder: string; decoder: string },
64
- quantisation: { encoder: 'fp32' | 'int8'; decoder: 'fp32' | 'int8' }
65
- }
66
- ```
67
-
68
- ---
69
-
70
- ## Creating a model instance
71
-
72
- ```js
73
- import { ParakeetModel } from 'parakeet.js';
74
-
75
- const model = await ParakeetModel.fromUrls({
76
- ...urls, // spread the URLs returned above
77
- filenames, // needed for external .data mapping
78
- backend: 'webgpu-hybrid',
79
- decoderOnWasm: true, // force decoder to CPU/WASM for micro-kernels
80
- decoderInt8: true, // decoder uses INT8 weights
81
- cpuThreads: 6, // WASM threads (defaults to cores-2)
82
- verbose: false // ORT verbose log
83
- });
84
- ```
85
-
86
- ### Back-end presets
87
-
88
- | Backend string | Encoder EP | Decoder EP | Typical use-case |
89
- |---------------------|------------|------------|------------------|
90
- | `webgpu-hybrid` (default) | WebGPU (fp32) | WASM (fp32/int8) | Modern desktop browsers |
91
- | `webgpu-strict` | WebGPU (fp32) | **fail** if op unsupported | Benchmarking kernels |
92
- | `wasm` | WASM (int8/fp32) | WASM | Low-end devices, Node.js |
93
-
94
- ---
95
-
96
- ## Transcribing audio
97
-
98
- ```js
99
- // 16-kHz mono PCM Float32Array
100
- await model.transcribe(pcmFloat32, 16_000, {
101
- returnTimestamps: true,
102
- returnConfidences: true,
103
- frameStride: 2, // 1 (default) = highest accuracy / 2-4 faster
104
- });
105
- ```
106
-
107
- Extra options:
108
-
109
- | Option | Default | Description |
110
- |--------|---------|-------------|
111
- | `temperature` | 1.2 | Softmax temperature for decoding |
112
- | `frameStride` | 1 | Advance decoder by *n* encoder frames per step |
113
-
114
- ### Result schema
115
-
116
- ```ts
117
- {
118
- utterance_text: string,
119
- words: Array<{text,start_time,end_time,confidence}>,
120
- tokens: Array<{token,start_time,end_time,confidence}>,
121
- confidence_scores: { overall_log_prob, word_avg, token_avg },
122
- metrics: {
123
- rtf: number,
124
- total_ms: number,
125
- preprocess_ms: number,
126
- encode_ms: number,
127
- decode_ms: number,
128
- tokenize_ms: number
129
- },
130
- is_final: true
131
- }
132
- ```
133
-
134
- ---
135
-
136
- ## Warm-up & Verification (Recommended)
137
-
138
- The first time you run inference after loading a model, the underlying runtime needs to compile the execution graph. This makes the first run significantly slower. To ensure a smooth user experience, it's best practice to perform a "warm-up" run with a dummy or known audio sample immediately after model creation.
139
-
140
- Our React demo does this and also verifies the output to ensure the model loaded correctly.
141
-
142
- ```js
143
- // In your app, after `ParakeetModel.fromUrls()` succeeds:
144
- setStatus('Warming up & verifying…');
145
-
146
- const audioRes = await fetch('/assets/known_audio.wav');
147
- const pcm = await decodeAudio(audioRes); // Your audio decoding logic
148
- const { utterance_text } = await model.transcribe(pcm, 16000);
149
-
150
- const expected = 'the known transcript for your audio';
151
- if (utterance_text.toLowerCase().includes(expected)) {
152
- setStatus('Model ready ✔');
153
- } else {
154
- setStatus('Model verification failed!');
155
- }
156
- ```
157
-
158
- ---
159
-
160
- ## Runtime tuning knobs
161
-
162
- | Property | Where | Effect |
163
- |----------|-------|--------|
164
- | `cpuThreads` | `fromUrls()` | Sets `ort.env.wasm.numThreads`; pick *cores-2* for best balance |
165
- | `decoderOnWasm` | `fromUrls()` | Forces decoder session to WASM even in hybrid mode |
166
- | `decoderInt8` | `getParakeetModel()` + `fromUrls()` | Load INT8 weights for decoder only |
167
- | `frameStride` | `transcribe()` | Trade-off latency vs accuracy |
168
- | `enableProfiling` | `fromUrls()` | Enables ORT profiler (JSON written to `/tmp/profile_*.json`) |
169
-
170
- ---
171
-
172
- ## Using the React demo as a template
173
-
174
- Located at `examples/react-demo`.
175
-
176
- Quick start:
177
-
178
- ```bash
179
- cd examples/react-demo
180
- npm i
181
- npm run dev # Vite => http://localhost:5173
182
- ```
183
-
184
- Key components:
185
-
186
- | File | Purpose |
187
- |------|---------|
188
- | `App.jsx` | Complete end-to-end reference UI. Shows how to load a model with progress bars, perform a warm-up/verification step, display performance metrics (RTF, timings), and manage transcription history. |
189
- | `parakeet.js` | Library entry; houses the model wrapper and performance instrumentation. |
190
- | `hub.js` | Lightweight HuggingFace Hub helper – downloads and caches model binaries. |
191
-
192
- Copy-paste the `loadModel()` and `transcribeFile()` functions into your app, adjust UI bindings, and you are ready to go.
193
-
194
- ---
195
-
196
- ## 🚀 Live Demo on Hugging Face Spaces
197
-
198
- Try the library instantly in your browser without any setup:
199
-
200
- **🦜 [Parakeet.js Demo on HF Spaces](https://huggingface.co/spaces/ysdede/parakeet.js-demo)**
201
-
202
- This demo showcases:
203
- - **WebGPU/WASM backend selection** - Choose the best performance for your device
204
- - **Real-time transcription** - Upload audio files and see instant results
205
- - **Performance metrics** - View detailed timing information and RTF scores
206
- - **Multi-threaded WASM** - Optimized for maximum performance
207
- - **Complete feature set** - All library capabilities in one place
208
-
209
- The demo is also available locally at `examples/hf-spaces-demo` and can be deployed to your own HF Space.
210
-
211
- ---
212
-
213
- ## Troubleshooting
214
-
215
- | Symptom | Cause | Fix |
216
- |---------|-------|-----|
217
- | `multiple calls to initWasm()` | Two WASM sessions initialised in parallel | In hybrid mode we create encoder session first, then decoder. Keep this order. |
218
- | GPU memory still ~2.4 GB with INT8 selected | WebGPU kernels don't support INT8 yet – weights are automatically converted to FP32 | Use `decoderInt8:true` (CPU) or wait for upcoming WebGPU INT8 kernels. |
219
- | `Graph capture feature not available` error | Mixed EPs prevent GPU graph capture | We auto-retry without capture; nothing to do. |
220
-
221
- ---
222
-
223
- ## Changelog
224
-
225
- See `OPTIMIZATION_PLAN.md` for a timeline of performance tweaks and planned features.
226
-
227
- ---
228
-
229
- ## Credits
230
-
231
- This project builds upon the excellent work of:
232
-
233
- - **[istupakov](https://github.com/istupakov)** - For providing the [ONNX-ASR](https://github.com/istupakov/onnx-asr) repository, which served as the foundation and starting point for this JavaScript implementation
234
- - **[istupakov/parakeet-tdt-0.6b-v2-onnx](https://huggingface.co/istupakov/parakeet-tdt-0.6b-v2-onnx)** - For the original ONNX model exports and preprocessor implementations that made browser deployment possible
235
- - **ONNX Runtime Web** - For powering the browser-based inference engine
236
- - **ONNX Runtime Node** - For enabling high-performance server-side inference
237
-
238
- The Python-based ONNX-ASR project provided crucial insights into model handling, preprocessing pipelines, and served as a reference implementation during the development of this browser-compatible version.
239
-
1
+ # Parakeet.js
2
+
3
+ Client-side ONNX inference of NVIDIA *Parakeet* speech-to-text models.
4
+ Runs entirely in the browser on **WebGPU** or **WASM** via
5
+ [ONNX Runtime Web](https://onnxruntime.ai/).
6
+
7
+ > **Parakeet.js** offers a high-performance, browser-first implementation for NVIDIA's Parakeet-TDT speech-to-text models, running entirely client-side via WebGPU and WASM. Powered by ONNX Runtime Web, this library makes it simple to integrate state-of-the-art transcription into any web application.
8
+
9
+ > **Status:** Early preview – API is subject to change while things stabilise.
10
+ > **Note:** Currently supports Parakeet-TDT v2 (English) and v3 (Multilingual) model architectures.
11
+
12
+ ---
13
+
14
+ ## What's New (v0.3.x)
15
+
16
+ ### 🌍 Parakeet TDT v3 Multilingual Support
17
+ - Added support for **Parakeet TDT 0.6B v3** with 13 languages: English, French, German, Spanish, Italian, Portuguese, Dutch, Polish, Russian, Ukrainian, Japanese, Korean, Chinese
18
+ - Both v2 (English-only) and v3 (Multilingual) models now work out of the box
19
+ - Use model keys for easier loading: `'parakeet-tdt-0.6b-v2'` or `'parakeet-tdt-0.6b-v3'`
20
+
21
+ ### 🎛️ Model Configuration API
22
+ - New `MODELS` export with model metadata (supported languages, vocab size, etc.)
23
+ - `getModelConfig()` for programmatic model introspection
24
+ - `supportsLanguage()` helper to check language compatibility
25
+
26
+ ### 🧪 Demo App Improvements
27
+ - **Model selector** dropdown to switch between v2 and v3
28
+ - **Language selector** (context-aware, shows only supported languages)
29
+ - **Quick Test** feature with HuggingFace speech datasets (People's Speech, MLS)
30
+ - **Reference text** display for comparing transcription accuracy
31
+
32
+ ---
33
+
34
+ ## Installation
35
+
36
+ ```bash
37
+ # npm
38
+ npm i parakeet.js onnxruntime-web
39
+
40
+ # yarn
41
+ yarn add parakeet.js onnxruntime-web
42
+ ```
43
+
44
+ `onnxruntime-web` is a peer-dependency that supplies the runtime back-ends (WebGPU, WASM).
45
+
46
+ ---
47
+
48
+ ## Model assets
49
+
50
+ We host ready-to-use ONNX exports on the HuggingFace Hub:
51
+
52
+ | Model | Languages | Repo ID |
53
+ |-------|-----------|---------|
54
+ | Parakeet TDT 0.6B v2 | English | `istupakov/parakeet-tdt-0.6b-v2-onnx` |
55
+ | Parakeet TDT 0.6B v3 | 13 languages | `istupakov/parakeet-tdt-0.6b-v3-onnx` |
56
+
57
+ The helper `getParakeetModel()` downloads all required files and caches them in **IndexedDB**:
58
+
59
+ ```js
60
+ import { getParakeetModel, MODELS } from 'parakeet.js';
61
+
62
+ // Option 1: Use model key (recommended)
63
+ const { urls, filenames, modelConfig } = await getParakeetModel('parakeet-tdt-0.6b-v3', {
64
+ backend: 'webgpu',
65
+ progress: ({file,loaded,total}) => console.log(file, loaded/total)
66
+ });
67
+
68
+ // Option 2: Use repo ID directly
69
+ const { urls, filenames } = await getParakeetModel('istupakov/parakeet-tdt-0.6b-v2-onnx', {
70
+ backend: 'webgpu',
71
+ encoderQuant: 'fp32',
72
+ decoderQuant: 'int8',
73
+ preprocessor: 'nemo128',
74
+ });
75
+ ```
76
+
77
+ Returned structure:
78
+
79
+ ```ts
80
+ {
81
+ urls: {
82
+ encoderUrl: string,
83
+ decoderUrl: string,
84
+ encoderDataUrl?: string | null,
85
+ decoderDataUrl?: string | null,
86
+ tokenizerUrl: string,
87
+ preprocessorUrl: string
88
+ },
89
+ filenames: { encoder: string; decoder: string }
90
+ }
91
+ ```
92
+
93
+ ---
94
+
95
+ ## Creating a model instance
96
+
97
+ ```js
98
+ import { ParakeetModel } from 'parakeet.js';
99
+
100
+ const model = await ParakeetModel.fromUrls({
101
+ ...urls, // spread the URLs returned above
102
+ filenames, // needed for external .data mapping
103
+ backend: 'webgpu', // 'webgpu' or 'wasm'
104
+ cpuThreads: 6, // For WASM backend
105
+ verbose: false, // ORT verbose logging
106
+ });
107
+ ```
108
+
109
+ ### Back-end presets
110
+
111
+ The library supports two primary backends: `webgpu` and `wasm`.
112
+
113
+ - **`webgpu` (Default):** This is the fastest option for modern desktop browsers. It runs in a hybrid configuration:
114
+ - The heavy **encoder** model runs on the **GPU** (WebGPU) for maximum throughput.
115
+ - The **decoder** model runs on the **CPU** (WASM). The decoder's architecture contains operations not fully supported by the ONNX Runtime WebGPU backend, causing it to fall back to WASM anyway. This configuration makes the behavior explicit and stable, avoiding performance issues and warnings.
116
+ - In this mode, the encoder must be `fp32`, but you can choose `fp32` or `int8` for the decoder.
117
+
118
+ - **`wasm`:** Both encoder and decoder run on the CPU. This is best for compatibility with older devices or environments without WebGPU support. Both models can be `fp32` or `int8`.
119
+
120
+
121
+ ---
122
+
123
+ ## Transcribing audio
124
+
125
+ ```js
126
+ // 16-kHz mono PCM Float32Array
127
+ await model.transcribe(pcmFloat32, 16_000, {
128
+ returnTimestamps: true,
129
+ returnConfidences: true,
130
+ frameStride: 2, // 1 (default) = highest accuracy / 2-4 faster
131
+ });
132
+ ```
133
+
134
+ Extra options:
135
+
136
+ | Option | Default | Description |
137
+ |--------|---------|-------------|
138
+ | `temperature` | 1.0 | Softmax temperature for decoding (1.0 = greedy, >1.0 = sampling) |
139
+ | `frameStride` | 1 | Advance decoder by *n* encoder frames per step |
140
+
141
+ ### Result schema
142
+
143
+ ```ts
144
+ {
145
+ utterance_text: string,
146
+ words: Array<{text,start_time,end_time,confidence}>,
147
+ tokens: Array<{token,start_time,end_time,confidence}>,
148
+ confidence_scores: { overall_log_prob, word_avg, token_avg },
149
+ metrics: {
150
+ rtf: number,
151
+ total_ms: number,
152
+ preprocess_ms: number,
153
+ encode_ms: number,
154
+ decode_ms: number,
155
+ tokenize_ms: number
156
+ },
157
+ is_final: true
158
+ }
159
+ ```
160
+
161
+ ---
162
+
163
+ ## Warm-up & Verification (Recommended)
164
+
165
+ The first time you run inference after loading a model, the underlying runtime needs to compile the execution graph. This makes the first run significantly slower. To ensure a smooth user experience, it's best practice to perform a "warm-up" run with a dummy or known audio sample immediately after model creation.
166
+
167
+ Our React demo does this and also verifies the output to ensure the model loaded correctly.
168
+
169
+ ```js
170
+ // In your app, after `ParakeetModel.fromUrls()` succeeds:
171
+ setStatus('Warming up & verifying…');
172
+
173
+ const audioRes = await fetch('/assets/known_audio.wav');
174
+ const pcm = await decodeAudio(audioRes); // Your audio decoding logic
175
+ const { utterance_text } = await model.transcribe(pcm, 16000);
176
+
177
+ const expected = 'the known transcript for your audio';
178
+ if (utterance_text.toLowerCase().includes(expected)) {
179
+ setStatus('Model ready ✔');
180
+ } else {
181
+ setStatus('Model verification failed!');
182
+ }
183
+ ```
184
+
185
+ ---
186
+
187
+ ## Runtime tuning knobs
188
+
189
+ | Property | Where | Effect |
190
+ |----------|-------|--------|
191
+ | `cpuThreads` | `fromUrls()` | Sets `ort.env.wasm.numThreads`; pick *cores-2* for best balance |
192
+ | `encoderQuant` | `getParakeetModel()` | Selects `fp32` or `int8` model for the encoder. |
193
+ | `decoderQuant` | `getParakeetModel()` | Selects `fp32` or `int8` model for the decoder. |
194
+ | `frameStride` | `transcribe()` | Trade-off latency vs accuracy |
195
+ | `enableProfiling` | `fromUrls()` | Enables ORT profiler (JSON written to `/tmp/profile_*.json`) |
196
+
197
+ ---
198
+
199
+ ## Model Configuration API
200
+
201
+ Query model metadata programmatically:
202
+
203
+ ```js
204
+ import { MODELS, LANGUAGE_NAMES, getModelConfig, supportsLanguage } from 'parakeet.js';
205
+
206
+ // List all available models
207
+ console.log(Object.keys(MODELS));
208
+ // ['parakeet-tdt-0.6b-v2', 'parakeet-tdt-0.6b-v3']
209
+
210
+ // Get model config
211
+ const config = getModelConfig('parakeet-tdt-0.6b-v3');
212
+ console.log(config.languages); // ['en', 'fr', 'de', 'es', ...]
213
+ console.log(config.displayName); // 'Parakeet TDT 0.6B v3 (Multilingual)'
214
+
215
+ // Check language support
216
+ supportsLanguage('parakeet-tdt-0.6b-v3', 'fr'); // true
217
+ supportsLanguage('parakeet-tdt-0.6b-v2', 'fr'); // false
218
+
219
+ // Get language display names
220
+ console.log(LANGUAGE_NAMES['fr']); // 'French'
221
+ ```
222
+
223
+ ---
224
+
225
+ ## Using the React demo as a template
226
+
227
+ Located at `examples/react-demo` (production) and `examples/react-demo-dev` (development).
228
+
229
+ Quick start:
230
+
231
+ ```bash
232
+ cd examples/react-demo-dev
233
+ npm i
234
+ npm run dev # Vite => http://localhost:5173
235
+ ```
236
+
237
+ ### Demo Features
238
+
239
+ The development demo (`react-demo-dev`) includes advanced features:
240
+
241
+ - **Model Selector**: Switch between v2 (English) and v3 (Multilingual)
242
+ - **Language Selector**: Context-aware dropdown showing only supported languages
243
+ - **Quick Test**: Load random samples from HuggingFace speech datasets
244
+ - **Reference Text**: Compare transcription against ground truth
245
+
246
+ ### Speech Dataset Utilities (Demo Only)
247
+
248
+ The demo includes reusable utilities for testing with HuggingFace datasets:
249
+
250
+ ```js
251
+ // Located in: examples/react-demo-dev/src/utils/speechDatasets.js
252
+ import { fetchRandomSample, hasTestSamples, SPEECH_DATASETS } from './utils/speechDatasets';
253
+
254
+ // Check if test samples are available for a language
255
+ if (hasTestSamples('fr')) {
256
+ // Fetch a random French audio sample with transcription
257
+ const sample = await fetchRandomSample('fr', {
258
+ targetSampleRate: 16000,
259
+ onProgress: ({ message }) => console.log(message),
260
+ });
261
+
262
+ console.log(sample.transcription); // Ground truth text
263
+ console.log(sample.pcm); // Float32Array audio
264
+ console.log(sample.duration); // Duration in seconds
265
+ }
266
+ ```
267
+
268
+ **Supported languages for testing:** English (People's Speech), French, German, Spanish, Italian, Portuguese, Dutch, Polish (Multilingual LibriSpeech)
269
+
270
+ ### Key Files
271
+
272
+ | File | Purpose |
273
+ |------|---------|
274
+ | `App.jsx` | Complete end-to-end reference UI with model/language selection, performance metrics, and transcription history |
275
+ | `utils/speechDatasets.js` | Reusable utilities for fetching test samples from HuggingFace datasets |
276
+
277
+ Copy-paste the `loadModel()` and `transcribeFile()` functions into your app, adjust UI bindings, and you are ready to go.
278
+
279
+ ---
280
+
281
+ ## 🚀 Live Demo on Hugging Face Spaces
282
+
283
+ Try the library instantly in your browser without any setup:
284
+
285
+ **🦜 [Parakeet.js Demo on HF Spaces](https://huggingface.co/spaces/ysdede/parakeet.js-demo)**
286
+
287
+ This demo showcases:
288
+ - **WebGPU/WASM backend selection** - Choose the best performance for your device
289
+ - **Real-time transcription** - Upload audio files and see instant results
290
+ - **Performance metrics** - View detailed timing information and RTF scores
291
+ - **Multi-threaded WASM** - Optimized for maximum performance
292
+ - **Complete feature set** - All library capabilities in one place
293
+
294
+ The demo is also available locally at `examples/hf-spaces-demo` and can be deployed to your own HF Space.
295
+
296
+ ---
297
+
298
+ ## Troubleshooting
299
+
300
+ | Symptom | Cause | Fix |
301
+ |---------|-------|-----|
302
+ | `Some nodes were not assigned...` warning | When using the `webgpu` backend, ORT assigns minor operations (`Shape`, `Gather`, etc.) in the encoder to the CPU for efficiency. | This is expected and harmless. The heavy lifting is still done on the GPU. |
303
+ | GPU memory still ~2.4 GB with INT8 selected | In WebGPU mode, the encoder must be `fp32`. The `int8` option only applies to the WASM backend or the decoder in hybrid mode. | This is the expected behavior for the `webgpu` backend. |
304
+ | `Graph capture feature not available` error | Mixed EPs (CPU/GPU) or unsupported ops prevent GPU graph capture. | The library automatically retries without capture; safe to ignore. |
305
+
306
+ ---
307
+
308
+ ## Changelog
309
+
310
+ ### v0.3.x (January 2026)
311
+ - ✨ **Multilingual Support**: Added Parakeet TDT 0.6B v3 with 13 languages
312
+ - 🎛️ **Model Config API**: New `MODELS`, `LANGUAGE_NAMES`, `getModelConfig()`, `supportsLanguage()` exports
313
+ - 🧪 **Demo Enhancements**: Model/language selectors, HuggingFace dataset testing
314
+ - 🔧 **TDT Decoding Fix**: Aligned decoding logic with NeMo framework for improved accuracy
315
+ - 🌊 **Streaming Support**: Added incremental transcription capabilities
316
+
317
+ ### v0.2.x
318
+ - Initial WebGPU/WASM hybrid backend
319
+ - IndexedDB model caching
320
+ - Performance instrumentation (RTF, timing metrics)
321
+
322
+ See `OPTIMIZATION_PLAN.md` for detailed performance notes.
323
+
324
+ ---
325
+
326
+ ## Credits
327
+
328
+ This project builds upon the excellent work of:
329
+
330
+ - **[istupakov](https://github.com/istupakov)** - For providing the [ONNX-ASR](https://github.com/istupakov/onnx-asr) repository, which served as the foundation and starting point for this JavaScript implementation
331
+ - **[istupakov/parakeet-tdt-0.6b-v2-onnx](https://huggingface.co/istupakov/parakeet-tdt-0.6b-v2-onnx)** - English model exports
332
+ - **[istupakov/parakeet-tdt-0.6b-v3-onnx](https://huggingface.co/istupakov/parakeet-tdt-0.6b-v3-onnx)** - Multilingual model exports
333
+ - **ONNX Runtime Web** - For powering the browser-based inference engine
334
+ - **HuggingFace Datasets** - People's Speech, Multilingual LibriSpeech for testing
335
+
336
+ The Python-based ONNX-ASR project provided crucial insights into model handling, preprocessing pipelines, and served as a reference implementation during the development of this browser-compatible version.
337
+
240
338
  Happy hacking! 🎉
package/package.json CHANGED
@@ -1,18 +1,25 @@
1
1
  {
2
2
  "name": "parakeet.js",
3
- "version": "0.0.2",
3
+ "version": "1.0.0",
4
4
  "description": "NVIDIA Parakeet speech recognition for the browser (WebGPU/WASM) powered by ONNX Runtime Web.",
5
5
  "type": "module",
6
6
  "exports": {
7
7
  ".": "./src/index.js"
8
8
  },
9
+ "files": [
10
+ "src",
11
+ "README.md",
12
+ "LICENSE"
13
+ ],
9
14
  "keywords": [
10
15
  "parakeet",
11
16
  "speech",
12
17
  "onnx",
13
18
  "webgpu",
14
19
  "wasm",
15
- "transcription"
20
+ "transcription",
21
+ "multilingual",
22
+ "asr"
16
23
  ],
17
24
  "dependencies": {
18
25
  "onnxruntime-web": "1.22.0-dev.20250409-89f8206ba4"