@tryhamster/gerbil 1.0.0-rc.9 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (179) hide show
  1. package/LICENSE +1 -1
  2. package/README.md +247 -84
  3. package/dist/architectures-C1I5V3Dt.mjs +6070 -0
  4. package/dist/architectures-C1I5V3Dt.mjs.map +1 -0
  5. package/dist/browser/index.d.ts +264 -588
  6. package/dist/browser/index.d.ts.map +1 -1
  7. package/dist/browser/index.js +585 -2334
  8. package/dist/browser/index.js.map +1 -1
  9. package/dist/cli.mjs +625 -1098
  10. package/dist/cli.mjs.map +1 -1
  11. package/dist/defaults-9komdrbY.mjs +24 -0
  12. package/dist/defaults-9komdrbY.mjs.map +1 -0
  13. package/dist/frameworks/express.d.mts +1 -3
  14. package/dist/frameworks/express.d.mts.map +1 -1
  15. package/dist/frameworks/express.mjs +7 -7
  16. package/dist/frameworks/express.mjs.map +1 -1
  17. package/dist/frameworks/fastify.d.mts +1 -1
  18. package/dist/frameworks/fastify.d.mts.map +1 -1
  19. package/dist/frameworks/fastify.mjs +3 -3
  20. package/dist/frameworks/fastify.mjs.map +1 -1
  21. package/dist/frameworks/hono.d.mts +1 -1
  22. package/dist/frameworks/hono.d.mts.map +1 -1
  23. package/dist/frameworks/hono.mjs +4 -4
  24. package/dist/frameworks/hono.mjs.map +1 -1
  25. package/dist/frameworks/next.d.mts +3 -2
  26. package/dist/frameworks/next.d.mts.map +1 -1
  27. package/dist/frameworks/next.mjs +4 -4
  28. package/dist/frameworks/next.mjs.map +1 -1
  29. package/dist/frameworks/react.d.mts +1 -1
  30. package/dist/frameworks/trpc.d.mts +1 -1
  31. package/dist/frameworks/trpc.d.mts.map +1 -1
  32. package/dist/frameworks/trpc.mjs +4 -4
  33. package/dist/frameworks/trpc.mjs.map +1 -1
  34. package/dist/gerbil-BHrJJIa4.mjs +1656 -0
  35. package/dist/gerbil-BHrJJIa4.mjs.map +1 -0
  36. package/dist/gerbil-BT9fCydo.d.mts +488 -0
  37. package/dist/gerbil-BT9fCydo.d.mts.map +1 -0
  38. package/dist/gerbil-DomNfIr1.mjs +4 -0
  39. package/dist/gpu/hooks.d.mts +520 -0
  40. package/dist/gpu/hooks.d.mts.map +1 -0
  41. package/dist/gpu/hooks.mjs +1188 -0
  42. package/dist/gpu/hooks.mjs.map +1 -0
  43. package/dist/gpu/index.d.mts +2 -0
  44. package/dist/gpu/index.mjs +6 -0
  45. package/dist/gpu-33qCAtHW.mjs +3615 -0
  46. package/dist/gpu-33qCAtHW.mjs.map +1 -0
  47. package/dist/index-Dgmb2kE3.d.mts +245 -0
  48. package/dist/index-Dgmb2kE3.d.mts.map +1 -0
  49. package/dist/index-jEAL2s-A.d.mts +2022 -0
  50. package/dist/index-jEAL2s-A.d.mts.map +1 -0
  51. package/dist/index.d.mts +22 -487
  52. package/dist/index.d.mts.map +1 -1
  53. package/dist/index.mjs +13 -8
  54. package/dist/index.mjs.map +1 -1
  55. package/dist/indexeddb-store-BWIMtxxH.mjs +103 -0
  56. package/dist/indexeddb-store-BWIMtxxH.mjs.map +1 -0
  57. package/dist/indexeddb-store-ClH12Xnl.mjs +4 -0
  58. package/dist/integrations/ai-sdk.d.mts +75 -6
  59. package/dist/integrations/ai-sdk.d.mts.map +1 -1
  60. package/dist/integrations/ai-sdk.mjs +131 -15
  61. package/dist/integrations/ai-sdk.mjs.map +1 -1
  62. package/dist/integrations/langchain.d.mts +1 -1
  63. package/dist/integrations/langchain.d.mts.map +1 -1
  64. package/dist/integrations/langchain.mjs +5 -5
  65. package/dist/integrations/langchain.mjs.map +1 -1
  66. package/dist/integrations/llamaindex.d.mts +1 -1
  67. package/dist/integrations/llamaindex.d.mts.map +1 -1
  68. package/dist/integrations/llamaindex.mjs +5 -5
  69. package/dist/integrations/llamaindex.mjs.map +1 -1
  70. package/dist/integrations/mcp-client.mjs +3 -3
  71. package/dist/integrations/mcp-client.mjs.map +1 -1
  72. package/dist/integrations/mcp.d.mts +3 -2
  73. package/dist/integrations/mcp.d.mts.map +1 -1
  74. package/dist/integrations/mcp.mjs +5 -5
  75. package/dist/{mcp-BvbriaBy.mjs → mcp-1DaMsaBc.mjs} +4 -4
  76. package/dist/mcp-1DaMsaBc.mjs.map +1 -0
  77. package/dist/memory/index.d.mts +3 -0
  78. package/dist/memory/index.mjs +6 -0
  79. package/dist/memory-D1P7Tmda.mjs +4 -0
  80. package/dist/memory-DVN0MnIG.mjs +132 -0
  81. package/dist/memory-DVN0MnIG.mjs.map +1 -0
  82. package/dist/memory-Dj0J1v88.mjs +294 -0
  83. package/dist/memory-Dj0J1v88.mjs.map +1 -0
  84. package/dist/moonshine-stt-BLyVoRpB.mjs +4 -0
  85. package/dist/moonshine-stt-v_P_Ci_m.mjs +11936 -0
  86. package/dist/moonshine-stt-v_P_Ci_m.mjs.map +1 -0
  87. package/dist/{one-liner-s-lD8rCC.mjs → one-liner-DnQn7HJK.mjs} +14 -16
  88. package/dist/one-liner-DnQn7HJK.mjs.map +1 -0
  89. package/dist/repl-jV5gcJFA.mjs +9 -0
  90. package/dist/skills/index.d.mts +270 -320
  91. package/dist/skills/index.d.mts.map +1 -1
  92. package/dist/skills/index.mjs +5 -5
  93. package/dist/{skills-CD3Orlex.mjs → skills-DX8D59UH.mjs} +187 -32
  94. package/dist/skills-DX8D59UH.mjs.map +1 -0
  95. package/dist/{tools-Bi1P7Xoy.mjs → tools-DQ1mPUw5.mjs} +34 -22
  96. package/dist/tools-DQ1mPUw5.mjs.map +1 -0
  97. package/dist/{types-CiTc7ez3.d.mts → types-D6FiR_oh.d.mts} +106 -12
  98. package/dist/types-D6FiR_oh.d.mts.map +1 -0
  99. package/dist/types-DQBe2lFo.d.mts +165 -0
  100. package/dist/types-DQBe2lFo.d.mts.map +1 -0
  101. package/dist/{utils-CZBZ8dgR.mjs → utils-DKO55ZmZ.mjs} +1 -1
  102. package/dist/{utils-CZBZ8dgR.mjs.map → utils-DKO55ZmZ.mjs.map} +1 -1
  103. package/dist/vector-B0panuy6.mjs +95 -0
  104. package/dist/vector-B0panuy6.mjs.map +1 -0
  105. package/docs/PROJECT-STATE.md +321 -0
  106. package/docs/adding-a-model-family.md +280 -0
  107. package/docs/ai-sdk.md +70 -61
  108. package/docs/architecture/overview.md +17 -7
  109. package/docs/browser.md +203 -8
  110. package/docs/embeddings.md +156 -0
  111. package/docs/gerbil-site-native-migration.md +217 -0
  112. package/docs/gpu-engine/architectures.md +398 -0
  113. package/docs/gpu-engine/ir.md +372 -0
  114. package/docs/gpu-engine/kernels.md +718 -0
  115. package/docs/gpu-engine/paper.html +1759 -0
  116. package/docs/gpu-engine/paper.md +2109 -0
  117. package/docs/gpu-engine/safetensors.md +312 -0
  118. package/docs/gpu-engine/tokenizer.md +302 -0
  119. package/docs/memory-rag.md +91 -0
  120. package/docs/metal-safari-intel.md +190 -0
  121. package/docs/mobile-failure-diagnosis.md +124 -0
  122. package/docs/mobile.md +99 -0
  123. package/docs/observability.md +230 -0
  124. package/docs/onnx-removal-plan.md +339 -0
  125. package/docs/research/autoresearch-portable.md +904 -0
  126. package/docs/research/dispatch-reduction-hivemind.md +84 -0
  127. package/docs/research/ios-safari-model-caching.md +117 -0
  128. package/docs/research/mobile-webgpu-speed-fusion.md +135 -0
  129. package/docs/research/native-stt-model-selection.md +49 -0
  130. package/docs/research/native-tts-model-selection.md +90 -0
  131. package/docs/research/native-vs-chromium-decision.md +152 -0
  132. package/docs/research/nemotron-mamba2-inference.md +910 -0
  133. package/docs/research/qwen35-multimodal.md +293 -0
  134. package/docs/research/qwen36-gemma4-targets.md +337 -0
  135. package/docs/research/sota-embedding-models.md +179 -0
  136. package/docs/research/sota-mobile-models-2026.md +263 -0
  137. package/docs/research/sota-modality-models.md +202 -0
  138. package/docs/research/tps-baselines.md +71 -0
  139. package/docs/research/webgpu-m4-reference.md +104 -0
  140. package/docs/site-update-plan.md +155 -0
  141. package/docs/structured-output.md +123 -0
  142. package/docs/stt.md +63 -446
  143. package/docs/tts.md +77 -499
  144. package/docs/vision.md +100 -338
  145. package/package.json +22 -7
  146. package/dist/chrome-backend-CORwaIyC.mjs +0 -1212
  147. package/dist/chrome-backend-CORwaIyC.mjs.map +0 -1
  148. package/dist/chrome-backend-DIKYoWj-.mjs +0 -3
  149. package/dist/gerbil-CJ3ifloF.mjs +0 -4
  150. package/dist/gerbil-Dw4Qj77e.mjs +0 -1631
  151. package/dist/gerbil-Dw4Qj77e.mjs.map +0 -1
  152. package/dist/gerbil-qOTe1nl2.d.mts +0 -431
  153. package/dist/gerbil-qOTe1nl2.d.mts.map +0 -1
  154. package/dist/kokoro-BNTb6egA.mjs +0 -20210
  155. package/dist/kokoro-BNTb6egA.mjs.map +0 -1
  156. package/dist/kokoro-CMOGDSgT.js +0 -20212
  157. package/dist/kokoro-CMOGDSgT.js.map +0 -1
  158. package/dist/mcp-BvbriaBy.mjs.map +0 -1
  159. package/dist/one-liner-s-lD8rCC.mjs.map +0 -1
  160. package/dist/repl-DveXw36T.mjs +0 -9
  161. package/dist/skills-CD3Orlex.mjs.map +0 -1
  162. package/dist/stt-Bu-E23Sc.js +0 -433
  163. package/dist/stt-Bu-E23Sc.js.map +0 -1
  164. package/dist/stt-CpLYbGFd.mjs +0 -433
  165. package/dist/stt-CpLYbGFd.mjs.map +0 -1
  166. package/dist/stt-DRPLEEHB.mjs +0 -3
  167. package/dist/tools-Bi1P7Xoy.mjs.map +0 -1
  168. package/dist/transformers.web-DiD1gTwk.js +0 -44695
  169. package/dist/transformers.web-DiD1gTwk.js.map +0 -1
  170. package/dist/transformers.web-u34VxRFM.js +0 -3
  171. package/dist/tts-CqroPaSK.js +0 -724
  172. package/dist/tts-CqroPaSK.js.map +0 -1
  173. package/dist/tts-DXgsKGCe.mjs +0 -3
  174. package/dist/tts-DeGANMNV.mjs +0 -730
  175. package/dist/tts-DeGANMNV.mjs.map +0 -1
  176. package/dist/types-CiTc7ez3.d.mts.map +0 -1
  177. /package/dist/{auto-update-S9s5-g0C.mjs → auto-update-BVaLXcDE.mjs} +0 -0
  178. /package/dist/{chunk-CkXuGtQK.mjs → chunk-B9cbKln6.mjs} +0 -0
  179. /package/dist/{microphone-DaMZFRuR.mjs → microphone-Bqmoz9_K.mjs} +0 -0
@@ -0,0 +1,123 @@
1
+ # Structured Output
2
+
3
+ Gerbil can make a model return a **JSON object** instead of free-form text. The engine
4
+ generates, extracts the JSON from the output, validates it, and — if it fails — retries with
5
+ a corrective nudge until the result is valid or the retry budget is exhausted.
6
+
7
+ This is available at four levels:
8
+
9
+ - **Engine** — `WebGPUEngine.generateObject()` (lowest level)
10
+ - **`Gerbil` class** — `g.generateObject()` (Node-facing wrapper)
11
+ - **One-liner** — `generateObject()` (zero-setup singleton)
12
+ - **React** — the `useObject` hook (`@tryhamster/gerbil/gpu/hooks`)
13
+
14
+ > `generateObject` is distinct from the Zod-driven `Gerbil.json()`. `json()` parses against a
15
+ > Zod schema; `generateObject()` validates with a predicate or a minimal
16
+ > `{ required: [...] }` schema and is the same code path the `useObject` hook uses.
17
+
18
+ ## Validation
19
+
20
+ The `schema` option is an optional `ObjectValidator`. It can be one of:
21
+
22
+ - **A predicate** `(o: unknown) => boolean` — full control; return `true` when the parsed
23
+ value is acceptable.
24
+ - **A minimal schema object** `{ required: string[]; properties?: Record<string, unknown> }`
25
+ — only the `required` key set is enforced (every key must exist on the parsed object).
26
+ - **Omitted** — any syntactically valid JSON value (object or array) is accepted.
27
+
28
+ `maxRetries` (default `4`) is the number of retries **after** the first attempt, so up to
29
+ `maxRetries + 1` generations run. The result includes `attempts` (1 = first try).
30
+
31
+ ## Engine
32
+
33
+ ```typescript
34
+ import { WebGPUEngine } from "@tryhamster/gerbil/gpu";
35
+
36
+ const engine = await WebGPUEngine.create({ repo: "mlx-community/Qwen3.5-0.8B-4bit" });
37
+
38
+ const { object, text, attempts } = await engine.generateObject(
39
+ 'Extract {name, age} from: "I am Sarah, 28"',
40
+ { schema: { required: ["name", "age"] }, maxRetries: 4 },
41
+ );
42
+ // object === { name: "Sarah", age: 28 }
43
+ ```
44
+
45
+ `generateObject` extends the engine's `GenerateOptions`, so `maxTokens`, `systemPrompt`, and
46
+ `sampling` (e.g. `{ temperature: 0.2 }`) are all available.
47
+
48
+ ## `Gerbil` class
49
+
50
+ ```typescript
51
+ import { Gerbil } from "@tryhamster/gerbil";
52
+
53
+ const g = new Gerbil();
54
+ await g.loadModel("qwen3.5-0.8b");
55
+
56
+ const { object } = await g.generateObject<{ primes: number[] }>(
57
+ "List the first 3 primes as {primes: number[]}",
58
+ { schema: (o) => Array.isArray((o as any).primes) },
59
+ );
60
+ ```
61
+
62
+ ## One-liner
63
+
64
+ ```typescript
65
+ import { generateObject } from "@tryhamster/gerbil";
66
+
67
+ const { object } = await generateObject(
68
+ "Return {ok: true}",
69
+ { model: "qwen3.5-0.8b", schema: { required: ["ok"] } },
70
+ );
71
+ ```
72
+
73
+ ## React (`useObject`)
74
+
75
+ ```tsx
76
+ import { useObject } from "@tryhamster/gerbil/gpu/hooks";
77
+
78
+ function CityExtractor() {
79
+ const { generate, object, isGenerating, attempts } = useObject<{ city: string }>({
80
+ model: "mlx-community/Qwen3.5-0.8B-4bit",
81
+ });
82
+
83
+ return (
84
+ <div>
85
+ <button
86
+ disabled={isGenerating}
87
+ onClick={() =>
88
+ generate("Extract the city: I live in Paris", { schema: { required: ["city"] } })
89
+ }
90
+ >
91
+ Extract
92
+ </button>
93
+ {object && <pre>{JSON.stringify(object, null, 2)} ({attempts} attempts)</pre>}
94
+ </div>
95
+ );
96
+ }
97
+ ```
98
+
99
+ ## CLI
100
+
101
+ ```bash
102
+ # Inline, no schema (any valid JSON accepted)
103
+ gerbil object "Return {ok: true} as JSON"
104
+
105
+ # With a schema file (a minimal { required: [...] } validator)
106
+ echo '{ "required": ["name", "age"] }' > person.json
107
+ gerbil object "Extract {name, age}: I am Sarah, 28" --schema person.json
108
+
109
+ # Flags: --model, --max-tokens, --temperature, --retries, --schema
110
+ ```
111
+
112
+ The command prints the parsed object as pretty JSON followed by the attempt count.
113
+
114
+ ## Types
115
+
116
+ ```typescript
117
+ import type {
118
+ GenerateObjectOptions,
119
+ GenerateObjectResult,
120
+ ObjectSchema,
121
+ ObjectValidator,
122
+ } from "@tryhamster/gerbil";
123
+ ```
package/docs/stt.md CHANGED
@@ -1,494 +1,111 @@
1
1
  # Speech-to-Text (STT)
2
2
 
3
- Gerbil provides local speech-to-text transcription using Whisper ONNX models via transformers.js. No API keys required - everything runs on-device.
3
+ Gerbil transcribes audio with **Moonshine** — a native, raw-waveform encoder/decoder STT
4
+ model (no FFT, no log-mel) running on the WebGPU engine. It uses a Conv1d front-end over the
5
+ raw waveform, encodes once, then autoregressively decodes with cross-attention into the
6
+ frozen encoder K/V. No ONNX, no API keys.
7
+
8
+ > **Pre-1.0.** Moonshine is the only STT path. The old Whisper ONNX/transformers.js lane has
9
+ > been removed. The `Gerbil`-class `transcribe()` method still works but now runs native
10
+ > Moonshine under the hood (see [below](#gerbil-class-transcribe-native-wrapper)).
4
11
 
5
12
  ## Quick Start
6
13
 
7
- ### Node.js
14
+ Moonshine has its own engine class, `MoonshineSTT`. Feed it raw **16 kHz mono PCM**
15
+ (`Float32Array`).
8
16
 
9
17
  ```typescript
10
- import { Gerbil } from "@tryhamster/gerbil";
11
- import { readFileSync } from "fs";
18
+ import { MoonshineSTT } from "@tryhamster/gerbil/gpu";
12
19
 
13
- const g = new Gerbil();
20
+ const stt = await MoonshineSTT.create({ repo: "UsefulSensors/moonshine-base" });
14
21
 
15
- // Transcribe a WAV file
16
- const audioData = new Uint8Array(readFileSync("audio.wav"));
17
- const result = await g.transcribe(audioData);
22
+ // pcm: Float32Array, 16 kHz mono
23
+ const result = await stt.transcribe(pcm);
18
24
  console.log(result.text);
19
- ```
20
-
21
- ### CLI
22
25
 
23
- ```bash
24
- # Transcribe audio file
25
- gerbil transcribe audio.wav
26
-
27
- # With timestamps
28
- gerbil transcribe audio.wav --timestamps
29
-
30
- # List available models
31
- gerbil transcribe --list-models
32
-
33
- # Use a different model
34
- gerbil transcribe audio.wav --model whisper-base.en
26
+ stt.destroy();
35
27
  ```
36
28
 
37
- ## Available Models
38
-
39
- | Model | Size | Languages | Description |
40
- |-------|------|-----------|-------------|
41
- | `whisper-tiny.en` | 39M | English | Fastest, good for simple audio |
42
- | `whisper-tiny` | 39M | Multi | Multilingual tiny model |
43
- | `whisper-base.en` | 74M | English | Good balance of speed/accuracy |
44
- | `whisper-base` | 74M | Multi | Multilingual base model |
45
- | `whisper-small.en` | 244M | English | High quality transcription |
46
- | `whisper-small` | 244M | Multi | High quality multilingual |
47
- | `whisper-large-v3-turbo` | 809M | 80+ langs | Best quality, 5.4x faster than v3 |
29
+ ## API
48
30
 
49
- ## API Reference
50
-
51
- ### Gerbil Class Methods
31
+ ### `MoonshineSTT.create(options?)`
52
32
 
53
33
  ```typescript
54
- // Load STT model explicitly (optional - auto-loads on first transcribe)
55
- await g.loadSTT("whisper-tiny.en", {
56
- onProgress: (p) => console.log(p.status),
57
- });
58
-
59
- // Transcribe audio
60
- const result = await g.transcribe(audioData, {
61
- language: "en", // Language hint (multilingual models only)
62
- timestamps: true, // Return segment timestamps
63
- onProgress: (p) => console.log(p.status),
64
- });
65
-
66
- // Result structure
67
- console.log(result.text); // Full transcription
68
- console.log(result.duration); // Audio duration in seconds
69
- console.log(result.totalTime); // Processing time in ms
70
- console.log(result.segments); // Timestamped segments (if requested)
71
-
72
- // Check if loaded
73
- console.log(g.isSTTLoaded());
74
-
75
- // List available models
76
- const models = await g.listSTTModels();
77
- ```
78
-
79
- ### Audio Input Formats
80
-
81
- ```typescript
82
- // WAV file (Uint8Array) - automatically decoded
83
- const wavData = new Uint8Array(fs.readFileSync("audio.wav"));
84
- await g.transcribe(wavData);
85
-
86
- // Raw audio (Float32Array at 16kHz mono)
87
- const raw16k = new Float32Array([...]); // 16kHz mono samples
88
- await g.transcribe(raw16k);
89
-
90
- // WAV at any sample rate - automatically resampled to 16kHz
91
- const wav44k = new Uint8Array(fs.readFileSync("audio-44khz.wav"));
92
- await g.transcribe(wav44k); // Resampled internally
93
- ```
94
-
95
- ### Transcription with Timestamps
96
-
97
- ```typescript
98
- const result = await g.transcribe(audioData, { timestamps: true });
99
-
100
- for (const segment of result.segments || []) {
101
- console.log(`[${segment.start.toFixed(1)}s - ${segment.end.toFixed(1)}s] ${segment.text}`);
34
+ interface MoonshineSTTOptions {
35
+ /** HF repo (default "UsefulSensors/moonshine-base"). */
36
+ repo?: string;
37
+ revision?: string;
38
+ hfToken?: string;
39
+ cacheDir?: string;
40
+ onProgress?: (loaded: number, total: number, message: string) => void;
102
41
  }
103
- // [0.0s - 5.2s] Hello world, this is a test
104
- // [5.2s - 8.0s] of the transcription system
105
- ```
106
-
107
- ### Language Detection (Multilingual Models)
108
-
109
- ```typescript
110
- // Use multilingual model
111
- await g.loadSTT("whisper-base");
112
-
113
- // Let Whisper detect language
114
- const result = await g.transcribe(audioData);
115
- console.log(result.language); // "en", "es", "fr", etc.
116
-
117
- // Or provide a language hint
118
- const spanishResult = await g.transcribe(audioData, { language: "es" });
119
- ```
120
-
121
- ## Direct WhisperSTT Class
122
-
123
- For lower-level control:
124
-
125
- ```typescript
126
- import { WhisperSTT, decodeWav, resampleAudio } from "@tryhamster/gerbil/core/stt";
127
-
128
- const stt = new WhisperSTT("whisper-tiny.en");
129
-
130
- await stt.load({
131
- onProgress: (p) => console.log(p.status),
132
- });
133
-
134
- const result = await stt.transcribe(audioData, {
135
- timestamps: true,
136
- });
137
-
138
- console.log(result.text);
139
-
140
- stt.dispose();
141
- ```
142
-
143
- ### Audio Utilities
144
-
145
- ```typescript
146
- import { decodeWav, resampleAudio } from "@tryhamster/gerbil/core/stt";
147
-
148
- // Decode WAV to Float32Array
149
- const { audio, sampleRate } = decodeWav(wavUint8Array);
150
-
151
- // Resample to 16kHz (Whisper requirement)
152
- const audio16k = resampleAudio(audio, sampleRate, 16000);
153
- ```
154
-
155
- ## Streaming Transcription (Real-time)
156
-
157
- Transcribe audio in chunks as it comes in - perfect for live captioning, call transcription, or real-time subtitles:
158
-
159
- ```typescript
160
- import { Gerbil } from "@tryhamster/gerbil";
161
-
162
- const g = new Gerbil();
163
- await g.loadSTT("whisper-tiny.en");
164
-
165
- // Create streaming session
166
- const session = await g.createStreamingTranscription({
167
- chunkDuration: 1500, // Transcribe every 1.5 seconds (default)
168
- onChunk: (text, chunkIndex) => {
169
- console.log(`Chunk ${chunkIndex}: ${text}`);
170
- },
171
- onTranscript: (fullText) => {
172
- console.log("Full transcript:", fullText);
173
- },
174
- onError: (err) => console.error(err),
175
- });
176
-
177
- // Start the session (begins automatic interval-based transcription)
178
- session.start();
179
-
180
- // Feed audio data as it comes in (Float32Array at 16kHz)
181
- // This could come from a microphone, WebRTC stream, etc.
182
- session.feedAudio(audioChunk);
183
- session.feedAudio(moreAudioChunk);
184
-
185
- // Or manually trigger transcription of buffered audio
186
- await session.flush();
187
-
188
- // Get current state
189
- console.log(session.getTranscript()); // Full accumulated text
190
- console.log(session.getChunkCount()); // Number of chunks processed
191
- console.log(session.isRunning()); // Whether session is active
192
-
193
- // Stop and get final transcript
194
- const finalText = await session.stop();
195
- console.log("Final:", finalText);
196
-
197
- // Reset for reuse
198
- session.reset();
199
42
  ```
200
43
 
201
- ### Session API
44
+ ### `transcribe(pcm, options?)`
202
45
 
203
46
  ```typescript
204
- interface StreamingTranscriptionSession {
205
- feedAudio(audio: Float32Array): void; // Add audio to buffer
206
- flush(): Promise<string>; // Transcribe buffer now
207
- start(): void; // Start auto-transcription
208
- stop(): Promise<string>; // Stop and get final text
209
- isRunning(): boolean; // Check if running
210
- getTranscript(): string; // Get current full text
211
- getChunkCount(): number; // Number of chunks processed
212
- reset(): void; // Clear buffer and transcript
47
+ interface TranscribeOptions {
48
+ /** Stop after this many decoded tokens (default 194). */
49
+ maxNewTokens?: number;
213
50
  }
214
51
 
215
- interface StreamingTranscriptionOptions {
216
- chunkDuration?: number; // Interval in ms (default: 1500)
217
- minChunkSize?: number; // Min samples before transcribing (default: 8000)
218
- onChunk?: (text: string, chunkIndex: number) => void;
219
- onTranscript?: (fullText: string) => void;
220
- onError?: (error: string) => void;
221
- language?: string; // Language hint for multilingual models
52
+ interface TranscribeResult {
53
+ text: string;
54
+ /** Decoded token ids (excluding the start token, including trailing EOS). */
55
+ tokens: number[];
56
+ /** Encoder frames produced by the conv frontend. */
57
+ encoderFrames: number;
58
+ /** Audio duration in seconds (samples / 16000). */
59
+ audioSeconds: number;
222
60
  }
223
61
  ```
224
62
 
225
- ### Streaming with Microphone (Node.js)
63
+ `pcm` must be raw 16 kHz mono `Float32Array` samples. If your audio is a WAV file or at a
64
+ different sample rate, decode and resample it to 16 kHz first.
226
65
 
227
- ```typescript
228
- import { Gerbil, StreamingMicrophone } from "@tryhamster/gerbil";
229
-
230
- const g = new Gerbil();
231
- await g.loadSTT("whisper-tiny.en");
232
-
233
- // Create streaming transcription session
234
- const session = await g.createStreamingTranscription({
235
- chunkDuration: 2500,
236
- onChunk: (text, idx) => console.log(`[${idx}] ${text}`),
237
- onTranscript: (full) => console.log("Transcript:", full),
238
- });
239
-
240
- session.start();
241
-
242
- // Create streaming microphone that feeds to session
243
- const mic = new StreamingMicrophone({
244
- onAudioChunk: (audio) => session.feedAudio(audio),
245
- chunkDuration: 500, // 0.5s audio chunks
246
- });
247
-
248
- await mic.start();
249
- console.log("Recording... Press Ctrl+C to stop");
250
-
251
- // Handle graceful shutdown
252
- process.on("SIGINT", async () => {
253
- await mic.stop();
254
- const transcript = await session.stop();
255
- console.log("\nFinal:", transcript);
256
- process.exit(0);
257
- });
258
- ```
259
-
260
- ## React Hook (Browser)
261
-
262
- Use `useVoiceInput` for browser-based voice recording and transcription:
263
-
264
- ```tsx
265
- import { useVoiceInput } from "@tryhamster/gerbil/browser";
266
-
267
- function VoiceInput() {
268
- const {
269
- startRecording,
270
- stopRecording,
271
- isRecording,
272
- isTranscribing,
273
- transcript,
274
- error,
275
- } = useVoiceInput({
276
- model: "whisper-tiny.en",
277
- onTranscript: (text) => console.log("User said:", text),
278
- });
279
-
280
- return (
281
- <div>
282
- <button onClick={isRecording ? stopRecording : startRecording}>
283
- {isRecording ? "🔴 Stop" : "🎤 Record"}
284
- </button>
285
- {isTranscribing && <span>Transcribing...</span>}
286
- {transcript && <p>{transcript}</p>}
287
- </div>
288
- );
289
- }
290
- ```
291
-
292
- ### Hook Options
66
+ ### Decoding / resampling helpers
293
67
 
294
68
  ```typescript
295
- const {
296
- startRecording, // () => Promise<void> - start recording
297
- stopRecording, // () => Promise<string> - stop and transcribe
298
- cancelRecording, // () => void - stop without transcribing
299
- transcribe, // (audio: Float32Array) => Promise<string>
300
- isRecording, // boolean - currently recording
301
- isTranscribing, // boolean - transcribing audio
302
- isLoading, // boolean - model loading
303
- isReady, // boolean - model ready
304
- transcript, // string - latest result
305
- loadingProgress, // STTProgress - loading status
306
- error, // string | null
307
- load, // () => void - manually load model
308
- } = useVoiceInput({
309
- model: "whisper-tiny.en", // STT model ID
310
- autoLoad: false, // loads on first record
311
- onReady: () => {}, // called when model ready
312
- onTranscript: (text) => {},// called on each transcription
313
- onError: (error) => {}, // called on errors
314
- onProgress: (p) => {}, // loading progress
315
- });
316
- ```
317
-
318
- ## CLI Commands
319
-
320
- ### Transcribe
321
-
322
- ```bash
323
- # Basic transcription
324
- gerbil transcribe recording.wav
325
-
326
- # With timestamps
327
- gerbil transcribe recording.wav --timestamps
328
-
329
- # Save to file
330
- gerbil transcribe recording.wav --output transcript.txt
331
-
332
- # Use specific model
333
- gerbil transcribe recording.wav --model whisper-base.en
69
+ // Reuse the audio utilities from the core package to get 16 kHz mono PCM.
70
+ import { decodeWav, resampleAudio } from "@tryhamster/gerbil/core/stt";
334
71
 
335
- # Multilingual with language hint
336
- gerbil transcribe recording.wav --model whisper-base --language es
72
+ const { audio, sampleRate } = decodeWav(wavUint8Array);
73
+ const pcm16k = sampleRate === 16000 ? audio : resampleAudio(audio, sampleRate, 16000);
337
74
 
338
- # List models
339
- gerbil transcribe --list-models
75
+ const { text } = await stt.transcribe(pcm16k);
340
76
  ```
341
77
 
342
- ### Voice Chat (STT → LLM → TTS)
343
-
344
- Complete voice conversation loop from audio file:
345
-
346
- ```bash
347
- # Voice chat with audio file
348
- gerbil voice question.wav
78
+ ## Model
349
79
 
350
- # Customize models and voice
351
- gerbil voice question.wav --model qwen3-1.7b --voice bf_emma
80
+ ### Moonshine
352
81
 
353
- # With custom system prompt
354
- gerbil voice question.wav --system "You are a pirate. Speak like one!"
82
+ - **Architecture:** `MoonshineForConditionalGeneration` (encoder-decoder)
83
+ - **Front-end:** raw-waveform Conv1d (no FFT / log-mel)
84
+ - **Default repo:** `UsefulSensors/moonshine-base`
85
+ - **Sample rate:** 16 kHz mono
86
+ - **Language:** English (the native engine is English-only; multilingual Whisper has been removed).
355
87
 
356
- # Enable thinking mode
357
- gerbil voice question.wav --thinking
358
- ```
359
-
360
- This command:
361
- 1. Transcribes your audio (Whisper)
362
- 2. Generates a response (LLM)
363
- 3. Speaks the response (Kokoro TTS)
88
+ ---
364
89
 
365
- ### Microphone Recording
90
+ ## `Gerbil`-class `transcribe()` (native wrapper)
366
91
 
367
- Record directly from your microphone:
92
+ > The Whisper ONNX/transformers.js lane (and its multilingual support) has been removed. The
93
+ > `Gerbil`-class `transcribe()` method now runs native Moonshine: it requires WebGPU,
94
+ > accepts 16 kHz mono `Float32Array` PCM (not WAV bytes), is English-only, and does not
95
+ > produce timestamps. The browser `useVoiceInput` hook is gone — use `useSTT` from
96
+ > `@tryhamster/gerbil/gpu/hooks`. The AI SDK `gerbil.transcription()` provider also routes
97
+ > through this native path.
368
98
 
369
99
  ```typescript
370
100
  import { Gerbil } from "@tryhamster/gerbil";
371
101
 
372
102
  const g = new Gerbil();
373
-
374
- // Record for 5 seconds and transcribe
375
- const result = await g.listen(5000);
103
+ const pcm16kMono = /* Float32Array @ 16 kHz */;
104
+ const result = await g.transcribe(pcm16kMono);
376
105
  console.log(result.text);
377
-
378
- // Check if microphone is available
379
- const available = await g.isMicrophoneAvailable();
380
- ```
381
-
382
- **Requirements:** SoX must be installed for microphone recording:
383
- - macOS: `brew install sox`
384
- - Ubuntu: `sudo apt install sox`
385
- - Windows: Download from https://sox.sourceforge.net/
386
-
387
- ### REPL Voice Input
388
-
389
- In the Gerbil REPL chat:
390
- - Press `Ctrl+R` to record from microphone (5 seconds)
391
- - The transcribed text appears in the input field
392
- - Combined with voice mode (`Ctrl+V`), enables full voice conversations
393
-
394
- ```bash
395
- gerbil chat
396
- # In chat: Press ^R to record, ^V to enable voice responses
397
- ```
398
-
399
- ## Performance
400
-
401
- Transcription speed on Apple M1:
402
-
403
- | Model | Audio Length | Time | Speed |
404
- |-------|--------------|------|-------|
405
- | whisper-tiny.en | 11s | ~250ms | 44x realtime |
406
- | whisper-base.en | 11s | ~500ms | 22x realtime |
407
- | whisper-small.en | 11s | ~1.5s | 7x realtime |
408
-
409
- ## Troubleshooting
410
-
411
- ### "Unsupported bit depth"
412
- Only 16-bit WAV files are supported. Convert with:
413
- ```bash
414
- ffmpeg -i audio.mp3 -ar 16000 -ac 1 -sample_fmt s16 output.wav
415
- ```
416
-
417
- ### Slow first transcription
418
- The first call downloads the model (~40-250MB). Subsequent calls use the cached model.
419
-
420
- ### Memory usage
421
- Whisper models require:
422
- - tiny: ~150MB RAM
423
- - base: ~300MB RAM
424
- - small: ~600MB RAM
425
- - large-v3-turbo: ~1.5GB RAM
426
-
427
- ## useVoiceChat Hook (Full Voice Conversation)
428
-
429
- For complete voice-to-voice conversations (STT → LLM → TTS):
430
-
431
- ```tsx
432
- import { useVoiceChat } from "@tryhamster/gerbil/browser";
433
-
434
- function VoiceAssistant() {
435
- const {
436
- messages,
437
- startListening,
438
- stopListening,
439
- isListening,
440
- isSpeaking,
441
- stage, // "idle" | "listening" | "transcribing" | "thinking" | "speaking"
442
- isReady,
443
- } = useVoiceChat({
444
- llmModel: "qwen3-0.6b",
445
- sttModel: "whisper-tiny.en",
446
- voice: "af_bella",
447
- system: "You are a helpful voice assistant.",
448
- });
449
-
450
- return (
451
- <div>
452
- {messages.map(m => <p key={m.id}>{m.role}: {m.content}</p>)}
453
- <button
454
- onMouseDown={startListening}
455
- onMouseUp={stopListening}
456
- >
457
- {stage === "idle" ? "🎤 Hold to Speak" : stage}
458
- </button>
459
- </div>
460
- );
461
- }
462
- ```
463
-
464
- ## Adding Custom Models
465
-
466
- Gerbil supports any ONNX model with the `transformers.js` tag on HuggingFace:
467
-
468
- ```typescript
469
- // Use any onnx-community model
470
- await g.loadSTT("whisper-large-v3-turbo");
471
-
472
- // Browse compatible models at:
473
- // https://huggingface.co/models?library=transformers.js&pipeline_tag=automatic-speech-recognition
474
- ```
475
-
476
- To add a new model to the registry, edit `src/core/stt.ts`:
477
-
478
- ```typescript
479
- {
480
- id: "my-custom-model",
481
- repo: "onnx-community/my-model",
482
- description: "My custom model",
483
- size: "100M",
484
- multilingual: true,
485
- languages: ["en", "es"],
486
- sampleRate: 16000,
487
- }
488
106
  ```
489
107
 
490
108
  ## See Also
491
109
 
492
- - [Text-to-Speech (TTS)](./tts.md) - Speech synthesis
493
- - [Browser Hooks](./browser.md) - useChat, useVoiceInput, useVoiceChat
494
-
110
+ - [Text-to-Speech (TTS)](./tts.md) — native Kani-TTS-2
111
+ - [Browser Hooks](./browser.md) — React hooks