@tryhamster/gerbil 1.0.0-rc.9 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -1
- package/README.md +318 -104
- package/dist/architectures-C1I5V3Dt.mjs +6070 -0
- package/dist/architectures-C1I5V3Dt.mjs.map +1 -0
- package/dist/browser/index.d.ts +276 -590
- package/dist/browser/index.d.ts.map +1 -1
- package/dist/browser/index.js +592 -2334
- package/dist/browser/index.js.map +1 -1
- package/dist/cli.mjs +625 -1098
- package/dist/cli.mjs.map +1 -1
- package/dist/defaults-9komdrbY.mjs +24 -0
- package/dist/defaults-9komdrbY.mjs.map +1 -0
- package/dist/frameworks/express.d.mts +1 -3
- package/dist/frameworks/express.d.mts.map +1 -1
- package/dist/frameworks/express.mjs +7 -7
- package/dist/frameworks/express.mjs.map +1 -1
- package/dist/frameworks/fastify.d.mts +1 -1
- package/dist/frameworks/fastify.d.mts.map +1 -1
- package/dist/frameworks/fastify.mjs +3 -3
- package/dist/frameworks/fastify.mjs.map +1 -1
- package/dist/frameworks/hono.d.mts +1 -1
- package/dist/frameworks/hono.d.mts.map +1 -1
- package/dist/frameworks/hono.mjs +4 -4
- package/dist/frameworks/hono.mjs.map +1 -1
- package/dist/frameworks/next.d.mts +3 -2
- package/dist/frameworks/next.d.mts.map +1 -1
- package/dist/frameworks/next.mjs +4 -4
- package/dist/frameworks/next.mjs.map +1 -1
- package/dist/frameworks/react.d.mts +1 -1
- package/dist/frameworks/trpc.d.mts +1 -1
- package/dist/frameworks/trpc.d.mts.map +1 -1
- package/dist/frameworks/trpc.mjs +4 -4
- package/dist/frameworks/trpc.mjs.map +1 -1
- package/dist/gerbil-BetB5xb0.d.mts +488 -0
- package/dist/gerbil-BetB5xb0.d.mts.map +1 -0
- package/dist/gerbil-CTZUa8EZ.mjs +4 -0
- package/dist/gerbil-DNniplr4.mjs +1656 -0
- package/dist/gerbil-DNniplr4.mjs.map +1 -0
- package/dist/gpu/hooks.d.mts +640 -0
- package/dist/gpu/hooks.d.mts.map +1 -0
- package/dist/gpu/hooks.mjs +1369 -0
- package/dist/gpu/hooks.mjs.map +1 -0
- package/dist/gpu/index.d.mts +2 -0
- package/dist/gpu/index.mjs +6 -0
- package/dist/gpu-DFuglcEx.mjs +3790 -0
- package/dist/gpu-DFuglcEx.mjs.map +1 -0
- package/dist/index-Dgmb2kE3.d.mts +245 -0
- package/dist/index-Dgmb2kE3.d.mts.map +1 -0
- package/dist/index-DukkJRMj.d.mts +2114 -0
- package/dist/index-DukkJRMj.d.mts.map +1 -0
- package/dist/index.d.mts +22 -487
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +13 -8
- package/dist/index.mjs.map +1 -1
- package/dist/indexeddb-store-BWIMtxxH.mjs +103 -0
- package/dist/indexeddb-store-BWIMtxxH.mjs.map +1 -0
- package/dist/indexeddb-store-ClH12Xnl.mjs +4 -0
- package/dist/integrations/ai-sdk.d.mts +75 -6
- package/dist/integrations/ai-sdk.d.mts.map +1 -1
- package/dist/integrations/ai-sdk.mjs +131 -15
- package/dist/integrations/ai-sdk.mjs.map +1 -1
- package/dist/integrations/langchain.d.mts +1 -1
- package/dist/integrations/langchain.d.mts.map +1 -1
- package/dist/integrations/langchain.mjs +5 -5
- package/dist/integrations/langchain.mjs.map +1 -1
- package/dist/integrations/llamaindex.d.mts +1 -1
- package/dist/integrations/llamaindex.d.mts.map +1 -1
- package/dist/integrations/llamaindex.mjs +5 -5
- package/dist/integrations/llamaindex.mjs.map +1 -1
- package/dist/integrations/mcp-client.mjs +3 -3
- package/dist/integrations/mcp-client.mjs.map +1 -1
- package/dist/integrations/mcp.d.mts +3 -2
- package/dist/integrations/mcp.d.mts.map +1 -1
- package/dist/integrations/mcp.mjs +5 -5
- package/dist/{mcp-BvbriaBy.mjs → mcp-D2vvH1Xc.mjs} +4 -4
- package/dist/mcp-D2vvH1Xc.mjs.map +1 -0
- package/dist/memory/index.d.mts +3 -0
- package/dist/memory/index.mjs +6 -0
- package/dist/memory-D1P7Tmda.mjs +4 -0
- package/dist/memory-DVN0MnIG.mjs +132 -0
- package/dist/memory-DVN0MnIG.mjs.map +1 -0
- package/dist/memory-Dj0J1v88.mjs +294 -0
- package/dist/memory-Dj0J1v88.mjs.map +1 -0
- package/dist/moonshine-stt-17dpP1kr.mjs +4 -0
- package/dist/moonshine-stt-4ojLtMq7.mjs +11962 -0
- package/dist/moonshine-stt-4ojLtMq7.mjs.map +1 -0
- package/dist/{one-liner-s-lD8rCC.mjs → one-liner-JhdIPxzF.mjs} +14 -16
- package/dist/one-liner-JhdIPxzF.mjs.map +1 -0
- package/dist/repl-BDRkwPGX.mjs +9 -0
- package/dist/skills/index.d.mts +270 -320
- package/dist/skills/index.d.mts.map +1 -1
- package/dist/skills/index.mjs +5 -5
- package/dist/{skills-CD3Orlex.mjs → skills-CU694Dc8.mjs} +187 -32
- package/dist/skills-CU694Dc8.mjs.map +1 -0
- package/dist/{tools-Bi1P7Xoy.mjs → tools-DQ1mPUw5.mjs} +34 -22
- package/dist/tools-DQ1mPUw5.mjs.map +1 -0
- package/dist/types-DQBe2lFo.d.mts +165 -0
- package/dist/types-DQBe2lFo.d.mts.map +1 -0
- package/dist/{types-CiTc7ez3.d.mts → types-LlyYILII.d.mts} +112 -14
- package/dist/types-LlyYILII.d.mts.map +1 -0
- package/dist/{utils-CZBZ8dgR.mjs → utils-DKO55ZmZ.mjs} +1 -1
- package/dist/{utils-CZBZ8dgR.mjs.map → utils-DKO55ZmZ.mjs.map} +1 -1
- package/dist/vector-B0panuy6.mjs +95 -0
- package/dist/vector-B0panuy6.mjs.map +1 -0
- package/docs/PROJECT-STATE.md +321 -0
- package/docs/adding-a-model-family.md +280 -0
- package/docs/ai-sdk.md +70 -61
- package/docs/architecture/overview.md +17 -7
- package/docs/browser.md +203 -8
- package/docs/embeddings.md +156 -0
- package/docs/gerbil-site-native-migration.md +217 -0
- package/docs/gpu-engine/architectures.md +398 -0
- package/docs/gpu-engine/ir.md +372 -0
- package/docs/gpu-engine/kernels.md +718 -0
- package/docs/gpu-engine/paper.html +1759 -0
- package/docs/gpu-engine/paper.md +2109 -0
- package/docs/gpu-engine/safetensors.md +312 -0
- package/docs/gpu-engine/tokenizer.md +302 -0
- package/docs/memory-rag.md +91 -0
- package/docs/metal-safari-intel.md +190 -0
- package/docs/mobile-failure-diagnosis.md +124 -0
- package/docs/mobile.md +99 -0
- package/docs/observability.md +230 -0
- package/docs/onnx-removal-plan.md +339 -0
- package/docs/research/autoresearch-portable.md +904 -0
- package/docs/research/dispatch-reduction-hivemind.md +84 -0
- package/docs/research/ios-safari-model-caching.md +117 -0
- package/docs/research/mobile-webgpu-speed-fusion.md +135 -0
- package/docs/research/native-stt-model-selection.md +49 -0
- package/docs/research/native-tts-model-selection.md +90 -0
- package/docs/research/native-vs-chromium-decision.md +152 -0
- package/docs/research/nemotron-mamba2-inference.md +910 -0
- package/docs/research/qwen35-multimodal.md +293 -0
- package/docs/research/qwen36-gemma4-targets.md +337 -0
- package/docs/research/sota-embedding-models.md +179 -0
- package/docs/research/sota-mobile-models-2026.md +263 -0
- package/docs/research/sota-modality-models.md +202 -0
- package/docs/research/tps-baselines.md +71 -0
- package/docs/research/webgpu-m4-reference.md +104 -0
- package/docs/site-update-plan.md +155 -0
- package/docs/structured-output.md +123 -0
- package/docs/stt.md +63 -446
- package/docs/tts.md +77 -499
- package/docs/vision.md +100 -338
- package/package.json +22 -7
- package/dist/chrome-backend-CORwaIyC.mjs +0 -1212
- package/dist/chrome-backend-CORwaIyC.mjs.map +0 -1
- package/dist/chrome-backend-DIKYoWj-.mjs +0 -3
- package/dist/gerbil-CJ3ifloF.mjs +0 -4
- package/dist/gerbil-Dw4Qj77e.mjs +0 -1631
- package/dist/gerbil-Dw4Qj77e.mjs.map +0 -1
- package/dist/gerbil-qOTe1nl2.d.mts +0 -431
- package/dist/gerbil-qOTe1nl2.d.mts.map +0 -1
- package/dist/kokoro-BNTb6egA.mjs +0 -20210
- package/dist/kokoro-BNTb6egA.mjs.map +0 -1
- package/dist/kokoro-CMOGDSgT.js +0 -20212
- package/dist/kokoro-CMOGDSgT.js.map +0 -1
- package/dist/mcp-BvbriaBy.mjs.map +0 -1
- package/dist/one-liner-s-lD8rCC.mjs.map +0 -1
- package/dist/repl-DveXw36T.mjs +0 -9
- package/dist/skills-CD3Orlex.mjs.map +0 -1
- package/dist/stt-Bu-E23Sc.js +0 -433
- package/dist/stt-Bu-E23Sc.js.map +0 -1
- package/dist/stt-CpLYbGFd.mjs +0 -433
- package/dist/stt-CpLYbGFd.mjs.map +0 -1
- package/dist/stt-DRPLEEHB.mjs +0 -3
- package/dist/tools-Bi1P7Xoy.mjs.map +0 -1
- package/dist/transformers.web-DiD1gTwk.js +0 -44695
- package/dist/transformers.web-DiD1gTwk.js.map +0 -1
- package/dist/transformers.web-u34VxRFM.js +0 -3
- package/dist/tts-CqroPaSK.js +0 -724
- package/dist/tts-CqroPaSK.js.map +0 -1
- package/dist/tts-DXgsKGCe.mjs +0 -3
- package/dist/tts-DeGANMNV.mjs +0 -730
- package/dist/tts-DeGANMNV.mjs.map +0 -1
- package/dist/types-CiTc7ez3.d.mts.map +0 -1
- /package/dist/{auto-update-S9s5-g0C.mjs → auto-update-BVaLXcDE.mjs} +0 -0
- /package/dist/{chunk-CkXuGtQK.mjs → chunk-B9cbKln6.mjs} +0 -0
- /package/dist/{microphone-DaMZFRuR.mjs → microphone-Bqmoz9_K.mjs} +0 -0
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
# Structured Output
|
|
2
|
+
|
|
3
|
+
Gerbil can make a model return a **JSON object** instead of free-form text. The engine
|
|
4
|
+
generates, extracts the JSON from the output, validates it, and — if it fails — retries with
|
|
5
|
+
a corrective nudge until the result is valid or the retry budget is exhausted.
|
|
6
|
+
|
|
7
|
+
This is available at four levels:
|
|
8
|
+
|
|
9
|
+
- **Engine** — `WebGPUEngine.generateObject()` (lowest level)
|
|
10
|
+
- **`Gerbil` class** — `g.generateObject()` (Node-facing wrapper)
|
|
11
|
+
- **One-liner** — `generateObject()` (zero-setup singleton)
|
|
12
|
+
- **React** — the `useObject` hook (`@tryhamster/gerbil/gpu/hooks`)
|
|
13
|
+
|
|
14
|
+
> `generateObject` is distinct from the Zod-driven `Gerbil.json()`. `json()` parses against a
|
|
15
|
+
> Zod schema; `generateObject()` validates with a predicate or a minimal
|
|
16
|
+
> `{ required: [...] }` schema and is the same code path the `useObject` hook uses.
|
|
17
|
+
|
|
18
|
+
## Validation
|
|
19
|
+
|
|
20
|
+
The `schema` option is an `ObjectValidator`, one of:
|
|
21
|
+
|
|
22
|
+
- **A predicate** `(o: unknown) => boolean` — full control; return `true` when the parsed
|
|
23
|
+
value is acceptable.
|
|
24
|
+
- **A minimal schema object** `{ required: string[]; properties?: Record<string, unknown> }`
|
|
25
|
+
— only the `required` key set is enforced (every key listed in `required` must exist on the parsed object).
|
|
26
|
+
- **Omitted** — any syntactically valid JSON value (object or array) is accepted.
|
|
27
|
+
|
|
28
|
+
`maxRetries` (default `4`) is the number of retries **after** the first attempt, so up to
|
|
29
|
+
`maxRetries + 1` generations run. The result includes `attempts` (1 = first try).
|
|
30
|
+
|
|
31
|
+
## Engine
|
|
32
|
+
|
|
33
|
+
```typescript
|
|
34
|
+
import { WebGPUEngine } from "@tryhamster/gerbil/gpu";
|
|
35
|
+
|
|
36
|
+
const engine = await WebGPUEngine.create({ repo: "mlx-community/Qwen3.5-0.8B-4bit" });
|
|
37
|
+
|
|
38
|
+
const { object, text, attempts } = await engine.generateObject(
|
|
39
|
+
'Extract {name, age} from: "I am Sarah, 28"',
|
|
40
|
+
{ schema: { required: ["name", "age"] }, maxRetries: 4 },
|
|
41
|
+
);
|
|
42
|
+
// object === { name: "Sarah", age: 28 }
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
`generateObject` extends the engine's `GenerateOptions`, so `maxTokens`, `systemPrompt`, and
|
|
46
|
+
`sampling` (e.g. `{ temperature: 0.2 }`) are all available.
|
|
47
|
+
|
|
48
|
+
## `Gerbil` class
|
|
49
|
+
|
|
50
|
+
```typescript
|
|
51
|
+
import { Gerbil } from "@tryhamster/gerbil";
|
|
52
|
+
|
|
53
|
+
const g = new Gerbil();
|
|
54
|
+
await g.loadModel("qwen3.5-0.8b");
|
|
55
|
+
|
|
56
|
+
const { object } = await g.generateObject<{ primes: number[] }>(
|
|
57
|
+
"List the first 3 primes as {primes: number[]}",
|
|
58
|
+
{ schema: (o) => Array.isArray((o as any).primes) },
|
|
59
|
+
);
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
## One-liner
|
|
63
|
+
|
|
64
|
+
```typescript
|
|
65
|
+
import { generateObject } from "@tryhamster/gerbil";
|
|
66
|
+
|
|
67
|
+
const { object } = await generateObject(
|
|
68
|
+
"Return {ok: true}",
|
|
69
|
+
{ model: "qwen3.5-0.8b", schema: { required: ["ok"] } },
|
|
70
|
+
);
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
## React (`useObject`)
|
|
74
|
+
|
|
75
|
+
```tsx
|
|
76
|
+
import { useObject } from "@tryhamster/gerbil/gpu/hooks";
|
|
77
|
+
|
|
78
|
+
function CityExtractor() {
|
|
79
|
+
const { generate, object, isGenerating, attempts } = useObject<{ city: string }>({
|
|
80
|
+
model: "mlx-community/Qwen3.5-0.8B-4bit",
|
|
81
|
+
});
|
|
82
|
+
|
|
83
|
+
return (
|
|
84
|
+
<div>
|
|
85
|
+
<button
|
|
86
|
+
disabled={isGenerating}
|
|
87
|
+
onClick={() =>
|
|
88
|
+
generate("Extract the city: I live in Paris", { schema: { required: ["city"] } })
|
|
89
|
+
}
|
|
90
|
+
>
|
|
91
|
+
Extract
|
|
92
|
+
</button>
|
|
93
|
+
{object && <pre>{JSON.stringify(object, null, 2)} ({attempts} attempts)</pre>}
|
|
94
|
+
</div>
|
|
95
|
+
);
|
|
96
|
+
}
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
## CLI
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
# Inline, no schema (any valid JSON accepted)
|
|
103
|
+
gerbil object "Return {ok: true} as JSON"
|
|
104
|
+
|
|
105
|
+
# With a schema file (a minimal { required: [...] } validator)
|
|
106
|
+
echo '{ "required": ["name", "age"] }' > person.json
|
|
107
|
+
gerbil object "Extract {name, age}: I am Sarah, 28" --schema person.json
|
|
108
|
+
|
|
109
|
+
# Flags: --model, --max-tokens, --temperature, --retries, --schema
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
The command prints the parsed object as pretty JSON followed by the attempt count.
|
|
113
|
+
|
|
114
|
+
## Types
|
|
115
|
+
|
|
116
|
+
```typescript
|
|
117
|
+
import type {
|
|
118
|
+
GenerateObjectOptions,
|
|
119
|
+
GenerateObjectResult,
|
|
120
|
+
ObjectSchema,
|
|
121
|
+
ObjectValidator,
|
|
122
|
+
} from "@tryhamster/gerbil";
|
|
123
|
+
```
|
package/docs/stt.md
CHANGED
|
@@ -1,494 +1,111 @@
|
|
|
1
1
|
# Speech-to-Text (STT)
|
|
2
2
|
|
|
3
|
-
Gerbil
|
|
3
|
+
Gerbil transcribes audio with **Moonshine** — a native, raw-waveform encoder/decoder STT
|
|
4
|
+
model (no FFT, no log-mel) running on the WebGPU engine. It uses a Conv1d front-end over the
|
|
5
|
+
raw waveform, encodes once, then autoregressively decodes with cross-attention into the
|
|
6
|
+
frozen encoder K/V. No ONNX, no API keys.
|
|
7
|
+
|
|
8
|
+
> **Pre-1.0.** Moonshine is the only STT path. The old Whisper ONNX/transformers.js lane has
|
|
9
|
+
> been removed. The `Gerbil`-class `transcribe()` method still works but now runs native
|
|
10
|
+
> Moonshine under the hood (see [below](#gerbil-class-transcribe-native-wrapper)).
|
|
4
11
|
|
|
5
12
|
## Quick Start
|
|
6
13
|
|
|
7
|
-
|
|
14
|
+
Moonshine has its own engine class, `MoonshineSTT`. Feed it raw **16 kHz mono PCM**
|
|
15
|
+
(`Float32Array`).
|
|
8
16
|
|
|
9
17
|
```typescript
|
|
10
|
-
import {
|
|
11
|
-
import { readFileSync } from "fs";
|
|
18
|
+
import { MoonshineSTT } from "@tryhamster/gerbil/gpu";
|
|
12
19
|
|
|
13
|
-
const
|
|
20
|
+
const stt = await MoonshineSTT.create({ repo: "UsefulSensors/moonshine-base" });
|
|
14
21
|
|
|
15
|
-
//
|
|
16
|
-
const
|
|
17
|
-
const result = await g.transcribe(audioData);
|
|
22
|
+
// pcm: Float32Array, 16 kHz mono
|
|
23
|
+
const result = await stt.transcribe(pcm);
|
|
18
24
|
console.log(result.text);
|
|
19
|
-
```
|
|
20
|
-
|
|
21
|
-
### CLI
|
|
22
25
|
|
|
23
|
-
|
|
24
|
-
# Transcribe audio file
|
|
25
|
-
gerbil transcribe audio.wav
|
|
26
|
-
|
|
27
|
-
# With timestamps
|
|
28
|
-
gerbil transcribe audio.wav --timestamps
|
|
29
|
-
|
|
30
|
-
# List available models
|
|
31
|
-
gerbil transcribe --list-models
|
|
32
|
-
|
|
33
|
-
# Use a different model
|
|
34
|
-
gerbil transcribe audio.wav --model whisper-base.en
|
|
26
|
+
stt.destroy();
|
|
35
27
|
```
|
|
36
28
|
|
|
37
|
-
##
|
|
38
|
-
|
|
39
|
-
| Model | Size | Languages | Description |
|
|
40
|
-
|-------|------|-----------|-------------|
|
|
41
|
-
| `whisper-tiny.en` | 39M | English | Fastest, good for simple audio |
|
|
42
|
-
| `whisper-tiny` | 39M | Multi | Multilingual tiny model |
|
|
43
|
-
| `whisper-base.en` | 74M | English | Good balance of speed/accuracy |
|
|
44
|
-
| `whisper-base` | 74M | Multi | Multilingual base model |
|
|
45
|
-
| `whisper-small.en` | 244M | English | High quality transcription |
|
|
46
|
-
| `whisper-small` | 244M | Multi | High quality multilingual |
|
|
47
|
-
| `whisper-large-v3-turbo` | 809M | 80+ langs | Best quality, 5.4x faster than v3 |
|
|
29
|
+
## API
|
|
48
30
|
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
### Gerbil Class Methods
|
|
31
|
+
### `MoonshineSTT.create(options?)`
|
|
52
32
|
|
|
53
33
|
```typescript
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
language: "en", // Language hint (multilingual models only)
|
|
62
|
-
timestamps: true, // Return segment timestamps
|
|
63
|
-
onProgress: (p) => console.log(p.status),
|
|
64
|
-
});
|
|
65
|
-
|
|
66
|
-
// Result structure
|
|
67
|
-
console.log(result.text); // Full transcription
|
|
68
|
-
console.log(result.duration); // Audio duration in seconds
|
|
69
|
-
console.log(result.totalTime); // Processing time in ms
|
|
70
|
-
console.log(result.segments); // Timestamped segments (if requested)
|
|
71
|
-
|
|
72
|
-
// Check if loaded
|
|
73
|
-
console.log(g.isSTTLoaded());
|
|
74
|
-
|
|
75
|
-
// List available models
|
|
76
|
-
const models = await g.listSTTModels();
|
|
77
|
-
```
|
|
78
|
-
|
|
79
|
-
### Audio Input Formats
|
|
80
|
-
|
|
81
|
-
```typescript
|
|
82
|
-
// WAV file (Uint8Array) - automatically decoded
|
|
83
|
-
const wavData = new Uint8Array(fs.readFileSync("audio.wav"));
|
|
84
|
-
await g.transcribe(wavData);
|
|
85
|
-
|
|
86
|
-
// Raw audio (Float32Array at 16kHz mono)
|
|
87
|
-
const raw16k = new Float32Array([...]); // 16kHz mono samples
|
|
88
|
-
await g.transcribe(raw16k);
|
|
89
|
-
|
|
90
|
-
// WAV at any sample rate - automatically resampled to 16kHz
|
|
91
|
-
const wav44k = new Uint8Array(fs.readFileSync("audio-44khz.wav"));
|
|
92
|
-
await g.transcribe(wav44k); // Resampled internally
|
|
93
|
-
```
|
|
94
|
-
|
|
95
|
-
### Transcription with Timestamps
|
|
96
|
-
|
|
97
|
-
```typescript
|
|
98
|
-
const result = await g.transcribe(audioData, { timestamps: true });
|
|
99
|
-
|
|
100
|
-
for (const segment of result.segments || []) {
|
|
101
|
-
console.log(`[${segment.start.toFixed(1)}s - ${segment.end.toFixed(1)}s] ${segment.text}`);
|
|
34
|
+
interface MoonshineSTTOptions {
|
|
35
|
+
/** HF repo (default "UsefulSensors/moonshine-base"). */
|
|
36
|
+
repo?: string;
|
|
37
|
+
revision?: string;
|
|
38
|
+
hfToken?: string;
|
|
39
|
+
cacheDir?: string;
|
|
40
|
+
onProgress?: (loaded: number, total: number, message: string) => void;
|
|
102
41
|
}
|
|
103
|
-
// [0.0s - 5.2s] Hello world, this is a test
|
|
104
|
-
// [5.2s - 8.0s] of the transcription system
|
|
105
|
-
```
|
|
106
|
-
|
|
107
|
-
### Language Detection (Multilingual Models)
|
|
108
|
-
|
|
109
|
-
```typescript
|
|
110
|
-
// Use multilingual model
|
|
111
|
-
await g.loadSTT("whisper-base");
|
|
112
|
-
|
|
113
|
-
// Let Whisper detect language
|
|
114
|
-
const result = await g.transcribe(audioData);
|
|
115
|
-
console.log(result.language); // "en", "es", "fr", etc.
|
|
116
|
-
|
|
117
|
-
// Or provide a language hint
|
|
118
|
-
const spanishResult = await g.transcribe(audioData, { language: "es" });
|
|
119
|
-
```
|
|
120
|
-
|
|
121
|
-
## Direct WhisperSTT Class
|
|
122
|
-
|
|
123
|
-
For lower-level control:
|
|
124
|
-
|
|
125
|
-
```typescript
|
|
126
|
-
import { WhisperSTT, decodeWav, resampleAudio } from "@tryhamster/gerbil/core/stt";
|
|
127
|
-
|
|
128
|
-
const stt = new WhisperSTT("whisper-tiny.en");
|
|
129
|
-
|
|
130
|
-
await stt.load({
|
|
131
|
-
onProgress: (p) => console.log(p.status),
|
|
132
|
-
});
|
|
133
|
-
|
|
134
|
-
const result = await stt.transcribe(audioData, {
|
|
135
|
-
timestamps: true,
|
|
136
|
-
});
|
|
137
|
-
|
|
138
|
-
console.log(result.text);
|
|
139
|
-
|
|
140
|
-
stt.dispose();
|
|
141
|
-
```
|
|
142
|
-
|
|
143
|
-
### Audio Utilities
|
|
144
|
-
|
|
145
|
-
```typescript
|
|
146
|
-
import { decodeWav, resampleAudio } from "@tryhamster/gerbil/core/stt";
|
|
147
|
-
|
|
148
|
-
// Decode WAV to Float32Array
|
|
149
|
-
const { audio, sampleRate } = decodeWav(wavUint8Array);
|
|
150
|
-
|
|
151
|
-
// Resample to 16kHz (Whisper requirement)
|
|
152
|
-
const audio16k = resampleAudio(audio, sampleRate, 16000);
|
|
153
|
-
```
|
|
154
|
-
|
|
155
|
-
## Streaming Transcription (Real-time)
|
|
156
|
-
|
|
157
|
-
Transcribe audio in chunks as it comes in - perfect for live captioning, call transcription, or real-time subtitles:
|
|
158
|
-
|
|
159
|
-
```typescript
|
|
160
|
-
import { Gerbil } from "@tryhamster/gerbil";
|
|
161
|
-
|
|
162
|
-
const g = new Gerbil();
|
|
163
|
-
await g.loadSTT("whisper-tiny.en");
|
|
164
|
-
|
|
165
|
-
// Create streaming session
|
|
166
|
-
const session = await g.createStreamingTranscription({
|
|
167
|
-
chunkDuration: 1500, // Transcribe every 1.5 seconds (default)
|
|
168
|
-
onChunk: (text, chunkIndex) => {
|
|
169
|
-
console.log(`Chunk ${chunkIndex}: ${text}`);
|
|
170
|
-
},
|
|
171
|
-
onTranscript: (fullText) => {
|
|
172
|
-
console.log("Full transcript:", fullText);
|
|
173
|
-
},
|
|
174
|
-
onError: (err) => console.error(err),
|
|
175
|
-
});
|
|
176
|
-
|
|
177
|
-
// Start the session (begins automatic interval-based transcription)
|
|
178
|
-
session.start();
|
|
179
|
-
|
|
180
|
-
// Feed audio data as it comes in (Float32Array at 16kHz)
|
|
181
|
-
// This could come from a microphone, WebRTC stream, etc.
|
|
182
|
-
session.feedAudio(audioChunk);
|
|
183
|
-
session.feedAudio(moreAudioChunk);
|
|
184
|
-
|
|
185
|
-
// Or manually trigger transcription of buffered audio
|
|
186
|
-
await session.flush();
|
|
187
|
-
|
|
188
|
-
// Get current state
|
|
189
|
-
console.log(session.getTranscript()); // Full accumulated text
|
|
190
|
-
console.log(session.getChunkCount()); // Number of chunks processed
|
|
191
|
-
console.log(session.isRunning()); // Whether session is active
|
|
192
|
-
|
|
193
|
-
// Stop and get final transcript
|
|
194
|
-
const finalText = await session.stop();
|
|
195
|
-
console.log("Final:", finalText);
|
|
196
|
-
|
|
197
|
-
// Reset for reuse
|
|
198
|
-
session.reset();
|
|
199
42
|
```
|
|
200
43
|
|
|
201
|
-
###
|
|
44
|
+
### `transcribe(pcm, options?)`
|
|
202
45
|
|
|
203
46
|
```typescript
|
|
204
|
-
interface
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
start(): void; // Start auto-transcription
|
|
208
|
-
stop(): Promise<string>; // Stop and get final text
|
|
209
|
-
isRunning(): boolean; // Check if running
|
|
210
|
-
getTranscript(): string; // Get current full text
|
|
211
|
-
getChunkCount(): number; // Number of chunks processed
|
|
212
|
-
reset(): void; // Clear buffer and transcript
|
|
47
|
+
interface TranscribeOptions {
|
|
48
|
+
/** Stop after this many decoded tokens (default 194). */
|
|
49
|
+
maxNewTokens?: number;
|
|
213
50
|
}
|
|
214
51
|
|
|
215
|
-
interface
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
52
|
+
interface TranscribeResult {
|
|
53
|
+
text: string;
|
|
54
|
+
/** Decoded token ids (excluding the start token, including trailing EOS). */
|
|
55
|
+
tokens: number[];
|
|
56
|
+
/** Encoder frames produced by the conv frontend. */
|
|
57
|
+
encoderFrames: number;
|
|
58
|
+
/** Audio duration in seconds (samples / 16000). */
|
|
59
|
+
audioSeconds: number;
|
|
222
60
|
}
|
|
223
61
|
```
|
|
224
62
|
|
|
225
|
-
|
|
63
|
+
`pcm` must be raw 16 kHz mono `Float32Array` samples. If your audio is a WAV file or at a
|
|
64
|
+
different sample rate, decode and resample it to 16 kHz first.
|
|
226
65
|
|
|
227
|
-
|
|
228
|
-
import { Gerbil, StreamingMicrophone } from "@tryhamster/gerbil";
|
|
229
|
-
|
|
230
|
-
const g = new Gerbil();
|
|
231
|
-
await g.loadSTT("whisper-tiny.en");
|
|
232
|
-
|
|
233
|
-
// Create streaming transcription session
|
|
234
|
-
const session = await g.createStreamingTranscription({
|
|
235
|
-
chunkDuration: 2500,
|
|
236
|
-
onChunk: (text, idx) => console.log(`[${idx}] ${text}`),
|
|
237
|
-
onTranscript: (full) => console.log("Transcript:", full),
|
|
238
|
-
});
|
|
239
|
-
|
|
240
|
-
session.start();
|
|
241
|
-
|
|
242
|
-
// Create streaming microphone that feeds to session
|
|
243
|
-
const mic = new StreamingMicrophone({
|
|
244
|
-
onAudioChunk: (audio) => session.feedAudio(audio),
|
|
245
|
-
chunkDuration: 500, // 0.5s audio chunks
|
|
246
|
-
});
|
|
247
|
-
|
|
248
|
-
await mic.start();
|
|
249
|
-
console.log("Recording... Press Ctrl+C to stop");
|
|
250
|
-
|
|
251
|
-
// Handle graceful shutdown
|
|
252
|
-
process.on("SIGINT", async () => {
|
|
253
|
-
await mic.stop();
|
|
254
|
-
const transcript = await session.stop();
|
|
255
|
-
console.log("\nFinal:", transcript);
|
|
256
|
-
process.exit(0);
|
|
257
|
-
});
|
|
258
|
-
```
|
|
259
|
-
|
|
260
|
-
## React Hook (Browser)
|
|
261
|
-
|
|
262
|
-
Use `useVoiceInput` for browser-based voice recording and transcription:
|
|
263
|
-
|
|
264
|
-
```tsx
|
|
265
|
-
import { useVoiceInput } from "@tryhamster/gerbil/browser";
|
|
266
|
-
|
|
267
|
-
function VoiceInput() {
|
|
268
|
-
const {
|
|
269
|
-
startRecording,
|
|
270
|
-
stopRecording,
|
|
271
|
-
isRecording,
|
|
272
|
-
isTranscribing,
|
|
273
|
-
transcript,
|
|
274
|
-
error,
|
|
275
|
-
} = useVoiceInput({
|
|
276
|
-
model: "whisper-tiny.en",
|
|
277
|
-
onTranscript: (text) => console.log("User said:", text),
|
|
278
|
-
});
|
|
279
|
-
|
|
280
|
-
return (
|
|
281
|
-
<div>
|
|
282
|
-
<button onClick={isRecording ? stopRecording : startRecording}>
|
|
283
|
-
{isRecording ? "🔴 Stop" : "🎤 Record"}
|
|
284
|
-
</button>
|
|
285
|
-
{isTranscribing && <span>Transcribing...</span>}
|
|
286
|
-
{transcript && <p>{transcript}</p>}
|
|
287
|
-
</div>
|
|
288
|
-
);
|
|
289
|
-
}
|
|
290
|
-
```
|
|
291
|
-
|
|
292
|
-
### Hook Options
|
|
66
|
+
### Decoding / resampling helpers
|
|
293
67
|
|
|
294
68
|
```typescript
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
stopRecording, // () => Promise<string> - stop and transcribe
|
|
298
|
-
cancelRecording, // () => void - stop without transcribing
|
|
299
|
-
transcribe, // (audio: Float32Array) => Promise<string>
|
|
300
|
-
isRecording, // boolean - currently recording
|
|
301
|
-
isTranscribing, // boolean - transcribing audio
|
|
302
|
-
isLoading, // boolean - model loading
|
|
303
|
-
isReady, // boolean - model ready
|
|
304
|
-
transcript, // string - latest result
|
|
305
|
-
loadingProgress, // STTProgress - loading status
|
|
306
|
-
error, // string | null
|
|
307
|
-
load, // () => void - manually load model
|
|
308
|
-
} = useVoiceInput({
|
|
309
|
-
model: "whisper-tiny.en", // STT model ID
|
|
310
|
-
autoLoad: false, // loads on first record
|
|
311
|
-
onReady: () => {}, // called when model ready
|
|
312
|
-
onTranscript: (text) => {},// called on each transcription
|
|
313
|
-
onError: (error) => {}, // called on errors
|
|
314
|
-
onProgress: (p) => {}, // loading progress
|
|
315
|
-
});
|
|
316
|
-
```
|
|
317
|
-
|
|
318
|
-
## CLI Commands
|
|
319
|
-
|
|
320
|
-
### Transcribe
|
|
321
|
-
|
|
322
|
-
```bash
|
|
323
|
-
# Basic transcription
|
|
324
|
-
gerbil transcribe recording.wav
|
|
325
|
-
|
|
326
|
-
# With timestamps
|
|
327
|
-
gerbil transcribe recording.wav --timestamps
|
|
328
|
-
|
|
329
|
-
# Save to file
|
|
330
|
-
gerbil transcribe recording.wav --output transcript.txt
|
|
331
|
-
|
|
332
|
-
# Use specific model
|
|
333
|
-
gerbil transcribe recording.wav --model whisper-base.en
|
|
69
|
+
// Reuse the audio utilities from the core package to get 16 kHz mono PCM.
|
|
70
|
+
import { decodeWav, resampleAudio } from "@tryhamster/gerbil/core/stt";
|
|
334
71
|
|
|
335
|
-
|
|
336
|
-
|
|
72
|
+
const { audio, sampleRate } = decodeWav(wavUint8Array);
|
|
73
|
+
const pcm16k = sampleRate === 16000 ? audio : resampleAudio(audio, sampleRate, 16000);
|
|
337
74
|
|
|
338
|
-
|
|
339
|
-
gerbil transcribe --list-models
|
|
75
|
+
const { text } = await stt.transcribe(pcm16k);
|
|
340
76
|
```
|
|
341
77
|
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
Complete voice conversation loop from audio file:
|
|
345
|
-
|
|
346
|
-
```bash
|
|
347
|
-
# Voice chat with audio file
|
|
348
|
-
gerbil voice question.wav
|
|
78
|
+
## Model
|
|
349
79
|
|
|
350
|
-
|
|
351
|
-
gerbil voice question.wav --model qwen3-1.7b --voice bf_emma
|
|
80
|
+
### Moonshine
|
|
352
81
|
|
|
353
|
-
|
|
354
|
-
|
|
82
|
+
- **Architecture:** `MoonshineForConditionalGeneration` (encoder-decoder)
|
|
83
|
+
- **Front-end:** raw-waveform Conv1d (no FFT / log-mel)
|
|
84
|
+
- **Default repo:** `UsefulSensors/moonshine-base`
|
|
85
|
+
- **Sample rate:** 16 kHz mono
|
|
86
|
+
- **Language:** English (the native engine is English-only; multilingual Whisper has been removed).
|
|
355
87
|
|
|
356
|
-
|
|
357
|
-
gerbil voice question.wav --thinking
|
|
358
|
-
```
|
|
359
|
-
|
|
360
|
-
This command:
|
|
361
|
-
1. Transcribes your audio (Whisper)
|
|
362
|
-
2. Generates a response (LLM)
|
|
363
|
-
3. Speaks the response (Kokoro TTS)
|
|
88
|
+
---
|
|
364
89
|
|
|
365
|
-
|
|
90
|
+
## `Gerbil`-class `transcribe()` (native wrapper)
|
|
366
91
|
|
|
367
|
-
|
|
92
|
+
> The Whisper ONNX/transformers.js lane (and its multilingual support) has been removed. The
|
|
93
|
+
> `Gerbil`-class `transcribe()` method now runs native Moonshine: it requires WebGPU,
|
|
94
|
+
> accepts 16 kHz mono `Float32Array` PCM (not WAV bytes), is English-only, and does not
|
|
95
|
+
> produce timestamps. The browser `useVoiceInput` hook is gone — use `useSTT` from
|
|
96
|
+
> `@tryhamster/gerbil/gpu/hooks`. The AI SDK `gerbil.transcription()` provider also routes
|
|
97
|
+
> through this native path.
|
|
368
98
|
|
|
369
99
|
```typescript
|
|
370
100
|
import { Gerbil } from "@tryhamster/gerbil";
|
|
371
101
|
|
|
372
102
|
const g = new Gerbil();
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
const result = await g.listen(5000);
|
|
103
|
+
declare const pcm16kMono: Float32Array; // your audio as 16 kHz mono PCM samples
|
|
104
|
+
const result = await g.transcribe(pcm16kMono);
|
|
376
105
|
console.log(result.text);
|
|
377
|
-
|
|
378
|
-
// Check if microphone is available
|
|
379
|
-
const available = await g.isMicrophoneAvailable();
|
|
380
|
-
```
|
|
381
|
-
|
|
382
|
-
**Requirements:** SoX must be installed for microphone recording:
|
|
383
|
-
- macOS: `brew install sox`
|
|
384
|
-
- Ubuntu: `sudo apt install sox`
|
|
385
|
-
- Windows: Download from https://sox.sourceforge.net/
|
|
386
|
-
|
|
387
|
-
### REPL Voice Input
|
|
388
|
-
|
|
389
|
-
In the Gerbil REPL chat:
|
|
390
|
-
- Press `Ctrl+R` to record from microphone (5 seconds)
|
|
391
|
-
- The transcribed text appears in the input field
|
|
392
|
-
- Combined with voice mode (`Ctrl+V`), enables full voice conversations
|
|
393
|
-
|
|
394
|
-
```bash
|
|
395
|
-
gerbil chat
|
|
396
|
-
# In chat: Press ^R to record, ^V to enable voice responses
|
|
397
|
-
```
|
|
398
|
-
|
|
399
|
-
## Performance
|
|
400
|
-
|
|
401
|
-
Transcription speed on Apple M1:
|
|
402
|
-
|
|
403
|
-
| Model | Audio Length | Time | Speed |
|
|
404
|
-
|-------|--------------|------|-------|
|
|
405
|
-
| whisper-tiny.en | 11s | ~250ms | 44x realtime |
|
|
406
|
-
| whisper-base.en | 11s | ~500ms | 22x realtime |
|
|
407
|
-
| whisper-small.en | 11s | ~1.5s | 7x realtime |
|
|
408
|
-
|
|
409
|
-
## Troubleshooting
|
|
410
|
-
|
|
411
|
-
### "Unsupported bit depth"
|
|
412
|
-
Only 16-bit WAV files are supported. Convert with:
|
|
413
|
-
```bash
|
|
414
|
-
ffmpeg -i audio.mp3 -ar 16000 -ac 1 -sample_fmt s16 output.wav
|
|
415
|
-
```
|
|
416
|
-
|
|
417
|
-
### Slow first transcription
|
|
418
|
-
The first call downloads the model (~40-250MB). Subsequent calls use the cached model.
|
|
419
|
-
|
|
420
|
-
### Memory usage
|
|
421
|
-
Whisper models require:
|
|
422
|
-
- tiny: ~150MB RAM
|
|
423
|
-
- base: ~300MB RAM
|
|
424
|
-
- small: ~600MB RAM
|
|
425
|
-
- large-v3-turbo: ~1.5GB RAM
|
|
426
|
-
|
|
427
|
-
## useVoiceChat Hook (Full Voice Conversation)
|
|
428
|
-
|
|
429
|
-
For complete voice-to-voice conversations (STT → LLM → TTS):
|
|
430
|
-
|
|
431
|
-
```tsx
|
|
432
|
-
import { useVoiceChat } from "@tryhamster/gerbil/browser";
|
|
433
|
-
|
|
434
|
-
function VoiceAssistant() {
|
|
435
|
-
const {
|
|
436
|
-
messages,
|
|
437
|
-
startListening,
|
|
438
|
-
stopListening,
|
|
439
|
-
isListening,
|
|
440
|
-
isSpeaking,
|
|
441
|
-
stage, // "idle" | "listening" | "transcribing" | "thinking" | "speaking"
|
|
442
|
-
isReady,
|
|
443
|
-
} = useVoiceChat({
|
|
444
|
-
llmModel: "qwen3-0.6b",
|
|
445
|
-
sttModel: "whisper-tiny.en",
|
|
446
|
-
voice: "af_bella",
|
|
447
|
-
system: "You are a helpful voice assistant.",
|
|
448
|
-
});
|
|
449
|
-
|
|
450
|
-
return (
|
|
451
|
-
<div>
|
|
452
|
-
{messages.map(m => <p key={m.id}>{m.role}: {m.content}</p>)}
|
|
453
|
-
<button
|
|
454
|
-
onMouseDown={startListening}
|
|
455
|
-
onMouseUp={stopListening}
|
|
456
|
-
>
|
|
457
|
-
{stage === "idle" ? "🎤 Hold to Speak" : stage}
|
|
458
|
-
</button>
|
|
459
|
-
</div>
|
|
460
|
-
);
|
|
461
|
-
}
|
|
462
|
-
```
|
|
463
|
-
|
|
464
|
-
## Adding Custom Models
|
|
465
|
-
|
|
466
|
-
Gerbil supports any ONNX model with the `transformers.js` tag on HuggingFace:
|
|
467
|
-
|
|
468
|
-
```typescript
|
|
469
|
-
// Use any onnx-community model
|
|
470
|
-
await g.loadSTT("whisper-large-v3-turbo");
|
|
471
|
-
|
|
472
|
-
// Browse compatible models at:
|
|
473
|
-
// https://huggingface.co/models?library=transformers.js&pipeline_tag=automatic-speech-recognition
|
|
474
|
-
```
|
|
475
|
-
|
|
476
|
-
To add a new model to the registry, edit `src/core/stt.ts`:
|
|
477
|
-
|
|
478
|
-
```typescript
|
|
479
|
-
{
|
|
480
|
-
id: "my-custom-model",
|
|
481
|
-
repo: "onnx-community/my-model",
|
|
482
|
-
description: "My custom model",
|
|
483
|
-
size: "100M",
|
|
484
|
-
multilingual: true,
|
|
485
|
-
languages: ["en", "es"],
|
|
486
|
-
sampleRate: 16000,
|
|
487
|
-
}
|
|
488
106
|
```
|
|
489
107
|
|
|
490
108
|
## See Also
|
|
491
109
|
|
|
492
|
-
- [Text-to-Speech (TTS)](./tts.md)
|
|
493
|
-
- [Browser Hooks](./browser.md)
|
|
494
|
-
|
|
110
|
+
- [Text-to-Speech (TTS)](./tts.md) — native Kani-TTS-2
|
|
111
|
+
- [Browser Hooks](./browser.md) — React hooks
|