@tryhamster/gerbil 1.0.0-rc.0 → 1.0.0-rc.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +79 -14
- package/dist/auto-update-DsWBBnEk.mjs +3 -0
- package/dist/browser/index.d.mts +401 -5
- package/dist/browser/index.d.mts.map +1 -1
- package/dist/browser/index.mjs +1772 -146
- package/dist/browser/index.mjs.map +1 -1
- package/dist/{chrome-backend-CtwPENIW.mjs → chrome-backend-JEPeM2YE.mjs} +1 -1
- package/dist/{chrome-backend-C5Un08O4.mjs → chrome-backend-Y9F7W5VQ.mjs} +514 -73
- package/dist/chrome-backend-Y9F7W5VQ.mjs.map +1 -0
- package/dist/cli.mjs +3359 -646
- package/dist/cli.mjs.map +1 -1
- package/dist/frameworks/express.d.mts +1 -1
- package/dist/frameworks/express.mjs +3 -3
- package/dist/frameworks/fastify.d.mts +1 -1
- package/dist/frameworks/fastify.mjs +3 -3
- package/dist/frameworks/hono.d.mts +1 -1
- package/dist/frameworks/hono.mjs +3 -3
- package/dist/frameworks/next.d.mts +2 -2
- package/dist/frameworks/next.mjs +3 -3
- package/dist/frameworks/react.d.mts +1 -1
- package/dist/frameworks/trpc.d.mts +1 -1
- package/dist/frameworks/trpc.mjs +3 -3
- package/dist/gerbil-DeQlX_Mt.mjs +5 -0
- package/dist/gerbil-POAz8peb.d.mts +431 -0
- package/dist/gerbil-POAz8peb.d.mts.map +1 -0
- package/dist/gerbil-yoSpRHgv.mjs +1463 -0
- package/dist/gerbil-yoSpRHgv.mjs.map +1 -0
- package/dist/index.d.mts +395 -9
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +8 -6
- package/dist/index.mjs.map +1 -1
- package/dist/integrations/ai-sdk.d.mts +122 -4
- package/dist/integrations/ai-sdk.d.mts.map +1 -1
- package/dist/integrations/ai-sdk.mjs +239 -11
- package/dist/integrations/ai-sdk.mjs.map +1 -1
- package/dist/integrations/langchain.d.mts +132 -2
- package/dist/integrations/langchain.d.mts.map +1 -1
- package/dist/integrations/langchain.mjs +176 -8
- package/dist/integrations/langchain.mjs.map +1 -1
- package/dist/integrations/llamaindex.d.mts +1 -1
- package/dist/integrations/llamaindex.mjs +3 -3
- package/dist/integrations/mcp-client.mjs +4 -4
- package/dist/integrations/mcp-client.mjs.map +1 -1
- package/dist/integrations/mcp.d.mts +2 -2
- package/dist/integrations/mcp.d.mts.map +1 -1
- package/dist/integrations/mcp.mjs +6 -6
- package/dist/{mcp-R8kRLIKb.mjs → mcp-Bitg4sjX.mjs} +10 -37
- package/dist/mcp-Bitg4sjX.mjs.map +1 -0
- package/dist/microphone-D-6y9aiE.mjs +3 -0
- package/dist/{models-DKULvhOr.mjs → models-BAtL8qsA.mjs} +42 -7
- package/dist/models-BAtL8qsA.mjs.map +1 -0
- package/dist/{models-De2-_GmQ.d.mts → models-CE0fBq0U.d.mts} +2 -2
- package/dist/models-CE0fBq0U.d.mts.map +1 -0
- package/dist/{one-liner-BUQR0nqq.mjs → one-liner-B1rmFto6.mjs} +2 -2
- package/dist/{one-liner-BUQR0nqq.mjs.map → one-liner-B1rmFto6.mjs.map} +1 -1
- package/dist/repl-D20JO260.mjs +10 -0
- package/dist/skills/index.d.mts +303 -12
- package/dist/skills/index.d.mts.map +1 -1
- package/dist/skills/index.mjs +6 -6
- package/dist/skills-5DxAV-rn.mjs +1435 -0
- package/dist/skills-5DxAV-rn.mjs.map +1 -0
- package/dist/stt-Bv_dum-R.mjs +433 -0
- package/dist/stt-Bv_dum-R.mjs.map +1 -0
- package/dist/stt-KzSoNvwI.mjs +3 -0
- package/dist/{tools-BsiEE6f2.mjs → tools-IYPrqoek.mjs} +6 -7
- package/dist/{tools-BsiEE6f2.mjs.map → tools-IYPrqoek.mjs.map} +1 -1
- package/dist/tts-5yWeP_I0.mjs +3 -0
- package/dist/tts-DG6denWG.mjs +729 -0
- package/dist/tts-DG6denWG.mjs.map +1 -0
- package/dist/types-s6Py2_DL.d.mts +353 -0
- package/dist/types-s6Py2_DL.d.mts.map +1 -0
- package/dist/{utils-7vXqtq2Q.mjs → utils-CkB4Roi6.mjs} +1 -1
- package/dist/{utils-7vXqtq2Q.mjs.map → utils-CkB4Roi6.mjs.map} +1 -1
- package/docs/ai-sdk.md +137 -21
- package/docs/browser.md +241 -2
- package/docs/memory.md +72 -0
- package/docs/stt.md +494 -0
- package/docs/tts.md +569 -0
- package/docs/vision.md +396 -0
- package/package.json +17 -18
- package/dist/auto-update-BbNHbSU1.mjs +0 -3
- package/dist/chrome-backend-C5Un08O4.mjs.map +0 -1
- package/dist/gerbil-BfnsFWRE.mjs +0 -644
- package/dist/gerbil-BfnsFWRE.mjs.map +0 -1
- package/dist/gerbil-BjW-z7Fq.mjs +0 -5
- package/dist/gerbil-DZ1k3ChC.d.mts +0 -138
- package/dist/gerbil-DZ1k3ChC.d.mts.map +0 -1
- package/dist/mcp-R8kRLIKb.mjs.map +0 -1
- package/dist/models-DKULvhOr.mjs.map +0 -1
- package/dist/models-De2-_GmQ.d.mts.map +0 -1
- package/dist/skills-D3CEpgDc.mjs +0 -630
- package/dist/skills-D3CEpgDc.mjs.map +0 -1
- package/dist/types-BS1N92Jt.d.mts +0 -183
- package/dist/types-BS1N92Jt.d.mts.map +0 -1
package/docs/stt.md
ADDED
@@ -0,0 +1,494 @@
# Speech-to-Text (STT)

Gerbil provides local speech-to-text transcription using Whisper ONNX models via transformers.js. No API keys required; everything runs on-device.

## Quick Start

### Node.js

```typescript
import { Gerbil } from "@tryhamster/gerbil";
import { readFileSync } from "fs";

const g = new Gerbil();

// Transcribe a WAV file
const audioData = new Uint8Array(readFileSync("audio.wav"));
const result = await g.transcribe(audioData);
console.log(result.text);
```

### CLI

```bash
# Transcribe an audio file
gerbil transcribe audio.wav

# With timestamps
gerbil transcribe audio.wav --timestamps

# List available models
gerbil transcribe --list-models

# Use a different model
gerbil transcribe audio.wav --model whisper-base.en
```

## Available Models

| Model | Size | Languages | Description |
|-------|------|-----------|-------------|
| `whisper-tiny.en` | 39M | English | Fastest, good for simple audio |
| `whisper-tiny` | 39M | Multi | Multilingual tiny model |
| `whisper-base.en` | 74M | English | Good balance of speed/accuracy |
| `whisper-base` | 74M | Multi | Multilingual base model |
| `whisper-small.en` | 244M | English | High quality transcription |
| `whisper-small` | 244M | Multi | High quality multilingual |
| `whisper-large-v3-turbo` | 809M | 80+ langs | Best quality, 5.4x faster than v3 |

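To pick a model at runtime, here is a minimal sketch. It assumes the entries returned by `listSTTModels()` carry the `id` and `multilingual` fields used by the registry format shown under "Adding Custom Models" below; treat that shape as an assumption, not a documented contract.

```typescript
import { Gerbil } from "@tryhamster/gerbil";

const g = new Gerbil();

// Pick the first multilingual model, falling back to whisper-base.
// The entry shape (`id`, `multilingual`) is assumed from the registry
// format shown in "Adding Custom Models" below.
const models = await g.listSTTModels();
const multilingual = models.filter((m) => m.multilingual);
await g.loadSTT(multilingual[0]?.id ?? "whisper-base");
```
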
## API Reference

### Gerbil Class Methods

```typescript
// Load STT model explicitly (optional - auto-loads on first transcribe)
await g.loadSTT("whisper-tiny.en", {
  onProgress: (p) => console.log(p.status),
});

// Transcribe audio
const result = await g.transcribe(audioData, {
  language: "en",   // Language hint (multilingual models only)
  timestamps: true, // Return segment timestamps
  onProgress: (p) => console.log(p.status),
});

// Result structure
console.log(result.text);      // Full transcription
console.log(result.duration);  // Audio duration in seconds
console.log(result.totalTime); // Processing time in ms
console.log(result.segments);  // Timestamped segments (if requested)

// Check if loaded
console.log(g.isSTTLoaded());

// List available models
const models = await g.listSTTModels();
```

### Audio Input Formats

```typescript
// WAV file (Uint8Array) - automatically decoded
const wavData = new Uint8Array(fs.readFileSync("audio.wav"));
await g.transcribe(wavData);

// Raw audio (Float32Array at 16kHz mono)
const raw16k = new Float32Array([...]); // 16kHz mono samples
await g.transcribe(raw16k);

// WAV at any sample rate - automatically resampled to 16kHz
const wav44k = new Uint8Array(fs.readFileSync("audio-44khz.wav"));
await g.transcribe(wav44k); // Resampled internally
```

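Only WAV input is decoded natively (see Troubleshooting below). For other formats, one option is to convert on the fly with `ffmpeg` before transcribing. A sketch, assuming `ffmpeg` is on the PATH; the flags mirror the Troubleshooting section:

```typescript
import { execFileSync } from "child_process";
import { readFileSync } from "fs";
import { Gerbil } from "@tryhamster/gerbil";

// Convert any ffmpeg-readable input to 16kHz mono 16-bit WAV, then transcribe.
// Assumes ffmpeg is installed; flags mirror the Troubleshooting section below.
execFileSync("ffmpeg", [
  "-y", "-i", "audio.mp3",
  "-ar", "16000", "-ac", "1", "-sample_fmt", "s16",
  "out.wav",
]);

const g = new Gerbil();
const result = await g.transcribe(new Uint8Array(readFileSync("out.wav")));
console.log(result.text);
```
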
### Transcription with Timestamps

```typescript
const result = await g.transcribe(audioData, { timestamps: true });

for (const segment of result.segments || []) {
  console.log(`[${segment.start.toFixed(1)}s - ${segment.end.toFixed(1)}s] ${segment.text}`);
}
// [0.0s - 5.2s] Hello world, this is a test
// [5.2s - 8.0s] of the transcription system
```

### Language Detection (Multilingual Models)

```typescript
// Use multilingual model
await g.loadSTT("whisper-base");

// Let Whisper detect language
const result = await g.transcribe(audioData);
console.log(result.language); // "en", "es", "fr", etc.

// Or provide a language hint
const spanishResult = await g.transcribe(audioData, { language: "es" });
```

## Direct WhisperSTT Class

For lower-level control:

```typescript
import { WhisperSTT, decodeWav, resampleAudio } from "@tryhamster/gerbil/core/stt";

const stt = new WhisperSTT("whisper-tiny.en");

await stt.load({
  onProgress: (p) => console.log(p.status),
});

const result = await stt.transcribe(audioData, {
  timestamps: true,
});

console.log(result.text);

stt.dispose();
```

### Audio Utilities

```typescript
import { decodeWav, resampleAudio } from "@tryhamster/gerbil/core/stt";

// Decode WAV to Float32Array
const { audio, sampleRate } = decodeWav(wavUint8Array);

// Resample to 16kHz (Whisper requirement)
const audio16k = resampleAudio(audio, sampleRate, 16000);
```

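Putting the utilities together: a sketch that decodes a WAV, resamples it, and hands the raw samples to `transcribe()`, which accepts `Float32Array` at 16kHz per Audio Input Formats above:

```typescript
import { readFileSync } from "fs";
import { Gerbil } from "@tryhamster/gerbil";
import { decodeWav, resampleAudio } from "@tryhamster/gerbil/core/stt";

const g = new Gerbil();

// Decode and resample manually, then pass raw 16kHz samples to transcribe().
const wav = new Uint8Array(readFileSync("audio-44khz.wav"));
const { audio, sampleRate } = decodeWav(wav);
const audio16k = resampleAudio(audio, sampleRate, 16000);

const result = await g.transcribe(audio16k);
console.log(result.text);
```
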
## Streaming Transcription (Real-time)

Transcribe audio in chunks as it comes in; perfect for live captioning, call transcription, or real-time subtitles:

```typescript
import { Gerbil } from "@tryhamster/gerbil";

const g = new Gerbil();
await g.loadSTT("whisper-tiny.en");

// Create streaming session
const session = await g.createStreamingTranscription({
  chunkDuration: 1500, // Transcribe every 1.5 seconds (default)
  onChunk: (text, chunkIndex) => {
    console.log(`Chunk ${chunkIndex}: ${text}`);
  },
  onTranscript: (fullText) => {
    console.log("Full transcript:", fullText);
  },
  onError: (err) => console.error(err),
});

// Start the session (begins automatic interval-based transcription)
session.start();

// Feed audio data as it comes in (Float32Array at 16kHz)
// This could come from a microphone, WebRTC stream, etc.
session.feedAudio(audioChunk);
session.feedAudio(moreAudioChunk);

// Or manually trigger transcription of buffered audio
await session.flush();

// Get current state
console.log(session.getTranscript()); // Full accumulated text
console.log(session.getChunkCount()); // Number of chunks processed
console.log(session.isRunning());     // Whether session is active

// Stop and get final transcript
const finalText = await session.stop();
console.log("Final:", finalText);

// Reset for reuse
session.reset();
```

### Session API

```typescript
interface StreamingTranscriptionSession {
  feedAudio(audio: Float32Array): void; // Add audio to buffer
  flush(): Promise<string>;             // Transcribe buffer now
  start(): void;                        // Start auto-transcription
  stop(): Promise<string>;              // Stop and get final text
  isRunning(): boolean;                 // Check if running
  getTranscript(): string;              // Get current full text
  getChunkCount(): number;              // Number of chunks processed
  reset(): void;                        // Clear buffer and transcript
}

interface StreamingTranscriptionOptions {
  chunkDuration?: number; // Interval in ms (default: 1500)
  minChunkSize?: number;  // Min samples before transcribing (default: 8000)
  onChunk?: (text: string, chunkIndex: number) => void;
  onTranscript?: (fullText: string) => void;
  onError?: (error: string) => void;
  language?: string;      // Language hint for multilingual models
}
```

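To exercise the streaming API without a live audio source, one option is to slice a decoded WAV into half-second chunks and feed them in. A sketch using only the session and utility APIs above (the final `stop()` is assumed to flush any remaining buffered audio, per its "stop and get final text" contract):

```typescript
import { readFileSync } from "fs";
import { Gerbil } from "@tryhamster/gerbil";
import { decodeWav, resampleAudio } from "@tryhamster/gerbil/core/stt";

const g = new Gerbil();
await g.loadSTT("whisper-tiny.en");

const session = await g.createStreamingTranscription({
  onChunk: (text, idx) => console.log(`[${idx}] ${text}`),
});
session.start();

// Decode a WAV, resample to 16kHz, and feed it in 0.5s slices
// (8000 samples) to simulate a live source.
const { audio, sampleRate } = decodeWav(new Uint8Array(readFileSync("audio.wav")));
const audio16k = resampleAudio(audio, sampleRate, 16000);
const sliceSize = 8000;

for (let i = 0; i < audio16k.length; i += sliceSize) {
  session.feedAudio(audio16k.subarray(i, i + sliceSize));
}

console.log("Final:", await session.stop());
```
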
### Streaming with Microphone (Node.js)

```typescript
import { Gerbil, StreamingMicrophone } from "@tryhamster/gerbil";

const g = new Gerbil();
await g.loadSTT("whisper-tiny.en");

// Create streaming transcription session
const session = await g.createStreamingTranscription({
  chunkDuration: 2500,
  onChunk: (text, idx) => console.log(`[${idx}] ${text}`),
  onTranscript: (full) => console.log("Transcript:", full),
});

session.start();

// Create streaming microphone that feeds to session
const mic = new StreamingMicrophone({
  onAudioChunk: (audio) => session.feedAudio(audio),
  chunkDuration: 500, // 0.5s audio chunks
});

await mic.start();
console.log("Recording... Press Ctrl+C to stop");

// Handle graceful shutdown
process.on("SIGINT", async () => {
  await mic.stop();
  const transcript = await session.stop();
  console.log("\nFinal:", transcript);
  process.exit(0);
});
```

## React Hook (Browser)

Use `useVoiceInput` for browser-based voice recording and transcription:

```tsx
import { useVoiceInput } from "@tryhamster/gerbil/browser";

function VoiceInput() {
  const {
    startRecording,
    stopRecording,
    isRecording,
    isTranscribing,
    transcript,
    error,
  } = useVoiceInput({
    model: "whisper-tiny.en",
    onTranscript: (text) => console.log("User said:", text),
  });

  return (
    <div>
      <button onClick={isRecording ? stopRecording : startRecording}>
        {isRecording ? "🔴 Stop" : "🎤 Record"}
      </button>
      {isTranscribing && <span>Transcribing...</span>}
      {transcript && <p>{transcript}</p>}
    </div>
  );
}
```

### Hook Options

```typescript
const {
  startRecording,   // () => Promise<void> - start recording
  stopRecording,    // () => Promise<string> - stop and transcribe
  cancelRecording,  // () => void - stop without transcribing
  transcribe,       // (audio: Float32Array) => Promise<string>
  isRecording,      // boolean - currently recording
  isTranscribing,   // boolean - transcribing audio
  isLoading,        // boolean - model loading
  isReady,          // boolean - model ready
  transcript,       // string - latest result
  loadingProgress,  // STTProgress - loading status
  error,            // string | null
  load,             // () => void - manually load model
} = useVoiceInput({
  model: "whisper-tiny.en",   // STT model ID
  autoLoad: false,            // loads on first record
  onReady: () => {},          // called when model ready
  onTranscript: (text) => {}, // called on each transcription
  onError: (error) => {},     // called on errors
  onProgress: (p) => {},      // loading progress
});
```

## CLI Commands

### Transcribe

```bash
# Basic transcription
gerbil transcribe recording.wav

# With timestamps
gerbil transcribe recording.wav --timestamps

# Save to file
gerbil transcribe recording.wav --output transcript.txt

# Use a specific model
gerbil transcribe recording.wav --model whisper-base.en

# Multilingual with language hint
gerbil transcribe recording.wav --model whisper-base --language es

# List models
gerbil transcribe --list-models
```

### Voice Chat (STT → LLM → TTS)

A complete voice conversation loop from an audio file:

```bash
# Voice chat with audio file
gerbil voice question.wav

# Customize models and voice
gerbil voice question.wav --model qwen3-1.7b --voice bf_emma

# With custom system prompt
gerbil voice question.wav --system "You are a pirate. Speak like one!"

# Enable thinking mode
gerbil voice question.wav --thinking
```

This command:
1. Transcribes your audio (Whisper)
2. Generates a response (LLM)
3. Speaks the response (Kokoro TTS)

### Microphone Recording

Record directly from your microphone:

```typescript
import { Gerbil } from "@tryhamster/gerbil";

const g = new Gerbil();

// Record for 5 seconds and transcribe
const result = await g.listen(5000);
console.log(result.text);

// Check if microphone is available
const available = await g.isMicrophoneAvailable();
```

**Requirements:** SoX must be installed for microphone recording:
- macOS: `brew install sox`
- Ubuntu: `sudo apt install sox`
- Windows: download from https://sox.sourceforge.net/

### REPL Voice Input

In the Gerbil REPL chat:
- Press `Ctrl+R` to record from the microphone (5 seconds)
- The transcribed text appears in the input field
- Combining this with voice mode (`Ctrl+V`) enables full voice conversations

```bash
gerbil chat
# In chat: Press ^R to record, ^V to enable voice responses
```

## Performance

Transcription speed on Apple M1:

| Model | Audio Length | Time | Speed |
|-------|--------------|------|-------|
| whisper-tiny.en | 11s | ~250ms | 44x realtime |
| whisper-base.en | 11s | ~500ms | 22x realtime |
| whisper-small.en | 11s | ~1.5s | 7x realtime |

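You can compute the same realtime factor for your own hardware from the result fields documented in the API Reference above (`duration` in seconds, `totalTime` in milliseconds):

```typescript
const result = await g.transcribe(audioData);

// Realtime factor = audio duration / processing time.
// result.duration is in seconds; result.totalTime is in milliseconds.
const realtimeFactor = result.duration / (result.totalTime / 1000);
console.log(`${realtimeFactor.toFixed(1)}x realtime`);
```
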
## Troubleshooting

### "Unsupported bit depth"
Only 16-bit WAV files are supported. Convert with:
```bash
ffmpeg -i audio.mp3 -ar 16000 -ac 1 -sample_fmt s16 output.wav
```

### Slow first transcription
The first call downloads the model (~40-250MB). Subsequent calls use the cached model.

### Memory usage
Whisper models require:
- tiny: ~150MB RAM
- base: ~300MB RAM
- small: ~600MB RAM
- large-v3-turbo: ~1.5GB RAM

## useVoiceChat Hook (Full Voice Conversation)

For complete voice-to-voice conversations (STT → LLM → TTS):

```tsx
import { useVoiceChat } from "@tryhamster/gerbil/browser";

function VoiceAssistant() {
  const {
    messages,
    startListening,
    stopListening,
    isListening,
    isSpeaking,
    stage, // "idle" | "listening" | "transcribing" | "thinking" | "speaking"
    isReady,
  } = useVoiceChat({
    llmModel: "qwen3-0.6b",
    sttModel: "whisper-tiny.en",
    voice: "af_bella",
    system: "You are a helpful voice assistant.",
  });

  return (
    <div>
      {messages.map(m => <p key={m.id}>{m.role}: {m.content}</p>)}
      <button
        onMouseDown={startListening}
        onMouseUp={stopListening}
      >
        {stage === "idle" ? "🎤 Hold to Speak" : stage}
      </button>
    </div>
  );
}
```

## Adding Custom Models

Gerbil supports any ONNX model with the `transformers.js` tag on HuggingFace:

```typescript
// Use any onnx-community model
await g.loadSTT("whisper-large-v3-turbo");

// Browse compatible models at:
// https://huggingface.co/models?library=transformers.js&pipeline_tag=automatic-speech-recognition
```

To add a new model to the registry, edit `src/core/stt.ts`:

```typescript
{
  id: "my-custom-model",
  repo: "onnx-community/my-model",
  description: "My custom model",
  size: "100M",
  multilingual: true,
  languages: ["en", "es"],
  sampleRate: 16000,
}
```

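Once registered, the entry should be loadable by its `id` like any built-in model. A sketch, reusing the hypothetical `my-custom-model` id from the registry entry above:

```typescript
// "my-custom-model" is the hypothetical id from the registry entry above.
await g.loadSTT("my-custom-model");
const result = await g.transcribe(audioData, { language: "es" });
console.log(result.text);
```
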
## See Also

- [Text-to-Speech (TTS)](./tts.md) - Speech synthesis
- [Browser Hooks](./browser.md) - useChat, useVoiceInput, useVoiceChat