@tryhamster/gerbil 1.0.0-rc.0 → 1.0.0-rc.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94)
  1. package/README.md +79 -14
  2. package/dist/auto-update-DsWBBnEk.mjs +3 -0
  3. package/dist/browser/index.d.mts +401 -5
  4. package/dist/browser/index.d.mts.map +1 -1
  5. package/dist/browser/index.mjs +1772 -146
  6. package/dist/browser/index.mjs.map +1 -1
  7. package/dist/{chrome-backend-CtwPENIW.mjs → chrome-backend-JEPeM2YE.mjs} +1 -1
  8. package/dist/{chrome-backend-C5Un08O4.mjs → chrome-backend-Y9F7W5VQ.mjs} +514 -73
  9. package/dist/chrome-backend-Y9F7W5VQ.mjs.map +1 -0
  10. package/dist/cli.mjs +3359 -646
  11. package/dist/cli.mjs.map +1 -1
  12. package/dist/frameworks/express.d.mts +1 -1
  13. package/dist/frameworks/express.mjs +3 -3
  14. package/dist/frameworks/fastify.d.mts +1 -1
  15. package/dist/frameworks/fastify.mjs +3 -3
  16. package/dist/frameworks/hono.d.mts +1 -1
  17. package/dist/frameworks/hono.mjs +3 -3
  18. package/dist/frameworks/next.d.mts +2 -2
  19. package/dist/frameworks/next.mjs +3 -3
  20. package/dist/frameworks/react.d.mts +1 -1
  21. package/dist/frameworks/trpc.d.mts +1 -1
  22. package/dist/frameworks/trpc.mjs +3 -3
  23. package/dist/gerbil-DeQlX_Mt.mjs +5 -0
  24. package/dist/gerbil-POAz8peb.d.mts +431 -0
  25. package/dist/gerbil-POAz8peb.d.mts.map +1 -0
  26. package/dist/gerbil-yoSpRHgv.mjs +1463 -0
  27. package/dist/gerbil-yoSpRHgv.mjs.map +1 -0
  28. package/dist/index.d.mts +395 -9
  29. package/dist/index.d.mts.map +1 -1
  30. package/dist/index.mjs +8 -6
  31. package/dist/index.mjs.map +1 -1
  32. package/dist/integrations/ai-sdk.d.mts +122 -4
  33. package/dist/integrations/ai-sdk.d.mts.map +1 -1
  34. package/dist/integrations/ai-sdk.mjs +239 -11
  35. package/dist/integrations/ai-sdk.mjs.map +1 -1
  36. package/dist/integrations/langchain.d.mts +132 -2
  37. package/dist/integrations/langchain.d.mts.map +1 -1
  38. package/dist/integrations/langchain.mjs +176 -8
  39. package/dist/integrations/langchain.mjs.map +1 -1
  40. package/dist/integrations/llamaindex.d.mts +1 -1
  41. package/dist/integrations/llamaindex.mjs +3 -3
  42. package/dist/integrations/mcp-client.mjs +4 -4
  43. package/dist/integrations/mcp-client.mjs.map +1 -1
  44. package/dist/integrations/mcp.d.mts +2 -2
  45. package/dist/integrations/mcp.d.mts.map +1 -1
  46. package/dist/integrations/mcp.mjs +6 -6
  47. package/dist/{mcp-R8kRLIKb.mjs → mcp-Bitg4sjX.mjs} +10 -37
  48. package/dist/mcp-Bitg4sjX.mjs.map +1 -0
  49. package/dist/microphone-D-6y9aiE.mjs +3 -0
  50. package/dist/{models-DKULvhOr.mjs → models-BAtL8qsA.mjs} +42 -7
  51. package/dist/models-BAtL8qsA.mjs.map +1 -0
  52. package/dist/{models-De2-_GmQ.d.mts → models-CE0fBq0U.d.mts} +2 -2
  53. package/dist/models-CE0fBq0U.d.mts.map +1 -0
  54. package/dist/{one-liner-BUQR0nqq.mjs → one-liner-B1rmFto6.mjs} +2 -2
  55. package/dist/{one-liner-BUQR0nqq.mjs.map → one-liner-B1rmFto6.mjs.map} +1 -1
  56. package/dist/repl-D20JO260.mjs +10 -0
  57. package/dist/skills/index.d.mts +303 -12
  58. package/dist/skills/index.d.mts.map +1 -1
  59. package/dist/skills/index.mjs +6 -6
  60. package/dist/skills-5DxAV-rn.mjs +1435 -0
  61. package/dist/skills-5DxAV-rn.mjs.map +1 -0
  62. package/dist/stt-Bv_dum-R.mjs +433 -0
  63. package/dist/stt-Bv_dum-R.mjs.map +1 -0
  64. package/dist/stt-KzSoNvwI.mjs +3 -0
  65. package/dist/{tools-BsiEE6f2.mjs → tools-IYPrqoek.mjs} +6 -7
  66. package/dist/{tools-BsiEE6f2.mjs.map → tools-IYPrqoek.mjs.map} +1 -1
  67. package/dist/tts-5yWeP_I0.mjs +3 -0
  68. package/dist/tts-DG6denWG.mjs +729 -0
  69. package/dist/tts-DG6denWG.mjs.map +1 -0
  70. package/dist/types-s6Py2_DL.d.mts +353 -0
  71. package/dist/types-s6Py2_DL.d.mts.map +1 -0
  72. package/dist/{utils-7vXqtq2Q.mjs → utils-CkB4Roi6.mjs} +1 -1
  73. package/dist/{utils-7vXqtq2Q.mjs.map → utils-CkB4Roi6.mjs.map} +1 -1
  74. package/docs/ai-sdk.md +137 -21
  75. package/docs/browser.md +241 -2
  76. package/docs/memory.md +72 -0
  77. package/docs/stt.md +494 -0
  78. package/docs/tts.md +569 -0
  79. package/docs/vision.md +396 -0
  80. package/package.json +17 -18
  81. package/dist/auto-update-BbNHbSU1.mjs +0 -3
  82. package/dist/chrome-backend-C5Un08O4.mjs.map +0 -1
  83. package/dist/gerbil-BfnsFWRE.mjs +0 -644
  84. package/dist/gerbil-BfnsFWRE.mjs.map +0 -1
  85. package/dist/gerbil-BjW-z7Fq.mjs +0 -5
  86. package/dist/gerbil-DZ1k3ChC.d.mts +0 -138
  87. package/dist/gerbil-DZ1k3ChC.d.mts.map +0 -1
  88. package/dist/mcp-R8kRLIKb.mjs.map +0 -1
  89. package/dist/models-DKULvhOr.mjs.map +0 -1
  90. package/dist/models-De2-_GmQ.d.mts.map +0 -1
  91. package/dist/skills-D3CEpgDc.mjs +0 -630
  92. package/dist/skills-D3CEpgDc.mjs.map +0 -1
  93. package/dist/types-BS1N92Jt.d.mts +0 -183
  94. package/dist/types-BS1N92Jt.d.mts.map +0 -1
package/docs/stt.md ADDED
@@ -0,0 +1,494 @@
+ # Speech-to-Text (STT)
+
+ Gerbil provides local speech-to-text transcription using Whisper ONNX models via transformers.js. No API keys required - everything runs on-device.
+
+ ## Quick Start
+
+ ### Node.js
+
+ ```typescript
+ import { Gerbil } from "@tryhamster/gerbil";
+ import { readFileSync } from "fs";
+
+ const g = new Gerbil();
+
+ // Transcribe a WAV file
+ const audioData = new Uint8Array(readFileSync("audio.wav"));
+ const result = await g.transcribe(audioData);
+ console.log(result.text);
+ ```
+
+ ### CLI
+
+ ```bash
+ # Transcribe audio file
+ gerbil transcribe audio.wav
+
+ # With timestamps
+ gerbil transcribe audio.wav --timestamps
+
+ # List available models
+ gerbil transcribe --list-models
+
+ # Use a different model
+ gerbil transcribe audio.wav --model whisper-base.en
+ ```
+
+ ## Available Models
+
+ | Model | Size | Languages | Description |
+ |-------|------|-----------|-------------|
+ | `whisper-tiny.en` | 39M | English | Fastest, good for simple audio |
+ | `whisper-tiny` | 39M | Multi | Multilingual tiny model |
+ | `whisper-base.en` | 74M | English | Good balance of speed/accuracy |
+ | `whisper-base` | 74M | Multi | Multilingual base model |
+ | `whisper-small.en` | 244M | English | High-quality transcription |
+ | `whisper-small` | 244M | Multi | High-quality multilingual |
+ | `whisper-large-v3-turbo` | 809M | 80+ languages | Best quality, 5.4x faster than large-v3 |
+
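+ Any of these ids can be passed to `loadSTT` or `--model`. A minimal sketch of the trade-off, using ids from the table (the full API is shown in the next section):
+
+ ```typescript
+ // English-only ".en" variants: best speed/accuracy for English audio
+ await g.loadSTT("whisper-base.en");
+
+ // Multilingual variants use the same API and accept a language hint:
+ // await g.loadSTT("whisper-base");
+ // const result = await g.transcribe(audioData, { language: "es" });
+ ```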
+ ## API Reference
+
+ ### Gerbil Class Methods
+
+ ```typescript
+ // Load STT model explicitly (optional - auto-loads on first transcribe)
+ await g.loadSTT("whisper-tiny.en", {
+   onProgress: (p) => console.log(p.status),
+ });
+
+ // Transcribe audio
+ const result = await g.transcribe(audioData, {
+   language: "en", // Language hint (multilingual models only)
+   timestamps: true, // Return segment timestamps
+   onProgress: (p) => console.log(p.status),
+ });
+
+ // Result structure
+ console.log(result.text); // Full transcription
+ console.log(result.duration); // Audio duration in seconds
+ console.log(result.totalTime); // Processing time in ms
+ console.log(result.segments); // Timestamped segments (if requested)
+
+ // Check if loaded
+ console.log(g.isSTTLoaded());
+
+ // List available models
+ const models = await g.listSTTModels();
+ ```
+
+ ### Audio Input Formats
+
+ ```typescript
+ import fs from "fs";
+
+ // WAV file (Uint8Array) - automatically decoded
+ const wavData = new Uint8Array(fs.readFileSync("audio.wav"));
+ await g.transcribe(wavData);
+
+ // Raw audio (Float32Array at 16kHz mono)
+ const raw16k = new Float32Array(16000); // fill with your 16kHz mono samples
+ await g.transcribe(raw16k);
+
+ // WAV at any sample rate - automatically resampled to 16kHz
+ const wav44k = new Uint8Array(fs.readFileSync("audio-44khz.wav"));
+ await g.transcribe(wav44k); // Resampled internally
+ ```
+
+ ### Transcription with Timestamps
+
+ ```typescript
+ const result = await g.transcribe(audioData, { timestamps: true });
+
+ for (const segment of result.segments || []) {
+   console.log(`[${segment.start.toFixed(1)}s - ${segment.end.toFixed(1)}s] ${segment.text}`);
+ }
+ // [0.0s - 5.2s] Hello world, this is a test
+ // [5.2s - 8.0s] of the transcription system
+ ```
+
+ ### Language Detection (Multilingual Models)
+
+ ```typescript
+ // Use multilingual model
+ await g.loadSTT("whisper-base");
+
+ // Let Whisper detect language
+ const result = await g.transcribe(audioData);
+ console.log(result.language); // "en", "es", "fr", etc.
+
+ // Or provide a language hint
+ const spanishResult = await g.transcribe(audioData, { language: "es" });
+ ```
+
+ ## Direct WhisperSTT Class
+
+ For lower-level control:
+
+ ```typescript
+ import { WhisperSTT, decodeWav, resampleAudio } from "@tryhamster/gerbil/core/stt";
+
+ const stt = new WhisperSTT("whisper-tiny.en");
+
+ await stt.load({
+   onProgress: (p) => console.log(p.status),
+ });
+
+ const result = await stt.transcribe(audioData, {
+   timestamps: true,
+ });
+
+ console.log(result.text);
+
+ stt.dispose();
+ ```
+
+ ### Audio Utilities
+
+ ```typescript
+ import { decodeWav, resampleAudio } from "@tryhamster/gerbil/core/stt";
+
+ // Decode WAV to Float32Array
+ const { audio, sampleRate } = decodeWav(wavUint8Array);
+
+ // Resample to 16kHz (Whisper requirement)
+ const audio16k = resampleAudio(audio, sampleRate, 16000);
+ ```
+
+ ## Streaming Transcription (Real-time)
+
+ Transcribe audio in chunks as it comes in - perfect for live captioning, call transcription, or real-time subtitles:
+
+ ```typescript
+ import { Gerbil } from "@tryhamster/gerbil";
+
+ const g = new Gerbil();
+ await g.loadSTT("whisper-tiny.en");
+
+ // Create streaming session
+ const session = await g.createStreamingTranscription({
+   chunkDuration: 1500, // Transcribe every 1.5 seconds (default)
+   onChunk: (text, chunkIndex) => {
+     console.log(`Chunk ${chunkIndex}: ${text}`);
+   },
+   onTranscript: (fullText) => {
+     console.log("Full transcript:", fullText);
+   },
+   onError: (err) => console.error(err),
+ });
+
+ // Start the session (begins automatic interval-based transcription)
+ session.start();
+
+ // Feed audio data as it comes in (Float32Array at 16kHz)
+ // This could come from a microphone, WebRTC stream, etc.
+ session.feedAudio(audioChunk);
+ session.feedAudio(moreAudioChunk);
+
+ // Or manually trigger transcription of buffered audio
+ await session.flush();
+
+ // Get current state
+ console.log(session.getTranscript()); // Full accumulated text
+ console.log(session.getChunkCount()); // Number of chunks processed
+ console.log(session.isRunning()); // Whether session is active
+
+ // Stop and get final transcript
+ const finalText = await session.stop();
+ console.log("Final:", finalText);
+
+ // Reset for reuse
+ session.reset();
+ ```
+
+ ### Session API
+
+ ```typescript
+ interface StreamingTranscriptionSession {
+   feedAudio(audio: Float32Array): void; // Add audio to buffer
+   flush(): Promise<string>; // Transcribe buffer now
+   start(): void; // Start auto-transcription
+   stop(): Promise<string>; // Stop and get final text
+   isRunning(): boolean; // Check if running
+   getTranscript(): string; // Get current full text
+   getChunkCount(): number; // Number of chunks processed
+   reset(): void; // Clear buffer and transcript
+ }
+
+ interface StreamingTranscriptionOptions {
+   chunkDuration?: number; // Interval in ms (default: 1500)
+   minChunkSize?: number; // Min samples before transcribing (default: 8000, i.e. 0.5s at 16kHz)
+   onChunk?: (text: string, chunkIndex: number) => void;
+   onTranscript?: (fullText: string) => void;
+   onError?: (error: string) => void;
+   language?: string; // Language hint for multilingual models
+ }
+ ```
+
+ ### Streaming with Microphone (Node.js)
+
+ ```typescript
+ import { Gerbil, StreamingMicrophone } from "@tryhamster/gerbil";
+
+ const g = new Gerbil();
+ await g.loadSTT("whisper-tiny.en");
+
+ // Create streaming transcription session
+ const session = await g.createStreamingTranscription({
+   chunkDuration: 2500,
+   onChunk: (text, idx) => console.log(`[${idx}] ${text}`),
+   onTranscript: (full) => console.log("Transcript:", full),
+ });
+
+ session.start();
+
+ // Create streaming microphone that feeds to session
+ const mic = new StreamingMicrophone({
+   onAudioChunk: (audio) => session.feedAudio(audio),
+   chunkDuration: 500, // 0.5s audio chunks
+ });
+
+ await mic.start();
+ console.log("Recording... Press Ctrl+C to stop");
+
+ // Handle graceful shutdown
+ process.on("SIGINT", async () => {
+   await mic.stop();
+   const transcript = await session.stop();
+   console.log("\nFinal:", transcript);
+   process.exit(0);
+ });
+ ```
+
+ ## React Hook (Browser)
+
+ Use `useVoiceInput` for browser-based voice recording and transcription:
+
+ ```tsx
+ import { useVoiceInput } from "@tryhamster/gerbil/browser";
+
+ function VoiceInput() {
+   const {
+     startRecording,
+     stopRecording,
+     isRecording,
+     isTranscribing,
+     transcript,
+     error,
+   } = useVoiceInput({
+     model: "whisper-tiny.en",
+     onTranscript: (text) => console.log("User said:", text),
+   });
+
+   return (
+     <div>
+       <button onClick={isRecording ? stopRecording : startRecording}>
+         {isRecording ? "🔴 Stop" : "🎤 Record"}
+       </button>
+       {isTranscribing && <span>Transcribing...</span>}
+       {transcript && <p>{transcript}</p>}
+     </div>
+   );
+ }
+ ```
+
+ ### Hook Options
+
+ ```typescript
+ const {
+   startRecording, // () => Promise<void> - start recording
+   stopRecording, // () => Promise<string> - stop and transcribe
+   cancelRecording, // () => void - stop without transcribing
+   transcribe, // (audio: Float32Array) => Promise<string>
+   isRecording, // boolean - currently recording
+   isTranscribing, // boolean - transcribing audio
+   isLoading, // boolean - model loading
+   isReady, // boolean - model ready
+   transcript, // string - latest result
+   loadingProgress, // STTProgress - loading status
+   error, // string | null
+   load, // () => void - manually load model
+ } = useVoiceInput({
+   model: "whisper-tiny.en", // STT model ID
+   autoLoad: false, // loads on first record
+   onReady: () => {}, // called when model ready
+   onTranscript: (text) => {}, // called on each transcription
+   onError: (error) => {}, // called on errors
+   onProgress: (p) => {}, // loading progress
+ });
+ ```
+
+ ## CLI Commands
+
+ ### Transcribe
+
+ ```bash
+ # Basic transcription
+ gerbil transcribe recording.wav
+
+ # With timestamps
+ gerbil transcribe recording.wav --timestamps
+
+ # Save to file
+ gerbil transcribe recording.wav --output transcript.txt
+
+ # Use specific model
+ gerbil transcribe recording.wav --model whisper-base.en
+
+ # Multilingual with language hint
+ gerbil transcribe recording.wav --model whisper-base --language es
+
+ # List models
+ gerbil transcribe --list-models
+ ```
+
+ ### Voice Chat (STT → LLM → TTS)
+
+ A complete voice conversation loop from an audio file:
+
+ ```bash
+ # Voice chat with audio file
+ gerbil voice question.wav
+
+ # Customize models and voice
+ gerbil voice question.wav --model qwen3-1.7b --voice bf_emma
+
+ # With custom system prompt
+ gerbil voice question.wav --system "You are a pirate. Speak like one!"
+
+ # Enable thinking mode
+ gerbil voice question.wav --thinking
+ ```
+
+ This command runs three stages (see the sketch after this list):
+ 1. Transcribes your audio (Whisper)
+ 2. Generates a response (LLM)
+ 3. Speaks the response (Kokoro TTS)
+
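+ Roughly the same pipeline can be scripted against the Node API. Treat `generate` and `speak` below as placeholder names - the LLM and TTS methods are documented in the main README and [tts.md](./tts.md); only `transcribe` is defined in this doc:
+
+ ```typescript
+ import { Gerbil } from "@tryhamster/gerbil";
+ import { readFileSync } from "fs";
+
+ const g = new Gerbil();
+
+ // 1. Transcribe the question (Whisper)
+ const question = await g.transcribe(new Uint8Array(readFileSync("question.wav")));
+
+ // 2. Generate a response (LLM) - placeholder name, see the main README
+ const answer = await g.generate(question.text);
+
+ // 3. Speak the response (Kokoro TTS) - placeholder name, see tts.md
+ await g.speak(answer);
+ ```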
+ ### Microphone Recording
+
+ Record directly from your microphone:
+
+ ```typescript
+ import { Gerbil } from "@tryhamster/gerbil";
+
+ const g = new Gerbil();
+
+ // Record for 5 seconds and transcribe
+ const result = await g.listen(5000);
+ console.log(result.text);
+
+ // Check if microphone is available
+ const available = await g.isMicrophoneAvailable();
+ ```
+
+ **Requirements:** SoX must be installed for microphone recording (a runtime check is sketched below):
+ - macOS: `brew install sox`
+ - Ubuntu: `sudo apt install sox`
+ - Windows: download from https://sox.sourceforge.net/
+
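+ A defensive pattern combining the two calls above, so a missing SoX install fails gracefully instead of throwing mid-session:
+
+ ```typescript
+ if (await g.isMicrophoneAvailable()) {
+   const result = await g.listen(5000);
+   console.log(result.text);
+ } else {
+   console.error("Microphone capture unavailable - is SoX installed?");
+ }
+ ```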
+ ### REPL Voice Input
+
+ In the Gerbil REPL chat:
+ - Press `Ctrl+R` to record from the microphone (5 seconds)
+ - The transcribed text appears in the input field
+ - Combined with voice mode (`Ctrl+V`), this enables full voice conversations
+
+ ```bash
+ gerbil chat
+ # In chat: Press ^R to record, ^V to enable voice responses
+ ```
+
+ ## Performance
+
+ Transcription speed on an Apple M1 (a sketch for measuring your own numbers follows the table):
+
+ | Model | Audio Length | Time | Speed |
+ |-------|--------------|------|-------|
+ | whisper-tiny.en | 11s | ~250ms | 44x realtime |
+ | whisper-base.en | 11s | ~500ms | 22x realtime |
+ | whisper-small.en | 11s | ~1.5s | 7x realtime |
+
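+ These figures vary with hardware; a quick way to compute your own realtime factor from the documented result fields (`duration` in seconds, `totalTime` in milliseconds):
+
+ ```typescript
+ const result = await g.transcribe(audioData);
+
+ // Realtime factor = seconds of audio processed per second of compute
+ const rtf = result.duration / (result.totalTime / 1000);
+ console.log(`${rtf.toFixed(1)}x realtime`);
+ ```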
+ ## Troubleshooting
+
+ ### "Unsupported bit depth"
+ Only 16-bit WAV files are supported. Convert with:
+ ```bash
+ ffmpeg -i audio.mp3 -ar 16000 -ac 1 -sample_fmt s16 output.wav
+ ```
+
+ ### Slow first transcription
+ The first call downloads the model (~40-250MB). Subsequent calls use the cached model.
+
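+ To avoid paying the download on a user's first request, warm the cache at startup with the documented `loadSTT`:
+
+ ```typescript
+ // Preload at startup instead of on the first transcribe()
+ await g.loadSTT("whisper-base.en", {
+   onProgress: (p) => console.log(p.status),
+ });
+ console.log(g.isSTTLoaded()); // true - later calls skip the download
+ ```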
+ ### Memory usage
+ Whisper models require roughly the following RAM (a cleanup sketch follows the list):
+ - tiny: ~150MB
+ - base: ~300MB
+ - small: ~600MB
+ - large-v3-turbo: ~1.5GB
+
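+ When that budget matters, the lower-level `WhisperSTT` class shown earlier can release the model explicitly once you are done; a minimal sketch:
+
+ ```typescript
+ import { WhisperSTT } from "@tryhamster/gerbil/core/stt";
+
+ const stt = new WhisperSTT("whisper-small.en");
+ await stt.load();
+ const { text } = await stt.transcribe(audioData);
+ console.log(text);
+
+ // Free the ~600MB held by the small model
+ stt.dispose();
+ ```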
+ ## useVoiceChat Hook (Full Voice Conversation)
+
+ For complete voice-to-voice conversations (STT → LLM → TTS):
+
+ ```tsx
+ import { useVoiceChat } from "@tryhamster/gerbil/browser";
+
+ function VoiceAssistant() {
+   const {
+     messages,
+     startListening,
+     stopListening,
+     isListening,
+     isSpeaking,
+     stage, // "idle" | "listening" | "transcribing" | "thinking" | "speaking"
+     isReady,
+   } = useVoiceChat({
+     llmModel: "qwen3-0.6b",
+     sttModel: "whisper-tiny.en",
+     voice: "af_bella",
+     system: "You are a helpful voice assistant.",
+   });
+
+   return (
+     <div>
+       {messages.map(m => <p key={m.id}>{m.role}: {m.content}</p>)}
+       <button
+         onMouseDown={startListening}
+         onMouseUp={stopListening}
+       >
+         {stage === "idle" ? "🎤 Hold to Speak" : stage}
+       </button>
+     </div>
+   );
+ }
+ ```
+
+ ## Adding Custom Models
+
+ Gerbil supports any ONNX model with the `transformers.js` tag on Hugging Face:
+
+ ```typescript
+ // Use any onnx-community model
+ await g.loadSTT("whisper-large-v3-turbo");
+
+ // Browse compatible models at:
+ // https://huggingface.co/models?library=transformers.js&pipeline_tag=automatic-speech-recognition
+ ```
+
+ To add a new model to the registry, edit `src/core/stt.ts`:
+
+ ```typescript
+ {
+   id: "my-custom-model",
+   repo: "onnx-community/my-model",
+   description: "My custom model",
+   size: "100M",
+   multilingual: true,
+   languages: ["en", "es"],
+   sampleRate: 16000,
+ }
+ ```
+
+ ## See Also
+
+ - [Text-to-Speech (TTS)](./tts.md) - Speech synthesis
+ - [Browser Hooks](./browser.md) - useChat, useVoiceInput, useVoiceChat
+