modular-voice-agent-sdk 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (125)
  1. package/README.md +102 -0
  2. package/USAGE.md +567 -0
  3. package/dist/backends/cloud/index.d.ts +7 -0
  4. package/dist/backends/cloud/index.d.ts.map +1 -0
  5. package/dist/backends/cloud/index.js +6 -0
  6. package/dist/backends/cloud/index.js.map +1 -0
  7. package/dist/backends/cloud/llm.d.ts +22 -0
  8. package/dist/backends/cloud/llm.d.ts.map +1 -0
  9. package/dist/backends/cloud/llm.js +234 -0
  10. package/dist/backends/cloud/llm.js.map +1 -0
  11. package/dist/backends/index.d.ts +2 -0
  12. package/dist/backends/index.d.ts.map +1 -0
  13. package/dist/backends/index.js +6 -0
  14. package/dist/backends/index.js.map +1 -0
  15. package/dist/backends/native/index.d.ts +5 -0
  16. package/dist/backends/native/index.d.ts.map +1 -0
  17. package/dist/backends/native/index.js +6 -0
  18. package/dist/backends/native/index.js.map +1 -0
  19. package/dist/backends/native/llm.d.ts +71 -0
  20. package/dist/backends/native/llm.d.ts.map +1 -0
  21. package/dist/backends/native/llm.js +435 -0
  22. package/dist/backends/native/llm.js.map +1 -0
  23. package/dist/backends/native/stt.d.ts +15 -0
  24. package/dist/backends/native/stt.d.ts.map +1 -0
  25. package/dist/backends/native/stt.js +94 -0
  26. package/dist/backends/native/stt.js.map +1 -0
  27. package/dist/backends/native/tts.d.ts +21 -0
  28. package/dist/backends/native/tts.d.ts.map +1 -0
  29. package/dist/backends/native/tts.js +105 -0
  30. package/dist/backends/native/tts.js.map +1 -0
  31. package/dist/backends/transformers/index.d.ts +4 -0
  32. package/dist/backends/transformers/index.d.ts.map +1 -0
  33. package/dist/backends/transformers/index.js +4 -0
  34. package/dist/backends/transformers/index.js.map +1 -0
  35. package/dist/backends/transformers/llm.d.ts +29 -0
  36. package/dist/backends/transformers/llm.d.ts.map +1 -0
  37. package/dist/backends/transformers/llm.js +117 -0
  38. package/dist/backends/transformers/llm.js.map +1 -0
  39. package/dist/backends/transformers/stt.d.ts +17 -0
  40. package/dist/backends/transformers/stt.d.ts.map +1 -0
  41. package/dist/backends/transformers/stt.js +43 -0
  42. package/dist/backends/transformers/stt.js.map +1 -0
  43. package/dist/backends/transformers/tts.d.ts +17 -0
  44. package/dist/backends/transformers/tts.d.ts.map +1 -0
  45. package/dist/backends/transformers/tts.js +40 -0
  46. package/dist/backends/transformers/tts.js.map +1 -0
  47. package/dist/cache.d.ts +37 -0
  48. package/dist/cache.d.ts.map +1 -0
  49. package/dist/cache.js +49 -0
  50. package/dist/cache.js.map +1 -0
  51. package/dist/cli.d.ts +11 -0
  52. package/dist/cli.d.ts.map +1 -0
  53. package/dist/cli.js +392 -0
  54. package/dist/cli.js.map +1 -0
  55. package/dist/client/audio-player.d.ts +45 -0
  56. package/dist/client/audio-player.d.ts.map +1 -0
  57. package/dist/client/audio-player.js +90 -0
  58. package/dist/client/audio-player.js.map +1 -0
  59. package/dist/client/audio-recorder.d.ts +42 -0
  60. package/dist/client/audio-recorder.d.ts.map +1 -0
  61. package/dist/client/audio-recorder.js +128 -0
  62. package/dist/client/audio-recorder.js.map +1 -0
  63. package/dist/client/index.d.ts +34 -0
  64. package/dist/client/index.d.ts.map +1 -0
  65. package/dist/client/index.js +33 -0
  66. package/dist/client/index.js.map +1 -0
  67. package/dist/client/protocol.d.ts +80 -0
  68. package/dist/client/protocol.d.ts.map +1 -0
  69. package/dist/client/protocol.js +29 -0
  70. package/dist/client/protocol.js.map +1 -0
  71. package/dist/client/voice-client.d.ts +249 -0
  72. package/dist/client/voice-client.d.ts.map +1 -0
  73. package/dist/client/voice-client.js +826 -0
  74. package/dist/client/voice-client.js.map +1 -0
  75. package/dist/client/web-speech-stt.d.ts +65 -0
  76. package/dist/client/web-speech-stt.d.ts.map +1 -0
  77. package/dist/client/web-speech-stt.js +122 -0
  78. package/dist/client/web-speech-stt.js.map +1 -0
  79. package/dist/client/web-speech-tts.d.ts +59 -0
  80. package/dist/client/web-speech-tts.d.ts.map +1 -0
  81. package/dist/client/web-speech-tts.js +145 -0
  82. package/dist/client/web-speech-tts.js.map +1 -0
  83. package/dist/index.d.ts +10 -0
  84. package/dist/index.d.ts.map +1 -0
  85. package/dist/index.js +13 -0
  86. package/dist/index.js.map +1 -0
  87. package/dist/server/encoding.d.ts +18 -0
  88. package/dist/server/encoding.d.ts.map +1 -0
  89. package/dist/server/encoding.js +41 -0
  90. package/dist/server/encoding.js.map +1 -0
  91. package/dist/server/handler.d.ts +86 -0
  92. package/dist/server/handler.d.ts.map +1 -0
  93. package/dist/server/handler.js +224 -0
  94. package/dist/server/handler.js.map +1 -0
  95. package/dist/server/index.d.ts +31 -0
  96. package/dist/server/index.d.ts.map +1 -0
  97. package/dist/server/index.js +32 -0
  98. package/dist/server/index.js.map +1 -0
  99. package/dist/services/function-service.d.ts +17 -0
  100. package/dist/services/function-service.d.ts.map +1 -0
  101. package/dist/services/function-service.js +82 -0
  102. package/dist/services/function-service.js.map +1 -0
  103. package/dist/services/index.d.ts +4 -0
  104. package/dist/services/index.d.ts.map +1 -0
  105. package/dist/services/index.js +3 -0
  106. package/dist/services/index.js.map +1 -0
  107. package/dist/services/llm-logger.d.ts +136 -0
  108. package/dist/services/llm-logger.d.ts.map +1 -0
  109. package/dist/services/llm-logger.js +275 -0
  110. package/dist/services/llm-logger.js.map +1 -0
  111. package/dist/services/text-normalizer.d.ts +17 -0
  112. package/dist/services/text-normalizer.d.ts.map +1 -0
  113. package/dist/services/text-normalizer.js +100 -0
  114. package/dist/services/text-normalizer.js.map +1 -0
  115. package/dist/types.d.ts +195 -0
  116. package/dist/types.d.ts.map +1 -0
  117. package/dist/types.js +48 -0
  118. package/dist/types.js.map +1 -0
  119. package/dist/voice-pipeline.d.ts +125 -0
  120. package/dist/voice-pipeline.d.ts.map +1 -0
  121. package/dist/voice-pipeline.js +390 -0
  122. package/dist/voice-pipeline.js.map +1 -0
  123. package/package.json +96 -0
  124. package/scripts/setup-binaries.sh +159 -0
  125. package/scripts/setup.sh +201 -0
package/README.md ADDED
@@ -0,0 +1,102 @@
+ # Modular Voice Agent SDK (MVAS)
+
+ Build voice assistants without the plumbing. One SDK, any backend, same interface.
+
+ Voice apps need either an expensive multimodal model or a pipeline of three pieces: speech-to-text, an LLM, and text-to-speech. Wiring the pipeline together means audio-capture code, streaming logic, and WebSocket boilerplate. This library handles all of that.
+
+ ```typescript
+ import { createVoiceClient, WebSpeechSTT, WebSpeechTTS } from 'modular-voice-agent-sdk/client';
+ import { TransformersLLM } from 'modular-voice-agent-sdk';
+
+ const client = createVoiceClient({
+   stt: new WebSpeechSTT(),
+   llm: new TransformersLLM({ model: 'HuggingFaceTB/SmolLM2-360M-Instruct' }),
+   tts: new WebSpeechTTS(),
+   systemPrompt: 'You are a helpful assistant.',
+ });
+
+ await client.connect();
+ button.onmousedown = () => client.startRecording();
+ button.onmouseup = () => client.stopRecording();
+ ```
+
+ That's a working voice assistant. No server required. See [`examples/example-0-bare-bones`](./examples/example-0-bare-bones/index.html) for the complete 30-line HTML version.
+
+ ## Mix and Match
+
+ Each component can run in the browser, on a server, or in the cloud. Pick any from each column — they all work together with the same API.
+
+ ```
+ ┌───────────────────────────┐       ┌───────────────────────────┐       ┌───────────────────────────┐
+ │                           │       │                           │       │                           │
+ │            STT            │ ────► │            LLM            │ ────► │            TTS            │
+ │                           │       │                           │       │                           │
+ └───────────────────────────┘       └───────────────────────────┘       └───────────────────────────┘
+
+  🌐 Browser speech recognition      N/A                              🌐 Browser speech synthesis
+     (Web Speech API)                                                    (Web Speech API)
+
+  🌐 Browser JS transcriber          🌐 Browser JS LLM                🌐 Browser JS synthesis
+     (Transformers.js, WebGPU)          (Transformers.js, WebGPU)        (Transformers.js, WebGPU)
+
+  🖥️ Server JS transcriber           🖥️ Server JS LLM                 🖥️ Server JS synthesis
+     (Transformers.js, Node.js)         (Transformers.js, Node.js)       (Transformers.js, Node.js)
+
+  🖥️ Server binary transcriber       🖥️ Server binary LLM             🖥️ Server binary synthesis
+     (whisper.cpp)                      (llama.cpp)                      (sherpa-onnx)
+
+  N/A                                ☁️ Cloud LLM                     N/A
+                                        (OpenAI, Ollama, vLLM)
+ ```
+
+ Want browser speech recognition + a cloud LLM + browser speech synthesis? Done. Want everything running locally on your server with native binaries? Also done. Same code structure, same events, different backends.
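+
+ For example, the first combination needs only this on the client (a minimal sketch; the server side would run a `CloudLLM` pipeline as described in `USAGE.md`, and the endpoint here is assumed):
+
+ ```typescript
+ import { createVoiceClient, WebSpeechSTT, WebSpeechTTS } from 'modular-voice-agent-sdk/client';
+
+ // STT and TTS stay in the browser; llm: null defers to the server's cloud LLM.
+ const client = createVoiceClient({
+   stt: new WebSpeechSTT(),
+   llm: null,
+   tts: new WebSpeechTTS(),
+   serverUrl: 'ws://localhost:3100', // assumed local pipeline server
+ });
+ ```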
+
+ ## Features
+
+ - **Streaming** — responses stream token-by-token to TTS
+ - **Function calling** — tools work across all LLM backends (cloud, native, transformers)
+ - **Conversation history** — automatic context management
+ - **Hybrid configs** — mix browser and server components freely
+
+ See [`USAGE.md`](./USAGE.md) for full API documentation.
+
+ ## Examples
+
+ See [`examples/`](./examples/) for 10 interactive demos covering all configurations.
+
+ ```bash
+ cd examples
+ npm install
+ npm run example0  # or example1, example2, etc.
+ ```
+
+ ## Install
+
+ ```bash
+ npm install modular-voice-agent-sdk
+ ```
+
+ For native backends (whisper.cpp, llama.cpp, sherpa-onnx):
+
+ ```bash
+ # macOS
+ brew install whisper-cpp llama.cpp
+
+ # Download models
+ npx mvas setup
+ ```
+
+ For cloud LLMs:
+
+ ```bash
+ # OpenAI
+ export OPENAI_API_KEY=sk-your-key-here
+
+ # Or Ollama (local, no API key)
+ brew install ollama && ollama pull llama3.2
+ ```
+
+ ## License
+
+ MIT
package/USAGE.md ADDED
@@ -0,0 +1,567 @@
+ # Usage Guide
+
+ Detailed documentation for modular-voice-agent-sdk. For a quick overview, see [README.md](./README.md).
+
+ ## Two Ways to Use
+
+ **Option 1: Browser Only**
+
+ Create a `VoiceClient` with all components running locally in the browser. No server needed.
+
+ ```typescript
+ import { createVoiceClient, WebSpeechSTT, WebSpeechTTS } from 'modular-voice-agent-sdk/client';
+ import { TransformersLLM } from 'modular-voice-agent-sdk';
+
+ const client = createVoiceClient({
+   stt: new WebSpeechSTT(),
+   llm: new TransformersLLM({ model: '...' }),
+   tts: new WebSpeechTTS(),
+   systemPrompt: '...',
+ });
+ ```
+
+ **Option 2: Browser + Server**
+
+ Create a `VoiceClient` in the browser and a `VoicePipeline` on the server. Connect them via WebSocket. Set any component to `null` on the client to have the server handle it.
+
+ ```typescript
+ // Browser
+ const client = createVoiceClient({
+   stt: null,  // server handles
+   llm: null,  // server handles
+   tts: null,  // server handles
+   serverUrl: 'ws://localhost:3100',
+ });
+
+ // Server
+ const pipeline = new VoicePipeline({
+   stt: new NativeSTT({ model: 'base.en' }),
+   llm: new CloudLLM({ model: 'gpt-4o', ... }),
+   tts: new NativeTTS({ model: 'en_US-amy-medium' }),
+   systemPrompt: '...',
+ });
+ ```
+
+ You can mix and match — run STT and TTS in the browser while the server handles just the LLM, or any other combination.
+
+ ---
+
+ ## Table of Contents
+
+ - [VoiceClient (Browser)](#voiceclient-browser)
+ - [VoicePipeline (Server)](#voicepipeline-server)
+ - [Configuration Examples](#configuration-examples)
+ - [Tools (Function Calling)](#tools-function-calling)
+ - [Backend Reference](#backend-reference)
+ - [Exports](#exports)
+
+ ## VoiceClient (Browser)
+
+ The unified browser SDK for voice interactions.
+
+ ### Creating a Client
+
+ ```typescript
+ const client = createVoiceClient({
+   // STT options: browser speech API, browser/server JS, or server handles
+   stt: WebSpeechSTT | TransformersSTT | null,
+
+   // LLM options: browser/server JS, or server handles
+   llm: TransformersLLM | null,
+
+   // TTS options: browser speech API, browser/server JS, or server handles
+   tts: WebSpeechTTS | TransformersTTS | null,
+
+   // Required if any component is null
+   serverUrl: 'ws://localhost:3100',
+
+   // Required if llm is provided locally
+   systemPrompt: 'You are a helpful assistant.',
+
+   // Optional
+   sampleRate: 16000,
+   autoReconnect: true,
+   reconnectDelay: 2000,
+ });
+ ```
+
+ ### Events
+
+ ```typescript
+ client.on('status', (status) => {
+   // 'disconnected' | 'connecting' | 'initializing' | 'ready' | 'listening' | 'processing' | 'speaking'
+ });
+
+ client.on('transcript', (text) => {
+   // User's transcribed speech
+ });
+
+ client.on('responseChunk', (chunk) => {
+   // Streaming LLM token
+ });
+
+ client.on('responseComplete', (fullText) => {
+   // Complete LLM response
+ });
+
+ client.on('progress', ({ status, file, progress }) => {
+   // Model loading progress (for Transformers.js backends)
+ });
+
+ client.on('error', (err) => {
+   // Error object
+ });
+ ```
+
+ ### Methods
+
+ ```typescript
+ await client.connect();        // Initialize local components + connect to server
+ await client.startRecording(); // Start listening
+ await client.stopRecording();  // Stop and process
+ client.clearHistory();         // Reset conversation
+ client.getMode();              // 'local' | 'remote' | 'hybrid'
+ client.getLocalComponents();   // { stt: boolean, llm: boolean, tts: boolean }
+ client.isReady();
+ client.isRecording();
+ client.disconnect();
+ await client.dispose();
+ ```
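+
+ Putting the events and methods together, a minimal push-to-talk loop looks like this (a sketch assuming a `button` element and a client configured as in Option 1 above):
+
+ ```typescript
+ await client.connect();
+
+ client.on('transcript', (text) => console.log('You said:', text));
+ client.on('responseComplete', (fullText) => console.log('Assistant:', fullText));
+ client.on('error', (err) => console.error(err));
+
+ // Hold to talk: record while the button is pressed, process on release.
+ button.onmousedown = () => client.startRecording();
+ button.onmouseup = () => client.stopRecording();
+ ```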
+
+ ## VoicePipeline (Server)
+
+ The server-side pipeline that processes audio or text through STT → LLM → TTS.
+
+ ### Creating a Pipeline
+
+ ```typescript
+ const pipeline = new VoicePipeline({
+   // STT options: JS or native binary, or null if the client handles it
+   stt: TransformersSTT | NativeSTT | null,
+
+   // LLM options: JS, native binary, or cloud API (required)
+   llm: TransformersLLM | NativeLLM | CloudLLM,
+
+   // TTS options: JS or native binary, or null if the client handles it
+   tts: TransformersTTS | NativeTTS | null,
+
+   systemPrompt: string,
+   tools?: Tool[], // optional function calling
+ });
+
+ await pipeline.initialize();
+ ```
+
+ ### Processing
+
+ ```typescript
+ // Process audio (requires STT)
+ await pipeline.processAudio(audioFloat32Array, {
+   onTranscript: (text) => {},
+   onResponseChunk: (chunk) => {},
+   onAudio: (playable) => {},
+   onComplete: () => {},
+   onError: (err) => {},
+ });
+
+ // Process text (when the client does STT)
+ await pipeline.processText('Hello', callbacks);
+ ```
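+
+ In Node.js, the `audioFloat32Array` argument is just mono PCM samples. A sketch of feeding a WAV file through the pipeline (this assumes a canonical 44-byte header and 16 kHz mono 16-bit PCM; a real parser should read the header rather than skip it):
+
+ ```typescript
+ import { readFileSync } from 'node:fs';
+
+ const wav = readFileSync('input.wav');
+ // Copy past the assumed 44-byte header, then scale 16-bit PCM to [-1, 1).
+ const pcm = new Int16Array(wav.buffer.slice(wav.byteOffset + 44, wav.byteOffset + wav.byteLength));
+ const audio = Float32Array.from(pcm, (s) => s / 32768);
+
+ await pipeline.processAudio(audio, {
+   onTranscript: (text) => console.log('heard:', text),
+   onResponseChunk: (chunk) => process.stdout.write(chunk),
+ });
+ ```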
+
+ ### Utility Methods
+
+ ```typescript
+ pipeline.hasSTT();       // boolean
+ pipeline.hasTTS();       // boolean
+ pipeline.clearHistory(); // Reset conversation context
+ ```
+
+ ### WebSocket Handler
+
+ For integrating with any WebSocket server:
+
+ ```typescript
+ import { WebSocketServer } from 'ws';
+ import { createPipelineHandler } from 'modular-voice-agent-sdk/server';
+
+ const handler = createPipelineHandler(pipeline);
+ const wss = new WebSocketServer({ port: 3100 });
+
+ wss.on('connection', (ws) => {
+   const session = handler.createSession();
+   ws.on('message', async (data) => {
+     for await (const msg of session.handle(JSON.parse(data.toString()))) {
+       ws.send(JSON.stringify(msg));
+     }
+   });
+   ws.on('close', () => session.destroy());
+ });
+ ```
+
+ ## Configuration Examples
+
+ ### Fully Local (No Server)
+
+ Everything runs in the browser:
+
+ ```typescript
+ import { createVoiceClient, WebSpeechSTT, WebSpeechTTS } from 'modular-voice-agent-sdk/client';
+ import { TransformersLLM } from 'modular-voice-agent-sdk';
+
+ const client = createVoiceClient({
+   stt: new WebSpeechSTT({ language: 'en-US' }),
+   llm: new TransformersLLM({
+     model: 'HuggingFaceTB/SmolLM2-360M-Instruct',
+     dtype: 'q4',
+     maxNewTokens: 140,
+     device: 'webgpu',
+   }),
+   tts: new WebSpeechTTS({ voiceName: 'Samantha' }),
+   systemPrompt: 'You are a helpful voice assistant.',
+ });
+ ```
+
+ ### Fully Remote (Server)
+
+ Client sends audio, server handles everything:
+
+ **Client:**
+ ```typescript
+ const client = createVoiceClient({
+   stt: null,
+   llm: null,
+   tts: null,
+   serverUrl: 'ws://localhost:3100',
+ });
+ ```
+
+ **Server:**
+ ```typescript
+ const pipeline = new VoicePipeline({
+   stt: new TransformersSTT({ model: 'Xenova/whisper-small', dtype: 'q8' }),
+   llm: new TransformersLLM({ model: 'HuggingFaceTB/SmolLM2-1.7B-Instruct', dtype: 'q4' }),
+   tts: new TransformersTTS({ model: 'Xenova/speecht5_tts', dtype: 'fp16', speakerEmbeddings: '...' }),
+   systemPrompt: 'You are a helpful voice assistant.',
+ });
+ ```
+
+ ### Hybrid (Browser STT/TTS + Server LLM)
+
+ **Client:**
+ ```typescript
+ const client = createVoiceClient({
+   stt: new WebSpeechSTT({ language: 'en-US' }),
+   llm: null,
+   tts: new WebSpeechTTS({ voiceName: 'Samantha' }),
+   serverUrl: 'ws://localhost:3100',
+ });
+ ```
+
+ **Server:**
+ ```typescript
+ const pipeline = new VoicePipeline({
+   stt: null,
+   llm: new NativeLLM({ model: 'llama-3.2-1b-instruct-q4_k_m.gguf' }),
+   tts: null,
+   systemPrompt: 'You are a helpful voice assistant.',
+ });
+ ```
+
+ ### Cloud LLM
+
+ **Server:**
+ ```typescript
+ import { CloudLLM } from 'modular-voice-agent-sdk/cloud';
+
+ const pipeline = new VoicePipeline({
+   stt: new NativeSTT({ model: 'base.en' }),
+   llm: new CloudLLM({
+     baseUrl: 'https://api.openai.com/v1',
+     apiKey: process.env.OPENAI_API_KEY,
+     model: 'gpt-4o',
+     maxTokens: 256,
+   }),
+   tts: new NativeTTS({ model: 'en_US-amy-medium' }),
+   systemPrompt: 'You are a helpful voice assistant.',
+ });
+ ```
+
+ Works with **OpenAI**, **Ollama** (`http://localhost:11434/v1`), **vLLM**, **LMStudio**, and any OpenAI-compatible endpoint.
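+
+ For example, pointing the same pipeline at a local Ollama server only changes the `CloudLLM` config (a sketch; the `apiKey` value is a placeholder, which we assume local OpenAI-compatible servers ignore):
+
+ ```typescript
+ import { CloudLLM } from 'modular-voice-agent-sdk/cloud';
+
+ const llm = new CloudLLM({
+   baseUrl: 'http://localhost:11434/v1', // Ollama's OpenAI-compatible endpoint
+   apiKey: 'ollama',                     // placeholder; assumed ignored by local servers
+   model: 'llama3.2',
+   maxTokens: 256,
+ });
+ ```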
+
+ ## Tools (Function Calling)
+
+ Give your voice assistant the ability to take actions — check the weather, control smart home devices, query databases, or call any API.
+
+ ### Defining a Tool
+
+ ```typescript
+ import { Tool } from 'modular-voice-agent-sdk';
+
+ const getWeather: Tool = {
+   name: 'get_weather',
+   description: 'Get the current weather for a location',
+   parameters: {
+     type: 'object',
+     properties: {
+       location: {
+         type: 'string',
+         description: 'City name, e.g., "San Francisco"',
+       },
+     },
+     required: ['location'],
+   },
+   execute: async (args) => {
+     const location = args.location as string;
+     // Call your weather API here
+     return { location, temperature: '72°F', condition: 'sunny' };
+   },
+ };
+ ```
+
+ ### Using Tools
+
+ ```typescript
+ const pipeline = new VoicePipeline({
+   llm: new CloudLLM({ ... }),
+   systemPrompt: 'You are a helpful assistant.',
+   tools: [getWeather, getTime, rollDice],
+ });
+ ```
+
+ ### Tool Events
+
+ ```typescript
+ await pipeline.processText('What\'s the weather in Tokyo?', {
+   onToolCall: (call) => console.log(`Calling ${call.name}...`),
+   onToolResult: (id, result) => console.log('Result:', result),
+   onResponseChunk: (chunk) => console.log(chunk),
+ });
+ ```
+
+ ### Backend Support
+
+ All LLM backends support tools with the same API:
+
+ | Backend | How Tools Work |
+ |---------|----------------|
+ | `CloudLLM` | Native OpenAI function calling API |
+ | `NativeLLM` | GBNF grammar constraint — guarantees valid JSON tool calls |
+ | `TransformersLLM` | Prompt injection (instructions added to system prompt) |
+
+ You don't need to do anything different — just pass `tools` and the backend handles it.
+
+ ### Complete Example
+
+ ```typescript
+ import { VoicePipeline, Tool } from 'modular-voice-agent-sdk';
+ import { CloudLLM } from 'modular-voice-agent-sdk/cloud';
+
+ const tools: Tool[] = [
+   {
+     name: 'get_current_time',
+     description: 'Get the current date and time',
+     parameters: { type: 'object', properties: {} },
+     execute: async () => ({
+       time: new Date().toLocaleTimeString(),
+       date: new Date().toLocaleDateString(),
+     }),
+   },
+   {
+     name: 'roll_dice',
+     description: 'Roll dice, e.g., "2d6" for two six-sided dice',
+     parameters: {
+       type: 'object',
+       properties: {
+         notation: { type: 'string', description: 'Dice notation like "2d6"' },
+       },
+       required: ['notation'],
+     },
+     execute: async (args) => {
+       const [num, sides] = (args.notation as string).split('d').map(Number);
+       const rolls = Array.from({ length: num }, () => Math.floor(Math.random() * sides) + 1);
+       return { rolls, total: rolls.reduce((a, b) => a + b, 0) };
+     },
+   },
+ ];
+
+ const pipeline = new VoicePipeline({
+   llm: new CloudLLM({
+     baseUrl: 'https://api.openai.com/v1',
+     apiKey: process.env.OPENAI_API_KEY,
+     model: 'gpt-4o',
+   }),
+   systemPrompt: 'You are a helpful assistant. Use tools when needed.',
+   tools,
+ });
+ ```
+
+ See `examples/example-5` and `examples/example-8` for full working examples with tools.
+
+ ## Backend Reference
+
+ ### WebSpeechSTT
+
+ Browser Speech Recognition API. Zero setup, works in Chrome/Edge/Safari.
+
+ ```typescript
+ new WebSpeechSTT({
+   language: 'en-US',      // BCP-47 language code
+   continuous: false,      // Keep listening after speech ends
+   interimResults: false,  // Emit partial results
+ })
+ ```
+
+ ### WebSpeechTTS
+
+ Browser Speech Synthesis API. Zero setup, uses system voices.
+
+ ```typescript
+ new WebSpeechTTS({
+   voiceName: 'Samantha',  // System voice name (optional)
+   lang: 'en-US',          // Language code
+   rate: 1.0,              // Speech rate (0.1 - 10)
+   pitch: 1.0,             // Pitch (0 - 2)
+ })
+ ```
+
+ ### TransformersSTT
+
+ Whisper models via Transformers.js. Runs in browser (WebGPU) or Node.js.
+
+ ```typescript
+ new TransformersSTT({
+   model: 'Xenova/whisper-small',
+   dtype: 'q8',       // 'fp32' | 'fp16' | 'q8' | 'q4'
+   device: 'webgpu',  // 'webgpu' or 'cpu' in the browser; 'cpu' in Node.js
+ })
+ ```
+
+ ### TransformersLLM
+
+ Any Hugging Face text-generation model via Transformers.js.
+
+ ```typescript
+ new TransformersLLM({
+   model: 'HuggingFaceTB/SmolLM2-360M-Instruct',
+   dtype: 'q4',
+   device: 'webgpu',
+   maxNewTokens: 140,
+ })
+ ```
+
+ ### TransformersTTS
+
+ SpeechT5 via Transformers.js.
+
+ ```typescript
+ new TransformersTTS({
+   model: 'Xenova/speecht5_tts',
+   dtype: 'fp16',
+   speakerEmbeddings: '...',  // URL or path to speaker embeddings
+ })
+ ```
+
+ ### NativeSTT
+
+ whisper.cpp binary. Server-only, fast.
+
+ ```typescript
+ new NativeSTT({
+   model: 'base.en',  // Whisper model name
+   modelPath: '...',  // Optional: custom path to .bin file
+ })
+ ```
+
+ ### NativeLLM
+
+ llama.cpp binary. Server-only, fast.
+
+ ```typescript
+ new NativeLLM({
+   model: 'llama-3.2-1b-instruct-q4_k_m.gguf',
+   modelPath: '...',  // Optional: custom path
+   contextSize: 2048,
+   temperature: 0.7,
+ })
+ ```
+
+ ### NativeTTS
+
+ sherpa-onnx binary. Server-only.
+
+ ```typescript
+ new NativeTTS({
+   model: 'en_US-amy-medium',
+   modelPath: '...',  // Optional: custom path
+ })
+ ```
+
+ ### CloudLLM
+
+ Any OpenAI-compatible API.
+
+ ```typescript
+ new CloudLLM({
+   baseUrl: 'https://api.openai.com/v1',
+   apiKey: process.env.OPENAI_API_KEY,
+   model: 'gpt-4o',
+   maxTokens: 256,
+   temperature: 0.7,
+ })
+ ```
+
+ **Compatible services:**
+ - OpenAI (`https://api.openai.com/v1`)
+ - Ollama (`http://localhost:11434/v1`)
+ - vLLM (`http://localhost:8000/v1`)
+ - LMStudio (`http://localhost:1234/v1`)
+ - Any OpenAI-compatible endpoint
+
+ ## Capability Detection
+
+ The server supports automatic capability detection for scenarios where one server must handle multiple client types (e.g., during rolling deployments).
+
+ ```typescript
+ const pipeline = new VoicePipeline({
+   stt: new NativeSTT({ ... }),  // used if the client sends audio
+   llm: new NativeLLM({ ... }),
+   tts: new NativeTTS({ ... }),  // skipped if the client has local TTS
+   systemPrompt: '...',
+ });
+ ```
+
+ When a client connects, it announces its capabilities. The server then:
+ - Skips STT if the client sends text instead of audio
+ - Skips TTS if the client handles speech synthesis locally
+
+ This is useful for zero-downtime upgrades where old and new clients coexist, but in most cases you should simply configure the server with exactly what it needs (using `null` for components the client handles).
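+
+ For example, a client that keeps synthesis local while sending audio to the fully-equipped server above (a sketch; we assume the capability announcement happens during `connect()`):
+
+ ```typescript
+ const client = createVoiceClient({
+   stt: null,                // audio goes to the server's NativeSTT
+   llm: null,                // the server's NativeLLM responds
+   tts: new WebSpeechTTS(),  // local synthesis, so the server skips its NativeTTS
+   serverUrl: 'ws://localhost:3100',
+ });
+ await client.connect(); // capabilities are announced on connection (assumed)
+ ```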
+
+ ## Exports
+
+ ```typescript
+ // Main library - pipeline + Transformers.js backends
+ import { VoicePipeline, TransformersSTT, TransformersLLM, TransformersTTS } from 'modular-voice-agent-sdk';
+
+ // Client SDK - unified browser interface
+ import {
+   createVoiceClient,
+   VoiceClient,
+   WebSpeechSTT,
+   WebSpeechTTS
+ } from 'modular-voice-agent-sdk/client';
+
+ // Server utilities - WebSocket handler
+ import { createPipelineHandler } from 'modular-voice-agent-sdk/server';
+
+ // Native backends (server-only)
+ import {
+   NativeSTT,
+   NativeLLM,
+   NativeTTS,
+   defaultPaths,
+   getCacheDir
+ } from 'modular-voice-agent-sdk/native';
+
+ // Cloud backends (server-only)
+ import { CloudLLM } from 'modular-voice-agent-sdk/cloud';
+ ```
package/dist/backends/cloud/index.d.ts ADDED
@@ -0,0 +1,7 @@
+ /**
+  * Cloud Backend - OpenAI-compatible API
+  * Works with: OpenAI, Ollama, vLLM, LMStudio, and any OpenAI-compatible endpoint
+  */
+ export { CloudLLM } from './llm';
+ export type { CloudLLMConfig } from '../../types';
+ //# sourceMappingURL=index.d.ts.map
package/dist/backends/cloud/index.d.ts.map ADDED
@@ -0,0 +1 @@
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/backends/cloud/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,EAAE,QAAQ,EAAE,MAAM,OAAO,CAAC;AACjC,YAAY,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC"}
package/dist/backends/cloud/index.js ADDED
@@ -0,0 +1,6 @@
+ /**
+  * Cloud Backend - OpenAI-compatible API
+  * Works with: OpenAI, Ollama, vLLM, LMStudio, and any OpenAI-compatible endpoint
+  */
+ export { CloudLLM } from './llm';
+ //# sourceMappingURL=index.js.map
package/dist/backends/cloud/index.js.map ADDED
@@ -0,0 +1 @@
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/backends/cloud/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,EAAE,QAAQ,EAAE,MAAM,OAAO,CAAC"}