modular-voice-agent-sdk 1.0.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. package/README.md +5 -2
  2. package/dist/backends/cloud/audio-llm.d.ts +72 -0
  3. package/dist/backends/cloud/audio-llm.d.ts.map +1 -0
  4. package/dist/backends/cloud/audio-llm.js +366 -0
  5. package/dist/backends/cloud/audio-llm.js.map +1 -0
  6. package/dist/backends/cloud/index.d.ts +2 -0
  7. package/dist/backends/cloud/index.d.ts.map +1 -1
  8. package/dist/backends/cloud/index.js +2 -0
  9. package/dist/backends/cloud/index.js.map +1 -1
  10. package/dist/backends/cloud/llm.d.ts.map +1 -1
  11. package/dist/backends/cloud/llm.js +31 -18
  12. package/dist/backends/cloud/llm.js.map +1 -1
  13. package/dist/backends/native/audio-llm.d.ts +126 -0
  14. package/dist/backends/native/audio-llm.d.ts.map +1 -0
  15. package/dist/backends/native/audio-llm.js +680 -0
  16. package/dist/backends/native/audio-llm.js.map +1 -0
  17. package/dist/backends/native/llm.d.ts.map +1 -1
  18. package/dist/backends/native/llm.js +5 -7
  19. package/dist/backends/native/llm.js.map +1 -1
  20. package/dist/backends/native/stt.d.ts +2 -2
  21. package/dist/backends/native/stt.d.ts.map +1 -1
  22. package/dist/backends/native/stt.js +1 -1
  23. package/dist/backends/native/stt.js.map +1 -1
  24. package/dist/backends/transformers/llm.d.ts.map +1 -1
  25. package/dist/backends/transformers/llm.js +13 -10
  26. package/dist/backends/transformers/llm.js.map +1 -1
  27. package/dist/backends/transformers/stt.d.ts +2 -2
  28. package/dist/backends/transformers/stt.d.ts.map +1 -1
  29. package/dist/backends/transformers/stt.js +12 -7
  30. package/dist/backends/transformers/stt.js.map +1 -1
  31. package/dist/backends/transformers/tts.d.ts.map +1 -1
  32. package/dist/backends/transformers/tts.js +11 -6
  33. package/dist/backends/transformers/tts.js.map +1 -1
  34. package/dist/cache.d.ts +19 -0
  35. package/dist/cache.d.ts.map +1 -1
  36. package/dist/cache.js +39 -0
  37. package/dist/cache.js.map +1 -1
  38. package/dist/cli.js +47 -7
  39. package/dist/cli.js.map +1 -1
  40. package/dist/client/voice-client.d.ts +4 -2
  41. package/dist/client/voice-client.d.ts.map +1 -1
  42. package/dist/client/voice-client.js +15 -13
  43. package/dist/client/voice-client.js.map +1 -1
  44. package/dist/client/web-speech-stt.d.ts +12 -1
  45. package/dist/client/web-speech-stt.d.ts.map +1 -1
  46. package/dist/client/web-speech-stt.js +49 -4
  47. package/dist/client/web-speech-stt.js.map +1 -1
  48. package/dist/server/handler.d.ts +12 -7
  49. package/dist/server/handler.d.ts.map +1 -1
  50. package/dist/server/handler.js +20 -20
  51. package/dist/server/handler.js.map +1 -1
  52. package/dist/services/llm-logger.d.ts +7 -18
  53. package/dist/services/llm-logger.d.ts.map +1 -1
  54. package/dist/services/llm-logger.js +22 -41
  55. package/dist/services/llm-logger.js.map +1 -1
  56. package/dist/types.d.ts +27 -5
  57. package/dist/types.d.ts.map +1 -1
  58. package/dist/voice-pipeline.d.ts +48 -10
  59. package/dist/voice-pipeline.d.ts.map +1 -1
  60. package/dist/voice-pipeline.js +138 -40
  61. package/dist/voice-pipeline.js.map +1 -1
  62. package/package.json +1 -1
  63. package/scripts/setup.sh +23 -0
  64. package/USAGE.md +0 -567
package/dist/backends/native/audio-llm.js
@@ -0,0 +1,680 @@
+ /**
+  * NativeAudioLLM - Native Multimodal Audio LLM Backend
+  *
+  * Implements BOTH STTPipeline and LLMPipeline interfaces.
+  * Register the same instance as both `stt` and `llm` in VoicePipeline.
+  *
+  * Uses internal caching to achieve single binary invocation:
+  * 1. transcribe(audio) → calls audio LLM binary, caches response, returns transcript
+  * 2. generate(messages) → returns cached response (no second invocation)
+  *
+  * The model is prompted to return both transcription and response in a structured format.
+  *
+  * Works with: Qwen2-Audio, and other audio-capable models via llama.cpp or similar.
+  */
+ import { spawn } from 'child_process';
+ import { existsSync, writeFileSync, unlinkSync } from 'fs';
+ import { tmpdir } from 'os';
+ import { join } from 'path';
+ import { LLMLogger, LLMConversationTracker } from '../../services';
+ export class NativeAudioLLM {
+     config;
+     ready = false;
+     tracker;
+     // Cache for single-call optimization
+     cachedResponse = null;
+     // Store conversation context for audio processing
+     pendingMessages = [];
+     pendingOptions;
+     constructor(config) {
+         this.config = {
+             sampleRate: 16000,
+             ...config,
+         };
+         this.tracker = new LLMConversationTracker(new LLMLogger());
+     }
+     async initialize(_onProgress) {
+         console.log('Initializing NativeAudioLLM...');
+         if (!existsSync(this.config.binaryPath)) {
+             throw new Error(`Audio LLM binary not found at: ${this.config.binaryPath}`);
+         }
+         if (!existsSync(this.config.modelPath)) {
+             throw new Error(`Audio LLM model not found at: ${this.config.modelPath}`);
+         }
+         if (this.config.projectorPath && !existsSync(this.config.projectorPath)) {
+             throw new Error(`Audio projector model not found at: ${this.config.projectorPath}`);
+         }
+         this.ready = true;
+         console.log(`NativeAudioLLM ready. Model: ${this.config.modelPath}`);
+     }
+     isReady() {
+         return this.ready;
+     }
+     supportsTools() {
+         return true;
+     }
+     // ============================================================
+     // STTPipeline Implementation
+     // ============================================================
+     /**
+      * Transcribe audio by calling the audio LLM binary.
+      * Caches the full response for the subsequent generate() call.
+      */
+     async transcribe(audio) {
+         const audioHash = this.hashAudio(audio);
+         // Check if we already processed this audio
+         if (this.cachedResponse?.audioHash === audioHash) {
+             return this.cachedResponse.transcript;
+         }
+         // Build messages with system prompt instructing structured output
+         const messages = this.pendingMessages.length > 0 ? this.pendingMessages : [];
+         // Process audio with native binary
+         const { transcript, result } = await this.processAudioWithBinary(audio, messages, this.pendingOptions);
+         // Cache for subsequent generate() call
+         this.cachedResponse = { transcript, result, audioHash };
+         return transcript;
+     }
+     // ============================================================
+     // LLMPipeline Implementation
+     // ============================================================
+     /**
+      * Generate response. If called after transcribe() with matching context,
+      * returns cached response (single binary invocation total).
+      */
+     async generate(messages, options) {
+         // Store context for potential audio processing
+         this.pendingMessages = messages;
+         this.pendingOptions = options;
+         // Check if we have a cached response from transcribe()
+         if (this.cachedResponse) {
+             const cached = this.cachedResponse;
+             this.cachedResponse = null; // Clear cache
+             // Stream the cached response if callback provided
+             if (options?.onToken && cached.result.content) {
+                 for (const char of cached.result.content) {
+                     options.onToken(char);
+                 }
+             }
+             // Log the cached response
+             this.tracker.logOutput(options?.conversationId ?? 'default', cached.result.content, cached.result.toolCalls);
+             return cached.result;
+         }
+         // No cache - do standard text generation (text-only mode)
+         return this.textGenerate(messages, options);
+     }
+     // ============================================================
+     // Internal: Audio Processing
+     // ============================================================
+     async processAudioWithBinary(audio, messages, options) {
+         const conversationId = options?.conversationId ?? 'default';
+         // Write audio to temp file
+         const audioPath = join(tmpdir(), `audio-llm-${Date.now()}-${Math.random().toString(36).slice(2)}.wav`);
+         // Build grammar for structured output
+         const grammar = this.buildAudioGrammar(options?.tools);
+         const grammarPath = join(tmpdir(), `audio-grammar-${Date.now()}.gbnf`);
+         writeFileSync(grammarPath, grammar);
+         try {
+             this.writeWav(audioPath, audio, this.config.sampleRate);
+             // Build prompt with structured output instruction
+             const prompt = this.formatAudioPrompt(messages, options?.tools);
+             // Log input
+             this.tracker.logInput(conversationId, messages);
+             // Build args for the binary (includes grammar)
+             const args = this.buildAudioArgs(audioPath, prompt, grammarPath);
+             // Run the audio binary (llama-mtmd-cli)
+             const output = await this.runAudioBinary(args, options);
+             // Parse the structured response
+             const { transcript, responseText, toolCalls } = this.parseStructuredResponse(output);
+             const result = {
+                 content: responseText,
+                 toolCalls: toolCalls.length > 0 ? toolCalls : undefined,
+                 finishReason: toolCalls.length > 0 ? 'tool_calls' : 'stop',
+             };
+             // Log output
+             this.tracker.logOutput(conversationId, responseText, result.toolCalls);
+             return { transcript, result };
+         }
+         finally {
+             // Clean up temp files
+             try {
+                 if (existsSync(audioPath))
+                     unlinkSync(audioPath);
+                 if (existsSync(grammarPath))
+                     unlinkSync(grammarPath);
+             }
+             catch {
+                 // Ignore cleanup errors
+             }
+         }
+     }
+     /**
+      * Build command line args for the audio LLM binary (llama-mtmd-cli)
+      *
+      * Uses llama-mtmd-cli for multimodal audio processing.
+      * Requires --mmproj for the audio projector.
+      *
+      * Output: clean response text (no special formatting needed)
+      */
+     buildAudioArgs(audioPath, prompt, grammarPath) {
+         const args = [
+             '-m',
+             this.config.modelPath,
+             '-n',
+             String(this.config.maxNewTokens),
+             '--temp',
+             String(this.config.temperature ?? 0.7),
+             '--no-warmup', // Skip warmup for faster response
+             // Audio input
+             '--audio',
+             audioPath,
+             '-p',
+             prompt,
+         ];
+         // Add grammar if provided
+         if (grammarPath) {
+             args.push('--grammar-file', grammarPath);
+         }
+         // Projector is REQUIRED for audio models
+         if (this.config.projectorPath) {
+             args.push('--mmproj', this.config.projectorPath);
+         }
+         if (this.config.gpuLayers !== undefined) {
+             args.push('-ngl', String(this.config.gpuLayers));
+         }
+         // Debug: log the command being run
+         console.log(`[NativeAudioLLM] Running: ${this.config.binaryPath} ${args.join(' ').slice(0, 200)}...`);
+         return args;
+     }
+     /**
+      * Build GBNF grammar for audio mode:
+      * Heard: text\n(SAY: text | TOOL: json)
+      */
+     buildAudioGrammar(tools) {
+         const hasTools = tools && tools.length > 0;
+         if (hasTools) {
+             const toolNames = tools.map(t => `"\\"${t.name}\\""`).join(' | ');
+             return `# Grammar for audio: Heard then SAY or TOOL
+ root ::= heard "\\n" response
+
+ heard ::= "Heard:" ws heard-text
+ heard-text ::= heard-char+
+ heard-char ::= [a-zA-Z0-9 .,!?'\"():;-]
+
+ response ::= say-response | tool-call
+
+ say-response ::= "SAY:" ws text-content
+ text-content ::= text-char+
+ text-char ::= [a-zA-Z0-9 .,!?'\"():;\\n\\t\\r-]
+
+ tool-call ::= "TOOL:" ws tool-array
+ tool-array ::= "[" ws tool-obj ws "]"
+ tool-obj ::= "{" ws "\\"name\\":" ws tool-name "," ws "\\"arguments\\":" ws arguments ws "}"
+ tool-name ::= ${toolNames}
+ arguments ::= "{" ws (keyval ("," ws keyval)*)? ws "}"
+ keyval ::= string ":" ws value
+ value ::= string | number | bool | null | object | array
+ string ::= "\\"" chars "\\""
+ chars ::= char*
+ char ::= [^"\\\\\\x00-\\x1f] | "\\\\" escape
+ escape ::= ["\\\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
+ number ::= "-"? ("0" | [1-9] [0-9]*) ("." [0-9]+)? ([eE] [+-]? [0-9]+)?
+ bool ::= "true" | "false"
+ null ::= "null"
+ object ::= "{" ws (keyval ("," ws keyval)*)? ws "}"
+ array ::= "[" ws (value ("," ws value)*)? ws "]"
+ ws ::= [ \\t\\n]*
+ `;
+         }
+         else {
+             // No tools - just Heard then Reply
+             return `# Grammar for audio: Heard then Reply
+ root ::= heard "\\n\\n" reply
+
+ heard ::= "Heard:" ws heard-text
+ heard-text ::= heard-char+
+ heard-char ::= [a-zA-Z0-9 .,!?'\"():;-]
+
+ reply ::= "Reply:" ws text-content
+ text-content ::= text-char+
+ text-char ::= [a-zA-Z0-9 .,!?'\"():;\\n\\t\\r-]
+ `;
+         }
+     }
+     /**
+      * Build GBNF grammar for text mode (no transcription):
+      * SAY: text | TOOL: json
+      */
+     buildTextGrammar(tools) {
+         const toolNames = tools.map(t => `"\\"${t.name}\\""`).join(' | ');
+         return `# Grammar for text: SAY or TOOL
+ root ::= say-response | tool-call
+
+ say-response ::= "SAY:" ws text-content
+ text-content ::= text-char+
+ text-char ::= [a-zA-Z0-9 .,!?'\"():;\\n\\t\\r-]
+
+ tool-call ::= "TOOL:" ws tool-array
+ tool-array ::= "[" ws tool-obj ws "]"
+ tool-obj ::= "{" ws "\\"name\\":" ws tool-name "," ws "\\"arguments\\":" ws arguments ws "}"
+ tool-name ::= ${toolNames}
+ arguments ::= "{" ws (keyval ("," ws keyval)*)? ws "}"
+ keyval ::= string ":" ws value
+ value ::= string | number | bool | null | object | array
+ string ::= "\\"" chars "\\""
+ chars ::= char*
+ char ::= [^"\\\\\\x00-\\x1f] | "\\\\" escape
+ escape ::= ["\\\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
+ number ::= "-"? ("0" | [1-9] [0-9]*) ("." [0-9]+)? ([eE] [+-]? [0-9]+)?
+ bool ::= "true" | "false"
+ null ::= "null"
+ object ::= "{" ws (keyval ("," ws keyval)*)? ws "}"
+ array ::= "[" ws (value ("," ws value)*)? ws "]"
+ ws ::= [ \\t\\n]*
+ `;
+     }
+     /**
+      * Format prompt for audio processing with llama-mtmd-cli
+      *
+      * Uses "Heard:" prefix for transcript and "SAY:"/"TOOL:" for response.
+      * This format works well with Ultravox for getting both transcript and response.
+      */
+     formatAudioPrompt(messages, tools) {
+         const hasTools = tools && tools.length > 0;
+         // Build system context
+         let systemContext = '';
+         for (const msg of messages) {
+             if (msg.role === 'system') {
+                 systemContext = msg.content;
+                 break;
+             }
+         }
+         // Build prompt for llama-mtmd-cli
+         let prompt = '';
+         if (systemContext) {
+             prompt += `${systemContext}\n\n`;
+         }
+         if (hasTools) {
+             // Build tool list with clear instructions
+             const toolDescriptions = tools
+                 .map((t) => `- ${t.name}(${Object.keys(t.parameters?.properties || {}).join(', ')}): ${t.description}`)
+                 .join('\n');
+             prompt += `Tools available:\n${toolDescriptions}\n\n`;
+             prompt += `If the user asks for something a tool provides, you MUST use TOOL.
+ Format:
+ Heard: [what the user said]
+ TOOL: [{"name":"tool_name","arguments":{...}}] OR SAY: [your response]`;
+         }
+         else {
+             prompt += `Format:
+ Heard: [what the user said]
+
+ Reply: [your brief response]`;
+         }
+         return prompt;
+     }
+     /**
+      * Parse the audio LLM response
+      *
+      * Format with tools:
+      * Heard: [what user said]
+      * SAY: [response] OR TOOL: [{"name":"...","arguments":{...}}]
+      *
+      * Format without tools:
+      * Heard: [what user said]
+      * Reply: [response]
+      */
+     parseStructuredResponse(output) {
+         // Extract transcript from "Heard: ..." line
+         const heardMatch = output.match(/Heard:\s*(.+?)(?:\n|$)/);
+         const transcript = heardMatch ? heardMatch[1].trim() : '[transcript unavailable]';
+         // Get everything after the Heard line
+         const afterHeard = output.replace(/Heard:\s*.+?(?:\n|$)/, '').trim();
+         const toolCalls = [];
+         let responseText = '';
+         // Check for TOOL: prefix
+         if (afterHeard.startsWith('TOOL:')) {
+             const toolJson = afterHeard.slice(5).trim();
+             try {
+                 const parsed = JSON.parse(toolJson);
+                 // Handle array format [{"name": ..., "arguments": ...}]
+                 if (Array.isArray(parsed)) {
+                     for (const tc of parsed) {
+                         if (tc && typeof tc.name === 'string') {
+                             toolCalls.push({
+                                 id: `native-audio-${Date.now()}-${Math.random().toString(36).slice(2, 9)}`,
+                                 name: tc.name,
+                                 arguments: tc.arguments || {},
+                             });
+                         }
+                     }
+                 }
+                 // Handle single object format {"name": ..., "arguments": ...}
+                 else if (parsed && typeof parsed.name === 'string') {
+                     toolCalls.push({
+                         id: `native-audio-${Date.now()}-${Math.random().toString(36).slice(2, 9)}`,
+                         name: parsed.name,
+                         arguments: parsed.arguments || {},
+                     });
+                 }
+             }
+             catch {
+                 // Failed to parse tool call, treat as response
+                 responseText = afterHeard;
+             }
+         }
+         // Check for SAY: prefix
+         else if (afterHeard.startsWith('SAY:')) {
+             responseText = afterHeard.slice(4).trim();
+         }
+         // Check for Reply: prefix (no-tools format)
+         else if (afterHeard.startsWith('Reply:')) {
+             responseText = afterHeard.slice(6).trim();
+         }
+         // Fallback: use entire content after Heard as response
+         else {
+             responseText = afterHeard;
+         }
+         return { transcript, responseText, toolCalls };
+     }
+     // ============================================================
+     // Internal: Text Generation (fallback for text-only mode)
+     // ============================================================
+     async textGenerate(messages, options) {
+         const conversationId = options?.conversationId ?? 'default';
+         const hasTools = options?.tools && options.tools.length > 0;
+         this.tracker.logInput(conversationId, messages);
+         const prompt = this.formatTextPrompt(messages, options?.tools);
+         // If tools provided, use grammar mode
+         let grammarPath;
+         if (hasTools) {
+             const grammar = this.buildTextGrammar(options.tools);
+             grammarPath = join(tmpdir(), `text-grammar-${Date.now()}.gbnf`);
+             writeFileSync(grammarPath, grammar);
+         }
+         try {
+             // Use text binary (llama-completion) for text-only generation
+             const textBinary = this.config.textBinaryPath || this.config.binaryPath.replace('llama-mtmd-cli', 'llama-completion');
+             const args = [
+                 '-m',
+                 this.config.modelPath,
+                 '-n',
+                 String(this.config.maxNewTokens),
+                 '--temp',
+                 String(this.config.temperature ?? 0.7),
+                 '-p',
+                 prompt,
+             ];
+             if (grammarPath) {
+                 args.push('--grammar-file', grammarPath);
+             }
+             if (this.config.gpuLayers !== undefined) {
+                 args.push('-ngl', String(this.config.gpuLayers));
+             }
+             console.log(`[NativeAudioLLM] Text generation: ${textBinary} ${args.slice(0, 6).join(' ')}...`);
+             // Run llama-completion for text generation
+             const output = await this.runTextBinary(textBinary, args, options);
+             this.tracker.logOutput(conversationId, output);
+             // Parse output for tool calls if grammar mode
+             if (hasTools) {
+                 return this.parseTextResponse(output);
+             }
+             return {
+                 content: output,
+                 finishReason: 'stop',
+             };
+         }
+         finally {
+             // Clean up grammar file
+             if (grammarPath && existsSync(grammarPath)) {
+                 try {
+                     unlinkSync(grammarPath);
+                 }
+                 catch { /* ignore */ }
+             }
+         }
+     }
+     /**
+      * Parse text response for SAY:/TOOL: format
+      */
+     parseTextResponse(output) {
+         const trimmed = output.trim();
+         if (trimmed.startsWith('TOOL:')) {
+             const toolJson = trimmed.slice(5).trim();
+             try {
+                 const parsed = JSON.parse(toolJson);
+                 const toolCalls = [];
+                 if (Array.isArray(parsed)) {
+                     for (const tc of parsed) {
+                         if (tc && typeof tc.name === 'string') {
+                             toolCalls.push({
+                                 id: `native-text-${Date.now()}-${Math.random().toString(36).slice(2, 9)}`,
+                                 name: tc.name,
+                                 arguments: tc.arguments || {},
+                             });
+                         }
+                     }
+                 }
+                 else if (parsed && typeof parsed.name === 'string') {
+                     toolCalls.push({
+                         id: `native-text-${Date.now()}-${Math.random().toString(36).slice(2, 9)}`,
+                         name: parsed.name,
+                         arguments: parsed.arguments || {},
+                     });
+                 }
+                 return {
+                     content: '',
+                     toolCalls: toolCalls.length > 0 ? toolCalls : undefined,
+                     finishReason: toolCalls.length > 0 ? 'tool_calls' : 'stop',
+                 };
+             }
+             catch {
+                 // Failed to parse, treat as text
+             }
+         }
+         // SAY: response or fallback
+         const content = trimmed.startsWith('SAY:') ? trimmed.slice(4).trim() : trimmed;
+         return {
+             content,
+             finishReason: 'stop',
+         };
+     }
+     /**
+      * Format prompt for text-only mode
+      */
+     formatTextPrompt(messages, _tools) {
+         let prompt = '';
+         for (const msg of messages) {
+             if (msg.role === 'system') {
+                 prompt += `<|im_start|>system\n${msg.content}<|im_end|>\n`;
+             }
+             else if (msg.role === 'user') {
+                 prompt += `<|im_start|>user\n${msg.content}<|im_end|>\n`;
+             }
+             else if (msg.role === 'assistant') {
+                 prompt += `<|im_start|>assistant\n${msg.content}<|im_end|>\n`;
+             }
+             else if (msg.role === 'tool') {
+                 prompt += `<|im_start|>tool\n${msg.content}<|im_end|>\n`;
+             }
+         }
+         prompt += '<|im_start|>assistant\n';
+         return prompt;
+     }
+     // ============================================================
+     // Internal: Binary Execution
+     // ============================================================
+     /** Special tokens to filter from output */
+     static SPECIAL_TOKENS = /\[end of text\]|<\|im_end\|>|<\|im_start\|>|<\|endoftext\|>/gi;
+     /**
+      * Run llama-mtmd-cli with audio input
+      * Output format: clean response text (no markers needed)
+      */
+     runAudioBinary(args, options) {
+         return new Promise((resolve, reject) => {
+             const proc = spawn(this.config.binaryPath, args, {
+                 stdio: ['pipe', 'pipe', 'pipe'],
+             });
+             let output = '';
+             let stderrOutput = '';
+             let hasOutput = false;
+             // Timeout after 120 seconds (audio loading can be slow)
+             const timeout = setTimeout(() => {
+                 if (!hasOutput) {
+                     proc.kill();
+                     reject(new Error(`Audio LLM binary timed out (no output after 120s).\n` +
+                         `Ensure llama-mtmd-cli is installed and supports audio.\n` +
+                         `stderr: ${stderrOutput.slice(0, 500)}`));
+                 }
+             }, 120000);
+             proc.stdout.on('data', (data) => {
+                 hasOutput = true;
+                 output += data.toString();
+             });
+             proc.stderr.on('data', (data) => {
+                 stderrOutput += data.toString();
+             });
+             proc.on('close', (code) => {
+                 clearTimeout(timeout);
+                 if (code !== 0) {
+                     reject(new Error(`Audio LLM binary exited with code ${code}.\n` +
+                         `stderr: ${stderrOutput.slice(0, 500)}`));
+                     return;
+                 }
+                 // llama-mtmd-cli outputs clean response text to stdout
+                 const response = this.parseMtmdOutput(output);
+                 console.log(`[NativeAudioLLM] Response: ${response.slice(0, 200)}...`);
+                 // Stream parsed response if callback provided
+                 if (options?.onToken && response) {
+                     for (const char of response) {
+                         options.onToken(char);
+                     }
+                 }
+                 resolve(response);
+             });
+             proc.on('error', (err) => {
+                 clearTimeout(timeout);
+                 reject(new Error(`Failed to spawn audio binary: ${err.message}`));
+             });
+         });
+     }
+     /**
+      * Parse llama-mtmd-cli's output format
+      * Output is clean response text on stdout (no special formatting)
+      */
+     parseMtmdOutput(output) {
+         // llama-mtmd-cli outputs clean response text
+         // Just trim whitespace and remove any special tokens
+         return output
+             .replace(NativeAudioLLM.SPECIAL_TOKENS, '')
+             .trim();
+     }
+     /**
+      * Run llama-completion for text-only generation (after tool calls)
+      * Output is clean completion text
+      */
+     runTextBinary(binaryPath, args, options) {
+         return new Promise((resolve, reject) => {
+             const proc = spawn(binaryPath, args, {
+                 stdio: ['pipe', 'pipe', 'pipe'],
+             });
+             let output = '';
+             let stderrOutput = '';
+             // Timeout after 60 seconds
+             const timeout = setTimeout(() => {
+                 proc.kill();
+                 reject(new Error(`Text LLM binary timed out.\n` +
+                     `stderr: ${stderrOutput.slice(0, 500)}`));
+             }, 60000);
+             proc.stdout.on('data', (data) => {
+                 output += data.toString();
+             });
+             proc.stderr.on('data', (data) => {
+                 stderrOutput += data.toString();
+             });
+             proc.on('close', (code) => {
+                 clearTimeout(timeout);
+                 if (code !== 0) {
+                     reject(new Error(`Text LLM binary exited with code ${code}.\n` +
+                         `stderr: ${stderrOutput.slice(0, 500)}`));
+                     return;
+                 }
+                 // llama-completion outputs clean response text
+                 const response = output
+                     .replace(NativeAudioLLM.SPECIAL_TOKENS, '')
+                     .trim();
+                 console.log(`[NativeAudioLLM] Text response: ${response.slice(0, 200)}...`);
+                 // Stream response if callback provided
+                 if (options?.onToken && response) {
+                     for (const char of response) {
+                         options.onToken(char);
+                     }
+                 }
+                 resolve(response);
+             });
+             proc.on('error', (err) => {
+                 clearTimeout(timeout);
+                 reject(new Error(`Failed to spawn text binary: ${err.message}`));
+             });
+         });
+     }
+     // ============================================================
+     // Internal: Helpers
+     // ============================================================
+     hashAudio(audio) {
+         // Simple hash based on length and samples
+         let hash = audio.length;
+         const step = Math.max(1, Math.floor(audio.length / 100));
+         for (let i = 0; i < audio.length; i += step) {
+             hash = (hash << 5) - hash + Math.floor(audio[i] * 1000);
+             hash |= 0;
+         }
+         return `audio-${audio.length}-${hash}`;
+     }
+     /**
+      * Write Float32Array audio data to a WAV file
+      */
+     writeWav(path, audio, sampleRate) {
+         const numChannels = 1;
+         const bitsPerSample = 16;
+         const byteRate = sampleRate * numChannels * (bitsPerSample / 8);
+         const blockAlign = numChannels * (bitsPerSample / 8);
+         const dataSize = audio.length * (bitsPerSample / 8);
+         const fileSize = 36 + dataSize;
+         const buffer = Buffer.alloc(44 + dataSize);
+         let offset = 0;
+         buffer.write('RIFF', offset);
+         offset += 4;
+         buffer.writeUInt32LE(fileSize, offset);
+         offset += 4;
+         buffer.write('WAVE', offset);
+         offset += 4;
+         buffer.write('fmt ', offset);
+         offset += 4;
+         buffer.writeUInt32LE(16, offset);
+         offset += 4;
+         buffer.writeUInt16LE(1, offset);
+         offset += 2;
+         buffer.writeUInt16LE(numChannels, offset);
+         offset += 2;
+         buffer.writeUInt32LE(sampleRate, offset);
+         offset += 4;
+         buffer.writeUInt32LE(byteRate, offset);
+         offset += 4;
+         buffer.writeUInt16LE(blockAlign, offset);
+         offset += 2;
+         buffer.writeUInt16LE(bitsPerSample, offset);
+         offset += 2;
+         buffer.write('data', offset);
+         offset += 4;
+         buffer.writeUInt32LE(dataSize, offset);
+         offset += 4;
+         for (let i = 0; i < audio.length; i++) {
+             const sample = Math.max(-1, Math.min(1, audio[i]));
+             const int16 = sample < 0 ? sample * 32768 : sample * 32767;
+             buffer.writeInt16LE(Math.round(int16), offset);
+             offset += 2;
+         }
+         writeFileSync(path, buffer);
+     }
+ }
+ //# sourceMappingURL=audio-llm.js.map
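
Reviewer note: the header comment of the new file says to register one NativeAudioLLM instance as both `stt` and `llm` in VoicePipeline, so that transcribe() invokes llama-mtmd-cli once and the subsequent generate() is served from the internal cache. Below is a minimal sketch of that wiring. The config fields and the dual registration come straight from the source above; the import paths and the VoicePipeline constructor shape are assumptions for illustration, not documented API, and the file paths are placeholders.

    // Sketch only - import paths and pipeline constructor shape are assumed.
    import { VoicePipeline } from 'modular-voice-agent-sdk';
    import { NativeAudioLLM } from 'modular-voice-agent-sdk/backends/native';

    const audioLLM = new NativeAudioLLM({
        binaryPath: '/opt/llama.cpp/bin/llama-mtmd-cli',  // existence checked in initialize()
        modelPath: '/models/qwen2-audio.Q4_K_M.gguf',     // placeholder model file
        projectorPath: '/models/qwen2-audio-mmproj.gguf', // forwarded to the binary as --mmproj
        maxNewTokens: 256,
        temperature: 0.7,
        gpuLayers: 99,                                    // forwarded as -ngl
    });
    await audioLLM.initialize();

    // Same instance in both slots: transcribe() runs the binary once and
    // caches { transcript, result }, so the generate() that follows in the
    // same turn returns the cached result without a second invocation.
    const pipeline = new VoicePipeline({ stt: audioLLM, llm: audioLLM });

Note that generate() clears the cache after use, so a text-only turn (one with no preceding transcribe()) falls through to textGenerate() and the separate llama-completion binary.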