modular-voice-agent-sdk 1.0.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -2
- package/dist/backends/cloud/audio-llm.d.ts +72 -0
- package/dist/backends/cloud/audio-llm.d.ts.map +1 -0
- package/dist/backends/cloud/audio-llm.js +366 -0
- package/dist/backends/cloud/audio-llm.js.map +1 -0
- package/dist/backends/cloud/index.d.ts +2 -0
- package/dist/backends/cloud/index.d.ts.map +1 -1
- package/dist/backends/cloud/index.js +2 -0
- package/dist/backends/cloud/index.js.map +1 -1
- package/dist/backends/cloud/llm.d.ts.map +1 -1
- package/dist/backends/cloud/llm.js +31 -18
- package/dist/backends/cloud/llm.js.map +1 -1
- package/dist/backends/native/audio-llm.d.ts +126 -0
- package/dist/backends/native/audio-llm.d.ts.map +1 -0
- package/dist/backends/native/audio-llm.js +680 -0
- package/dist/backends/native/audio-llm.js.map +1 -0
- package/dist/backends/native/llm.d.ts.map +1 -1
- package/dist/backends/native/llm.js +5 -7
- package/dist/backends/native/llm.js.map +1 -1
- package/dist/backends/native/stt.d.ts +2 -2
- package/dist/backends/native/stt.d.ts.map +1 -1
- package/dist/backends/native/stt.js +1 -1
- package/dist/backends/native/stt.js.map +1 -1
- package/dist/backends/transformers/llm.d.ts.map +1 -1
- package/dist/backends/transformers/llm.js +13 -10
- package/dist/backends/transformers/llm.js.map +1 -1
- package/dist/backends/transformers/stt.d.ts +2 -2
- package/dist/backends/transformers/stt.d.ts.map +1 -1
- package/dist/backends/transformers/stt.js +12 -7
- package/dist/backends/transformers/stt.js.map +1 -1
- package/dist/backends/transformers/tts.d.ts.map +1 -1
- package/dist/backends/transformers/tts.js +11 -6
- package/dist/backends/transformers/tts.js.map +1 -1
- package/dist/cache.d.ts +19 -0
- package/dist/cache.d.ts.map +1 -1
- package/dist/cache.js +39 -0
- package/dist/cache.js.map +1 -1
- package/dist/cli.js +47 -7
- package/dist/cli.js.map +1 -1
- package/dist/client/voice-client.d.ts +4 -2
- package/dist/client/voice-client.d.ts.map +1 -1
- package/dist/client/voice-client.js +15 -13
- package/dist/client/voice-client.js.map +1 -1
- package/dist/client/web-speech-stt.d.ts +12 -1
- package/dist/client/web-speech-stt.d.ts.map +1 -1
- package/dist/client/web-speech-stt.js +49 -4
- package/dist/client/web-speech-stt.js.map +1 -1
- package/dist/server/handler.d.ts +12 -7
- package/dist/server/handler.d.ts.map +1 -1
- package/dist/server/handler.js +20 -20
- package/dist/server/handler.js.map +1 -1
- package/dist/services/llm-logger.d.ts +7 -18
- package/dist/services/llm-logger.d.ts.map +1 -1
- package/dist/services/llm-logger.js +22 -41
- package/dist/services/llm-logger.js.map +1 -1
- package/dist/types.d.ts +27 -5
- package/dist/types.d.ts.map +1 -1
- package/dist/voice-pipeline.d.ts +48 -10
- package/dist/voice-pipeline.d.ts.map +1 -1
- package/dist/voice-pipeline.js +138 -40
- package/dist/voice-pipeline.js.map +1 -1
- package/package.json +1 -1
- package/scripts/setup.sh +23 -0
- package/USAGE.md +0 -567

package/dist/backends/native/audio-llm.js
@@ -0,0 +1,680 @@

```js
/**
 * NativeAudioLLM - Native Multimodal Audio LLM Backend
 *
 * Implements BOTH STTPipeline and LLMPipeline interfaces.
 * Register the same instance as both `stt` and `llm` in VoicePipeline.
 *
 * Uses internal caching to achieve single binary invocation:
 * 1. transcribe(audio) → calls audio LLM binary, caches response, returns transcript
 * 2. generate(messages) → returns cached response (no second invocation)
 *
 * The model is prompted to return both transcription and response in a structured format.
 *
 * Works with: Qwen2-Audio, and other audio-capable models via llama.cpp or similar.
 */
import { spawn } from 'child_process';
import { existsSync, writeFileSync, unlinkSync } from 'fs';
import { tmpdir } from 'os';
import { join } from 'path';
import { LLMLogger, LLMConversationTracker } from '../../services';
export class NativeAudioLLM {
    config;
    ready = false;
    tracker;
    // Cache for single-call optimization
    cachedResponse = null;
    // Store conversation context for audio processing
    pendingMessages = [];
    pendingOptions;
    constructor(config) {
        this.config = {
            sampleRate: 16000,
            ...config,
        };
        this.tracker = new LLMConversationTracker(new LLMLogger());
    }
```
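The constructor spreads the caller's config over a 16 kHz default. The diff never shows the config type, but the fields this file reads elsewhere (`binaryPath`, `modelPath`, `projectorPath`, `textBinaryPath`, `maxNewTokens`, `temperature`, `gpuLayers`, `sampleRate`) suggest a shape like the sketch below; the paths are placeholders, not real defaults.

```js
// Hypothetical config sketch, inferred from the fields this file reads.
// All paths are examples only.
const audioLLM = new NativeAudioLLM({
    binaryPath: '/opt/llama.cpp/llama-mtmd-cli',      // hypothetical path
    modelPath: '/models/qwen2-audio-7b.gguf',         // hypothetical path
    projectorPath: '/models/qwen2-audio-mmproj.gguf', // passed as --mmproj
    maxNewTokens: 256,
    temperature: 0.7, // matches the fallback used when unset
    gpuLayers: 32,    // passed as -ngl when defined
    // sampleRate defaults to 16000
});
```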
```js
    async initialize(_onProgress) {
        console.log('Initializing NativeAudioLLM...');
        if (!existsSync(this.config.binaryPath)) {
            throw new Error(`Audio LLM binary not found at: ${this.config.binaryPath}`);
        }
        if (!existsSync(this.config.modelPath)) {
            throw new Error(`Audio LLM model not found at: ${this.config.modelPath}`);
        }
        if (this.config.projectorPath && !existsSync(this.config.projectorPath)) {
            throw new Error(`Audio projector model not found at: ${this.config.projectorPath}`);
        }
        this.ready = true;
        console.log(`NativeAudioLLM ready. Model: ${this.config.modelPath}`);
    }
    isReady() {
        return this.ready;
    }
    supportsTools() {
        return true;
    }
    // ============================================================
    // STTPipeline Implementation
    // ============================================================
    /**
     * Transcribe audio by calling the audio LLM binary.
     * Caches the full response for the subsequent generate() call.
     */
    async transcribe(audio) {
        const audioHash = this.hashAudio(audio);
        // Check if we already processed this audio
        if (this.cachedResponse?.audioHash === audioHash) {
            return this.cachedResponse.transcript;
        }
        // Build messages with system prompt instructing structured output
        const messages = this.pendingMessages.length > 0 ? this.pendingMessages : [];
        // Process audio with native binary
        const { transcript, result } = await this.processAudioWithBinary(audio, messages, this.pendingOptions);
        // Cache for subsequent generate() call
        this.cachedResponse = { transcript, result, audioHash };
        return transcript;
    }
```
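The two halves of the single-invocation design meet at the call site: a pipeline calls `transcribe()` first (one binary run, response cached) and `generate()` second (cache hit, no second run). A minimal sketch, assuming 16 kHz mono `Float32Array` input as `writeWav()` expects:

```js
// Minimal sketch of the single-invocation flow. The audio buffer here is
// placeholder silence; a real pipeline would pass captured microphone audio.
await audioLLM.initialize();
const audio = new Float32Array(16000); // 1 second at 16 kHz
const transcript = await audioLLM.transcribe(audio); // runs the binary once, caches result
const reply = await audioLLM.generate(
    [{ role: 'system', content: 'You are a helpful voice assistant.' }],
    { conversationId: 'demo' }); // served from the cache, no second invocation
console.log(transcript, reply.content);
```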
```js
    // ============================================================
    // LLMPipeline Implementation
    // ============================================================
    /**
     * Generate response. If called after transcribe() with matching context,
     * returns cached response (single binary invocation total).
     */
    async generate(messages, options) {
        // Store context for potential audio processing
        this.pendingMessages = messages;
        this.pendingOptions = options;
        // Check if we have a cached response from transcribe()
        if (this.cachedResponse) {
            const cached = this.cachedResponse;
            this.cachedResponse = null; // Clear cache
            // Stream the cached response if callback provided
            if (options?.onToken && cached.result.content) {
                for (const char of cached.result.content) {
                    options.onToken(char);
                }
            }
            // Log the cached response
            this.tracker.logOutput(options?.conversationId ?? 'default', cached.result.content, cached.result.toolCalls);
            return cached.result;
        }
        // No cache - do standard text generation (text-only mode)
        return this.textGenerate(messages, options);
    }
    // ============================================================
    // Internal: Audio Processing
    // ============================================================
    async processAudioWithBinary(audio, messages, options) {
        const conversationId = options?.conversationId ?? 'default';
        // Write audio to temp file
        const audioPath = join(tmpdir(), `audio-llm-${Date.now()}-${Math.random().toString(36).slice(2)}.wav`);
        // Build grammar for structured output
        const grammar = this.buildAudioGrammar(options?.tools);
        const grammarPath = join(tmpdir(), `audio-grammar-${Date.now()}.gbnf`);
        writeFileSync(grammarPath, grammar);
        try {
            this.writeWav(audioPath, audio, this.config.sampleRate);
            // Build prompt with structured output instruction
            const prompt = this.formatAudioPrompt(messages, options?.tools);
            // Log input
            this.tracker.logInput(conversationId, messages);
            // Build args for the binary (includes grammar)
            const args = this.buildAudioArgs(audioPath, prompt, grammarPath);
            // Run the audio binary (llama-mtmd-cli)
            const output = await this.runAudioBinary(args, options);
            // Parse the structured response
            const { transcript, responseText, toolCalls } = this.parseStructuredResponse(output);
            const result = {
                content: responseText,
                toolCalls: toolCalls.length > 0 ? toolCalls : undefined,
                finishReason: toolCalls.length > 0 ? 'tool_calls' : 'stop',
            };
            // Log output
            this.tracker.logOutput(conversationId, responseText, result.toolCalls);
            return { transcript, result };
        }
        finally {
            // Clean up temp files
            try {
                if (existsSync(audioPath))
                    unlinkSync(audioPath);
                if (existsSync(grammarPath))
                    unlinkSync(grammarPath);
            }
            catch {
                // Ignore cleanup errors
            }
        }
    }
    /**
     * Build command line args for the audio LLM binary (llama-mtmd-cli)
     *
     * Uses llama-mtmd-cli for multimodal audio processing.
     * Requires --mmproj for the audio projector.
     *
     * Output: clean response text (no special formatting needed)
     */
    buildAudioArgs(audioPath, prompt, grammarPath) {
        const args = [
            '-m',
            this.config.modelPath,
            '-n',
            String(this.config.maxNewTokens),
            '--temp',
            String(this.config.temperature ?? 0.7),
            '--no-warmup', // Skip warmup for faster response
            // Audio input
            '--audio',
            audioPath,
            '-p',
            prompt,
        ];
        // Add grammar if provided
        if (grammarPath) {
            args.push('--grammar-file', grammarPath);
        }
        // Projector is REQUIRED for audio models
        if (this.config.projectorPath) {
            args.push('--mmproj', this.config.projectorPath);
        }
        if (this.config.gpuLayers !== undefined) {
            args.push('-ngl', String(this.config.gpuLayers));
        }
        // Debug: log the command being run
        console.log(`[NativeAudioLLM] Running: ${this.config.binaryPath} ${args.join(' ').slice(0, 200)}...`);
        return args;
    }
```
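Put together, for the hypothetical config sketched earlier, `buildAudioArgs()` returns an array along these lines (temp-file names abbreviated):

```js
// The args array assembled above, for the hypothetical paths used earlier:
const exampleArgs = [
    '-m', '/models/qwen2-audio-7b.gguf',
    '-n', '256',
    '--temp', '0.7',
    '--no-warmup',
    '--audio', '/tmp/audio-llm-<timestamp>-<rand>.wav',
    '-p', '<formatted prompt>',
    '--grammar-file', '/tmp/audio-grammar-<timestamp>.gbnf',
    '--mmproj', '/models/qwen2-audio-mmproj.gguf',
    '-ngl', '32',
];
```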
```js
    /**
     * Build GBNF grammar for audio mode:
     * "Heard: text" followed by (SAY: text | TOOL: json)
     */
    buildAudioGrammar(tools) {
        const hasTools = tools && tools.length > 0;
        if (hasTools) {
            const toolNames = tools.map(t => `"\\"${t.name}\\""`).join(' | ');
            return `# Grammar for audio: Heard then SAY or TOOL
root ::= heard "\\n" response

heard ::= "Heard:" ws heard-text
heard-text ::= heard-char+
heard-char ::= [a-zA-Z0-9 .,!?'\"():;-]

response ::= say-response | tool-call

say-response ::= "SAY:" ws text-content
text-content ::= text-char+
text-char ::= [a-zA-Z0-9 .,!?'\"():;\\n\\t\\r-]

tool-call ::= "TOOL:" ws tool-array
tool-array ::= "[" ws tool-obj ws "]"
tool-obj ::= "{" ws "\\"name\\":" ws tool-name "," ws "\\"arguments\\":" ws arguments ws "}"
tool-name ::= ${toolNames}
arguments ::= "{" ws (keyval ("," ws keyval)*)? ws "}"
keyval ::= string ":" ws value
value ::= string | number | bool | null | object | array
string ::= "\\"" chars "\\""
chars ::= char*
char ::= [^"\\\\\\x00-\\x1f] | "\\\\" escape
escape ::= ["\\\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
number ::= "-"? ("0" | [1-9] [0-9]*) ("." [0-9]+)? ([eE] [+-]? [0-9]+)?
bool ::= "true" | "false"
null ::= "null"
object ::= "{" ws (keyval ("," ws keyval)*)? ws "}"
array ::= "[" ws (value ("," ws value)*)? ws "]"
ws ::= [ \\t\\n]*
`;
        }
        else {
            // No tools - just Heard then Reply
            return `# Grammar for audio: Heard then Reply
root ::= heard "\\n\\n" reply

heard ::= "Heard:" ws heard-text
heard-text ::= heard-char+
heard-char ::= [a-zA-Z0-9 .,!?'\"():;-]

reply ::= "Reply:" ws text-content
text-content ::= text-char+
text-char ::= [a-zA-Z0-9 .,!?'\"():;\\n\\t\\r-]
`;
        }
    }
    /**
     * Build GBNF grammar for text mode (no transcription):
     * SAY: text | TOOL: json
     */
    buildTextGrammar(tools) {
        const toolNames = tools.map(t => `"\\"${t.name}\\""`).join(' | ');
        return `# Grammar for text: SAY or TOOL
root ::= say-response | tool-call

say-response ::= "SAY:" ws text-content
text-content ::= text-char+
text-char ::= [a-zA-Z0-9 .,!?'\"():;\\n\\t\\r-]

tool-call ::= "TOOL:" ws tool-array
tool-array ::= "[" ws tool-obj ws "]"
tool-obj ::= "{" ws "\\"name\\":" ws tool-name "," ws "\\"arguments\\":" ws arguments ws "}"
tool-name ::= ${toolNames}
arguments ::= "{" ws (keyval ("," ws keyval)*)? ws "}"
keyval ::= string ":" ws value
value ::= string | number | bool | null | object | array
string ::= "\\"" chars "\\""
chars ::= char*
char ::= [^"\\\\\\x00-\\x1f] | "\\\\" escape
escape ::= ["\\\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
number ::= "-"? ("0" | [1-9] [0-9]*) ("." [0-9]+)? ([eE] [+-]? [0-9]+)?
bool ::= "true" | "false"
null ::= "null"
object ::= "{" ws (keyval ("," ws keyval)*)? ws "}"
array ::= "[" ws (value ("," ws value)*)? ws "]"
ws ::= [ \\t\\n]*
`;
    }
    /**
     * Format prompt for audio processing with llama-mtmd-cli
     *
     * Uses "Heard:" prefix for transcript and "SAY:"/"TOOL:" for response.
     * This format works well with Ultravox for getting both transcript and response.
     */
    formatAudioPrompt(messages, tools) {
        const hasTools = tools && tools.length > 0;
        // Build system context
        let systemContext = '';
        for (const msg of messages) {
            if (msg.role === 'system') {
                systemContext = msg.content;
                break;
            }
        }
        // Build prompt for llama-mtmd-cli
        let prompt = '';
        if (systemContext) {
            prompt += `${systemContext}\n\n`;
        }
        if (hasTools) {
            // Build tool list with clear instructions
            const toolDescriptions = tools
                .map((t) => `- ${t.name}(${Object.keys(t.parameters?.properties || {}).join(', ')}): ${t.description}`)
                .join('\n');
            prompt += `Tools available:\n${toolDescriptions}\n\n`;
            prompt += `If the user asks for something a tool provides, you MUST use TOOL.
Format:
Heard: [what the user said]
TOOL: [{"name":"tool_name","arguments":{...}}] OR SAY: [your response]`;
        }
        else {
            prompt += `Format:
Heard: [what the user said]

Reply: [your brief response]`;
        }
        return prompt;
    }
```
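Concretely, for a system message and one hypothetical `get_weather` tool with a `city` parameter, `formatAudioPrompt()` yields:

```js
// The string formatAudioPrompt() builds for one hypothetical tool,
// given the system message "You are a voice assistant.":
const examplePrompt = `You are a voice assistant.

Tools available:
- get_weather(city): Get the current weather

If the user asks for something a tool provides, you MUST use TOOL.
Format:
Heard: [what the user said]
TOOL: [{"name":"tool_name","arguments":{...}}] OR SAY: [your response]`;
```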
```js
    /**
     * Parse the audio LLM response
     *
     * Format with tools:
     * Heard: [what user said]
     * SAY: [response] OR TOOL: [{"name":"...","arguments":{...}}]
     *
     * Format without tools:
     * Heard: [what user said]
     * Reply: [response]
     */
    parseStructuredResponse(output) {
        // Extract transcript from "Heard: ..." line
        const heardMatch = output.match(/Heard:\s*(.+?)(?:\n|$)/);
        const transcript = heardMatch ? heardMatch[1].trim() : '[transcript unavailable]';
        // Get everything after the Heard line
        const afterHeard = output.replace(/Heard:\s*.+?(?:\n|$)/, '').trim();
        const toolCalls = [];
        let responseText = '';
        // Check for TOOL: prefix
        if (afterHeard.startsWith('TOOL:')) {
            const toolJson = afterHeard.slice(5).trim();
            try {
                const parsed = JSON.parse(toolJson);
                // Handle array format [{"name": ..., "arguments": ...}]
                if (Array.isArray(parsed)) {
                    for (const tc of parsed) {
                        if (tc && typeof tc.name === 'string') {
                            toolCalls.push({
                                id: `native-audio-${Date.now()}-${Math.random().toString(36).slice(2, 9)}`,
                                name: tc.name,
                                arguments: tc.arguments || {},
                            });
                        }
                    }
                }
                // Handle single object format {"name": ..., "arguments": ...}
                else if (parsed && typeof parsed.name === 'string') {
                    toolCalls.push({
                        id: `native-audio-${Date.now()}-${Math.random().toString(36).slice(2, 9)}`,
                        name: parsed.name,
                        arguments: parsed.arguments || {},
                    });
                }
            }
            catch {
                // Failed to parse tool call, treat as response
                responseText = afterHeard;
            }
        }
        // Check for SAY: prefix
        else if (afterHeard.startsWith('SAY:')) {
            responseText = afterHeard.slice(4).trim();
        }
        // Check for Reply: prefix (no-tools format)
        else if (afterHeard.startsWith('Reply:')) {
            responseText = afterHeard.slice(6).trim();
        }
        // Fallback: use entire content after Heard as response
        else {
            responseText = afterHeard;
        }
        return { transcript, responseText, toolCalls };
    }
```
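An example of how `parseStructuredResponse()` splits a grammar-constrained completion (the tool name and arguments are hypothetical):

```js
// Splitting a "Heard: ... / TOOL: ..." completion into its parts:
const parsed = audioLLM.parseStructuredResponse(
    'Heard: What is the weather in Paris\n' +
    'TOOL: [{"name":"get_weather","arguments":{"city":"Paris"}}]');
// parsed.transcript   -> 'What is the weather in Paris'
// parsed.responseText -> ''
// parsed.toolCalls    -> [{ id: 'native-audio-...', name: 'get_weather',
//                           arguments: { city: 'Paris' } }]
```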
```js
    // ============================================================
    // Internal: Text Generation (fallback for text-only mode)
    // ============================================================
    async textGenerate(messages, options) {
        const conversationId = options?.conversationId ?? 'default';
        const hasTools = options?.tools && options.tools.length > 0;
        this.tracker.logInput(conversationId, messages);
        const prompt = this.formatTextPrompt(messages, options?.tools);
        // If tools provided, use grammar mode
        let grammarPath;
        if (hasTools) {
            const grammar = this.buildTextGrammar(options.tools);
            grammarPath = join(tmpdir(), `text-grammar-${Date.now()}.gbnf`);
            writeFileSync(grammarPath, grammar);
        }
        try {
            // Use text binary (llama-completion) for text-only generation
            const textBinary = this.config.textBinaryPath || this.config.binaryPath.replace('llama-mtmd-cli', 'llama-completion');
            const args = [
                '-m',
                this.config.modelPath,
                '-n',
                String(this.config.maxNewTokens),
                '--temp',
                String(this.config.temperature ?? 0.7),
                '-p',
                prompt,
            ];
            if (grammarPath) {
                args.push('--grammar-file', grammarPath);
            }
            if (this.config.gpuLayers !== undefined) {
                args.push('-ngl', String(this.config.gpuLayers));
            }
            console.log(`[NativeAudioLLM] Text generation: ${textBinary} ${args.slice(0, 6).join(' ')}...`);
            // Run llama-completion for text generation
            const output = await this.runTextBinary(textBinary, args, options);
            this.tracker.logOutput(conversationId, output);
            // Parse output for tool calls if grammar mode
            if (hasTools) {
                return this.parseTextResponse(output);
            }
            return {
                content: output,
                finishReason: 'stop',
            };
        }
        finally {
            // Clean up grammar file
            if (grammarPath && existsSync(grammarPath)) {
                try {
                    unlinkSync(grammarPath);
                }
                catch { /* ignore */ }
            }
        }
    }
    /**
     * Parse text response for SAY:/TOOL: format
     */
    parseTextResponse(output) {
        const trimmed = output.trim();
        if (trimmed.startsWith('TOOL:')) {
            const toolJson = trimmed.slice(5).trim();
            try {
                const parsed = JSON.parse(toolJson);
                const toolCalls = [];
                if (Array.isArray(parsed)) {
                    for (const tc of parsed) {
                        if (tc && typeof tc.name === 'string') {
                            toolCalls.push({
                                id: `native-text-${Date.now()}-${Math.random().toString(36).slice(2, 9)}`,
                                name: tc.name,
                                arguments: tc.arguments || {},
                            });
                        }
                    }
                }
                else if (parsed && typeof parsed.name === 'string') {
                    toolCalls.push({
                        id: `native-text-${Date.now()}-${Math.random().toString(36).slice(2, 9)}`,
                        name: parsed.name,
                        arguments: parsed.arguments || {},
                    });
                }
                return {
                    content: '',
                    toolCalls: toolCalls.length > 0 ? toolCalls : undefined,
                    finishReason: toolCalls.length > 0 ? 'tool_calls' : 'stop',
                };
            }
            catch {
                // Failed to parse, treat as text
            }
        }
        // SAY: response or fallback
        const content = trimmed.startsWith('SAY:') ? trimmed.slice(4).trim() : trimmed;
        return {
            content,
            finishReason: 'stop',
        };
    }
    /**
     * Format prompt for text-only mode
     */
    formatTextPrompt(messages, _tools) {
        let prompt = '';
        for (const msg of messages) {
            if (msg.role === 'system') {
                prompt += `<|im_start|>system\n${msg.content}<|im_end|>\n`;
            }
            else if (msg.role === 'user') {
                prompt += `<|im_start|>user\n${msg.content}<|im_end|>\n`;
            }
            else if (msg.role === 'assistant') {
                prompt += `<|im_start|>assistant\n${msg.content}<|im_end|>\n`;
            }
            else if (msg.role === 'tool') {
                prompt += `<|im_start|>tool\n${msg.content}<|im_end|>\n`;
            }
        }
        prompt += '<|im_start|>assistant\n';
        return prompt;
    }
```
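For reference, `formatTextPrompt()` flattens the chat history into a ChatML-style prompt; for a system message plus one user turn it yields:

```js
// What formatTextPrompt() produces for
// [{role:'system', content:'You are a helpful assistant.'},
//  {role:'user', content:'Hello'}]:
const exampleTextPrompt =
    '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' +
    '<|im_start|>user\nHello<|im_end|>\n' +
    '<|im_start|>assistant\n';
```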
```js
    // ============================================================
    // Internal: Binary Execution
    // ============================================================
    /** Special tokens to filter from output */
    static SPECIAL_TOKENS = /\[end of text\]|<\|im_end\|>|<\|im_start\|>|<\|endoftext\|>/gi;
    /**
     * Run llama-mtmd-cli with audio input
     * Output format: clean response text (no markers needed)
     */
    runAudioBinary(args, options) {
        return new Promise((resolve, reject) => {
            const proc = spawn(this.config.binaryPath, args, {
                stdio: ['pipe', 'pipe', 'pipe'],
            });
            let output = '';
            let stderrOutput = '';
            let hasOutput = false;
            // Timeout after 120 seconds (audio loading can be slow)
            const timeout = setTimeout(() => {
                if (!hasOutput) {
                    proc.kill();
                    reject(new Error(`Audio LLM binary timed out (no output after 120s).\n` +
                        `Ensure llama-mtmd-cli is installed and supports audio.\n` +
                        `stderr: ${stderrOutput.slice(0, 500)}`));
                }
            }, 120000);
            proc.stdout.on('data', (data) => {
                hasOutput = true;
                output += data.toString();
            });
            proc.stderr.on('data', (data) => {
                stderrOutput += data.toString();
            });
            proc.on('close', (code) => {
                clearTimeout(timeout);
                if (code !== 0) {
                    reject(new Error(`Audio LLM binary exited with code ${code}.\n` +
                        `stderr: ${stderrOutput.slice(0, 500)}`));
                    return;
                }
                // llama-mtmd-cli outputs clean response text to stdout
                const response = this.parseMtmdOutput(output);
                console.log(`[NativeAudioLLM] Response: ${response.slice(0, 200)}...`);
                // Stream parsed response if callback provided
                if (options?.onToken && response) {
                    for (const char of response) {
                        options.onToken(char);
                    }
                }
                resolve(response);
            });
            proc.on('error', (err) => {
                clearTimeout(timeout);
                reject(new Error(`Failed to spawn audio binary: ${err.message}`));
            });
        });
    }
    /**
     * Parse llama-mtmd-cli's output format
     * Output is clean response text on stdout (no special formatting)
     */
    parseMtmdOutput(output) {
        // llama-mtmd-cli outputs clean response text
        // Just trim whitespace and remove any special tokens
        return output
            .replace(NativeAudioLLM.SPECIAL_TOKENS, '')
            .trim();
    }
    /**
     * Run llama-completion for text-only generation (after tool calls)
     * Output is clean completion text
     */
    runTextBinary(binaryPath, args, options) {
        return new Promise((resolve, reject) => {
            const proc = spawn(binaryPath, args, {
                stdio: ['pipe', 'pipe', 'pipe'],
            });
            let output = '';
            let stderrOutput = '';
            // Timeout after 60 seconds
            const timeout = setTimeout(() => {
                proc.kill();
                reject(new Error(`Text LLM binary timed out.\n` +
                    `stderr: ${stderrOutput.slice(0, 500)}`));
            }, 60000);
            proc.stdout.on('data', (data) => {
                output += data.toString();
            });
            proc.stderr.on('data', (data) => {
                stderrOutput += data.toString();
            });
            proc.on('close', (code) => {
                clearTimeout(timeout);
                if (code !== 0) {
                    reject(new Error(`Text LLM binary exited with code ${code}.\n` +
                        `stderr: ${stderrOutput.slice(0, 500)}`));
                    return;
                }
                // llama-completion outputs clean response text
                const response = output
                    .replace(NativeAudioLLM.SPECIAL_TOKENS, '')
                    .trim();
                console.log(`[NativeAudioLLM] Text response: ${response.slice(0, 200)}...`);
                // Stream response if callback provided
                if (options?.onToken && response) {
                    for (const char of response) {
                        options.onToken(char);
                    }
                }
                resolve(response);
            });
            proc.on('error', (err) => {
                clearTimeout(timeout);
                reject(new Error(`Failed to spawn text binary: ${err.message}`));
            });
        });
    }
    // ============================================================
    // Internal: Helpers
    // ============================================================
    hashAudio(audio) {
        // Simple hash based on length and samples
        let hash = audio.length;
        const step = Math.max(1, Math.floor(audio.length / 100));
        for (let i = 0; i < audio.length; i += step) {
            hash = (hash << 5) - hash + Math.floor(audio[i] * 1000);
            hash |= 0;
        }
        return `audio-${audio.length}-${hash}`;
    }
    /**
     * Write Float32Array audio data to a WAV file
     */
    writeWav(path, audio, sampleRate) {
        const numChannels = 1;
        const bitsPerSample = 16;
        const byteRate = sampleRate * numChannels * (bitsPerSample / 8);
        const blockAlign = numChannels * (bitsPerSample / 8);
        const dataSize = audio.length * (bitsPerSample / 8);
        const fileSize = 36 + dataSize;
        const buffer = Buffer.alloc(44 + dataSize);
        let offset = 0;
        buffer.write('RIFF', offset);
        offset += 4;
        buffer.writeUInt32LE(fileSize, offset);
        offset += 4;
        buffer.write('WAVE', offset);
        offset += 4;
        buffer.write('fmt ', offset);
        offset += 4;
        buffer.writeUInt32LE(16, offset);
        offset += 4;
        buffer.writeUInt16LE(1, offset);
        offset += 2;
        buffer.writeUInt16LE(numChannels, offset);
        offset += 2;
        buffer.writeUInt32LE(sampleRate, offset);
        offset += 4;
        buffer.writeUInt32LE(byteRate, offset);
        offset += 4;
        buffer.writeUInt16LE(blockAlign, offset);
        offset += 2;
        buffer.writeUInt16LE(bitsPerSample, offset);
        offset += 2;
        buffer.write('data', offset);
        offset += 4;
        buffer.writeUInt32LE(dataSize, offset);
        offset += 4;
        for (let i = 0; i < audio.length; i++) {
            const sample = Math.max(-1, Math.min(1, audio[i]));
            const int16 = sample < 0 ? sample * 32768 : sample * 32767;
            buffer.writeInt16LE(Math.round(int16), offset);
            offset += 2;
        }
        writeFileSync(path, buffer);
    }
}
//# sourceMappingURL=audio-llm.js.map
```
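Per the file's header comment, one instance serves as both STT and LLM. A registration sketch follows; the `VoicePipeline` constructor shape and the root export are not shown in this diff, so treat both as assumptions.

```js
// Sketch only: the header comment says to register one instance as both
// `stt` and `llm`. The import path and option names below are assumptions,
// not confirmed by this diff.
import { VoicePipeline } from 'modular-voice-agent-sdk';

const pipeline = new VoicePipeline({
    stt: audioLLM, // transcribe() runs the binary once and caches the result
    llm: audioLLM, // generate() then returns the cached response
    // tts: <some TTS backend>
});
```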