gitclaw 0.3.1 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +6 -2
- package/dist/composio/adapter.d.ts +26 -0
- package/dist/composio/adapter.js +92 -0
- package/dist/composio/client.d.ts +39 -0
- package/dist/composio/client.js +170 -0
- package/dist/composio/index.d.ts +2 -0
- package/dist/composio/index.js +2 -0
- package/dist/context.d.ts +20 -0
- package/dist/context.js +211 -0
- package/dist/exports.d.ts +2 -0
- package/dist/exports.js +1 -0
- package/dist/index.js +99 -7
- package/dist/learning/reinforcement.d.ts +11 -0
- package/dist/learning/reinforcement.js +91 -0
- package/dist/loader.js +34 -1
- package/dist/sdk.js +5 -1
- package/dist/skills.d.ts +5 -0
- package/dist/skills.js +58 -7
- package/dist/tools/capture-photo.d.ts +3 -0
- package/dist/tools/capture-photo.js +91 -0
- package/dist/tools/index.d.ts +2 -1
- package/dist/tools/index.js +12 -2
- package/dist/tools/read.js +4 -0
- package/dist/tools/shared.d.ts +20 -0
- package/dist/tools/shared.js +24 -0
- package/dist/tools/skill-learner.d.ts +3 -0
- package/dist/tools/skill-learner.js +358 -0
- package/dist/tools/task-tracker.d.ts +20 -0
- package/dist/tools/task-tracker.js +275 -0
- package/dist/tools/write.js +4 -0
- package/dist/voice/adapter.d.ts +97 -0
- package/dist/voice/adapter.js +30 -0
- package/dist/voice/chat-history.d.ts +8 -0
- package/dist/voice/chat-history.js +121 -0
- package/dist/voice/gemini-live.d.ts +20 -0
- package/dist/voice/gemini-live.js +279 -0
- package/dist/voice/index.d.ts +4 -0
- package/dist/voice/index.js +3 -0
- package/dist/voice/openai-realtime.d.ts +27 -0
- package/dist/voice/openai-realtime.js +291 -0
- package/dist/voice/server.d.ts +2 -0
- package/dist/voice/server.js +2319 -0
- package/dist/voice/ui.html +2556 -0
- package/package.json +21 -7
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
import WebSocket from "ws";
|
|
2
|
+
import { DEFAULT_VOICE_INSTRUCTIONS, } from "./adapter.js";
|
|
3
|
+
// Wrap text in ANSI "dim" escape codes for muted console log output.
const dim = (s) => "\u001b[2m" + s + "\u001b[0m";
|
|
4
|
+
/**
|
|
5
|
+
* Downsample 24kHz PCM (Int16LE) to 16kHz by linear interpolation (2 of every 3 samples).
|
|
6
|
+
* Input: base64-encoded 24kHz Int16LE. Output: base64-encoded 16kHz Int16LE.
|
|
7
|
+
*/
|
|
8
|
+
/**
 * Downsample 24kHz PCM (Int16LE) to 16kHz by linear interpolation (2 of every 3 samples).
 * Input: base64-encoded 24kHz Int16LE. Output: base64-encoded 16kHz Int16LE.
 *
 * @param {string} base64_24k - Base64-encoded little-endian 16-bit PCM at 24kHz.
 * @returns {string} Base64-encoded little-endian 16-bit PCM at 16kHz.
 */
function downsample24kTo16k(base64_24k) {
    const binary = Buffer.from(base64_24k, "base64");
    // Math.floor guards against a trailing odd byte: the Int16Array
    // constructor throws a RangeError when given a fractional length.
    const sampleCount = Math.floor(binary.byteLength / 2);
    const samples24 = new Int16Array(binary.buffer, binary.byteOffset, sampleCount);
    const outLength = Math.floor(samples24.length * 2 / 3);
    const samples16 = new Int16Array(outLength);
    for (let i = 0; i < outLength; i++) {
        // Map output index to fractional input index (ratio 24/16 = 1.5)
        const srcIdx = i * 1.5;
        const lo = Math.floor(srcIdx);
        const frac = srcIdx - lo;
        // Clamp the upper neighbor so the last sample never reads past the end.
        const hi = Math.min(lo + 1, samples24.length - 1);
        // Linear interpolation between the two nearest input samples.
        samples16[i] = Math.round(samples24[lo] * (1 - frac) + samples24[hi] * frac);
    }
    return Buffer.from(samples16.buffer).toString("base64");
}
|
|
23
|
+
/**
|
|
24
|
+
* Upsample 16kHz PCM (Int16LE) to 24kHz by linear interpolation.
|
|
25
|
+
* Input: base64-encoded 16kHz Int16LE. Output: base64-encoded 24kHz Int16LE.
|
|
26
|
+
*/
|
|
27
|
+
/**
 * Upsample 16kHz PCM (Int16LE) to 24kHz by linear interpolation.
 * Input: base64-encoded 16kHz Int16LE. Output: base64-encoded 24kHz Int16LE.
 *
 * @param {string} base64_16k - Base64-encoded little-endian 16-bit PCM at 16kHz.
 * @returns {string} Base64-encoded little-endian 16-bit PCM at 24kHz.
 */
function upsample16kTo24k(base64_16k) {
    const binary = Buffer.from(base64_16k, "base64");
    // Math.floor guards against a trailing odd byte: the Int16Array
    // constructor throws a RangeError when given a fractional length.
    const sampleCount = Math.floor(binary.byteLength / 2);
    const samples16 = new Int16Array(binary.buffer, binary.byteOffset, sampleCount);
    const outLength = Math.floor(samples16.length * 3 / 2);
    const samples24 = new Int16Array(outLength);
    for (let i = 0; i < outLength; i++) {
        // Map output index to fractional input index (ratio 16/24 = 2/3)
        const srcIdx = i * (2 / 3);
        const lo = Math.floor(srcIdx);
        const frac = srcIdx - lo;
        // Clamp the upper neighbor so the last sample never reads past the end.
        const hi = Math.min(lo + 1, samples16.length - 1);
        // Linear interpolation between the two nearest input samples.
        samples24[i] = Math.round(samples16[lo] * (1 - frac) + samples16[hi] * frac);
    }
    return Buffer.from(samples24.buffer).toString("base64");
}
|
|
41
|
+
/**
 * Multimodal voice adapter backed by Google's Gemini Multimodal Live API
 * (BidiGenerateContent over WebSocket).
 *
 * Responsibilities:
 *  - manage the WebSocket session and the setup handshake
 *  - translate generic client messages (audio / video_frame / text / file)
 *    into Gemini wire messages
 *  - translate Gemini server messages back into generic ServerMessage events
 *    emitted via the onMessage callback
 *  - bridge the single "run_agent" function declaration to the caller-supplied
 *    toolHandler
 *
 * Audio rates: incoming audio is downsampled 24kHz -> 16kHz before sending;
 * Gemini's audio output is upsampled 16kHz -> 24kHz before emitting
 * (see downsample24kTo16k / upsample16kTo24k in this file).
 */
export class GeminiLiveAdapter {
    // Active WebSocket connection, or null when disconnected.
    ws = null;
    // Adapter configuration (apiKey plus optional model/voice/instructions).
    config;
    // Callback receiving translated ServerMessage events; set in connect().
    onMessage = null;
    // Async handler that executes "run_agent" queries; set in connect().
    toolHandler = null;
    // True once Gemini has acknowledged the setup message.
    setupDone = false;
    constructor(config) {
        this.config = config;
    }
    /**
     * Open the WebSocket session and perform the setup handshake.
     * Resolves once Gemini sends `setupComplete`; rejects if a socket error
     * occurs before setup completes. Errors after setup are surfaced as
     * { type: "error" } server messages instead.
     */
    async connect(opts) {
        this.onMessage = opts.onMessage;
        this.toolHandler = opts.toolHandler;
        this.setupDone = false;
        const model = this.config.model || "models/gemini-2.5-flash-native-audio-preview";
        // NOTE(review): API key is passed as a URL query parameter — it may end
        // up in proxy/server logs; confirm this is acceptable for deployments.
        const url = `wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent?key=${this.config.apiKey}`;
        return new Promise((resolve, reject) => {
            this.ws = new WebSocket(url);
            this.ws.on("open", () => {
                console.log(dim("[voice] Connected to Gemini Multimodal Live"));
                this.sendSetup(model);
            });
            this.ws.on("error", (err) => {
                console.error(dim(`[voice] Gemini WS error: ${err.message}`));
                if (!this.setupDone) {
                    // Handshake not finished yet — fail the connect() promise.
                    reject(err);
                }
                else {
                    // Session already live — report the error to the consumer.
                    this.emit({ type: "error", message: err.message });
                }
            });
            this.ws.on("close", () => {
                console.log(dim("[voice] Gemini WS closed"));
            });
            this.ws.on("message", (data) => {
                try {
                    const msg = JSON.parse(data.toString());
                    this.handleGeminiMessage(msg);
                    // Resolve after setup acknowledgment
                    if (!this.setupDone && msg.setupComplete) {
                        this.setupDone = true;
                        console.log(dim("[voice] Gemini session ready"));
                        resolve();
                    }
                }
                catch (err) {
                    // Malformed frame: log and keep the session alive.
                    console.error(dim(`[voice] Gemini parse error: ${err.message}`));
                }
            });
        });
    }
    /**
     * Translate a generic client message into the corresponding Gemini wire
     * message and send it. Unknown message types are silently ignored.
     */
    send(msg) {
        switch (msg.type) {
            case "audio":
                // Browser sends 24kHz, Gemini expects 16kHz
                this.sendRaw({
                    realtimeInput: {
                        mediaChunks: [{
                            mimeType: "audio/pcm;rate=16000",
                            data: downsample24kTo16k(msg.audio),
                        }],
                    },
                });
                break;
            case "video_frame":
                // Gemini supports continuous video streaming natively
                this.sendRaw({
                    realtimeInput: {
                        mediaChunks: [{
                            mimeType: msg.mimeType,
                            data: msg.frame,
                        }],
                    },
                });
                break;
            case "text":
                // Typed text becomes a complete user turn.
                this.sendRaw({
                    clientContent: {
                        turns: [{
                            role: "user",
                            parts: [{ text: msg.text }],
                        }],
                        turnComplete: true,
                    },
                });
                break;
            case "file": {
                const parts = [];
                if (msg.mimeType.startsWith("image/")) {
                    // Images go inline as base64 data plus a text label.
                    parts.push({ inlineData: { mimeType: msg.mimeType, data: msg.data } });
                    parts.push({ text: msg.text || `[User attached image: ${msg.name}]` });
                }
                else {
                    // Non-image files are decoded and embedded as fenced text.
                    // assumes the file content is valid UTF-8 text — TODO confirm
                    const decoded = Buffer.from(msg.data, "base64").toString("utf-8");
                    const label = msg.text ? `${msg.text}\n\n` : "";
                    parts.push({ text: `${label}[File: ${msg.name}]\n\`\`\`\n${decoded}\n\`\`\`` });
                }
                this.sendRaw({
                    clientContent: {
                        turns: [{ role: "user", parts }],
                        turnComplete: true,
                    },
                });
                break;
            }
        }
    }
    /** Close the WebSocket and drop the reference. Safe to call when idle. */
    async disconnect() {
        if (this.ws) {
            this.ws.close();
            this.ws = null;
        }
    }
    /** Forward a ServerMessage to the consumer callback, if one is registered. */
    emit(msg) {
        this.onMessage?.(msg);
    }
    /**
     * Send the one-time session setup message: model, voice, response
     * modalities, the "run_agent" tool declaration, system instructions, and
     * context-window compression limits.
     */
    sendSetup(model) {
        const instructions = this.config.instructions || DEFAULT_VOICE_INSTRUCTIONS;
        const voiceName = this.config.voice || "Aoede";
        this.sendRaw({
            setup: {
                model,
                generationConfig: {
                    responseModalities: ["AUDIO", "TEXT"],
                    speechConfig: {
                        voiceConfig: {
                            prebuiltVoiceConfig: { voiceName },
                        },
                    },
                },
                tools: [{
                    functionDeclarations: [{
                        name: "run_agent",
                        description: "Execute any request through the gitclaw agent. It has full access to the terminal (can run any shell command, open apps, install packages), file system (read/write/create files), git operations, and persistent memory. Use this for ALL actionable requests. IMPORTANT: If the user uploaded a file, always include the file path (from the '[File saved to: ...]' annotation) in the query.",
                        parameters: {
                            type: "OBJECT",
                            properties: {
                                query: {
                                    type: "STRING",
                                    description: "The user's request. MUST include file paths when referencing uploaded files (e.g. 'make a game using the image at workspace/lobster.png').",
                                },
                            },
                            required: ["query"],
                        },
                    }],
                }],
                systemInstruction: {
                    parts: [{ text: instructions }],
                },
                // Keep long sessions within the model's context budget.
                contextWindowCompression: {
                    triggerTokens: 25000,
                    slidingWindow: { targetTokens: 12500 },
                },
            },
        });
    }
    /**
     * Dispatch a parsed Gemini server message: tool calls go to
     * handleToolCall, audio parts are upsampled and emitted as audio deltas,
     * and text parts are emitted as assistant transcripts.
     */
    handleGeminiMessage(msg) {
        // Tool calls
        if (msg.toolCall) {
            this.handleToolCall(msg.toolCall);
            return;
        }
        // Server content (audio/text responses)
        if (msg.serverContent) {
            const sc = msg.serverContent;
            // Model turn parts
            if (sc.modelTurn?.parts) {
                for (const part of sc.modelTurn.parts) {
                    if (part.inlineData) {
                        const mimeType = part.inlineData.mimeType || "";
                        if (mimeType.startsWith("audio/")) {
                            // Gemini outputs 16kHz, browser expects 24kHz
                            const audio24k = upsample16kTo24k(part.inlineData.data);
                            this.emit({ type: "audio_delta", audio: audio24k });
                        }
                    }
                    if (part.text) {
                        this.emit({
                            type: "transcript",
                            role: "assistant",
                            text: part.text,
                            partial: !sc.turnComplete,
                        });
                    }
                }
            }
            // Turn complete marker
            // NOTE(review): when turnComplete arrives with parts, the text was
            // already emitted (non-partial) in the loop above, so consumers may
            // see the final transcript twice — verify against the UI handler.
            if (sc.turnComplete && sc.modelTurn?.parts) {
                const textParts = sc.modelTurn.parts.filter((p) => p.text).map((p) => p.text);
                if (textParts.length > 0) {
                    this.emit({ type: "transcript", role: "assistant", text: textParts.join("") });
                }
            }
            // Input transcription
            if (sc.inputTranscription?.text) {
                console.log(dim(`[voice] User: ${sc.inputTranscription.text}`));
                this.emit({ type: "transcript", role: "user", text: sc.inputTranscription.text });
            }
        }
    }
    /**
     * Execute the function calls in a Gemini toolCall message through the
     * registered toolHandler and send all responses back in one toolResponse.
     * Calls run sequentially; per-call errors are returned to the model rather
     * than aborting the batch.
     */
    async handleToolCall(toolCall) {
        if (!this.toolHandler)
            return;
        const functionCalls = toolCall.functionCalls || [];
        const responses = [];
        for (const fc of functionCalls) {
            if (fc.name !== "run_agent") {
                console.error(dim(`[voice] Unknown Gemini function call: ${fc.name}`));
                responses.push({ id: fc.id, name: fc.name, response: { error: `Unknown function: ${fc.name}` } });
                continue;
            }
            const queryArg = fc.args?.query;
            if (!queryArg) {
                responses.push({ id: fc.id, name: fc.name, response: { error: "Missing query argument" } });
                continue;
            }
            console.log(dim(`[voice] Agent query: ${queryArg}`));
            this.emit({ type: "agent_working", query: queryArg });
            try {
                const result = await this.toolHandler(queryArg);
                console.log(dim(`[voice] Agent response: ${result.slice(0, 200)}${result.length > 200 ? "..." : ""}`));
                responses.push({ id: fc.id, name: fc.name, response: { result } });
                this.emit({ type: "agent_done", result: result.slice(0, 500) });
            }
            catch (err) {
                console.error(dim(`[voice] Agent error: ${err.message}`));
                responses.push({ id: fc.id, name: fc.name, response: { error: err.message } });
                this.emit({ type: "error", message: err.message });
            }
        }
        this.sendRaw({
            toolResponse: { functionResponses: responses },
        });
    }
    /** JSON-serialize and send a raw wire message; no-op if the socket is not open. */
    sendRaw(msg) {
        if (this.ws && this.ws.readyState === WebSocket.OPEN) {
            this.ws.send(JSON.stringify(msg));
        }
    }
}
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
export type { VoiceAdapter, VoiceAdapterConfig, VoiceServerOptions, MultimodalAdapter, MultimodalAdapterConfig, AdapterBackend, ClientMessage, ServerMessage, } from "./adapter.js";
|
|
2
|
+
export { OpenAIRealtimeAdapter } from "./openai-realtime.js";
|
|
3
|
+
export { GeminiLiveAdapter } from "./gemini-live.js";
|
|
4
|
+
export { startVoiceServer } from "./server.js";
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
import { type MultimodalAdapter, type MultimodalAdapterConfig, type ClientMessage, type ServerMessage } from "./adapter.js";
|
|
2
|
+
/**
 * Multimodal voice adapter backed by the OpenAI Realtime API over WebSocket.
 * Implements the package's generic MultimodalAdapter contract: send client
 * messages in, receive ServerMessage events via the onMessage callback.
 */
export declare class OpenAIRealtimeAdapter implements MultimodalAdapter {
    private ws;
    private config;
    private latestVideoFrame;
    private latestScreenFrame;
    private onMessage;
    private toolHandler;
    private interrupted;
    constructor(config: MultimodalAdapterConfig);
    /**
     * Open the Realtime WebSocket session and configure it.
     * @param opts.toolHandler Executes "run_agent" queries; returns result text.
     * @param opts.onMessage Receives translated ServerMessage events.
     */
    connect(opts: {
        toolHandler: (query: string) => Promise<string>;
        onMessage: (msg: ServerMessage) => void;
    }): Promise<void>;
    /** Translate and forward a generic client message to the Realtime session. */
    send(msg: ClientMessage): void;
    /** Close the WebSocket session. */
    disconnect(): Promise<void>;
    private emit;
    /**
     * Inject the latest video frame as a conversation item so the model
     * can see it when generating the next response (e.g. after a voice turn).
     */
    private injectVideoFrame;
    private sendSessionUpdate;
    private handleEvent;
    private handleFunctionCall;
    private sendRaw;
}
|
|
@@ -0,0 +1,291 @@
|
|
|
1
|
+
import WebSocket from "ws";
|
|
2
|
+
import { DEFAULT_VOICE_INSTRUCTIONS, } from "./adapter.js";
|
|
3
|
+
// Wrap text in ANSI "dim" escape codes for muted console log output.
const dim = (s) => "\u001b[2m" + s + "\u001b[0m";
|
|
4
|
+
/**
 * Multimodal voice adapter backed by the OpenAI Realtime API over WebSocket.
 *
 * Responsibilities:
 *  - manage the WebSocket session and session.update configuration
 *  - translate generic client messages (audio / video_frame / text / file)
 *    into Realtime API events
 *  - translate Realtime server events back into generic ServerMessage events
 *  - bridge the single "run_agent" function tool to the supplied toolHandler
 *
 * Video note: OpenAI has no continuous video channel, so the adapter caches
 * the most recent camera/screen frame and injects it as an input_image
 * conversation item when the user next speaks or types.
 */
export class OpenAIRealtimeAdapter {
    // Active WebSocket connection, or null when disconnected.
    ws = null;
    // Adapter configuration (apiKey plus optional model/voice/instructions).
    config;
    // Most recent camera frame ({ frame, mimeType }), or null.
    latestVideoFrame = null;
    // Most recent screen-share frame ({ frame, mimeType }), or null.
    latestScreenFrame = null;
    // Callback receiving translated ServerMessage events; set in connect().
    onMessage = null;
    // Async handler that executes "run_agent" queries; set in connect().
    toolHandler = null;
    // True while the user has barged in; suppresses stale audio deltas.
    interrupted = false;
    constructor(config) {
        this.config = config;
    }
    /**
     * Open the Realtime WebSocket session and send the session configuration.
     * Resolves once the socket is open; rejects if the connection fails before
     * opening. Errors after that are surfaced as { type: "error" } messages.
     */
    async connect(opts) {
        this.onMessage = opts.onMessage;
        this.toolHandler = opts.toolHandler;
        const model = this.config.model || "gpt-realtime";
        const url = `wss://api.openai.com/v1/realtime?model=${model}`;
        return new Promise((resolve, reject) => {
            // Tracks whether the promise has settled. The previous guard checked
            // `!this.ws`, but `this.ws` is assigned synchronously below, so a
            // pre-open connection failure never rejected and connect() hung.
            let settled = false;
            this.ws = new WebSocket(url, {
                headers: {
                    Authorization: `Bearer ${this.config.apiKey}`,
                    "OpenAI-Beta": "realtime=v1",
                },
            });
            this.ws.on("open", () => {
                this.sendSessionUpdate();
                settled = true;
                resolve();
            });
            this.ws.on("error", (err) => {
                if (!settled) {
                    // Connection failed before the session opened — fail connect().
                    settled = true;
                    reject(err);
                }
                else {
                    // Session already live — report the error to the consumer.
                    console.error(dim(`[voice] WebSocket error: ${err.message}`));
                    this.emit({ type: "error", message: err.message });
                }
            });
            this.ws.on("close", () => {
                console.log(dim("[voice] WebSocket closed"));
            });
            this.ws.on("message", (data) => {
                // Guard JSON.parse so a malformed frame cannot throw inside the
                // ws callback (matches the Gemini adapter's parse handling).
                let event;
                try {
                    event = JSON.parse(data.toString());
                }
                catch (err) {
                    console.error(dim(`[voice] OpenAI parse error: ${err.message}`));
                    return;
                }
                this.handleEvent(event);
            });
        });
    }
    /**
     * Translate a generic client message into the corresponding Realtime API
     * event(s) and send them. Unknown message types are silently ignored.
     */
    send(msg) {
        switch (msg.type) {
            case "audio":
                this.sendRaw({
                    type: "input_audio_buffer.append",
                    audio: msg.audio,
                });
                break;
            case "video_frame": {
                // OpenAI doesn't support continuous video. Store latest frame and
                // inject it as an image on the next user turn via conversation item.
                const source = msg.source || "camera";
                if (source === "screen") {
                    this.latestScreenFrame = { frame: msg.frame, mimeType: msg.mimeType };
                }
                else {
                    this.latestVideoFrame = { frame: msg.frame, mimeType: msg.mimeType };
                }
                break;
            }
            case "text": {
                // Send text as a user conversation item, optionally with latest video frame
                const content = [];
                if (this.latestVideoFrame) {
                    content.push({
                        type: "input_image",
                        image_url: `data:${this.latestVideoFrame.mimeType};base64,${this.latestVideoFrame.frame}`,
                    });
                    // Clear so the same frame is not injected twice.
                    this.latestVideoFrame = null;
                }
                content.push({ type: "input_text", text: msg.text });
                this.sendRaw({
                    type: "conversation.item.create",
                    item: {
                        type: "message",
                        role: "user",
                        content,
                    },
                });
                this.sendRaw({ type: "response.create" });
                break;
            }
            case "file": {
                const content = [];
                if (msg.mimeType.startsWith("image/")) {
                    // Images go in as data-URL input_image items plus a text label.
                    content.push({
                        type: "input_image",
                        image_url: `data:${msg.mimeType};base64,${msg.data}`,
                    });
                    content.push({ type: "input_text", text: msg.text || `[User attached image: ${msg.name}]` });
                }
                else {
                    // Non-image files are decoded and embedded as fenced text.
                    // assumes the file content is valid UTF-8 text — TODO confirm
                    const decoded = Buffer.from(msg.data, "base64").toString("utf-8");
                    const label = msg.text ? `${msg.text}\n\n` : "";
                    content.push({ type: "input_text", text: `${label}[File: ${msg.name}]\n\`\`\`\n${decoded}\n\`\`\`` });
                }
                this.sendRaw({
                    type: "conversation.item.create",
                    item: { type: "message", role: "user", content },
                });
                this.sendRaw({ type: "response.create" });
                break;
            }
        }
    }
    /** Close the WebSocket and drop the reference. Safe to call when idle. */
    async disconnect() {
        if (this.ws) {
            this.ws.close();
            this.ws = null;
        }
    }
    /** Forward a ServerMessage to the consumer callback, if one is registered. */
    emit(msg) {
        this.onMessage?.(msg);
    }
    /**
     * Inject the latest video frame as a conversation item so the model
     * can see it when generating the next response (e.g. after a voice turn).
     */
    injectVideoFrame() {
        // Prefer screen frame over camera — it provides more useful context
        const isScreen = !!this.latestScreenFrame;
        const frame = this.latestScreenFrame || this.latestVideoFrame;
        if (!frame)
            return;
        // Clear both so we don't inject stale frames
        this.latestScreenFrame = null;
        this.latestVideoFrame = null;
        console.log(dim(`[voice] Injecting ${isScreen ? "screen" : "camera"} frame into conversation`));
        this.sendRaw({
            type: "conversation.item.create",
            item: {
                type: "message",
                role: "user",
                content: [{
                    type: "input_image",
                    image_url: `data:${frame.mimeType};base64,${frame.frame}`,
                }],
            },
        });
    }
    /**
     * Configure the Realtime session: instructions, voice, modalities,
     * server-side VAD turn detection, input transcription, and the single
     * "run_agent" function tool.
     */
    sendSessionUpdate() {
        const instructions = this.config.instructions || DEFAULT_VOICE_INSTRUCTIONS;
        this.sendRaw({
            type: "session.update",
            session: {
                instructions,
                voice: this.config.voice || "ash",
                modalities: ["text", "audio"],
                turn_detection: {
                    type: "server_vad",
                    threshold: 0.6,
                    prefix_padding_ms: 400,
                    silence_duration_ms: 800,
                    create_response: true,
                },
                input_audio_transcription: { model: "whisper-1" },
                tool_choice: "auto",
                tools: [
                    {
                        type: "function",
                        name: "run_agent",
                        description: "Your ONLY way to take action. This agent runs on the user's Mac with full shell access. It can: run ANY shell command, open apps (open -a Spotify), play music (osascript, afplay, open URLs), browse the web, read/write files, git operations, send emails, manage calendars, install packages, control system settings, and save memories. You MUST call this tool whenever the user asks you to DO anything — play music, open something, check something, build something, send something. NEVER describe an action without calling this tool. If the user asks and you just talk without calling this — you failed.",
                        parameters: {
                            type: "object",
                            properties: {
                                query: {
                                    type: "string",
                                    description: "What to do. Be specific. Include file paths for uploaded files. Examples: 'Play relaxing music on YouTube using: open https://youtube.com/...', 'Open Spotify and play chill playlist using osascript', 'Save to memory: user likes rock music'",
                                },
                            },
                            required: ["query"],
                        },
                    },
                ],
            },
        });
    }
    /**
     * Dispatch a parsed Realtime server event: barge-in handling on speech
     * start, transcripts, audio deltas (suppressed while interrupted), tool
     * calls, and errors.
     */
    handleEvent(event) {
        switch (event.type) {
            case "session.created":
                console.log(dim("[voice] Session created"));
                break;
            case "session.updated":
                console.log(dim("[voice] Session configured"));
                break;
            case "input_audio_buffer.speech_started":
                // VAD detected start of speech — inject video frame (what user is looking at)
                // and cancel any in-progress response so the user can interrupt
                this.interrupted = true;
                this.injectVideoFrame();
                this.sendRaw({ type: "response.cancel" });
                this.emit({ type: "interrupt" });
                break;
            case "input_audio_buffer.speech_stopped":
                break;
            case "conversation.item.input_audio_transcription.completed":
                if (event.transcript) {
                    console.log(dim(`[voice] User: ${event.transcript}`));
                    this.emit({ type: "transcript", role: "user", text: event.transcript });
                }
                break;
            case "response.created":
                // New response starting — accept audio again
                this.interrupted = false;
                break;
            case "response.audio.delta":
                if (event.delta && !this.interrupted) {
                    this.emit({ type: "audio_delta", audio: event.delta });
                }
                break;
            case "response.audio_transcript.delta":
                this.emit({ type: "transcript", role: "assistant", text: event.delta || "", partial: true });
                break;
            case "response.audio_transcript.done":
                if (event.transcript) {
                    this.emit({ type: "transcript", role: "assistant", text: event.transcript });
                }
                break;
            case "response.function_call_arguments.done":
                // Fire-and-forget: tool output is sent back via sendRaw when done.
                this.handleFunctionCall(event);
                break;
            case "error": {
                const errMsg = event.error?.message || "Unknown OpenAI error";
                console.error(dim(`[voice] Error: ${JSON.stringify(event.error)}`));
                // Don't surface cancellation errors — they happen when user interrupts with no active response
                if (errMsg.toLowerCase().includes("cancellation failed"))
                    break;
                this.emit({ type: "error", message: errMsg });
                break;
            }
        }
    }
    /**
     * Execute a completed "run_agent" function call through the toolHandler,
     * return its output (or error text) as a function_call_output item, and
     * request a follow-up response so the model can speak the result.
     */
    async handleFunctionCall(event) {
        const callId = event.call_id;
        const name = event.name;
        if (name !== "run_agent" || !this.toolHandler) {
            console.error(dim(`[voice] Unknown function call: ${name}`));
            return;
        }
        let args;
        try {
            args = JSON.parse(event.arguments);
        }
        catch {
            console.error(dim("[voice] Failed to parse function arguments"));
            return;
        }
        console.log(dim(`[voice] Agent query: ${args.query}`));
        this.emit({ type: "agent_working", query: args.query });
        try {
            const result = await this.toolHandler(args.query);
            console.log(dim(`[voice] Agent response: ${result.slice(0, 200)}${result.length > 200 ? "..." : ""}`));
            this.sendRaw({
                type: "conversation.item.create",
                item: {
                    type: "function_call_output",
                    call_id: callId,
                    output: result,
                },
            });
            this.sendRaw({ type: "response.create" });
            this.emit({ type: "agent_done", result: result.slice(0, 500) });
        }
        catch (err) {
            console.error(dim(`[voice] Agent error: ${err.message}`));
            this.sendRaw({
                type: "conversation.item.create",
                item: {
                    type: "function_call_output",
                    call_id: callId,
                    output: `Error: ${err.message}`,
                },
            });
            this.sendRaw({ type: "response.create" });
            this.emit({ type: "error", message: err.message });
        }
    }
    /** JSON-serialize and send a raw event; no-op if the socket is not open. */
    sendRaw(event) {
        if (this.ws && this.ws.readyState === WebSocket.OPEN) {
            this.ws.send(JSON.stringify(event));
        }
    }
}
|