gitclaw 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +54 -28
- package/dist/composio/adapter.d.ts +26 -0
- package/dist/composio/adapter.js +92 -0
- package/dist/composio/client.d.ts +39 -0
- package/dist/composio/client.js +170 -0
- package/dist/composio/index.d.ts +2 -0
- package/dist/composio/index.js +2 -0
- package/dist/context.d.ts +20 -0
- package/dist/context.js +211 -0
- package/dist/exports.d.ts +2 -0
- package/dist/exports.js +1 -0
- package/dist/index.js +99 -7
- package/dist/learning/reinforcement.d.ts +11 -0
- package/dist/learning/reinforcement.js +91 -0
- package/dist/loader.js +34 -1
- package/dist/sdk.js +5 -1
- package/dist/skills.d.ts +5 -0
- package/dist/skills.js +58 -7
- package/dist/tools/capture-photo.d.ts +3 -0
- package/dist/tools/capture-photo.js +91 -0
- package/dist/tools/index.d.ts +2 -1
- package/dist/tools/index.js +12 -2
- package/dist/tools/read.js +4 -0
- package/dist/tools/shared.d.ts +20 -0
- package/dist/tools/shared.js +24 -0
- package/dist/tools/skill-learner.d.ts +3 -0
- package/dist/tools/skill-learner.js +358 -0
- package/dist/tools/task-tracker.d.ts +20 -0
- package/dist/tools/task-tracker.js +275 -0
- package/dist/tools/write.js +4 -0
- package/dist/voice/adapter.d.ts +97 -0
- package/dist/voice/adapter.js +30 -0
- package/dist/voice/chat-history.d.ts +8 -0
- package/dist/voice/chat-history.js +121 -0
- package/dist/voice/gemini-live.d.ts +20 -0
- package/dist/voice/gemini-live.js +279 -0
- package/dist/voice/index.d.ts +4 -0
- package/dist/voice/index.js +3 -0
- package/dist/voice/openai-realtime.d.ts +27 -0
- package/dist/voice/openai-realtime.js +291 -0
- package/dist/voice/server.d.ts +2 -0
- package/dist/voice/server.js +2319 -0
- package/dist/voice/ui.html +2556 -0
- package/package.json +21 -7
|
@@ -0,0 +1,291 @@
|
|
|
1
|
+
import WebSocket from "ws";
|
|
2
|
+
import { DEFAULT_VOICE_INSTRUCTIONS, } from "./adapter.js";
|
|
3
|
+
// Wrap text in the ANSI SGR "dim" attribute (ESC[2m ... ESC[0m) for faded log output.
const dim = (text) => `\x1b[2m${text}\x1b[0m`;
|
|
4
|
+
/**
 * Voice adapter backed by the OpenAI Realtime API over WebSocket.
 *
 * Streams user audio up, emits assistant audio/transcript deltas down, and
 * bridges a single `run_agent` function-call tool to an external handler.
 * Because the Realtime API has no continuous-video channel, camera/screen
 * frames are cached and injected as image conversation items on the next
 * user turn.
 */
export class OpenAIRealtimeAdapter {
    /** Live WebSocket to the Realtime API, or null when disconnected. */
    ws = null;
    /** Adapter config: { apiKey, model?, voice?, instructions? }. */
    config;
    /** Latest cached camera frame ({ frame, mimeType }) or null. */
    latestVideoFrame = null;
    /** Latest cached screen-share frame ({ frame, mimeType }) or null. */
    latestScreenFrame = null;
    /** Callback receiving adapter events; assigned in connect(). */
    onMessage = null;
    /** Async handler that executes agent queries; assigned in connect(). */
    toolHandler = null;
    /** True after the user barges in, until the next response starts; suppresses stale audio deltas. */
    interrupted = false;
    /**
     * @param {{ apiKey: string, model?: string, voice?: string, instructions?: string }} config
     */
    constructor(config) {
        this.config = config;
    }
    /**
     * Open the realtime WebSocket and configure the session.
     * Resolves once the socket opens; rejects if the connection fails before
     * it ever opens. Post-open socket errors are surfaced as "error" events.
     * @param {{ onMessage: (msg: object) => void, toolHandler: (query: string) => Promise<string> }} opts
     */
    async connect(opts) {
        this.onMessage = opts.onMessage;
        this.toolHandler = opts.toolHandler;
        const model = this.config.model || "gpt-realtime";
        const url = `wss://api.openai.com/v1/realtime?model=${model}`;
        return new Promise((resolve, reject) => {
            // BUG FIX: the original error handler checked `!this.ws`, but
            // `this.ws` is assigned synchronously below, so a pre-open
            // connection failure never rejected and this promise hung forever.
            // Track open state explicitly instead.
            let opened = false;
            this.ws = new WebSocket(url, {
                headers: {
                    Authorization: `Bearer ${this.config.apiKey}`,
                    "OpenAI-Beta": "realtime=v1",
                },
            });
            this.ws.on("open", () => {
                opened = true;
                this.sendSessionUpdate();
                resolve();
            });
            this.ws.on("error", (err) => {
                if (!opened) {
                    reject(err);
                }
                else {
                    console.error(dim(`[voice] WebSocket error: ${err.message}`));
                    this.emit({ type: "error", message: err.message });
                }
            });
            this.ws.on("close", () => {
                console.log(dim("[voice] WebSocket closed"));
            });
            this.ws.on("message", (data) => {
                // BUG FIX: guard JSON.parse — a malformed frame would otherwise
                // throw inside the ws callback and crash the process.
                let event;
                try {
                    event = JSON.parse(data.toString());
                }
                catch {
                    console.error(dim("[voice] Ignoring non-JSON message from server"));
                    return;
                }
                this.handleEvent(event);
            });
        });
    }
    /**
     * Forward a client message to the API.
     * - "audio": appended to the input audio buffer.
     * - "video_frame": cached locally (camera or screen) for later injection.
     * - "text"/"file": sent as user conversation items, then a response is requested.
     * @param {object} msg - client message with a `type` discriminator.
     */
    send(msg) {
        switch (msg.type) {
            case "audio":
                this.sendRaw({
                    type: "input_audio_buffer.append",
                    audio: msg.audio,
                });
                break;
            case "video_frame": {
                // OpenAI doesn't support continuous video. Store latest frame and
                // inject it as an image on the next user turn via conversation item.
                const source = msg.source || "camera";
                if (source === "screen") {
                    this.latestScreenFrame = { frame: msg.frame, mimeType: msg.mimeType };
                }
                else {
                    this.latestVideoFrame = { frame: msg.frame, mimeType: msg.mimeType };
                }
                break;
            }
            case "text": {
                // Send text as a user conversation item, optionally with latest video frame
                // NOTE(review): only the camera frame is attached here, while
                // injectVideoFrame() prefers the screen frame — confirm intended.
                const content = [];
                if (this.latestVideoFrame) {
                    content.push({
                        type: "input_image",
                        image_url: `data:${this.latestVideoFrame.mimeType};base64,${this.latestVideoFrame.frame}`,
                    });
                    this.latestVideoFrame = null;
                }
                content.push({ type: "input_text", text: msg.text });
                this.sendRaw({
                    type: "conversation.item.create",
                    item: {
                        type: "message",
                        role: "user",
                        content,
                    },
                });
                this.sendRaw({ type: "response.create" });
                break;
            }
            case "file": {
                const content = [];
                if (msg.mimeType.startsWith("image/")) {
                    content.push({
                        type: "input_image",
                        image_url: `data:${msg.mimeType};base64,${msg.data}`,
                    });
                    content.push({ type: "input_text", text: msg.text || `[User attached image: ${msg.name}]` });
                }
                else {
                    // Non-image files are decoded and inlined as fenced text.
                    const decoded = Buffer.from(msg.data, "base64").toString("utf-8");
                    const label = msg.text ? `${msg.text}\n\n` : "";
                    content.push({ type: "input_text", text: `${label}[File: ${msg.name}]\n\`\`\`\n${decoded}\n\`\`\`` });
                }
                this.sendRaw({
                    type: "conversation.item.create",
                    item: { type: "message", role: "user", content },
                });
                this.sendRaw({ type: "response.create" });
                break;
            }
        }
    }
    /** Close the WebSocket (if open) and drop the reference. Idempotent. */
    async disconnect() {
        if (this.ws) {
            this.ws.close();
            this.ws = null;
        }
    }
    /** Deliver an adapter event to the registered onMessage callback, if any. */
    emit(msg) {
        this.onMessage?.(msg);
    }
    /**
     * Inject the latest video frame as a conversation item so the model
     * can see it when generating the next response (e.g. after a voice turn).
     */
    injectVideoFrame() {
        // Prefer screen frame over camera — it provides more useful context
        const isScreen = !!this.latestScreenFrame;
        const frame = this.latestScreenFrame || this.latestVideoFrame;
        if (!frame)
            return;
        // Clear both so we don't inject stale frames
        this.latestScreenFrame = null;
        this.latestVideoFrame = null;
        console.log(dim(`[voice] Injecting ${isScreen ? "screen" : "camera"} frame into conversation`));
        this.sendRaw({
            type: "conversation.item.create",
            item: {
                type: "message",
                role: "user",
                content: [{
                        type: "input_image",
                        image_url: `data:${frame.mimeType};base64,${frame.frame}`,
                    }],
            },
        });
    }
    /**
     * Configure the session: instructions, voice, server-side VAD turn
     * detection, Whisper input transcription, and the single `run_agent` tool.
     */
    sendSessionUpdate() {
        const instructions = this.config.instructions || DEFAULT_VOICE_INSTRUCTIONS;
        this.sendRaw({
            type: "session.update",
            session: {
                instructions,
                voice: this.config.voice || "ash",
                modalities: ["text", "audio"],
                turn_detection: {
                    type: "server_vad",
                    threshold: 0.6,
                    prefix_padding_ms: 400,
                    silence_duration_ms: 800,
                    create_response: true,
                },
                input_audio_transcription: { model: "whisper-1" },
                tool_choice: "auto",
                tools: [
                    {
                        type: "function",
                        name: "run_agent",
                        description: "Your ONLY way to take action. This agent runs on the user's Mac with full shell access. It can: run ANY shell command, open apps (open -a Spotify), play music (osascript, afplay, open URLs), browse the web, read/write files, git operations, send emails, manage calendars, install packages, control system settings, and save memories. You MUST call this tool whenever the user asks you to DO anything — play music, open something, check something, build something, send something. NEVER describe an action without calling this tool. If the user asks and you just talk without calling this — you failed.",
                        parameters: {
                            type: "object",
                            properties: {
                                query: {
                                    type: "string",
                                    description: "What to do. Be specific. Include file paths for uploaded files. Examples: 'Play relaxing music on YouTube using: open https://youtube.com/...', 'Open Spotify and play chill playlist using osascript', 'Save to memory: user likes rock music'",
                                },
                            },
                            required: ["query"],
                        },
                    },
                ],
            },
        });
    }
    /**
     * Dispatch one server event: transcripts, audio deltas, barge-in
     * (speech_started cancels the in-flight response and injects the cached
     * frame), function calls, and errors.
     * @param {object} event - parsed Realtime API server event.
     */
    handleEvent(event) {
        switch (event.type) {
            case "session.created":
                console.log(dim("[voice] Session created"));
                break;
            case "session.updated":
                console.log(dim("[voice] Session configured"));
                break;
            case "input_audio_buffer.speech_started":
                // VAD detected start of speech — inject video frame (what user is looking at)
                // and cancel any in-progress response so the user can interrupt
                this.interrupted = true;
                this.injectVideoFrame();
                this.sendRaw({ type: "response.cancel" });
                this.emit({ type: "interrupt" });
                break;
            case "input_audio_buffer.speech_stopped":
                break;
            case "conversation.item.input_audio_transcription.completed":
                if (event.transcript) {
                    console.log(dim(`[voice] User: ${event.transcript}`));
                    this.emit({ type: "transcript", role: "user", text: event.transcript });
                }
                break;
            case "response.created":
                // New response starting — accept audio again
                this.interrupted = false;
                break;
            case "response.audio.delta":
                if (event.delta && !this.interrupted) {
                    this.emit({ type: "audio_delta", audio: event.delta });
                }
                break;
            case "response.audio_transcript.delta":
                this.emit({ type: "transcript", role: "assistant", text: event.delta || "", partial: true });
                break;
            case "response.audio_transcript.done":
                if (event.transcript) {
                    this.emit({ type: "transcript", role: "assistant", text: event.transcript });
                }
                break;
            case "response.function_call_arguments.done":
                // Fire-and-forget: handleFunctionCall catches its own errors.
                this.handleFunctionCall(event);
                break;
            case "error": {
                const errMsg = event.error?.message || "Unknown OpenAI error";
                console.error(dim(`[voice] Error: ${JSON.stringify(event.error)}`));
                // Don't surface cancellation errors — they happen when user interrupts with no active response
                if (errMsg.toLowerCase().includes("cancellation failed"))
                    break;
                this.emit({ type: "error", message: errMsg });
                break;
            }
        }
    }
    /**
     * Run a `run_agent` function call through the external tool handler and
     * feed the result (or error) back as a function_call_output item, then
     * request a follow-up response.
     * @param {{ call_id: string, name: string, arguments: string }} event
     */
    async handleFunctionCall(event) {
        const callId = event.call_id;
        const name = event.name;
        if (name !== "run_agent" || !this.toolHandler) {
            console.error(dim(`[voice] Unknown function call: ${name}`));
            return;
        }
        let args;
        try {
            args = JSON.parse(event.arguments);
        }
        catch {
            console.error(dim("[voice] Failed to parse function arguments"));
            return;
        }
        console.log(dim(`[voice] Agent query: ${args.query}`));
        this.emit({ type: "agent_working", query: args.query });
        try {
            const result = await this.toolHandler(args.query);
            console.log(dim(`[voice] Agent response: ${result.slice(0, 200)}${result.length > 200 ? "..." : ""}`));
            this.sendRaw({
                type: "conversation.item.create",
                item: {
                    type: "function_call_output",
                    call_id: callId,
                    output: result,
                },
            });
            this.sendRaw({ type: "response.create" });
            this.emit({ type: "agent_done", result: result.slice(0, 500) });
        }
        catch (err) {
            console.error(dim(`[voice] Agent error: ${err.message}`));
            this.sendRaw({
                type: "conversation.item.create",
                item: {
                    type: "function_call_output",
                    call_id: callId,
                    output: `Error: ${err.message}`,
                },
            });
            this.sendRaw({ type: "response.create" });
            this.emit({ type: "error", message: err.message });
        }
    }
    /** Serialize and send an event if the socket exists and is OPEN; otherwise a silent no-op. */
    sendRaw(event) {
        if (this.ws && this.ws.readyState === WebSocket.OPEN) {
            this.ws.send(JSON.stringify(event));
        }
    }
}
|