gitclaw 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. package/LICENSE +21 -0
  2. package/README.md +54 -28
  3. package/dist/composio/adapter.d.ts +26 -0
  4. package/dist/composio/adapter.js +92 -0
  5. package/dist/composio/client.d.ts +39 -0
  6. package/dist/composio/client.js +170 -0
  7. package/dist/composio/index.d.ts +2 -0
  8. package/dist/composio/index.js +2 -0
  9. package/dist/context.d.ts +20 -0
  10. package/dist/context.js +211 -0
  11. package/dist/exports.d.ts +2 -0
  12. package/dist/exports.js +1 -0
  13. package/dist/index.js +99 -7
  14. package/dist/learning/reinforcement.d.ts +11 -0
  15. package/dist/learning/reinforcement.js +91 -0
  16. package/dist/loader.js +34 -1
  17. package/dist/sdk.js +5 -1
  18. package/dist/skills.d.ts +5 -0
  19. package/dist/skills.js +58 -7
  20. package/dist/tools/capture-photo.d.ts +3 -0
  21. package/dist/tools/capture-photo.js +91 -0
  22. package/dist/tools/index.d.ts +2 -1
  23. package/dist/tools/index.js +12 -2
  24. package/dist/tools/read.js +4 -0
  25. package/dist/tools/shared.d.ts +20 -0
  26. package/dist/tools/shared.js +24 -0
  27. package/dist/tools/skill-learner.d.ts +3 -0
  28. package/dist/tools/skill-learner.js +358 -0
  29. package/dist/tools/task-tracker.d.ts +20 -0
  30. package/dist/tools/task-tracker.js +275 -0
  31. package/dist/tools/write.js +4 -0
  32. package/dist/voice/adapter.d.ts +97 -0
  33. package/dist/voice/adapter.js +30 -0
  34. package/dist/voice/chat-history.d.ts +8 -0
  35. package/dist/voice/chat-history.js +121 -0
  36. package/dist/voice/gemini-live.d.ts +20 -0
  37. package/dist/voice/gemini-live.js +279 -0
  38. package/dist/voice/index.d.ts +4 -0
  39. package/dist/voice/index.js +3 -0
  40. package/dist/voice/openai-realtime.d.ts +27 -0
  41. package/dist/voice/openai-realtime.js +291 -0
  42. package/dist/voice/server.d.ts +2 -0
  43. package/dist/voice/server.js +2319 -0
  44. package/dist/voice/ui.html +2556 -0
  45. package/package.json +21 -7
@@ -0,0 +1,291 @@
1
+ import WebSocket from "ws";
2
+ import { DEFAULT_VOICE_INSTRUCTIONS, } from "./adapter.js";
3
// Wrap a string in ANSI "dim" escape codes for muted console output.
const dim = (text) => "\x1b[2m" + text + "\x1b[0m";
export class OpenAIRealtimeAdapter {
    // Raw WebSocket to the OpenAI Realtime API (null until connect()).
    ws = null;
    config;
    // Latest camera/screen frames, buffered until they can be injected as
    // conversation images (the Realtime API has no continuous video input).
    latestVideoFrame = null;
    latestScreenFrame = null;
    onMessage = null;
    toolHandler = null;
    // True while the user is speaking over an in-progress response; audio
    // deltas are suppressed until the next response.created event.
    interrupted = false;
    constructor(config) {
        this.config = config;
    }
    /**
     * Open the realtime WebSocket and configure the session.
     * Resolves once the socket is open; rejects if the socket errors or
     * closes before the connection is established.
     */
    async connect(opts) {
        this.onMessage = opts.onMessage;
        this.toolHandler = opts.toolHandler;
        const model = this.config.model || "gpt-realtime";
        const url = `wss://api.openai.com/v1/realtime?model=${model}`;
        return new Promise((resolve, reject) => {
            // BUGFIX: the old error handler guarded on `!this.ws`, but
            // `this.ws` is assigned synchronously below before any event
            // can fire, so reject() was unreachable and a failed connect
            // hung forever. Track settlement explicitly instead.
            let settled = false;
            this.ws = new WebSocket(url, {
                headers: {
                    Authorization: `Bearer ${this.config.apiKey}`,
                    "OpenAI-Beta": "realtime=v1",
                },
            });
            this.ws.on("open", () => {
                settled = true;
                this.sendSessionUpdate();
                resolve();
            });
            this.ws.on("error", (err) => {
                if (!settled) {
                    settled = true;
                    reject(err);
                }
                else {
                    console.error(dim(`[voice] WebSocket error: ${err.message}`));
                    this.emit({ type: "error", message: err.message });
                }
            });
            this.ws.on("close", () => {
                console.log(dim("[voice] WebSocket closed"));
                // BUGFIX: a close before open also used to leave the
                // connect() promise pending forever.
                if (!settled) {
                    settled = true;
                    reject(new Error("WebSocket closed before connection was established"));
                }
            });
            this.ws.on("message", (data) => {
                // BUGFIX: JSON.parse was unguarded — one malformed frame
                // threw inside the ws callback and crashed the process.
                let event;
                try {
                    event = JSON.parse(data.toString());
                }
                catch {
                    console.error(dim("[voice] Failed to parse server event"));
                    return;
                }
                this.handleEvent(event);
            });
        });
    }
    /**
     * Forward a client message to OpenAI. Audio is streamed directly;
     * video frames are buffered for later injection; text and files
     * become user conversation items followed by a response.create.
     */
    send(msg) {
        switch (msg.type) {
            case "audio":
                // Stream raw audio into the server-side input buffer.
                this.sendRaw({
                    type: "input_audio_buffer.append",
                    audio: msg.audio,
                });
                break;
            case "video_frame": {
                // OpenAI doesn't support continuous video. Store latest frame and
                // inject it as an image on the next user turn via conversation item.
                const source = msg.source || "camera";
                if (source === "screen") {
                    this.latestScreenFrame = { frame: msg.frame, mimeType: msg.mimeType };
                }
                else {
                    this.latestVideoFrame = { frame: msg.frame, mimeType: msg.mimeType };
                }
                break;
            }
            case "text": {
                // Send text as a user conversation item, optionally with latest video frame
                const content = [];
                if (this.latestVideoFrame) {
                    content.push({
                        type: "input_image",
                        image_url: `data:${this.latestVideoFrame.mimeType};base64,${this.latestVideoFrame.frame}`,
                    });
                    // Consume the frame so it isn't re-sent on later turns.
                    this.latestVideoFrame = null;
                }
                content.push({ type: "input_text", text: msg.text });
                this.sendRaw({
                    type: "conversation.item.create",
                    item: {
                        type: "message",
                        role: "user",
                        content,
                    },
                });
                this.sendRaw({ type: "response.create" });
                break;
            }
            case "file": {
                const content = [];
                if (msg.mimeType.startsWith("image/")) {
                    // Images are attached directly as input_image parts.
                    content.push({
                        type: "input_image",
                        image_url: `data:${msg.mimeType};base64,${msg.data}`,
                    });
                    content.push({ type: "input_text", text: msg.text || `[User attached image: ${msg.name}]` });
                }
                else {
                    // Other files are base64-decoded and inlined as fenced text.
                    const decoded = Buffer.from(msg.data, "base64").toString("utf-8");
                    const label = msg.text ? `${msg.text}\n\n` : "";
                    content.push({ type: "input_text", text: `${label}[File: ${msg.name}]\n\`\`\`\n${decoded}\n\`\`\`` });
                }
                this.sendRaw({
                    type: "conversation.item.create",
                    item: { type: "message", role: "user", content },
                });
                this.sendRaw({ type: "response.create" });
                break;
            }
        }
    }
    /** Close the socket, if any, and drop the reference. */
    async disconnect() {
        if (this.ws) {
            this.ws.close();
            this.ws = null;
        }
    }
    /** Deliver a message to the registered onMessage callback, if any. */
    emit(msg) {
        this.onMessage?.(msg);
    }
    /**
     * Inject the latest video frame as a conversation item so the model
     * can see it when generating the next response (e.g. after a voice turn).
     */
    injectVideoFrame() {
        // Prefer screen frame over camera — it provides more useful context
        const isScreen = !!this.latestScreenFrame;
        const frame = this.latestScreenFrame || this.latestVideoFrame;
        if (!frame)
            return;
        // Clear both so we don't inject stale frames
        this.latestScreenFrame = null;
        this.latestVideoFrame = null;
        console.log(dim(`[voice] Injecting ${isScreen ? "screen" : "camera"} frame into conversation`));
        this.sendRaw({
            type: "conversation.item.create",
            item: {
                type: "message",
                role: "user",
                content: [{
                        type: "input_image",
                        image_url: `data:${frame.mimeType};base64,${frame.frame}`,
                    }],
            },
        });
    }
    /**
     * Configure the session: instructions, voice, server-side VAD,
     * input transcription, and the single run_agent tool the model
     * must call to take any action.
     */
    sendSessionUpdate() {
        const instructions = this.config.instructions || DEFAULT_VOICE_INSTRUCTIONS;
        this.sendRaw({
            type: "session.update",
            session: {
                instructions,
                voice: this.config.voice || "ash",
                modalities: ["text", "audio"],
                turn_detection: {
                    type: "server_vad",
                    threshold: 0.6,
                    prefix_padding_ms: 400,
                    silence_duration_ms: 800,
                    create_response: true,
                },
                input_audio_transcription: { model: "whisper-1" },
                tool_choice: "auto",
                tools: [
                    {
                        type: "function",
                        name: "run_agent",
                        description: "Your ONLY way to take action. This agent runs on the user's Mac with full shell access. It can: run ANY shell command, open apps (open -a Spotify), play music (osascript, afplay, open URLs), browse the web, read/write files, git operations, send emails, manage calendars, install packages, control system settings, and save memories. You MUST call this tool whenever the user asks you to DO anything — play music, open something, check something, build something, send something. NEVER describe an action without calling this tool. If the user asks and you just talk without calling this — you failed.",
                        parameters: {
                            type: "object",
                            properties: {
                                query: {
                                    type: "string",
                                    description: "What to do. Be specific. Include file paths for uploaded files. Examples: 'Play relaxing music on YouTube using: open https://youtube.com/...', 'Open Spotify and play chill playlist using osascript', 'Save to memory: user likes rock music'",
                                },
                            },
                            required: ["query"],
                        },
                    },
                ],
            },
        });
    }
    /** Dispatch one server event: logs, interruption, transcripts, audio, tools. */
    handleEvent(event) {
        switch (event.type) {
            case "session.created":
                console.log(dim("[voice] Session created"));
                break;
            case "session.updated":
                console.log(dim("[voice] Session configured"));
                break;
            case "input_audio_buffer.speech_started":
                // VAD detected start of speech — inject video frame (what user is looking at)
                // and cancel any in-progress response so the user can interrupt
                this.interrupted = true;
                this.injectVideoFrame();
                this.sendRaw({ type: "response.cancel" });
                this.emit({ type: "interrupt" });
                break;
            case "input_audio_buffer.speech_stopped":
                break;
            case "conversation.item.input_audio_transcription.completed":
                if (event.transcript) {
                    console.log(dim(`[voice] User: ${event.transcript}`));
                    this.emit({ type: "transcript", role: "user", text: event.transcript });
                }
                break;
            case "response.created":
                // New response starting — accept audio again
                this.interrupted = false;
                break;
            case "response.audio.delta":
                // Drop audio for a response the user already interrupted.
                if (event.delta && !this.interrupted) {
                    this.emit({ type: "audio_delta", audio: event.delta });
                }
                break;
            case "response.audio_transcript.delta":
                this.emit({ type: "transcript", role: "assistant", text: event.delta || "", partial: true });
                break;
            case "response.audio_transcript.done":
                if (event.transcript) {
                    this.emit({ type: "transcript", role: "assistant", text: event.transcript });
                }
                break;
            case "response.function_call_arguments.done":
                this.handleFunctionCall(event);
                break;
            case "error": {
                const errMsg = event.error?.message || "Unknown OpenAI error";
                console.error(dim(`[voice] Error: ${JSON.stringify(event.error)}`));
                // Don't surface cancellation errors — they happen when user interrupts with no active response
                if (errMsg.toLowerCase().includes("cancellation failed"))
                    break;
                this.emit({ type: "error", message: errMsg });
                break;
            }
        }
    }
    /**
     * Run a run_agent tool call through the registered toolHandler and
     * return its output (or error text) to the model, then request a
     * follow-up response.
     */
    async handleFunctionCall(event) {
        const callId = event.call_id;
        const name = event.name;
        if (name !== "run_agent" || !this.toolHandler) {
            console.error(dim(`[voice] Unknown function call: ${name}`));
            return;
        }
        let args;
        try {
            args = JSON.parse(event.arguments);
        }
        catch {
            console.error(dim("[voice] Failed to parse function arguments"));
            return;
        }
        console.log(dim(`[voice] Agent query: ${args.query}`));
        this.emit({ type: "agent_working", query: args.query });
        try {
            const result = await this.toolHandler(args.query);
            console.log(dim(`[voice] Agent response: ${result.slice(0, 200)}${result.length > 200 ? "..." : ""}`));
            this.sendRaw({
                type: "conversation.item.create",
                item: {
                    type: "function_call_output",
                    call_id: callId,
                    output: result,
                },
            });
            this.sendRaw({ type: "response.create" });
            this.emit({ type: "agent_done", result: result.slice(0, 500) });
        }
        catch (err) {
            console.error(dim(`[voice] Agent error: ${err.message}`));
            // Surface the failure to the model so it can tell the user.
            this.sendRaw({
                type: "conversation.item.create",
                item: {
                    type: "function_call_output",
                    call_id: callId,
                    output: `Error: ${err.message}`,
                },
            });
            this.sendRaw({ type: "response.create" });
            this.emit({ type: "error", message: err.message });
        }
    }
    /** Send a JSON event if the socket is currently open; otherwise drop it. */
    sendRaw(event) {
        if (this.ws && this.ws.readyState === WebSocket.OPEN) {
            this.ws.send(JSON.stringify(event));
        }
    }
}
@@ -0,0 +1,2 @@
import type { VoiceServerOptions } from "./adapter.js";
/**
 * Start the voice server with the given options.
 *
 * @param opts - Server configuration; see {@link VoiceServerOptions}.
 * @returns A promise that resolves to an async callback — presumably a
 *   shutdown/stop function for the running server; confirm against the
 *   implementation in server.js.
 */
export declare function startVoiceServer(opts: VoiceServerOptions): Promise<() => Promise<void>>;