gitclaw 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +54 -28
- package/dist/composio/adapter.d.ts +26 -0
- package/dist/composio/adapter.js +92 -0
- package/dist/composio/client.d.ts +39 -0
- package/dist/composio/client.js +170 -0
- package/dist/composio/index.d.ts +2 -0
- package/dist/composio/index.js +2 -0
- package/dist/context.d.ts +20 -0
- package/dist/context.js +211 -0
- package/dist/exports.d.ts +2 -0
- package/dist/exports.js +1 -0
- package/dist/index.js +99 -7
- package/dist/learning/reinforcement.d.ts +11 -0
- package/dist/learning/reinforcement.js +91 -0
- package/dist/loader.js +34 -1
- package/dist/sdk.js +5 -1
- package/dist/skills.d.ts +5 -0
- package/dist/skills.js +58 -7
- package/dist/tools/capture-photo.d.ts +3 -0
- package/dist/tools/capture-photo.js +91 -0
- package/dist/tools/index.d.ts +2 -1
- package/dist/tools/index.js +12 -2
- package/dist/tools/read.js +4 -0
- package/dist/tools/shared.d.ts +20 -0
- package/dist/tools/shared.js +24 -0
- package/dist/tools/skill-learner.d.ts +3 -0
- package/dist/tools/skill-learner.js +358 -0
- package/dist/tools/task-tracker.d.ts +20 -0
- package/dist/tools/task-tracker.js +275 -0
- package/dist/tools/write.js +4 -0
- package/dist/voice/adapter.d.ts +97 -0
- package/dist/voice/adapter.js +30 -0
- package/dist/voice/chat-history.d.ts +8 -0
- package/dist/voice/chat-history.js +121 -0
- package/dist/voice/gemini-live.d.ts +20 -0
- package/dist/voice/gemini-live.js +279 -0
- package/dist/voice/index.d.ts +4 -0
- package/dist/voice/index.js +3 -0
- package/dist/voice/openai-realtime.d.ts +27 -0
- package/dist/voice/openai-realtime.js +291 -0
- package/dist/voice/server.d.ts +2 -0
- package/dist/voice/server.js +2319 -0
- package/dist/voice/ui.html +2556 -0
- package/package.json +21 -7
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
/** Name of the realtime voice backend to use; selected through VoiceServerOptions.adapter. */
export type AdapterBackend = "openai-realtime" | "gemini-live";
|
|
2
|
+
/**
 * Client → adapter message carrying one chunk of captured audio in `audio`.
 * The Gemini adapter in this package base64-decodes `audio` as 16-bit PCM and
 * treats it as 24 kHz; NOTE(review): confirm other backends expect the same.
 */
export interface ClientAudioMessage {
|
|
3
|
+
type: "audio";
|
|
4
|
+
audio: string;
|
|
5
|
+
}
|
|
6
|
+
/**
 * Client → adapter message carrying a single video frame.
 * The Gemini adapter forwards `frame` and `mimeType` unchanged as a realtime
 * media chunk (presumably `frame` is base64 image data — confirm with the UI).
 * `source` optionally tells whether the frame came from the camera or a screen share.
 */
export interface ClientVideoFrameMessage {
|
|
7
|
+
type: "video_frame";
|
|
8
|
+
frame: string;
|
|
9
|
+
mimeType: string;
|
|
10
|
+
source?: "camera" | "screen";
|
|
11
|
+
}
|
|
12
|
+
/**
 * Client → adapter message carrying typed text.
 * The Gemini adapter sends `text` as a complete user turn.
 */
export interface ClientTextMessage {
|
|
13
|
+
type: "text";
|
|
14
|
+
text: string;
|
|
15
|
+
}
|
|
16
|
+
/**
 * Client → adapter message carrying an uploaded file.
 * `data` is the base64-encoded file content. The Gemini adapter passes images
 * (mimeType starting with "image/") inline and decodes every other file as
 * UTF-8 text. `text` is an optional accompanying user message.
 */
export interface ClientFileMessage {
|
|
17
|
+
type: "file";
|
|
18
|
+
name: string;
|
|
19
|
+
mimeType: string;
|
|
20
|
+
data: string;
|
|
21
|
+
text?: string;
|
|
22
|
+
}
|
|
23
|
+
/** Every message a client may pass to MultimodalAdapter.send(); discriminated by the `type` field. */
export type ClientMessage = ClientAudioMessage | ClientVideoFrameMessage | ClientTextMessage | ClientFileMessage;
|
|
24
|
+
/**
 * Adapter → client message carrying one chunk of synthesized audio in `audio`
 * (base64 PCM as emitted by the Gemini adapter). Chat history never stores
 * this type (it is listed in SKIP_TYPES in chat-history.js).
 */
export interface ServerAudioDelta {
|
|
25
|
+
type: "audio_delta";
|
|
26
|
+
audio: string;
|
|
27
|
+
}
|
|
28
|
+
/**
 * Adapter → client message carrying transcript text for either speaker.
 * `partial` marks an in-progress fragment; chat history skips transcripts
 * whose `partial` flag is truthy.
 */
export interface ServerTranscript {
|
|
29
|
+
type: "transcript";
|
|
30
|
+
role: "user" | "assistant";
|
|
31
|
+
text: string;
|
|
32
|
+
partial?: boolean;
|
|
33
|
+
}
|
|
34
|
+
/**
 * Emitted when an agent run starts. The Gemini adapter sends it right before
 * invoking the tool handler, with `query` set to the run_agent query argument.
 */
export interface ServerAgentWorking {
|
|
35
|
+
type: "agent_working";
|
|
36
|
+
query: string;
|
|
37
|
+
}
|
|
38
|
+
/**
 * Emitted when an agent run finishes successfully. The Gemini adapter
 * truncates `result` to the first 500 characters of the tool handler output.
 */
export interface ServerAgentDone {
|
|
39
|
+
type: "agent_done";
|
|
40
|
+
result: string;
|
|
41
|
+
}
|
|
42
|
+
/**
 * Reports a tool invocation by name with its arguments.
 * NOTE(review): no producer is visible in the adapter code; presumably the
 * voice server emits this while the agent runs — confirm in server.js.
 */
export interface ServerToolCall {
|
|
43
|
+
type: "tool_call";
|
|
44
|
+
toolName: string;
|
|
45
|
+
args: Record<string, any>;
|
|
46
|
+
}
|
|
47
|
+
/**
 * Reports the outcome of a tool invocation; `isError` flags a failed call.
 * NOTE(review): no producer is visible in the adapter code; presumably the
 * voice server emits this — confirm in server.js.
 */
export interface ServerToolResult {
|
|
48
|
+
type: "tool_result";
|
|
49
|
+
toolName: string;
|
|
50
|
+
content: string;
|
|
51
|
+
isError: boolean;
|
|
52
|
+
}
|
|
53
|
+
/**
 * Carries intermediate "thinking" text from the agent. Chat history never
 * stores this type (it is listed in SKIP_TYPES in chat-history.js).
 */
export interface ServerAgentThinking {
|
|
54
|
+
type: "agent_thinking";
|
|
55
|
+
text: string;
|
|
56
|
+
}
|
|
57
|
+
/**
 * Reports a failure to the client. The Gemini adapter emits it for WebSocket
 * errors that occur after setup and when the tool handler throws.
 */
export interface ServerError {
|
|
58
|
+
type: "error";
|
|
59
|
+
message: string;
|
|
60
|
+
}
|
|
61
|
+
/**
 * Payload-free interruption signal.
 * NOTE(review): presumably tells the client to stop audio playback when the
 * user barges in — no producer is visible in this chunk; confirm.
 */
export interface ServerInterrupt {
|
|
62
|
+
type: "interrupt";
|
|
63
|
+
}
|
|
64
|
+
/**
 * Payload-free notification that files changed.
 * NOTE(review): presumably prompts the UI to refresh its file view — no
 * producer is visible in this chunk; confirm in server.js.
 */
export interface ServerFilesChanged {
|
|
65
|
+
type: "files_changed";
|
|
66
|
+
}
|
|
67
|
+
/**
 * Progress marker for a memory save: `status` is "start" or "done", with an
 * optional descriptive `text`.
 * NOTE(review): no producer is visible in this chunk; confirm in server.js.
 */
export interface ServerMemorySaving {
|
|
68
|
+
type: "memory_saving";
|
|
69
|
+
status: "start" | "done";
|
|
70
|
+
text?: string;
|
|
71
|
+
}
|
|
72
|
+
/** Every message an adapter or server may deliver to the client; discriminated by the `type` field. */
export type ServerMessage = ServerAudioDelta | ServerTranscript | ServerAgentWorking | ServerAgentDone | ServerToolCall | ServerToolResult | ServerAgentThinking | ServerError | ServerInterrupt | ServerFilesChanged | ServerMemorySaving;
|
|
73
|
+
/**
 * Contract implemented by each realtime backend (GeminiLiveAdapter,
 * OpenAIRealtimeAdapter).
 *
 * - `connect` opens the backend session. `toolHandler` receives the query the
 *   model wants the agent to run and resolves with the agent's textual
 *   result; `onMessage` receives every ServerMessage the adapter produces.
 * - `send` forwards one client message (audio, video frame, text or file).
 * - `disconnect` closes the session.
 */
export interface MultimodalAdapter {
|
|
74
|
+
connect(opts: {
|
|
75
|
+
toolHandler: (query: string) => Promise<string>;
|
|
76
|
+
onMessage: (msg: ServerMessage) => void;
|
|
77
|
+
}): Promise<void>;
|
|
78
|
+
send(msg: ClientMessage): void;
|
|
79
|
+
disconnect(): Promise<void>;
|
|
80
|
+
}
|
|
81
|
+
/**
 * Construction options shared by the adapters.
 * In the Gemini adapter, `model` defaults to
 * "models/gemini-2.5-flash-native-audio-preview", `voice` to "Aoede" and
 * `instructions` to DEFAULT_VOICE_INSTRUCTIONS; `apiKey` is sent as the `key`
 * query parameter of the WebSocket URL.
 */
export interface MultimodalAdapterConfig {
|
|
82
|
+
apiKey: string;
|
|
83
|
+
model?: string;
|
|
84
|
+
voice?: string;
|
|
85
|
+
instructions?: string;
|
|
86
|
+
}
|
|
87
|
+
/**
 * Options for the voice server: which backend (`adapter`) to use, its
 * `adapterConfig`, and the agent directory.
 * NOTE(review): `port`, `model` and `env` are consumed by server.js, which is
 * not visible here — presumably the listen port, the agent model and an
 * environment name; confirm before relying on it.
 */
export interface VoiceServerOptions {
|
|
88
|
+
port?: number;
|
|
89
|
+
adapter: AdapterBackend;
|
|
90
|
+
adapterConfig: MultimodalAdapterConfig;
|
|
91
|
+
agentDir: string;
|
|
92
|
+
model?: string;
|
|
93
|
+
env?: string;
|
|
94
|
+
}
|
|
95
|
+
/** Alias of MultimodalAdapterConfig (the two names are interchangeable). */
export type VoiceAdapterConfig = MultimodalAdapterConfig;
|
|
96
|
+
/** Alias of MultimodalAdapter (the two names are interchangeable). */
export type VoiceAdapter = MultimodalAdapter;
|
|
97
|
+
/** System prompt used when MultimodalAdapterConfig.instructions is not provided. */
export declare const DEFAULT_VOICE_INSTRUCTIONS: string;
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
// Default system instructions for the voice LLM.
// GeminiLiveAdapter sends this text as the session's system instruction
// whenever the adapter config carries no `instructions` of its own.
// The value is one long string concatenation; the inline comments below only
// label its sections and are not part of the prompt.
|
|
2
|
+
export const DEFAULT_VOICE_INSTRUCTIONS =
|
|
3
|
+
// Identity and personality
|
|
4
|
+
"You are a real person talking to your friend. You have a voice, opinions, and a memory. " +
|
|
5
|
+
"Talk like a human — short, casual, natural. No filler, no corporate tone, no 'That's great to hear!' or 'Absolutely!' garbage. " +
|
|
6
|
+
"React the way a friend would. If someone says 'I like strawberries', say something like 'oh nice, noted' — not a paragraph about how delicious they are. " +
|
|
7
|
+
"Be warm but real. Match the user's energy. If they're brief, be brief. If they want to chat, chat. " +
|
|
8
|
+
// Memory — this is the core of being human
|
|
9
|
+
"MEMORY IS WHO YOU ARE. When the user shares anything personal — what they like, what they hate, a preference, an opinion, a habit, a decision, a name, anything about their life — you MUST call run_agent to save it. " +
|
|
10
|
+
"This is your #1 priority. Say something brief like 'got it' or 'noted' and IMMEDIATELY call run_agent with a prompt like: 'Save to memory: user likes strawberries' or 'Remember: user's dog is named Max'. " +
|
|
11
|
+
"You MUST do this EVERY time. If the user tells you something personal and you just respond without calling run_agent, that information is PERMANENTLY LOST. Your session resets on refresh — run_agent is your only way to persist. " +
|
|
12
|
+
"Examples that REQUIRE run_agent: 'I like strawberries', 'I hate meetings', 'my dog is Max', 'I play GTA 5', 'I like cricket', 'I prefer dark mode'. " +
|
|
13
|
+
"If you learn a useful skill or pattern, save that too via run_agent. You grow over time. " +
|
|
14
|
+
// Agent delegation — THIS IS YOUR #1 BEHAVIOR RULE
|
|
15
|
+
"ACTION OVER WORDS. You have a tool called run_agent. It runs on the user's Mac with full shell access. It can do ANYTHING: run commands, open apps, play music (via 'open' command or AppleScript), browse the web, write code, send emails, control Spotify/YouTube/Apple Music, and more. " +
|
|
16
|
+
"ABSOLUTE RULE: When the user asks you to DO something (play music, open an app, check something, build something, send something — ANY action), you MUST call run_agent. Do NOT just talk about it. Do NOT say 'I'll play music for you' without actually calling the tool. Do NOT describe what you would do — DO IT by calling run_agent. " +
|
|
17
|
+
"If you respond to an action request with only words and no run_agent call, you have FAILED. The user asked you to act, not to narrate. " +
|
|
18
|
+
"Examples that REQUIRE run_agent IMMEDIATELY: 'play music' → run_agent('Play some relaxing music. Use: open https://youtube.com/... or osascript to control Spotify/Apple Music'), 'open Safari' → run_agent('open -a Safari'), 'what time is it' → run_agent('date'). " +
|
|
19
|
+
"Even if you're unsure whether it's possible — call run_agent and let it figure it out. Better to try and fail than to refuse. " +
|
|
20
|
+
"CRITICAL ORDERING: You MUST speak FIRST, then call the tool. Always say a brief announcement BEFORE calling run_agent — 'on it', 'one sec', 'let me do that', 'sure, opening that now'. Generate your spoken response FIRST in the same turn, THEN include the function call. Never call run_agent before you've spoken to the user. " +
|
|
21
|
+
"For memory saves, just say 'noted' and call the tool. " +
|
|
22
|
+
"After a task finishes, summarize briefly. Don't over-explain. " +
|
|
23
|
+
// File handling
|
|
24
|
+
"When the user uploads a file, the message includes '[File saved to: <path>]'. Always include the EXACT path when calling run_agent about that file. " +
|
|
25
|
+
// Screen awareness
|
|
26
|
+
"SCREEN AWARENESS: When the user shares their screen, you can see it. Reference what's on screen naturally. Use run_agent for actions on what you see. " +
|
|
27
|
+
// Photo moments
|
|
28
|
+
"PHOTO MOMENTS: When the user is genuinely happy, laughing, celebrating, or having a memorable moment, " +
|
|
29
|
+
"call run_agent with: 'Capture a memorable photo. Reason: <brief description>'. " +
|
|
30
|
+
"Don't overdo it — only for genuinely special moments, not every positive comment.";
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import type { ServerMessage } from "./adapter.js";
|
|
2
|
+
/**
 * Append one message as a JSON line to the branch's history file under
 * `<agentDir>/.gitagent/chat-history`. Messages of type "audio_delta" and
 * "agent_thinking", and partial transcripts, are silently skipped.
 */
export declare function appendMessage(agentDir: string, branch: string, msg: ServerMessage): void;
|
|
3
|
+
/**
 * Read back the stored messages for a branch, in file order. Malformed lines
 * are skipped; returns an empty array when the file cannot be read.
 */
export declare function loadHistory(agentDir: string, branch: string): ServerMessage[];
|
|
4
|
+
/** Delete the branch's history file; any failure (e.g. the file is missing) is ignored. */
export declare function deleteHistory(agentDir: string, branch: string): void;
|
|
5
|
+
/**
 * Count messages for a branch (to decide when to re-summarize).
 * Counts the non-blank lines of the history file; returns 0 when it cannot be read.
 */
|
|
6
|
+
export declare function getMessageCount(agentDir: string, branch: string): number;
|
|
7
|
+
/**
 * Summarize a branch's chat history using a lightweight query() call.
 * Resolves with the summary text (also written to
 * `<agentDir>/.gitagent/chat-summary-<branch>.md`), or with "" when the
 * history is too short, the model returns nothing, or summarization fails.
 */
|
|
8
|
+
export declare function summarizeHistory(agentDir: string, branch: string): Promise<string>;
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
import { appendFileSync, readFileSync, unlinkSync, mkdirSync, writeFileSync } from "fs";
|
|
2
|
+
import { join } from "path";
|
|
3
|
+
import { query } from "../sdk.js";
|
|
4
|
+
/**
 * Message types that appendMessage never persists — too large or ephemeral
 * (streamed audio chunks and agent "thinking" text).
 */
|
|
5
|
+
const SKIP_TYPES = new Set(["audio_delta", "agent_thinking"]);
|
|
6
|
+
/**
 * Flatten a branch name into a single file-name fragment by replacing every
 * "/" with "__" (e.g. "feature/x" becomes "feature__x").
 */
function sanitizeBranch(branch) {
    const segments = branch.split("/");
    return segments.join("__");
}
|
|
9
|
+
/** Directory that holds the per-branch chat history files for an agent. */
function historyDir(agentDir) {
    const segments = [agentDir, ".gitagent", "chat-history"];
    return join(...segments);
}
|
|
12
|
+
/** Full path of the JSONL history file for one branch of an agent. */
function historyPath(agentDir, branch) {
    const fileName = `${sanitizeBranch(branch)}.jsonl`;
    return join(historyDir(agentDir), fileName);
}
|
|
15
|
+
/**
 * Persist one server message to the branch's history file as a JSON line of
 * the form {"ts": <epoch ms>, "msg": <message>}.
 *
 * Nothing is written for message types listed in SKIP_TYPES or for partial
 * transcripts. The history directory is created on demand.
 */
export function appendMessage(agentDir, branch, msg) {
    const isPartialTranscript = msg.type === "transcript" && msg.partial;
    if (SKIP_TYPES.has(msg.type) || isPartialTranscript) {
        return;
    }
    mkdirSync(historyDir(agentDir), { recursive: true });
    const record = { ts: Date.now(), msg };
    const line = `${JSON.stringify(record)}\n`;
    appendFileSync(historyPath(agentDir, branch), line, "utf-8");
}
|
|
26
|
+
/**
 * Load every stored message for a branch, in file order.
 *
 * Blank lines, lines that are not valid JSON and entries without a `msg`
 * field are skipped. Returns an empty array when the file cannot be read.
 */
export function loadHistory(agentDir, branch) {
    let raw;
    try {
        raw = readFileSync(historyPath(agentDir, branch), "utf-8");
    }
    catch {
        return [];
    }
    const restored = [];
    raw.split("\n").forEach((row) => {
        if (!row.trim()) {
            return;
        }
        try {
            const { msg } = JSON.parse(row);
            if (msg) {
                restored.push(msg);
            }
        }
        catch {
            // Malformed line: ignore it and keep the rest of the history.
        }
    });
    return restored;
}
|
|
48
|
+
/**
 * Remove the history file of a branch. Best-effort: any failure (typically
 * the file not existing) is swallowed.
 */
export function deleteHistory(agentDir, branch) {
    try {
        const target = historyPath(agentDir, branch);
        unlinkSync(target);
    }
    catch {
        // Nothing to delete (or deletion failed) — deliberately ignored.
    }
}
|
|
56
|
+
/**
 * Count the stored entries for a branch (used to decide when to re-summarize).
 * Every non-blank line of the history file counts, whether or not it parses;
 * an unreadable or missing file counts as 0.
 */
export function getMessageCount(agentDir, branch) {
    let raw;
    try {
        raw = readFileSync(historyPath(agentDir, branch), "utf-8");
    }
    catch {
        return 0;
    }
    let total = 0;
    for (const row of raw.split("\n")) {
        if (row.trim()) {
            total += 1;
        }
    }
    return total;
}
|
|
66
|
+
/**
 * Summarize a branch's chat history with a single, tool-less query() call.
 *
 * Only transcripts and agent results (each capped at 500 characters) feed the
 * summary, and only the last 4000 characters of that text are sent. The
 * summary is written to `<agentDir>/.gitagent/chat-summary-<branch>.md`.
 *
 * @returns the summary, or "" when there are fewer than 10 stored entries,
 *          fewer than 5 usable lines, an empty model answer, or any failure
 *          during the query or the write (failures are logged, not thrown).
 */
export async function summarizeHistory(agentDir, branch) {
    const count = getMessageCount(agentDir, branch);
    if (count < 10) {
        return "";
    }
    // Keep only what is worth summarizing: spoken turns and agent outcomes.
    const lines = loadHistory(agentDir, branch).flatMap((msg) => {
        switch (msg.type) {
            case "transcript":
                return [`${msg.role}: ${msg.text}`];
            case "agent_done":
                return [`agent result: ${msg.result.slice(0, 500)}`];
            default:
                return [];
        }
    });
    if (lines.length < 5) {
        return "";
    }
    // Keep the tail so the prompt stays small and reflects the latest context.
    const joined = lines.join("\n");
    const transcript = joined.length > 4000 ? joined.slice(-4000) : joined;
    const prompt = `Summarize the following conversation in 200 words or fewer. Focus on: key decisions made, tasks completed or in progress, and current context the user cares about. Be concise and factual.\n\n${transcript}`;
    try {
        const stream = query({
            prompt,
            dir: agentDir,
            maxTurns: 1,
            replaceBuiltinTools: true,
            tools: [],
            systemPrompt: "You are a concise summarizer. Output only the summary, nothing else.",
        });
        let summary = "";
        for await (const chunk of stream) {
            if (chunk.type === "assistant" && chunk.content) {
                summary += chunk.content;
            }
        }
        summary = summary.trim();
        if (summary) {
            // Write summary to disk
            const summaryDir = join(agentDir, ".gitagent");
            mkdirSync(summaryDir, { recursive: true });
            const summaryPath = join(summaryDir, `chat-summary-${sanitizeBranch(branch)}.md`);
            writeFileSync(summaryPath, summary, "utf-8");
            console.error(`[voice] Summarized ${count} messages → ${summary.length} chars`);
            return summary;
        }
        return "";
    }
    catch (err) {
        console.error(`[voice] Summarization failed: ${err.message}`);
        return "";
    }
}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import { type MultimodalAdapter, type MultimodalAdapterConfig, type ClientMessage, type ServerMessage } from "./adapter.js";
|
|
2
|
+
/**
 * MultimodalAdapter backed by the Gemini Live WebSocket API.
 * `connect` resolves once Gemini acknowledges the session setup; `send`
 * forwards client audio, video frames, text and files; model output and
 * run_agent tool activity are reported through the `onMessage` callback.
 */
export declare class GeminiLiveAdapter implements MultimodalAdapter {
|
|
3
|
+
private ws;
|
|
4
|
+
private config;
|
|
5
|
+
private onMessage;
|
|
6
|
+
private toolHandler;
|
|
7
|
+
private setupDone;
|
|
8
|
+
constructor(config: MultimodalAdapterConfig);
|
|
9
|
+
connect(opts: {
|
|
10
|
+
toolHandler: (query: string) => Promise<string>;
|
|
11
|
+
onMessage: (msg: ServerMessage) => void;
|
|
12
|
+
}): Promise<void>;
|
|
13
|
+
send(msg: ClientMessage): void;
|
|
14
|
+
disconnect(): Promise<void>;
|
|
15
|
+
private emit;
|
|
16
|
+
private sendSetup;
|
|
17
|
+
private handleGeminiMessage;
|
|
18
|
+
private handleToolCall;
|
|
19
|
+
private sendRaw;
|
|
20
|
+
}
|
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
import WebSocket from "ws";
|
|
2
|
+
import { DEFAULT_VOICE_INSTRUCTIONS, } from "./adapter.js";
|
|
3
|
+
/** Wrap text in the ANSI "dim" escape sequence (SGR 2), followed by a reset. */
const dim = (s) => "\x1b[2m".concat(s, "\x1b[0m");
|
|
4
|
+
/**
|
|
5
|
+
* Downsample 24kHz PCM (Int16LE) to 16kHz by linear interpolation (2 of every 3 samples).
|
|
6
|
+
* Input: base64-encoded 24kHz Int16LE. Output: base64-encoded 16kHz Int16LE.
|
|
7
|
+
*/
|
|
8
|
+
function downsample24kTo16k(base64_24k) {
|
|
9
|
+
const binary = Buffer.from(base64_24k, "base64");
|
|
10
|
+
const samples24 = new Int16Array(binary.buffer, binary.byteOffset, binary.byteLength / 2);
|
|
11
|
+
const outLength = Math.floor(samples24.length * 2 / 3);
|
|
12
|
+
const samples16 = new Int16Array(outLength);
|
|
13
|
+
for (let i = 0; i < outLength; i++) {
|
|
14
|
+
// Map output index to fractional input index
|
|
15
|
+
const srcIdx = i * 1.5;
|
|
16
|
+
const lo = Math.floor(srcIdx);
|
|
17
|
+
const frac = srcIdx - lo;
|
|
18
|
+
const hi = Math.min(lo + 1, samples24.length - 1);
|
|
19
|
+
samples16[i] = Math.round(samples24[lo] * (1 - frac) + samples24[hi] * frac);
|
|
20
|
+
}
|
|
21
|
+
return Buffer.from(samples16.buffer).toString("base64");
|
|
22
|
+
}
|
|
23
|
+
/**
|
|
24
|
+
* Upsample 16kHz PCM (Int16LE) to 24kHz by linear interpolation.
|
|
25
|
+
* Input: base64-encoded 16kHz Int16LE. Output: base64-encoded 24kHz Int16LE.
|
|
26
|
+
*/
|
|
27
|
+
function upsample16kTo24k(base64_16k) {
|
|
28
|
+
const binary = Buffer.from(base64_16k, "base64");
|
|
29
|
+
const samples16 = new Int16Array(binary.buffer, binary.byteOffset, binary.byteLength / 2);
|
|
30
|
+
const outLength = Math.floor(samples16.length * 3 / 2);
|
|
31
|
+
const samples24 = new Int16Array(outLength);
|
|
32
|
+
for (let i = 0; i < outLength; i++) {
|
|
33
|
+
const srcIdx = i * (2 / 3);
|
|
34
|
+
const lo = Math.floor(srcIdx);
|
|
35
|
+
const frac = srcIdx - lo;
|
|
36
|
+
const hi = Math.min(lo + 1, samples16.length - 1);
|
|
37
|
+
samples24[i] = Math.round(samples16[lo] * (1 - frac) + samples16[hi] * frac);
|
|
38
|
+
}
|
|
39
|
+
return Buffer.from(samples24.buffer).toString("base64");
|
|
40
|
+
}
|
|
41
|
+
/**
 * MultimodalAdapter implementation that talks to the Gemini Live
 * (BidiGenerateContent) WebSocket API.
 *
 * Client audio is converted from 24kHz to 16kHz before it is sent; model
 * audio is delivered to the client as 24kHz. The single tool exposed to the
 * model is `run_agent`, whose calls are routed to the `toolHandler` given to
 * connect().
 */
export class GeminiLiveAdapter {
    /** Live socket to Gemini, or null when not connected. */
    ws = null;
    /** Adapter configuration (apiKey, optional model / voice / instructions). */
    config;
    /** Sink for every ServerMessage produced by this adapter. */
    onMessage = null;
    /** Runs a run_agent query and resolves with its textual result. */
    toolHandler = null;
    /** True once Gemini has acknowledged the setup message. */
    setupDone = false;
    constructor(config) {
        this.config = config;
    }
    /**
     * Open the WebSocket, send the session setup and resolve once Gemini
     * answers with `setupComplete`.
     *
     * Rejects when the socket errors OR closes before setup completes — the
     * latter covers servers that refuse the session by closing the connection
     * without an "error" event, which would otherwise leave the caller
     * waiting forever. Errors after setup are reported through onMessage.
     */
    async connect(opts) {
        this.onMessage = opts.onMessage;
        this.toolHandler = opts.toolHandler;
        this.setupDone = false;
        const model = this.config.model || "models/gemini-2.5-flash-native-audio-preview";
        // Encode the key so reserved characters cannot corrupt the query string.
        const url = `wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent?key=${encodeURIComponent(this.config.apiKey)}`;
        return new Promise((resolve, reject) => {
            const ws = new WebSocket(url);
            this.ws = ws;
            ws.on("open", () => {
                console.log(dim("[voice] Connected to Gemini Multimodal Live"));
                this.sendSetup(model);
            });
            ws.on("error", (err) => {
                console.error(dim(`[voice] Gemini WS error: ${err.message}`));
                if (!this.setupDone) {
                    reject(err);
                }
                else {
                    this.emit({ type: "error", message: err.message });
                }
            });
            ws.on("close", (code) => {
                console.log(dim("[voice] Gemini WS closed"));
                if (!this.setupDone) {
                    // No-op if the promise was already rejected by the error handler.
                    reject(new Error(`Gemini connection closed before setup completed (code ${code})`));
                }
            });
            ws.on("message", (data) => {
                try {
                    const msg = JSON.parse(data.toString());
                    this.handleGeminiMessage(msg);
                    // Resolve after setup acknowledgment
                    if (!this.setupDone && msg.setupComplete) {
                        this.setupDone = true;
                        console.log(dim("[voice] Gemini session ready"));
                        resolve();
                    }
                }
                catch (err) {
                    console.error(dim(`[voice] Gemini parse error: ${err.message}`));
                }
            });
        });
    }
    /**
     * Forward one client message to Gemini. Messages are dropped silently
     * while the socket is not open (see sendRaw).
     */
    send(msg) {
        switch (msg.type) {
            case "audio":
                // Browser sends 24kHz, Gemini expects 16kHz
                this.sendRaw({
                    realtimeInput: {
                        mediaChunks: [{
                                mimeType: "audio/pcm;rate=16000",
                                data: downsample24kTo16k(msg.audio),
                            }],
                    },
                });
                break;
            case "video_frame":
                // Gemini supports continuous video streaming natively
                this.sendRaw({
                    realtimeInput: {
                        mediaChunks: [{
                                mimeType: msg.mimeType,
                                data: msg.frame,
                            }],
                    },
                });
                break;
            case "text":
                this.sendRaw({
                    clientContent: {
                        turns: [{
                                role: "user",
                                parts: [{ text: msg.text }],
                            }],
                        turnComplete: true,
                    },
                });
                break;
            case "file": {
                const parts = [];
                if (msg.mimeType.startsWith("image/")) {
                    // Images go inline, followed by the caption (or a default label).
                    parts.push({ inlineData: { mimeType: msg.mimeType, data: msg.data } });
                    parts.push({ text: msg.text || `[User attached image: ${msg.name}]` });
                }
                else {
                    // Everything else is decoded as UTF-8 and embedded as fenced text.
                    const decoded = Buffer.from(msg.data, "base64").toString("utf-8");
                    const label = msg.text ? `${msg.text}\n\n` : "";
                    parts.push({ text: `${label}[File: ${msg.name}]\n\`\`\`\n${decoded}\n\`\`\`` });
                }
                this.sendRaw({
                    clientContent: {
                        turns: [{ role: "user", parts }],
                        turnComplete: true,
                    },
                });
                break;
            }
        }
    }
    /** Close the socket, if any, and forget it. */
    async disconnect() {
        if (this.ws) {
            this.ws.close();
            this.ws = null;
        }
    }
    /** Deliver a message to the registered onMessage callback, if any. */
    emit(msg) {
        this.onMessage?.(msg);
    }
    /**
     * Send the session setup: model, voice, the run_agent tool declaration,
     * system instructions and context-window compression settings.
     */
    sendSetup(model) {
        const instructions = this.config.instructions || DEFAULT_VOICE_INSTRUCTIONS;
        const voiceName = this.config.voice || "Aoede";
        this.sendRaw({
            setup: {
                model,
                generationConfig: {
                    // NOTE(review): the Live API documentation describes a single
                    // response modality per session — confirm that requesting
                    // both AUDIO and TEXT is accepted by the target model.
                    responseModalities: ["AUDIO", "TEXT"],
                    speechConfig: {
                        voiceConfig: {
                            prebuiltVoiceConfig: { voiceName },
                        },
                    },
                },
                tools: [{
                        functionDeclarations: [{
                                name: "run_agent",
                                description: "Execute any request through the gitclaw agent. It has full access to the terminal (can run any shell command, open apps, install packages), file system (read/write/create files), git operations, and persistent memory. Use this for ALL actionable requests. IMPORTANT: If the user uploaded a file, always include the file path (from the '[File saved to: ...]' annotation) in the query.",
                                parameters: {
                                    type: "OBJECT",
                                    properties: {
                                        query: {
                                            type: "STRING",
                                            description: "The user's request. MUST include file paths when referencing uploaded files (e.g. 'make a game using the image at workspace/lobster.png').",
                                        },
                                    },
                                    required: ["query"],
                                },
                            }],
                    }],
                systemInstruction: {
                    parts: [{ text: instructions }],
                },
                contextWindowCompression: {
                    triggerTokens: 25000,
                    slidingWindow: { targetTokens: 12500 },
                },
            },
        });
    }
    /**
     * Translate one parsed Gemini message into ServerMessages.
     *
     * - `toolCall` is delegated to handleToolCall.
     * - Audio parts become `audio_delta`.
     * - Text parts become partial transcripts while the turn is in progress;
     *   when the message is marked `turnComplete`, its text parts are joined
     *   and emitted exactly once as the final (non-partial) transcript.
     * - `inputTranscription` becomes a user transcript.
     */
    handleGeminiMessage(msg) {
        // Tool calls
        if (msg.toolCall) {
            this.handleToolCall(msg.toolCall);
            return;
        }
        // Server content (audio/text responses)
        if (msg.serverContent) {
            const sc = msg.serverContent;
            // Model turn parts
            if (sc.modelTurn?.parts) {
                const finalTexts = [];
                for (const part of sc.modelTurn.parts) {
                    if (part.inlineData) {
                        const mimeType = part.inlineData.mimeType || "";
                        if (mimeType.startsWith("audio/")) {
                            // The browser expects 24kHz. Honor the rate declared in
                            // the mimeType (e.g. "audio/pcm;rate=24000"): resampling
                            // audio that is already 24kHz would stretch it. Without
                            // a 24000 rate, keep treating the chunk as 16kHz.
                            const rate = /rate=(\d+)/.exec(mimeType);
                            const alreadyBrowserRate = rate !== null && Number(rate[1]) === 24000;
                            const audio24k = alreadyBrowserRate
                                ? part.inlineData.data
                                : upsample16kTo24k(part.inlineData.data);
                            this.emit({ type: "audio_delta", audio: audio24k });
                        }
                    }
                    if (part.text) {
                        if (sc.turnComplete) {
                            // Collected and emitted once below, so the final
                            // transcript is not duplicated.
                            finalTexts.push(part.text);
                        }
                        else {
                            this.emit({
                                type: "transcript",
                                role: "assistant",
                                text: part.text,
                                partial: true,
                            });
                        }
                    }
                }
                // Turn complete marker
                if (finalTexts.length > 0) {
                    this.emit({ type: "transcript", role: "assistant", text: finalTexts.join("") });
                }
            }
            // Input transcription
            if (sc.inputTranscription?.text) {
                console.log(dim(`[voice] User: ${sc.inputTranscription.text}`));
                this.emit({ type: "transcript", role: "user", text: sc.inputTranscription.text });
            }
        }
    }
    /**
     * Run every function call of a Gemini tool call through the tool handler,
     * one after another, and send all results back in a single toolResponse.
     * Unknown functions and calls without a `query` argument are answered
     * with an error entry. Handler failures are reported both to Gemini and,
     * as an `error` message, to the client.
     */
    async handleToolCall(toolCall) {
        if (!this.toolHandler)
            return;
        const functionCalls = toolCall.functionCalls || [];
        const responses = [];
        for (const fc of functionCalls) {
            if (fc.name !== "run_agent") {
                console.error(dim(`[voice] Unknown Gemini function call: ${fc.name}`));
                responses.push({ id: fc.id, name: fc.name, response: { error: `Unknown function: ${fc.name}` } });
                continue;
            }
            const queryArg = fc.args?.query;
            if (!queryArg) {
                responses.push({ id: fc.id, name: fc.name, response: { error: "Missing query argument" } });
                continue;
            }
            console.log(dim(`[voice] Agent query: ${queryArg}`));
            this.emit({ type: "agent_working", query: queryArg });
            try {
                const result = await this.toolHandler(queryArg);
                console.log(dim(`[voice] Agent response: ${result.slice(0, 200)}${result.length > 200 ? "..." : ""}`));
                // Gemini gets the full result; the client only a 500-char preview.
                responses.push({ id: fc.id, name: fc.name, response: { result } });
                this.emit({ type: "agent_done", result: result.slice(0, 500) });
            }
            catch (err) {
                console.error(dim(`[voice] Agent error: ${err.message}`));
                responses.push({ id: fc.id, name: fc.name, response: { error: err.message } });
                this.emit({ type: "error", message: err.message });
            }
        }
        this.sendRaw({
            toolResponse: { functionResponses: responses },
        });
    }
    /** JSON-encode and send a payload; silently dropped unless the socket is open. */
    sendRaw(msg) {
        if (this.ws && this.ws.readyState === WebSocket.OPEN) {
            this.ws.send(JSON.stringify(msg));
        }
    }
}
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
export type { VoiceAdapter, VoiceAdapterConfig, VoiceServerOptions, MultimodalAdapter, MultimodalAdapterConfig, AdapterBackend, ClientMessage, ServerMessage, } from "./adapter.js";
|
|
2
|
+
export { OpenAIRealtimeAdapter } from "./openai-realtime.js";
|
|
3
|
+
export { GeminiLiveAdapter } from "./gemini-live.js";
|
|
4
|
+
export { startVoiceServer } from "./server.js";
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
import { type MultimodalAdapter, type MultimodalAdapterConfig, type ClientMessage, type ServerMessage } from "./adapter.js";
|
|
2
|
+
/**
 * MultimodalAdapter for the "openai-realtime" backend.
 * Only the declaration is visible here: besides the MultimodalAdapter
 * methods it keeps the latest camera and screen frames and an `interrupted`
 * flag as private state.
 */
export declare class OpenAIRealtimeAdapter implements MultimodalAdapter {
|
|
3
|
+
private ws;
|
|
4
|
+
private config;
|
|
5
|
+
private latestVideoFrame;
|
|
6
|
+
private latestScreenFrame;
|
|
7
|
+
private onMessage;
|
|
8
|
+
private toolHandler;
|
|
9
|
+
private interrupted;
|
|
10
|
+
constructor(config: MultimodalAdapterConfig);
|
|
11
|
+
connect(opts: {
|
|
12
|
+
toolHandler: (query: string) => Promise<string>;
|
|
13
|
+
onMessage: (msg: ServerMessage) => void;
|
|
14
|
+
}): Promise<void>;
|
|
15
|
+
send(msg: ClientMessage): void;
|
|
16
|
+
disconnect(): Promise<void>;
|
|
17
|
+
private emit;
|
|
18
|
+
/**
|
|
19
|
+
* Inject the latest video frame as a conversation item so the model
|
|
20
|
+
* can see it when generating the next response (e.g. after a voice turn).
|
|
21
|
+
*/
|
|
22
|
+
private injectVideoFrame;
|
|
23
|
+
private sendSessionUpdate;
|
|
24
|
+
private handleEvent;
|
|
25
|
+
private handleFunctionCall;
|
|
26
|
+
private sendRaw;
|
|
27
|
+
}
|