@open-gitagent/voice 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +52 -0
- package/dist/composio/adapter.d.ts +26 -0
- package/dist/composio/adapter.js +92 -0
- package/dist/composio/client.d.ts +39 -0
- package/dist/composio/client.js +170 -0
- package/dist/composio/index.d.ts +2 -0
- package/dist/composio/index.js +2 -0
- package/dist/gemini-live.d.ts +20 -0
- package/dist/gemini-live.js +279 -0
- package/dist/index.d.ts +4 -0
- package/dist/index.js +3 -0
- package/dist/openai-realtime.d.ts +40 -0
- package/dist/openai-realtime.js +460 -0
- package/dist/server.d.ts +18 -0
- package/dist/server.js +3250 -0
- package/dist/ui.html +3859 -0
- package/package.json +57 -0
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
import WebSocket from "ws";
|
|
2
|
+
import { DEFAULT_VOICE_INSTRUCTIONS, } from "@open-gitagent/gitagent";
|
|
3
|
+
const dim = (s) => `\x1b[2m${s}\x1b[0m`;
|
|
4
|
+
/**
 * Downsample 24 kHz PCM to 16 kHz by linear interpolation.
 *
 * Output sample `i` is taken from fractional input position `i * 1.5`, so every
 * three input samples produce two output samples. Samples are read and written
 * explicitly as little-endian 16-bit integers, which keeps the result
 * independent of host byte order and of how the decoded Buffer happens to be
 * aligned inside its backing ArrayBuffer.
 *
 * No low-pass filtering is applied and each call is resampled on its own
 * (no state is carried between chunks).
 *
 * @param base64_24k Base64-encoded 24 kHz Int16LE samples. A trailing odd byte
 *   (half a sample) is ignored.
 * @returns Base64-encoded 16 kHz Int16LE samples; "" when the input holds
 *   fewer than two samples.
 */
function downsample24kTo16k(base64_24k) {
    const input = Buffer.from(base64_24k, "base64");
    // Only whole 16-bit samples are usable.
    const inCount = Math.floor(input.length / 2);
    const outCount = Math.floor(inCount * 2 / 3);
    const output = Buffer.alloc(outCount * 2);
    for (let i = 0; i < outCount; i++) {
        // Map output index to fractional input index
        const srcIdx = i * 1.5;
        const lo = Math.floor(srcIdx);
        const frac = srcIdx - lo;
        // Clamp the upper neighbour so the last sample never reads past the end.
        const hi = Math.min(lo + 1, inCount - 1);
        const a = input.readInt16LE(lo * 2);
        const b = input.readInt16LE(hi * 2);
        output.writeInt16LE(Math.round(a * (1 - frac) + b * frac), i * 2);
    }
    return output.toString("base64");
}
|
|
23
|
+
/**
 * Upsample 16 kHz PCM to 24 kHz by linear interpolation.
 *
 * Output sample `i` is taken from fractional input position `i * 2 / 3`, so
 * every two input samples produce three output samples. The position is
 * computed as `(i * 2) / 3` so that on-grid samples (every third output)
 * land on an exact integer index instead of a value a hair below it.
 * Samples are read and written explicitly as little-endian 16-bit integers,
 * independent of host byte order and Buffer alignment.
 *
 * Each call is resampled on its own (no state is carried between chunks).
 *
 * @param base64_16k Base64-encoded 16 kHz Int16LE samples. A trailing odd byte
 *   (half a sample) is ignored.
 * @returns Base64-encoded 24 kHz Int16LE samples; "" for empty input.
 */
function upsample16kTo24k(base64_16k) {
    const input = Buffer.from(base64_16k, "base64");
    // Only whole 16-bit samples are usable.
    const inCount = Math.floor(input.length / 2);
    const outCount = Math.floor(inCount * 3 / 2);
    const output = Buffer.alloc(outCount * 2);
    for (let i = 0; i < outCount; i++) {
        const srcIdx = (i * 2) / 3;
        const lo = Math.floor(srcIdx);
        const frac = srcIdx - lo;
        // Clamp the upper neighbour so the tail repeats the final input sample.
        const hi = Math.min(lo + 1, inCount - 1);
        const a = input.readInt16LE(lo * 2);
        const b = input.readInt16LE(hi * 2);
        output.writeInt16LE(Math.round(a * (1 - frac) + b * frac), i * 2);
    }
    return output.toString("base64");
}
|
|
41
|
+
/**
 * Adapter between the local client message protocol and the Gemini Live
 * (BidiGenerateContent) WebSocket API.
 *
 * Client -> Gemini: `send()` translates "audio", "video_frame", "text" and
 * "file" messages into `realtimeInput` / `clientContent` frames.
 * Gemini -> client: server frames are translated into "audio_delta",
 * "transcript", "agent_working", "agent_done" and "error" messages delivered
 * through the `onMessage` callback given to `connect()`.
 *
 * The model is offered a single tool, `run_agent`, whose calls are forwarded
 * to the `toolHandler` callback given to `connect()`.
 */
export class GeminiLiveAdapter {
    /** Active WebSocket, or null when disconnected. */
    ws = null;
    /** Adapter configuration; `apiKey`, `model`, `instructions` and `voice` are read. */
    config;
    /** Callback receiving every message emitted to the client. */
    onMessage = null;
    /** Callback that executes a `run_agent` query and resolves to its result. */
    toolHandler = null;
    /** True once the server has acknowledged the setup frame. */
    setupDone = false;
    /** Assistant speech transcription accumulated for the turn in progress. */
    outputTranscript = "";
    constructor(config) {
        this.config = config;
    }
    /**
     * Open the WebSocket, send the setup frame and wait for the server's
     * `setupComplete` acknowledgment.
     *
     * @param opts `onMessage` receives translated server messages;
     *   `toolHandler` runs `run_agent` queries.
     * @returns A promise that resolves once the session is ready. It rejects
     *   if the socket errors or closes before setup completes, so callers are
     *   never left waiting on a connection the server refused.
     */
    async connect(opts) {
        this.onMessage = opts.onMessage;
        this.toolHandler = opts.toolHandler;
        this.setupDone = false;
        this.outputTranscript = "";
        const model = this.config.model || "models/gemini-2.5-flash-native-audio-preview";
        // The key travels in the query string, so it must be URL-escaped.
        const url = `wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent?key=${encodeURIComponent(this.config.apiKey)}`;
        return new Promise((resolve, reject) => {
            const socket = new WebSocket(url);
            this.ws = socket;
            socket.on("open", () => {
                console.log(dim("[voice] Connected to Gemini Multimodal Live"));
                this.sendSetup(model);
            });
            socket.on("error", (err) => {
                console.error(dim(`[voice] Gemini WS error: ${err.message}`));
                if (!this.setupDone) {
                    reject(err);
                }
                else {
                    this.emit({ type: "error", message: err.message });
                }
            });
            socket.on("close", (code, reason) => {
                console.log(dim("[voice] Gemini WS closed"));
                // A close during the handshake (e.g. the server rejecting the
                // setup frame) may arrive without an "error" event; settle the
                // promise here. Rejecting an already-settled promise is a no-op.
                if (!this.setupDone) {
                    const detail = reason && reason.length > 0 ? `: ${reason.toString()}` : "";
                    reject(new Error(`Gemini WS closed before setup completed (code ${code})${detail}`));
                }
            });
            socket.on("message", (data) => {
                try {
                    const msg = JSON.parse(data.toString());
                    this.handleGeminiMessage(msg);
                    // Resolve after setup acknowledgment
                    if (!this.setupDone && msg.setupComplete) {
                        this.setupDone = true;
                        console.log(dim("[voice] Gemini session ready"));
                        resolve();
                    }
                }
                catch (err) {
                    console.error(dim(`[voice] Gemini parse error: ${err.message}`));
                }
            });
        });
    }
    /**
     * Translate one client message into the matching Gemini frame and send it.
     * Message types other than "audio", "video_frame", "text" and "file" are
     * ignored.
     */
    send(msg) {
        switch (msg.type) {
            case "audio":
                // Browser sends 24kHz, Gemini expects 16kHz
                this.sendRaw({
                    realtimeInput: {
                        mediaChunks: [{
                                mimeType: "audio/pcm;rate=16000",
                                data: downsample24kTo16k(msg.audio),
                            }],
                    },
                });
                break;
            case "video_frame":
                // Gemini supports continuous video streaming natively
                this.sendRaw({
                    realtimeInput: {
                        mediaChunks: [{
                                mimeType: msg.mimeType,
                                data: msg.frame,
                            }],
                    },
                });
                break;
            case "text":
                this.sendRaw({
                    clientContent: {
                        turns: [{
                                role: "user",
                                parts: [{ text: msg.text }],
                            }],
                        turnComplete: true,
                    },
                });
                break;
            case "file": {
                const parts = [];
                if (msg.mimeType.startsWith("image/")) {
                    // Images go inline, followed by the user's text or a placeholder caption.
                    parts.push({ inlineData: { mimeType: msg.mimeType, data: msg.data } });
                    parts.push({ text: msg.text || `[User attached image: ${msg.name}]` });
                }
                else {
                    // Anything else is decoded as UTF-8 and embedded as a fenced text block.
                    const decoded = Buffer.from(msg.data, "base64").toString("utf-8");
                    const label = msg.text ? `${msg.text}\n\n` : "";
                    parts.push({ text: `${label}[File: ${msg.name}]\n\`\`\`\n${decoded}\n\`\`\`` });
                }
                this.sendRaw({
                    clientContent: {
                        turns: [{ role: "user", parts }],
                        turnComplete: true,
                    },
                });
                break;
            }
        }
    }
    /** Close the WebSocket, if any, and forget it. */
    async disconnect() {
        if (this.ws) {
            this.ws.close();
            this.ws = null;
        }
    }
    /** Deliver a message to the client callback, if one is registered. */
    emit(msg) {
        this.onMessage?.(msg);
    }
    /**
     * Send the session setup frame: model, voice, the `run_agent` tool, system
     * instructions, transcription and context-window compression settings.
     */
    sendSetup(model) {
        const instructions = this.config.instructions || DEFAULT_VOICE_INSTRUCTIONS;
        const voiceName = this.config.voice || "Aoede";
        this.sendRaw({
            setup: {
                model,
                generationConfig: {
                    // The Live API accepts a single response modality per
                    // session; asking for both AUDIO and TEXT is rejected.
                    // Assistant text is obtained via outputAudioTranscription.
                    responseModalities: ["AUDIO"],
                    speechConfig: {
                        voiceConfig: {
                            prebuiltVoiceConfig: { voiceName },
                        },
                    },
                },
                tools: [{
                        functionDeclarations: [{
                                name: "run_agent",
                                description: "Execute any request through the gitagent agent. It has full access to the terminal (can run any shell command, open apps, install packages), file system (read/write/create files), git operations, and persistent memory. Use this for ALL actionable requests. IMPORTANT: If the user uploaded a file, always include the file path (from the '[File saved to: ...]' annotation) in the query.",
                                parameters: {
                                    type: "OBJECT",
                                    properties: {
                                        query: {
                                            type: "STRING",
                                            description: "The user's request. MUST include file paths when referencing uploaded files (e.g. 'make a game using the image at workspace/lobster.png').",
                                        },
                                    },
                                    required: ["query"],
                                },
                            }],
                    }],
                systemInstruction: {
                    parts: [{ text: instructions }],
                },
                // Required for serverContent.inputTranscription / outputTranscription
                // to be delivered at all.
                inputAudioTranscription: {},
                outputAudioTranscription: {},
                contextWindowCompression: {
                    triggerTokens: 25000,
                    slidingWindow: { targetTokens: 12500 },
                },
            },
        });
    }
    /**
     * Dispatch one parsed server frame: tool calls go to `handleToolCall`,
     * `serverContent` is translated into audio and transcript messages.
     */
    handleGeminiMessage(msg) {
        // Tool calls
        if (msg.toolCall) {
            // handleToolCall is async; make sure a failure can never surface
            // as an unhandled promise rejection.
            this.handleToolCall(msg.toolCall).catch((err) => {
                const message = err?.message ?? String(err);
                console.error(dim(`[voice] Agent error: ${message}`));
                this.emit({ type: "error", message });
            });
            return;
        }
        // Server content (audio/text responses)
        const sc = msg.serverContent;
        if (!sc) {
            return;
        }
        const parts = sc.modelTurn?.parts ?? [];
        for (const part of parts) {
            const inline = part.inlineData;
            if (inline?.data && (inline.mimeType || "").startsWith("audio/")) {
                this.emit({ type: "audio_delta", audio: this.toClientAudio(inline) });
            }
            // Text arriving together with turnComplete is reported once, as the
            // final transcript below, instead of once per part and once joined.
            if (part.text && !sc.turnComplete) {
                this.emit({
                    type: "transcript",
                    role: "assistant",
                    text: part.text,
                    partial: true,
                });
            }
        }
        // Transcription of the assistant's spoken audio, delivered in fragments.
        // NOTE(review): fragments are forwarded as partials and the joined text
        // as the final transcript — confirm this matches what the UI expects.
        if (sc.outputTranscription?.text) {
            this.outputTranscript += sc.outputTranscription.text;
            this.emit({
                type: "transcript",
                role: "assistant",
                text: sc.outputTranscription.text,
                partial: true,
            });
        }
        // Turn complete marker
        if (sc.turnComplete) {
            const finalText = parts.filter((p) => p.text).map((p) => p.text).join("");
            if (finalText.length > 0) {
                this.emit({ type: "transcript", role: "assistant", text: finalText });
            }
        }
        if (sc.turnComplete || sc.interrupted) {
            this.flushOutputTranscript();
        }
        // Input transcription
        if (sc.inputTranscription?.text) {
            console.log(dim(`[voice] User: ${sc.inputTranscription.text}`));
            this.emit({ type: "transcript", role: "user", text: sc.inputTranscription.text });
        }
    }
    /**
     * Return model audio as base64 PCM at the 24 kHz the client plays back.
     *
     * The sample rate is taken from the chunk's MIME type (`...;rate=N`).
     * Audio declared as 16 kHz — or carrying no rate, the adapter's historical
     * assumption — is upsampled; any other declared rate (the Live API
     * documents 24 kHz output) is passed through untouched so it is not
     * stretched a second time.
     */
    toClientAudio(inlineData) {
        const declared = /rate=(\d+)/i.exec(inlineData.mimeType || "");
        const rate = declared ? Number(declared[1]) : 16000;
        return rate === 16000 ? upsample16kTo24k(inlineData.data) : inlineData.data;
    }
    /** Emit the accumulated assistant transcription as a final transcript and reset it. */
    flushOutputTranscript() {
        if (this.outputTranscript.length > 0) {
            this.emit({ type: "transcript", role: "assistant", text: this.outputTranscript });
        }
        this.outputTranscript = "";
    }
    /**
     * Run every function call in a tool-call frame and send one combined
     * `toolResponse`. Unknown functions and calls without a `query` argument
     * are answered with an error entry; handler failures are reported both to
     * the model and to the client. Does nothing when no tool handler is set.
     */
    async handleToolCall(toolCall) {
        if (!this.toolHandler)
            return;
        const functionCalls = toolCall.functionCalls || [];
        const responses = [];
        for (const fc of functionCalls) {
            if (fc.name !== "run_agent") {
                console.error(dim(`[voice] Unknown Gemini function call: ${fc.name}`));
                responses.push({ id: fc.id, name: fc.name, response: { error: `Unknown function: ${fc.name}` } });
                continue;
            }
            const queryArg = fc.args?.query;
            if (!queryArg) {
                responses.push({ id: fc.id, name: fc.name, response: { error: "Missing query argument" } });
                continue;
            }
            console.log(dim(`[voice] Agent query: ${queryArg}`));
            this.emit({ type: "agent_working", query: queryArg });
            try {
                const result = await this.toolHandler(queryArg);
                // Logs are capped at 200 characters, the client notification at 500;
                // the model receives the full result.
                console.log(dim(`[voice] Agent response: ${result.slice(0, 200)}${result.length > 200 ? "..." : ""}`));
                responses.push({ id: fc.id, name: fc.name, response: { result } });
                this.emit({ type: "agent_done", result: result.slice(0, 500) });
            }
            catch (err) {
                console.error(dim(`[voice] Agent error: ${err.message}`));
                responses.push({ id: fc.id, name: fc.name, response: { error: err.message } });
                this.emit({ type: "error", message: err.message });
            }
        }
        this.sendRaw({
            toolResponse: { functionResponses: responses },
        });
    }
    /** JSON-encode and send a frame; silently dropped unless the socket is open. */
    sendRaw(msg) {
        if (this.ws && this.ws.readyState === WebSocket.OPEN) {
            this.ws.send(JSON.stringify(msg));
        }
    }
}
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
export type { VoiceAdapter, VoiceAdapterConfig, VoiceServerOptions, MultimodalAdapter, MultimodalAdapterConfig, AdapterBackend, ClientMessage, ServerMessage, } from "@open-gitagent/gitagent";
|
|
2
|
+
export { OpenAIRealtimeAdapter } from "./openai-realtime.js";
|
|
3
|
+
export { GeminiLiveAdapter } from "./gemini-live.js";
|
|
4
|
+
export { startVoiceServer } from "./server.js";
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
import { type MultimodalAdapter, type MultimodalAdapterConfig, type ClientMessage, type ServerMessage } from "@open-gitagent/gitagent";
|
|
2
|
+
/**
 * Type declaration for the OpenAI Realtime adapter, an implementation of the
 * `MultimodalAdapter` contract. Only `connect`, `send` and `disconnect` (plus
 * the constructor) are public; everything else is private implementation
 * detail whose types are intentionally not exposed here.
 */
export declare class OpenAIRealtimeAdapter implements MultimodalAdapter {
    // ---- private static configuration ----
    private static readonly REFRESH_AFTER_MS;

    // ---- private instance state ----
    private ws;
    private config;
    private latestVideoFrame;
    private latestScreenFrame;
    private onMessage;
    private toolHandler;
    private interrupted;
    private refreshTimer;
    private refreshing;
    private disposed;

    // ---- public API ----
    constructor(config: MultimodalAdapterConfig);
    /**
     * Start a session.
     * `onMessage` receives server messages; `toolHandler` maps a query string
     * to a promised string result.
     */
    connect(opts: {
        onMessage: (msg: ServerMessage) => void;
        toolHandler: (query: string) => Promise<string>;
    }): Promise<void>;
    /** Forward one client message to the session. */
    send(msg: ClientMessage): void;
    disconnect(): Promise<void>;

    // ---- private helpers ----
    private connectWs;
    /**
     * Sends session.update over an explicitly supplied ws instance, for use
     * before this.ws has been assigned.
     */
    private sendSessionUpdateOn;
    private sendSessionUpdate;
    /**
     * Closes and reopens the Realtime WS ahead of (or immediately after) the
     * 60-minute hard cap OpenAI places on a session, then re-sends the stored
     * session.update so the agent continues where it left off without the
     * user noticing.
     */
    private refreshSession;
    /**
     * Adds the most recent video frame to the conversation as an item so the
     * model can see it when producing its next response (for instance after a
     * voice turn).
     */
    private injectVideoFrame;
    private handleEvent;
    private handleFunctionCall;
    private emit;
    private sendRaw;
}
|