discoclaw 1.2.3 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. package/.context/voice.md +30 -2
  2. package/.env.example +6 -0
  3. package/dist/cli/dashboard.js +7 -1
  4. package/dist/config.js +7 -0
  5. package/dist/cron/executor.js +72 -1
  6. package/dist/dashboard/api/metrics.js +7 -0
  7. package/dist/dashboard/api/metrics.test.js +16 -0
  8. package/dist/dashboard/api/traces.js +14 -0
  9. package/dist/dashboard/api/traces.test.js +40 -0
  10. package/dist/dashboard/page.js +187 -8
  11. package/dist/dashboard/server.js +81 -14
  12. package/dist/dashboard/server.test.js +120 -4
  13. package/dist/discord/deferred-runner.js +306 -219
  14. package/dist/discord/message-coordinator.js +1 -28
  15. package/dist/discord/reaction-handler.js +81 -3
  16. package/dist/index.js +15 -1
  17. package/dist/observability/trace-store.js +56 -0
  18. package/dist/observability/trace-utils.js +31 -0
  19. package/dist/runtime/codex-cli.js +3 -2
  20. package/dist/runtime/codex-cli.test.js +33 -0
  21. package/dist/runtime/model-tiers.js +1 -1
  22. package/dist/runtime/model-tiers.test.js +9 -0
  23. package/dist/runtime/openai-tool-schemas.js +17 -0
  24. package/dist/voice/audio-pipeline.js +246 -6
  25. package/dist/voice/audio-pipeline.test.js +481 -0
  26. package/dist/voice/audio-receiver.js +8 -0
  27. package/dist/voice/audio-receiver.test.js +16 -0
  28. package/dist/voice/conversation-buffer.js +16 -6
  29. package/dist/voice/providers/gemini-live-provider.js +481 -0
  30. package/dist/voice/providers/gemini-live-provider.test.js +834 -0
  31. package/dist/voice/providers/gemini-live-responder.js +267 -0
  32. package/dist/voice/providers/gemini-live-responder.test.js +615 -0
  33. package/dist/voice/providers/gemini-live-token-estimator.js +100 -0
  34. package/dist/voice/providers/gemini-live-token-estimator.test.js +160 -0
  35. package/dist/voice/providers/gemini-live-types.js +32 -0
  36. package/dist/voice/providers/gemini-tool-mapper.js +91 -0
  37. package/dist/voice/providers/gemini-tool-mapper.test.js +253 -0
  38. package/dist/voice/providers/index.js +3 -0
  39. package/dist/voice/types.test.js +6 -0
  40. package/dist/voice/voice-prompt-builder.js +26 -17
  41. package/dist/voice/voice-prompt-builder.test.js +16 -1
  42. package/package.json +1 -1
  43. package/templates/instructions/SYSTEM_DEFAULTS.md +8 -0
@@ -0,0 +1,481 @@
1
+ /**
2
+ * GeminiLiveProvider — bidirectional WebSocket session wrapper for the
3
+ * Gemini Multimodal Live API.
4
+ *
5
+ * Phase 1.1: standalone session management with a clean interface.
6
+ * Phase 1.2 (GeminiLiveResponder) and Phase 1.3 (pipeline integration)
7
+ * will consume this class.
8
+ *
9
+ * The provider manages:
10
+ * - WebSocket connection lifecycle (connect / disconnect)
11
+ * - Session setup (model, generation config, system instruction)
12
+ * - Sending audio input (PCM → base64-encoded chunks)
13
+ * - Receiving server events (audio output, text, turn completion, errors)
14
+ * - Reconnection with exponential backoff
15
+ *
16
+ * Audio format: Gemini Live expects 16 kHz mono PCM s16le input and
17
+ * returns 24 kHz mono PCM s16le output (configurable via responseModalities).
18
+ */
19
+ import WebSocket from 'ws';
20
+ import { DEFAULT_GEMINI_LIVE_MODEL, supportsGeminiLiveIncrementalClientContent, } from './gemini-live-types.js';
21
+ import { GeminiLiveTokenEstimator } from './gemini-live-token-estimator.js';
22
+ // ---------------------------------------------------------------------------
23
+ // Constants
24
+ // ---------------------------------------------------------------------------
25
+ const GEMINI_LIVE_WS_BASE = 'wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent'; // WebSocket endpoint; buildUrl() appends the API key as the `key` query parameter
26
+ const MAX_RETRIES = 3; // reconnect attempts allowed after an unexpected close before the provider stops and emits reconnect_failed
27
+ const BASE_BACKOFF_MS = 500; // first reconnect delay; doubled on each further attempt (500, 1000, 2000 ms)
28
+ /** Default session rotation threshold in milliseconds — 13 minutes (Gemini sessions cap at ~15 min). Used when opts.sessionRotationMs is not supplied. */
29
+ const DEFAULT_SESSION_ROTATION_MS = 780_000;
30
+ /** Session resumption handles remain valid for roughly 2 hours after termination; older handles are discarded and a fresh session is started. */
31
+ const RESUME_HANDLE_TTL_MS = 2 * 60 * 60 * 1000;
32
+ // ---------------------------------------------------------------------------
33
+ // Provider
34
+ // ---------------------------------------------------------------------------
35
/**
 * Stateful client for a single Gemini Live (BidiGenerateContent) WebSocket
 * session.
 *
 * Lifecycle: `idle` → `connecting` → `setup` → `open` → `stopped`.
 * After an unexpected close (or a deliberate session rotation) the provider
 * reconnects on its own with exponential backoff, reusing the most recent
 * session-resumption handle when it is still fresh.
 *
 * Server traffic is surfaced through a single listener registered with
 * {@link GeminiLiveProvider#onEvent}. A listener that throws is logged and
 * otherwise ignored so it cannot wedge the connection state machine.
 */
export class GeminiLiveProvider {
    apiKey;
    model;
    log;
    systemInstruction;
    responseModalities;
    voiceName;
    tools;
    wsFactory;
    sessionRotationMs;
    initialHistoryInClientContent;
    /** The socket of the current connection attempt/session, or null when disconnected. */
    ws = null;
    _state = 'idle';
    /** Consecutive reconnect attempts since the last successful setup. */
    retryCount = 0;
    resumeHandle = null;
    resumeHandleUpdatedAt = 0;
    listener = null;
    sessionStartedAt = 0;
    rotationTimer = null;
    /** Pending backoff timer for the next reconnect attempt, or null when none is scheduled. */
    reconnectTimer = null;
    tokenEstimator;
    /** Tool call IDs dispatched in the current session but not yet responded to. */
    inflightToolCalls = new Set();

    /**
     * @param {object} opts
     * @param {string} opts.apiKey - API key appended to the WebSocket URL.
     * @param {string} [opts.model] - Model id; defaults to DEFAULT_GEMINI_LIVE_MODEL.
     * @param {object} opts.log - Logger exposing info/warn/error(obj?, msg).
     * @param {string} [opts.systemInstruction] - System prompt sent during setup.
     * @param {string[]} [opts.responseModalities] - Defaults to ['AUDIO'].
     * @param {string} [opts.voiceName] - Prebuilt voice name for speech output.
     * @param {object} [opts.tools] - Tool declaration object sent during setup.
     * @param {(url: string) => object} [opts.wsFactory] - Socket factory (injectable for tests).
     * @param {number} [opts.sessionRotationMs] - Rotation interval; a falsy value disables rotation.
     * @param {*} [opts.tokenBudget] - Passed through to the token estimator.
     * @param {boolean} [opts.initialHistoryInClientContent] - Enables history backfill via clientContent.
     */
    constructor(opts) {
        this.apiKey = opts.apiKey;
        this.model = opts.model ?? DEFAULT_GEMINI_LIVE_MODEL;
        this.log = opts.log;
        this.systemInstruction = opts.systemInstruction;
        this.responseModalities = opts.responseModalities ?? ['AUDIO'];
        this.voiceName = opts.voiceName;
        this.tools = opts.tools;
        this.wsFactory = opts.wsFactory ?? ((url) => new WebSocket(url));
        this.sessionRotationMs = opts.sessionRotationMs ?? DEFAULT_SESSION_ROTATION_MS;
        this.tokenEstimator = new GeminiLiveTokenEstimator(opts.tokenBudget);
        this.initialHistoryInClientContent = opts.initialHistoryInClientContent ?? false;
    }

    /** Current connection state. */
    get state() {
        return this._state;
    }

    /** Register a listener for server events. Only one listener at a time. */
    onEvent(callback) {
        this.listener = callback;
    }

    /**
     * Connect to the Gemini Live API and perform session setup.
     * Resolves once the server acknowledges setup; rejects if the socket
     * closes first. A no-op while a session is already open or being opened.
     */
    async connect() {
        if (this._state === 'open' || this._state === 'connecting' || this._state === 'setup') {
            return;
        }
        // A leftover backoff timer must not open a second socket next to this one.
        this.cancelReconnectTimer();
        this._state = 'connecting';
        this.retryCount = 0;
        await this.doConnect();
    }

    /**
     * Send a chunk of PCM audio to the session.
     * The buffer is base64-encoded and sent as a realtimeInput message.
     * @param {Buffer} pcm
     * @throws {Error} when the session is not open.
     */
    sendAudio(pcm) {
        this.requireOpen('sendAudio');
        this.tokenEstimator.addInputAudio(pcm.length);
        this.sendJson({
            realtimeInput: {
                audio: {
                    mimeType: 'audio/pcm;rate=16000',
                    data: pcm.toString('base64'),
                },
            },
        });
        // Checked after the write: crossing the compress threshold closes the
        // socket, and the chunk that crossed it should still be delivered.
        this.checkTokenThreshold();
    }

    /**
     * Signal that the current realtime audio stream has ended so Gemini can flush buffered input.
     * @throws {Error} when the session is not open.
     */
    sendAudioStreamEnd() {
        this.requireOpen('sendAudioStreamEnd');
        this.sendJson({
            realtimeInput: {
                audioStreamEnd: true,
            },
        });
    }

    /**
     * Send a text message to the session. Uses a complete clientContent turn
     * when the model supports incremental client content, otherwise a
     * realtimeInput text message.
     * @param {string} text
     * @throws {Error} when the session is not open.
     */
    sendText(text) {
        this.requireOpen('sendText');
        this.tokenEstimator.addText(text);
        if (supportsGeminiLiveIncrementalClientContent(this.model)) {
            this.sendJson({
                clientContent: {
                    turns: [{ role: 'user', parts: [{ text }] }],
                    turnComplete: true,
                },
            });
        }
        else {
            this.sendJson({
                realtimeInput: {
                    text,
                },
            });
        }
        this.checkTokenThreshold();
    }

    /**
     * Seed the session with prior conversation turns before realtime audio starts.
     * For Gemini 3.1, this is the supported path for initial history backfill.
     * @param {Array<{role: string, parts: Array<{text?: string}>}>} turns
     * @throws {Error} when the session is not open.
     */
    sendInitialHistory(turns) {
        this.requireOpen('sendInitialHistory');
        if (turns.length === 0) {
            return;
        }
        for (const turn of turns) {
            for (const part of turn.parts ?? []) {
                // Only text parts contribute to the text-token estimate.
                if (typeof part.text === 'string') {
                    this.tokenEstimator.addText(part.text);
                }
            }
        }
        this.sendJson({
            clientContent: {
                turns,
                turnComplete: false,
            },
        });
        this.checkTokenThreshold();
    }

    /**
     * Send tool execution results back to the session.
     * Each response is matched to its original function call by `id`.
     * Responses for IDs that are no longer in-flight (e.g. from a previous
     * session after rotation) are dropped with a warning.
     * @param {Array<{id: string, name: string, output: *, scheduling?: string}>} responses
     * @throws {Error} when the session is not open.
     */
    sendToolResponse(responses) {
        this.requireOpen('sendToolResponse');
        const functionResponses = [];
        for (const r of responses) {
            if (!this.inflightToolCalls.has(r.id)) {
                this.log.warn({ id: r.id }, 'Gemini Live: dropping stale tool response (not in-flight)');
                continue;
            }
            this.inflightToolCalls.delete(r.id);
            this.tokenEstimator.addToolResponse(r.output);
            functionResponses.push({
                id: r.id,
                name: r.name,
                response: {
                    result: r.output,
                    ...(r.scheduling ? { scheduling: r.scheduling } : {}),
                },
            });
        }
        if (functionResponses.length === 0) {
            return;
        }
        this.sendJson({ toolResponse: { functionResponses } });
        this.checkTokenThreshold();
    }

    /**
     * Disconnect the session and release resources (timers, in-flight tool
     * bookkeeping, the socket). Safe to call while a connection attempt is
     * still in progress; the pending connect() then rejects.
     */
    async disconnect() {
        if (this._state === 'stopped' || this._state === 'idle') {
            return;
        }
        this._state = 'stopped';
        this.cancelRotationTimer();
        this.cancelReconnectTimer();
        this.inflightToolCalls.clear();
        this.tokenEstimator.reset();
        const ws = this.ws;
        // Detach first so any event the socket fires from here on is treated as stale.
        this.ws = null;
        // A socket that is still CONNECTING must be closed too, otherwise it
        // finishes its handshake later and lingers as an orphaned session.
        if (ws && (ws.readyState === WebSocket.OPEN || ws.readyState === WebSocket.CONNECTING)) {
            ws.close(1000, 'client disconnect');
        }
    }

    /** Number of tool calls currently in-flight (dispatched but not yet responded). */
    get inflightToolCallCount() {
        return this.inflightToolCalls.size;
    }

    // -----------------------------------------------------------------------
    // Internal
    // -----------------------------------------------------------------------

    /** Throw the standard "not open" error for the named public operation. */
    requireOpen(operation) {
        if (this._state !== 'open') {
            throw new Error(`Cannot ${operation} before connect() completes or after disconnect()`);
        }
    }

    /** Serialize a client message and write it to the current socket. */
    sendJson(payload) {
        this.ws.send(JSON.stringify(payload));
    }

    /** Build the WebSocket URL, carrying the API key as the `key` query parameter. */
    buildUrl() {
        return `${GEMINI_LIVE_WS_BASE}?key=${encodeURIComponent(this.apiKey)}`;
    }

    /**
     * Build the `setup` message sent right after the socket opens.
     * Side effect: drops an expired resume handle.
     * @returns {{setup: object}}
     */
    buildSetupMessage() {
        const generationConfig = {
            responseModalities: this.responseModalities,
        };
        if (this.voiceName) {
            generationConfig.speechConfig = {
                voiceConfig: { prebuiltVoiceConfig: { voiceName: this.voiceName } },
            };
        }
        const setup = {
            model: `models/${this.model}`,
            generationConfig,
            contextWindowCompression: { slidingWindow: {} },
            realtimeInputConfig: {
                activityHandling: 'START_OF_ACTIVITY_INTERRUPTS',
            },
            inputAudioTranscription: {},
            outputAudioTranscription: {},
        };
        if (this.systemInstruction) {
            setup.systemInstruction = {
                parts: [{ text: this.systemInstruction }],
            };
        }
        if (this.tools) {
            setup.tools = [this.tools];
        }
        if (this.initialHistoryInClientContent) {
            setup.historyConfig = {
                initialHistoryInClientContent: true,
            };
        }
        // The server only emits sessionResumptionUpdate messages when the setup
        // carries a sessionResumption config, so it is always sent — empty for
        // a fresh session, with the handle when resuming.
        const sessionResumption = {};
        if (this.resumeHandle) {
            const age = Date.now() - this.resumeHandleUpdatedAt;
            if (age < RESUME_HANDLE_TTL_MS) {
                sessionResumption.handle = this.resumeHandle;
            }
            else {
                this.log.warn({ ageMs: age, ttlMs: RESUME_HANDLE_TTL_MS }, 'Gemini Live resume handle expired — starting fresh session');
                this.resumeHandle = null;
            }
        }
        setup.sessionResumption = sessionResumption;
        return { setup };
    }

    /**
     * Open one socket and wire its handlers.
     * @returns {Promise<void>} resolves on setupComplete, rejects if the
     *   socket closes before setup finishes or after disconnect().
     */
    doConnect() {
        return new Promise((resolve, reject) => {
            const ws = this.wsFactory(this.buildUrl());
            this.ws = ws;
            // Events from a socket that was replaced, or that outlived
            // disconnect(), must not touch the provider's state.
            const isCurrent = () => this.ws === ws && this._state !== 'stopped';
            ws.on('open', () => {
                if (!isCurrent()) {
                    ws.close(1000, 'client disconnect');
                    return;
                }
                this._state = 'setup';
                this.log.info({ model: this.model }, 'Gemini Live WebSocket connected, sending setup');
                ws.send(JSON.stringify(this.buildSetupMessage()));
            });
            ws.on('message', (data) => {
                if (!isCurrent()) {
                    return;
                }
                this.handleMessage(data, resolve);
            });
            ws.on('error', (err) => {
                if (!isCurrent()) {
                    return;
                }
                this.log.error({ err: err.message }, 'Gemini Live WebSocket error');
            });
            ws.on('close', (code, reason) => {
                if (this._state === 'stopped') {
                    reject(new Error('Gemini Live WebSocket closed: disconnect() called'));
                    return;
                }
                if (this.ws !== ws) {
                    // Late close of an old socket; the live connection is unaffected.
                    reject(new Error('Gemini Live WebSocket closed: superseded by a newer connection'));
                    return;
                }
                if (this._state === 'connecting' || this._state === 'setup') {
                    reject(new Error(`Gemini Live WebSocket closed during connect: code=${code} reason=${String(reason ?? '')}`));
                    return;
                }
                this.handleUnexpectedClose();
            });
        });
    }

    /**
     * Parse one server message and dispatch it by its top-level field.
     * @param {*} data - Raw WebSocket payload.
     * @param {() => void} [onSetupComplete] - Invoked once setup is acknowledged.
     */
    handleMessage(data, onSetupComplete) {
        let parsed;
        try {
            parsed = JSON.parse(String(data));
        }
        catch (err) {
            this.log.error({ err }, 'Failed to parse Gemini Live message');
            return;
        }
        if (parsed == null || typeof parsed !== 'object') {
            this.log.error({ err: new TypeError('Gemini Live message is not a JSON object') }, 'Failed to parse Gemini Live message');
            return;
        }
        try {
            if (parsed.setupComplete != null) {
                this.handleSetupComplete(onSetupComplete);
                return;
            }
            if (parsed.serverContent != null) {
                this.handleServerContent(parsed.serverContent);
                return;
            }
            if (parsed.toolCall != null) {
                this.handleToolCall(parsed.toolCall);
                return;
            }
            if (parsed.error != null) {
                this.handleServerError(parsed.error);
                return;
            }
            if (parsed.sessionResumptionUpdate != null) {
                this.handleSessionResumptionUpdate(parsed.sessionResumptionUpdate);
                return;
            }
            // Unknown message shape — log for debugging
            this.log.warn({ keys: Object.keys(parsed).join(',') }, 'Gemini Live: unrecognized message');
        }
        catch (err) {
            this.log.error({ err }, 'Failed to handle Gemini Live message');
        }
    }

    /** Setup acknowledged: enter `open`, reset per-session bookkeeping, notify. */
    handleSetupComplete(onSetupComplete) {
        const attempt = this.retryCount;
        this._state = 'open';
        this.retryCount = 0;
        this.sessionStartedAt = Date.now();
        this.tokenEstimator.reset();
        this.inflightToolCalls.clear();
        this.scheduleRotation();
        this.log.info('Gemini Live session setup complete');
        if (attempt > 0) {
            this.emit({ type: 'reconnected', attempt });
        }
        this.emit({ type: 'setup_complete' });
        onSetupComplete?.();
    }

    /** Translate a serverContent message (audio, text, transcripts, turn signals) into events. */
    handleServerContent(sc) {
        if (sc.interrupted === true) {
            this.emit({ type: 'interrupted' });
        }
        // Input audio transcription (server-side STT of user speech)
        const inputTranscription = this.extractTranscriptionText(sc.inputTranscription);
        if (inputTranscription) {
            this.emit({ type: 'input_transcript', text: inputTranscription });
        }
        // Output transcription mirrors audio-only model replies without requiring TEXT modality.
        // Do not count these as text tokens; the audio output is already accounted separately.
        const outputTranscription = this.extractTranscriptionText(sc.outputTranscription);
        if (outputTranscription) {
            this.emit({ type: 'text', text: outputTranscription });
        }
        const parts = sc.modelTurn?.parts;
        if (Array.isArray(parts)) {
            for (const part of parts) {
                const encoded = part.inlineData?.data;
                if (encoded) {
                    const buf = Buffer.from(encoded, 'base64');
                    this.tokenEstimator.addOutputAudio(buf.length);
                    this.emit({ type: 'audio', data: buf });
                }
                if (typeof part.text === 'string') {
                    this.tokenEstimator.addText(part.text);
                    this.emit({ type: 'text', text: part.text });
                }
            }
            this.checkTokenThreshold();
        }
        // Emitted last so content carried in the same message reaches the
        // listener before the turn is reported as finished.
        if (sc.turnComplete === true) {
            this.emit({ type: 'turn_complete' });
        }
    }

    /** Record the requested function calls as in-flight and forward them to the listener. */
    handleToolCall(tc) {
        if (!Array.isArray(tc.functionCalls) || tc.functionCalls.length === 0) {
            return;
        }
        const calls = tc.functionCalls.map((fc) => ({
            id: String(fc.id ?? ''),
            name: String(fc.name ?? ''),
            args: fc.args ?? {},
        }));
        for (const call of calls) {
            this.inflightToolCalls.add(call.id);
            this.tokenEstimator.addToolCall(call.name, call.args);
        }
        this.log.info({ count: calls.length, names: calls.map((c) => c.name).join(','), inflight: this.inflightToolCalls.size }, 'Gemini Live tool call received');
        this.emit({ type: 'tool_call', functionCalls: calls });
    }

    /** Log a server-reported error and surface it as an `error` event. */
    handleServerError(err) {
        const errMsg = err.message ?? `code ${err.code ?? 'unknown'}`;
        this.log.error({ error: err }, 'Gemini Live server error');
        this.emit({ type: 'error', error: errMsg });
    }

    /** Remember the newest resume handle (and when it arrived) for the next reconnect. */
    handleSessionResumptionUpdate(update) {
        if (update.newHandle) {
            this.resumeHandle = update.newHandle;
            this.resumeHandleUpdatedAt = Date.now();
            this.log.info('Gemini Live session resume handle updated');
        }
    }

    /**
     * React to the session socket closing without disconnect(): schedule a
     * reconnect with exponential backoff, or give up after MAX_RETRIES and
     * move to `stopped`.
     */
    handleUnexpectedClose() {
        // Nothing to recover after a deliberate disconnect or a final give-up.
        if (this._state === 'stopped') {
            return;
        }
        this.cancelRotationTimer();
        this.cancelReconnectTimer();
        this.inflightToolCalls.clear();
        if (this.retryCount >= MAX_RETRIES) {
            this.log.error({ retries: this.retryCount }, 'Gemini Live exhausted reconnect retries');
            this._state = 'stopped';
            this.emit({ type: 'reconnect_failed', attempts: this.retryCount });
            this.emit({ type: 'fallback_recommended', reason: 'exhausted reconnect retries' });
            this.emit({ type: 'error', error: 'exhausted reconnect retries' });
            return;
        }
        this.retryCount++;
        const delay = BASE_BACKOFF_MS * 2 ** (this.retryCount - 1);
        this.log.warn({ attempt: this.retryCount, maxRetries: MAX_RETRIES, delayMs: delay }, 'Gemini Live reconnecting after unexpected close');
        this.emit({
            type: 'reconnecting',
            attempt: this.retryCount,
            maxRetries: MAX_RETRIES,
            hasResumeHandle: this.resumeHandle != null && (Date.now() - this.resumeHandleUpdatedAt) < RESUME_HANDLE_TTL_MS,
        });
        this.reconnectTimer = setTimeout(() => {
            this.reconnectTimer = null;
            if (this._state === 'stopped') {
                return;
            }
            this._state = 'connecting';
            const attempt = this.doConnect();
            // doConnect() installs its socket synchronously; remember it so a
            // failure of a superseded attempt is not mistaken for a live one.
            const attemptSocket = this.ws;
            attempt.catch((err) => {
                if (this._state === 'stopped' || this.ws !== attemptSocket) {
                    return;
                }
                this.log.error({ err }, 'Gemini Live reconnect failed');
                this.handleUnexpectedClose();
            });
        }, delay);
    }

    /** (Re)arm the session rotation timer; a falsy sessionRotationMs disables rotation. */
    scheduleRotation() {
        this.cancelRotationTimer();
        if (!this.sessionRotationMs) {
            return;
        }
        this.rotationTimer = setTimeout(() => {
            this.rotationTimer = null;
            this.initiateGracefulReconnect();
        }, this.sessionRotationMs);
    }

    /** Clear the rotation timer if one is pending. */
    cancelRotationTimer() {
        if (this.rotationTimer != null) {
            clearTimeout(this.rotationTimer);
            this.rotationTimer = null;
        }
    }

    /** Clear the reconnect backoff timer if one is pending. */
    cancelReconnectTimer() {
        if (this.reconnectTimer != null) {
            clearTimeout(this.reconnectTimer);
            this.reconnectTimer = null;
        }
    }

    /**
     * Rotate the session: announce it, then close the socket so the
     * close handler reconnects (with the resume handle when available).
     */
    initiateGracefulReconnect() {
        if (this._state !== 'open') {
            return;
        }
        const sessionAgeMs = Date.now() - this.sessionStartedAt;
        this.log.info({ sessionAgeMs }, 'Gemini Live session rotation — initiating graceful reconnect');
        this.emit({ type: 'session_rotating', sessionAgeMs });
        // Close the WebSocket; handleUnexpectedClose will reconnect with the resume handle.
        if (this.ws && this.ws.readyState === WebSocket.OPEN) {
            this.ws.close(1000, 'session rotation');
        }
    }

    /** Check token thresholds and emit warnings when newly crossed. */
    checkTokenThreshold() {
        const threshold = this.tokenEstimator.checkThreshold();
        if (!threshold) {
            return;
        }
        const est = this.tokenEstimator.estimate;
        this.log.warn({ threshold, estimatedTokens: est.total, text: est.textTokens, audio: est.audioTokens, tool: est.toolTokens }, 'Gemini Live token threshold crossed');
        this.emit({ type: 'token_warning', estimatedTokens: est.total, threshold });
        // At the compress threshold, proactively trigger session rotation
        // to prevent server-side sliding window from silently dropping context.
        if (threshold === 'compress' && this._state === 'open') {
            this.log.info({ estimatedTokens: est.total }, 'Gemini Live: compress threshold reached — initiating proactive rotation');
            this.initiateGracefulReconnect();
        }
    }

    /**
     * Deliver an event to the registered listener, if any. Listener
     * exceptions are logged and swallowed so consumer bugs cannot break
     * connection handling (or leave connect() pending forever).
     */
    emit(event) {
        if (!this.listener) {
            return;
        }
        try {
            this.listener(event);
        }
        catch (err) {
            this.log.error({ err, eventType: event.type }, 'Gemini Live event listener threw');
        }
    }

    /**
     * Pull the non-empty `text` string out of a transcription object.
     * @returns {string | null} the text, or null when absent/empty/not a string.
     */
    extractTranscriptionText(value) {
        if (value == null || typeof value !== 'object') {
            return null;
        }
        const text = value.text;
        return typeof text === 'string' && text !== '' ? text : null;
    }
}