stelo 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +184 -0
- package/README.md +853 -0
- package/dist/accessibility.d.ts +227 -0
- package/dist/accessibility.d.ts.map +1 -0
- package/dist/accessibility.js +602 -0
- package/dist/accessibility.js.map +1 -0
- package/dist/agent.d.ts +870 -0
- package/dist/agent.d.ts.map +1 -0
- package/dist/agent.js +1107 -0
- package/dist/agent.js.map +1 -0
- package/dist/audio-stream.d.ts +114 -0
- package/dist/audio-stream.d.ts.map +1 -0
- package/dist/audio-stream.js +167 -0
- package/dist/audio-stream.js.map +1 -0
- package/dist/clipboard.d.ts +99 -0
- package/dist/clipboard.d.ts.map +1 -0
- package/dist/clipboard.js +352 -0
- package/dist/clipboard.js.map +1 -0
- package/dist/config.d.ts +183 -0
- package/dist/config.d.ts.map +1 -0
- package/dist/config.js +477 -0
- package/dist/config.js.map +1 -0
- package/dist/context.d.ts +213 -0
- package/dist/context.d.ts.map +1 -0
- package/dist/context.js +387 -0
- package/dist/context.js.map +1 -0
- package/dist/cortex.d.ts +548 -0
- package/dist/cortex.d.ts.map +1 -0
- package/dist/cortex.js +1479 -0
- package/dist/cortex.js.map +1 -0
- package/dist/errors.d.ts +133 -0
- package/dist/errors.d.ts.map +1 -0
- package/dist/errors.js +278 -0
- package/dist/errors.js.map +1 -0
- package/dist/events.d.ts +227 -0
- package/dist/events.d.ts.map +1 -0
- package/dist/events.js +429 -0
- package/dist/events.js.map +1 -0
- package/dist/executor.d.ts +212 -0
- package/dist/executor.d.ts.map +1 -0
- package/dist/executor.js +545 -0
- package/dist/executor.js.map +1 -0
- package/dist/index.d.ts +69 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +167 -0
- package/dist/index.js.map +1 -0
- package/dist/integration.d.ts +159 -0
- package/dist/integration.d.ts.map +1 -0
- package/dist/integration.js +533 -0
- package/dist/integration.js.map +1 -0
- package/dist/keyboard.d.ts +276 -0
- package/dist/keyboard.d.ts.map +1 -0
- package/dist/keyboard.js +404 -0
- package/dist/keyboard.js.map +1 -0
- package/dist/logger.d.ts +198 -0
- package/dist/logger.d.ts.map +1 -0
- package/dist/logger.js +516 -0
- package/dist/logger.js.map +1 -0
- package/dist/middleware.d.ts +183 -0
- package/dist/middleware.d.ts.map +1 -0
- package/dist/middleware.js +493 -0
- package/dist/middleware.js.map +1 -0
- package/dist/monitor.d.ts +136 -0
- package/dist/monitor.d.ts.map +1 -0
- package/dist/monitor.js +341 -0
- package/dist/monitor.js.map +1 -0
- package/dist/mouse.d.ts +290 -0
- package/dist/mouse.d.ts.map +1 -0
- package/dist/mouse.js +466 -0
- package/dist/mouse.js.map +1 -0
- package/dist/plugin.d.ts +157 -0
- package/dist/plugin.d.ts.map +1 -0
- package/dist/plugin.js +409 -0
- package/dist/plugin.js.map +1 -0
- package/dist/process.d.ts +106 -0
- package/dist/process.d.ts.map +1 -0
- package/dist/process.js +326 -0
- package/dist/process.js.map +1 -0
- package/dist/recorder.d.ts +100 -0
- package/dist/recorder.d.ts.map +1 -0
- package/dist/recorder.js +258 -0
- package/dist/recorder.js.map +1 -0
- package/dist/safety.d.ts +59 -0
- package/dist/safety.d.ts.map +1 -0
- package/dist/safety.js +98 -0
- package/dist/safety.js.map +1 -0
- package/dist/scheduler.d.ts +152 -0
- package/dist/scheduler.d.ts.map +1 -0
- package/dist/scheduler.js +615 -0
- package/dist/scheduler.js.map +1 -0
- package/dist/screen.d.ts +96 -0
- package/dist/screen.d.ts.map +1 -0
- package/dist/screen.js +154 -0
- package/dist/screen.js.map +1 -0
- package/dist/session.d.ts +209 -0
- package/dist/session.d.ts.map +1 -0
- package/dist/session.js +479 -0
- package/dist/session.js.map +1 -0
- package/dist/stream.d.ts +168 -0
- package/dist/stream.d.ts.map +1 -0
- package/dist/stream.js +298 -0
- package/dist/stream.js.map +1 -0
- package/dist/telemetry.d.ts +223 -0
- package/dist/telemetry.d.ts.map +1 -0
- package/dist/telemetry.js +433 -0
- package/dist/telemetry.js.map +1 -0
- package/dist/types.d.ts +165 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +8 -0
- package/dist/types.js.map +1 -0
- package/dist/utils/bezier.d.ts +51 -0
- package/dist/utils/bezier.d.ts.map +1 -0
- package/dist/utils/bezier.js +117 -0
- package/dist/utils/bezier.js.map +1 -0
- package/dist/utils/helpers.d.ts +90 -0
- package/dist/utils/helpers.d.ts.map +1 -0
- package/dist/utils/helpers.js +143 -0
- package/dist/utils/helpers.js.map +1 -0
- package/dist/utils/index.d.ts +4 -0
- package/dist/utils/index.d.ts.map +1 -0
- package/dist/utils/index.js +18 -0
- package/dist/utils/index.js.map +1 -0
- package/dist/validation.d.ts +254 -0
- package/dist/validation.d.ts.map +1 -0
- package/dist/validation.js +478 -0
- package/dist/validation.js.map +1 -0
- package/dist/vision.d.ts +719 -0
- package/dist/vision.d.ts.map +1 -0
- package/dist/vision.js +1197 -0
- package/dist/vision.js.map +1 -0
- package/dist/window.d.ts +80 -0
- package/dist/window.d.ts.map +1 -0
- package/dist/window.js +170 -0
- package/dist/window.js.map +1 -0
- package/dist/workflow.d.ts +224 -0
- package/dist/workflow.d.ts.map +1 -0
- package/dist/workflow.js +578 -0
- package/dist/workflow.js.map +1 -0
- package/index.d.ts +840 -0
- package/index.js +495 -0
- package/package.json +91 -0
package/dist/cortex.js
ADDED
|
@@ -0,0 +1,1479 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// ============================================================================
|
|
3
|
+
// Cortex — Real-Time Agent Nervous System
|
|
4
|
+
// ============================================================================
|
|
5
|
+
//
|
|
6
|
+
// ██████ ██████ ██████ ████████ ███████ ██ ██
|
|
7
|
+
// ██ ██ ██ ██ ██ ██ ██ ██ ██
|
|
8
|
+
// ██ ██ ██ ██████ ██ █████ ███
|
|
9
|
+
// ██ ██ ██ ██ ██ ██ ██ ██ ██
|
|
10
|
+
// ██████ ██████ ██ ██ ██ ███████ ██ ██
|
|
11
|
+
//
|
|
12
|
+
// The missing layer between real-time AI models and the physical computer.
|
|
13
|
+
// Cortex gives a model eyes (screen), ears (audio), hands (mouse/keyboard),
|
|
14
|
+
// and a voice — all flowing bidirectionally in real time.
|
|
15
|
+
//
|
|
16
|
+
// Provider-agnostic: works with Gemini Live, OpenAI Realtime, xAI, or any
|
|
17
|
+
// WebSocket-based model. The Cortex doesn't care what model is behind it.
|
|
18
|
+
// It just bridges senses ↔ intelligence ↔ actions.
|
|
19
|
+
//
|
|
20
|
+
// ============================================================================
|
|
21
|
+
// CommonJS module surface. The __esModule marker lets ES-module importers
// (TypeScript/Babel interop) treat the named exports correctly.
Object.defineProperty(exports, "__esModule", { value: true });
// Placeholder; the Cortex class is assigned to exports later in this file.
exports.Cortex = void 0;
// Provider factories and the convenience constructor are hoisted function
// declarations defined further down in this file.
exports.geminiLiveProvider = geminiLiveProvider;
exports.openAIRealtimeProvider = openAIRealtimeProvider;
exports.customProvider = customProvider;
exports.createCortex = createCortex;
// Native OS bindings (mouse/keyboard/screen/audio/OCR/window/clipboard).
// NOTE(review): resolved relative to dist/, i.e. the package-root index.js.
const native = require('../index.js');
|
|
28
|
+
class Cortex {
    // Connection config passed to connect(); null before first connect.
    config = null;
    // Provider transport handle; null while disconnected.
    transport = null;
    // Conversation state machine. Values observed in this file:
    // 'disconnected' | 'idle' | 'listening' | 'interrupted' (others may be
    // set by handlers outside this view).
    _state = 'disconnected';
    // Who holds the conversational floor ('none' | 'user' | ... — set via setTurn).
    _turn = 'none';
    // True between a successful connect() and disconnect().
    _connected = false;
    // Event listeners: event name -> array of handler functions.
    listeners = new Map();
    // Tool registry: tool name -> { definition, handler }.
    tools = new Map();
    // In-flight tool calls keyed by call id (cleared on disconnect).
    pendingToolCalls = new Map();
    // Sensory feeds: interval handles + active flags for screen/audio streaming.
    screenFeedInterval = null;
    audioFeedInterval = null;
    _screenFeedActive = false;
    _audioFeedActive = false;
    // Idle tracking: timer handle and last-activity timestamp (touchActivity).
    idleTimer = null;
    lastActivityTime = 0;
    // Session: resume token reported by the provider, and cumulative token usage.
    _resumeToken = null;
    _totalTokensUsed = 0;
    // Accumulation buffers for streamed model text / user transcripts.
    modelTextBuffer = '';
    userTranscriptBuffer = '';
|
|
53
|
+
// ═══ CONNECTION ═══════════════════════════════════════════════════════
|
|
54
|
+
/**
|
|
55
|
+
* Connect to a real-time AI model through a provider.
|
|
56
|
+
*
|
|
57
|
+
* @example
|
|
58
|
+
* ```typescript
|
|
59
|
+
* const cortex = new Cortex();
|
|
60
|
+
* await cortex.connect({
|
|
61
|
+
* provider: geminiLiveProvider(),
|
|
62
|
+
* connection: {
|
|
63
|
+
* apiKey: process.env.GEMINI_API_KEY!,
|
|
64
|
+
* model: 'gemini-3.1-flash-live-preview',
|
|
65
|
+
* systemInstruction: 'You are a desktop assistant. Use tools to interact with the computer.',
|
|
66
|
+
* responseModalities: ['audio'],
|
|
67
|
+
* voice: 'Kore',
|
|
68
|
+
* },
|
|
69
|
+
* screenFeed: { fps: 1, quality: 60, scale: 0.5 },
|
|
70
|
+
* audioFeed: { source: 'system' },
|
|
71
|
+
* autoRegisterSteloTools: true,
|
|
72
|
+
* });
|
|
73
|
+
* ```
|
|
74
|
+
*/
|
|
75
|
+
async connect(config) {
|
|
76
|
+
if (this._connected) {
|
|
77
|
+
await this.disconnect();
|
|
78
|
+
}
|
|
79
|
+
this.config = config;
|
|
80
|
+
// Merge custom tools into connection config
|
|
81
|
+
const allToolDefs = [...(config.connection.tools ?? [])];
|
|
82
|
+
for (const tool of config.tools ?? []) {
|
|
83
|
+
this.registerTool(tool.name, tool.handler, tool);
|
|
84
|
+
allToolDefs.push({ name: tool.name, description: tool.description, parameters: tool.parameters, nonBlocking: tool.nonBlocking });
|
|
85
|
+
}
|
|
86
|
+
// Auto-register Stelo tools
|
|
87
|
+
if (config.autoRegisterSteloTools) {
|
|
88
|
+
this.registerSteloTools();
|
|
89
|
+
for (const [, entry] of this.tools) {
|
|
90
|
+
allToolDefs.push(entry.definition);
|
|
91
|
+
}
|
|
92
|
+
}
|
|
93
|
+
const connectionConfig = { ...config.connection, tools: allToolDefs };
|
|
94
|
+
// Build transport event handlers
|
|
95
|
+
const handlers = {
|
|
96
|
+
audio: (data, sampleRate) => this.handleModelAudio(data, sampleRate),
|
|
97
|
+
text: (text, isFinal) => this.handleModelText(text, isFinal),
|
|
98
|
+
transcript: (text, source, isFinal) => this.handleTranscript(text, source, isFinal),
|
|
99
|
+
toolCall: (id, name, args) => this.handleToolCall(id, name, args),
|
|
100
|
+
turnComplete: () => this.handleTurnComplete(),
|
|
101
|
+
generationComplete: () => this.handleGenerationComplete(),
|
|
102
|
+
interrupted: () => this.handleInterrupted(),
|
|
103
|
+
thought: (text) => this.emit('thought', text),
|
|
104
|
+
usage: (total, input, output) => {
|
|
105
|
+
this._totalTokensUsed = total;
|
|
106
|
+
this.emit('usage', total, input, output);
|
|
107
|
+
},
|
|
108
|
+
error: (err) => this.emit('error', err),
|
|
109
|
+
close: (reason) => this.handleDisconnect(reason),
|
|
110
|
+
resumeToken: (token) => {
|
|
111
|
+
this._resumeToken = token;
|
|
112
|
+
this.emit('resumeToken', token);
|
|
113
|
+
},
|
|
114
|
+
};
|
|
115
|
+
// Connect
|
|
116
|
+
this.transport = await config.provider.connect(connectionConfig, handlers);
|
|
117
|
+
this._connected = true;
|
|
118
|
+
this.setState('idle');
|
|
119
|
+
this.setTurn('none');
|
|
120
|
+
this.touchActivity();
|
|
121
|
+
this.emit('connected');
|
|
122
|
+
// Auto-start sensory feeds
|
|
123
|
+
if (config.screenFeed) {
|
|
124
|
+
const screenConfig = config.screenFeed === true ? {} : config.screenFeed;
|
|
125
|
+
this.startScreenFeed(screenConfig);
|
|
126
|
+
}
|
|
127
|
+
if (config.audioFeed) {
|
|
128
|
+
const audioConfig = config.audioFeed === true ? { source: 'system' } : config.audioFeed;
|
|
129
|
+
this.startAudioFeed(audioConfig);
|
|
130
|
+
}
|
|
131
|
+
// Idle timeout
|
|
132
|
+
if (config.idleTimeoutMs && config.idleTimeoutMs > 0) {
|
|
133
|
+
this.startIdleTimer(config.idleTimeoutMs);
|
|
134
|
+
}
|
|
135
|
+
}
|
|
136
|
+
/**
|
|
137
|
+
* Disconnect from the model and stop all feeds.
|
|
138
|
+
*/
|
|
139
|
+
async disconnect() {
|
|
140
|
+
this.stopScreenFeed();
|
|
141
|
+
this.stopAudioFeed();
|
|
142
|
+
this.stopIdleTimer();
|
|
143
|
+
if (this.transport) {
|
|
144
|
+
await this.transport.close();
|
|
145
|
+
this.transport = null;
|
|
146
|
+
}
|
|
147
|
+
this._connected = false;
|
|
148
|
+
this.setState('disconnected');
|
|
149
|
+
this.setTurn('none');
|
|
150
|
+
this.pendingToolCalls.clear();
|
|
151
|
+
this.emit('disconnected');
|
|
152
|
+
}
|
|
153
|
+
// ═══ SEND TO MODEL ════════════════════════════════════════════════════
|
|
154
|
+
/**
 * Say something to the model (text).
 *
 * Sends a user-role text turn, moves the state machine to 'listening',
 * and hands the turn to the user.
 *
 * @param text Message to send as the user.
 *
 * @example
 * ```typescript
 * cortex.say("Click the Save button");
 * cortex.say("What do you see on the screen?");
 * ```
 */
say(text) {
    this.ensureConnected();
    this.transport.sendText(text, 'user');
    this.setState('listening');
    this.setTurn('user');
    this.touchActivity();
}
/**
 * Send raw audio to the model (PCM).
 *
 * @param pcm Raw PCM audio payload (format expected by the transport —
 *   presumably 16-bit mono given the default rate; confirm per provider).
 * @param sampleRate Sample rate of `pcm` in Hz (default 16000).
 */
sendAudio(pcm, sampleRate = 16000) {
    this.ensureConnected();
    this.transport.sendAudio(pcm, sampleRate);
    this.touchActivity();
}
|
|
178
|
+
/**
|
|
179
|
+
* Send a screenshot to the model right now.
|
|
180
|
+
*
|
|
181
|
+
* @example
|
|
182
|
+
* ```typescript
|
|
183
|
+
* cortex.sendScreenshot(); // sends current screen as JPEG
|
|
184
|
+
* ```
|
|
185
|
+
*/
|
|
186
|
+
sendScreenshot(region) {
|
|
187
|
+
this.ensureConnected();
|
|
188
|
+
const jpeg = region
|
|
189
|
+
? native.screenCapturePng(region.x, region.y, region.width, region.height)
|
|
190
|
+
: native.screenCapturePng();
|
|
191
|
+
this.transport.sendVideo(jpeg);
|
|
192
|
+
this.touchActivity();
|
|
193
|
+
}
|
|
194
|
+
/**
 * Send a system instruction mid-session.
 *
 * @param text Instruction text, delivered with the 'system' role.
 */
instruct(text) {
    this.ensureConnected();
    this.transport.sendText(text, 'system');
    this.touchActivity();
}
/**
 * Interrupt the model's current output.
 * Stops any audio/text generation in progress and moves the state
 * machine to 'interrupted'.
 *
 * @example
 * ```typescript
 * cortex.on('modelAudio', () => {
 *   if (userPressedStop) cortex.interrupt();
 * });
 * ```
 */
interrupt() {
    this.ensureConnected();
    this.transport.interrupt();
    this.setState('interrupted');
    this.touchActivity();
}
/**
 * Signal that the user started speaking (manual VAD).
 * Also moves the state machine to 'listening' and gives the user the turn.
 */
activityStart() {
    this.ensureConnected();
    this.transport.sendActivityStart();
    this.setState('listening');
    this.setTurn('user');
    this.touchActivity();
}
/**
 * Signal that the user stopped speaking (manual VAD).
 * State/turn are left unchanged; the model's response events drive them.
 */
activityEnd() {
    this.ensureConnected();
    this.transport.sendActivityEnd();
    this.touchActivity();
}
|
|
237
|
+
// ═══ SENSORY FEEDS ════════════════════════════════════════════════════
|
|
238
|
+
// Auto-feed screen/audio to the model at a configurable rate.
|
|
239
|
+
/**
|
|
240
|
+
* Start streaming screen frames to the model.
|
|
241
|
+
* The model "sees" the screen in real-time.
|
|
242
|
+
*
|
|
243
|
+
* @example
|
|
244
|
+
* ```typescript
|
|
245
|
+
* cortex.startScreenFeed({ fps: 1, quality: 60 });
|
|
246
|
+
* // Model now receives a screenshot every second
|
|
247
|
+
* ```
|
|
248
|
+
*/
|
|
249
|
+
startScreenFeed(config) {
|
|
250
|
+
this.ensureConnected();
|
|
251
|
+
if (this._screenFeedActive)
|
|
252
|
+
this.stopScreenFeed();
|
|
253
|
+
const fps = Math.min(config?.fps ?? 1, 2); // Models usually cap at 1fps
|
|
254
|
+
const quality = config?.quality ?? 60;
|
|
255
|
+
const scale = config?.scale ?? 0.5;
|
|
256
|
+
const region = config?.region;
|
|
257
|
+
const intervalMs = Math.floor(1000 / fps);
|
|
258
|
+
this._screenFeedActive = true;
|
|
259
|
+
this.screenFeedInterval = setInterval(() => {
|
|
260
|
+
if (!this._connected || !this.transport)
|
|
261
|
+
return;
|
|
262
|
+
try {
|
|
263
|
+
let jpeg;
|
|
264
|
+
if (region) {
|
|
265
|
+
// Capture region, scale it
|
|
266
|
+
const cap = native.screenCapture(region.x, region.y, region.width, region.height);
|
|
267
|
+
const sw = Math.floor(cap.width * scale);
|
|
268
|
+
const sh = Math.floor(cap.height * scale);
|
|
269
|
+
jpeg = native.screenCapturePng(region.x, region.y, sw, sh);
|
|
270
|
+
}
|
|
271
|
+
else {
|
|
272
|
+
jpeg = native.screenCapturePng();
|
|
273
|
+
}
|
|
274
|
+
this.transport.sendVideo(jpeg);
|
|
275
|
+
this.emit('screenFrameSent', jpeg.length);
|
|
276
|
+
}
|
|
277
|
+
catch { /* non-fatal */ }
|
|
278
|
+
}, intervalMs);
|
|
279
|
+
}
|
|
280
|
+
/**
|
|
281
|
+
* Stop the automatic screen feed.
|
|
282
|
+
*/
|
|
283
|
+
stopScreenFeed() {
|
|
284
|
+
if (this.screenFeedInterval) {
|
|
285
|
+
clearInterval(this.screenFeedInterval);
|
|
286
|
+
this.screenFeedInterval = null;
|
|
287
|
+
}
|
|
288
|
+
this._screenFeedActive = false;
|
|
289
|
+
}
|
|
290
|
+
/**
 * Start streaming audio to the model.
 * Uses WASAPI loopback (system audio) or microphone.
 *
 * @param config Optional: `source` ('system' default; 'mic' is not yet
 *   implemented — see note at the bottom), `targetSampleRate` (default
 *   16000 Hz).
 *
 * @example
 * ```typescript
 * cortex.startAudioFeed({ source: 'system', targetSampleRate: 16000 });
 * // Model now hears everything playing on the desktop
 * ```
 */
startAudioFeed(config) {
    this.ensureConnected();
    if (this._audioFeedActive)
        this.stopAudioFeed();
    const source = config?.source ?? 'system';
    const targetRate = config?.targetSampleRate ?? 16000;
    const pollMs = 100; // Feed audio every 100ms
    // NOTE(review): the flag is set even for unimplemented sources (e.g.
    // 'mic'), in which case nothing actually streams — confirm intended.
    this._audioFeedActive = true;
    if (source === 'system') {
        // Use Stelo audioStream for system audio capture
        try {
            native.audioStreamStart({ bufferSize: 30 });
        }
        catch { /* already started */ }
        this.audioFeedInterval = setInterval(() => {
            if (!this._connected || !this.transport)
                return;
            try {
                // Guarded: older native builds may not expose audioStreamDrain.
                const chunks = native.audioStreamDrain ? native.audioStreamDrain() : [];
                for (const chunk of chunks) {
                    // Convert float32 to int16 PCM for model.
                    // NOTE(review): float32ToInt16 is a module-level helper defined
                    // elsewhere in this file; assumes it also resamples from the
                    // chunk's rate to targetRate — confirm its implementation.
                    const pcm = float32ToInt16(chunk.data, chunk.sampleRate ?? 48000, targetRate);
                    this.transport.sendAudio(pcm, targetRate);
                    // Emit the chunk duration in milliseconds.
                    this.emit('audioChunkSent', (chunk.frames / (chunk.sampleRate ?? 48000)) * 1000);
                }
            }
            catch { /* non-fatal */ }
        }, pollMs);
    }
    // 'mic' source would require a separate mic capture implementation
    // which could be added as a native module extension
}
|
|
332
|
+
/**
 * Stop the automatic audio feed.
 *
 * Clears the polling interval and stops the native audio stream.
 * NOTE(review): startAudioFeed tolerates an already-running native stream
 * ("already started"), so stopping here may tear down a stream the caller
 * started independently — confirm this sharing is intended.
 */
stopAudioFeed() {
    if (this.audioFeedInterval) {
        clearInterval(this.audioFeedInterval);
        this.audioFeedInterval = null;
    }
    if (this._audioFeedActive) {
        try {
            native.audioStreamStop();
        }
        catch { /* ok */ }
    }
    this._audioFeedActive = false;
}
|
|
348
|
+
// ═══ TOOL REGISTRY ════════════════════════════════════════════════════
|
|
349
|
+
// Register tools that the model can call. When the model issues a tool_call,
|
|
350
|
+
// Cortex automatically routes it to the handler and sends the result back.
|
|
351
|
+
/**
|
|
352
|
+
* Register a tool the model can call.
|
|
353
|
+
*
|
|
354
|
+
* @example
|
|
355
|
+
* ```typescript
|
|
356
|
+
* cortex.registerTool('get_weather', async (city: string) => {
|
|
357
|
+
* return { temp: 72, condition: 'sunny' };
|
|
358
|
+
* }, {
|
|
359
|
+
* name: 'get_weather',
|
|
360
|
+
* description: 'Get current weather for a city',
|
|
361
|
+
* parameters: { city: { type: 'string', description: 'City name', required: true } },
|
|
362
|
+
* });
|
|
363
|
+
* ```
|
|
364
|
+
*/
|
|
365
|
+
registerTool(name, handler, definition) {
|
|
366
|
+
this.tools.set(name, {
|
|
367
|
+
definition: {
|
|
368
|
+
name,
|
|
369
|
+
description: definition?.description ?? name,
|
|
370
|
+
parameters: definition?.parameters,
|
|
371
|
+
nonBlocking: definition?.nonBlocking,
|
|
372
|
+
},
|
|
373
|
+
handler,
|
|
374
|
+
});
|
|
375
|
+
}
|
|
376
|
+
/**
 * Auto-register Stelo SDK functions as model-callable tools.
 * Gives the model hands to control the computer.
 *
 * NOTE(review): every handler below takes positional parameters; this
 * assumes handleToolCall maps the model's named-arguments object onto
 * positional values in declaration order — confirm in handleToolCall
 * (not shown in this view).
 *
 * @example
 * ```typescript
 * cortex.registerSteloTools();
 * // Now the model can call: mouse_click, keyboard_type, screen_capture, etc.
 * ```
 */
registerSteloTools() {
    // Mouse tools
    this.registerTool('mouse_click', (x, y, button) => {
        // Final `false` argument: presumably "double click" = no — confirm
        // against native mouseClickAt's signature.
        native.mouseClickAt(x, y, button ?? 'left', false);
        return { success: true, x, y };
    }, {
        description: 'Click the mouse at screen coordinates (x, y)',
        parameters: {
            x: { type: 'number', description: 'X coordinate', required: true },
            y: { type: 'number', description: 'Y coordinate', required: true },
            button: { type: 'string', description: 'Mouse button', enum: ['left', 'right', 'middle'] },
        },
    });
    this.registerTool('mouse_double_click', (x, y) => {
        // Move first, then double-click at the current cursor position.
        native.mouseMove(x, y);
        native.mouseDoubleClick('left');
        return { success: true, x, y };
    }, {
        description: 'Double-click at screen coordinates',
        parameters: {
            x: { type: 'number', description: 'X coordinate', required: true },
            y: { type: 'number', description: 'Y coordinate', required: true },
        },
    });
    this.registerTool('mouse_move', (x, y, smooth) => {
        if (smooth) {
            // 300 ms eased glide for human-like motion.
            native.mouseMoveSmooth(x, y, 300, 'easeInOut');
        }
        else {
            native.mouseMove(x, y);
        }
        return { success: true, x, y };
    }, {
        description: 'Move the mouse cursor to screen coordinates',
        parameters: {
            x: { type: 'number', description: 'X coordinate', required: true },
            y: { type: 'number', description: 'Y coordinate', required: true },
            smooth: { type: 'boolean', description: 'Use smooth movement' },
        },
    });
    this.registerTool('mouse_scroll', (amount, direction) => {
        native.mouseScroll(amount, direction ?? 'down');
        return { success: true };
    }, {
        description: 'Scroll the mouse wheel',
        parameters: {
            amount: { type: 'number', description: 'Scroll amount', required: true },
            direction: { type: 'string', description: 'Scroll direction', enum: ['up', 'down', 'left', 'right'] },
        },
    });
    this.registerTool('mouse_drag', (fromX, fromY, toX, toY) => {
        // 300 ms left-button drag.
        native.mouseDrag(fromX, fromY, toX, toY, 300, 'left');
        return { success: true };
    }, {
        description: 'Drag from one position to another',
        parameters: {
            fromX: { type: 'number', required: true },
            fromY: { type: 'number', required: true },
            toX: { type: 'number', required: true },
            toY: { type: 'number', required: true },
        },
    });
    // Keyboard tools
    this.registerTool('keyboard_type', (text) => {
        native.keyboardType(text);
        return { success: true, typed: text };
    }, {
        description: 'Type text using the keyboard',
        parameters: {
            text: { type: 'string', description: 'Text to type', required: true },
        },
    });
    this.registerTool('keyboard_press', (key) => {
        native.keyboardPress(key);
        return { success: true, key };
    }, {
        description: 'Press a single key (Enter, Tab, Escape, Backspace, etc.)',
        parameters: {
            key: { type: 'string', description: 'Key name', required: true },
        },
    });
    // NOTE(review): the handler collects rest args into an array, but the
    // declared schema advertises a single comma-separated string parameter.
    // If the dispatcher passes one string, `keys` becomes ['ctrl,s'] —
    // verify native.keyboardHotkey accepts that, or that the dispatcher
    // splits the string before calling.
    this.registerTool('keyboard_hotkey', (...keys) => {
        native.keyboardHotkey(keys);
        return { success: true, keys };
    }, {
        description: 'Press a keyboard shortcut (e.g. ctrl+s, alt+f4, ctrl+shift+t)',
        parameters: {
            keys: { type: 'string', description: 'Comma-separated key names for the hotkey combo', required: true },
        },
    });
    // Screen tools
    this.registerTool('screen_capture', () => {
        // Only the dimensions are returned; the pixel data is discarded.
        const cap = native.screenCapture();
        return { width: cap.width, height: cap.height, captured: true };
    }, {
        description: 'Capture the current screen and return dimensions',
    });
    this.registerTool('screen_get_size', () => {
        return native.screenGetSize();
    }, {
        description: 'Get the screen resolution (width, height)',
    });
    this.registerTool('screen_get_pixel_color', (x, y) => {
        return native.screenGetPixelColor(x, y);
    }, {
        description: 'Get the color of a pixel at coordinates',
        parameters: {
            x: { type: 'number', required: true },
            y: { type: 'number', required: true },
        },
    });
    // OCR tools
    this.registerTool('ocr_read_screen', (x, y, width, height) => {
        const result = native.ocrRecognize(x, y, width, height);
        return { text: result.text, words: result.words?.length ?? 0 };
    }, {
        description: 'Read text from the screen using OCR. Can target a specific region.',
        parameters: {
            x: { type: 'number', description: 'Region X (optional)' },
            y: { type: 'number', description: 'Region Y (optional)' },
            width: { type: 'number', description: 'Region width (optional)' },
            height: { type: 'number', description: 'Region height (optional)' },
        },
    });
    this.registerTool('ocr_find_text', (text) => {
        const result = native.ocrFindText(text);
        return result ? { found: true, x: result.x, y: result.y, width: result.width, height: result.height } : { found: false };
    }, {
        description: 'Find specific text on the screen using OCR and return its position',
        parameters: {
            text: { type: 'string', description: 'Text to find', required: true },
        },
    });
    // Window tools
    this.registerTool('window_list', () => {
        // Project only the JSON-friendly fields the model needs.
        return native.windowList().map((w) => ({
            id: w.id, title: w.title, x: w.x, y: w.y, width: w.width, height: w.height,
            processName: w.processName, isVisible: w.isVisible,
        }));
    }, {
        description: 'List all open windows with their titles, positions, and sizes',
    });
    this.registerTool('window_focus', (titleSubstring) => {
        const windows = native.windowList();
        // Case-insensitive substring match; only visible windows qualify.
        const match = windows.find((w) => w.title.toLowerCase().includes(titleSubstring.toLowerCase()) && w.isVisible);
        if (match) {
            native.windowFocus(match.id);
            return { success: true, title: match.title };
        }
        return { success: false, error: 'Window not found' };
    }, {
        description: 'Focus a window by partial title match',
        parameters: {
            titleSubstring: { type: 'string', description: 'Part of the window title', required: true },
        },
    });
    // Clipboard tools
    this.registerTool('clipboard_read', () => {
        return { text: native.clipboardRead() };
    }, {
        description: 'Read the current clipboard text content',
    });
    this.registerTool('clipboard_write', (text) => {
        native.clipboardWrite(text);
        return { success: true };
    }, {
        description: 'Write text to the clipboard',
        parameters: {
            text: { type: 'string', description: 'Text to copy', required: true },
        },
    });
    // Wait/verify tools
    this.registerTool('wait_ms', async (ms) => {
        // Cap the wait at 30 s so the model cannot stall the session;
        // note the returned `waited` echoes the requested (uncapped) value.
        await new Promise(r => setTimeout(r, Math.min(ms, 30000)));
        return { waited: ms };
    }, {
        description: 'Wait for a specified number of milliseconds',
        parameters: {
            ms: { type: 'number', description: 'Milliseconds to wait', required: true },
        },
        nonBlocking: true,
    });
}
|
|
569
|
+
/**
|
|
570
|
+
* Get all registered tool names.
|
|
571
|
+
*/
|
|
572
|
+
get registeredTools() {
|
|
573
|
+
return Array.from(this.tools.keys());
|
|
574
|
+
}
|
|
575
|
+
// ═══ EVENTS ═══════════════════════════════════════════════════════════
|
|
576
|
+
/**
|
|
577
|
+
* Listen for Cortex events.
|
|
578
|
+
*
|
|
579
|
+
* @example
|
|
580
|
+
* ```typescript
|
|
581
|
+
* cortex.on('stateChange', (state, prev) => {
|
|
582
|
+
* console.log(`${prev} → ${state}`);
|
|
583
|
+
* });
|
|
584
|
+
*
|
|
585
|
+
* cortex.on('modelText', (text, isFinal) => {
|
|
586
|
+
* if (isFinal) console.log('Model said:', text);
|
|
587
|
+
* });
|
|
588
|
+
*
|
|
589
|
+
* cortex.on('toolCall', (name, args) => {
|
|
590
|
+
* console.log(`Model calling: ${name}(${JSON.stringify(args)})`);
|
|
591
|
+
* });
|
|
592
|
+
*
|
|
593
|
+
* cortex.on('transcript', (text, source, isFinal) => {
|
|
594
|
+
* if (source === 'user' && isFinal) console.log('User said:', text);
|
|
595
|
+
* });
|
|
596
|
+
* ```
|
|
597
|
+
*/
|
|
598
|
+
on(event, handler) {
|
|
599
|
+
const handlers = this.listeners.get(event) ?? [];
|
|
600
|
+
handlers.push(handler);
|
|
601
|
+
this.listeners.set(event, handlers);
|
|
602
|
+
return () => {
|
|
603
|
+
const list = this.listeners.get(event);
|
|
604
|
+
if (list) {
|
|
605
|
+
const idx = list.indexOf(handler);
|
|
606
|
+
if (idx >= 0)
|
|
607
|
+
list.splice(idx, 1);
|
|
608
|
+
}
|
|
609
|
+
};
|
|
610
|
+
}
|
|
611
|
+
/**
 * Listen for an event once.
 *
 * Wraps the handler so it unsubscribes itself before firing; the
 * returned unsubscriber can also cancel it before it ever fires.
 *
 * @param event   Event name.
 * @param handler Callback invoked with the event's arguments, at most once.
 * @returns Unsubscribe function for early cancellation.
 */
once(event, handler) {
    const unsub = this.on(event, ((...args) => {
        // Unsubscribe first so a re-entrant emit from the handler
        // cannot trigger this wrapper a second time.
        unsub();
        handler(...args);
    }));
    return unsub;
}
|
|
621
|
+
/**
|
|
622
|
+
* Wait for a specific event. Returns a promise that resolves with the event args.
|
|
623
|
+
*
|
|
624
|
+
* @example
|
|
625
|
+
* ```typescript
|
|
626
|
+
* cortex.say("What's on the screen?");
|
|
627
|
+
* const [text] = await cortex.waitFor('modelText', 10000);
|
|
628
|
+
* console.log('Model responded:', text);
|
|
629
|
+
* ```
|
|
630
|
+
*/
|
|
631
|
+
waitFor(event, timeoutMs = 30000) {
|
|
632
|
+
return new Promise((resolve, reject) => {
|
|
633
|
+
const timer = setTimeout(() => {
|
|
634
|
+
unsub();
|
|
635
|
+
reject(new Error(`Cortex.waitFor('${event}') timed out after ${timeoutMs}ms`));
|
|
636
|
+
}, timeoutMs);
|
|
637
|
+
const unsub = this.on(event, ((...args) => {
|
|
638
|
+
clearTimeout(timer);
|
|
639
|
+
unsub();
|
|
640
|
+
resolve(args);
|
|
641
|
+
}));
|
|
642
|
+
});
|
|
643
|
+
}
|
|
644
|
+
// ═══ STATE ════════════════════════════════════════════════════════════
// Read-only views over private session fields; all mutation goes through
// setState()/setTurn() or the internal handlers.
/** Current conversation state ('idle', 'listening', 'speaking', 'acting', 'interrupted', 'disconnected'). */
get state() { return this._state; }
/** Who currently holds the conversational floor: 'user', 'model', or 'none'. */
get turn() { return this._turn; }
/** Whether connected to a model. */
get isConnected() { return this._connected; }
/** Whether screen feed is active. */
get isScreenFeedActive() { return this._screenFeedActive; }
/** Whether audio feed is active. */
get isAudioFeedActive() { return this._audioFeedActive; }
/** Last session resume token received from the provider, for reconnection. */
get resumeToken() { return this._resumeToken; }
/** Total tokens consumed this session. */
get totalTokensUsed() { return this._totalTokensUsed; }
/** Number of pending (in-flight) tool calls. */
get pendingToolCallCount() { return this.pendingToolCalls.size; }
|
|
661
|
+
// ═══ CONVENIENCE: Ask & Wait ══════════════════════════════════════════
|
|
662
|
+
/**
|
|
663
|
+
* Send text and wait for the model's complete text response.
|
|
664
|
+
* This is the simplest way to have a conversation turn.
|
|
665
|
+
*
|
|
666
|
+
* @example
|
|
667
|
+
* ```typescript
|
|
668
|
+
* const response = await cortex.ask("What application is currently focused?");
|
|
669
|
+
* console.log(response); // "I can see that Notepad is currently focused..."
|
|
670
|
+
* ```
|
|
671
|
+
*/
|
|
672
|
+
async ask(text, timeoutMs = 30000) {
|
|
673
|
+
this.modelTextBuffer = '';
|
|
674
|
+
let transcriptBuffer = '';
|
|
675
|
+
this.say(text);
|
|
676
|
+
return new Promise((resolve, reject) => {
|
|
677
|
+
const timer = setTimeout(() => {
|
|
678
|
+
cleanup();
|
|
679
|
+
resolve(this.modelTextBuffer || transcriptBuffer || '(no response)');
|
|
680
|
+
}, timeoutMs);
|
|
681
|
+
const unsubText = this.on('modelText', (chunk, isFinal) => {
|
|
682
|
+
if (isFinal && this.modelTextBuffer) {
|
|
683
|
+
clearTimeout(timer);
|
|
684
|
+
cleanup();
|
|
685
|
+
resolve(this.modelTextBuffer);
|
|
686
|
+
}
|
|
687
|
+
});
|
|
688
|
+
// Also listen for model transcripts (when model responds with audio)
|
|
689
|
+
const unsubTranscript = this.on('transcript', (t, source, isFinal) => {
|
|
690
|
+
if (source === 'model') {
|
|
691
|
+
transcriptBuffer += t;
|
|
692
|
+
}
|
|
693
|
+
});
|
|
694
|
+
const unsubComplete = this.on('turnChange', (owner) => {
|
|
695
|
+
if (owner !== 'model' && (this.modelTextBuffer || transcriptBuffer)) {
|
|
696
|
+
clearTimeout(timer);
|
|
697
|
+
cleanup();
|
|
698
|
+
resolve(this.modelTextBuffer || transcriptBuffer);
|
|
699
|
+
}
|
|
700
|
+
});
|
|
701
|
+
const unsubErr = this.on('error', (err) => {
|
|
702
|
+
clearTimeout(timer);
|
|
703
|
+
cleanup();
|
|
704
|
+
reject(err);
|
|
705
|
+
});
|
|
706
|
+
const cleanup = () => { unsubText(); unsubComplete(); unsubErr(); unsubTranscript(); };
|
|
707
|
+
});
|
|
708
|
+
}
|
|
709
|
+
/**
|
|
710
|
+
* Tell the model to perform an action and wait for tool execution to complete.
|
|
711
|
+
*
|
|
712
|
+
* @example
|
|
713
|
+
* ```typescript
|
|
714
|
+
* await cortex.command("Click on the Start menu");
|
|
715
|
+
* // Model calls mouse_click tool, Cortex executes it
|
|
716
|
+
* await cortex.command("Type 'notepad' and press Enter");
|
|
717
|
+
* // Model calls keyboard_type then keyboard_press
|
|
718
|
+
* ```
|
|
719
|
+
*/
|
|
720
|
+
async command(instruction, timeoutMs = 30000) {
|
|
721
|
+
this.say(instruction);
|
|
722
|
+
return new Promise((resolve, reject) => {
|
|
723
|
+
const timer = setTimeout(() => {
|
|
724
|
+
cleanup();
|
|
725
|
+
resolve();
|
|
726
|
+
}, timeoutMs);
|
|
727
|
+
// Resolve when model turn completes and no pending tool calls
|
|
728
|
+
const unsubTurn = this.on('turnChange', (owner) => {
|
|
729
|
+
if (owner !== 'model' && this.pendingToolCalls.size === 0) {
|
|
730
|
+
clearTimeout(timer);
|
|
731
|
+
cleanup();
|
|
732
|
+
resolve();
|
|
733
|
+
}
|
|
734
|
+
});
|
|
735
|
+
const unsubErr = this.on('error', (err) => {
|
|
736
|
+
clearTimeout(timer);
|
|
737
|
+
cleanup();
|
|
738
|
+
reject(err);
|
|
739
|
+
});
|
|
740
|
+
const cleanup = () => { unsubTurn(); unsubErr(); };
|
|
741
|
+
});
|
|
742
|
+
}
|
|
743
|
+
// ═══ INTERNAL HANDLERS ════════════════════════════════════════════════
|
|
744
|
+
handleModelAudio(data, sampleRate) {
|
|
745
|
+
if (this._state !== 'speaking') {
|
|
746
|
+
this.setState('speaking');
|
|
747
|
+
this.setTurn('model');
|
|
748
|
+
}
|
|
749
|
+
this.emit('modelAudio', data, sampleRate);
|
|
750
|
+
this.touchActivity();
|
|
751
|
+
}
|
|
752
|
+
handleModelText(text, isFinal) {
|
|
753
|
+
this.modelTextBuffer += text;
|
|
754
|
+
if (this._state !== 'speaking') {
|
|
755
|
+
this.setState('speaking');
|
|
756
|
+
this.setTurn('model');
|
|
757
|
+
}
|
|
758
|
+
this.emit('modelText', text, isFinal);
|
|
759
|
+
if (isFinal) {
|
|
760
|
+
this.modelTextBuffer = '';
|
|
761
|
+
}
|
|
762
|
+
this.touchActivity();
|
|
763
|
+
}
|
|
764
|
+
handleTranscript(text, source, isFinal) {
|
|
765
|
+
if (source === 'user') {
|
|
766
|
+
this.userTranscriptBuffer += text;
|
|
767
|
+
if (this._state !== 'listening') {
|
|
768
|
+
this.setState('listening');
|
|
769
|
+
this.setTurn('user');
|
|
770
|
+
}
|
|
771
|
+
if (isFinal)
|
|
772
|
+
this.userTranscriptBuffer = '';
|
|
773
|
+
}
|
|
774
|
+
this.emit('transcript', text, source, isFinal);
|
|
775
|
+
}
|
|
776
|
+
async handleToolCall(id, name, args) {
|
|
777
|
+
this.setState('acting');
|
|
778
|
+
this.emit('toolCall', name, args, id);
|
|
779
|
+
const tool = this.tools.get(name);
|
|
780
|
+
if (!tool) {
|
|
781
|
+
// Unknown tool — send error response
|
|
782
|
+
this.transport?.sendToolResponse(id, { error: `Unknown tool: ${name}` });
|
|
783
|
+
return;
|
|
784
|
+
}
|
|
785
|
+
const pending = { id, name, args, startTime: Date.now(), resolved: false };
|
|
786
|
+
this.pendingToolCalls.set(id, pending);
|
|
787
|
+
try {
|
|
788
|
+
// Parse args: if tool handler expects positional args, spread them
|
|
789
|
+
let result;
|
|
790
|
+
if (typeof args === 'object' && args !== null && !Array.isArray(args)) {
|
|
791
|
+
const paramNames = Object.keys(tool.definition.parameters ?? {});
|
|
792
|
+
if (paramNames.length > 0) {
|
|
793
|
+
const positionalArgs = paramNames.map(p => args[p]);
|
|
794
|
+
result = await tool.handler(...positionalArgs);
|
|
795
|
+
}
|
|
796
|
+
else {
|
|
797
|
+
result = await tool.handler(args);
|
|
798
|
+
}
|
|
799
|
+
}
|
|
800
|
+
else {
|
|
801
|
+
result = await tool.handler(args);
|
|
802
|
+
}
|
|
803
|
+
const durationMs = Date.now() - pending.startTime;
|
|
804
|
+
pending.resolved = true;
|
|
805
|
+
this.pendingToolCalls.delete(id);
|
|
806
|
+
// Send result back to model
|
|
807
|
+
const scheduling = tool.definition.nonBlocking ? 'when_idle' : undefined;
|
|
808
|
+
this.transport?.sendToolResponse(id, result, scheduling);
|
|
809
|
+
this.emit('toolResult', name, result, durationMs);
|
|
810
|
+
// If no more pending calls, return to idle
|
|
811
|
+
if (this.pendingToolCalls.size === 0 && this._state === 'acting') {
|
|
812
|
+
this.setState('idle');
|
|
813
|
+
}
|
|
814
|
+
}
|
|
815
|
+
catch (err) {
|
|
816
|
+
this.pendingToolCalls.delete(id);
|
|
817
|
+
this.transport?.sendToolResponse(id, { error: err.message ?? String(err) });
|
|
818
|
+
this.emit('error', err instanceof Error ? err : new Error(String(err)));
|
|
819
|
+
}
|
|
820
|
+
this.touchActivity();
|
|
821
|
+
}
|
|
822
|
+
handleTurnComplete() {
|
|
823
|
+
this.setTurn('none');
|
|
824
|
+
if (this._state === 'speaking' || this._state === 'thinking') {
|
|
825
|
+
this.setState('idle');
|
|
826
|
+
}
|
|
827
|
+
this.modelTextBuffer = '';
|
|
828
|
+
}
|
|
829
|
+
handleGenerationComplete() {
|
|
830
|
+
if (this.pendingToolCalls.size === 0) {
|
|
831
|
+
this.setState('idle');
|
|
832
|
+
this.setTurn('none');
|
|
833
|
+
}
|
|
834
|
+
}
|
|
835
|
+
handleInterrupted() {
|
|
836
|
+
this.setState('interrupted');
|
|
837
|
+
this.emit('interrupted');
|
|
838
|
+
// After interruption, go back to listening (user is speaking)
|
|
839
|
+
setTimeout(() => {
|
|
840
|
+
if (this._state === 'interrupted') {
|
|
841
|
+
this.setState('listening');
|
|
842
|
+
this.setTurn('user');
|
|
843
|
+
}
|
|
844
|
+
}, 100);
|
|
845
|
+
}
|
|
846
|
+
/**
 * React to the transport dropping: tear down feeds and timers, mark the
 * session disconnected, and — when configured — attempt one auto-reconnect
 * using the last session resume token.
 *
 * @param {string} reason Provider-supplied close reason, re-emitted on 'disconnected'.
 */
async handleDisconnect(reason) {
    // Capture before clearing so we only notify/reconnect on a real drop
    // (this handler may fire again for an already-disconnected session).
    const wasConnected = this._connected;
    this._connected = false;
    this.stopScreenFeed();
    this.stopAudioFeed();
    this.stopIdleTimer();
    this.setState('disconnected');
    if (wasConnected) {
        this.emit('disconnected', reason);
        // Auto-reconnect — only when enabled AND a resume token exists.
        if (this.config?.autoReconnect && this._resumeToken) {
            try {
                await new Promise(r => setTimeout(r, 1000)); // Brief delay
                const reconnectConfig = { ...this.config };
                reconnectConfig.resumeToken = this._resumeToken;
                await this.connect(reconnectConfig);
            }
            catch (err) {
                // Surface the failure; no further retries are attempted here.
                this.emit('error', err instanceof Error ? err : new Error('Reconnection failed'));
            }
        }
    }
}
|
|
869
|
+
// ═══ STATE MACHINE ════════════════════════════════════════════════════
|
|
870
|
+
setState(newState) {
|
|
871
|
+
if (this._state === newState)
|
|
872
|
+
return;
|
|
873
|
+
const oldState = this._state;
|
|
874
|
+
this._state = newState;
|
|
875
|
+
this.emit('stateChange', newState, oldState);
|
|
876
|
+
}
|
|
877
|
+
setTurn(owner) {
|
|
878
|
+
if (this._turn === owner)
|
|
879
|
+
return;
|
|
880
|
+
this._turn = owner;
|
|
881
|
+
this.emit('turnChange', owner);
|
|
882
|
+
}
|
|
883
|
+
emit(event, ...args) {
|
|
884
|
+
const handlers = this.listeners.get(event);
|
|
885
|
+
if (!handlers)
|
|
886
|
+
return;
|
|
887
|
+
for (const handler of handlers) {
|
|
888
|
+
try {
|
|
889
|
+
handler(...args);
|
|
890
|
+
}
|
|
891
|
+
catch { /* listener error, don't crash cortex */ }
|
|
892
|
+
}
|
|
893
|
+
}
|
|
894
|
+
// ═══ UTILITIES ════════════════════════════════════════════════════════
|
|
895
|
+
ensureConnected() {
|
|
896
|
+
if (!this._connected || !this.transport) {
|
|
897
|
+
throw new Error('Cortex is not connected. Call connect() first.');
|
|
898
|
+
}
|
|
899
|
+
}
|
|
900
|
+
/** Stamp "now" as the last activity time; consumed by the idle-timeout poller. */
touchActivity() {
    this.lastActivityTime = Date.now();
}
|
|
903
|
+
startIdleTimer(timeoutMs) {
|
|
904
|
+
this.stopIdleTimer();
|
|
905
|
+
this.idleTimer = setInterval(() => {
|
|
906
|
+
if (Date.now() - this.lastActivityTime > timeoutMs) {
|
|
907
|
+
this.disconnect();
|
|
908
|
+
}
|
|
909
|
+
}, Math.min(timeoutMs / 2, 30000));
|
|
910
|
+
}
|
|
911
|
+
stopIdleTimer() {
|
|
912
|
+
if (this.idleTimer) {
|
|
913
|
+
clearInterval(this.idleTimer);
|
|
914
|
+
this.idleTimer = null;
|
|
915
|
+
}
|
|
916
|
+
}
|
|
917
|
+
}
|
|
918
|
+
exports.Cortex = Cortex;
|
|
919
|
+
// ═════════════════════════════════════════════════════════════════════════════
|
|
920
|
+
// AUDIO UTILITIES
|
|
921
|
+
// ═════════════════════════════════════════════════════════════════════════════
|
|
922
|
+
/**
 * Convert little-endian Float32 PCM samples (packed in a Buffer) to 16-bit
 * signed PCM, resampling from `inputRate` to `outputRate` with linear
 * interpolation. Samples are clamped to [-1, 1] before scaling by 32767.
 * Most real-time APIs expect 16-bit PCM at 16kHz.
 *
 * @param {Buffer} input Raw bytes viewed as float32 samples (extra trailing bytes ignored).
 * @param {number} inputRate Sample rate of `input` in Hz.
 * @param {number} outputRate Desired output sample rate in Hz.
 * @returns {Buffer} Int16 little-endian PCM.
 */
function float32ToInt16(input, inputRate, outputRate) {
    const sampleCount = Math.floor(input.length / 4);
    const samples = new Float32Array(input.buffer, input.byteOffset, sampleCount);
    // Linear-interpolation resampler: each output index maps to a fractional
    // source position advanced by inputRate/outputRate per step.
    const step = inputRate / outputRate;
    const outCount = Math.floor(sampleCount / step);
    const out = Buffer.alloc(outCount * 2); // 2 bytes per int16
    for (let i = 0; i < outCount; i++) {
        const pos = i * step;
        const lo = Math.floor(pos);
        const hi = Math.min(lo + 1, sampleCount - 1); // clamp at the last sample
        const t = pos - lo;
        const mixed = samples[lo] * (1 - t) + samples[hi] * t;
        const bounded = Math.max(-1, Math.min(1, mixed));
        out.writeInt16LE(Math.round(bounded * 32767), i * 2);
    }
    return out;
}
|
|
947
|
+
// ═════════════════════════════════════════════════════════════════════════════
|
|
948
|
+
// BUILT-IN PROVIDER FACTORIES
|
|
949
|
+
// ═════════════════════════════════════════════════════════════════════════════
|
|
950
|
+
//
|
|
951
|
+
// These create CortexProvider instances for popular real-time AI APIs.
|
|
952
|
+
// They handle the WebSocket protocol translation so Cortex doesn't have to
|
|
953
|
+
// know about Gemini's BidiGenerateContent or OpenAI's realtime protocol.
|
|
954
|
+
//
|
|
955
|
+
/**
 * Create a Gemini Live API provider.
 *
 * @example
 * ```typescript
 * const cortex = new Cortex();
 * await cortex.connect({
 *   provider: geminiLiveProvider(),
 *   connection: {
 *     apiKey: 'YOUR_API_KEY',
 *     model: 'gemini-3.1-flash-live-preview',
 *     responseModalities: ['audio'],
 *     voice: 'Kore',
 *     systemInstruction: 'You are a desktop assistant.',
 *     transcribeInput: true,
 *     transcribeOutput: true,
 *     providerOptions: {
 *       // Gemini-specific
 *       enableAffectiveDialog: true,
 *       proactiveAudio: true,
 *       thinkingLevel: 'low',
 *       contextWindowCompression: true,
 *     },
 *   },
 * });
 * ```
 */
function geminiLiveProvider() {
    return {
        name: 'gemini-live',
        // Opens the BidiGenerateContent WebSocket, sends the setup message,
        // waits for the server's acknowledgment, then returns a transport
        // whose methods translate Cortex calls into Gemini wire messages.
        // Decoded server events are forwarded through `handlers`.
        async connect(config, handlers) {
            const apiVersion = config.providerOptions?.apiVersion ?? 'v1beta';
            const host = config.providerOptions?.host ?? 'generativelanguage.googleapis.com';
            const wsUrl = `wss://${host}/ws/google.ai.generativelanguage.${apiVersion}.GenerativeService.BidiGenerateContent?key=${config.apiKey}`;
            // Build setup message — uses snake_case per Gemini BidiGenerateContent protocol
            const setupMsg = {
                setup: {
                    model: `models/${config.model}`,
                    generation_config: {
                        response_modalities: (config.responseModalities ?? ['audio']).map(m => m.toUpperCase()),
                    },
                },
            };
            if (config.systemInstruction) {
                setupMsg.setup.system_instruction = { parts: [{ text: config.systemInstruction }] };
            }
            if (config.voice) {
                setupMsg.setup.generation_config.speech_config = {
                    voice_config: { prebuilt_voice_config: { voice_name: config.voice } },
                };
            }
            // Empty objects enable transcription with server defaults.
            if (config.transcribeInput) {
                setupMsg.setup.input_audio_transcription = {};
            }
            if (config.transcribeOutput) {
                setupMsg.setup.output_audio_transcription = {};
            }
            // Translate Cortex tool definitions into Gemini function declarations
            // (types uppercased to the protocol's enum form, e.g. 'string' -> 'STRING').
            if (config.tools && config.tools.length > 0) {
                setupMsg.setup.tools = [{
                        function_declarations: config.tools.map(t => ({
                            name: t.name,
                            description: t.description,
                            parameters: t.parameters ? {
                                type: 'OBJECT',
                                properties: Object.fromEntries(Object.entries(t.parameters).map(([k, v]) => [k, { type: v.type.toUpperCase(), description: v.description }])),
                                required: Object.entries(t.parameters).filter(([, v]) => v.required).map(([k]) => k),
                            } : undefined,
                            ...(t.nonBlocking ? { behavior: 'NON_BLOCKING' } : {}),
                        })),
                    }];
            }
            // Provider-specific options
            const opts = config.providerOptions ?? {};
            if (opts.enableAffectiveDialog) {
                setupMsg.setup.enable_affective_dialog = true;
            }
            if (opts.proactiveAudio) {
                setupMsg.setup.proactivity = { proactive_audio: true };
            }
            if (opts.thinkingLevel) {
                setupMsg.setup.generation_config.thinking_config = { thinking_level: opts.thinkingLevel };
            }
            if (opts.contextWindowCompression) {
                setupMsg.setup.context_window_compression = { sliding_window: {} };
            }
            if (opts.sessionResumptionHandle) {
                setupMsg.setup.session_resumption = { handle: opts.sessionResumptionHandle };
            }
            // Connect WebSocket and wait for setupComplete acknowledgment
            const ws = new WebSocket(wsUrl);
            await new Promise((resolve, reject) => {
                const timeout = setTimeout(() => reject(new Error('Connection timeout')), 15000);
                let setupSent = false;
                ws.onopen = () => {
                    ws.send(JSON.stringify(setupMsg));
                    setupSent = true;
                };
                ws.onmessage = async (event) => {
                    // First message back is { setupComplete: {} }
                    // NOTE(review): this resolves on the FIRST message after setup
                    // is sent without inspecting its payload — confirm the server
                    // never sends anything before setupComplete.
                    if (setupSent) {
                        clearTimeout(timeout);
                        resolve();
                    }
                };
                ws.onerror = (e) => { clearTimeout(timeout); reject(new Error(`WebSocket error: ${e.message ?? 'connection failed'}`)); };
                ws.onclose = (ev) => { clearTimeout(timeout); reject(new Error(`WebSocket closed during setup (code=${ev?.code} reason=${ev?.reason || 'none'})`)); };
            });
            // Helper to extract text from MessageEvent.data (handles string and Blob)
            async function extractText(data) {
                if (typeof data === 'string')
                    return data;
                if (typeof Blob !== 'undefined' && data instanceof Blob)
                    return data.text();
                if (Buffer.isBuffer(data))
                    return data.toString('utf-8');
                return String(data);
            }
            // Message router — replaces the setup-phase onmessage handler above.
            ws.onmessage = (event) => {
                extractText(event.data).then(raw => {
                    const msg = JSON.parse(raw);
                    if (msg.serverContent) {
                        const sc = msg.serverContent;
                        if (sc.interrupted) {
                            handlers.interrupted();
                        }
                        if (sc.modelTurn?.parts) {
                            for (const part of sc.modelTurn.parts) {
                                if (part.inlineData?.data) {
                                    // Model audio arrives base64-encoded; 24000 is the
                                    // sample rate this code assumes for Gemini output.
                                    const audioData = Buffer.from(part.inlineData.data, 'base64');
                                    handlers.audio(audioData, 24000);
                                }
                                if (part.text) {
                                    handlers.text(part.text, false);
                                }
                            }
                        }
                        // Each transcription chunk is reported with isFinal=true.
                        if (sc.outputTranscription?.text) {
                            handlers.transcript(sc.outputTranscription.text, 'model', true);
                        }
                        if (sc.inputTranscription?.text) {
                            handlers.transcript(sc.inputTranscription.text, 'user', true);
                        }
                        if (sc.turnComplete) {
                            handlers.text('', true); // Signal final text
                            handlers.turnComplete();
                        }
                        if (sc.generationComplete) {
                            handlers.generationComplete();
                        }
                    }
                    if (msg.toolCall?.functionCalls) {
                        for (const fc of msg.toolCall.functionCalls) {
                            handlers.toolCall(fc.id, fc.name, fc.args ?? {});
                        }
                    }
                    if (msg.usageMetadata) {
                        handlers.usage(msg.usageMetadata.totalTokenCount ?? msg.usageMetadata.promptTokenCount ?? 0, msg.usageMetadata.promptTokenCount ?? 0, msg.usageMetadata.candidatesTokenCount ?? 0);
                    }
                    if (msg.sessionResumptionUpdate?.newHandle) {
                        handlers.resumeToken(msg.sessionResumptionUpdate.newHandle);
                    }
                    if (msg.goAway) {
                        // Server announced impending shutdown; surface as a close.
                        handlers.close('server_go_away');
                    }
                }).catch((err) => {
                    handlers.error(new Error(`Message parse error: ${err.message}`));
                });
            };
            ws.onerror = (e) => handlers.error(new Error(`Gemini WS error: ${e.message ?? 'unknown'}`));
            ws.onclose = (ev) => handlers.close(`gemini_closed code=${ev?.code} reason=${ev?.reason || 'none'}`);
            // Build transport — uses snake_case per Gemini protocol.
            // Every sender checks readyState and silently drops when not OPEN.
            const transport = {
                sendAudio(pcm, sampleRate) {
                    if (ws.readyState !== WebSocket.OPEN)
                        return;
                    ws.send(JSON.stringify({
                        realtime_input: {
                            audio: {
                                data: pcm.toString('base64'),
                                mimeType: `audio/pcm;rate=${sampleRate}`,
                            },
                        },
                    }));
                },
                sendVideo(jpeg) {
                    if (ws.readyState !== WebSocket.OPEN)
                        return;
                    ws.send(JSON.stringify({
                        realtime_input: {
                            video: {
                                data: jpeg.toString('base64'),
                                mimeType: 'image/jpeg',
                            },
                        },
                    }));
                },
                sendText(text, role) {
                    if (ws.readyState !== WebSocket.OPEN)
                        return;
                    if (role === 'system') {
                        // Gemini realtime_input has no system role; the text is
                        // tagged inline instead.
                        ws.send(JSON.stringify({
                            realtime_input: { text: `[System]: ${text}` },
                        }));
                    }
                    else {
                        ws.send(JSON.stringify({
                            realtime_input: { text },
                        }));
                    }
                },
                sendToolResponse(callId, result, scheduling) {
                    if (ws.readyState !== WebSocket.OPEN)
                        return;
                    const funcResponse = {
                        name: callId,
                        id: callId,
                        response: typeof result === 'string' ? { result } : { result: JSON.stringify(result) },
                    };
                    if (scheduling) {
                        // e.g. 'when_idle' -> 'WHEN_IDLE' for non-blocking tools.
                        funcResponse.response.scheduling = scheduling.toUpperCase();
                    }
                    ws.send(JSON.stringify({
                        tool_response: {
                            function_responses: [funcResponse],
                        },
                    }));
                },
                sendActivityStart() {
                    if (ws.readyState !== WebSocket.OPEN)
                        return;
                    ws.send(JSON.stringify({ realtime_input: { activity_start: {} } }));
                },
                sendActivityEnd() {
                    if (ws.readyState !== WebSocket.OPEN)
                        return;
                    ws.send(JSON.stringify({ realtime_input: { activity_end: {} } }));
                },
                interrupt() {
                    if (ws.readyState !== WebSocket.OPEN)
                        return;
                    // NOTE(review): sends an empty text as a barge-in signal —
                    // verify this is the intended interruption mechanism.
                    ws.send(JSON.stringify({ realtime_input: { text: '' } }));
                },
                async close() {
                    ws.close();
                },
            };
            return transport;
        },
    };
}
|
|
1206
|
+
/**
 * Create an OpenAI Realtime API provider.
 *
 * @example
 * ```typescript
 * await cortex.connect({
 *   provider: openAIRealtimeProvider(),
 *   connection: {
 *     apiKey: 'YOUR_OPENAI_KEY',
 *     model: 'gpt-4o-realtime-preview',
 *     responseModalities: ['audio', 'text'],
 *     voice: 'alloy',
 *   },
 * });
 * ```
 */
function openAIRealtimeProvider() {
    return {
        name: 'openai-realtime',
        // Opens the /v1/realtime WebSocket, pushes a session.update with the
        // Cortex configuration, and returns a transport that speaks the
        // OpenAI Realtime event protocol. Server events flow via `handlers`.
        async connect(config, handlers) {
            const host = config.providerOptions?.host ?? 'api.openai.com';
            const wsUrl = `wss://${host}/v1/realtime?model=${config.model}`;
            // Auth travels in headers (requires a WebSocket impl that accepts
            // an options object, e.g. the 'ws' package in Node).
            const ws = new WebSocket(wsUrl, {
                headers: {
                    'Authorization': `Bearer ${config.apiKey}`,
                    'OpenAI-Beta': 'realtime=v1',
                },
            });
            await new Promise((resolve, reject) => {
                const timeout = setTimeout(() => reject(new Error('Connection timeout')), 15000);
                ws.onopen = () => {
                    clearTimeout(timeout);
                    // Send session.update with configuration
                    const sessionUpdate = {
                        type: 'session.update',
                        session: {
                            modalities: config.responseModalities ?? ['audio', 'text'],
                            instructions: config.systemInstruction ?? '',
                            voice: config.voice ?? 'alloy',
                            input_audio_format: 'pcm16',
                            output_audio_format: 'pcm16',
                            // undefined keys are dropped by JSON.stringify below.
                            input_audio_transcription: config.transcribeInput ? { model: 'whisper-1' } : undefined,
                            // null disables server VAD when the caller wants manual turn control.
                            turn_detection: config.providerOptions?.manualVAD ? null : { type: 'server_vad' },
                        },
                    };
                    if (config.tools && config.tools.length > 0) {
                        sessionUpdate.session.tools = config.tools.map(t => ({
                            type: 'function',
                            name: t.name,
                            description: t.description,
                            parameters: t.parameters ? {
                                type: 'object',
                                properties: Object.fromEntries(Object.entries(t.parameters).map(([k, v]) => [k, { type: v.type, description: v.description }])),
                                required: Object.entries(t.parameters).filter(([, v]) => v.required).map(([k]) => k),
                            } : { type: 'object', properties: {} },
                        }));
                    }
                    ws.send(JSON.stringify(sessionUpdate));
                    // NOTE(review): resolves immediately after sending — the
                    // 'session.updated' acknowledgment is not awaited; confirm
                    // that is acceptable for this protocol.
                    resolve();
                };
                ws.onerror = (e) => { clearTimeout(timeout); reject(new Error(`WebSocket error: ${e.message ?? 'connection failed'}`)); };
            });
            // Message router — one handler dispatching on the server event type.
            ws.onmessage = (event) => {
                try {
                    const msg = JSON.parse(typeof event.data === 'string' ? event.data : event.data.toString());
                    switch (msg.type) {
                        case 'response.audio.delta':
                            if (msg.delta) {
                                // base64 PCM16 chunks; 24000 is the assumed output rate.
                                handlers.audio(Buffer.from(msg.delta, 'base64'), 24000);
                            }
                            break;
                        case 'response.text.delta':
                            if (msg.delta) {
                                handlers.text(msg.delta, false);
                            }
                            break;
                        case 'response.text.done':
                            if (msg.text)
                                handlers.text(msg.text, true);
                            break;
                        case 'response.audio_transcript.delta':
                            handlers.transcript(msg.delta ?? '', 'model', false);
                            break;
                        case 'response.audio_transcript.done':
                            handlers.transcript(msg.transcript ?? '', 'model', true);
                            break;
                        case 'conversation.item.input_audio_transcription.completed':
                            handlers.transcript(msg.transcript ?? '', 'user', true);
                            break;
                        case 'response.function_call_arguments.done':
                            try {
                                const args = JSON.parse(msg.arguments ?? '{}');
                                handlers.toolCall(msg.call_id, msg.name, args);
                            }
                            catch { /* parse error — malformed tool arguments are dropped */ }
                            break;
                        case 'response.done':
                            handlers.turnComplete();
                            if (msg.response?.usage) {
                                handlers.usage(msg.response.usage.total_tokens ?? 0, msg.response.usage.input_tokens ?? 0, msg.response.usage.output_tokens ?? 0);
                            }
                            break;
                        case 'input_audio_buffer.speech_started':
                            // User started speaking — model will be interrupted
                            break;
                        case 'input_audio_buffer.speech_stopped':
                            break;
                        case 'error':
                            handlers.error(new Error(`OpenAI: ${msg.error?.message ?? JSON.stringify(msg.error)}`));
                            break;
                    }
                }
                catch (err) {
                    handlers.error(new Error(`Message parse error: ${err.message}`));
                }
            };
            ws.onerror = (e) => handlers.error(new Error(`WebSocket error: ${e.message ?? 'unknown'}`));
            ws.onclose = () => handlers.close('connection_closed');
            // Transport: every sender checks readyState and drops when not OPEN.
            const transport = {
                sendAudio(pcm, _sampleRate) {
                    // Sample rate is ignored — the session is configured for pcm16.
                    if (ws.readyState !== WebSocket.OPEN)
                        return;
                    ws.send(JSON.stringify({
                        type: 'input_audio_buffer.append',
                        audio: pcm.toString('base64'),
                    }));
                },
                sendVideo(_jpeg) {
                    // OpenAI Realtime doesn't support video frames directly
                    // Could be sent as a conversation item with image
                },
                sendText(text, role) {
                    if (ws.readyState !== WebSocket.OPEN)
                        return;
                    // Create the conversation item, then explicitly ask for a response.
                    ws.send(JSON.stringify({
                        type: 'conversation.item.create',
                        item: {
                            type: 'message',
                            role: role ?? 'user',
                            content: [{ type: 'input_text', text }],
                        },
                    }));
                    ws.send(JSON.stringify({ type: 'response.create' }));
                },
                sendToolResponse(callId, result) {
                    // NOTE: the `scheduling` hint accepted by other providers is
                    // not part of this protocol and is not forwarded here.
                    if (ws.readyState !== WebSocket.OPEN)
                        return;
                    ws.send(JSON.stringify({
                        type: 'conversation.item.create',
                        item: {
                            type: 'function_call_output',
                            call_id: callId,
                            output: JSON.stringify(result),
                        },
                    }));
                    ws.send(JSON.stringify({ type: 'response.create' }));
                },
                sendActivityStart() {
                    // OpenAI uses server VAD by default
                },
                sendActivityEnd() {
                    if (ws.readyState !== WebSocket.OPEN)
                        return;
                    // Manual turn end: commit the audio buffer and request a response.
                    ws.send(JSON.stringify({ type: 'input_audio_buffer.commit' }));
                    ws.send(JSON.stringify({ type: 'response.create' }));
                },
                interrupt() {
                    if (ws.readyState !== WebSocket.OPEN)
                        return;
                    ws.send(JSON.stringify({ type: 'response.cancel' }));
                },
                async close() {
                    ws.close();
                },
            };
            return transport;
        },
    };
}
|
|
1386
|
+
/**
 * Build a realtime transport provider from a user-supplied adapter.
 *
 * The `custom` adapter supplies connection builders (`buildUrl`, optional
 * `buildHeaders` / `buildSetupMessage`), an inbound `parseMessage`, and
 * outbound `format*` serializers. Optional hooks (`formatVideo`,
 * `formatActivityStart`, `formatActivityEnd`, `formatInterrupt`) are only
 * invoked when present.
 *
 * @param {object} custom - Adapter with `name`, `buildUrl`, `parseMessage`,
 *   and the serializers described above.
 * @returns {{name: string, connect: Function}} Provider whose `connect(config,
 *   handlers)` opens a WebSocket and resolves to a transport object.
 */
function customProvider(custom) {
    return {
        name: custom.name,
        async connect(config, handlers) {
            const url = custom.buildUrl(config);
            const headers = custom.buildHeaders?.(config);
            const ws = new WebSocket(url, headers ? { headers } : undefined);
            // Wait for the socket to open; fail fast after 15s. Close the
            // socket before rejecting so a failed connect does not leak a
            // half-open connection.
            await new Promise((resolve, reject) => {
                const timeout = setTimeout(() => {
                    ws.close();
                    reject(new Error('Connection timeout'));
                }, 15000);
                ws.onopen = () => {
                    clearTimeout(timeout);
                    const setupMsg = custom.buildSetupMessage?.(config);
                    if (setupMsg)
                        ws.send(JSON.stringify(setupMsg));
                    resolve();
                };
                ws.onerror = (e) => {
                    clearTimeout(timeout);
                    ws.close();
                    reject(new Error(e.message ?? 'connection failed'));
                };
            });
            // Dispatch each parsed inbound frame to the matching handler.
            // A null/undefined result from parseMessage means "ignore frame".
            ws.onmessage = (event) => {
                try {
                    const raw = typeof event.data === 'string' ? event.data : event.data.toString();
                    const parsed = custom.parseMessage(raw);
                    if (!parsed)
                        return;
                    switch (parsed.kind) {
                        case 'audio':
                            handlers.audio(parsed.data, parsed.sampleRate);
                            break;
                        case 'text':
                            handlers.text(parsed.text, parsed.isFinal);
                            break;
                        case 'transcript':
                            handlers.transcript(parsed.text, parsed.source, parsed.isFinal);
                            break;
                        case 'toolCall':
                            handlers.toolCall(parsed.id, parsed.name, parsed.args);
                            break;
                        case 'turnComplete':
                            handlers.turnComplete();
                            break;
                        case 'interrupted':
                            handlers.interrupted();
                            break;
                        case 'thought':
                            handlers.thought(parsed.text);
                            break;
                        case 'usage':
                            handlers.usage(parsed.total, parsed.input, parsed.output);
                            break;
                        case 'error':
                            handlers.error(new Error(parsed.message));
                            break;
                        case 'close':
                            handlers.close(parsed.reason);
                            break;
                        case 'resumeToken':
                            handlers.resumeToken(parsed.token);
                            break;
                    }
                }
                catch (err) {
                    handlers.error(new Error(`Parse error: ${err.message}`));
                }
            };
            ws.onerror = (e) => handlers.error(new Error(e.message ?? 'unknown'));
            ws.onclose = () => handlers.close('connection_closed');
            // Transport facade: every sender is a no-op unless the socket is
            // OPEN; optional adapter hooks are skipped when absent.
            return {
                sendAudio(pcm, rate) {
                    if (ws.readyState === WebSocket.OPEN)
                        ws.send(custom.formatAudio(pcm, rate));
                },
                sendVideo(jpeg) {
                    if (ws.readyState === WebSocket.OPEN && custom.formatVideo)
                        ws.send(custom.formatVideo(jpeg));
                },
                sendText(text, role) {
                    if (ws.readyState === WebSocket.OPEN)
                        ws.send(custom.formatText(text, role));
                },
                sendToolResponse(id, result, sched) {
                    if (ws.readyState === WebSocket.OPEN)
                        ws.send(custom.formatToolResponse(id, result, sched));
                },
                sendActivityStart() {
                    if (ws.readyState === WebSocket.OPEN && custom.formatActivityStart)
                        ws.send(custom.formatActivityStart());
                },
                sendActivityEnd() {
                    if (ws.readyState === WebSocket.OPEN && custom.formatActivityEnd)
                        ws.send(custom.formatActivityEnd());
                },
                interrupt() {
                    if (ws.readyState === WebSocket.OPEN && custom.formatInterrupt)
                        ws.send(custom.formatInterrupt());
                },
                async close() {
                    ws.close();
                },
            };
        },
    };
}
|
|
1472
|
+
// ═════════════════════════════════════════════════════════════════════════════
|
|
1473
|
+
// FACTORY
|
|
1474
|
+
// ═════════════════════════════════════════════════════════════════════════════
|
|
1475
|
+
/** Create a new Cortex instance. */
|
|
1476
|
+
/**
 * Factory for the library entry point.
 *
 * @returns {Cortex} A freshly constructed Cortex instance.
 */
function createCortex() {
    const instance = new Cortex();
    return instance;
}
|
|
1479
|
+
//# sourceMappingURL=cortex.js.map
|