@empir3/empir3-bridge 0.3.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +1531 -0
- package/CODE_OF_CONDUCT.md +9 -0
- package/CONTRIBUTING.md +75 -0
- package/LICENSE +21 -0
- package/README.md +464 -0
- package/SECURITY.md +130 -0
- package/assets/accuracy-lab.html +2639 -0
- package/assets/api-clis-real.jpg +0 -0
- package/assets/bridge-console-hero.jpg +0 -0
- package/assets/browser-privacy.svg +151 -0
- package/assets/demo-orchestration.svg +74 -0
- package/assets/desktop-select-region.jpg +0 -0
- package/assets/in-page-chat.gif +0 -0
- package/assets/orchestration-hero.svg +126 -0
- package/assets/social-preview.png +0 -0
- package/assets/zara-accent.png +0 -0
- package/build/bootstrap.js +548 -0
- package/build/build.js +680 -0
- package/build/payload-entry.js +649 -0
- package/build/payload-signing-pub.json +7 -0
- package/docs/AGENT_GUIDE.md +259 -0
- package/docs/RELEASE.md +106 -0
- package/docs/SAFETY.md +112 -0
- package/docs/TESTING.md +181 -0
- package/installer/server.js +231 -0
- package/installer/ui/app.js +278 -0
- package/installer/ui/index.html +24 -0
- package/installer/ui/styles.css +146 -0
- package/package.json +95 -0
- package/scripts/bootstrap-e2e.mjs +650 -0
- package/scripts/certify-bridge.mjs +636 -0
- package/scripts/check-companion-surface.mjs +118 -0
- package/scripts/extract-welcome.mjs +64 -0
- package/scripts/gh-route-handler-check.mjs +57 -0
- package/scripts/gh-wire-test.mjs +107 -0
- package/scripts/publish-downloads.mjs +180 -0
- package/scripts/smoke-all-tools.mjs +509 -0
- package/scripts/smoke-live-bridge.mjs +696 -0
- package/scripts/splice-welcome.mjs +63 -0
- package/scripts/welcome-body.txt +2733 -0
- package/src/anthropic-client.ts +192 -0
- package/src/bootstrap-exe.ts +69 -0
- package/src/bridge.ts +2444 -0
- package/src/chat.ts +345 -0
- package/src/cli-runner.ts +239 -0
- package/src/cli.ts +649 -0
- package/src/config.ts +199 -0
- package/src/desktop-overlay.ps1 +121 -0
- package/src/executable-resolver.ts +330 -0
- package/src/handlers/agy-imagegen.ts +179 -0
- package/src/handlers/github-cli.ts +399 -0
- package/src/handlers/higgsfield-cli.ts +783 -0
- package/src/launch.js +337 -0
- package/src/mcp-server.ts +1265 -0
- package/src/pair-claim.ts +218 -0
- package/src/payload-daemon.ts +168 -0
- package/src/server.ts +21036 -0
- package/src/tool-defaults.ts +230 -0
- package/src/update-check.js +136 -0
- package/tray/build.py +76 -0
- package/tray/requirements.txt +2 -0
- package/tray/tray.py +1843 -0
|
@@ -0,0 +1,1265 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Empir3 Browser Bridge — MCP Server
|
|
3
|
+
*
|
|
4
|
+
* Exposes the browser bridge as Claude Code MCP tools.
|
|
5
|
+
* Requires the bridge daemon running on :3006 (run `npm start` in this repo).
|
|
6
|
+
*
|
|
7
|
+
* Register globally:
|
|
8
|
+
* claude mcp add empir3-browser -- npx tsx <path-to-bridge>/src/mcp-server.ts
|
|
9
|
+
*
|
|
10
|
+
* Tools:
|
|
11
|
+
* browser_status — Check bridge + Empir3 connection
|
|
12
|
+
* browser_navigate — Navigate to URL
|
|
13
|
+
* browser_click — Click element by CSS selector
|
|
14
|
+
* browser_click_ref — Click element by Empir3 ref (e.g., e5)
|
|
15
|
+
* browser_click_xy — Click viewport coordinates without DOM
|
|
16
|
+
* browser_type — Type text into element
|
|
17
|
+
* browser_type_ref — Type text into element ref
|
|
18
|
+
* browser_press — Press keyboard key
|
|
19
|
+
* browser_scroll — Scroll page
|
|
20
|
+
* browser_screenshot — Take screenshot (returns image)
|
|
21
|
+
* browser_snapshot — Get interactive element refs from accessibility tree
|
|
22
|
+
* browser_text — Extract page text
|
|
23
|
+
* browser_evaluate — Run JavaScript on page
|
|
24
|
+
* browser_highlight — Highlight element
|
|
25
|
+
* browser_chat — Send message to browser overlay
|
|
26
|
+
* browser_read_chat — Read chat history
|
|
27
|
+
* browser_record_start — Start recording user actions
|
|
28
|
+
* browser_record_stop — Stop recording and save
|
|
29
|
+
* browser_play — Play a saved recording
|
|
30
|
+
* browser_recordings — List saved recordings
|
|
31
|
+
* browser_refresh — Refresh the page
|
|
32
|
+
*
|
|
33
|
+
* desktop_monitors — List DPI-aware monitor bounds
|
|
34
|
+
* desktop_screenshot — Capture monitor(s) or a region; optional grid overlay
|
|
35
|
+
* desktop_cursor_position — Read current cursor position
|
|
36
|
+
* desktop_click — Click physical desktop coordinates
|
|
37
|
+
* desktop_hover — Move cursor to coordinates
|
|
38
|
+
* desktop_drag — Drag between coordinates
|
|
39
|
+
* desktop_snapshot — Enumerate UI Automation refs (d0..dN) for native apps
|
|
40
|
+
* desktop_click_ref — Click by snapshot ref
|
|
41
|
+
* desktop_hover_ref — Hover by snapshot ref
|
|
42
|
+
* desktop_overlay — Toggle click-through labeled-box overlay
|
|
43
|
+
* desktop_select_region — User drags a rectangle → sets agent focus (auto-scopes
|
|
44
|
+
* screenshot/snapshot to it). 30-min TTL.
|
|
45
|
+
* desktop_release_focus — Clear the agent-focus region
|
|
46
|
+
* desktop_focus_status — Report current focus state
|
|
47
|
+
*/
|
|
48
|
+
|
|
49
|
+
import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
|
|
50
|
+
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
|
|
51
|
+
import { z } from 'zod';
|
|
52
|
+
import { spawn } from 'child_process';
|
|
53
|
+
import { join, resolve } from 'path';
|
|
54
|
+
import { existsSync, readFileSync } from 'fs';
|
|
55
|
+
import { homedir } from 'os';
|
|
56
|
+
import { resolveBootstrapExe } from './bootstrap-exe';
|
|
57
|
+
|
|
58
|
+
// Filesystem anchors: this file's directory, the package root one level up,
// and the launch.js script that sits next to this file.
const SRC = __dirname;
const ROOT = resolve(SRC, '..');
const LAUNCHER = join(SRC, 'launch.js');

// Base URL every bridge HTTP call is sent to; BRIDGE_URL overrides the default.
const BRIDGE_URL = process.env.BRIDGE_URL || 'http://localhost:3006';

// Version reported by the MCP server: the payload-version env var when set,
// otherwise whatever readPackageVersion() finds in package.json.
const SERVER_VERSION = process.env.EMPIR3_BRIDGE_PAYLOAD_VERSION || readPackageVersion();
|
|
63
|
+
|
|
64
|
+
/**
 * Read the `version` field from the package.json in ROOT.
 *
 * @returns the declared version, or 'dev' when the file is missing,
 *          unparseable, or has no (truthy) version.
 */
function readPackageVersion(): string {
  try {
    const manifest = readFileSync(join(ROOT, 'package.json'), 'utf-8');
    const { version } = JSON.parse(manifest);
    return version || 'dev';
  } catch {
    return 'dev';
  }
}
|
|
72
|
+
|
|
73
|
+
/**
 * Resolve the bridge nonce.
 *
 * Lookup order: EMPIR3_BRIDGE_NONCE, then BRIDGE_NONCE, then the file
 * ~/.empir3-bridge/nonce. Values are whitespace-trimmed.
 *
 * @returns the nonce, or '' when no source yields one.
 */
function readBridgeNonce(): string {
  const fromEnv = (process.env.EMPIR3_BRIDGE_NONCE || process.env.BRIDGE_NONCE)?.trim();
  if (fromEnv) return fromEnv;

  try {
    const nonceFile = join(homedir(), '.empir3-bridge', 'nonce');
    return readFileSync(nonceFile, 'utf-8').trim();
  } catch {
    // An unreadable or missing nonce file simply means "no nonce".
    return '';
  }
}
|
|
82
|
+
|
|
83
|
+
/**
 * Build the HTTP headers for a bridge request.
 *
 * @param json when true (default), include `Content-Type: application/json`.
 * @returns headers, including `X-Empir3-Nonce` whenever readBridgeNonce()
 *          returns a non-empty value.
 */
function bridgeHeaders(json = true): Record<string, string> {
  const pairs: Array<[string, string]> = [];
  if (json) pairs.push(['Content-Type', 'application/json']);

  const nonce = readBridgeNonce();
  if (nonce) pairs.push(['X-Empir3-Nonce', nonce]);

  return Object.fromEntries(pairs);
}
|
|
90
|
+
|
|
91
|
+
// Mirror of SETTINGS_DIR in src/server.ts. The MCP shim runs as a separate
// process and can't import the server module, so the path is duplicated here
// to let us read bridge-settings.json at startup for handler-family gating.
/**
 * Load bridge-settings.json from `<APPDATA or ~/.empir3>/Empir3`.
 *
 * @returns the parsed settings, or `{}` when the file does not exist or
 *          cannot be read/parsed.
 */
function readBridgeSettingsFile(): any {
  try {
    const baseDir = process.env.APPDATA || join(homedir(), '.empir3');
    const settingsFile = join(baseDir, 'Empir3', 'bridge-settings.json');
    return existsSync(settingsFile)
      ? JSON.parse(readFileSync(settingsFile, 'utf-8'))
      : {};
  } catch {
    return {};
  }
}
|
|
104
|
+
|
|
105
|
+
/**
 * Report whether `handlers[family].enabled` is truthy in the settings file.
 *
 * @param family key looked up under `handlers` in bridge-settings.json.
 * @returns false when the settings, the family entry, or the flag is absent.
 */
function isHandlerFamilyEnabled(family: string): boolean {
  const familyConfig = readBridgeSettingsFile()?.handlers?.[family];
  return Boolean(familyConfig?.enabled);
}
|
|
109
|
+
|
|
110
|
+
/**
 * Report whether the settings file lists at least one custom provider.
 *
 * @returns true only when `customProviders` is a non-empty array.
 */
function hasAnyCustomProvider(): boolean {
  const providers = readBridgeSettingsFile()?.customProviders;
  return Array.isArray(providers) && providers.length > 0;
}
|
|
114
|
+
|
|
115
|
+
// ─── Helpers ─────────────────────────────────────────────────
|
|
116
|
+
|
|
117
|
+
/**
 * Send an HTTP request to the bridge and parse the JSON reply.
 *
 * @param path   request path appended to BRIDGE_URL.
 * @param method HTTP method; defaults to 'GET'.
 * @param body   optional payload; JSON-encoded and sent only when truthy.
 * @returns the parsed JSON response body.
 * @throws Error carrying the path, status code and response text when the
 *         response status is not ok.
 */
async function bridgeApi(path: string, method = 'GET', body?: any): Promise<any> {
  const request: RequestInit = body
    ? { method, headers: bridgeHeaders(), body: JSON.stringify(body) }
    : { method, headers: bridgeHeaders() };

  const response = await fetch(`${BRIDGE_URL}${path}`, request);
  if (response.ok) return response.json();

  throw new Error(`Bridge ${path}: ${response.status} ${await response.text()}`);
}
|
|
124
|
+
|
|
125
|
+
/**
 * POST a command to /api/command and unwrap its result.
 *
 * The envelope sets `action` to `cmd.type` and `channel` to 'mcp', then
 * spreads `cmd` over them (so fields on `cmd` win).
 *
 * @param cmd command object; `cmd.type` names the action.
 * @returns the `result` field of the bridge reply.
 * @throws Error when the reply is not ok, or when its result reports
 *         `success === false` or `ok === false`. The message is the first
 *         truthy value among the reply error, the result error, and
 *         'Command failed'.
 */
async function bridgeCommand(cmd: any): Promise<any> {
  const envelope = { action: cmd.type, channel: 'mcp', ...cmd };
  const reply = await bridgeApi('/api/command', 'POST', envelope);

  if (reply.ok) {
    const inner = reply.result;
    if (inner?.success !== false && inner?.ok !== false) return inner;
  }

  throw new Error(reply.error || reply.result?.error || 'Command failed');
}
|
|
130
|
+
|
|
131
|
+
/**
 * Fetch a browser screenshot from the bridge as raw bytes.
 *
 * @returns the response body as a Buffer.
 * @throws Error with the status code when the response is not ok.
 */
async function bridgeScreenshot(): Promise<Buffer> {
  // Width is capped at 1800px to stay under Claude's 2000px multi-image limit.
  const endpoint = `${BRIDGE_URL}/api/screenshot?maxWidth=1800`;
  const response = await fetch(endpoint, { headers: bridgeHeaders(false) });
  if (!response.ok) throw new Error(`Screenshot failed: ${response.status}`);
  return Buffer.from(await response.arrayBuffer());
}
|
|
138
|
+
|
|
139
|
+
/**
 * Wrap a value as an MCP text-content tool result.
 *
 * Strings pass through untouched. Anything else is pretty-printed as JSON
 * (null/undefined are encoded as an empty string, i.e. the text `""`).
 * Values JSON cannot represent — functions and symbols (for which
 * JSON.stringify returns undefined) or BigInt / cyclic structures (for which
 * it throws) — fall back to `String(value)` so the `text` field is always a
 * string and this helper never throws on them.
 *
 * @param text value to present to the MCP client.
 * @returns `{ content: [{ type: 'text', text }] }`.
 */
function textResult(text: unknown) {
  let normalized: string;
  if (typeof text === 'string') {
    normalized = text;
  } else {
    let encoded: string | undefined;
    try {
      encoded = JSON.stringify(text ?? '', null, 2);
    } catch {
      // BigInt or circular reference: JSON cannot encode it.
      encoded = undefined;
    }
    normalized = encoded ?? String(text);
  }
  return { content: [{ type: 'text' as const, text: normalized }] };
}
|
|
145
|
+
|
|
146
|
+
/**
 * Pretty-print `data` as 2-space-indented JSON and wrap it via textResult().
 *
 * @param data value to serialize.
 * @returns the text-content result produced by textResult().
 */
function jsonResult(data: any) {
  const pretty = JSON.stringify(data, null, 2);
  return textResult(pretty);
}
|
|
149
|
+
|
|
150
|
+
// ─── MCP Server ──────────────────────────────────────────────
|
|
151
|
+
|
|
152
|
+
// The MCP server instance all tools below are registered on.
const server = new McpServer({ name: 'empir3-browser', version: SERVER_VERSION });
|
|
156
|
+
|
|
157
|
+
// ── Status ───────────────────────────────────────────────────
|
|
158
|
+
|
|
159
|
+
// browser_status: GET /api/status; on failure, reply with a start-up hint
// instead of propagating the error.
server.tool(
  'browser_status',
  'Check browser bridge and Empir3 connection status',
  {},
  async () => {
    let status: any;
    try {
      status = await bridgeApi('/api/status');
    } catch (e: any) {
      return textResult(`Bridge not running: ${e.message}. Start it with: npm start (in the bridge repo)`);
    }
    return jsonResult(status);
  }
);
|
|
172
|
+
|
|
173
|
+
// bridge_tool_advisor: forwards the caller's intent to the bridge and returns
// the reply as JSON.
server.tool(
  'bridge_tool_advisor',
  'Discoverability helper: given a one-line description of what you\'re trying to do (e.g. "click a small icon in Photoshop", "type into a form on a website", "guide the user through a tutorial without taking their mouse"), returns the relevant tools and the matching slice of docs/AGENT_GUIDE.md. Call this FIRST when you are unsure which of the bridge\'s 50+ tools to use.',
  {
    intent: z
      .string()
      .describe('One-line description of what you are trying to do (intent, not tool name).'),
  },
  async ({ intent }) => jsonResult(await bridgeCommand({ type: 'bridge_tool_advisor', intent }))
);
|
|
184
|
+
|
|
185
|
+
// ── Bridge diagnostics, setup & safety (then Navigate) ───────
|
|
186
|
+
|
|
187
|
+
// bridge_reliability_status: sends `reliability_status` and returns the reply as JSON.
server.tool(
  'bridge_reliability_status',
  'Show bridge health, enabled tools, and recent action receipts.',
  {},
  async () => jsonResult(await bridgeCommand({ type: 'reliability_status' }))
);
|
|
196
|
+
|
|
197
|
+
// bridge_overlay_reinject: sends `overlay_reinject` tagged with reason 'mcp'.
server.tool(
  'bridge_overlay_reinject',
  'Re-inject and verify the browser overlay used for chat and recording capture.',
  {},
  async () => jsonResult(await bridgeCommand({ type: 'overlay_reinject', reason: 'mcp' }))
);
|
|
206
|
+
|
|
207
|
+
// bridge_setup_status: sends `bridge_setup_status` and returns the reply as JSON.
server.tool(
  'bridge_setup_status',
  'Report the first-use desktop setup checklist: overlay injection, monitor detection, saved click calibration, recording/playback readiness, and saved completion state.',
  {},
  async () => jsonResult(await bridgeCommand({ type: 'bridge_setup_status' }))
);
|
|
216
|
+
|
|
217
|
+
// bridge_setup_save: forwards the optional `completed` flag unchanged
// (undefined when omitted) with a `bridge_setup_save` command.
server.tool(
  'bridge_setup_save',
  'Save the current first-use desktop setup checklist result to bridge-settings.json so MCP and empir3 agents can confirm the device was calibrated.',
  {
    completed: z
      .boolean()
      .optional()
      .describe('Whether to mark setup complete. Default true.'),
  },
  async ({ completed }) => jsonResult(await bridgeCommand({ type: 'bridge_setup_save', completed }))
);
|
|
228
|
+
|
|
229
|
+
// bridge_reliability_smoke: sends `reliability_smoke` and returns the reply as JSON.
server.tool(
  'bridge_reliability_smoke',
  'Run monitor, desktop screenshot, and trusted browser coordinate-click checks.',
  {},
  async () => jsonResult(await bridgeCommand({ type: 'reliability_smoke' }))
);
|
|
238
|
+
|
|
239
|
+
// bridge_action_log: sends `action_log` and returns the reply as JSON.
server.tool(
  'bridge_action_log',
  'Read recent bridge action receipts for debugging failed or uncertain tool calls.',
  {},
  async () => jsonResult(await bridgeCommand({ type: 'action_log' }))
);
|
|
248
|
+
|
|
249
|
+
// bridge_safety_status: sends `safety_status` and returns the reply as JSON.
server.tool(
  'bridge_safety_status',
  'Show whether browser write controls, desktop controls, eval, or recordings are currently enabled.',
  {},
  async () => jsonResult(await bridgeCommand({ type: 'safety_status' }))
);
|
|
258
|
+
|
|
259
|
+
// bridge_revoke_control: note the wire command is `safety_lockdown`, not the tool name.
server.tool(
  'bridge_revoke_control',
  'Immediately disable browser interact, desktop, eval, and recording tools in bridge settings.',
  {},
  async () => jsonResult(await bridgeCommand({ type: 'safety_lockdown' }))
);
|
|
268
|
+
|
|
269
|
+
// browser_navigate: sends `navigate` and echoes the URL reported back by the bridge.
server.tool(
  'browser_navigate',
  'Navigate the browser to a URL',
  {
    url: z.string().describe('The URL to navigate to'),
  },
  async ({ url }) => {
    const page = await bridgeCommand({ type: 'navigate', url });
    return textResult(`Navigated to: ${page.url}`);
  }
);
|
|
278
|
+
|
|
279
|
+
// ── Tabs & Click ─────────────────────────────────────────────
|
|
280
|
+
|
|
281
|
+
// browser_tab_state: sends `browser_tab_state` and returns the reply as JSON.
server.tool(
  'browser_tab_state',
  'List bridge browser tabs and report which tab is agent-controlled versus user-focused. Use this before switching tab control so browsing by the user does not interrupt an agent tab.',
  {},
  async () => jsonResult(await bridgeCommand({ type: 'browser_tab_state' }))
);
|
|
290
|
+
|
|
291
|
+
// browser_tab_focus: the tool's `action` argument travels as `tabAction`
// (bridgeCommand already uses `action` for the command type) and falls back
// to 'user_focus' when omitted.
server.tool(
  'browser_tab_focus',
  'Explicitly mark a browser tab as the user focus or hand control of that tab to the agent. This never happens automatically just because the user opens a new tab.',
  {
    targetId: z
      .string()
      .optional()
      .describe('Browser target id from browser_tab_state. Preferred over URL.'),
    url: z
      .string()
      .optional()
      .describe('Fallback URL if targetId is not available.'),
    action: z
      .enum(['user_focus', 'control', 'show_agent'])
      .optional()
      .describe('user_focus marks where the user is looking; control hands the tab to the agent; show_agent brings the current agent tab forward. Default: user_focus.'),
  },
  async ({ targetId, url, action }) => {
    const tabAction = action || 'user_focus';
    return jsonResult(await bridgeCommand({ type: 'browser_tab_focus', targetId, url, tabAction }));
  }
);
|
|
304
|
+
|
|
305
|
+
// browser_click: sends `click` with the selector; the bridge reply is not surfaced.
server.tool(
  'browser_click',
  'Click an element by CSS selector',
  {
    selector: z.string().describe('CSS selector of element to click'),
  },
  async ({ selector }) => {
    const command = { type: 'click', selector };
    await bridgeCommand(command);
    return textResult(`Clicked: ${selector}`);
  }
);
|
|
314
|
+
|
|
315
|
+
// browser_click_ref: sends `click_ref` with the snapshot ref.
server.tool(
  'browser_click_ref',
  'Click an element by Empir3 ref (e.g., e5). Use browser_snapshot first to see available refs.',
  {
    ref: z.string().describe('Element ref from snapshot (e.g., "e5")'),
  },
  async ({ ref }) => {
    const command = { type: 'click_ref', ref };
    await bridgeCommand(command);
    return textResult(`Clicked ref: ${ref}`);
  }
);
|
|
324
|
+
|
|
325
|
+
// browser_click_xy: sends `click_xy` with viewport coordinates.
server.tool(
  'browser_click_xy',
  'Click viewport coordinates using native browser mouse events, without DOM selectors or refs.',
  {
    x: z.number().describe('Viewport x coordinate in CSS pixels'),
    y: z.number().describe('Viewport y coordinate in CSS pixels'),
  },
  async ({ x, y }) => {
    const command = { type: 'click_xy', x, y };
    await bridgeCommand(command);
    return textResult(`Clicked coordinates: ${x},${y}`);
  }
);
|
|
337
|
+
|
|
338
|
+
// ── Type ─────────────────────────────────────────────────────
|
|
339
|
+
|
|
340
|
+
// browser_type: sends `type` with the selector and text.
server.tool(
  'browser_type',
  'Type text into an element by CSS selector',
  {
    selector: z.string().describe('CSS selector of input element'),
    text: z.string().describe('Text to type'),
  },
  async ({ selector, text }) => {
    const command = { type: 'type', selector, text };
    await bridgeCommand(command);
    return textResult(`Typed "${text}" into ${selector}`);
  }
);
|
|
352
|
+
|
|
353
|
+
// browser_type_ref: sends `type_ref` with the snapshot ref and text.
server.tool(
  'browser_type_ref',
  'Type text into an element by Empir3 ref. Use browser_snapshot first to see available refs.',
  {
    ref: z.string().describe('Element ref from snapshot (e.g., "e3")'),
    text: z.string().describe('Text to type'),
  },
  async ({ ref, text }) => {
    const command = { type: 'type_ref', ref, text };
    await bridgeCommand(command);
    return textResult(`Typed "${text}" into ref:${ref}`);
  }
);
|
|
365
|
+
|
|
366
|
+
// ── Press ────────────────────────────────────────────────────
|
|
367
|
+
|
|
368
|
+
// browser_press: the key is sent in the command's `text` field.
server.tool(
  'browser_press',
  'Press a keyboard key (e.g., Enter, Tab, Escape, Control+a)',
  {
    key: z.string().describe('Key to press (e.g., "Enter", "Tab", "Control+a")'),
  },
  async ({ key }) => {
    const command = { type: 'press', text: key };
    await bridgeCommand(command);
    return textResult(`Pressed: ${key}`);
  }
);
|
|
377
|
+
|
|
378
|
+
// ── Scroll ───────────────────────────────────────────────────
|
|
379
|
+
|
|
380
|
+
// browser_scroll: sends `scroll` (x falls back to 0) and reports a summary
// built from the reply's scrolled / moved / position / scroll fields.
server.tool(
  'browser_scroll',
  'Scroll the page. Positive y = down, negative y = up.',
  {
    y: z.number().describe('Vertical scroll amount in pixels (positive=down, negative=up)'),
    x: z.number().optional().describe('Horizontal scroll amount in pixels'),
  },
  async ({ y, x }) => {
    const outcome = await bridgeCommand({ type: 'scroll', x: x || 0, y });
    const summary = {
      requested: outcome.scrolled,
      moved: outcome.moved,
      position: outcome.position,
      scroll: outcome.scroll,
    };
    return jsonResult(summary);
  }
);
|
|
397
|
+
|
|
398
|
+
// ── Screenshot ───────────────────────────────────────────────
|
|
399
|
+
|
|
400
|
+
// browser_screenshot: returns the bytes from bridgeScreenshot() as a
// base64-encoded image labelled image/jpeg.
server.tool(
  'browser_screenshot',
  'Take a screenshot of the current browser page. Returns the image.',
  {},
  async () => {
    const shot = await bridgeScreenshot();
    const image = {
      type: 'image' as const,
      data: shot.toString('base64'),
      mimeType: 'image/jpeg',
    };
    return { content: [image] };
  }
);
|
|
415
|
+
|
|
416
|
+
// ── Desktop (monitors, screenshot, input, snapshot) ──────────
|
|
417
|
+
|
|
418
|
+
// desktop_monitors: sends `desktop_monitors` and returns the reply as JSON.
server.tool(
  'desktop_monitors',
  'List desktop monitors with DPI-aware physical bounds, including negative coordinates and working areas.',
  {},
  async () => jsonResult(await bridgeCommand({ type: 'desktop_monitors' }))
);
|
|
427
|
+
|
|
428
|
+
// desktop_cursor_position: sends `desktop_cursor_position` and returns the reply as JSON.
server.tool(
  'desktop_cursor_position',
  'Get the current desktop cursor position in DPI-aware physical virtual-screen coordinates.',
  {},
  async () => jsonResult(await bridgeCommand({ type: 'desktop_cursor_position' }))
);
|
|
437
|
+
|
|
438
|
+
// desktop_screenshot: asks the bridge for a capture, then attaches the image
// file it reports (stitchedPath, else the first capture's path) next to the
// JSON metadata. When no path is reported only the metadata is returned.
// If the reported file cannot be read from this process, the metadata and the
// read error are returned as an error result instead of throwing a bare
// filesystem error that would discard the daemon's reply.
server.tool(
  'desktop_screenshot',
  'Capture desktop screenshots in DPI-aware physical coordinates. Pass `monitor` for a whole display (all/primary/DISPLAY1/...), `region` for a native-res crop, `grid:true` to overlay coordinate gridlines + labels, and/or `marker:{x,y}` to draw a high-visibility crosshair + circle at proposed click coordinates BEFORE clicking. The marker is the "verify before clicking" loop: pick coords from the grid, re-screenshot with marker={x,y} to confirm it lands on the target, then desktop_click. Saves one wrong click per attempt vs. eyeballing.',
  {
    monitor: z.string().optional().describe('Monitor to capture: all, primary, DISPLAY1, DISPLAY2, or full device name. Default: all. Ignored when region is supplied.'),
    region: z.object({
      x: z.number().describe('Virtual-screen X (same coordinate space as desktop_click).'),
      y: z.number().describe('Virtual-screen Y.'),
      width: z.number().describe('Region width in pixels.'),
      height: z.number().describe('Region height in pixels.'),
    }).optional().describe('Optional rectangle to capture. When set, monitor is ignored.'),
    grid: z.union([
      z.boolean(),
      z.object({
        step: z.number().optional().describe('Grid step in pixels. Default 50.'),
        color: z.string().optional().describe('Hex color for grid lines + labels, e.g. "#7AC8FF".'),
        labels: z.enum(['virtual', 'local', 'none']).optional().describe('Coord labels: "virtual" (default, virtual-screen coords usable directly with desktop_click), "local" (region-relative), or "none".'),
        labelEvery: z.number().optional().describe('Label every Nth grid line. Default 2.'),
      })
    ]).optional().describe('Set to true for a default grid overlay, or pass an object to customize.'),
    marker: z.union([
      z.object({
        x: z.number(), y: z.number(),
        color: z.string().optional().describe('Hex color, e.g. "#FF7A33" (default).'),
        size: z.number().optional().describe('Diameter in pixels of the inner circle. Default 28.'),
        label: z.string().optional().describe('Text label drawn next to the marker. Default: "x,y".'),
      }),
      z.array(z.object({
        x: z.number(), y: z.number(),
        color: z.string().optional(),
        size: z.number().optional(),
        label: z.string().optional(),
      })),
    ]).optional().describe('Crosshair + circle marker(s) at the supplied virtual-screen coord(s). Use to visually verify proposed click coords land on the right element before firing desktop_click.'),
  },
  async ({ monitor, region, grid, marker }) => {
    // Pass `monitor` through verbatim (undefined when the caller omitted it) so
    // the daemon can distinguish an explicit monitor from the default and let
    // an explicit monitor win over an active focus region. The daemon defaults
    // to 'all' when neither monitor, region, nor a focus scope applies.
    const result = await bridgeCommand({ type: 'desktop_screenshot', monitor, region, grid, marker });
    const path = result.stitchedPath || result.captures?.[0]?.path;
    if (!path) return jsonResult(result);

    let buf: Buffer;
    try {
      buf = readFileSync(path);
    } catch (e: any) {
      // The capture file is missing or unreadable from this process; keep the
      // daemon's metadata visible and flag the call as failed.
      return {
        content: [
          { type: 'text' as const, text: JSON.stringify(result, null, 2) },
          { type: 'text' as const, text: `Screenshot captured but the image file could not be read (${path}): ${e?.message ?? e}` },
        ],
        isError: true,
      };
    }

    return {
      content: [
        { type: 'text' as const, text: JSON.stringify(result, null, 2) },
        { type: 'image' as const, data: buf.toString('base64'), mimeType: 'image/png' },
      ],
    };
  }
);
|
|
490
|
+
|
|
491
|
+
// desktop_click: when `space` is omitted it resolves to 'monitor' if a monitor
// was supplied, otherwise 'desktop'. `double` is coerced to a boolean and
// `button` falls back to 'left'.
server.tool(
  'desktop_click',
  'Click desktop coordinates using Windows DPI-aware physical virtual-screen coordinates. By default x/y are absolute virtual-screen. Pass monitor to use monitor-relative coords, or space:"focus" to use coords relative to the active agent-focus region (top-left = 0,0). The persisted desktop calibration is auto-applied.',
  {
    x: z.number().describe('X coordinate. Absolute virtual-screen by default; monitor-relative when monitor is supplied; focus-relative when space:"focus".'),
    y: z.number().describe('Y coordinate. See x.'),
    monitor: z.string().optional().describe('Optional monitor id such as DISPLAY1 or DISPLAY2. When supplied, x/y are monitor-relative.'),
    space: z.enum(['desktop', 'monitor', 'focus']).optional().describe('Coordinate space. "focus" adds the active focus region\'s origin to x/y — use this when you read coords off a focus-cropped screenshot. Requires an active desktop_select_region.'),
    double: z.boolean().optional().describe('Double-click instead of single-click.'),
    button: z.enum(['left', 'right', 'middle']).optional().describe('Mouse button. Default: left.'),
  },
  async ({ x, y, monitor, space, double, button }) => {
    const command = {
      type: 'desktop_click',
      x,
      y,
      monitor,
      space: space || (monitor ? 'monitor' : 'desktop'),
      double: !!double,
      button: button || 'left',
    };
    return jsonResult(await bridgeCommand(command));
  }
);
|
|
516
|
+
|
|
517
|
+
// desktop_hover: same coordinate-space resolution as desktop_click — an
// omitted `space` becomes 'monitor' when a monitor is given, else 'desktop'.
server.tool(
  'desktop_hover',
  'Move the desktop cursor using Windows DPI-aware physical virtual-screen coordinates. By default x/y are absolute. Pass monitor for monitor-relative or space:"focus" for focus-relative coords.',
  {
    x: z.number(),
    y: z.number(),
    monitor: z.string().optional(),
    space: z.enum(['desktop', 'monitor', 'focus']).optional(),
  },
  async ({ x, y, monitor, space }) => {
    const command = {
      type: 'desktop_hover',
      x,
      y,
      monitor,
      space: space || (monitor ? 'monitor' : 'desktop'),
    };
    return jsonResult(await bridgeCommand(command));
  }
);
|
|
538
|
+
|
|
539
|
+
// desktop_drag: an omitted `space` becomes 'monitor' when a monitor is given,
// else 'desktop'. durationMs and steps are forwarded as given (undefined when
// omitted); `button` falls back to 'left'.
server.tool(
  'desktop_drag',
  'Drag between desktop coordinates using Windows DPI-aware physical virtual-screen coordinates. By default endpoints are absolute. Pass monitor for monitor-relative or space:"focus" for focus-relative endpoints.',
  {
    x: z.number(),
    y: z.number(),
    toX: z.number(),
    toY: z.number(),
    monitor: z.string().optional(),
    space: z.enum(['desktop', 'monitor', 'focus']).optional(),
    durationMs: z.number().optional().describe('Drag duration in milliseconds. Default: 500.'),
    steps: z.number().optional().describe('Interpolation steps. Default: 24.'),
    button: z.enum(['left', 'right', 'middle']).optional().describe('Mouse button. Default: left.'),
  },
  async ({ x, y, toX, toY, monitor, space, durationMs, steps, button }) => {
    const command = {
      type: 'desktop_drag',
      x,
      y,
      toX,
      toY,
      monitor,
      space: space || (monitor ? 'monitor' : 'desktop'),
      durationMs,
      steps,
      button: button || 'left',
    };
    return jsonResult(await bridgeCommand(command));
  }
);
|
|
570
|
+
|
|
571
|
+
// desktop_snapshot: fills in scope 'foreground' and maxElements 200 when the
// caller omits them, then forwards the command.
server.tool(
  'desktop_snapshot',
  'Enumerate visible interactive elements (buttons, menus, inputs, list items, tabs) on the desktop via Windows UI Automation. Returns refs like "d0", "d1" with role, name, bounds, and owning window. Use these refs with desktop_click_ref and desktop_hover_ref — far more reliable than guessing pixel coordinates from a screenshot. Best on native Win32/UWP apps; Electron/Chromium apps need accessibility enabled. Default scope is the foreground window.',
  {
    scope: z
      .enum(['foreground', 'all-windows'])
      .optional()
      .describe('"foreground" (default) walks only the active window; "all-windows" enumerates every visible top-level window.'),
    maxElements: z
      .number()
      .optional()
      .describe('Cap on returned elements. Default 200, max 500.'),
  },
  async ({ scope, maxElements }) => {
    const command = {
      type: 'desktop_snapshot',
      scope: scope || 'foreground',
      maxElements: maxElements ?? 200,
    };
    return jsonResult(await bridgeCommand(command));
  }
);
|
|
583
|
+
|
|
584
|
+
// desktop_snapshot_som: forwards the optional region unchanged and defaults
// maxElements to 200 when omitted.
server.tool(
  'desktop_snapshot_som',
  'Set-of-Mark snapshot for the agent-focus region (or an explicit region). Runs a UIA enumeration, filters to elements inside the region, takes a focus-scoped screenshot, and DRAWS numbered colored boxes (1..N) directly on the image. The agent reads the numbers off the image and acts with desktop_click_ref using the returned `ref`. Removes pixel-coordinate guessing for native Win32 apps. For CEF/Electron/games where UIA returns nothing, this returns `empty: true` (vision-based fallback is a separate tool).',
  {
    region: z
      .object({
        x: z.number(),
        y: z.number(),
        width: z.number(),
        height: z.number(),
      })
      .optional()
      .describe('Optional region override. Defaults to the active agent-focus region, then to the foreground window bounds.'),
    maxElements: z
      .number()
      .optional()
      .describe('Cap on enumerated elements. Default 200, max 500.'),
  },
  async ({ region, maxElements }) => {
    const command = { type: 'desktop_snapshot_som', region, maxElements: maxElements ?? 200 };
    return jsonResult(await bridgeCommand(command));
  }
);
|
|
598
|
+
|
|
599
|
+
// Tool: desktop_click_ref — sends the ref to the bridge with the button
// defaulted to 'left' and `double` coerced to a strict boolean.
server.tool(
  'desktop_click_ref',
  'Click a desktop element by ref returned from desktop_snapshot (e.g. "d3"). Resolves to the element bounds center and performs a real Win32 click. Refs are invalidated by the next desktop_snapshot.',
  {
    ref: z
      .string()
      .describe('Desktop ref from the most recent desktop_snapshot, e.g. "d0".'),
    button: z
      .enum(['left', 'right', 'middle'])
      .optional()
      .describe('Mouse button. Default: left.'),
    double: z
      .boolean()
      .optional()
      .describe('Double-click instead of single-click.'),
  },
  async ({ ref, button, double }) =>
    jsonResult(
      await bridgeCommand({
        type: 'desktop_click_ref',
        ref,
        button: button ? button : 'left',
        double: Boolean(double),
      })
    )
);
|
|
612
|
+
|
|
613
|
+
// Tool: desktop_hover_ref — passes the ref straight through to the bridge
// and returns the reply as JSON.
server.tool(
  'desktop_hover_ref',
  'Move the cursor to the center of a desktop element by ref returned from desktop_snapshot. Useful for hover-revealed tooltips and menus.',
  {
    ref: z
      .string()
      .describe('Desktop ref from the most recent desktop_snapshot, e.g. "d0".'),
  },
  async ({ ref }) => jsonResult(await bridgeCommand({ type: 'desktop_hover_ref', ref }))
);
|
|
624
|
+
|
|
625
|
+
// Tool: desktop_select_region — forwards timeoutMs and keepOpen unchanged
// (no defaults are applied on this side) and returns the bridge reply.
server.tool(
  'desktop_select_region',
  'Open a fullscreen overlay that lets the USER drag a rectangle around the area they want help with. Blocks until the user finishes selecting (or hits Esc to cancel). On success, sets the bridge\'s "agent focus" to that region — subsequent desktop_screenshot / desktop_snapshot calls automatically scope to it unless the caller explicitly passes its own region. A small "Agent focus active" chip appears anchored to the region so the user can see what the agent is looking at. By default the region uses IDLE-REVOKE: every real scoped use (screenshot, click, snapshot, etc.) resets a 30-minute timer, so active work never drops it, but a region left untouched for 30 min auto-clears. Pass keepOpen:true for PERSISTENT mode (no expiry — lives until desktop_release_focus); useful for long-running watches. Returns the selected region in virtual-screen coords plus the resolved persist flag.',
  {
    timeoutMs: z
      .number()
      .optional()
      .describe('How long to wait for the user before giving up. Default 60000 (60s), max 120000.'),
    keepOpen: z
      .boolean()
      .optional()
      .describe('Persistent mode: keep the focus region until explicitly released (desktop_release_focus), with NO idle expiry. Default false → idle-revoke (auto-clears after 30 min of no scoped use). If the user has set a global keep-open default, that applies when this is omitted; pass false to force idle-revoke regardless.'),
  },
  async ({ timeoutMs, keepOpen }) =>
    jsonResult(await bridgeCommand({ type: 'desktop_select_region', timeoutMs, keepOpen }))
);
|
|
637
|
+
|
|
638
|
+
// Tool: desktop_release_focus — takes no arguments; issues the command to the
// bridge and returns its reply as JSON.
server.tool(
  'desktop_release_focus',
  'Clear the bridge\'s current agent-focus region (if any). After this, desktop_screenshot/desktop_snapshot revert to whole-monitor or foreground-window behavior. The on-screen chip disappears.',
  {},
  async () => jsonResult(await bridgeCommand({ type: 'desktop_release_focus' }))
);
|
|
647
|
+
|
|
648
|
+
// Tool: desktop_focus_status — argument-free status query; the bridge reply
// is returned as JSON.
server.tool(
  'desktop_focus_status',
  'Report whether a desktop focus region is currently active, with bounds, mode ("idle-revoke" or "persistent"), persist flag, and remainingMs (null for persistent regions — they have no expiry; otherwise ms until idle auto-clear, which resets on every scoped use). Pure status reads do NOT extend the region.',
  {},
  async () => jsonResult(await bridgeCommand({ type: 'desktop_focus_status' }))
);
|
|
657
|
+
|
|
658
|
+
// Tool: desktop_pointer_show — forwards coordinates, label, coordinate space
// and the calibration flag to the bridge exactly as received.
server.tool(
  'desktop_pointer_show',
  'Show a click-through "ghost cursor" overlay at the given absolute screen coords. The cursor is purely visual — clicks pass straight through, the user\'s real mouse is unaffected. Use this to draw the user\'s attention to a specific spot ("I\'m looking here / I think you should click here") without taking control. Optional label paints a small pill next to the arrow. Stays visible until desktop_pointer_hide. By default the persisted desktop calibration is applied so the pointer lands at the same physical pixel a desktop_click would hit; pass noCalibration:true to render at raw requested coords.',
  {
    x: z
      .number()
      .describe('X coordinate. Absolute virtual-screen by default, or relative to the agent-focus region top-left when space:"focus".'),
    y: z
      .number()
      .describe('Y coordinate. See x.'),
    label: z
      .string()
      .optional()
      .describe('Short text shown beside the cursor (max 80 chars).'),
    space: z
      .enum(['desktop', 'focus'])
      .optional()
      .describe('"desktop" (default): x/y are absolute virtual-screen coords. "focus": x/y are relative to the top-left of the user\'s desktop_select_region selection (requires an active focus). Use focus when you\'re reading coords off a focus-cropped screenshot so you don\'t have to add focus.x/focus.y manually.'),
    noCalibration: z
      .boolean()
      .optional()
      .describe('Skip the click-calibration transform (render at raw coords). Default false.'),
  },
  async ({ x, y, label, space, noCalibration }) =>
    jsonResult(
      await bridgeCommand({ type: 'desktop_pointer_show', x, y, label, space, noCalibration })
    )
);
|
|
673
|
+
|
|
674
|
+
// Tool: desktop_click_page — hands the target (selector, ref, or cssX/cssY)
// plus button/double to the bridge unchanged and returns the reply as JSON.
server.tool(
  'desktop_click_page',
  'Perform a REAL OS-level mouse click on an element inside the bridge\'s OWN Chrome page. Give it a CSS selector, a browser_snapshot ref, or raw cssX/cssY viewport coords; the bridge maps the page coordinate to a physical virtual-screen pixel (content-window origin + devicePixelRatio + persisted click calibration) and dispatches a hardware click there. Use this instead of browser_click when a target needs a TRUSTED, OS-level click (drag handles, native-feel widgets, trusted-event-gated UIs) or when you want the real cursor to move onto the element. Brings the bridge Chrome window to the front first. Windows-only; bridge\'s Chrome only.',
  {
    selector: z
      .string()
      .optional()
      .describe('CSS selector of the target element. The element\'s bounding-box center is used.'),
    ref: z
      .string()
      .optional()
      .describe('A browser_snapshot element ref (e.g. "e5") — resolved via [data-empir3-ref].'),
    cssX: z
      .number()
      .optional()
      .describe('Raw CSS-viewport X (px). Use with cssY when you have explicit page coords instead of an element.'),
    cssY: z
      .number()
      .optional()
      .describe('Raw CSS-viewport Y (px). See cssX.'),
    button: z
      .enum(['left', 'right', 'middle'])
      .optional()
      .describe('Mouse button. Default left.'),
    double: z
      .boolean()
      .optional()
      .describe('Double-click instead of single. Default false.'),
  },
  async ({ selector, ref, cssX, cssY, button, double }) =>
    jsonResult(
      await bridgeCommand({ type: 'desktop_click_page', selector, ref, cssX, cssY, button, double })
    )
);
|
|
690
|
+
|
|
691
|
+
// Tool: desktop_pointer_page — forwards the page target and optional label to
// the bridge and returns the reply as JSON.
server.tool(
  'desktop_pointer_page',
  'Show the click-through "ghost cursor" overlay on top of an element in the bridge\'s own Chrome page (CSS selector, snapshot ref, or cssX/cssY). Same page→physical-screen mapping as desktop_click_page, but visual-only — the user\'s real mouse is untouched and no click happens. Use to point at a page element ("I\'m looking at this button") without taking control. Windows-only; bridge\'s Chrome only.',
  {
    selector: z
      .string()
      .optional()
      .describe('CSS selector of the element to point at (bounding-box center).'),
    ref: z
      .string()
      .optional()
      .describe('A browser_snapshot element ref.'),
    cssX: z
      .number()
      .optional()
      .describe('Raw CSS-viewport X (px), with cssY.'),
    cssY: z
      .number()
      .optional()
      .describe('Raw CSS-viewport Y (px).'),
    label: z
      .string()
      .optional()
      .describe('Short text shown beside the ghost cursor (max 80 chars).'),
  },
  async ({ selector, ref, cssX, cssY, label }) =>
    jsonResult(
      await bridgeCommand({ type: 'desktop_pointer_page', selector, ref, cssX, cssY, label })
    )
);
|
|
706
|
+
|
|
707
|
+
// Tool: page_to_screen — forwards the page target (selector, ref, or
// cssX/cssY) to the bridge and returns the reply as JSON.
server.tool(
  'page_to_screen',
  'Inspect-only: resolve an element in the bridge\'s Chrome page (CSS selector, snapshot ref, or cssX/cssY) to its physical virtual-screen coordinates. Returns the intended screen pixel (where the element actually is), the calibrated coordinate a real click would dispatch, the content-window origin, devicePixelRatio, and the element\'s CSS rect. Use to verify where desktop_click_page would land before committing, or to compute a screen coord for another desktop tool. No click, no cursor movement (but does bring the window to front to read its geometry). Windows-only.',
  {
    selector: z
      .string()
      .optional()
      .describe('CSS selector of the element (bounding-box center is mapped).'),
    ref: z
      .string()
      .optional()
      .describe('A browser_snapshot element ref.'),
    cssX: z
      .number()
      .optional()
      .describe('Raw CSS-viewport X (px), with cssY.'),
    cssY: z
      .number()
      .optional()
      .describe('Raw CSS-viewport Y (px).'),
  },
  async ({ selector, ref, cssX, cssY }) =>
    jsonResult(await bridgeCommand({ type: 'page_to_screen', selector, ref, cssX, cssY }))
);
|
|
721
|
+
|
|
722
|
+
// Tool: desktop_pointer_move — same argument set as desktop_pointer_show,
// forwarded to the bridge unchanged under the 'desktop_pointer_move' type.
server.tool(
  'desktop_pointer_move',
  'Reposition the ghost cursor to new coords. If no pointer is currently shown, this is equivalent to desktop_pointer_show. The overlay polls every ~40ms so updates feel near real-time. Applies the persisted calibration unless noCalibration:true. Supports space:"focus" for focus-relative coords (see desktop_pointer_show).',
  {
    x: z.number(),
    y: z.number(),
    label: z
      .string()
      .optional()
      .describe('Optional new label — omit to leave the current label unchanged.'),
    space: z.enum(['desktop', 'focus']).optional(),
    noCalibration: z.boolean().optional(),
  },
  async ({ x, y, label, space, noCalibration }) =>
    jsonResult(
      await bridgeCommand({ type: 'desktop_pointer_move', x, y, label, space, noCalibration })
    )
);
|
|
737
|
+
|
|
738
|
+
// Tool: desktop_pointer_pulse — forwards the optional x/y to the bridge and
// returns the reply as JSON.
server.tool(
  'desktop_pointer_pulse',
  'Trigger a one-shot expanding ring animation at the ghost cursor\'s current (or specified) position. Useful for "look here NOW" emphasis. Requires the pointer to already be shown.',
  {
    x: z
      .number()
      .optional()
      .describe('Optional: move + pulse in one call.'),
    y: z.number().optional(),
  },
  async ({ x, y }) => jsonResult(await bridgeCommand({ type: 'desktop_pointer_pulse', x, y }))
);
|
|
750
|
+
|
|
751
|
+
// Tool: desktop_pointer_hide — argument-free; issues the command to the
// bridge and returns its reply as JSON.
server.tool(
  'desktop_pointer_hide',
  'Hide the ghost cursor overlay if it is currently shown.',
  {},
  async () => jsonResult(await bridgeCommand({ type: 'desktop_pointer_hide' }))
);
|
|
760
|
+
|
|
761
|
+
// Tool: desktop_pointer_status — argument-free status query; the bridge reply
// is returned as JSON.
server.tool(
  'desktop_pointer_status',
  'Report whether the ghost cursor is currently shown, its position and label, and whether the overlay PS process is alive.',
  {},
  async () => jsonResult(await bridgeCommand({ type: 'desktop_pointer_status' }))
);
|
|
770
|
+
|
|
771
|
+
// Tool: desktop_calibrate_pointer — forwards monitor, area and persist to the
// bridge unchanged; no defaults are filled in on this side.
server.tool(
  'desktop_calibrate_pointer',
  'Run interactive multi-point click calibration. Shows 5 target crosshairs (corners + center) — the user clicks each one. Bridge fits a per-axis affine transform (scale + offset) from (target → actual_click) and persists it per monitor in bridge-settings.json. Every subsequent desktop_click (and desktop_pointer_show) auto-applies the transform. Run when clicks land off-target. WHEN AN AGENT-FOCUS REGION IS ACTIVE, this defaults to calibrating WITHIN the focus region — smaller overlay, tighter fit for the area the user actually cares about, fewer interruptions. Override with area:"monitor" to calibrate the whole monitor.',
  {
    monitor: z
      .string()
      .optional()
      .describe('Which monitor to calibrate: "primary" (default), "all", or a specific id like "DISPLAY4". Ignored when area="focus".'),
    area: z
      .enum(['focus', 'monitor', 'all'])
      .optional()
      .describe('"focus" (default when desktop_select_region is active) calibrates inside the focus region. "monitor" calibrates the whole monitor selected by `monitor`. "all" calibrates every monitor.'),
    persist: z
      .boolean()
      .optional()
      .describe('Save to bridge-settings.json. Default true.'),
  },
  async ({ monitor, area, persist }) =>
    jsonResult(await bridgeCommand({ type: 'desktop_calibrate_pointer', monitor, area, persist }))
);
|
|
784
|
+
|
|
785
|
+
// Tool: desktop_calibration_status — argument-free query; the bridge reply is
// returned as JSON.
server.tool(
  'desktop_calibration_status',
  'Return the persisted desktop click calibration (per-monitor affine transforms in v2, uniform offset in v1) or null if uncalibrated. Use this to check which monitors are calibrated and inspect the residual pixel error.',
  {},
  async () => jsonResult(await bridgeCommand({ type: 'desktop_calibration_status' }))
);
|
|
794
|
+
|
|
795
|
+
// Tool: desktop_pick_point — forwards the optional prompt and timeout to the
// bridge unchanged and returns the reply as JSON.
server.tool(
  'desktop_pick_point',
  'Ask the user to designate a point inside the agent-focus region. A semi-transparent capture overlay appears over the focus area with a banner prompt; user clicks anywhere inside; bridge returns the click position as focus-relative pixel, absolute pixel, AND chess-board cell coords (col/row/subX/subY matching the desktop_focus_grid overlay). Eliminates "click HERE → I have to guess where HERE is" round-trips when the user can show you. Blocks until click or Esc (max 60s default).',
  {
    prompt: z
      .string()
      .optional()
      .describe('Custom banner text shown to the user. Default: "Click the spot you want the agent to target".'),
    timeoutMs: z
      .number()
      .optional()
      .describe('Max wait. Default 60000, clamped to [5000, 120000].'),
  },
  async ({ prompt, timeoutMs }) =>
    jsonResult(await bridgeCommand({ type: 'desktop_pick_point', prompt, timeoutMs }))
);
|
|
807
|
+
|
|
808
|
+
// Tool: desktop_toolbar — sends the requested action to the bridge, using
// 'show' when none is supplied.
server.tool(
  'desktop_toolbar',
  'Open, close, or check the movable desktop toolbar widget. The toolbar exposes focus region, release focus, overlay chat injection, recording, playback, saved recording selection, and quick calibration on the monitor where the toolbar sits.',
  {
    action: z
      .enum(['show', 'hide', 'status'])
      .optional()
      .describe('show opens the toolbar, hide closes it, status reports whether it is running. Default: show.'),
  },
  async ({ action }) =>
    jsonResult(await bridgeCommand({ type: 'desktop_toolbar', action: action ? action : 'show' }))
);
|
|
819
|
+
|
|
820
|
+
// Tool: desktop_focus_grid — sends the requested action to the bridge, using
// 'toggle' when none is supplied.
server.tool(
  'desktop_focus_grid',
  'Toggle an on-screen click-through grid overlay covering the active agent-focus region. Same chess-board grid (~16 cells, integer pill labels on top + left edges) that goes into the focus screenshot — but drawn live ON the user\'s screen, so human and agent share the exact same coord system. User can say "click cell 8,7" reading off the on-screen labels and you call desktop_click_cell with those numbers, no screenshot round-trip. Overlay survives focus repositioning and auto-disappears when desktop_release_focus is called.',
  {
    action: z
      .enum(['show', 'hide', 'toggle', 'status'])
      .optional()
      .describe('Default: toggle.'),
  },
  async ({ action }) =>
    jsonResult(await bridgeCommand({ type: 'desktop_focus_grid', action: action ? action : 'toggle' }))
);
|
|
831
|
+
|
|
832
|
+
// Tool: desktop_click_cell — forwards the grid cell address, optional sub-cell
// offsets and click options to the bridge unchanged.
server.tool(
  'desktop_click_cell',
  'Click a cell of the agent-focus grid. The focus screenshot is overlaid with a chess-board grid (~16 cells across the larger dimension); pass the col/row you read off the pill labels and bridge translates to the cell center, then clicks. Requires an active desktop_select_region. Optional subX/subY (each in [-0.5, 0.5]) shifts within the cell — useful for sub-cell precision without taking a zoom screenshot.',
  {
    col: z
      .number()
      .int()
      .describe('Column index (1-indexed, matches the top-edge pill labels).'),
    row: z
      .number()
      .int()
      .describe('Row index (1-indexed, matches the left-edge pill labels).'),
    subX: z
      .number()
      .optional()
      .describe('Fractional X offset within the cell, -0.5 (left edge) to +0.5 (right edge). Default 0 (center).'),
    subY: z
      .number()
      .optional()
      .describe('Fractional Y offset within the cell, -0.5 (top) to +0.5 (bottom). Default 0 (center).'),
    button: z.enum(['left', 'right', 'middle']).optional(),
    double: z.boolean().optional(),
  },
  async ({ col, row, subX, subY, button, double }) =>
    jsonResult(
      await bridgeCommand({ type: 'desktop_click_cell', col, row, subX, subY, button, double })
    )
);
|
|
848
|
+
|
|
849
|
+
// Tool: desktop_pointer_cell — forwards the grid cell address, optional
// sub-cell offsets and label to the bridge unchanged.
server.tool(
  'desktop_pointer_cell',
  'Show the ghost cursor at the center of a focus-grid cell. Same addressing as desktop_click_cell — col/row (1-indexed) match the on-screen pill labels.',
  {
    col: z.number().int(),
    row: z.number().int(),
    subX: z.number().optional(),
    subY: z.number().optional(),
    label: z.string().optional(),
  },
  async ({ col, row, subX, subY, label }) =>
    jsonResult(
      await bridgeCommand({ type: 'desktop_pointer_cell', col, row, subX, subY, label })
    )
);
|
|
864
|
+
|
|
865
|
+
// Tool: desktop_screenshot_zoom — asks the bridge for a zoomed capture. When
// the reply names an image file (stitchedPath, else the first capture's path)
// the file is read and attached as a base64 PNG next to the JSON reply;
// otherwise only the JSON reply is returned.
server.tool(
  'desktop_screenshot_zoom',
  'Take a tight crop of the desktop centered on (x, y) at native resolution — no downscaling. Use this when you need pixel-accurate visual inspection of a small area before clicking or pointing at a specific element. Pass radius for the half-width of the crop (default 100 → 200×200 px square). A small green marker is drawn at the exact center of the returned image so you can verify your coord estimate. Set noMarker:true to omit it. Pass space:"focus" to interpret x/y as focus-relative coords.',
  {
    x: z
      .number()
      .describe('X of the crop center (absolute by default, focus-relative when space:"focus").'),
    y: z
      .number()
      .describe('Y of the crop center.'),
    radius: z
      .number()
      .optional()
      .describe('Half-width of the square crop in pixels. Default 100 (200×200 px). Clamped to [20, 800].'),
    space: z.enum(['desktop', 'focus']).optional(),
    noMarker: z
      .boolean()
      .optional()
      .describe('Skip the center marker. Default false.'),
  },
  async ({ x, y, radius, space, noMarker }) => {
    const reply = await bridgeCommand({ type: 'desktop_screenshot_zoom', x, y, radius, space, noMarker });
    const imagePath = reply.stitchedPath || reply.captures?.[0]?.path;
    if (imagePath) {
      // Synchronous read: the image bytes are inlined into the tool response.
      const encoded = readFileSync(imagePath).toString('base64');
      return {
        content: [
          { type: 'text' as const, text: JSON.stringify(reply, null, 2) },
          { type: 'image' as const, data: encoded, mimeType: 'image/png' },
        ],
      };
    }
    // No image path in the reply: hand back the raw bridge reply.
    return jsonResult(reply);
  }
);
|
|
888
|
+
|
|
889
|
+
// Tool: desktop_overlay — sends the requested action to the bridge, using
// 'toggle' when none is supplied.
server.tool(
  'desktop_overlay',
  'Toggle a click-through overlay that draws labeled rectangles over the elements from the most recent desktop_snapshot. The overlay is fully transparent to clicks/keys so it never blocks the user. Useful when an agent is driving the desktop and the human wants to see what is being targeted. The overlay auto-refreshes whenever a new snapshot is taken.',
  {
    action: z
      .enum(['show', 'hide', 'toggle', 'status'])
      .optional()
      .describe('"show" opens the overlay, "hide" closes it, "toggle" flips state, "status" returns the running state without changing it. Default: toggle.'),
  },
  async ({ action }) =>
    jsonResult(await bridgeCommand({ type: 'desktop_overlay', action: action ? action : 'toggle' }))
);
|
|
900
|
+
|
|
901
|
+
// Tool: browser_snapshot — requests a compact-format snapshot from the bridge
// (filter defaults to 'interactive'). A string snapshot is returned as plain
// text; any other shape is returned as JSON.
server.tool(
  'browser_snapshot',
  'Get interactive element refs from the accessibility tree. Use these refs with browser_click_ref and browser_type_ref. Much cheaper than screenshots.',
  {
    filter: z
      .enum(['interactive', 'all'])
      .optional()
      .describe('Filter: "interactive" (buttons, inputs) or "all" (everything). Default: interactive'),
  },
  async ({ filter }) => {
    const reply = await bridgeCommand({ type: 'snapshot', filter: filter || 'interactive', format: 'compact' });
    const tree = reply.snapshot;
    return typeof tree === 'string' ? textResult(tree) : jsonResult(tree);
  }
);
|
|
916
|
+
|
|
917
|
+
// ── Text ─────────────────────────────────────────────────────
|
|
918
|
+
|
|
919
|
+
// Tool: browser_text — returns the `text` field of the bridge reply, or the
// placeholder '(no text)' when that field is empty or missing.
server.tool(
  'browser_text',
  'Extract readable text content from the current page',
  {},
  async () => {
    const reply = await bridgeCommand({ type: 'text' });
    const pageText = reply.text ? reply.text : '(no text)';
    return textResult(pageText);
  }
);
|
|
928
|
+
|
|
929
|
+
// ── Evaluate ─────────────────────────────────────────────────
|
|
930
|
+
|
|
931
|
+
// Tool: browser_evaluate — forwards the script string to the bridge as an
// 'evaluate' command and returns the reply as JSON.
server.tool(
  'browser_evaluate',
  'Run JavaScript on the current page and return the result',
  {
    script: z
      .string()
      .describe('JavaScript expression to evaluate'),
  },
  async ({ script }) => jsonResult(await bridgeCommand({ type: 'evaluate', script }))
);
|
|
940
|
+
|
|
941
|
+
// ── Highlight ────────────────────────────────────────────────
|
|
942
|
+
|
|
943
|
+
// Tool: browser_highlight — sends the selector to the bridge; the bridge
// reply is discarded and a fixed confirmation line is returned instead.
server.tool(
  'browser_highlight',
  'Highlight an element on the page with a blue glow (for showing the user)',
  {
    selector: z
      .string()
      .describe('CSS selector to highlight'),
  },
  async ({ selector }) => {
    const confirmation = `Highlighted: ${selector}`;
    await bridgeCommand({ type: 'highlight', selector });
    return textResult(confirmation);
  }
);
|
|
952
|
+
|
|
953
|
+
// ── Chat ─────────────────────────────────────────────────────
|
|
954
|
+
|
|
955
|
+
// Tool: browser_chat — sends the message to the bridge as a 'chat' command;
// the bridge reply is discarded and the sent text is echoed back.
server.tool(
  'browser_chat',
  'Send a message to the user via the browser overlay chat panel',
  {
    message: z
      .string()
      .describe('Message to display in the browser chat panel'),
  },
  async ({ message }) => {
    const confirmation = `Sent to browser: ${message}`;
    await bridgeCommand({ type: 'chat', message });
    return textResult(confirmation);
  }
);
|
|
964
|
+
|
|
965
|
+
// Tool: browser_read_chat — fetches the chat log from the bridge's /api/chat
// endpoint and renders the newest entries as one text line per message:
// "[time] User|Claude: text", with optional screenshot/element suffixes.
server.tool(
  'browser_read_chat',
  'Read recent messages from the browser overlay chat',
  { limit: z.number().optional().describe('Number of messages to read (default: 20)') },
  async ({ limit }) => {
    const messages = await bridgeApi('/api/chat');
    // The schema accepts any number, so normalise it here. A negative limit
    // used to turn into slice(+n), which dropped the OLDEST n messages and
    // returned everything else instead of the newest n. Anything that is not
    // a count of at least 1 (undefined, 0, NaN, negatives, fractions below 1)
    // now falls back to the documented default of 20.
    const count = typeof limit === 'number' && limit >= 1 ? Math.floor(limit) : 20;
    const recent = messages.slice(-count);
    if (recent.length === 0) return textResult('No messages yet.');
    const formatted = recent.map((m: any) => {
      const time = new Date(m.timestamp).toLocaleTimeString();
      // Every sender other than 'user' is labelled as Claude.
      const from = m.from === 'user' ? 'User' : 'Claude';
      let line = `[${time}] ${from}: ${m.text}`;
      if (m.screenshot) line += ` [screenshot: ${m.screenshot}]`;
      if (m.selector) line += ` [element: ${m.selector}]`;
      return line;
    }).join('\n');
    return textResult(formatted);
  }
);
|
|
984
|
+
|
|
985
|
+
// ── Recording ────────────────────────────────────────────────
|
|
986
|
+
|
|
987
|
+
// Tool: browser_record_start — issues 'record_start' to the bridge and
// reports the `startUrl` field of the reply.
server.tool(
  'browser_record_start',
  'Start recording user interactions in the browser. The user clicks/types/scrolls and actions are captured with element refs.',
  {},
  async () => {
    const reply = await bridgeCommand({ type: 'record_start' });
    const notice = `Recording started at ${reply.startUrl}`;
    return textResult(notice);
  }
);
|
|
996
|
+
|
|
997
|
+
// Tool: browser_record_stop — issues 'record_stop' (the optional name travels
// in the command's `text` field) and summarises the saved recording.
server.tool(
  'browser_record_stop',
  'Stop recording and save. Returns the recording file name and action count.',
  {
    name: z
      .string()
      .optional()
      .describe('Name for the recording (default: auto-generated)'),
  },
  async ({ name }) => {
    const outcome = await bridgeCommand({ type: 'record_stop', text: name });
    const withRefs = outcome.refCount || 0;
    // A zero ref count is not a failure: agent-driven actions (browser_click/
    // type from this MCP) replay through their selector/evaluate steps, and
    // only overlay-captured user clicks carry accessibility refs. The wording
    // below is chosen so the zero case does not read as an error.
    let refNote = 'replays via selector/coordinate steps (no accessibility refs captured)';
    if (withRefs > 0) {
      refNote = `${withRefs} with accessibility refs`;
    }
    const seconds = (outcome.duration / 1000).toFixed(1);
    return textResult(`Saved: ${outcome.saved} (${outcome.actionCount} actions, ${seconds}s, ${refNote})`);
  }
);
|
|
1014
|
+
|
|
1015
|
+
// Tool: browser_play — issues 'play' with speed defaulted to 1 and variables
// defaulted to {}, then renders a pass/fail summary with one line per step.
server.tool(
  'browser_play',
  'Play a saved recording. Uses element refs for reliable replay with coordinate fallback.',
  {
    recording: z
      .string()
      .describe('Recording name to play'),
    speed: z
      .number()
      .optional()
      .describe('Playback speed multiplier (default: 1, range: 0.1-10)'),
    variables: z
      .record(z.string())
      .optional()
      .describe('Variable substitutions (e.g., {"EMAIL": "test@test.com"})'),
  },
  async ({ recording, speed, variables }) => {
    const run = await bridgeCommand({ type: 'play', recording, speed: speed || 1, variables: variables || {} });
    // One line per replayed step: check/cross mark, step number, action, the
    // replay method when reported, and the error text when the step failed.
    const stepLines = run.results.map(({ ok, step, action, method, error }: any) => {
      const mark = ok ? '\u2713' : '\u2717';
      const via = method ? ` [${method}]` : '';
      const failure = error ? ' — ' + error : '';
      return ` ${mark} Step ${step}: ${action}${via}${failure}`;
    });
    const header = `Playback "${run.name}": ${run.passed}/${run.total} passed, ${run.failed} failed.`;
    return textResult(`${header}\n${stepLines.join('\n')}`);
  }
);
|
|
1033
|
+
|
|
1034
|
+
// Tool: browser_recordings — fetches the list from the bridge's
// /api/recordings endpoint and prints one summary line per recording.
server.tool(
  'browser_recordings',
  'List all saved recordings',
  {},
  async () => {
    const saved = await bridgeApi('/api/recordings');
    if (saved.length === 0) return textResult('No recordings yet.');

    const describeRecording = (rec: any) => {
      const rawUrl = String(rec.startUrl || '');
      // Keep the listing readable: data: URIs collapse to a length marker and
      // any other URL longer than 100 characters is cut to 97 plus an ellipsis.
      let shownUrl = rawUrl;
      if (rawUrl.startsWith('data:')) {
        shownUrl = `data:… (${rawUrl.length}b)`;
      } else if (rawUrl.length > 100) {
        shownUrl = rawUrl.slice(0, 97) + '…';
      }
      const engineTag = rec.engine === 'empir3' ? '[empir3]' : '[legacy]';
      const seconds = (rec.duration / 1000).toFixed(1);
      return ` ${rec.name} (${rec.actionCount} actions, ${seconds}s) ${engineTag} — ${shownUrl}`;
    };

    return textResult(saved.map((rec: any) => describeRecording(rec)).join('\n'));
  }
);
|
|
1055
|
+
|
|
1056
|
+
// ── Refresh ──────────────────────────────────────────────────
|
|
1057
|
+
|
|
1058
|
+
// Tool: browser_refresh — issues 'refresh' to the bridge, discards the reply
// and returns a fixed confirmation.
server.tool(
  'browser_refresh',
  'Refresh the current browser page',
  {},
  async () => {
    const confirmation = 'Page refreshed';
    await bridgeCommand({ type: 'refresh' });
    return textResult(confirmation);
  }
);
|
|
1067
|
+
|
|
1068
|
+
// ─── Higgsfield CLI (handler-gated) ─────────────────────────
|
|
1069
|
+
//
|
|
1070
|
+
// Registered only when settings.handlers.higgsfield.enabled is true so the
|
|
1071
|
+
// tools never appear in a client's tool inventory unless the user has
|
|
1072
|
+
// flipped the tray toggle. The bridge dispatcher enforces the same gate
|
|
1073
|
+
// at command time (defense in depth) — see enforceHandlerFamilyGate() in
|
|
1074
|
+
// src/server.ts.
|
|
1075
|
+
if (isHandlerFamilyEnabled('higgsfield')) {
|
|
1076
|
+
// Tool: higgsfield_status — argument-free query; the bridge reply is returned
// as JSON.
server.tool(
  'higgsfield_status',
  'Check whether the higgsfield CLI is installed, authenticated, and ready.',
  {},
  async () => jsonResult(await bridgeCommand({ type: 'higgsfield_status' }))
);
|
|
1085
|
+
|
|
1086
|
+
// Tool: higgsfield_list — forwards the optional limit inside the command's
// `params` object and returns the bridge reply as JSON.
server.tool(
  'higgsfield_list',
  'List the user\'s recent Higgsfield generations.',
  {
    limit: z
      .number()
      .int()
      .positive()
      .max(200)
      .optional()
      .describe('Optional cap on results returned by the CLI.'),
  },
  async ({ limit }) =>
    jsonResult(await bridgeCommand({ type: 'higgsfield_list', params: { limit } }))
);
|
|
1095
|
+
|
|
1096
|
+
// Tool: higgsfield_models — forwards the optional model-type filter inside the
// command's `params` object and returns the bridge reply as JSON.
server.tool(
  'higgsfield_models',
  'List the available Higgsfield models so you can pick a valid `model` (job_set_type) for higgsfield_generate. Returns [{job_set_type, name, type}] where type is "image", "video", or "text". The catalog changes over time — ALWAYS call this to discover valid ids rather than guessing. Examples of current ids: z_image / flux_2 / seedream_v4_5 (text→image), nano_banana_2 / flux_kontext (image edit, need an --image), veo3_1 / kling3_0 / seedance_2_0 (video).',
  {
    type: z
      .enum(['image', 'video', 'text'])
      .optional()
      .describe('Optional filter to only image, video, or text models.'),
  },
  async ({ type }) =>
    jsonResult(await bridgeCommand({ type: 'higgsfield_models', params: { type } }))
);
|
|
1105
|
+
|
|
1106
|
+
server.tool(
|
|
1107
|
+
'higgsfield_generate',
|
|
1108
|
+
'Generate a Higgsfield image or video from a text prompt (and optional reference image). Returns the result URL plus a local artifact path under ~/.empir3-bridge/artifacts/higgsfield/. Costs money/quota on the user\'s Higgsfield account. HOW TO USE: (1) call higgsfield_models to get a valid `model` (job_set_type) and its type; (2) for text→image use an image model with just a prompt (e.g. z_image); (3) for image editing use an edit model AND pass `image` (e.g. nano_banana_2); (4) video models (e.g. veo3_1) take a prompt and run longer. Per-model knobs (aspect_ratio, resolution, duration, etc.) go in `extra`.',
|
|
1109
|
+
{
|
|
1110
|
+
model: z.string().describe('A Higgsfield job_set_type from higgsfield_models (e.g. "z_image", "nano_banana_2", "veo3_1"). NOT a free-form name — call higgsfield_models first if unsure.'),
|
|
1111
|
+
prompt: z.string().describe('Text prompt for the generation.'),
|
|
1112
|
+
image: z.string().optional().describe('Reference/input image for edit or image-conditioned models — an absolute path on disk or base64 bytes (optionally a data: URI). Required by edit models like nano_banana_2; ignored by pure text→image models.'),
|
|
1113
|
+
extra: z.record(z.union([z.string(), z.number(), z.boolean()])).optional().describe('Per-model params forwarded verbatim as --key value (e.g. {aspect_ratio:"16:9", resolution:"2048", duration:5}). See higgsfield model params for what each model accepts.'),
|
|
1114
|
+
waitTimeoutMs: z.number().int().positive().optional().describe('Max wait in milliseconds before the bridge gives up. Hard-capped at 20 minutes (videos can be slow).'),
|
|
1115
|
+
},
|
|
1116
|
+
async (params) => {
|
|
1117
|
+
const result = await bridgeCommand({ type: 'higgsfield_generate', params });
|
|
1118
|
+
return jsonResult(result);
|
|
1119
|
+
}
|
|
1120
|
+
);
|
|
1121
|
+
}
|
|
1122
|
+
|
|
1123
|
+
// ─── Lent CLIs — run another model's CLI one-shot ────────────
// Covers codex / grok / gemini / claude / agy. Every run is gated by that
// CLI's lend toggle (the bridge refuses a model that isn't lent). The driving
// agent does the orchestration — this tool is only the primitive that lends
// the seat.
server.tool(
  'cli_run',
  'Run another model\'s lent CLI with a prompt and get its text response back — so you (the driving agent) can pull a second LLM into a task: have Codex build something, Grok draft a spec, Gemini review, etc. The bridge handles each CLI\'s invocation quirks. Each model must be lent (toggle on the bridge\'s API & CLIs pane) or the run is refused. Spends the user\'s CLI subscription/quota. For a long run pass background:true and poll cli_run_status.',
  {
    model: z.enum(['codex', 'grok', 'gemini', 'claude', 'agy']).describe('Which lent CLI to run.'),
    prompt: z.string().optional().describe('The prompt to send. Provide this OR promptFile.'),
    promptFile: z.string().optional().describe('Path to a file whose contents are the prompt (use for very large prompts).'),
    cwd: z.string().optional().describe('Working directory to run the CLI in (relevant for agentic/file-writing work).'),
    mode: z.enum(['text', 'agentic']).optional().describe('"text" (default): read-only, just return the answer. "agentic": allow the CLI to write files in cwd (best supported on codex via its workspace-write sandbox).'),
    modelId: z.string().optional().describe('Optional underlying model id passed to the CLI (e.g. a specific Codex model). Omit for the CLI\'s default.'),
    background: z.boolean().optional().describe('Run without blocking; returns a run id immediately. Poll cli_run_status(id) / cli_runs for completion.'),
    timeoutMs: z.number().int().positive().optional().describe('Max wait in ms. Default 4 min, hard cap 20 min.'),
  },
  // The validated arguments are handed to the bridge as-is; the reply comes
  // back as a JSON tool result.
  async (request) => jsonResult(await bridgeCommand({ type: 'cli_run', params: request })),
);
|
|
1145
|
+
|
|
1146
|
+
// cli_runs — takes no arguments; returns the bridge's run listing as JSON.
server.tool(
  'cli_runs',
  'List recent cli_run invocations — id, model, status, duration, and transcript path. Use to see what lent-CLI runs are in flight or finished.',
  {},
  async () => jsonResult(await bridgeCommand({ type: 'cli_runs' })),
);
|
|
1155
|
+
|
|
1156
|
+
// cli_run_status — looks up a single run by id; only `id` is forwarded.
server.tool(
  'cli_run_status',
  'Get the status and output of a cli_run by id. Use to poll a background run to completion (status goes running → done/error/timeout) and read its text + transcript.',
  { id: z.string().describe('The run id returned by cli_run.') },
  async (args) => jsonResult(await bridgeCommand({ type: 'cli_run_status', params: { id: args.id } })),
);
|
|
1165
|
+
|
|
1166
|
+
// cli_status — takes no arguments; returns the bridge's per-model readiness
// report as JSON.
server.tool(
  'cli_status',
  'Discover which lent CLIs you can drive via cli_run RIGHT NOW. Returns one row per model (codex / grok / gemini / claude / agy) with available (installed), lent (owner toggled it on), authenticated (signed in), ready (all three), and blocker (cli_not_installed / not_lent / not_signed_in / null). Call this FIRST to route a task to a model that will actually run, instead of calling cli_run and getting a "not lent" refusal. For image/video use higgsfield_* instead.',
  {},
  async () => jsonResult(await bridgeCommand({ type: 'cli_status' })),
);
|
|
1175
|
+
|
|
1176
|
+
// ─── Custom LLMs (provider-count-gated) ──────────────────────
// A single generic tool that fans out to whichever custom LLMs the user has
// configured on the API & CLIs pane (Ollama, LM Studio, OpenRouter, vLLM, etc).
// It is registered only when at least one custom provider exists, so the
// permission toggle never shows up as a phantom "blocked" tool with nothing
// to dispatch to. The bridge dispatcher enforces the same gate at command
// time.
if (hasAnyCustomProvider()) {
  const customLlmShape = {
    provider: z.string().describe('Provider slug from the bridge\'s configured custom providers (e.g. "ollama-local", "openrouter").'),
    model: z.string().describe('Model id to use (must match a model the provider exposes — see provider.models).'),
    prompt: z.string().describe('User prompt for the chat completion.'),
    system: z.string().optional().describe('Optional system message.'),
  };

  server.tool(
    'custom_llm',
    'Send a chat-completion request to any custom LLM the user configured on the bridge\'s API & CLIs pane (Ollama, LM Studio, OpenRouter, Groq Cloud, vLLM, etc — any OpenAI-compatible endpoint). Use this to route a prompt through a local LLM or a cloud aggregator the user has set up.',
    customLlmShape,
    async (args) => {
      // Forward exactly the four declared fields, nothing else.
      const { provider, model, prompt, system } = args;
      const reply = await bridgeCommand({ type: 'custom_llm', params: { provider, model, prompt, system } });
      // Prefer top-level text, then nested result.text; when both are falsy
      // fall back to a pretty-printed dump of the whole reply.
      const text = reply?.text || reply?.result?.text || JSON.stringify(reply, null, 2);
      return textResult(text);
    },
  );
}
|
|
1199
|
+
|
|
1200
|
+
// ─── Auto-launch ────────────────────────────────────────────
|
|
1201
|
+
|
|
1202
|
+
/**
 * Probe the bridge's `/api/status` endpoint once.
 *
 * @param timeout Milliseconds before the request (headers and body) is aborted.
 * @returns `true` when the endpoint answers with an OK status and a JSON body
 *          whose `running` or `ok` field is truthy; `false` for any other
 *          outcome, including network errors, timeouts, and unparsable bodies.
 */
async function checkBridgeHealth(timeout = 2000): Promise<boolean> {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeout);
  try {
    const res = await fetch(`${BRIDGE_URL}/api/status`, { signal: controller.signal, headers: bridgeHeaders(false) });
    if (!res.ok) return false;
    // The abort timer is still armed here, so a body that never finishes
    // streaming is cut off too; the rejection is mapped to `null` below.
    const body = await res.json().catch(() => null);
    return !!body?.running || !!body?.ok;
  } catch {
    // Best-effort probe: any failure simply means "not healthy".
    return false;
  } finally {
    // Always disarm the timer — on a rejected fetch it would otherwise stay
    // pending until it fired, once per failed probe.
    clearTimeout(timer);
  }
}
|
|
1215
|
+
|
|
1216
|
+
/**
 * Poll the bridge health check until it succeeds or the time budget runs out.
 *
 * @param maxWait Total time budget in milliseconds.
 * @throws Error when no health check succeeded before the budget elapsed.
 */
async function waitForBridgeHealth(maxWait = 30000): Promise<void> {
  const pollIntervalMs = 500;
  const deadline = Date.now() + maxWait;

  while (Date.now() < deadline) {
    const healthy = await checkBridgeHealth();
    if (healthy) {
      return;
    }
    // Pause between probes.
    await new Promise((resolve) => setTimeout(resolve, pollIntervalMs));
  }

  throw new Error(`Bridge did not become healthy at ${BRIDGE_URL} within ${maxWait / 1000}s`);
}
|
|
1224
|
+
|
|
1225
|
+
/**
 * Make sure a bridge is answering health checks, launching one if needed.
 *
 * Returns immediately when the bridge is already healthy. Otherwise spawns
 * the launcher as an unref'd child with stdio ignored and waits up to 60s for
 * the health check to pass.
 *
 * @throws Error when the launcher process cannot be started, or when the
 *         bridge does not become healthy within the wait window.
 */
async function ensureBridgeRunning(): Promise<void> {
  if (await checkBridgeHealth()) {
    console.error('[MCP] Bridge already running');
    return;
  }

  console.error('[MCP] Bridge not running — auto-launching...');

  // Prod: launch the resolved bootstrap exe with --daemon (it brings up the
  // tray → daemon). The old `node <SRC>/launch.js` form is dev-only — launch.js
  // is not shipped in the payload — so use it only when no bootstrap exe
  // resolves. All logging here is stderr (MCP stdout must stay clean).
  const bootExe = resolveBootstrapExe();
  const [launchCmd, launchArgs] = bootExe
    ? [bootExe, ['--daemon']]
    : [process.execPath, [LAUNCHER]];
  console.error(`[MCP] launching: ${launchCmd} ${launchArgs.join(' ')}`);
  const child = spawn(launchCmd, launchArgs, {
    cwd: ROOT,
    stdio: 'ignore',
    windowsHide: true,
    env: process.env,
  });

  // A spawn failure (missing or non-executable launcher) is reported through
  // the child's 'error' event. With no listener that event is thrown as an
  // uncaught exception, outside the caller's promise chain. Turn it into a
  // rejection so the failure is reported immediately and through the normal
  // error path instead.
  const launchFailed = new Promise<never>((_resolve, reject) => {
    child.once('error', (err) => {
      console.error(`[MCP] Failed to launch bridge: ${err.message}`);
      reject(new Error(`Failed to launch bridge via ${launchCmd}: ${err.message}`));
    });
  });
  child.unref();

  console.error(`[MCP] Waiting for bridge at ${BRIDGE_URL}...`);
  // Whichever settles first wins: the bridge turning healthy, the health wait
  // timing out, or the launcher failing to start.
  await Promise.race([waitForBridgeHealth(60000), launchFailed]);
  console.error('[MCP] Bridge launched successfully');
}
|
|
1252
|
+
|
|
1253
|
+
// ─── Start ───────────────────────────────────────────────────
|
|
1254
|
+
|
|
1255
|
+
/**
 * Entry point: make sure the bridge is up, then attach the MCP server to a
 * stdio transport. Status messages go to stderr.
 */
async function main() {
  await ensureBridgeRunning();
  await server.connect(new StdioServerTransport());
  console.error('[MCP] Empir3 Browser Bridge MCP server running');
}

// Any startup failure is logged to stderr and ends the process with exit code 1.
main().catch((fatal) => {
  console.error('[MCP] Fatal:', fatal);
  process.exit(1);
});
|