codeloop-mcp-server 0.1.47 → 0.1.48

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1 +1 @@
1
- {"version":3,"file":"critical_floors.d.ts","sourceRoot":"","sources":["../../src/auth/critical_floors.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;GA2BG;AAEH,MAAM,WAAW,aAAa;IAC5B,4DAA4D;IAC5D,WAAW,EAAE,MAAM,CAAC;IACpB,wDAAwD;IACxD,MAAM,EAAE,MAAM,CAAC;CAChB;AAED;;;;;;;;GAQG;AACH,eAAO,MAAM,eAAe,EAAE,aAAa,EAqB1C,CAAC"}
1
+ {"version":3,"file":"critical_floors.d.ts","sourceRoot":"","sources":["../../src/auth/critical_floors.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;GA2BG;AAEH,MAAM,WAAW,aAAa;IAC5B,4DAA4D;IAC5D,WAAW,EAAE,MAAM,CAAC;IACpB,wDAAwD;IACxD,MAAM,EAAE,MAAM,CAAC;CAChB;AAED;;;;;;;;GAQG;AACH,eAAO,MAAM,eAAe,EAAE,aAAa,EAyB1C,CAAC"}
@@ -56,5 +56,9 @@ export const CRITICAL_FLOORS = [
56
56
  min_version: "0.1.47",
57
57
  reason: "Desktop-app capture-screenshot honesty + auto-detected target_app — pre-0.1.47 builds applied the desktop-app honesty refusal ONLY inside codeloop_verify's auto-capture path, so the standalone codeloop_capture_screenshot tool still silently grabbed the IDE when called without app_name during recording. Also pre-0.1.47, evidence.target_app required manual user setup — codeloop_init_project now auto-extracts it from .csproj AssemblyName / Xcode PRODUCT_NAME / AndroidManifest android:label so desktop projects work without manual config",
58
58
  },
59
+ {
60
+ min_version: "0.1.48",
61
+ reason: "Desktop interaction reliability — pre-0.1.48 builds rejected target_type: 'windows_desktop' / 'mac_desktop' / 'linux_desktop' with a hard Zod error (agents had to know to type 'desktop' instead), didn't translate window-relative click coordinates to screen-absolute (so coords computed from a captured window-screenshot missed targets by hundreds of pixels), had no codeloop_launch_app tool / auto-launch in start_recording (agents had to hand-roll Get-Process + Start-Process PowerShell every recording), refused agent-natural flat sequence step shapes like { action: 'click', x: 100, y: 200 }, and treated cross-run weak design_compare matches as critical 0% failures of the gate-run screen. All four broke the autonomous loop on Photometry-DB E2E session 8",
62
+ },
59
63
  ];
60
64
  //# sourceMappingURL=critical_floors.js.map
@@ -1 +1 @@
1
- {"version":3,"file":"critical_floors.js","sourceRoot":"","sources":["../../src/auth/critical_floors.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;GA2BG;AASH;;;;;;;;GAQG;AACH,MAAM,CAAC,MAAM,eAAe,GAAoB;IAC9C;QACE,WAAW,EAAE,QAAQ;QACrB,MAAM,EAAE,8JAA8J;KACvK;IACD;QACE,WAAW,EAAE,QAAQ;QACrB,MAAM,EAAE,yNAAyN;KAClO;IACD;QACE,WAAW,EAAE,QAAQ;QACrB,MAAM,EAAE,oYAAoY;KAC7Y;IACD;QACE,WAAW,EAAE,QAAQ;QACrB,MAAM,EAAE,sbAAsb;KAC/b;IACD;QACE,WAAW,EAAE,QAAQ;QACrB,MAAM,EAAE,4hBAA4hB;KACriB;CACF,CAAC"}
1
+ {"version":3,"file":"critical_floors.js","sourceRoot":"","sources":["../../src/auth/critical_floors.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;GA2BG;AASH;;;;;;;;GAQG;AACH,MAAM,CAAC,MAAM,eAAe,GAAoB;IAC9C;QACE,WAAW,EAAE,QAAQ;QACrB,MAAM,EAAE,8JAA8J;KACvK;IACD;QACE,WAAW,EAAE,QAAQ;QACrB,MAAM,EAAE,yNAAyN;KAClO;IACD;QACE,WAAW,EAAE,QAAQ;QACrB,MAAM,EAAE,oYAAoY;KAC7Y;IACD;QACE,WAAW,EAAE,QAAQ;QACrB,MAAM,EAAE,sbAAsb;KAC/b;IACD;QACE,WAAW,EAAE,QAAQ;QACrB,MAAM,EAAE,4hBAA4hB;KACriB;IACD;QACE,WAAW,EAAE,QAAQ;QACrB,MAAM,EAAE,yvBAAyvB;KAClwB;CACF,CAAC"}
package/dist/index.js CHANGED
@@ -135,6 +135,12 @@ const server = new McpServer({
135
135
  async function withAuth(fn, tracker) {
136
136
  const started = Date.now();
137
137
  let outcome = { success: false };
138
+ // Photometry-DB E2E 8 regression: when the agent passes
139
+ // `project_dir`/`workspace_root` to a tool, remember that dir for
140
+ // the lifetime of this MCP process so the init-hint check no
141
+ // longer false-positives later calls where the call site didn't
142
+ // forward `dir` to `withInitHint`.
143
+ rememberInitializedDir(tracker?.cwd);
138
144
  try {
139
145
  // Local / self-hosted mode (CODELOOP_MODE=local): skip API-key validation
140
146
  // entirely. All cloud-side checks are bypassed; usage events are queued
@@ -286,6 +292,26 @@ function buildVersionBanner() {
286
292
  text: `[CodeLoop server v${v}]`,
287
293
  };
288
294
  }
295
/**
 * Session-wide memo of the last directory that was handed to
 * `rememberInitializedDir` and that `isProjectInitialized` confirmed.
 * Starts as `null`, is only ever overwritten with another initialized
 * directory, and is never cleared for the lifetime of the process.
 *
 * Why it exists (Photometry-DB E2E session 8 regression): the agent
 * passed `project_dir: "D:\\Work\\Photometry DB"` (which IS initialized)
 * on every call, but the init hint was evaluated against the server's
 * startup `projectDir` (C:\Users\jiq on Windows), so "This project has
 * not been initialized" was prepended to every response. Remembering an
 * initialized directory here lets the hint be silenced for the rest of
 * the session even when a call site does not forward `dir` to
 * `withInitHint`.
 */
let lastInitializedDir = null;
/**
 * Records `dir` in `lastInitializedDir` when it names an initialized
 * project; otherwise leaves the memo untouched.
 *
 * This is strictly best-effort bookkeeping: `withAuth` invokes it before
 * entering its own `try` block, so it must never throw.
 *
 * @param {unknown} dir - Candidate project directory (the tracker's
 *   `cwd`). Anything that is not a non-empty string is ignored, matching
 *   the candidate filter used by `withInitHint`.
 * @returns {void}
 */
function rememberInitializedDir(dir) {
    if (typeof dir !== "string" || dir.length === 0) {
        return;
    }
    // Already remembered: the memo would be rewritten with the same
    // value, so skip the redundant initialization check.
    if (dir === lastInitializedDir) {
        return;
    }
    let initialized = false;
    try {
        initialized = isProjectInitialized(dir);
    }
    catch {
        // best-effort: a failing initialization probe must not turn a
        // tool call into an unhandled rejection; keep the previous memo.
        return;
    }
    if (initialized) {
        lastInitializedDir = dir;
    }
}
289
315
  function withInitHint(content, dir) {
290
316
  // Order matters:
291
317
  // 1. Update notice (most actionable signal — CRITICAL stays at top).
@@ -296,7 +322,22 @@ function withInitHint(content, dir) {
296
322
  const banner = buildVersionBanner();
297
323
  const withUpdate = withUpdateNotice(content);
298
324
  const head = [];
299
- if (!isProjectInitialized(dir || projectDir)) {
325
+ // Candidate dirs, probed in this order — the hint is suppressed as soon as ANY one of them is initialized:
326
+ // 1. Explicit `dir` arg from the call site (when the handler
327
+ // bothered to thread it through — the verify / capture /
328
+ // design_compare etc. handlers do).
329
+ // 2. `lastInitializedDir` — any dir the agent has already passed
330
+ // to *any* CodeLoop tool this session that resolved to an
331
+ // initialized config. This catches the long tail of handlers
332
+ // that don't forward `dir` (e.g. `recommend_tool`,
333
+ // `release_readiness`, `section_status`) so they no longer
334
+ // false-positive the hint after init has succeeded once.
335
+ // 3. The MCP server's startup `projectDir` (often the user's
336
+ // home folder on Windows / Cursor — see CODELOOP_PROJECT_DIR
337
+ // auto-injection notes in setup-project.ts).
338
+ const candidates = [dir, lastInitializedDir, projectDir].filter((d) => typeof d === "string" && d.length > 0);
339
+ const anyInitialized = candidates.some((d) => isProjectInitialized(d));
340
+ if (!anyInitialized) {
300
341
  head.push({ type: "text", text: INIT_HINT });
301
342
  }
302
343
  const result = [...head, ...withUpdate];
@@ -305,6 +346,42 @@ function withInitHint(content, dir) {
305
346
  return result;
306
347
  }
307
348
  const TOOL_BOOTSTRAP = "[CodeLoop] You MUST call codeloop_verify after every code change. If .codeloop/config.json is missing, call codeloop_init_project FIRST.\n\n";
349
/**
 * Synonym table for the `target_type` tool parameter. Keys are the
 * lower-case, underscore-separated spellings that are accepted; values
 * are the four canonical targets (`desktop`, `browser`,
 * `android_emulator`, `ios_simulator`). The canonical names map to
 * themselves.
 *
 * Photometry-DB E2E 8 regression: the agent (reasonably) tried
 * `target_type: "windows_desktop"` because the project is a WPF Windows
 * app and got a hard schema-validation rejection. The same happened for
 * `mac_desktop`, `linux_desktop`, `web`, `android` and `ios`.
 */
const TARGET_TYPE_SYNONYMS = {
    desktop: "desktop",
    windows_desktop: "desktop",
    win_desktop: "desktop",
    win32_desktop: "desktop",
    mac_desktop: "desktop",
    macos_desktop: "desktop",
    osx_desktop: "desktop",
    linux_desktop: "desktop",
    native_desktop: "desktop",
    browser: "browser",
    web: "browser",
    webapp: "browser",
    chrome: "browser",
    android_emulator: "android_emulator",
    android: "android_emulator",
    ios_simulator: "ios_simulator",
    ios: "ios_simulator",
};
/**
 * Coerces a loosely spelled `target_type` to its canonical name.
 *
 * Matching ignores surrounding whitespace and letter case, and treats
 * runs of hyphens / whitespace as underscores, so `"Mac-Desktop"` and
 * `"iOS Simulator"` resolve like `"mac_desktop"` and `"ios_simulator"`.
 *
 * @param {unknown} v - Raw value supplied for `target_type`.
 * @returns {unknown} The canonical target name when `v` is a known
 *   synonym; otherwise `v` itself, unchanged (non-strings included), so
 *   that downstream validation reports on the caller's original input.
 */
function normalizeTargetType(v) {
    if (typeof v !== "string") {
        return v;
    }
    const key = v.trim().toLowerCase().replace(/[\s-]+/g, "_");
    // Own-property check only: a bare `TARGET_TYPE_SYNONYMS[key]` would
    // also resolve inherited names such as "constructor" or "__proto__"
    // and hand a function/object to the enum validator.
    if (Object.prototype.hasOwnProperty.call(TARGET_TYPE_SYNONYMS, key)) {
        return TARGET_TYPE_SYNONYMS[key];
    }
    return v;
}
384
// Shared zod schema for `target_type` parameters: the raw value is first
// run through `normalizeTargetType`, then validated against the four
// canonical targets.
const targetTypeSchema = z.preprocess(
    normalizeTargetType,
    z.enum(["desktop", "browser", "android_emulator", "ios_simulator"]),
);
308
385
  // ── Implemented Tools ────────────────────────────────────────────
309
386
  server.tool("codeloop_verify", TOOL_BOOTSTRAP + `Run the CodeLoop verification suite on the current project. Use this tool when:
310
387
  - You have implemented or modified code and need to check if it works correctly
@@ -1136,8 +1213,28 @@ Returns: confirmation + the captured image as an MCP ImageContent block so you c
1136
1213
  const cfg = loadConfig(cwd);
1137
1214
  const targetApp = params.app_name ?? cfg.evidence?.target_app;
1138
1215
  const result = await captureScreenshot(screenshotsDir, params.screen_name, targetApp, undefined, { desktopAppMode: isDesktopAppProject });
1216
+ // Photometry-DB E2E 8 follow-on: when we capture a desktop app
1217
+ // window, also resolve its on-screen bounds so the agent can
1218
+ // (a) compute window-relative coords from the returned image
1219
+ // dimensions, and
1220
+ // (b) pass coords:"window" to codeloop_interact to get them
1221
+ // translated to screen-absolute automatically.
1222
+ // Without this, agents reasoned from a downscaled vision view
1223
+ // of the image and clicked tens or hundreds of pixels off the
1224
+ // intended target.
1225
+ let windowBounds = null;
1226
+ if (isDesktopAppProject && targetApp && result.captured) {
1227
+ try {
1228
+ const wm = await import("./runners/window_manager.js");
1229
+ const b = await wm.getWindowBounds(targetApp);
1230
+ if (b && b.width > 0 && b.height > 0) {
1231
+ windowBounds = { x: b.x, y: b.y, width: b.width, height: b.height };
1232
+ }
1233
+ }
1234
+ catch { /* best-effort */ }
1235
+ }
1139
1236
  await trackUsage(apiKey, "visual_review");
1140
- return result;
1237
+ return { ...result, windowBounds };
1141
1238
  }, { tool: "codeloop_capture_screenshot", cwd: (params.project_dir || params.workspace_root || projectDir), input: params });
1142
1239
  if (typeof authResult === "object" && authResult !== null && "error" in authResult) {
1143
1240
  return { content: [{ type: "text", text: JSON.stringify(authResult, null, 2) }] };
@@ -1145,12 +1242,18 @@ Returns: confirmation + the captured image as an MCP ImageContent block so you c
1145
1242
  const result = authResult;
1146
1243
  const content = [];
1147
1244
  if (result.captured && result.paths.length > 0) {
1148
- content.push({ type: "text", text: JSON.stringify({
1149
- captured: true,
1150
- screen_name: params.screen_name,
1151
- path: result.paths[0],
1152
- method: result.method,
1153
- }, null, 2) });
1245
+ const payload = {
1246
+ captured: true,
1247
+ screen_name: params.screen_name,
1248
+ path: result.paths[0],
1249
+ method: result.method,
1250
+ };
1251
+ if (result.windowBounds) {
1252
+ payload.window_bounds = result.windowBounds;
1253
+ payload.coordinate_hint =
1254
+ "This screenshot captures the named window. When you compute click coordinates from the image, pass them to codeloop_interact with `coords: \"window\"` so they're translated to screen-absolute automatically. (Default `coords: \"auto\"` also works when the coord fits inside the window — but `\"window\"` is unambiguous.)";
1255
+ }
1256
+ content.push({ type: "text", text: JSON.stringify(payload, null, 2) });
1154
1257
  const data = readImageAsBase64(result.paths[0]);
1155
1258
  if (data) {
1156
1259
  content.push({ type: "image", data, mimeType: mimeForPath(result.paths[0]) });
@@ -1321,6 +1424,45 @@ After recording, call codeloop_interaction_replay to extract frames and analyze
1321
1424
  content: withInitHint([{ type: "text", text: JSON.stringify(result, null, 2) }]),
1322
1425
  };
1323
1426
  });
1427
// Registers the `codeloop_launch_app` tool: resolves the app name (explicit
// `app_name`, else `evidence.target_app` from the project config) and asks the
// window manager to launch it from the project directory.
server.tool("codeloop_launch_app", TOOL_BOOTSTRAP + `Launch a desktop application by name from a project's build output. Use when:
- You need the app under test to be running before codeloop_start_recording / codeloop_interact.
- The agent doesn't know where the executable lives and shouldn't have to hand-roll Start-Process / open -a.

This tool is the canonical replacement for hand-coding PowerShell Start-Process / osascript / xdg-open
from the agent. For Android / iOS, use codeloop_interact action="launch_app" with package_id instead.

Search order on Windows: publish/**/*.exe → bin/Release/**/*.exe → bin/Debug/**/*.exe (newest first).
On macOS: publish/**/*.app → build/**/*.app → /Applications/<name>.app → open -a.
On Linux: build/**/<name> → bin/**/<name> → dist/**/<name>.

If app_name is omitted, falls back to evidence.target_app from .codeloop/config.json (auto-detected at
init for .NET/Xcode/Android projects via detect-target-app).`, {
    app_name: z.string().optional().describe("Window title / executable name of the app to launch. Defaults to evidence.target_app from .codeloop/config.json. Required if target_app is unset."),
    project_dir: z.string().optional().describe("Absolute path to the project root. Defaults to CODELOOP_PROJECT_DIR / discovered project dir."),
    workspace_root: z.string().optional().describe("[Alias for project_dir]"),
}, async (params) => {
    // Directory the caller named explicitly (may be undefined) and the
    // effective working directory with the server-level fallback applied.
    const explicitDir = params.project_dir || params.workspace_root;
    const cwd = (explicitDir || projectDir);
    const launchOutcome = await withAuth(async () => {
        const wm = await import("./runners/window_manager.js");
        const { loadConfig } = await import("./config.js");
        const cfg = loadConfig(cwd);
        const appName = params.app_name || cfg.evidence?.target_app;
        if (appName) {
            const launchResult = await wm.launchDesktopApp(appName, cwd);
            return { app_name: appName, ...launchResult };
        }
        // Nothing to launch: report why instead of guessing a name.
        return {
            launched: false,
            reason: "No app_name provided and evidence.target_app is unset in .codeloop/config.json. Set it (e.g. \"evidence\": { \"target_app\": \"My App\" }) or pass app_name explicitly. For .NET/Xcode/Android projects, codeloop_init_project auto-detects this — re-run init or edit the config by hand.",
        };
    }, { tool: "codeloop_launch_app", cwd, input: params });
    const payload = [{ type: "text", text: JSON.stringify(launchOutcome, null, 2) }];
    // Results carrying an `error` key are returned bare, without the
    // init hint / update notice decoration.
    const hasError = typeof launchOutcome === "object" && launchOutcome !== null && "error" in launchOutcome;
    if (hasError) {
        return { content: payload };
    }
    return {
        content: withInitHint(payload, explicitDir),
    };
});
1324
1466
  server.tool("codeloop_start_recording", TOOL_BOOTSTRAP + `Start recording the app window in the background. The app is brought to the front automatically
1325
1467
  (un-minimized if needed). Recording continues while you interact with the app. Call codeloop_stop_recording when done.
1326
1468
  This is the PREFERRED recording method because it lets you actively operate the app during capture.
@@ -1347,11 +1489,12 @@ Flow: start_recording → codeloop_interact with ALL app elements → stop_recor
1347
1489
  Supports desktop apps, Android emulator, iOS Simulator, and browser targets.
1348
1490
  Multi-monitor: on macOS, automatically detects which screen the app window is on.
1349
1491
  App logs (stdout, logcat, simctl log) are automatically captured alongside the video.`, {
1350
- app_name: z.string().describe("The name of the app to record (used to find and focus its window)"),
1492
+ app_name: z.string().optional().describe("The name of the app to record (used to find and focus its window). For desktop projects, defaults to evidence.target_app from .codeloop/config.json — set during init via detect-target-app for .NET/Xcode/Android projects, or settable manually."),
1351
1493
  run_id: z.string().optional().describe("Existing run ID to store the video in"),
1352
1494
  max_duration_seconds: z.number().default(120).describe("Safety timeout — recording stops automatically after this many seconds"),
1353
- target_type: z.enum(["desktop", "android_emulator", "ios_simulator", "browser"]).optional()
1495
+ target_type: targetTypeSchema.optional()
1354
1496
  .describe("Capture method. Auto-detected from project if omitted. desktop=ffmpeg screen, android_emulator=adb screenrecord, ios_simulator=simctl recordVideo, browser=ffmpeg/Playwright"),
1497
+ auto_launch: z.boolean().default(true).describe("When target_type=desktop and the app isn't already running, auto-launch it from the project's build output via evidence.target_app. Set false to skip (e.g. when the app is started by another process)."),
1355
1498
  project_dir: z.string().optional().describe("Absolute path to the project root. Defaults to CODELOOP_PROJECT_DIR env var or auto-discovered project directory. MUST be an actual project folder — passing the user's home directory is rejected. If your IDE launches the MCP server from the wrong cwd (common on Windows where Cursor uses C:\\Users\\<name> as cwd), set CODELOOP_PROJECT_DIR or pass this param explicitly."),
1356
1499
  workspace_root: z.string().optional().describe("[Alias for project_dir] Same semantics; accepted because many agents reach for this conventional name. Pass either `project_dir` OR `workspace_root` — they're equivalent."),
1357
1500
  }, async (params) => {
@@ -1359,6 +1502,7 @@ App logs (stdout, logcat, simctl log) are automatically captured alongside the v
1359
1502
  const { startBackgroundRecording } = await import("./runners/video_recorder.js");
1360
1503
  const { createRunDir, getRunDir, getArtifactsBaseDir } = await import("./evidence/artifacts.js");
1361
1504
  const { detectTargetType } = await import("./runners/platform_detect.js");
1505
+ const { loadConfig } = await import("./config.js");
1362
1506
  const cwd = (params.project_dir || params.workspace_root || projectDir);
1363
1507
  let videosDir;
1364
1508
  if (params.run_id) {
@@ -1370,7 +1514,11 @@ App logs (stdout, logcat, simctl log) are automatically captured alongside the v
1370
1514
  videosDir = join(runDir, "videos");
1371
1515
  }
1372
1516
  const targetType = params.target_type || (await detectTargetType(cwd));
1517
+ const cfg = loadConfig(cwd);
1373
1518
  let appName = params.app_name;
1519
+ if (!appName && (targetType === "desktop")) {
1520
+ appName = cfg.evidence?.target_app;
1521
+ }
1374
1522
  if (targetType === "browser") {
1375
1523
  const bi = await import("./runners/browser_interaction.js");
1376
1524
  await bi.ensureBrowserPage();
@@ -1379,7 +1527,42 @@ App logs (stdout, logcat, simctl log) are automatically captured alongside the v
1379
1527
  appName = pwAppName;
1380
1528
  }
1381
1529
  }
1382
- const result = await startBackgroundRecording(videosDir, appName, params.max_duration_seconds, targetType);
1530
+ // Photometry-DB E2E 8: agents spent many turns manually
1531
+ // probing `Get-Process` / `Start-Process` to launch the app
1532
+ // because nothing in CodeLoop did it for them. Now, when
1533
+ // recording a desktop app, we auto-launch from the build
1534
+ // output if the app isn't already running.
1535
+ const wm = await import("./runners/window_manager.js");
1536
+ let autoLaunchSummary;
1537
+ if (targetType === "desktop" && params.auto_launch !== false && appName) {
1538
+ try {
1539
+ const bounds = await wm.getWindowBounds(appName);
1540
+ if (!bounds) {
1541
+ const r = await wm.launchDesktopApp(appName, cwd);
1542
+ autoLaunchSummary = {
1543
+ attempted: true,
1544
+ launched: r.launched,
1545
+ command: r.command,
1546
+ reason: r.reason,
1547
+ };
1548
+ if (r.launched) {
1549
+ // Give the window time to appear / paint before the
1550
+ // recorder starts capturing frames.
1551
+ await new Promise((res) => setTimeout(res, 2000));
1552
+ }
1553
+ }
1554
+ else {
1555
+ autoLaunchSummary = { attempted: false, launched: true, reason: "already running" };
1556
+ }
1557
+ }
1558
+ catch (e) {
1559
+ autoLaunchSummary = { attempted: true, launched: false, reason: e.message };
1560
+ }
1561
+ }
1562
+ const result = await startBackgroundRecording(videosDir, appName ?? "", params.max_duration_seconds, targetType);
1563
+ if (autoLaunchSummary) {
1564
+ result.auto_launch = autoLaunchSummary;
1565
+ }
1383
1566
  await trackUsage(apiKey, "visual_review");
1384
1567
  return result;
1385
1568
  }, { tool: "codeloop_start_recording", cwd: (params.project_dir || params.workspace_root || projectDir), input: params });
@@ -1955,8 +2138,8 @@ MANDATORY for web apps: You MUST type into form fields, fill login/signup forms,
1955
2138
  validation errors, and click submit buttons. Just navigating pages is NOT enough.
1956
2139
  Wait 1-2 seconds between interactions so video frames capture state changes.`, {
1957
2140
  action: z.string().describe("Action to perform: click, double_click, right_click, hover, type, keystroke, hotkey, scroll, drag_drop, long_press, type_and_submit, type_and_tab, fill_form, select_option, toggle, upload_file, navigate_url, navigate_back, navigate_forward, wait, sequence, swipe, back_button, home_button, deep_link, grant_permission, rotate_device, biometric_auth, launch_app, clear_app_data, mock_location, simulate_network, maestro_flow, win_ui_inspect, win_ui_automate"),
1958
- target_type: z.enum(["desktop", "browser", "android_emulator", "ios_simulator"]).optional()
1959
- .describe("Interaction target. Auto-detected if omitted."),
2141
+ target_type: targetTypeSchema.optional()
2142
+ .describe("Interaction target. Auto-detected if omitted. Accepts synonyms: `windows_desktop`/`mac_desktop`/`linux_desktop` → `desktop`; `web` → `browser`; `android` → `android_emulator`; `ios` → `ios_simulator`."),
1960
2143
  x: z.number().optional().describe("X coordinate for click/scroll/drag/swipe"),
1961
2144
  y: z.number().optional().describe("Y coordinate for click/scroll/drag/swipe"),
1962
2145
  x2: z.number().optional().describe("End X for drag_drop/swipe"),
@@ -1987,7 +2170,7 @@ Wait 1-2 seconds between interactions so video frames capture state changes.`, {
1987
2170
  action: z.string(),
1988
2171
  params: z.record(z.unknown()).optional(),
1989
2172
  delay_ms: z.number().optional(),
1990
- })).optional().describe("Steps for sequence action"),
2173
+ }).passthrough()).optional().describe("Steps for sequence action. Accepts BOTH nested form `{ action, params: { x, y, … }, delay_ms? }` and flat form `{ action, x, y, … }` — the flat form is what agents naturally write (mirrors the top-level codeloop_interact shape). Supports inside desktop sequences: click, double_click, right_click, hover, scroll, type, hotkey, keystroke, navigate_url, wait, win_ui_automate."),
1991
2174
  maestro_steps: z.array(z.string()).optional().describe("High-level steps for maestro_flow"),
1992
2175
  automation_action: z.enum(["invoke", "setValue", "toggle", "select", "scroll"]).optional()
1993
2176
  .describe("For win_ui_automate"),
@@ -1997,6 +2180,7 @@ Wait 1-2 seconds between interactions so video frames capture state changes.`, {
1997
2180
  description: z.string().optional().describe("[Alias for intent] Same semantics."),
1998
2181
  purpose: z.string().optional().describe("[Alias for intent] Same semantics."),
1999
2182
  step: z.string().optional().describe("Plan-step name when this interaction is driving a codeloop_plan_user_journey arc (e.g. 'edit', 'delete', 'create', 'save', 'verify'). Logged alongside `intent` and read by the CRUD classifier."),
2183
+ coords: z.enum(["auto", "window", "screen"]).optional().describe("How to interpret x/y for desktop click/double_click/right_click/hover/scroll/drag/long_press. `auto` (default): if `app_name` resolves to a visible window AND (x, y) fits inside the window's client area, treat as window-relative and auto-offset by the window origin; otherwise leave as raw screen-absolute coords. `window`: ALWAYS add the window origin offset (errors if the window isn't found). `screen`: ALWAYS pass through (legacy behaviour, matches CGEvent / user32.dll / xdotool semantics). Fixes the Photometry-DB E2E 8 failure mode where the agent captured a 1600×900 window screenshot, computed click coords against the image, and missed the sidebar because the window's actual top-left was (286, 286) on a 5120×1440 screen."),
2000
2184
  project_dir: z.string().optional().describe("Absolute path to project root."),
2001
2185
  workspace_root: z.string().optional().describe("[Alias for project_dir] Pass either; they're equivalent."),
2002
2186
  }, async (params) => {
@@ -2026,13 +2210,58 @@ Wait 1-2 seconds between interactions so video frames capture state changes.`, {
2026
2210
  await bi.ensureBrowserPage();
2027
2211
  }
2028
2212
  // Bring the app to front before desktop interactions (non-browser, non-mobile).
2213
+ let windowOriginOffset = null;
2029
2214
  if (tt === "desktop") {
2030
2215
  const appName = params.app_name || vr.getActiveRecordingAppName();
2031
2216
  if (appName && action !== "wait") {
2032
2217
  await wm.bringAppToFront(appName);
2033
2218
  await new Promise(r => setTimeout(r, 300));
2219
+ // Photometry-DB E2E 8: agents commonly compute click coords
2220
+ // from a window-cropped screenshot (which is what
2221
+ // codeloop_capture_screenshot returns when `app_name` is
2222
+ // set), then pass those coords to codeloop_interact —
2223
+ // which expects raw SCREEN coordinates. On a multi-monitor
2224
+ // / DPI-scaled setup that mismatch silently dropped clicks
2225
+ // 100s of pixels off-target. When `coords` is `auto` (the
2226
+ // default) we look up the window's actual screen origin
2227
+ // and add it to x/y, but ONLY if (x, y) fits inside the
2228
+ // window — that keeps legacy callers passing raw screen
2229
+ // coords working unchanged. `coords: "window"` forces the
2230
+ // offset; `coords: "screen"` opts out.
2231
+ const coordsMode = params.coords ?? "auto";
2232
+ if (coordsMode !== "screen") {
2233
+ try {
2234
+ const b = await wm.getWindowBounds(appName);
2235
+ if (b && b.width > 0 && b.height > 0) {
2236
+ windowOriginOffset = { dx: b.x, dy: b.y, width: b.width, height: b.height };
2237
+ }
2238
+ }
2239
+ catch { /* best-effort */ }
2240
+ }
2034
2241
  }
2035
2242
  }
2243
// Helper used by every coordinate-driven desktop action below.
//
// Converts a caller-supplied (x, y) into the point handed to the window
// manager, according to `params.coords`:
//   - "screen": returned unchanged.
//   - "window": the window origin (`windowOriginOffset.dx/dy`) is always
//     added; throws when no window bounds were resolved, because passing
//     window-relative numbers through as screen coordinates would click
//     the wrong place without any signal to the caller.
//   - "auto" (default): the origin is added only when (x, y) lies inside
//     the window's width × height; otherwise the point is returned
//     unchanged (likely a raw screen coord from a manual workflow).
// Non-desktop targets and missing coordinates are returned unchanged.
const translateXY = (x, y) => {
    const mode = params.coords ?? "auto";
    if (tt !== "desktop" || x == null || y == null || mode === "screen") {
        return { x, y };
    }
    if (!windowOriginOffset) {
        if (mode === "window") {
            throw new Error("coords: \"window\" was requested but the target window's bounds could not be resolved. Pass app_name for a visible window, or use coords: \"screen\" with screen-absolute coordinates.");
        }
        // auto without known bounds: nothing to offset by.
        return { x, y };
    }
    const shifted = { x: x + windowOriginOffset.dx, y: y + windowOriginOffset.dy };
    if (mode === "window") {
        return shifted;
    }
    // Upper bounds are exclusive: a window `width` pixels wide spans
    // window-relative x in [0, width - 1].
    const inside = x >= 0 && x < windowOriginOffset.width &&
        y >= 0 && y < windowOriginOffset.height;
    return inside ? shifted : { x, y };
};
2036
2265
  switch (action) {
2037
2266
  case "click":
2038
2267
  if (tt === "browser" && params.selector) {
@@ -2054,7 +2283,8 @@ Wait 1-2 seconds between interactions so video frames capture state changes.`, {
2054
2283
  }
2055
2284
  }
2056
2285
  else if (params.x != null && params.y != null) {
2057
- success = await wm.clickAtPosition(params.x, params.y);
2286
+ const t = translateXY(params.x, params.y);
2287
+ success = await wm.clickAtPosition(t.x, t.y);
2058
2288
  }
2059
2289
  detail = `click at ${params.selector || `(${params.x},${params.y})`}`;
2060
2290
  break;
@@ -2063,7 +2293,8 @@ Wait 1-2 seconds between interactions so video frames capture state changes.`, {
2063
2293
  success = await bi.browserDoubleClick(params.selector);
2064
2294
  }
2065
2295
  else if (params.x != null && params.y != null) {
2066
- success = await wm.doubleClickAtPosition(params.x, params.y);
2296
+ const t = translateXY(params.x, params.y);
2297
+ success = await wm.doubleClickAtPosition(t.x, t.y);
2067
2298
  }
2068
2299
  detail = `double_click at ${params.selector || `(${params.x},${params.y})`}`;
2069
2300
  break;
@@ -2072,7 +2303,8 @@ Wait 1-2 seconds between interactions so video frames capture state changes.`, {
2072
2303
  success = await bi.browserRightClick(params.selector);
2073
2304
  }
2074
2305
  else if (params.x != null && params.y != null) {
2075
- success = await wm.rightClickAtPosition(params.x, params.y);
2306
+ const t = translateXY(params.x, params.y);
2307
+ success = await wm.rightClickAtPosition(t.x, t.y);
2076
2308
  }
2077
2309
  detail = `right_click at ${params.selector || `(${params.x},${params.y})`}`;
2078
2310
  break;
@@ -2081,7 +2313,8 @@ Wait 1-2 seconds between interactions so video frames capture state changes.`, {
2081
2313
  success = await bi.browserHover(params.selector);
2082
2314
  }
2083
2315
  else if (params.x != null && params.y != null) {
2084
- success = await wm.hoverAtPosition(params.x, params.y);
2316
+ const t = translateXY(params.x, params.y);
2317
+ success = await wm.hoverAtPosition(t.x, t.y);
2085
2318
  }
2086
2319
  detail = `hover at ${params.selector || `(${params.x},${params.y})`}`;
2087
2320
  break;
@@ -2166,7 +2399,8 @@ Wait 1-2 seconds between interactions so video frames capture state changes.`, {
2166
2399
  success = await wm.simctlSwipe(sx, sy, ex, ey);
2167
2400
  }
2168
2401
  else {
2169
- success = await wm.scrollAtPosition(params.x || 500, params.y || 400, params.direction || "down", params.amount || 3);
2402
+ const t = translateXY(params.x || 500, params.y || 400);
2403
+ success = await wm.scrollAtPosition(t.x, t.y, params.direction || "down", params.amount || 3);
2170
2404
  }
2171
2405
  detail = `scroll ${params.direction || "down"}`;
2172
2406
  break;
@@ -2179,7 +2413,9 @@ Wait 1-2 seconds between interactions so video frames capture state changes.`, {
2179
2413
  success = await wm.adbSwipe(params.x, params.y, params.x2, params.y2, params.duration_ms || 500);
2180
2414
  }
2181
2415
  else {
2182
- success = await wm.dragDrop(params.x, params.y, params.x2, params.y2, params.duration_ms || 500);
2416
+ const a = translateXY(params.x, params.y);
2417
+ const b = translateXY(params.x2, params.y2);
2418
+ success = await wm.dragDrop(a.x, a.y, b.x, b.y, params.duration_ms || 500);
2183
2419
  }
2184
2420
  }
2185
2421
  detail = `drag_drop`;
@@ -2189,7 +2425,8 @@ Wait 1-2 seconds between interactions so video frames capture state changes.`, {
2189
2425
  success = await wm.adbLongPress(params.x, params.y, params.duration_ms || 1000);
2190
2426
  }
2191
2427
  else if (params.x != null && params.y != null) {
2192
- success = await wm.longPressAtPosition(params.x, params.y, params.duration_ms || 1000);
2428
+ const t = translateXY(params.x, params.y);
2429
+ success = await wm.longPressAtPosition(t.x, t.y, params.duration_ms || 1000);
2193
2430
  }
2194
2431
  detail = `long_press at (${params.x},${params.y})`;
2195
2432
  break;
@@ -2371,11 +2608,37 @@ Wait 1-2 seconds between interactions so video frames capture state changes.`, {
2371
2608
  if (tt === "android_emulator" && params.package_id) {
2372
2609
  const r = await import("./runners/base.js").then(m => m.runCommand("adb", ["shell", "am", "start", "-n", params.package_id], process.cwd()));
2373
2610
  success = r.exit_code === 0;
2611
+ detail = `launch_app "${params.package_id}"`;
2374
2612
  }
2375
2613
  else if (tt === "ios_simulator" && params.package_id) {
2376
2614
  success = await wm.simctlLaunch(params.package_id);
2615
+ detail = `launch_app "${params.package_id}"`;
2616
+ }
2617
+ else if (tt === "desktop") {
2618
+ // Photometry-DB E2E 8: desktop launch was completely
2619
+ // missing — agents had to hand-roll PowerShell
2620
+ // Start-Process / `open -a` calls. Now resolves via
2621
+ // evidence.target_app from the project config when
2622
+ // app_name is omitted.
2623
+ const { loadConfig } = await import("./config.js");
2624
+ const cfg = loadConfig(cwd);
2625
+ const appName = params.app_name || cfg.evidence?.target_app;
2626
+ if (!appName) {
2627
+ success = false;
2628
+ detail = "launch_app desktop: no app_name provided and evidence.target_app is unset in .codeloop/config.json. Set it (e.g. \"target_app\": \"Photometry DB\") or pass app_name explicitly.";
2629
+ }
2630
+ else {
2631
+ const r = await wm.launchDesktopApp(appName, cwd);
2632
+ success = r.launched;
2633
+ detail = r.launched
2634
+ ? `launch_app "${appName}" via ${r.command}${r.pid ? ` (pid ${r.pid})` : ""}`
2635
+ : `launch_app "${appName}" failed: ${r.reason || "unknown error"}`;
2636
+ }
2637
+ }
2638
+ else {
2639
+ success = false;
2640
+ detail = `launch_app "${params.package_id || params.app_name || ""}": target ${tt} not supported in this action`;
2377
2641
  }
2378
- detail = `launch_app "${params.package_id}"`;
2379
2642
  break;
2380
2643
  case "clear_app_data":
2381
2644
  if (tt === "android_emulator" && params.package_id) {
@@ -2439,7 +2702,22 @@ Wait 1-2 seconds between interactions so video frames capture state changes.`, {
2439
2702
  if (step.delay_ms)
2440
2703
  await new Promise(r => setTimeout(r, step.delay_ms));
2441
2704
  const stepAction = step.action;
2442
- const sp = (step.params || {});
2705
+ // Photometry-DB E2E 8: agents naturally write steps in
2706
+ // FLAT form `{ action, x, y, ms, … }` because that
2707
+ // mirrors the top-level codeloop_interact shape. The
2708
+ // schema documented the NESTED form `{ action,
2709
+ // params: { … } }`. Now we accept both: prefer
2710
+ // `step.params` if non-empty, otherwise fall back to the
2711
+ // step object itself minus the wrapper keys.
2712
+ const stepObj = step;
2713
+ const nested = (step.params || {});
2714
+ const sp = Object.keys(nested).length > 0
2715
+ ? nested
2716
+ : Object.fromEntries(Object.entries(stepObj).filter(([k]) => k !== "action" && k !== "params" && k !== "delay_ms"));
2717
+ // Convenient aliases: agents wrote `ms` for wait
2718
+ // duration in the log; accept that as `duration_ms`.
2719
+ if (sp.ms != null && sp.duration_ms == null)
2720
+ sp.duration_ms = sp.ms;
2443
2721
  let stepOk = false;
2444
2722
  let stepReason;
2445
2723
  try {
@@ -2447,7 +2725,20 @@ Wait 1-2 seconds between interactions so video frames capture state changes.`, {
2447
2725
  stepOk = await bi.browserClick(sp.selector);
2448
2726
  }
2449
2727
  else if (stepAction === "click" && sp.x != null && sp.y != null) {
2450
- stepOk = await wm.clickAtPosition(sp.x, sp.y);
2728
+ const t = translateXY(sp.x, sp.y);
2729
+ stepOk = await wm.clickAtPosition(t.x, t.y);
2730
+ }
2731
+ else if (stepAction === "double_click" && tt !== "browser" && sp.x != null && sp.y != null) {
2732
+ const t = translateXY(sp.x, sp.y);
2733
+ stepOk = await wm.doubleClickAtPosition(t.x, t.y);
2734
+ }
2735
+ else if (stepAction === "right_click" && tt !== "browser" && sp.x != null && sp.y != null) {
2736
+ const t = translateXY(sp.x, sp.y);
2737
+ stepOk = await wm.rightClickAtPosition(t.x, t.y);
2738
+ }
2739
+ else if (stepAction === "hover" && tt !== "browser" && sp.x != null && sp.y != null) {
2740
+ const t = translateXY(sp.x, sp.y);
2741
+ stepOk = await wm.hoverAtPosition(t.x, t.y);
2451
2742
  }
2452
2743
  else if (stepAction === "type" && tt === "browser" && sp.selector && sp.text) {
2453
2744
  stepOk = await bi.browserType(sp.selector, sp.text);
@@ -2483,9 +2774,13 @@ Wait 1-2 seconds between interactions so video frames capture state changes.`, {
2483
2774
  stepOk = tt === "browser" ? await bi.browserKeystroke(sp.key) : await wm.sendKeyByName(sp.key);
2484
2775
  }
2485
2776
  else if (stepAction === "scroll") {
2486
- stepOk = tt === "browser"
2487
- ? await bi.browserScroll(sp.direction || "down", sp.amount || 300)
2488
- : await wm.scrollAtPosition(sp.x || 500, sp.y || 400, sp.direction || "down", sp.amount || 3);
2777
+ if (tt === "browser") {
2778
+ stepOk = await bi.browserScroll(sp.direction || "down", sp.amount || 300);
2779
+ }
2780
+ else {
2781
+ const t = translateXY(sp.x || 500, sp.y || 400);
2782
+ stepOk = await wm.scrollAtPosition(t.x, t.y, sp.direction || "down", sp.amount || 3);
2783
+ }
2489
2784
  }
2490
2785
  else if (stepAction === "wait") {
2491
2786
  await new Promise(r => setTimeout(r, sp.duration_ms || 1000));