codeloop-mcp-server 0.1.46 → 0.1.48

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -135,6 +135,12 @@ const server = new McpServer({
135
135
  async function withAuth(fn, tracker) {
136
136
  const started = Date.now();
137
137
  let outcome = { success: false };
138
+ // Photometry-DB E2E 8 regression: when the agent passes
139
+ // `project_dir`/`workspace_root` to a tool, remember that dir for
140
+ // the lifetime of this MCP process so the init-hint check no
141
+ // longer false-positives later calls where the call site didn't
142
+ // forward `dir` to `withInitHint`.
143
+ rememberInitializedDir(tracker?.cwd);
138
144
  try {
139
145
  // Local / self-hosted mode (CODELOOP_MODE=local): skip API-key validation
140
146
  // entirely. All cloud-side checks are bypassed; usage events are queued
@@ -286,6 +292,26 @@ function buildVersionBanner() {
286
292
  text: `[CodeLoop server v${v}]`,
287
293
  };
288
294
  }
295
/**
 * Last project directory that an actual tool call showed to be
 * initialized. `withAuth` feeds this from its tracker's `.cwd` field on
 * every call, and `withInitHint` consults it as a fallback candidate.
 *
 * Background (Photometry-DB E2E session 8 regression): the agent passed
 * `project_dir: "D:\\Work\\Photometry DB"` (which IS initialized) on
 * every call, but the init hint was checking the server's startup
 * `projectDir` and incorrectly prepending "This project has not been
 * initialized" to every response. With this cache, the first call that
 * hits an initialized dir silences the hint for the rest of the
 * session, whether or not the call site forwarded `dir` to
 * `withInitHint`.
 */
let lastInitializedDir = null;
/**
 * Record `dir` as the session's known-initialized project directory when
 * `isProjectInitialized(dir)` reports true. Otherwise the cache is left
 * untouched.
 *
 * This is bookkeeping for a cosmetic hint, so it is strictly best-effort:
 * `withAuth` invokes it before entering its own `try` block, which means
 * an exception raised here would bypass that error handling and fail the
 * whole tool call.
 *
 * @param {unknown} dir Candidate project directory. Anything other than
 *   a non-empty string is ignored, matching the filter `withInitHint`
 *   applies to its candidate list.
 * @returns {void}
 */
function rememberInitializedDir(dir) {
    if (typeof dir !== "string" || dir.length === 0)
        return;
    // Already cached: skip the redundant probe on every subsequent call.
    if (dir === lastInitializedDir)
        return;
    try {
        if (isProjectInitialized(dir)) {
            lastInitializedDir = dir;
        }
    }
    catch {
        // Best-effort: a failed probe must never break the tool call.
        // Keep whatever was cached before.
    }
}
289
315
  function withInitHint(content, dir) {
290
316
  // Order matters:
291
317
  // 1. Update notice (most actionable signal — CRITICAL stays at top).
@@ -296,7 +322,22 @@ function withInitHint(content, dir) {
296
322
  const banner = buildVersionBanner();
297
323
  const withUpdate = withUpdateNotice(content);
298
324
  const head = [];
299
- if (!isProjectInitialized(dir || projectDir)) {
325
+ // Candidate dirs in priority order:
326
+ // 1. Explicit `dir` arg from the call site (when the handler
327
+ // bothered to thread it through — the verify / capture /
328
+ // design_compare etc. handlers do).
329
+ // 2. `lastInitializedDir` — any dir the agent has already passed
330
+ // to *any* CodeLoop tool this session that resolved to an
331
+ // initialized config. This catches the long tail of handlers
332
+ // that don't forward `dir` (e.g. `recommend_tool`,
333
+ // `release_readiness`, `section_status`) so they no longer
334
+ // false-positive the hint after init has succeeded once.
335
+ // 3. The MCP server's startup `projectDir` (often the user's
336
+ // home folder on Windows / Cursor — see CODELOOP_PROJECT_DIR
337
+ // auto-injection notes in setup-project.ts).
338
+ const candidates = [dir, lastInitializedDir, projectDir].filter((d) => typeof d === "string" && d.length > 0);
339
+ const anyInitialized = candidates.some((d) => isProjectInitialized(d));
340
+ if (!anyInitialized) {
300
341
  head.push({ type: "text", text: INIT_HINT });
301
342
  }
302
343
  const result = [...head, ...withUpdate];
@@ -305,6 +346,42 @@ function withInitHint(content, dir) {
305
346
  return result;
306
347
  }
307
348
  const TOOL_BOOTSTRAP = "[CodeLoop] You MUST call codeloop_verify after every code change. If .codeloop/config.json is missing, call codeloop_init_project FIRST.\n\n";
349
/**
 * Synonym table and normalizer behind the Zod preprocessor for
 * `target_type`.
 *
 * Photometry-DB E2E 8 regression: the agent (reasonably) tried
 * `target_type: "windows_desktop"` because the project is a WPF
 * Windows app, and got a hard schema-validation rejection. Same
 * pattern for `mac_desktop`, `linux_desktop`, `web`, `android`,
 * `ios`. Coerce them to the canonical 4-value enum so the agent
 * isn't blocked by a thin naming convention.
 *
 * Keys are lowercase with underscore separators; values are the
 * canonical enum members.
 */
const TARGET_TYPE_SYNONYMS = {
    desktop: "desktop",
    windows_desktop: "desktop",
    win_desktop: "desktop",
    win32_desktop: "desktop",
    mac_desktop: "desktop",
    macos_desktop: "desktop",
    osx_desktop: "desktop",
    linux_desktop: "desktop",
    native_desktop: "desktop",
    browser: "browser",
    web: "browser",
    webapp: "browser",
    chrome: "browser",
    android_emulator: "android_emulator",
    android: "android_emulator",
    ios_simulator: "ios_simulator",
    ios: "ios_simulator",
};
/**
 * Map a caller-supplied `target_type` onto its canonical value.
 *
 * @param {unknown} v Raw value from the tool call.
 * @returns {unknown} The canonical target type when `v` is a known
 *   synonym (case-insensitive, surrounding whitespace ignored, runs of
 *   spaces/hyphens treated as `_`). Otherwise `v` is returned unchanged
 *   so the downstream enum validation reports on the caller's input.
 */
function normalizeTargetType(v) {
    if (typeof v !== "string")
        return v;
    // Fold "windows-desktop" / "Windows Desktop" into the underscore
    // spelling used by the table keys.
    const key = v.toLowerCase().trim().replace(/[\s-]+/g, "_");
    // Own-property check: a bare `TABLE[key]` lookup would also find
    // inherited members such as `constructor` or `__proto__` and return
    // a function/object instead of the caller's string.
    if (Object.prototype.hasOwnProperty.call(TARGET_TYPE_SYNONYMS, key)) {
        return TARGET_TYPE_SYNONYMS[key];
    }
    return v;
}
384
+ const targetTypeSchema = z.preprocess(normalizeTargetType, z.enum(["desktop", "browser", "android_emulator", "ios_simulator"]));
308
385
  // ── Implemented Tools ────────────────────────────────────────────
309
386
  server.tool("codeloop_verify", TOOL_BOOTSTRAP + `Run the CodeLoop verification suite on the current project. Use this tool when:
310
387
  - You have implemented or modified code and need to check if it works correctly
@@ -1093,9 +1170,12 @@ server.tool("codeloop_capture_screenshot", TOOL_BOOTSTRAP + `Capture a screensho
1093
1170
  - You want to capture a specific page/screen of the app for visual analysis
1094
1171
  - You are navigating through the app to capture all pages for complete visual coverage
1095
1172
  - You want to add a screenshot to an existing verification run
1096
- Provide app_name to capture ONLY that app's window (REQUIRED for correct capture). The app is
1097
- automatically brought to the front before capture, and the IDE is restored to the front after.
1098
- Without app_name, captures the full screen which may show the IDE instead of the app.
1173
+ Provide app_name to capture ONLY that app's window. The app is automatically brought to the
1174
+ front before capture, and the IDE is restored to the front after. When app_name is omitted on
1175
+ a desktop-app project (WPF/.NET, native Xcode, Android Gradle host), this tool first falls
1176
+ back to evidence.target_app from .codeloop/config.json; if that's also missing it REFUSES to
1177
+ silently grab the full screen (which would otherwise capture the IDE) and returns an
1178
+ actionable error pointing at the exact config key to set.
1099
1179
  Returns: confirmation + the captured image as an MCP ImageContent block so you can see what was captured.`, {
1100
1180
  screen_name: z.string(),
1101
1181
  app_name: z.string().optional(),
@@ -1116,9 +1196,45 @@ Returns: confirmation + the captured image as an MCP ImageContent block so you c
1116
1196
  const { runDir } = createRunDir(undefined, join(cwd, "artifacts", "runs"));
1117
1197
  screenshotsDir = join(runDir, "screenshots");
1118
1198
  }
1119
- const result = await captureScreenshot(screenshotsDir, params.screen_name, params.app_name);
1199
+ // Mirror codeloop_verify's auto-capture honesty: when the
1200
+ // project is a desktop app (WPF/.NET, native Xcode, Android
1201
+ // Gradle host) and the agent didn't pass an explicit app_name,
1202
+ // fall back to `evidence.target_app` from the project config
1203
+ // and turn on `desktopAppMode` so captureScreenshot refuses
1204
+ // a silent full-screen grab of the IDE. Previously this
1205
+ // tool would happily save a 4K PNG of Cursor whenever the
1206
+ // agent forgot app_name — and the auto-fix loop would then
1207
+ // burn cycles trying to "fix design diffs" against a
1208
+ // screenshot of the editor.
1209
+ const { detectPlatform } = await import("./tools/verify.js");
1210
+ const { loadConfig } = await import("./config.js");
1211
+ const platform = detectPlatform(cwd);
1212
+ const isDesktopAppProject = platform === "dotnet" || platform === "xcode" || platform === "android";
1213
+ const cfg = loadConfig(cwd);
1214
+ const targetApp = params.app_name ?? cfg.evidence?.target_app;
1215
+ const result = await captureScreenshot(screenshotsDir, params.screen_name, targetApp, undefined, { desktopAppMode: isDesktopAppProject });
1216
+ // Photometry-DB E2E 8 follow-on: when we capture a desktop app
1217
+ // window, also resolve its on-screen bounds so the agent can
1218
+ // (a) compute window-relative coords from the returned image
1219
+ // dimensions, and
1220
+ // (b) pass coords:"window" to codeloop_interact to get them
1221
+ // translated to screen-absolute automatically.
1222
+ // Without this, agents reasoned from a downscaled vision view
1223
+ // of the image and clicked tens or hundreds of pixels off the
1224
+ // intended target.
1225
+ let windowBounds = null;
1226
+ if (isDesktopAppProject && targetApp && result.captured) {
1227
+ try {
1228
+ const wm = await import("./runners/window_manager.js");
1229
+ const b = await wm.getWindowBounds(targetApp);
1230
+ if (b && b.width > 0 && b.height > 0) {
1231
+ windowBounds = { x: b.x, y: b.y, width: b.width, height: b.height };
1232
+ }
1233
+ }
1234
+ catch { /* best-effort */ }
1235
+ }
1120
1236
  await trackUsage(apiKey, "visual_review");
1121
- return result;
1237
+ return { ...result, windowBounds };
1122
1238
  }, { tool: "codeloop_capture_screenshot", cwd: (params.project_dir || params.workspace_root || projectDir), input: params });
1123
1239
  if (typeof authResult === "object" && authResult !== null && "error" in authResult) {
1124
1240
  return { content: [{ type: "text", text: JSON.stringify(authResult, null, 2) }] };
@@ -1126,12 +1242,18 @@ Returns: confirmation + the captured image as an MCP ImageContent block so you c
1126
1242
  const result = authResult;
1127
1243
  const content = [];
1128
1244
  if (result.captured && result.paths.length > 0) {
1129
- content.push({ type: "text", text: JSON.stringify({
1130
- captured: true,
1131
- screen_name: params.screen_name,
1132
- path: result.paths[0],
1133
- method: result.method,
1134
- }, null, 2) });
1245
+ const payload = {
1246
+ captured: true,
1247
+ screen_name: params.screen_name,
1248
+ path: result.paths[0],
1249
+ method: result.method,
1250
+ };
1251
+ if (result.windowBounds) {
1252
+ payload.window_bounds = result.windowBounds;
1253
+ payload.coordinate_hint =
1254
+ "This screenshot captures the named window. When you compute click coordinates from the image, pass them to codeloop_interact with `coords: \"window\"` so they're translated to screen-absolute automatically. (Default `coords: \"auto\"` also works when the coord fits inside the window — but `\"window\"` is unambiguous.)";
1255
+ }
1256
+ content.push({ type: "text", text: JSON.stringify(payload, null, 2) });
1135
1257
  const data = readImageAsBase64(result.paths[0]);
1136
1258
  if (data) {
1137
1259
  content.push({ type: "image", data, mimeType: mimeForPath(result.paths[0]) });
@@ -1302,6 +1424,45 @@ After recording, call codeloop_interaction_replay to extract frames and analyze
1302
1424
  content: withInitHint([{ type: "text", text: JSON.stringify(result, null, 2) }]),
1303
1425
  };
1304
1426
  });
1427
server.tool("codeloop_launch_app", TOOL_BOOTSTRAP + `Launch a desktop application by name from a project's build output. Use when:
- You need the app under test to be running before codeloop_start_recording / codeloop_interact.
- The agent doesn't know where the executable lives and shouldn't have to hand-roll Start-Process / open -a.

This tool is the canonical replacement for hand-coding PowerShell Start-Process / osascript / xdg-open
from the agent. For Android / iOS, use codeloop_interact action="launch_app" with package_id instead.

Search order on Windows: publish/**/*.exe → bin/Release/**/*.exe → bin/Debug/**/*.exe (newest first).
On macOS: publish/**/*.app → build/**/*.app → /Applications/<name>.app → open -a.
On Linux: build/**/<name> → bin/**/<name> → dist/**/<name>.

If app_name is omitted, falls back to evidence.target_app from .codeloop/config.json (auto-detected at
init for .NET/Xcode/Android projects via detect-target-app).`, {
    app_name: z.string().optional().describe("Window title / executable name of the app to launch. Defaults to evidence.target_app from .codeloop/config.json. Required if target_app is unset."),
    project_dir: z.string().optional().describe("Absolute path to the project root. Defaults to CODELOOP_PROJECT_DIR / discovered project dir."),
    workspace_root: z.string().optional().describe("[Alias for project_dir]"),
}, async (params) => {
    // Directory the caller named explicitly (either alias); may be undefined.
    const requestedDir = params.project_dir || params.workspace_root;
    // Wrap any JSON-serialisable value as a single MCP text content item.
    const asTextContent = (value) => [{ type: "text", text: JSON.stringify(value, null, 2) }];
    const launchOutcome = await withAuth(async () => {
        const wm = await import("./runners/window_manager.js");
        const { loadConfig } = await import("./config.js");
        // Fall back to the server's startup project dir when none was passed.
        const rootDir = (requestedDir || projectDir);
        const projectConfig = loadConfig(rootDir);
        // An explicit app_name wins; otherwise use the configured target app.
        const resolvedName = params.app_name || projectConfig.evidence?.target_app;
        if (resolvedName) {
            const launchResult = await wm.launchDesktopApp(resolvedName, rootDir);
            return { app_name: resolvedName, ...launchResult };
        }
        // Nothing to launch: report why instead of guessing an executable.
        return {
            launched: false,
            reason: "No app_name provided and evidence.target_app is unset in .codeloop/config.json. Set it (e.g. \"evidence\": { \"target_app\": \"My App\" }) or pass app_name explicitly. For .NET/Xcode/Android projects, codeloop_init_project auto-detects this — re-run init or edit the config by hand.",
        };
    }, { tool: "codeloop_launch_app", cwd: (requestedDir || projectDir), input: params });
    // An object result carrying an `error` key is returned verbatim,
    // without the init-hint / banner decoration.
    const isErrorResult = launchOutcome !== null && typeof launchOutcome === "object" && "error" in launchOutcome;
    if (isErrorResult) {
        return { content: asTextContent(launchOutcome) };
    }
    return {
        content: withInitHint(asTextContent(launchOutcome), requestedDir),
    };
});
1305
1466
  server.tool("codeloop_start_recording", TOOL_BOOTSTRAP + `Start recording the app window in the background. The app is brought to the front automatically
1306
1467
  (un-minimized if needed). Recording continues while you interact with the app. Call codeloop_stop_recording when done.
1307
1468
  This is the PREFERRED recording method because it lets you actively operate the app during capture.
@@ -1328,11 +1489,12 @@ Flow: start_recording → codeloop_interact with ALL app elements → stop_recor
1328
1489
  Supports desktop apps, Android emulator, iOS Simulator, and browser targets.
1329
1490
  Multi-monitor: on macOS, automatically detects which screen the app window is on.
1330
1491
  App logs (stdout, logcat, simctl log) are automatically captured alongside the video.`, {
1331
- app_name: z.string().describe("The name of the app to record (used to find and focus its window)"),
1492
+ app_name: z.string().optional().describe("The name of the app to record (used to find and focus its window). For desktop projects, defaults to evidence.target_app from .codeloop/config.json — set during init via detect-target-app for .NET/Xcode/Android projects, or settable manually."),
1332
1493
  run_id: z.string().optional().describe("Existing run ID to store the video in"),
1333
1494
  max_duration_seconds: z.number().default(120).describe("Safety timeout — recording stops automatically after this many seconds"),
1334
- target_type: z.enum(["desktop", "android_emulator", "ios_simulator", "browser"]).optional()
1495
+ target_type: targetTypeSchema.optional()
1335
1496
  .describe("Capture method. Auto-detected from project if omitted. desktop=ffmpeg screen, android_emulator=adb screenrecord, ios_simulator=simctl recordVideo, browser=ffmpeg/Playwright"),
1497
+ auto_launch: z.boolean().default(true).describe("When target_type=desktop and the app isn't already running, auto-launch it from the project's build output via evidence.target_app. Set false to skip (e.g. when the app is started by another process)."),
1336
1498
  project_dir: z.string().optional().describe("Absolute path to the project root. Defaults to CODELOOP_PROJECT_DIR env var or auto-discovered project directory. MUST be an actual project folder — passing the user's home directory is rejected. If your IDE launches the MCP server from the wrong cwd (common on Windows where Cursor uses C:\\Users\\<name> as cwd), set CODELOOP_PROJECT_DIR or pass this param explicitly."),
1337
1499
  workspace_root: z.string().optional().describe("[Alias for project_dir] Same semantics; accepted because many agents reach for this conventional name. Pass either `project_dir` OR `workspace_root` — they're equivalent."),
1338
1500
  }, async (params) => {
@@ -1340,6 +1502,7 @@ App logs (stdout, logcat, simctl log) are automatically captured alongside the v
1340
1502
  const { startBackgroundRecording } = await import("./runners/video_recorder.js");
1341
1503
  const { createRunDir, getRunDir, getArtifactsBaseDir } = await import("./evidence/artifacts.js");
1342
1504
  const { detectTargetType } = await import("./runners/platform_detect.js");
1505
+ const { loadConfig } = await import("./config.js");
1343
1506
  const cwd = (params.project_dir || params.workspace_root || projectDir);
1344
1507
  let videosDir;
1345
1508
  if (params.run_id) {
@@ -1351,7 +1514,11 @@ App logs (stdout, logcat, simctl log) are automatically captured alongside the v
1351
1514
  videosDir = join(runDir, "videos");
1352
1515
  }
1353
1516
  const targetType = params.target_type || (await detectTargetType(cwd));
1517
+ const cfg = loadConfig(cwd);
1354
1518
  let appName = params.app_name;
1519
+ if (!appName && (targetType === "desktop")) {
1520
+ appName = cfg.evidence?.target_app;
1521
+ }
1355
1522
  if (targetType === "browser") {
1356
1523
  const bi = await import("./runners/browser_interaction.js");
1357
1524
  await bi.ensureBrowserPage();
@@ -1360,7 +1527,42 @@ App logs (stdout, logcat, simctl log) are automatically captured alongside the v
1360
1527
  appName = pwAppName;
1361
1528
  }
1362
1529
  }
1363
- const result = await startBackgroundRecording(videosDir, appName, params.max_duration_seconds, targetType);
1530
+ // Photometry-DB E2E 8: agents spent many turns manually
1531
+ // probing `Get-Process` / `Start-Process` to launch the app
1532
+ // because nothing in CodeLoop did it for them. Now, when
1533
+ // recording a desktop app, we auto-launch from the build
1534
+ // output if the app isn't already running.
1535
+ const wm = await import("./runners/window_manager.js");
1536
+ let autoLaunchSummary;
1537
+ if (targetType === "desktop" && params.auto_launch !== false && appName) {
1538
+ try {
1539
+ const bounds = await wm.getWindowBounds(appName);
1540
+ if (!bounds) {
1541
+ const r = await wm.launchDesktopApp(appName, cwd);
1542
+ autoLaunchSummary = {
1543
+ attempted: true,
1544
+ launched: r.launched,
1545
+ command: r.command,
1546
+ reason: r.reason,
1547
+ };
1548
+ if (r.launched) {
1549
+ // Give the window time to appear / paint before the
1550
+ // recorder starts capturing frames.
1551
+ await new Promise((res) => setTimeout(res, 2000));
1552
+ }
1553
+ }
1554
+ else {
1555
+ autoLaunchSummary = { attempted: false, launched: true, reason: "already running" };
1556
+ }
1557
+ }
1558
+ catch (e) {
1559
+ autoLaunchSummary = { attempted: true, launched: false, reason: e.message };
1560
+ }
1561
+ }
1562
+ const result = await startBackgroundRecording(videosDir, appName ?? "", params.max_duration_seconds, targetType);
1563
+ if (autoLaunchSummary) {
1564
+ result.auto_launch = autoLaunchSummary;
1565
+ }
1364
1566
  await trackUsage(apiKey, "visual_review");
1365
1567
  return result;
1366
1568
  }, { tool: "codeloop_start_recording", cwd: (params.project_dir || params.workspace_root || projectDir), input: params });
@@ -1936,8 +2138,8 @@ MANDATORY for web apps: You MUST type into form fields, fill login/signup forms,
1936
2138
  validation errors, and click submit buttons. Just navigating pages is NOT enough.
1937
2139
  Wait 1-2 seconds between interactions so video frames capture state changes.`, {
1938
2140
  action: z.string().describe("Action to perform: click, double_click, right_click, hover, type, keystroke, hotkey, scroll, drag_drop, long_press, type_and_submit, type_and_tab, fill_form, select_option, toggle, upload_file, navigate_url, navigate_back, navigate_forward, wait, sequence, swipe, back_button, home_button, deep_link, grant_permission, rotate_device, biometric_auth, launch_app, clear_app_data, mock_location, simulate_network, maestro_flow, win_ui_inspect, win_ui_automate"),
1939
- target_type: z.enum(["desktop", "browser", "android_emulator", "ios_simulator"]).optional()
1940
- .describe("Interaction target. Auto-detected if omitted."),
2141
+ target_type: targetTypeSchema.optional()
2142
+ .describe("Interaction target. Auto-detected if omitted. Accepts synonyms: `windows_desktop`/`mac_desktop`/`linux_desktop` → `desktop`; `web` → `browser`; `android` → `android_emulator`; `ios` → `ios_simulator`."),
1941
2143
  x: z.number().optional().describe("X coordinate for click/scroll/drag/swipe"),
1942
2144
  y: z.number().optional().describe("Y coordinate for click/scroll/drag/swipe"),
1943
2145
  x2: z.number().optional().describe("End X for drag_drop/swipe"),
@@ -1968,7 +2170,7 @@ Wait 1-2 seconds between interactions so video frames capture state changes.`, {
1968
2170
  action: z.string(),
1969
2171
  params: z.record(z.unknown()).optional(),
1970
2172
  delay_ms: z.number().optional(),
1971
- })).optional().describe("Steps for sequence action"),
2173
+ }).passthrough()).optional().describe("Steps for sequence action. Accepts BOTH nested form `{ action, params: { x, y, … }, delay_ms? }` and flat form `{ action, x, y, … }` — the flat form is what agents naturally write (mirrors the top-level codeloop_interact shape). Supports inside desktop sequences: click, double_click, right_click, hover, scroll, type, hotkey, keystroke, navigate_url, wait, win_ui_automate."),
1972
2174
  maestro_steps: z.array(z.string()).optional().describe("High-level steps for maestro_flow"),
1973
2175
  automation_action: z.enum(["invoke", "setValue", "toggle", "select", "scroll"]).optional()
1974
2176
  .describe("For win_ui_automate"),
@@ -1978,6 +2180,7 @@ Wait 1-2 seconds between interactions so video frames capture state changes.`, {
1978
2180
  description: z.string().optional().describe("[Alias for intent] Same semantics."),
1979
2181
  purpose: z.string().optional().describe("[Alias for intent] Same semantics."),
1980
2182
  step: z.string().optional().describe("Plan-step name when this interaction is driving a codeloop_plan_user_journey arc (e.g. 'edit', 'delete', 'create', 'save', 'verify'). Logged alongside `intent` and read by the CRUD classifier."),
2183
+ coords: z.enum(["auto", "window", "screen"]).optional().describe("How to interpret x/y for desktop click/double_click/right_click/hover/scroll/drag/long_press. `auto` (default): if `app_name` resolves to a visible window AND (x, y) fits inside the window's client area, treat as window-relative and auto-offset by the window origin; otherwise leave as raw screen-absolute coords. `window`: ALWAYS add the window origin offset (errors if the window isn't found). `screen`: ALWAYS pass through (legacy behaviour, matches CGEvent / user32.dll / xdotool semantics). Fixes the Photometry-DB E2E 8 failure mode where the agent captured a 1600×900 window screenshot, computed click coords against the image, and missed the sidebar because the window's actual top-left was (286, 286) on a 5120×1440 screen."),
1981
2184
  project_dir: z.string().optional().describe("Absolute path to project root."),
1982
2185
  workspace_root: z.string().optional().describe("[Alias for project_dir] Pass either; they're equivalent."),
1983
2186
  }, async (params) => {
@@ -2007,13 +2210,58 @@ Wait 1-2 seconds between interactions so video frames capture state changes.`, {
2007
2210
  await bi.ensureBrowserPage();
2008
2211
  }
2009
2212
  // Bring the app to front before desktop interactions (non-browser, non-mobile).
2213
+ let windowOriginOffset = null;
2010
2214
  if (tt === "desktop") {
2011
2215
  const appName = params.app_name || vr.getActiveRecordingAppName();
2012
2216
  if (appName && action !== "wait") {
2013
2217
  await wm.bringAppToFront(appName);
2014
2218
  await new Promise(r => setTimeout(r, 300));
2219
+ // Photometry-DB E2E 8: agents commonly compute click coords
2220
+ // from a window-cropped screenshot (which is what
2221
+ // codeloop_capture_screenshot returns when `app_name` is
2222
+ // set), then pass those coords to codeloop_interact —
2223
+ // which expects raw SCREEN coordinates. On a multi-monitor
2224
+ // / DPI-scaled setup that mismatch silently dropped clicks
2225
+ // 100s of pixels off-target. When `coords` is `auto` (the
2226
+ // default) we look up the window's actual screen origin
2227
+ // and add it to x/y, but ONLY if (x, y) fits inside the
2228
+ // window — that keeps legacy callers passing raw screen
2229
+ // coords working unchanged. `coords: "window"` forces the
2230
+ // offset; `coords: "screen"` opts out.
2231
+ const coordsMode = params.coords ?? "auto";
2232
+ if (coordsMode !== "screen") {
2233
+ try {
2234
+ const b = await wm.getWindowBounds(appName);
2235
+ if (b && b.width > 0 && b.height > 0) {
2236
+ windowOriginOffset = { dx: b.x, dy: b.y, width: b.width, height: b.height };
2237
+ }
2238
+ }
2239
+ catch { /* best-effort */ }
2240
+ }
2015
2241
  }
2016
2242
  }
2243
// Helper used by every coordinate-driven desktop action below.
//
// Translates caller-supplied (x, y) into the coordinates handed to the
// window-manager click/hover/scroll/drag primitives, according to
// `params.coords`:
//   - "screen": pass through untouched.
//   - "window": always add the window origin; throws when no window was
//               resolved, as the `coords` schema description promises,
//               rather than silently clicking at raw coordinates.
//   - "auto"  : (default) add the window origin only when (x, y) lies
//               inside the window; otherwise pass through.
// Non-desktop targets and missing coordinates are returned unchanged.
const translateXY = (x, y) => {
    if (tt !== "desktop" || x == null || y == null) {
        return { x, y };
    }
    const mode = params.coords ?? "auto";
    if (mode === "screen")
        return { x, y };
    if (!windowOriginOffset) {
        if (mode === "window") {
            throw new Error("coords: \"window\" was requested but no visible window could be resolved for the target app. Pass app_name for a running app, or use coords: \"screen\" with screen-absolute coordinates.");
        }
        // auto: without known bounds there is nothing to offset by.
        return { x, y };
    }
    const shifted = { x: x + windowOriginOffset.dx, y: y + windowOriginOffset.dy };
    if (mode === "window") {
        return shifted;
    }
    // auto: if (x, y) fits inside the window, assume the agent computed
    // against a window-cropped screenshot and add the origin. Otherwise
    // pass through (likely a raw screen coord from a manual workflow).
    // Bounds are half-open: a window `width` px wide spans x = 0..width-1,
    // so x === width is already one pixel outside it.
    const inside = x >= 0 && x < windowOriginOffset.width &&
        y >= 0 && y < windowOriginOffset.height;
    return inside ? shifted : { x, y };
};
2017
2265
  switch (action) {
2018
2266
  case "click":
2019
2267
  if (tt === "browser" && params.selector) {
@@ -2035,7 +2283,8 @@ Wait 1-2 seconds between interactions so video frames capture state changes.`, {
2035
2283
  }
2036
2284
  }
2037
2285
  else if (params.x != null && params.y != null) {
2038
- success = await wm.clickAtPosition(params.x, params.y);
2286
+ const t = translateXY(params.x, params.y);
2287
+ success = await wm.clickAtPosition(t.x, t.y);
2039
2288
  }
2040
2289
  detail = `click at ${params.selector || `(${params.x},${params.y})`}`;
2041
2290
  break;
@@ -2044,7 +2293,8 @@ Wait 1-2 seconds between interactions so video frames capture state changes.`, {
2044
2293
  success = await bi.browserDoubleClick(params.selector);
2045
2294
  }
2046
2295
  else if (params.x != null && params.y != null) {
2047
- success = await wm.doubleClickAtPosition(params.x, params.y);
2296
+ const t = translateXY(params.x, params.y);
2297
+ success = await wm.doubleClickAtPosition(t.x, t.y);
2048
2298
  }
2049
2299
  detail = `double_click at ${params.selector || `(${params.x},${params.y})`}`;
2050
2300
  break;
@@ -2053,7 +2303,8 @@ Wait 1-2 seconds between interactions so video frames capture state changes.`, {
2053
2303
  success = await bi.browserRightClick(params.selector);
2054
2304
  }
2055
2305
  else if (params.x != null && params.y != null) {
2056
- success = await wm.rightClickAtPosition(params.x, params.y);
2306
+ const t = translateXY(params.x, params.y);
2307
+ success = await wm.rightClickAtPosition(t.x, t.y);
2057
2308
  }
2058
2309
  detail = `right_click at ${params.selector || `(${params.x},${params.y})`}`;
2059
2310
  break;
@@ -2062,7 +2313,8 @@ Wait 1-2 seconds between interactions so video frames capture state changes.`, {
2062
2313
  success = await bi.browserHover(params.selector);
2063
2314
  }
2064
2315
  else if (params.x != null && params.y != null) {
2065
- success = await wm.hoverAtPosition(params.x, params.y);
2316
+ const t = translateXY(params.x, params.y);
2317
+ success = await wm.hoverAtPosition(t.x, t.y);
2066
2318
  }
2067
2319
  detail = `hover at ${params.selector || `(${params.x},${params.y})`}`;
2068
2320
  break;
@@ -2147,7 +2399,8 @@ Wait 1-2 seconds between interactions so video frames capture state changes.`, {
2147
2399
  success = await wm.simctlSwipe(sx, sy, ex, ey);
2148
2400
  }
2149
2401
  else {
2150
- success = await wm.scrollAtPosition(params.x || 500, params.y || 400, params.direction || "down", params.amount || 3);
2402
+ const t = translateXY(params.x || 500, params.y || 400);
2403
+ success = await wm.scrollAtPosition(t.x, t.y, params.direction || "down", params.amount || 3);
2151
2404
  }
2152
2405
  detail = `scroll ${params.direction || "down"}`;
2153
2406
  break;
@@ -2160,7 +2413,9 @@ Wait 1-2 seconds between interactions so video frames capture state changes.`, {
2160
2413
  success = await wm.adbSwipe(params.x, params.y, params.x2, params.y2, params.duration_ms || 500);
2161
2414
  }
2162
2415
  else {
2163
- success = await wm.dragDrop(params.x, params.y, params.x2, params.y2, params.duration_ms || 500);
2416
+ const a = translateXY(params.x, params.y);
2417
+ const b = translateXY(params.x2, params.y2);
2418
+ success = await wm.dragDrop(a.x, a.y, b.x, b.y, params.duration_ms || 500);
2164
2419
  }
2165
2420
  }
2166
2421
  detail = `drag_drop`;
@@ -2170,7 +2425,8 @@ Wait 1-2 seconds between interactions so video frames capture state changes.`, {
2170
2425
  success = await wm.adbLongPress(params.x, params.y, params.duration_ms || 1000);
2171
2426
  }
2172
2427
  else if (params.x != null && params.y != null) {
2173
- success = await wm.longPressAtPosition(params.x, params.y, params.duration_ms || 1000);
2428
+ const t = translateXY(params.x, params.y);
2429
+ success = await wm.longPressAtPosition(t.x, t.y, params.duration_ms || 1000);
2174
2430
  }
2175
2431
  detail = `long_press at (${params.x},${params.y})`;
2176
2432
  break;
@@ -2352,11 +2608,37 @@ Wait 1-2 seconds between interactions so video frames capture state changes.`, {
2352
2608
  if (tt === "android_emulator" && params.package_id) {
2353
2609
  const r = await import("./runners/base.js").then(m => m.runCommand("adb", ["shell", "am", "start", "-n", params.package_id], process.cwd()));
2354
2610
  success = r.exit_code === 0;
2611
+ detail = `launch_app "${params.package_id}"`;
2355
2612
  }
2356
2613
  else if (tt === "ios_simulator" && params.package_id) {
2357
2614
  success = await wm.simctlLaunch(params.package_id);
2615
+ detail = `launch_app "${params.package_id}"`;
2616
+ }
2617
+ else if (tt === "desktop") {
2618
+ // Photometry-DB E2E 8: desktop launch was completely
2619
+ // missing — agents had to hand-roll PowerShell
2620
+ // Start-Process / `open -a` calls. Now resolves via
2621
+ // evidence.target_app from the project config when
2622
+ // app_name is omitted.
2623
+ const { loadConfig } = await import("./config.js");
2624
+ const cfg = loadConfig(cwd);
2625
+ const appName = params.app_name || cfg.evidence?.target_app;
2626
+ if (!appName) {
2627
+ success = false;
2628
+ detail = "launch_app desktop: no app_name provided and evidence.target_app is unset in .codeloop/config.json. Set it (e.g. \"target_app\": \"Photometry DB\") or pass app_name explicitly.";
2629
+ }
2630
+ else {
2631
+ const r = await wm.launchDesktopApp(appName, cwd);
2632
+ success = r.launched;
2633
+ detail = r.launched
2634
+ ? `launch_app "${appName}" via ${r.command}${r.pid ? ` (pid ${r.pid})` : ""}`
2635
+ : `launch_app "${appName}" failed: ${r.reason || "unknown error"}`;
2636
+ }
2637
+ }
2638
+ else {
2639
+ success = false;
2640
+ detail = `launch_app "${params.package_id || params.app_name || ""}": target ${tt} not supported in this action`;
2358
2641
  }
2359
- detail = `launch_app "${params.package_id}"`;
2360
2642
  break;
2361
2643
  case "clear_app_data":
2362
2644
  if (tt === "android_emulator" && params.package_id) {
@@ -2420,7 +2702,22 @@ Wait 1-2 seconds between interactions so video frames capture state changes.`, {
2420
2702
  if (step.delay_ms)
2421
2703
  await new Promise(r => setTimeout(r, step.delay_ms));
2422
2704
  const stepAction = step.action;
2423
- const sp = (step.params || {});
2705
+ // Photometry-DB E2E 8: agents naturally write steps in
2706
+ // FLAT form `{ action, x, y, ms, … }` because that
2707
+ // mirrors the top-level codeloop_interact shape. The
2708
+ // schema documented the NESTED form `{ action,
2709
+ // params: { … } }`. Now we accept both: prefer
2710
+ // `step.params` if present, otherwise fall back to the
2711
+ // step object itself minus the wrapper keys.
2712
+ const stepObj = step;
2713
+ const nested = (step.params || {});
2714
+ const sp = Object.keys(nested).length > 0
2715
+ ? nested
2716
+ : Object.fromEntries(Object.entries(stepObj).filter(([k]) => k !== "action" && k !== "params" && k !== "delay_ms"));
2717
+ // Convenient aliases: agents wrote `ms` for wait
2718
+ // duration in the log; accept that as `duration_ms`.
2719
+ if (sp.ms != null && sp.duration_ms == null)
2720
+ sp.duration_ms = sp.ms;
2424
2721
  let stepOk = false;
2425
2722
  let stepReason;
2426
2723
  try {
@@ -2428,7 +2725,20 @@ Wait 1-2 seconds between interactions so video frames capture state changes.`, {
2428
2725
  stepOk = await bi.browserClick(sp.selector);
2429
2726
  }
2430
2727
  else if (stepAction === "click" && sp.x != null && sp.y != null) {
2431
- stepOk = await wm.clickAtPosition(sp.x, sp.y);
2728
+ const t = translateXY(sp.x, sp.y);
2729
+ stepOk = await wm.clickAtPosition(t.x, t.y);
2730
+ }
2731
+ else if (stepAction === "double_click" && tt !== "browser" && sp.x != null && sp.y != null) {
2732
+ const t = translateXY(sp.x, sp.y);
2733
+ stepOk = await wm.doubleClickAtPosition(t.x, t.y);
2734
+ }
2735
+ else if (stepAction === "right_click" && tt !== "browser" && sp.x != null && sp.y != null) {
2736
+ const t = translateXY(sp.x, sp.y);
2737
+ stepOk = await wm.rightClickAtPosition(t.x, t.y);
2738
+ }
2739
+ else if (stepAction === "hover" && tt !== "browser" && sp.x != null && sp.y != null) {
2740
+ const t = translateXY(sp.x, sp.y);
2741
+ stepOk = await wm.hoverAtPosition(t.x, t.y);
2432
2742
  }
2433
2743
  else if (stepAction === "type" && tt === "browser" && sp.selector && sp.text) {
2434
2744
  stepOk = await bi.browserType(sp.selector, sp.text);
@@ -2464,9 +2774,13 @@ Wait 1-2 seconds between interactions so video frames capture state changes.`, {
2464
2774
  stepOk = tt === "browser" ? await bi.browserKeystroke(sp.key) : await wm.sendKeyByName(sp.key);
2465
2775
  }
2466
2776
  else if (stepAction === "scroll") {
2467
- stepOk = tt === "browser"
2468
- ? await bi.browserScroll(sp.direction || "down", sp.amount || 300)
2469
- : await wm.scrollAtPosition(sp.x || 500, sp.y || 400, sp.direction || "down", sp.amount || 3);
2777
+ if (tt === "browser") {
2778
+ stepOk = await bi.browserScroll(sp.direction || "down", sp.amount || 300);
2779
+ }
2780
+ else {
2781
+ const t = translateXY(sp.x || 500, sp.y || 400);
2782
+ stepOk = await wm.scrollAtPosition(t.x, t.y, sp.direction || "down", sp.amount || 3);
2783
+ }
2470
2784
  }
2471
2785
  else if (stepAction === "wait") {
2472
2786
  await new Promise(r => setTimeout(r, sp.duration_ms || 1000));