npm - onbuzz - Versions diffs - 4.9.13 → 4.10.0 - Mend

onbuzz 4.9.13 → 4.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (451) hide show

package/src/tools/desktop/DesktopTool.js CHANGED Viewed

@@ -1,638 +1,629 @@
-/**
- * DesktopTool — agent-facing desktop-control surface.
- *
- * Pairs the OS controller (mouse / keyboard / screenshot / windows) with
- * a pluggable visual-grounding model (default: Kimi K2.6) so an agent
- * can say "click the orange Save button" and the tool figures out
- * `(x, y)` from a fresh screenshot before the click.
- *
- * Five guardrails are layered into every execute() call:
- *
- *   1. CAPABILITY GATE     - osController refuses ops the OS can't do
- *                            (e.g. mouseInput on Wayland) and surfaces
- *                            an operator-friendly remediation message.
- *
- *   2. PER-AGENT ALLOWLIST - toolConfig.desktop.allowedActions lets an
- *                            operator enable any subset of actions per
- *                            agent (e.g. read-only screenshot bot vs.
- *                            full control). Default: ALL DISABLED — the
- *                            agent has to be explicitly granted control.
- *
- *   3. KILL SWITCH         - LOXIA_DESKTOP_TOOL_DISABLED=1 env flag
- *                            short-circuits every call with a clear
- *                            error. Zero-source kill in production.
- *
- *   4. AUDIT TRAIL         - every action emits a structured log line
- *                            with operation id + grounded coords so
- *                            screen-recording-free postmortems work.
- *
- *   5. INTENT GROUNDING    - "click" / "scroll" / "drag" accept either
- *                            raw (x, y) OR a natural-language `intent`.
- *                            With intent, the tool grabs a screenshot,
- *                            asks the grounding model, validates the
- *                            returned coords fit the screen, and
- *                            ONLY THEN moves. Failed grounding never
- *                            silently clicks somewhere random.
- *
- * Model is swappable via toolConfig.groundingModelId (any id registered
- * in src/services/grounding/registry.js). Backend proxy vs. direct
- * Foundry is also config — see _buildAdapter below.
- */
-import { writeFile, mkdir } from 'node:fs/promises';
-import { join } from 'node:path';
-import { tmpdir } from 'node:os';
-import { BaseTool } from '../baseTool.js';
-import { createOSController, OSError } from './osController.js';
-import {
-  createGroundingModel,
-  ModelId,
-  DEFAULT_REASONING_EFFORT,
-} from '../../services/grounding/index.js';
-/** Every action the tool can perform. */
-export const DesktopAction = Object.freeze({
-  SCREENSHOT:     'screenshot',
-  CLICK:          'click',
-  TYPE:           'type',
-  KEY_PRESS:      'key_press',
-  SCROLL:         'scroll',
-  DRAG:           'drag',
-  LIST_WINDOWS:   'list_windows',
-  FOCUS_WINDOW:   'focus_window',
-  DESCRIBE_CAPS:  'describe_capabilities',
-});
-const ACTIONS_REQUIRING_INPUT       = new Set([
-  DesktopAction.CLICK, DesktopAction.TYPE, DesktopAction.KEY_PRESS,
-  DesktopAction.SCROLL, DesktopAction.DRAG, DesktopAction.FOCUS_WINDOW,
-]);
-const ACTIONS_THAT_CAN_USE_INTENT   = new Set([
-  DesktopAction.CLICK, DesktopAction.SCROLL, DesktopAction.DRAG,
-]);
-/** Env-level kill switch — checked on every call. */
-const ENV_KILL_FLAG = 'LOXIA_DESKTOP_TOOL_DISABLED';
-/** Defaults — overridable via per-agent toolConfig.desktop or this.config. */
-const Defaults = Object.freeze({
-  /** Default actions an agent gets WHEN allowedActions is unset.
-   *  Empty list = no control. Operator must opt in per agent. */
-  ALLOWED_ACTIONS:        [],
-  /** Visual-grounding model (registry id). */
-  GROUNDING_MODEL:        ModelId.KIMI_K2_6,
-  /** Grounding reasoning effort. */
-  GROUNDING_EFFORT:       DEFAULT_REASONING_EFFORT,
-  /** Hard cap on a single grounded coord lookup. Catches hangs. */
-  GROUNDING_TIMEOUT_MS:   180_000,
-  /** Refuse grounded coords closer than this many px to any screen
-   *  edge (sanity check against bogus grounding outputs). */
-  COORD_MARGIN_PX:        0,
-});
-export class DesktopTool extends BaseTool {
-  constructor(config = {}, logger = null) {
-    super(config, logger);
-    this.id = 'desktop';
-    this.name = 'Desktop Control';
-    this.version = '1.0.0';
-    this.requiresProject = false;
-    this.isAsync = true;
-    // Modest builtin delay so the OS has time to repaint between
-    // an action and the agent's next screenshot.
-    this.builtinDelay = 150;
-    // Injected at registry time. The tool needs:
-    //   - aiService: source of baseUrl + apiKey for proxy-mode grounding
-    //   - osController: defaults to a fresh createOSController() instance
-    //                   but tests inject a stub.
-    this.aiService    = null;
-    this.osController = config.osController || null;
-    this._adapterCache = null;       // lazy — built on first ground()
-  }
-  /** Called by index.js after construction. */
-  setAIService(aiService) {
-    this.aiService = aiService;
-  }
-  /** Static factory for tests / explicit DI. */
-  static withDependencies({ aiService, osController, config = {}, logger = null }) {
-    const tool = new DesktopTool({ ...config, osController }, logger);
-    if (aiService) tool.setAIService(aiService);
-    return tool;
-  }
-  // ─── BaseTool surface ─────────────────────────────────────────────
-  getDescription() {
-    // NOTE: this string lands in the agent's system prompt every turn.
-    // Keep it tight, action-oriented, and free of meta-commentary like
-    // "this is beta" / "we're missing safeguards" — agents read those
-    // disclaimers as instructions to act timidly and the user's UX
-    // suffers. Operator-facing beta indicators live in the web-UI
-    // configurator + tool-selector pills, not here.
-    return `
-Desktop Control Tool: drive the user's keyboard, mouse, screen, and
-windows like a human. Combines OS-level input with visual grounding
-("click the orange Save button") via a vision LLM.
-USAGE:
-  {
-    "toolId": "desktop",
-    "parameters": {
-      "action": "click",
-      "intent": "the orange Save button"
-    }
-  }
-PREFERRED WORKFLOW — use intent-driven actions, NOT raw screenshot:
-For "click X" tasks, do NOT call screenshot first and then try to read
-the image yourself. You can't see screenshot results in plain text. The
-intent-driven actions screenshot + ground + act in one step:
-  { "toolId": "desktop", "parameters": {
-      "action": "click", "intent": "the Netflix icon in the taskbar" }}
-That triggers: screenshot → grounding model finds the coords → click.
-You only need to look at the screenshot yourself if grounding fails
-(rare on legible UIs).
-ACTIONS:
-  - click  (intent OR x,y)      single/double click — use intent first
-  - type   (text)               keyboard input into focused window
-  - key_press (keys[])          chord like ["Control","S"]
-  - scroll (intent OR x,y, dy)  positive dy scrolls down
-  - drag   (fromIntent/toIntent OR from/to coords)
-  - list_windows                titles + bounds of open windows
-  - focus_window (titleMatch)   bring matching window to front
-  - screenshot                  capture screen to disk (returns file
-                                path; you cannot read raw PNG bytes —
-                                use the vision tool on the path if you
-                                must inspect, or prefer intent actions)
-  - describe_capabilities       OS + display server + permission state
-INTENT FAILURE: if grounding can't find the target, you'll get a
-GROUNDING_FAILED or COORDS_OUT_OF_BOUNDS error with the model's raw
-answer. Sharpen the intent ("the red Save button in the toolbar")
-and retry.
-PERMISSION MODEL:
-  - Every action is OFF by default per-agent until the operator adds
-    it to toolConfig.desktop.allowedActions in the agent's config.
-  - LOXIA_DESKTOP_TOOL_DISABLED=1 is a global kill switch (env).
-  - On Linux Wayland, input actions are blocked by the OS; only
-    screenshot + list_windows work. The tool surfaces a clear
-    "Wayland blocks input injection" error for the rest.
-OS NOTES:
-  - macOS: needs Accessibility + Screen Recording permissions granted
-    to the Loxia process. Without them you'll see PERMISSION_DENIED.
-  - First call lazily loads @nut-tree-fork/nut-js (optional dep, ~30MB
-    native binary). If not installed, every action fails with
-    NATIVE_UNAVAILABLE — install it or run on a different machine.
-    `.trim();
-  }
-  parseParameters(content) {
-    try {
-      const trimmed = (content || '').trim();
-      if (trimmed.startsWith('{')) {
-        const parsed = JSON.parse(trimmed);
-        return parsed.parameters || parsed;
-      }
-      // No XML alternate form — desktop actions are too varied to
-      // hand-author. JSON-only keeps the surface honest.
-      throw new Error('desktop tool requires JSON parameters');
-    } catch (err) {
-      throw new Error(`Failed to parse desktop parameters: ${err.message}`);
-    }
-  }
-  getSupportedActions() {
-    return Object.values(DesktopAction);
-  }
-  getRequiredParameters() {
-    return ['action'];
-  }
-  // ─── execute ──────────────────────────────────────────────────────
-  async execute(params, context) {
-    // 1. Kill switch
-    if (process.env[ENV_KILL_FLAG] === '1') {
-      return this._fail('DESKTOP_DISABLED',
-        'Desktop tool disabled via LOXIA_DESKTOP_TOOL_DISABLED.');
-    }
-    const action = params?.action;
-    if (!action || !Object.values(DesktopAction).includes(action)) {
-      return this._fail('INVALID_ACTION',
-        `unknown action "${action}". Valid: ${Object.values(DesktopAction).join(', ')}`);
-    }
-    // 2. Per-agent allowlist
-    const effective = this.getEffectiveConfig(context, Defaults);
-    const allowed = effective.allowedActions || Defaults.ALLOWED_ACTIONS;
-    if (!Array.isArray(allowed) || !allowed.includes(action)) {
-      return this._fail('NOT_PERMITTED',
-        `action "${action}" not in this agent's allowedActions ` +
-        `(set toolConfig.desktop.allowedActions to enable).`);
-    }
-    // 3. Dispatch
-    const osc = this._osc();
-    try {
-      switch (action) {
-        case DesktopAction.SCREENSHOT:    return await this._actScreenshot(osc, params);
-        case DesktopAction.CLICK:         return await this._actClick(osc, effective, params, context);
-        case DesktopAction.TYPE:          return await this._actType(osc, params);
-        case DesktopAction.KEY_PRESS:     return await this._actKeyPress(osc, params);
-        case DesktopAction.SCROLL:        return await this._actScroll(osc, effective, params, context);
-        case DesktopAction.DRAG:          return await this._actDrag(osc, effective, params, context);
-        case DesktopAction.LIST_WINDOWS:  return await this._actListWindows(osc);
-        case DesktopAction.FOCUS_WINDOW:  return await this._actFocusWindow(osc, params);
-        case DesktopAction.DESCRIBE_CAPS: return await this._actDescribeCaps(osc);
-      }
-    } catch (err) {
-      return this._fail(err.code || 'OP_FAILED', err.message, action);
-    }
-  }
-  // ─── actions ──────────────────────────────────────────────────────
-  async _actScreenshot(osc, params) {
-    const png = await osc.screenshot({ region: params.region });
-    const size = await osc.screenSize();
-    // Tag the action in the output. Without this, a screenshot success
-    // result is indistinguishable from a click/scroll/drag SUCCESS when
-    // the agent reads the tool result text — and out-of-order batched
-    // returns can land a stale screenshot success right where a click
-    // success would normally appear. Putting "[action: screenshot]" up
-    // front makes the action explicit so the model can't mistake it for
-    // a click confirmation.
-    // Save to a temp file rather than inlining base64 into the tool
-    // result. A 200 KB PNG becomes ~290 KB of base64 — stuffing that
-    // into the conversation as a text tool-result derails the next
-    // model turn (it sees a wall of characters, not an image, and
-    // loses the original task; streaming often aborts mid-scan).
-    // Returning a path lets vision-capable downstream tools open it
-    // properly, and keeps the conversation token count sane.
-    const filePath = await this._saveScreenshot(png);
-    this._audit('screenshot', { size, bytes: png.length, filePath });
-    return {
-      success: true,
-      action: 'screenshot',
-      output:
-        `[action: screenshot] Captured ${size.width}x${size.height} screenshot ` +
-        `(${png.length} bytes) → ${filePath}\n` +
-        `NOTE: a successful screenshot does NOT mean an earlier click/scroll/drag ` +
-        `succeeded — those have separate results tagged "[action: click]" etc. ` +
-        `Look at the most recent action-tagged result for the action you actually called.\n` +
-        `To inspect the image use the vision tool with this path, or just call ` +
-        `click/scroll/drag with an "intent" (one-step: screenshot + ground + act).`,
-      screenshotPath: filePath,
-      bytes: png.length,
-      size,
-    };
-  }
-  /**
-   * Persist a captured PNG to disk. We use the OS temp dir under a
-   * stable subfolder so old shots are easy to clean up by hand, and
-   * a Date.now()-based filename so captures taken in different
-   * milliseconds don't collide. Returning the file path (not the
-   * bytes) keeps the conversation text-size sane — see _actScreenshot.
-   */
-  async _saveScreenshot(png) {
-    const dir = join(tmpdir(), 'loxia-desktop-screenshots');
-    await mkdir(dir, { recursive: true });
-    const filePath = join(dir, `screenshot-${Date.now()}.png`);
-    await writeFile(filePath, png);
-    return filePath;
-  }
-  async _actClick(osc, effective, params, context) {
-    const { x, y, groundedFrom } = await this._resolveCoords(osc, effective, params, context);
-    await osc.mouseClick(x, y, { button: params.button, count: params.count });
-    this._audit('click', { x, y, button: params.button || 'left', count: params.count || 1, groundedFrom });
-    return {
-      success: true,
-      action: 'click',
-      output: `[action: click] Clicked at (${x}, ${y})${groundedFrom ? ` — grounded from "${groundedFrom}"` : ''}.`,
-      coords: { x, y },
-      groundedFrom,
-    };
-  }
-  async _actType(osc, params) {
-    if (typeof params.text !== 'string' || params.text.length === 0) {
-      return this._fail('INVALID_INPUT', 'type action requires non-empty "text"', 'type');
-    }
-    await osc.typeText(params.text, { delayMs: params.delayMs });
-    this._audit('type', { chars: params.text.length });
-    return {
-      success: true,
-      action: 'type',
-      output: `[action: type] Typed ${params.text.length} characters.`,
-    };
-  }
-  async _actKeyPress(osc, params) {
-    if (!Array.isArray(params.keys) || params.keys.length === 0) {
-      return this._fail('INVALID_INPUT', 'key_press requires non-empty "keys" array', 'key_press');
-    }
-    await osc.keyPress(params.keys);
-    this._audit('key_press', { keys: params.keys });
-    return {
-      success: true,
-      action: 'key_press',
-      output: `[action: key_press] Pressed ${params.keys.join('+')}.`,
-    };
-  }
-  async _actScroll(osc, effective, params, context) {
-    if (!Number.isFinite(params.dy)) {
-      return this._fail('INVALID_INPUT', 'scroll requires numeric "dy"', 'scroll');
-    }
-    const { x, y, groundedFrom } = await this._resolveCoords(osc, effective, params, context);
-    await osc.mouseScroll(x, y, params.dy);
-    this._audit('scroll', { x, y, dy: params.dy, groundedFrom });
-    return {
-      success: true,
-      action: 'scroll',
-      output: `[action: scroll] Scrolled ${params.dy > 0 ? 'down' : 'up'} ${Math.abs(params.dy)} at (${x}, ${y}).`,
-      coords: { x, y },
-      groundedFrom,
-    };
-  }
-  async _actDrag(osc, effective, params, context) {
-    // Drag accepts either two coord pairs OR two intents. Each end is
-    // grounded with its own model call, but both share the SAME
-    // screenshot — this avoids a second screen capture.
-    const from = await this._resolveCoords(
-      osc, effective,
-      { x: params.from?.x, y: params.from?.y, intent: params.fromIntent },
-      context,
-    );
-    const to = await this._resolveCoords(
-      osc, effective,
-      { x: params.to?.x, y: params.to?.y, intent: params.toIntent },
-      context,
-      from._sharedScreenshot,
-    );
-    await osc.mouseDrag({ x: from.x, y: from.y }, { x: to.x, y: to.y }, { button: params.button });
-    this._audit('drag', { from, to });
-    return {
-      success: true,
-      action: 'drag',
-      output: `[action: drag] Dragged from (${from.x}, ${from.y}) to (${to.x}, ${to.y}).`,
-      from: { x: from.x, y: from.y },
-      to:   { x: to.x,   y: to.y },
-    };
-  }
-  async _actListWindows(osc) {
-    const raw = await osc.listWindows();
-    // Most platforms (Windows especially) report hundreds of OS-internal
-    // handles with empty titles. Surfacing the full list bloats the
-    // conversation context (>200 KB observed on Win11 with ~1300
-    // handles) and the model loses the original task in the noise.
-    // Filter to titled windows and cap to a sensible top-N. Total count
-    // stays in the output so the agent knows truncation happened.
-    const titled = raw.filter(w => w.title && w.title.trim().length > 0);
-    const MAX_LIST = 50;
-    const top = titled.slice(0, MAX_LIST);
-    const truncated = titled.length > MAX_LIST;
-    this._audit('list_windows', { total: raw.length, titled: titled.length, returned: top.length });
-    return {
-      success: true,
-      action: 'list_windows',
-      output:
-        `[action: list_windows] ${titled.length} titled window(s)` +
-        (raw.length !== titled.length ? ` (filtered ${raw.length - titled.length} untitled handles)` : '') +
-        (truncated ? ` — showing first ${MAX_LIST}` : '') + ': ' +
-        top.map(w => `"${w.title}"`).join(', '),
-      windows: top,
-      totalCount: raw.length,
-      titledCount: titled.length,
-      truncated,
-    };
-  }
-  async _actFocusWindow(osc, params) {
-    if (!params.titleMatch) {
-      return this._fail('INVALID_INPUT', 'focus_window requires "titleMatch"', 'focus_window');
-    }
-    const r = await osc.focusWindow({ titleMatch: params.titleMatch });
-    this._audit('focus_window', r);
-    return {
-      success: r.focused,
-      action: 'focus_window',
-      output: r.focused
-        ? `[action: focus_window] Focused window: "${r.title}".`
-        : `[action: focus_window] No window matched "${params.titleMatch}".`,
-      ...r,
-    };
-  }
-  async _actDescribeCaps(osc) {
-    const caps = await osc.describeCapabilities();
-    return {
-      success: true,
-      action: 'describe_capabilities',
-      output: `[action: describe_capabilities] OS: ${caps.os}${caps.display ? ` / ${caps.display}` : ''}; ` +
-              `screenshot=${caps.screenshot} mouseInput=${caps.mouseInput} ` +
-              `keyboardInput=${caps.keyboardInput} windowFocus=${caps.windowFocus}` +
-              (caps.degradedReason ? `\nDegraded: ${caps.degradedReason}` : ''),
-      capabilities: caps,
-    };
-  }
-  // ─── helpers ──────────────────────────────────────────────────────
-  /**
-   * Resolve (x, y) for an action. Three input shapes:
-   *
-   *   1. {x, y}              — used as-is after validation
-   *   2. {intent: '...'}     — ask the grounding model
-   *   3. neither             — INVALID_INPUT
-   *
-   * When grounding fires, the screenshot is returned on the result so
-   * the caller (drag) can reuse it for the second coord without an
-   * extra capture.
-   */
-  async _resolveCoords(osc, effective, params, context, reuseScreenshot = null) {
-    if (Number.isFinite(params.x) && Number.isFinite(params.y)) {
-      return { x: params.x, y: params.y, groundedFrom: null };
-    }
-    if (typeof params.intent === 'string' && params.intent.trim()) {
-      const size = await osc.screenSize();
-      const screenshot = reuseScreenshot || await osc.screenshot();
-      const adapter = await this._adapter(effective, context);
-      const result = await this._groundWithFallback(adapter, {
-        screenshot,
-        intent: params.intent,
-        imageSize: { width: size.width, height: size.height },
-        knobs: { reasoning_effort: effective.groundingEffort || Defaults.GROUNDING_EFFORT },
-        timeoutMs: effective.groundingTimeoutMs || Defaults.GROUNDING_TIMEOUT_MS,
-      }, effective, context);
-      if (!result.coords) {
-        const err = new Error(`grounding produced no coords; model said: "${(result.answer || '').slice(0, 200)}"`);
-        err.code = 'GROUNDING_FAILED';
-        throw err;
-      }
-      const { x, y } = result.coords;
-      if (!this._coordsInScreen(x, y, size, effective.coordMarginPx ?? Defaults.COORD_MARGIN_PX)) {
-        const err = new Error(`grounded coords (${x}, ${y}) lie outside the ${size.width}x${size.height} screen`);
-        err.code = 'COORDS_OUT_OF_BOUNDS';
-        throw err;
-      }
-      return { x, y, groundedFrom: params.intent, _sharedScreenshot: screenshot };
-    }
-    const err = new Error('action requires either (x, y) or "intent"');
-    err.code = 'INVALID_INPUT';
-    throw err;
-  }
-  _coordsInScreen(x, y, size, margin) {
-    return x >= margin && y >= margin
-        && x <= size.width  - margin
-        && y <= size.height - margin;
-  }
-  /**
-   * Lazy-build the grounding adapter. The model id comes from
-   * toolConfig so an operator can swap Kimi → some future model
-   * without code changes; transport mode is chosen in _buildAdapter.
-   */
-  async _adapter(effective, context) {
-    if (this._adapterCache) return this._adapterCache;
-    this._adapterCache = this._buildAdapter(effective, context);
-    return this._adapterCache;
-  }
-  _buildAdapter(effective, context) {
-    const modelId = effective.groundingModelId || Defaults.GROUNDING_MODEL;
-    const foundryEndpoint = process.env.FOUNDRY_ENDPOINT;
-    const foundryKey      = process.env.FOUNDRY_KEY;
-    const forceDirect     = process.env.LOXIA_GROUNDING_DIRECT === '1';
-    const haveDirectCreds = !!(foundryEndpoint && foundryKey);
-    // Mode selection. In order of preference:
-    //   1. forceDirect env flag → direct (test / CI)
-    //   2. direct creds set AND no aiService → direct (headless / bench)
-    //   3. otherwise → proxy via the Loxia backend (production path)
-    // A failed proxy request (e.g. 404 because /llm/grounding isn't
-    // deployed yet) automatically retries once in direct mode if creds
-    // exist — `_groundWithFallback()` does the fallback below.
-    if (forceDirect || (!this.aiService && haveDirectCreds)) {
-      if (!haveDirectCreds) {
-        throw Object.assign(new Error(
-          'Direct mode requested but FOUNDRY_ENDPOINT / FOUNDRY_KEY env vars are not set.'
-        ), { code: 'NOT_INITIALISED' });
-      }
-      return createGroundingModel(modelId, {
-        mode:     'direct',
-        endpoint: foundryEndpoint,
-        apiKey:   foundryKey,
-      });
-    }
-    if (!this.aiService) {
-      throw Object.assign(new Error(
-        'DesktopTool has no grounding transport. Either set FOUNDRY_ENDPOINT + ' +
-        'FOUNDRY_KEY env vars (direct mode) OR run a Loxia backend that has ' +
-        'POST /llm/grounding deployed (proxy mode).'
-      ), { code: 'NOT_INITIALISED' });
-    }
-    return createGroundingModel(modelId, {
-      mode:        'proxy',
-      backendUrl:  this.aiService.baseUrl,
-      userApiKey:  this._resolveUserKey(context),
-    });
-  }
-  /**
-   * Wraps the adapter's ground() with one automatic fallback to direct
-   * mode when proxy returns 404 (route not deployed yet) AND direct
-   * credentials are available in env. Saves the user from "deploy the
-   * backend before you can run" friction during the rollout window.
-   */
-  async _groundWithFallback(adapter, request, effective, context) {
-    try {
-      return await adapter.ground(request);
-    } catch (err) {
-      const is404 = /\b404\b/.test(err.message || '') || /\bNot Found\b/i.test(err.message || '');
-      const isProxy = adapter.mode === 'proxy';
-      const haveDirectCreds = !!(process.env.FOUNDRY_ENDPOINT && process.env.FOUNDRY_KEY);
-      if (!is404 || !isProxy || !haveDirectCreds) throw err;
-      this.logger?.warn?.(
-        '[desktop] proxy /llm/grounding returned 404 — falling back to direct Foundry. ' +
-        'Deploy the backend route to get billing + audit back.',
-      );
-      const directAdapter = createGroundingModel(
-        effective.groundingModelId || Defaults.GROUNDING_MODEL,
-        { mode: 'direct', endpoint: process.env.FOUNDRY_ENDPOINT, apiKey: process.env.FOUNDRY_KEY },
-      );
-      // Replace cache so subsequent calls go straight to direct.
-      this._adapterCache = directAdapter;
-      return await directAdapter.ground(request);
-    }
-  }
-  _resolveUserKey(context) {
-    // Pull the per-session Loxia token the same way other tools do.
-    const km = this.aiService?.apiKeyManager;
-    if (km && typeof km.getKeysForRequest === 'function') {
-      const keys = km.getKeysForRequest(context?.sessionId, {
-        platformProvided: context?.platformProvided !== false,
-      });
-      if (keys?.loxiaApiKey) return keys.loxiaApiKey;
-    }
-    return this.aiService?.config?.apiKey || process.env.LOXIA_API_KEY || null;
-  }
-  /**
-   * Lazy-create an OS controller if the constructor didn't get one.
-   * Production path: created on demand. Test path: injected at ctor.
-   */
-  _osc() {
-    if (!this.osController) this.osController = createOSController();
-    return this.osController;
-  }
-  _audit(action, meta) {
-    this.logger?.info?.(`[desktop] ${action}`, { tool: this.id, action, ...meta });
-  }
-  /**
-   * Build a structured failure result. `action` is optional but
-   * strongly preferred — it lets the agent distinguish "click failed"
-   * from "screenshot failed" at a glance when results arrive
-   * out-of-order in a batched message, which is the only way it can
-   * recover the correct mental model of what happened.
-   */
-  _fail(code, message, action = null) {
-    this.logger?.warn?.(`[desktop] ${code}: ${message}`);
-    const prefix = action ? `[action: ${action}] ` : '';
-    return {
-      success: false,
-      ...(action ? { action } : {}),
-      error: message,
-      code,
-      output: `${prefix}Desktop action failed (${code}): ${message}`,
-    };
-  }
-}
-// Re-export OSError so callers can match on it without two imports.
-export { OSError };
-export default DesktopTool;
+/**
+ * DesktopTool — agent-facing desktop-control surface.
+ *
+ * Pairs the OS controller (mouse / keyboard / screenshot / windows) with
+ * a pluggable visual-grounding model (default: Kimi K2.6) so an agent
+ * can say "click the orange Save button" and the tool figures out
+ * `(x, y)` from a fresh screenshot before the click.
+ *
+ * Five guardrails are layered into every execute() call:
+ *
+ *   1. CAPABILITY GATE     - osController refuses ops the OS can't do
+ *                            (e.g. mouseInput on Wayland) and surfaces
+ *                            an operator-friendly remediation message.
+ *
+ *   2. PER-AGENT ALLOWLIST - toolConfig.desktop.allowedActions lets an
+ *                            operator enable any subset of actions per
+ *                            agent (e.g. read-only screenshot bot vs.
+ *                            full control). Default: ALL DISABLED — the
+ *                            agent has to be explicitly granted control.
+ *
+ *   3. KILL SWITCH         - LOXIA_DESKTOP_TOOL_DISABLED=1 env flag
+ *                            short-circuits every call with a clear
+ *                            error. Zero-source kill in production.
+ *
+ *   4. AUDIT TRAIL         - every action emits a structured log line
+ *                            with operation id + grounded coords so
+ *                            screen-recording-free postmortems work.
+ *
+ *   5. INTENT GROUNDING    - "click" / "scroll" / "drag" accept either
+ *                            raw (x, y) OR a natural-language `intent`.
+ *                            With intent, the tool grabs a screenshot,
+ *                            asks the grounding model, validates the
+ *                            returned coords fit the screen, and
+ *                            ONLY THEN moves. Failed grounding never
+ *                            silently clicks somewhere random.
+ *
+ * Model is swappable via toolConfig.groundingModelId (any id registered
+ * in src/services/grounding/registry.js). Backend proxy vs. direct
+ * Foundry is also config — see _buildAdapter below.
+ */
+import { writeFile, mkdir } from 'node:fs/promises';
+import { join } from 'node:path';
+import { tmpdir } from 'node:os';
+import { BaseTool } from '../baseTool.js';
+import { createOSController, OSError } from './osController.js';
+import {
+  createGroundingModel,
+  ModelId,
+  DEFAULT_REASONING_EFFORT,
+} from '../../services/grounding/index.js';
+/** Every action the tool can perform. */
+export const DesktopAction = Object.freeze({
+  SCREENSHOT:     'screenshot',
+  CLICK:          'click',
+  TYPE:           'type',
+  KEY_PRESS:      'key_press',
+  SCROLL:         'scroll',
+  DRAG:           'drag',
+  LIST_WINDOWS:   'list_windows',
+  FOCUS_WINDOW:   'focus_window',
+  DESCRIBE_CAPS:  'describe_capabilities',
+});
+/** Env-level kill switch — checked on every call. */
+const ENV_KILL_FLAG = 'LOXIA_DESKTOP_TOOL_DISABLED';
+/** Defaults — overridable via per-agent toolConfig.desktop or this.config. */
+const Defaults = Object.freeze({
+  /** Default actions an agent gets WHEN allowedActions is unset.
+   *  Empty list = no control. Operator must opt in per agent. */
+  ALLOWED_ACTIONS:        [],
+  /** Visual-grounding model (registry id). */
+  GROUNDING_MODEL:        ModelId.KIMI_K2_6,
+  /** Grounding reasoning effort. */
+  GROUNDING_EFFORT:       DEFAULT_REASONING_EFFORT,
+  /** Hard cap on a single grounded coord lookup. Catches hangs. */
+  GROUNDING_TIMEOUT_MS:   180_000,
+  /** Refuse grounded coords closer than this many px to any screen
+   *  edge (sanity check against bogus grounding outputs). */
+  COORD_MARGIN_PX:        0,
+});
+export class DesktopTool extends BaseTool {
+  constructor(config = {}, logger = null) {
+    super(config, logger);
+    this.id = 'desktop';
+    this.name = 'Desktop Control';
+    this.version = '1.0.0';
+    this.requiresProject = false;
+    this.isAsync = true;
+    // Modest builtin delay so the OS has time to repaint between
+    // an action and the agent's next screenshot.
+    this.builtinDelay = 150;
+    // Injected at registry time. The tool needs:
+    //   - aiService: source of baseUrl + apiKey for proxy-mode grounding
+    //   - osController: defaults to a fresh createOSController() instance
+    //                   but tests inject a stub.
+    this.aiService    = null;
+    this.osController = config.osController || null;
+    this._adapterCache = null;       // lazy — built on first ground()
+  }
+  /** Called by index.js after construction. */
+  setAIService(aiService) {
+    this.aiService = aiService;
+  }
+  /** Static factory for tests / explicit DI. */
+  static withDependencies({ aiService, osController, config = {}, logger = null }) {
+    const tool = new DesktopTool({ ...config, osController }, logger);
+    if (aiService) tool.setAIService(aiService);
+    return tool;
+  }
+  // ─── BaseTool surface ─────────────────────────────────────────────
+  getDescription() {
+    // NOTE: this string lands in the agent's system prompt every turn.
+    // Keep it tight, action-oriented, and free of meta-commentary like
+    // "this is beta" / "we're missing safeguards" — agents read those
+    // disclaimers as instructions to act timidly and the user's UX
+    // suffers. Operator-facing beta indicators live in the web-UI
+    // configurator + tool-selector pills, not here.
+    return `
+Desktop Control Tool: drive the user's keyboard, mouse, screen, and
+windows like a human. Combines OS-level input with visual grounding
+("click the orange Save button") via a vision LLM.
+USAGE:
+  {
+    "toolId": "desktop",
+    "parameters": {
+      "action": "click",
+      "intent": "the orange Save button"
+    }
+  }
+PREFERRED WORKFLOW — use intent-driven actions, NOT raw screenshot:
+For "click X" tasks, do NOT call screenshot first and then try to read
+the image yourself. You can't see screenshot results in plain text. The
+intent-driven actions screenshot + ground + act in one step:
+  { "toolId": "desktop", "parameters": {
+      "action": "click", "intent": "the Netflix icon in the taskbar" }}
+That triggers: screenshot → grounding model finds the coords → click.
+You only need to look at the screenshot yourself if grounding fails
+(rare on legible UIs).
+ACTIONS:
+  - click  (intent OR x,y)      single/double click — use intent first
+  - type   (text)               keyboard input into focused window
+  - key_press (keys[])          chord like ["Control","S"]
+  - scroll (intent OR x,y, dy)  positive dy scrolls down
+  - drag   (fromIntent/toIntent OR from/to coords)
+  - list_windows                titles + bounds of open windows
+  - focus_window (titleMatch)   bring matching window to front
+  - screenshot                  capture screen to disk (returns file
+                                path; you cannot read raw PNG bytes —
+                                use the vision tool on the path if you
+                                must inspect, or prefer intent actions)
+  - describe_capabilities       OS + display server + permission state
+INTENT FAILURE: if grounding can't find the target, you'll get a
+GROUNDING_FAILED or COORDS_OUT_OF_BOUNDS error with the model's raw
+answer. Sharpen the intent ("the red Save button in the toolbar")
+and retry.
+PERMISSION MODEL:
+  - Every action is OFF by default per-agent until the operator adds
+    it to toolConfig.desktop.allowedActions in the agent's config.
+  - LOXIA_DESKTOP_TOOL_DISABLED=1 is a global kill switch (env).
+  - On Linux Wayland, input actions are blocked by the OS; only
+    screenshot + list_windows work. The tool surfaces a clear
+    "Wayland blocks input injection" error for the rest.
+OS NOTES:
+  - macOS: needs Accessibility + Screen Recording permissions granted
+    to the Loxia process. Without them you'll see PERMISSION_DENIED.
+  - First call lazily loads @nut-tree-fork/nut-js (optional dep, ~30MB
+    native binary). If not installed, every action fails with
+    NATIVE_UNAVAILABLE — install it or run on a different machine.
+    `.trim();
+  }
+  parseParameters(content) {
+    try {
+      const trimmed = (content || '').trim();
+      if (trimmed.startsWith('{')) {
+        const parsed = JSON.parse(trimmed);
+        return parsed.parameters || parsed;
+      }
+      // No XML alternate form — desktop actions are too varied to
+      // hand-author. JSON-only keeps the surface honest.
+      throw new Error('desktop tool requires JSON parameters');
+    } catch (err) {
+      throw new Error(`Failed to parse desktop parameters: ${err.message}`, { cause: err });
+    }
+  }
+  getSupportedActions() {
+    return Object.values(DesktopAction);
+  }
+  getRequiredParameters() {
+    return ['action'];
+  }
+  // ─── execute ──────────────────────────────────────────────────────
+  async execute(params, context) {
+    // 1. Kill switch
+    if (process.env[ENV_KILL_FLAG] === '1') {
+      return this._fail('DESKTOP_DISABLED',
+        'Desktop tool disabled via LOXIA_DESKTOP_TOOL_DISABLED.');
+    }
+    const action = params?.action;
+    if (!action || !Object.values(DesktopAction).includes(action)) {
+      return this._fail('INVALID_ACTION',
+        `unknown action "${action}". Valid: ${Object.values(DesktopAction).join(', ')}`);
+    }
+    // 2. Per-agent allowlist
+    const effective = this.getEffectiveConfig(context, Defaults);
+    const allowed = effective.allowedActions || Defaults.ALLOWED_ACTIONS;
+    if (!Array.isArray(allowed) || !allowed.includes(action)) {
+      return this._fail('NOT_PERMITTED',
+        `action "${action}" not in this agent's allowedActions ` +
+        `(set toolConfig.desktop.allowedActions to enable).`);
+    }
+    // 3. Dispatch
+    const osc = this._osc();
+    try {
+      switch (action) {
+        case DesktopAction.SCREENSHOT:    return await this._actScreenshot(osc, params);
+        case DesktopAction.CLICK:         return await this._actClick(osc, effective, params, context);
+        case DesktopAction.TYPE:          return await this._actType(osc, params);
+        case DesktopAction.KEY_PRESS:     return await this._actKeyPress(osc, params);
+        case DesktopAction.SCROLL:        return await this._actScroll(osc, effective, params, context);
+        case DesktopAction.DRAG:          return await this._actDrag(osc, effective, params, context);
+        case DesktopAction.LIST_WINDOWS:  return await this._actListWindows(osc);
+        case DesktopAction.FOCUS_WINDOW:  return await this._actFocusWindow(osc, params);
+        case DesktopAction.DESCRIBE_CAPS: return await this._actDescribeCaps(osc);
+      }
+    } catch (err) {
+      return this._fail(err.code || 'OP_FAILED', err.message, action);
+    }
+  }
+  // ─── actions ──────────────────────────────────────────────────────
+  async _actScreenshot(osc, params) {
+    const png = await osc.screenshot({ region: params.region });
+    const size = await osc.screenSize();
+    // Tag the action in the output. Without this, a screenshot success
+    // result is indistinguishable from a click/scroll/drag SUCCESS when
+    // the agent reads the tool result text — and out-of-order batched
+    // returns can land a stale screenshot success right where a click
+    // success would normally appear. Putting "[action: screenshot]" up
+    // front makes the action explicit so the model can't mistake it for
+    // a click confirmation.
+    // Save to a temp file rather than inlining base64 into the tool
+    // result. A 200 KB PNG becomes ~290 KB of base64 — stuffing that
+    // into the conversation as a text tool-result derails the next
+    // model turn (it sees a wall of characters, not an image, and
+    // loses the original task; streaming often aborts mid-scan).
+    // Returning a path lets vision-capable downstream tools open it
+    // properly, and keeps the conversation token count sane.
+    const filePath = await this._saveScreenshot(png);
+    this._audit('screenshot', { size, bytes: png.length, filePath });
+    return {
+      success: true,
+      action: 'screenshot',
+      output:
+        `[action: screenshot] Captured ${size.width}x${size.height} screenshot ` +
+        `(${png.length} bytes) → ${filePath}\n` +
+        `NOTE: a successful screenshot does NOT mean an earlier click/scroll/drag ` +
+        `succeeded — those have separate results tagged "[action: click]" etc. ` +
+        `Look at the most recent action-tagged result for the action you actually called.\n` +
+        `To inspect the image use the vision tool with this path, or just call ` +
+        `click/scroll/drag with an "intent" (one-step: screenshot + ground + act).`,
+      screenshotPath: filePath,
+      bytes: png.length,
+      size,
+    };
+  }
+  /**
+   * Persist a captured PNG to disk. We use the OS temp dir under a
+   * stable subfolder so old shots are easy to clean up by hand, and
+   * a millisecond-precise filename so concurrent captures don't
+   * collide. Returning the file path (not the bytes) is what keeps
+   * the conversation text-size sane — see _actScreenshot.
+   */
+  async _saveScreenshot(png) {
+    const dir = join(tmpdir(), 'loxia-desktop-screenshots');
+    await mkdir(dir, { recursive: true });
+    const filePath = join(dir, `screenshot-${Date.now()}.png`);
+    await writeFile(filePath, png);
+    return filePath;
+  }
+  async _actClick(osc, effective, params, context) {
+    const { x, y, groundedFrom } = await this._resolveCoords(osc, effective, params, context);
+    await osc.mouseClick(x, y, { button: params.button, count: params.count });
+    this._audit('click', { x, y, button: params.button || 'left', count: params.count || 1, groundedFrom });
+    return {
+      success: true,
+      action: 'click',
+      output: `[action: click] Clicked at (${x}, ${y})${groundedFrom ? ` — grounded from "${groundedFrom}"` : ''}.`,
+      coords: { x, y },
+      groundedFrom,
+    };
+  }
+  async _actType(osc, params) {
+    if (typeof params.text !== 'string' || params.text.length === 0) {
+      return this._fail('INVALID_INPUT', 'type action requires non-empty "text"', 'type');
+    }
+    await osc.typeText(params.text, { delayMs: params.delayMs });
+    this._audit('type', { chars: params.text.length });
+    return {
+      success: true,
+      action: 'type',
+      output: `[action: type] Typed ${params.text.length} characters.`,
+    };
+  }
+  async _actKeyPress(osc, params) {
+    if (!Array.isArray(params.keys) || params.keys.length === 0) {
+      return this._fail('INVALID_INPUT', 'key_press requires non-empty "keys" array', 'key_press');
+    }
+    await osc.keyPress(params.keys);
+    this._audit('key_press', { keys: params.keys });
+    return {
+      success: true,
+      action: 'key_press',
+      output: `[action: key_press] Pressed ${params.keys.join('+')}.`,
+    };
+  }
+  async _actScroll(osc, effective, params, context) {
+    if (!Number.isFinite(params.dy)) {
+      return this._fail('INVALID_INPUT', 'scroll requires numeric "dy"', 'scroll');
+    }
+    const { x, y, groundedFrom } = await this._resolveCoords(osc, effective, params, context);
+    await osc.mouseScroll(x, y, params.dy);
+    this._audit('scroll', { x, y, dy: params.dy, groundedFrom });
+    return {
+      success: true,
+      action: 'scroll',
+      output: `[action: scroll] Scrolled ${params.dy > 0 ? 'down' : 'up'} ${Math.abs(params.dy)} at (${x}, ${y}).`,
+      coords: { x, y },
+      groundedFrom,
+    };
+  }
+  async _actDrag(osc, effective, params, context) {
+    // Drag accepts either two coord pairs OR two intents. The two ends
+    // are grounded independently with the SAME screenshot — avoids two
+    // model calls when both intents reference the same view.
+    const from = await this._resolveCoords(
+      osc, effective,
+      { x: params.from?.x, y: params.from?.y, intent: params.fromIntent },
+      context,
+    );
+    const to = await this._resolveCoords(
+      osc, effective,
+      { x: params.to?.x, y: params.to?.y, intent: params.toIntent },
+      context,
+      from._sharedScreenshot,
+    );
+    await osc.mouseDrag({ x: from.x, y: from.y }, { x: to.x, y: to.y }, { button: params.button });
+    this._audit('drag', { from, to });
+    return {
+      success: true,
+      action: 'drag',
+      output: `[action: drag] Dragged from (${from.x}, ${from.y}) to (${to.x}, ${to.y}).`,
+      from: { x: from.x, y: from.y },
+      to:   { x: to.x,   y: to.y },
+    };
+  }
+  async _actListWindows(osc) {
+    const raw = await osc.listWindows();
+    // Most platforms (Windows especially) report hundreds of OS-internal
+    // handles with empty titles. Surfacing the full list bloats the
+    // conversation context (>200 KB observed on Win11 with ~1300
+    // handles) and the model loses the original task in the noise.
+    // Filter to titled windows and cap to a sensible top-N. Total count
+    // stays in the output so the agent knows truncation happened.
+    const titled = raw.filter(w => w.title && w.title.trim().length > 0);
+    const MAX_LIST = 50;
+    const top = titled.slice(0, MAX_LIST);
+    const truncated = titled.length > MAX_LIST;
+    this._audit('list_windows', { total: raw.length, titled: titled.length, returned: top.length });
+    return {
+      success: true,
+      action: 'list_windows',
+      output:
+        `[action: list_windows] ${titled.length} titled window(s)` +
+        (raw.length !== titled.length ? ` (filtered ${raw.length - titled.length} untitled handles)` : '') +
+        (truncated ? ` — showing first ${MAX_LIST}` : '') + ': ' +
+        top.map(w => `"${w.title}"`).join(', '),
+      windows: top,
+      totalCount: raw.length,
+      titledCount: titled.length,
+      truncated,
+    };
+  }
+  async _actFocusWindow(osc, params) {
+    if (!params.titleMatch) {
+      return this._fail('INVALID_INPUT', 'focus_window requires "titleMatch"', 'focus_window');
+    }
+    const r = await osc.focusWindow({ titleMatch: params.titleMatch });
+    this._audit('focus_window', r);
+    return {
+      success: r.focused,
+      action: 'focus_window',
+      output: r.focused
+        ? `[action: focus_window] Focused window: "${r.title}".`
+        : `[action: focus_window] No window matched "${params.titleMatch}".`,
+      ...r,
+    };
+  }
+  async _actDescribeCaps(osc) {
+    const caps = await osc.describeCapabilities();
+    return {
+      success: true,
+      action: 'describe_capabilities',
+      output: `[action: describe_capabilities] OS: ${caps.os}${caps.display ? ` / ${caps.display}` : ''}; ` +
+              `screenshot=${caps.screenshot} mouseInput=${caps.mouseInput} ` +
+              `keyboardInput=${caps.keyboardInput} windowFocus=${caps.windowFocus}` +
+              (caps.degradedReason ? `\nDegraded: ${caps.degradedReason}` : ''),
+      capabilities: caps,
+    };
+  }
+  // ─── helpers ──────────────────────────────────────────────────────
+  /**
+   * Resolve (x, y) for an action. Three input shapes:
+   *
+   *   1. {x, y}              — used as-is after validation
+   *   2. {intent: '...'}     — ask the grounding model
+   *   3. neither             — INVALID_INPUT
+   *
+   * When grounding fires, the screenshot is returned on the result so
+   * the caller (drag) can reuse it for the second coord without an
+   * extra capture.
+   */
+  async _resolveCoords(osc, effective, params, context, reuseScreenshot = null) {
+    if (Number.isFinite(params.x) && Number.isFinite(params.y)) {
+      return { x: params.x, y: params.y, groundedFrom: null };
+    }
+    if (typeof params.intent === 'string' && params.intent.trim()) {
+      const size = await osc.screenSize();
+      const screenshot = reuseScreenshot || await osc.screenshot();
+      const adapter = await this._adapter(effective, context);
+      const result = await this._groundWithFallback(adapter, {
+        screenshot,
+        intent: params.intent,
+        imageSize: { width: size.width, height: size.height },
+        knobs: { reasoning_effort: effective.groundingEffort || Defaults.GROUNDING_EFFORT },
+        timeoutMs: effective.groundingTimeoutMs || Defaults.GROUNDING_TIMEOUT_MS,
+      }, effective, context);
+      if (!result.coords) {
+        const err = new Error(`grounding produced no coords; model said: "${(result.answer || '').slice(0, 200)}"`);
+        err.code = 'GROUNDING_FAILED';
+        throw err;
+      }
+      const { x, y } = result.coords;
+      if (!this._coordsInScreen(x, y, size, effective.coordMarginPx ?? Defaults.COORD_MARGIN_PX)) {
+        const err = new Error(`grounded coords (${x}, ${y}) lie outside the ${size.width}x${size.height} screen`);
+        err.code = 'COORDS_OUT_OF_BOUNDS';
+        throw err;
+      }
+      return { x, y, groundedFrom: params.intent, _sharedScreenshot: screenshot };
+    }
+    const err = new Error('action requires either (x, y) or "intent"');
+    err.code = 'INVALID_INPUT';
+    throw err;
+  }
+  _coordsInScreen(x, y, size, margin) {
+    return x >= margin && y >= margin
+        && x <= size.width  - margin
+        && y <= size.height - margin;
+  }
+  /**
+   * Lazy-build the grounding adapter. The model id + transport mode
+   * come from toolConfig so an operator can swap Kimi → some future
+   * model without code changes.
+   */
+  async _adapter(effective, context) {
+    if (this._adapterCache) return this._adapterCache;
+    this._adapterCache = this._buildAdapter(effective, context);
+    return this._adapterCache;
+  }
+  _buildAdapter(effective, context) {
+    const modelId = effective.groundingModelId || Defaults.GROUNDING_MODEL;
+    const foundryEndpoint = process.env.FOUNDRY_ENDPOINT;
+    const foundryKey      = process.env.FOUNDRY_KEY;
+    const forceDirect     = process.env.LOXIA_GROUNDING_DIRECT === '1';
+    const haveDirectCreds = !!(foundryEndpoint && foundryKey);
+    // Mode selection. In order of preference:
+    //   1. forceDirect env flag → direct (test / CI)
+    //   2. direct creds set AND no aiService → direct (headless / bench)
+    //   3. otherwise → proxy via the Loxia backend (production path)
+    // A failed proxy request (e.g. 404 because /llm/grounding isn't
+    // deployed yet) automatically retries once in direct mode if creds
+    // exist — `ground()` does the fallback below.
+    if (forceDirect || (!this.aiService && haveDirectCreds)) {
+      if (!haveDirectCreds) {
+        throw Object.assign(new Error(
+          'Direct mode requested but FOUNDRY_ENDPOINT / FOUNDRY_KEY env vars are not set.'
+        ), { code: 'NOT_INITIALISED' });
+      }
+      return createGroundingModel(modelId, {
+        mode:     'direct',
+        endpoint: foundryEndpoint,
+        apiKey:   foundryKey,
+      });
+    }
+    if (!this.aiService) {
+      throw Object.assign(new Error(
+        'DesktopTool has no grounding transport. Either set FOUNDRY_ENDPOINT + ' +
+        'FOUNDRY_KEY env vars (direct mode) OR run a Loxia backend that has ' +
+        'POST /llm/grounding deployed (proxy mode).'
+      ), { code: 'NOT_INITIALISED' });
+    }
+    return createGroundingModel(modelId, {
+      mode:        'proxy',
+      backendUrl:  this.aiService.baseUrl,
+      userApiKey:  this._resolveUserKey(context),
+    });
+  }
+  /**
+   * Wraps the adapter's ground() with one automatic fallback to direct
+   * mode when proxy returns 404 (route not deployed yet) AND direct
+   * credentials are available in env. Saves the user from "deploy the
+   * backend before you can fun" friction during the rollout window.
+   */
+  async _groundWithFallback(adapter, request, effective) {
+    try {
+      return await adapter.ground(request);
+    } catch (err) {
+      const is404 = /\b404\b/.test(err.message || '') || /\bNot Found\b/i.test(err.message || '');
+      const isProxy = adapter.mode === 'proxy';
+      const haveDirectCreds = !!(process.env.FOUNDRY_ENDPOINT && process.env.FOUNDRY_KEY);
+      if (!is404 || !isProxy || !haveDirectCreds) throw err;
+      this.logger?.warn?.(
+        '[desktop] proxy /llm/grounding returned 404 — falling back to direct Foundry. ' +
+        'Deploy the backend route to get billing + audit back.',
+      );
+      const directAdapter = createGroundingModel(
+        effective.groundingModelId || Defaults.GROUNDING_MODEL,
+        { mode: 'direct', endpoint: process.env.FOUNDRY_ENDPOINT, apiKey: process.env.FOUNDRY_KEY },
+      );
+      // Replace cache so subsequent calls go straight to direct.
+      this._adapterCache = directAdapter;
+      return await directAdapter.ground(request);
+    }
+  }
+  _resolveUserKey(context) {
+    // Pull the per-session Loxia token the same way other tools do.
+    const km = this.aiService?.apiKeyManager;
+    if (km && typeof km.getKeysForRequest === 'function') {
+      const keys = km.getKeysForRequest(context?.sessionId, {
+        platformProvided: context?.platformProvided !== false,
+      });
+      if (keys?.loxiaApiKey) return keys.loxiaApiKey;
+    }
+    return this.aiService?.config?.apiKey || process.env.LOXIA_API_KEY || null;
+  }
+  /**
+   * Lazy-create an OS controller if the constructor didn't get one.
+   * Production path: created on demand. Test path: injected at ctor.
+   */
+  _osc() {
+    if (!this.osController) this.osController = createOSController();
+    return this.osController;
+  }
+  _audit(action, meta) {
+    this.logger?.info?.(`[desktop] ${action}`, { tool: this.id, action, ...meta });
+  }
+  /**
+   * Build a structured failure result. `action` is optional but
+   * strongly preferred — it lets the agent distinguish "click failed"
+   * from "screenshot failed" at a glance when results arrive
+   * out-of-order in a batched message, which is the only way it can
+   * recover the correct mental model of what happened.
+   */
+  _fail(code, message, action = null) {
+    this.logger?.warn?.(`[desktop] ${code}: ${message}`);
+    const prefix = action ? `[action: ${action}] ` : '';
+    return {
+      success: false,
+      ...(action ? { action } : {}),
+      error: message,
+      code,
+      output: `${prefix}Desktop action failed (${code}): ${message}`,
+    };
+  }
+}
+// Re-export OSError so callers can match on it without two imports.
+export { OSError };
+export default DesktopTool;