npm - @opengeni/runtime - Versions diffs - 0.2.1 → 0.2.2 - Mend

@opengeni/runtime 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@opengeni/runtime",
-  "version": "0.2.1",
+  "version": "0.2.2",
   "type": "module",
   "main": "./dist/index.js",
   "module": "./dist/index.js",
@@ -28,14 +28,14 @@
     "typecheck": "tsc --noEmit"
   },
   "dependencies": {
-    "@opengeni/agent-proto": "^0.2.0",
-    "@opengeni/config": "^0.2.1",
+    "@opengeni/agent-proto": "^0.2.1",
+    "@opengeni/config": "^0.2.2",
     "@opengeni/contracts": "^0.4.0",
     "@openai/agents": "^0.11.6",
     "@openai/agents-extensions": "^0.11.6",
     "modal": "^0.7.4",
     "openai": "6.36.0",
-    "@opengeni/codex": "^0.2.0"
+    "@opengeni/codex": "^0.2.1"
   },
   "devDependencies": {
     "tsup": "^8.5.0",

package/src/index.ts CHANGED Viewed

@@ -91,7 +91,7 @@ import {
   restoredSandboxSessionStateFromEntry,
   setSelfhostedApplyDiff,
 } from "./sandbox";
-import { computerUse } from "./sandbox-computer";
+import { computerUse, type ComputerToolMode } from "./sandbox-computer";
 // P4.3 computer-use surface (the agent's :0 driver). Re-exported from the barrel
 // so callers (the worker, live proofs) reach SandboxComputer/ComputerUseCapability
@@ -106,6 +106,7 @@ export {
   ComputerActionError,
   type SandboxComputerOptions,
   type ComputerUseArgs,
+  type ComputerToolMode,
 } from "./sandbox-computer";
 // The agent-loop-free sandbox leaf (createSandboxClient + resume/recovery
@@ -643,6 +644,12 @@ export type BuildAgentOptions = {
   encryptedReasoning?: boolean;
   contextWindowTokens?: number;
   structuredToolTransport?: boolean;
+  // EXPLICIT computer-use tool transport, decided where provider identity is
+  // authoritative (the worker's model resolution — agent-turn.ts). Threaded into
+  // buildAgentCapabilities → computerUse({toolMode}) so tool selection never rests
+  // on the SDK's constructor-name sniff. When omitted, the legacy sniff +
+  // `structuredToolTransport` neutralize path is preserved byte-for-byte.
+  computerToolMode?: ComputerToolMode;
   // The LIVE, by-reference connector-namespace Set from prepareAgentTools
   // (codexConnectorNamespaces): fills during each turn's codex_apps tools/list,
   // read per model call by the codex tool_search description so the model sees
@@ -864,6 +871,7 @@ export function buildOpenGeniAgent(settings: Settings, resources: ResourceRef[],
       compactionMode,
       contextWindowTokens,
       ...(options.structuredToolTransport !== undefined ? { structuredToolTransport: options.structuredToolTransport } : {}),
+      ...(options.computerToolMode !== undefined ? { computerToolMode: options.computerToolMode } : {}),
     }),
   });
   agentFileDownloads.set(agent, normalizeSandboxFileDownloads(options.fileResourceDownloads ?? []).filter((download) => !download.content));
@@ -961,7 +969,16 @@ function neutralizeStructuredToolTransport(capability: ReturnType<typeof filesys
 export function buildAgentCapabilities(
   settings: Settings,
   packSkills: PackSkill[],
-  options: { compactionMode?: ContextCompactionMode; contextWindowTokens?: number; structuredToolTransport?: boolean } = {},
+  options: {
+    compactionMode?: ContextCompactionMode;
+    contextWindowTokens?: number;
+    structuredToolTransport?: boolean;
+    // EXPLICIT computer-use transport (see BuildAgentOptions.computerToolMode). When
+    // present, computerUse() is handed the mode directly and its tools() obeys it
+    // without the constructor-name sniff. When absent, the legacy neutralize +
+    // imageFunctionResults path (driven by structuredToolTransport) is unchanged.
+    computerToolMode?: ComputerToolMode;
+  } = {},
 ): ReturnType<typeof Capabilities.default> {
   const mode = options.compactionMode ?? resolveContextCompactionMode(settings);
   const contextWindowTokens = options.contextWindowTokens ?? settings.contextWindowTokens;
@@ -996,25 +1013,37 @@ export function buildAgentCapabilities(
     && settings.sandboxDesktopEnabled
     && desktopCapableBackend(settings.sandboxBackend)
   ) {
-    // computer-use is now transport-aware, exactly like filesystem: its `tools()`
-    // emits the HOSTED `computer_use_preview` tool on the structured transport and a
-    // set of FUNCTION `computer_*` tools on the text transport. The ChatGPT/Codex
-    // backend rejects hosted tool types (only function/custom/web_search accepted),
-    // so on the codex path (structuredToolTransport === false) we neutralize the
-    // capability's model binding — the SAME trick used for filesystem above — so
-    // `tools()` sees no model instance and emits the function tools the backend can
-    // call, instead of suppressing the desktop tier entirely.
+    // computer-use is transport-aware, exactly like filesystem: `tools()` emits the
+    // HOSTED `computer_use_preview` tool on the structured transport and a set of
+    // FUNCTION `computer_*` tools on the text transport. The ChatGPT/Codex backend
+    // rejects hosted tool types (only function/custom/web_search accepted).
+    //
+    // HARDENING: when the caller declares an EXPLICIT `computerToolMode` (the worker
+    // does, from its authoritative model resolution), thread it straight through —
+    // tool selection then never depends on the SDK's model-instance constructor-name
+    // sniff (which a wrapped/proxied model would defeat, silently 400ing a
+    // chat-completions provider handed the hosted tool). When ABSENT, the legacy path
+    // is preserved byte-for-byte: on the codex path (structuredToolTransport === false)
+    // we set imageFunctionResults and neutralize the capability's model binding — the
+    // SAME trick used for filesystem above — so `tools()` sees no model instance and
+    // emits the function tools the backend can call, instead of suppressing the tier.
+    const explicitMode = options.computerToolMode;
     const computerCapability = computerUse({
       dimensions: [settings.streamResolutionWidth, settings.streamResolutionHeight],
       readOnly: settings.computerUseReadOnly,
-      // On the codex path the function tools deliver screenshots as a real image the
-      // model can see. The ChatGPT/Codex backend rejects HOSTED tool types but DOES
-      // accept `input_image` content items inside a `function_call_output` (proven by
-      // openai/codex codex-rs, whose view_image tool ships exactly that shape) — so a
-      // structured image tool result is seen, where a text data-URL would be unreadable.
-      ...(options.structuredToolTransport === false ? { imageFunctionResults: true } : {}),
+      ...(explicitMode
+        ? { toolMode: explicitMode }
+        // Legacy (no explicit mode): on the codex path the function tools deliver
+        // screenshots as a real image the model can see. The ChatGPT/Codex backend
+        // rejects HOSTED tool types but DOES accept `input_image` content items inside a
+        // `function_call_output` (proven by openai/codex codex-rs, whose view_image tool
+        // ships exactly that shape) — so a structured image tool result is seen, where a
+        // text data-URL would be unreadable.
+        : options.structuredToolTransport === false ? { imageFunctionResults: true } : {}),
     });
-    if (options.structuredToolTransport === false) {
+    // Neutralize ONLY on the legacy sniff path. With an explicit toolMode, tools()
+    // obeys the mode directly (hosted or function) and never consults the
+    // constructor-name sniff, so overriding the model binding is moot.
+    if (!explicitMode && options.structuredToolTransport === false) {
       neutralizeStructuredToolTransport(computerCapability);
     }
     caps.push(computerCapability as unknown as ReturnType<typeof Capabilities.default>[number]);
@@ -1088,19 +1117,19 @@ export async function prepareAgentTools(settings: Settings, tools: ToolRef[], op
     //    device-code login may lack the connector scopes, and the backend can
     //    reject the bearer at the initialize/tools-list handshake, so a 401/403
     //    (or a missing/failed token) drops the server.
-    //  - an AUTO-ATTACHED workspace-default capability MCP (ToolRef.optional):
-    //    the caller never explicitly requested it, so a broken/expired
-    //    capability credential must SKIP the server with a warning, never kill
-    //    the turn before the model runs. An EXPLICITLY-requested tool omits
-    //    `optional` and stays strict (below), preserving the fail-loud contract.
+    //  - an optional ToolRef: either an auto-attached workspace-default
+    //    capability MCP or a client/pack-selected portable ref. A
+    //    broken/expired credential or unavailable endpoint skips the server
+    //    with a warning, never killing the turn before the model runs. Bare
+    //    refs stay strict (below), preserving the fail-loud default.
     const optional = tool.optional === true;
     return { server, bestEffort: isCodexAppsMcpServer(config) || optional, optional };
   }));
   const requiredServers = servers.filter((entry) => !entry.bestEffort).map((entry) => entry.server);
   const bestEffortServers = servers.filter((entry) => entry.bestEffort).map((entry) => entry.server);
-  // Names of the OPTIONAL capability servers (not codex_apps) so a drop is
-  // surfaced as a warning; codex_apps keeps its historically-quiet drop (a
-  // not-logged-in ChatGPT plan is a normal, non-noteworthy state).
+  // Names of the OPTIONAL servers (not codex_apps) so a drop is surfaced as a
+  // warning; codex_apps keeps its historically-quiet drop (a not-logged-in
+  // ChatGPT plan is a normal, non-noteworthy state).
   const optionalServerNames = new Set(
     servers.filter((entry) => entry.optional).map((entry) => entry.server.name),
   );
@@ -1121,7 +1150,7 @@ export async function prepareAgentTools(settings: Settings, tools: ToolRef[], op
       }
       const error = connectedBestEffort.errors.get(failed);
       console.warn(
-        `[mcp] optional capability server "${failed.name}" failed to connect/list tools; skipping it for this turn`,
+        `[mcp] optional server "${failed.name}" failed to connect/list tools; skipping it for this turn`,
         error instanceof Error ? error.message : error,
       );
     }

package/src/sandbox-computer.ts CHANGED Viewed

@@ -761,6 +761,25 @@ export function computerFunctionTools(
 // ── The capability (the SDK seam) ────────────────────────────────────────────
+/**
+ * EXPLICIT tool-transport selection, decided by the caller that knows the
+ * provider's true wire identity (the worker's model resolution — see agent-turn.ts),
+ * NOT inferred from the bound model instance's constructor name. This is the
+ * HARDENING seam: `supportsStructuredToolOutputTransport` string-sniffs the
+ * constructor for "ChatCompletions", which a wrapped / proxied / minified model
+ * instance would defeat — silently handing a chat-completions provider the HOSTED
+ * `computer_use_preview` tool it 400s on every turn. When `toolMode` is set, tools()
+ * OBEYS it and never consults the sniff:
+ *   • "hosted"         → the single hosted `computer_use_preview` tool (Responses backends).
+ *   • "function-image" → the FUNCTION `computer_*` tools with screenshots delivered as a
+ *                        structured `{type:'image'}` output (the codex/ChatGPT backend,
+ *                        which rejects hosted tool types but SEES structured image results).
+ *   • "function-text"  → the FUNCTION tools with screenshots rendered as a text
+ *                        `data:…;base64` URL (chat-completions providers, which can't read
+ *                        structured image tool results).
+ */
+export type ComputerToolMode = "hosted" | "function-image" | "function-text";
 export type ComputerUseArgs = {
   dimensions?: [number, number];
   readOnly?: boolean;
@@ -771,8 +790,14 @@ export type ComputerUseArgs = {
   // `input_image` content item inside the function_call_output) instead of the text
   // data-URL string. Only the codex/ChatGPT backend can read structured image tool
   // results; chat-completions providers cannot, so this stays OFF (text rendering)
-  // by default and is turned on only on the codex path (see index.ts).
+  // by default and is turned on only on the codex path (see index.ts). Ignored when
+  // `toolMode` is set (the mode carries its own image-delivery choice).
   imageFunctionResults?: boolean;
+  // EXPLICIT transport selection (see {@link ComputerToolMode}). When present, tools()
+  // obeys it directly — the constructor-name sniff is NOT consulted. When ABSENT, the
+  // legacy sniff behaviour is preserved byte-for-byte (back-compat for any embedder
+  // that constructs the capability without threading a mode).
+  toolMode?: ComputerToolMode;
 };
 export function computerUse(args: ComputerUseArgs = {}): ComputerUseCapability {
@@ -820,16 +845,36 @@ export class ComputerUseCapability extends Capability {
           // The SDK base exposes the bound runAs as a protected field.
           ...(typeof this._runAs === "string" ? { runAs: this._runAs } : {}),
         });
-    // Structured transport keeps the HOSTED computer tool (unchanged); the codex /
-    // text backend gets the FUNCTION tools it can actually call.
+    // HARDENING: when the caller declares an EXPLICIT toolMode, obey it and NEVER
+    // consult `supportsStructuredToolOutputTransport` — tool selection must not
+    // depend on the model instance's constructor name (a wrapped/proxied/minified
+    // instance would defeat the "ChatCompletions" string-sniff and silently hand a
+    // chat-completions provider the hosted tool it 400s on). The mode is decided by
+    // the worker, where provider identity is authoritative (see agent-turn.ts).
+    switch (this.args.toolMode) {
+      case "hosted":
+        return [this.hostedComputerTool(computer)];
+      case "function-image":
+        return computerFunctionTools(computer, this.args.readOnly ?? false, this.args.needsApproval, true);
+      case "function-text":
+        return computerFunctionTools(computer, this.args.readOnly ?? false, this.args.needsApproval, false);
+      case undefined:
+        break; // fall through to the legacy sniff (back-compat), preserved byte-for-byte
+    }
+    // Legacy (no toolMode): structured transport keeps the HOSTED computer tool
+    // (unchanged); the codex / text backend gets the FUNCTION tools it can call.
     if (supportsStructuredToolOutputTransport(this._modelInstance)) {
-      return [
-        computerTool({
-          computer,
-          ...(this.args.needsApproval !== undefined ? { needsApproval: this.args.needsApproval as never } : {}),
-        }) as unknown as Tool<unknown>,
-      ];
+      return [this.hostedComputerTool(computer)];
     }
     return computerFunctionTools(computer, this.args.readOnly ?? false, this.args.needsApproval, this.args.imageFunctionResults ?? false);
   }
+  /** The single HOSTED `computer_use_preview` tool bound to `computer` — identical
+   *  construction for the explicit "hosted" mode and the legacy structured-sniff path. */
+  private hostedComputerTool(computer: Computer): Tool<unknown> {
+    return computerTool({
+      computer,
+      ...(this.args.needsApproval !== undefined ? { needsApproval: this.args.needsApproval as never } : {}),
+    }) as unknown as Tool<unknown>;
+  }
 }