@opengeni/runtime 0.2.3 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -63,6 +63,8 @@ import {
63
63
  isWorkspaceEscapeError,
64
64
  makeActiveBackendResolver,
65
65
  mintStreamToken,
66
+ modalSandboxAttributionEnvironment,
67
+ modalSandboxAttributionTags,
66
68
  negotiateCapabilities,
67
69
  negotiateSelfhostedCapabilities,
68
70
  offlineAgentError,
@@ -85,12 +87,15 @@ import {
85
87
  stopRecording,
86
88
  stripExecBanner,
87
89
  subjectFor,
90
+ sweepModalOrphanSandboxes,
91
+ tagModalSandbox,
88
92
  tearDownDisplayStack,
89
93
  tearDownTerminalServer,
94
+ terminateModalSandboxById,
90
95
  timeoutAgentError,
91
96
  timeoutControlResponse,
92
97
  verifyStreamToken
93
- } from "../chunk-KNW7AMQB.js";
98
+ } from "../chunk-HGQ252FL.js";
94
99
  export {
95
100
  ActiveBackendUnresolvableError,
96
101
  CAPABILITY_DESCRIPTORS,
@@ -156,6 +161,8 @@ export {
156
161
  isWorkspaceEscapeError,
157
162
  makeActiveBackendResolver,
158
163
  mintStreamToken,
164
+ modalSandboxAttributionEnvironment,
165
+ modalSandboxAttributionTags,
159
166
  negotiateCapabilities,
160
167
  negotiateSelfhostedCapabilities,
161
168
  offlineAgentError,
@@ -178,8 +185,11 @@ export {
178
185
  stopRecording,
179
186
  stripExecBanner,
180
187
  subjectFor,
188
+ sweepModalOrphanSandboxes,
189
+ tagModalSandbox,
181
190
  tearDownDisplayStack,
182
191
  tearDownTerminalServer,
192
+ terminateModalSandboxById,
183
193
  timeoutAgentError,
184
194
  timeoutControlResponse,
185
195
  verifyStreamToken
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@opengeni/runtime",
3
- "version": "0.2.3",
3
+ "version": "0.3.1",
4
4
  "type": "module",
5
5
  "main": "./dist/index.js",
6
6
  "module": "./dist/index.js",
@@ -29,8 +29,8 @@
29
29
  },
30
30
  "dependencies": {
31
31
  "@opengeni/agent-proto": "^0.2.1",
32
- "@opengeni/config": "^0.2.3",
33
- "@opengeni/contracts": "^0.5.0",
32
+ "@opengeni/config": "^0.2.5",
33
+ "@opengeni/contracts": "^0.7.0",
34
34
  "@openai/agents": "^0.11.6",
35
35
  "@openai/agents-extensions": "^0.11.6",
36
36
  "modal": "^0.7.4",
@@ -27,6 +27,8 @@
27
27
  * filtered, keeping the persisted audit trail intact.
28
28
  */
29
29
 
30
+ import { SCREENSHOT_FAILURE_CARD_IMAGE_URL } from "./screenshot-error-card";
31
+
30
32
  /** A history item is any JSON object; we only inspect a few discriminator fields. */
31
33
  export type HistoryItem = Record<string, unknown>;
32
34
 
@@ -594,42 +596,35 @@ export function rewriteComputerCallsToActionsOnly(body: unknown): boolean {
594
596
  }
595
597
 
596
598
  /**
597
- * The 1×1 transparent PNG placeholder used by the SDK for tool-approval-rejection
598
- * screenshots (`TOOL_APPROVAL_REJECTION_SCREENSHOT_DATA_URL` in agents-core
599
- * `toolExecution.mjs`). We reuse the exact same constant as a backstop for the
600
- * action-timeout 400: when an action times out the SDK's catch sets output='' and
601
- * builds `{type:"computer_call_output",output:{type:"computer_screenshot",image_url:""}}`.
602
- * Azure rejects `image_url:""` with "400 Invalid input[N].output.image_url". This
603
- * placeholder is a valid data URI the provider accepts, so the turn continues and
604
- * the model receives the next real screenshot on its following step.
605
- */
606
- const EMPTY_IMAGE_URL_PLACEHOLDER =
607
- "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR4nGP4z8DwHwAFAAH/iZk9HQAAAABJRU5ErkJggg==";
608
-
609
- /**
610
- * Backstop for the action-timeout 400: walk the `input` array of a serialized
611
- * Responses request body and replace any `computer_call_output` item whose
612
- * `output.image_url` is an empty string, null, undefined, or otherwise not a
613
- * non-empty string with the 1×1 transparent PNG placeholder data URI.
614
- *
615
- * WHY THIS IS NEEDED. When a computer ACTION (click/type/scroll/drag) times out
616
- * at the 15-second yield window `SandboxComputer.x()` throws `ComputerActionError`.
617
- * The agents-core SDK `toolExecution.mjs` catch block sets `output = ''` and then
618
- * builds the wire item:
619
- *
620
- * `{type:"computer_call_output", output:{type:"computer_screenshot", image_url:""}}`
621
- *
622
- * Azure rejects the whole request with:
623
- *
624
- * `400 Invalid 'input[N].output.image_url'. Expected a valid URL, but got a
625
- * value with an invalid format.`
626
- *
627
- * Our screenshot() fail-loud guard (which throws on empty frames) only runs when
628
- * the SDK calls screenshot() on a SUCCESS path — not on this action-error catch
629
- * path that sets output='' directly. This wire-level rewrite is the only seam that
630
- * catches both paths regardless of how the empty image_url was produced. It runs
631
- * in the same `computerCallNormalizingFetch` wrapper, so a single parse/rewrite
632
- * pass covers both the action/actions-only rewrite and this placeholder injection.
599
+ * Backstop for the empty `computer_call_output` image_url: walk the `input` array of
600
+ * a serialized Responses request body and replace any `computer_call_output` item
601
+ * whose `output.image_url` is empty/missing with a LEGIBLE "screen capture failed"
602
+ * error card ({@link SCREENSHOT_FAILURE_CARD_IMAGE_URL}).
603
+ *
604
+ * WHY A CARD, NOT A BLANK. An empty `image_url` reaches this seam ONLY when the
605
+ * computer op genuinely FAILED to produce a screen: agents-core's `toolExecution.mjs`
606
+ * catch sets `output = ''` when the action OR the follow-up `computer.screenshot()`
607
+ * throws, building `{type:"computer_call_output",output:{type:"computer_screenshot",
608
+ * image_url:""}}`. Azure then rejects the whole request with
609
+ * `400 Invalid 'input[N].output.image_url'`. The previous fix substituted a 1×1
610
+ * TRANSPARENT placeholder to dodge the 400 — but that reaches the model as a
611
+ * plausible BLANK DESKTOP it confidently reports ("the screen appears blank/empty"),
612
+ * turning a hard capture FAILURE into a silent, wrong observation. That is the worst
613
+ * failure mode for computer use, and it is exactly what the 0.1.3 TCC-denied incident
614
+ * produced. Substituting a legible error card instead makes the failure REACH THE
615
+ * MODEL as an error (the only channel the hosted `computer_use_preview` protocol has
616
+ * is the image), so the model stops and tells the user rather than hallucinating.
617
+ *
618
+ * WHY THIS IS SAFE (empty = failure, never an intentional blank). Post-af289e3 the
619
+ * intentional-blank cases carry a NON-empty data URI already: agents-core's
620
+ * tool-approval-rejection screenshot is its own non-empty 1×1 placeholder, and the
621
+ * SandboxComputer action-timeout now warn+returns to a REAL screenshot rather than an
622
+ * empty output. So an EMPTY image_url at this seam is unambiguously a capture/interact
623
+ * FAILURE — the error card is the correct substitution for every empty case, and this
624
+ * function never touches a non-empty (real screenshot OR intentional blank) output.
625
+ *
626
+ * The failure REASON (permission denied / null image / timeout / display down) is not
627
+ * on the card; it is logged worker-side by `NativeDesktopComputer.screenshot()`.
633
628
  *
634
629
  * Mutates `body` in place (the caller has already JSON.parsed a private copy).
635
630
  * Returns `true` iff at least one image_url was replaced.
@@ -657,9 +652,11 @@ export function rewriteEmptyComputerCallOutputImageUrls(body: unknown): boolean
657
652
  }
658
653
  const out = output as Record<string, unknown>;
659
654
  const imageUrl = out.image_url;
660
- // Replace the image_url when it is not a non-empty string (covers: "", null, undefined, missing).
655
+ // Replace the image_url when it is not a non-empty string (covers: "", null,
656
+ // undefined, missing) — an empty output is always a genuine capture failure, so
657
+ // it becomes the legible error card, never a silent blank.
661
658
  if (typeof imageUrl !== "string" || imageUrl.length === 0) {
662
- out.image_url = EMPTY_IMAGE_URL_PLACEHOLDER;
659
+ out.image_url = SCREENSHOT_FAILURE_CARD_IMAGE_URL;
663
660
  changed = true;
664
661
  }
665
662
  }
package/src/index.ts CHANGED
@@ -27,6 +27,7 @@ import {
27
27
  setDefaultOpenAIClient,
28
28
  setDefaultOpenAIKey,
29
29
  setOpenAIResponsesTransport,
30
+ setTracingDisabled,
30
31
  // Hosted web_search tool factory. Re-exported from @openai/agents-openai via
31
32
  // `export * from '@openai/agents-openai'` in @openai/agents' index (0.11.6);
32
33
  // it returns a { type: 'hosted_tool', providerData: { type: 'web_search' } }
@@ -101,6 +102,9 @@ import {
101
102
  setSelfhostedApplyDiff,
102
103
  } from "./sandbox";
103
104
  import { computerUse, type ComputerToolMode } from "./sandbox-computer";
105
+ import type { RuntimeMetricsHooks } from "./metrics";
106
+
107
+ export type { RuntimeMetricsHooks } from "./metrics";
104
108
 
105
109
  // P4.3 computer-use surface (the agent's :0 driver). Re-exported from the barrel
106
110
  // so callers (the worker, live proofs) reach SandboxComputer/ComputerUseCapability
@@ -253,6 +257,12 @@ export type SandboxFileDownload = {
253
257
  sizeBytes?: number;
254
258
  };
255
259
 
260
+ let runtimeMetricsHooks: RuntimeMetricsHooks | null = null;
261
+
262
+ export function configureRuntimeMetricsHooks(hooks: RuntimeMetricsHooks | null | undefined): void {
263
+ runtimeMetricsHooks = hooks ?? null;
264
+ }
265
+
256
266
  export type OpenGeniRuntime = {
257
267
  configure: (settings: Settings) => void;
258
268
  // Multi-provider per-turn model routing. Returns the resolved provider, its
@@ -270,11 +280,15 @@ export type OpenGeniRuntime = {
270
280
  export type ProductionRuntimeOverrides = {
271
281
  model?: Model;
272
282
  sandboxClient?: unknown;
283
+ metrics?: RuntimeMetricsHooks;
273
284
  };
274
285
 
275
286
  export function createProductionAgentRuntime(overrides: ProductionRuntimeOverrides = {}): OpenGeniRuntime {
276
287
  return {
277
- configure: configureOpenAI,
288
+ configure: (settings) => {
289
+ configureRuntimeMetricsHooks(overrides.metrics);
290
+ configureOpenAI(settings);
291
+ },
278
292
  // A test/override model shadows the registry routing entirely (the scripted
279
293
  // model used in worker tests is not in any provider's allow-list), so when
280
294
  // one is supplied resolveTurnModel reports "no resolution" and the caller
@@ -301,7 +315,7 @@ export function createProductionAgentRuntime(overrides: ProductionRuntimeOverrid
301
315
  * the OpenAI-platform path has only a key (the SDK default client is used via
302
316
  * setDefaultOpenAIKey there); the caller then constructs a key-only client.
303
317
  */
304
- export function buildOpenAIClientFromSettings(settings: Settings): OpenAI {
318
+ export function buildOpenAIClientFromSettings(settings: Settings, providerId: string = settings.openaiProvider): OpenAI {
305
319
  if (settings.openaiProvider === "azure") {
306
320
  const baseURL = settings.azureOpenaiBaseUrl ?? azureDeploymentBaseUrl(settings);
307
321
  const apiKey = settings.azureOpenaiApiKey ?? settings.azureOpenaiAdToken ?? "azure-ad-token";
@@ -318,13 +332,14 @@ export function buildOpenAIClientFromSettings(settings: Settings): OpenAI {
318
332
  // seam — below the SDK responses converter, which always re-synthesizes BOTH
319
333
  // `action` and `actions` (rejected 400 "exactly one of action or actions").
320
334
  // See computerCallNormalizingFetch / rewriteComputerCallsToActionsOnly.
321
- fetch: computerCallNormalizingFetch(globalThis.fetch),
335
+ fetch: computerCallNormalizingFetch(instrumentedModelFetch(providerId, globalThis.fetch)),
322
336
  });
323
337
  }
324
338
  return new OpenAI({
325
339
  apiKey: settings.openaiApiKey ?? process.env.OPENAI_API_KEY,
326
340
  ...(settings.openaiBaseUrl ? { baseURL: settings.openaiBaseUrl } : {}),
327
341
  maxRetries: settings.openaiMaxRetries,
342
+ fetch: instrumentedModelFetch(providerId, globalThis.fetch),
328
343
  });
329
344
  }
330
345
 
@@ -346,7 +361,7 @@ export function buildProviderClient(provider: ResolvedModelProvider, settings: S
346
361
  return cached;
347
362
  }
348
363
  const client = provider.builtin
349
- ? buildOpenAIClientFromSettings(settings)
364
+ ? buildOpenAIClientFromSettings(settings, provider.id)
350
365
  : provider.kind === "codex-subscription"
351
366
  // Codex subscription: the static apiKey is a placeholder — the real per-request
352
367
  // bearer + ChatGPT-Account-ID, the /responses->/codex/responses rewrite, and the
@@ -358,7 +373,7 @@ export function buildProviderClient(provider: ResolvedModelProvider, settings: S
358
373
  apiKey: provider.apiKey ?? "codex-subscription",
359
374
  ...(provider.baseUrl ? { baseURL: provider.baseUrl } : {}),
360
375
  maxRetries: settings.openaiMaxRetries,
361
- fetch: codexSubscriptionFetch(globalThis.fetch),
376
+ fetch: codexSubscriptionFetch(instrumentedModelFetch(provider.id, globalThis.fetch)),
362
377
  })
363
378
  // ResolvedModelProvider.apiKey is already the resolved key (configuredProviders
364
379
  // ran resolveProviderApiKey at config time, collapsing apiKey/apiKeyEnv), so it
@@ -369,6 +384,7 @@ export function buildProviderClient(provider: ResolvedModelProvider, settings: S
369
384
  maxRetries: settings.openaiMaxRetries,
370
385
  ...(provider.defaultQuery ? { defaultQuery: provider.defaultQuery } : {}),
371
386
  ...(provider.defaultHeaders ? { defaultHeaders: provider.defaultHeaders } : {}),
387
+ fetch: instrumentedModelFetch(provider.id, globalThis.fetch),
372
388
  });
373
389
  providerClientCache.set(provider.id, client);
374
390
  return client;
@@ -441,7 +457,7 @@ export class MultiProviderModelProvider implements ModelProvider {
441
457
 
442
458
  async getModel(modelName?: string): Promise<Model> {
443
459
  if (modelName) {
444
- const resolved = resolveTurnModel(this.settings, modelName);
460
+ const resolved = resolveTurnModel(settingsForRunScopedModelResolution(this.settings, modelName), modelName);
445
461
  if (resolved) {
446
462
  // Fail-loud floor (defense in depth): a `codex/<slug>` id must only ever
447
463
  // resolve through the synthetic codex-subscription provider (which installs
@@ -479,6 +495,27 @@ export class MultiProviderModelProvider implements ModelProvider {
479
495
  }
480
496
  }
481
497
 
498
+ function settingsForRunScopedModelResolution(settings: Settings, modelName: string): Settings {
499
+ if (modelName !== settings.openaiModel) {
500
+ return settings;
501
+ }
502
+ const builtinAllowed = splitOpenaiAllowedModels(settings.openaiAllowedModels);
503
+ const fallbackBuiltin = builtinAllowed.find((id) => id !== modelName);
504
+ if (!fallbackBuiltin) {
505
+ return settings;
506
+ }
507
+ // The worker sets runSettings.openaiModel to the turn's model. For namespaced
508
+ // registry ids configuredModels filters the built-in entry out, but a unique
509
+ // bare registry id would otherwise be claimed by the built-in only because of
510
+ // that per-turn override. Resolve the run-scoped router against the deployment
511
+ // allow-list head instead; real built-in models stay in the allow-list.
512
+ return builtinAllowed.includes(modelName) ? settings : { ...settings, openaiModel: fallbackBuiltin };
513
+ }
514
+
515
+ function splitOpenaiAllowedModels(value: string): string[] {
516
+ return value.split(",").map((item) => item.trim()).filter(Boolean);
517
+ }
518
+
482
519
  /**
483
520
  * A `codex/<slug>` turn reached the model router but the workspace has no active
484
521
  * Codex subscription connected (the worker overlay never injected the synthetic
@@ -500,6 +537,7 @@ export class CodexSubscriptionUnavailableError extends Error {
500
537
 
501
538
  export function configureOpenAI(settings: Settings): void {
502
539
  setOpenAIResponsesTransport(settings.openaiResponsesTransport);
540
+ setTracingDisabled(settings.disableOpenaiTracing || !settings.observabilityOtlpEndpoint);
503
541
  // Install the registry-aware router as the process default model provider so a
504
542
  // model name re-resolved on the SandboxAgent/Modal path (where a Model instance
505
543
  // does not survive) routes to its provider instead of the built-in client.
@@ -519,6 +557,51 @@ export function configureOpenAI(settings: Settings): void {
519
557
  setDefaultModelProvider(router);
520
558
  }
521
559
 
560
+ function instrumentedModelFetch(provider: string, inner: typeof fetch): typeof fetch {
561
+ return (async (input: Parameters<typeof fetch>[0], init?: Parameters<typeof fetch>[1]) => {
562
+ if (!isModelCallFetch(input)) {
563
+ return await inner(input, init);
564
+ }
565
+ const started = performance.now();
566
+ try {
567
+ const response = await inner(input, init);
568
+ recordModelCallMetric(provider, response.ok ? "completed" : "failed", started);
569
+ return response;
570
+ } catch (error) {
571
+ recordModelCallMetric(provider, "failed", started);
572
+ throw error;
573
+ }
574
+ }) as typeof fetch;
575
+ }
576
+
577
+ function isModelCallFetch(input: Parameters<typeof fetch>[0]): boolean {
578
+ const rawUrl = typeof input === "string"
579
+ ? input
580
+ : input instanceof URL
581
+ ? input.toString()
582
+ : (input as { url?: unknown }).url;
583
+ if (typeof rawUrl !== "string" || rawUrl.length === 0) {
584
+ return false;
585
+ }
586
+ try {
587
+ const pathname = new URL(rawUrl, "http://opengeni.local").pathname;
588
+ return pathname.endsWith("/responses")
589
+ || pathname.endsWith("/chat/completions")
590
+ || pathname.endsWith("/codex/responses");
591
+ } catch {
592
+ return /\/(?:codex\/)?responses(?:\?|$)|\/chat\/completions(?:\?|$)/.test(rawUrl);
593
+ }
594
+ }
595
+
596
+ function recordModelCallMetric(provider: string, outcome: "completed" | "failed", started: number): void {
597
+ const durationSeconds = Math.max(0, (performance.now() - started) / 1000);
598
+ try {
599
+ runtimeMetricsHooks?.onModelCall?.({ provider, outcome, durationSeconds });
600
+ } catch {
601
+ // Metrics emission must never affect a model call.
602
+ }
603
+ }
604
+
522
605
  /**
523
606
  * Run the compaction summarizer as one plain, tool-less, non-streaming model
524
607
  * call against the resolved provider. `input` is the active history plus
@@ -711,6 +794,14 @@ export type BuildAgentOptions = {
711
794
  // restyle the persona but never drop the goal-loop contract or environment
712
795
  // block.
713
796
  instructionsTemplate?: string;
797
+ // Per-SESSION persona/system instructions (the per-agent-type prompt lever an
798
+ // embedding host supplies at session create). Composed AFTER the workspace
799
+ // instructionsTemplate + the non-bypassable CORE, so it refines the workspace
800
+ // persona for this one session without dropping the goal-loop/environment
801
+ // contract. Rides the SAME instructions channel (system-level) — NEVER a user/
802
+ // timeline message. Omitted ⇒ the composed instructions are byte-identical to
803
+ // a workspace-only persona.
804
+ sessionInstructions?: string;
714
805
  // Skills delivered by enabled capability packs. They join the bundled
715
806
  // skills in the sandbox skill index (mounted under .agents/) so
716
807
  // skills/<name> references resolve like any other indexed skill.
@@ -793,6 +884,27 @@ export function composeAgentInstructions(template: string, workspaceEnvironment?
793
884
  return core ? `${template} ${core}` : template;
794
885
  }
795
886
 
887
+ /**
888
+ * Appends the per-session persona instructions to the already-composed
889
+ * (workspace + CORE) instructions, joined by " " — exactly the join used
890
+ * throughout the persona composition. The session slice is intentionally LAST
891
+ * (session-specific refinement of the workspace persona). An absent/blank value
892
+ * is a no-op that returns the composed string byte-for-byte.
893
+ */
894
+ export function appendSessionInstructions(composed: string, sessionInstructions?: string): string {
895
+ const trimmed = sessionInstructions?.trim();
896
+ return trimmed ? `${composed} ${trimmed}` : composed;
897
+ }
898
+
899
+ /**
900
+ * Appends the one-shot genesis title directive (genesis turn only), joined by
901
+ * " " and always LAST so a white-label persona template or a per-session
902
+ * instruction can't drop it. A no-op when the hint is absent.
903
+ */
904
+ export function appendGenesisTitleDirective(instructions: string, genesisTitleHint?: boolean): string {
905
+ return genesisTitleHint ? `${instructions} ${GENESIS_TITLE_DIRECTIVE}` : instructions;
906
+ }
907
+
796
908
  const agentFileDownloads = new WeakMap<object, SandboxFileDownload[]>();
797
909
  const agentRepositoryCloneHooks = new WeakMap<object, SandboxLifecycleHook[]>();
798
910
  // TOKEN-BROKER (B1): the per-turn git token seed, stashed alongside the agent's
@@ -837,9 +949,21 @@ export function buildOpenGeniAgent(settings: Settings, resources: ResourceRef[],
837
949
  // ownership + workspace-environment block) at the {{core}} marker, or
838
950
  // appends it when the template omits the marker. With the default template
839
951
  // and no environment this is byte-identical to the historical preamble.
840
- instructions: options.genesisTitleHint
841
- ? `${composeAgentInstructions(options.instructionsTemplate ?? settings.agentInstructionsTemplate, options.workspaceEnvironment)} ${GENESIS_TITLE_DIRECTIVE}`
842
- : composeAgentInstructions(options.instructionsTemplate ?? settings.agentInstructionsTemplate, options.workspaceEnvironment),
952
+ // Persona composition order (all one system-level instructions string):
953
+ // 1. workspace instructionsTemplate (or deployment default) with the
954
+ // non-bypassable CORE substituted at {{core}} — composeAgentInstructions,
955
+ // 2. + the per-session persona instructions (session-specific, LAST so it
956
+ // refines the workspace persona),
957
+ // 3. + the one-shot genesis title directive (genesis turn only).
958
+ // With no session instructions and no genesis hint this is byte-identical to
959
+ // the historical composed instructions.
960
+ instructions: appendGenesisTitleDirective(
961
+ appendSessionInstructions(
962
+ composeAgentInstructions(options.instructionsTemplate ?? settings.agentInstructionsTemplate, options.workspaceEnvironment),
963
+ options.sessionInstructions,
964
+ ),
965
+ options.genesisTitleHint,
966
+ ),
843
967
  modelSettings: {
844
968
  reasoning: { effort: options.reasoningEffort ?? settings.openaiReasoningEffort, summary: "detailed" },
845
969
  // Server-side compaction (OpenAI platform) requires store=false: the
@@ -1920,7 +2044,6 @@ export async function runAgentStream(agent: Agent<any, any>, input: PreparedAgen
1920
2044
  // every mid-turn follow-up.
1921
2045
  callModelInputFilter,
1922
2046
  };
1923
- void settings.disableOpenaiTracing;
1924
2047
  if (client) {
1925
2048
  runOptions.sandbox = {
1926
2049
  client,
package/src/metrics.ts ADDED
@@ -0,0 +1,5 @@
1
+ export type RuntimeMetricsHooks = {
2
+ onModelCall?: (input: { provider: string; outcome: "completed" | "failed"; durationSeconds: number }) => void;
3
+ onSandboxCreate?: (input: { backend: string; outcome: "completed" | "failed"; durationSeconds: number }) => void;
4
+ onSandboxWarmingTimeout?: (input: { backend: string }) => void;
5
+ };
@@ -32,10 +32,50 @@ export const STREAM_PORT = DESKTOP_STREAM_PORT;
32
32
  export const DISPLAY_STACK_TIMEOUT_MS = 90_000;
33
33
 
34
34
  // PAINTABLE-FRAME gate: poll scrot up to this many times, this many seconds apart,
35
- // waiting for a non-empty frame before declaring the stack "up" (~30s worst case).
35
+ // waiting for an actually-PAINTED frame before declaring the stack "up" (~30s worst case).
36
36
  const PAINT_PROBE_ATTEMPTS = 150;
37
37
  const PAINT_PROBE_INTERVAL_S = 0.2;
38
38
 
39
+ // The paint FLOOR (bytes): a scrot at/above this size is a real painted desktop; below
40
+ // it, the root is still unpainted and the frame would read as "blank" to the model.
41
+ //
42
+ // WHY A SIZE FLOOR, NOT NON-EMPTINESS (the bug this fixes): the old gate only checked
43
+ // `[ -s frame.png ]` (non-empty). But an UNPAINTED root is never zero-byte — a fresh
44
+ // Xvfb draws either the `-retro` weave stipple or (with `-retro` dropped) solid black,
45
+ // and scrot happily encodes that as a small-but-non-empty PNG. So the old gate passed
46
+ // the instant the VNC ports bound — MEASURED at ~1.4s (fast runc host) to several
47
+ // seconds (cold gVisor) BEFORE xfdesktop finishes its first wallpaper paint — handing
48
+ // the model the pre-paint frame. That pre-paint frame is exactly the "blank/black"
49
+ // screenshot that 400s the model and blanks the human viewer.
50
+ //
51
+ // The sizes are unambiguous and were measured on the canonical desktop image (1280x800)
52
+ // under runc — both the current staging image and a fresh local build:
53
+ // painted XFCE desktop (blue-gradient wallpaper + panel + icons): ~210-222 KB
54
+ // `-retro` stipple root (unpainted, current image): ~17 KB
55
+ // solid-black root (unpainted, after we drop `-retro`): ~13.5 KB
56
+ // 60 KB sits ~3.5x above every unpainted state and ~3.5x below a real paint — a wide,
57
+ // unambiguous margin. It holds against BOTH the currently-deployed `-retro` image and
58
+ // the `-retro`-dropped image this change ships, so the runtime gate is correct before
59
+ // AND after the image rebuild lands. (Assumes the default ~1280x800 geometry; a larger
60
+ // framebuffer only scales the painted frame further above the floor.)
61
+ const PAINT_MIN_BYTES = 60_000;
62
+
63
+ // SETTLE gate (the gVisor staged-paint fix): crossing the 60 KB floor is necessary but
64
+ // NOT sufficient. On a fast runc host the paint is atomic (black 13.5 KB -> full 209 KB
65
+ // in one step, panel + icons included). On a STONE-COLD gVisor Modal box it is STAGED:
66
+ // the wallpaper gradient paints and crosses 60 KB a beat BEFORE xfdesktop draws the
67
+ // panel / launcher icons / logo. A screenshot in that window shows a bare teal wallpaper
68
+ // with no panel — which the model correctly reports as "graphical, but the desktop
69
+ // hasn't fully loaded" (VERIFIED live on staging: a cold-box turn's first agent
70
+ // screenshot caught exactly this). So the gate additionally waits for the frame to
71
+ // SETTLE: two consecutive probes both above the floor whose byte-sizes agree within
72
+ // PAINT_SETTLE_DELTA_BYTES. A still-painting desktop grows between probes; a fully
73
+ // rendered, static one is byte-stable (scrot -o omits the cursor, and the clock is
74
+ // minute-precision, so consecutive captures of a settled desktop are near-identical).
75
+ // This makes ensureDisplayStack block until the FULL desktop is up, so the turn's first
76
+ // screenshot — which runs AFTER this gate — sees the panel, not a bare wallpaper.
77
+ const PAINT_SETTLE_DELTA_BYTES = 2_000;
78
+
39
79
  /** Desktop geometry for the framebuffer. v1 has no live RANDR: a resolution
40
80
  * change is a full down -> up restart (a separate op). */
41
81
  export type DesktopGeometry = {
@@ -145,18 +185,25 @@ export function buildDisplayStackScript(options: EnsureDisplayStackOptions = {})
145
185
  // PAINTABLE-FRAME GATE (the completion criterion): the up-script's readiness gates
146
186
  // only assert that Xvfb answers xdpyinfo and that x11vnc:5900 + websockify:PORT are
147
187
  // LISTENING — NOT that the display actually PAINTS. On a stone-cold gVisor box (the
148
- // machine→sandbox swap-recovery turn always hits one), Xvfb can answer and the VNC
149
- // ports can bind seconds BEFORE the root window / XFCE compositor is drawable, so a
150
- // scrot right after the `OPENGENI_DESKTOP_UP` marker yields a ZERO-BYTE frame which
151
- // is exactly the empty screenshot that 400s the model and blanks the human viewer.
188
+ // machine→sandbox swap-recovery turn always hits one), Xvfb answers and the VNC ports
189
+ // bind ~1.4s (fast host) to several seconds BEFORE xfdesktop finishes its first
190
+ // wallpaper paint. In that window a scrot yields a small UNPAINTED frame (the -retro
191
+ // stipple or a solid-black root) — never zero-byte — which is exactly the "blank/black"
192
+ // screenshot that 400s the model and blanks the human viewer. (VERIFIED locally: the
193
+ // real xfdesktop backdrop window maps at full 1280x800 the whole time; the render is
194
+ // never structurally broken — it is purely this pre-paint capture race.)
195
+ //
152
196
  // We therefore chain a real scrot probe as the completion gate: after the up-script
153
- // reports success, poll scrot until it produces a NON-EMPTY frame (bounded ~30s), and
154
- // only THEN let the command exit 0. If it never paints we exit 14 so the caller sees a
155
- // typed DisplayStackError("paint") — an HONEST failure the worker can degrade + log,
156
- // rather than a false "up" that hands the model an empty image. `-ac` on Xvfb disables
197
+ // reports success, poll scrot until it produces an actually-PAINTED frame a PNG at or
198
+ // above PAINT_MIN_BYTES, not merely NON-EMPTY (the old `[ -s ]` check passed on the
199
+ // ~17 KB pre-paint stipple immediately; that WAS the bug) — bounded ~30s, and only THEN
200
+ // let the command exit 0. If it never paints we exit 14 so the caller sees a typed
201
+ // DisplayStackError("paint") — an HONEST failure the worker can degrade + log, rather
202
+ // than a false "up" that hands the model an unpainted image. `-ac` on Xvfb disables
157
203
  // access control so this root-side scrot reaches :0. Runs on a pre-check hit too (cheap
158
204
  // — an already-up display paints on the first probe). Lives in the runtime-built script
159
- // (not the baked image up-script) so it ships with the worker/api, no image rebuild.
205
+ // (not the baked image up-script) so it ships with the worker/api, no image rebuild —
206
+ // and its size floor holds against the currently-deployed image too.
160
207
  const bringUp =
161
208
  `if nc -z 127.0.0.1 ${port} >/dev/null 2>&1 && nc -z 127.0.0.1 5900 >/dev/null 2>&1; then ` +
162
209
  `echo "OPENGENI_DESKTOP_UP port=${port} geometry=${geometry.width}x${geometry.height} dpi=${geometry.dpi} (precheck)"; ` +
@@ -166,13 +213,22 @@ export function buildDisplayStackScript(options: EnsureDisplayStackOptions = {})
166
213
  `env ${env} opengeni-desktop-up; ` +
167
214
  `fi`;
168
215
  const paintProbe =
169
- `p=/tmp/opengeni-desktop/paint-probe.png; ` +
216
+ `p=/tmp/opengeni-desktop/paint-probe.png; prev=0; ` +
170
217
  `for i in $(seq 1 ${PAINT_PROBE_ATTEMPTS}); do ` +
171
- `if DISPLAY=:0 scrot -o "$p" >/dev/null 2>&1 && [ -s "$p" ]; then rm -f "$p"; break; fi; ` +
218
+ // Capture, then measure the PNG byte-size. `wc -c < "$p"` yields a bare integer; a
219
+ // failed scrot leaves sz=0. A frame at/above PAINT_MIN_BYTES is a real painted desktop.
220
+ `if DISPLAY=:0 scrot -o "$p" >/dev/null 2>&1; then sz=$(wc -c < "$p" 2>/dev/null || echo 0); else sz=0; fi; ` +
172
221
  `rm -f "$p"; ` +
222
+ // SETTLE: accept only when THIS probe AND the PREVIOUS one are both above the floor
223
+ // and their sizes agree within PAINT_SETTLE_DELTA_BYTES — i.e., the paint has stopped
224
+ // growing (the full desktop, panel + icons included, is up), not merely crossed the
225
+ // floor mid-paint on a staged gVisor boot. ($sz/$prev/$d are bare shell — no ${}
226
+ // braces — so JS leaves them for bash; ${PAINT_*} ARE JS constants and interpolate.)
227
+ `if [ "$sz" -ge ${PAINT_MIN_BYTES} ] && [ "$prev" -ge ${PAINT_MIN_BYTES} ]; then d=$((sz-prev)); [ "$d" -lt 0 ] && d=$((0-d)); [ "$d" -le ${PAINT_SETTLE_DELTA_BYTES} ] && break; fi; ` +
228
+ `prev=$sz; ` +
173
229
  // NOTE: NOT_PAINTING goes to STDOUT (not stderr): Modal is execCommand-only, so the
174
230
  // caller infers the outcome by string-matching the output — stdout is always captured.
175
- `if [ "$i" = "${PAINT_PROBE_ATTEMPTS}" ]; then echo "OPENGENI_DESKTOP_NOT_PAINTING scrot empty after warmup"; exit 14; fi; ` +
231
+ `if [ "$i" = "${PAINT_PROBE_ATTEMPTS}" ]; then echo "OPENGENI_DESKTOP_NOT_PAINTING scrot below ${PAINT_MIN_BYTES}B or unsettled after warmup (last=$sz)"; exit 14; fi; ` +
176
232
  `sleep ${PAINT_PROBE_INTERVAL_S}; ` +
177
233
  `done`;
178
234
  return `mkdir -p /tmp/opengeni-desktop; { ${bringUp} ; } && { ${paintProbe} ; }`;