@opengeni/runtime 0.2.2 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,149 @@
1
+ import type { AgentInputItem } from "@openai/agents";
2
+
3
/** Text substituted for a screenshot image that has been elided from the model input. */
export const SCREENSHOT_OMITTED_PLACEHOLDER =
  "[screenshot omitted: an older desktop frame — the full image remains in the session event log]";

/**
 * Matches a base64 `data:image/...` URL anywhere inside a string (unanchored,
 * case-insensitive). No `g` flag, so `.test()` carries no state between calls.
 */
const DATA_IMAGE_BASE64_PATTERN = /data:image\/[a-z0-9.+-]+;base64,[a-z0-9+/=_-]+/i;

/** One step of a path into the history: an object key or an array index. */
type PathSegment = string | number;

/** Where an image was found in the history and the value that should replace it. */
type ImageOccurrence = {
  path: PathSegment[];
  replacement: unknown;
};

/** Outcome of an elision pass over a list of history items. */
export type ElideStaleScreenshotsResult<T> = {
  /** The (possibly rewritten) items, in their original order. */
  items: T[];
  /** Total number of image occurrences detected. */
  imageCount: number;
  /** How many of those occurrences were replaced with the placeholder. */
  elidedCount: number;
};

/** Tuning knobs for the elision pass. */
export type ElideStaleScreenshotsOptions = {
  /** Number of most recent images to leave intact (defaults to 3). */
  keepLast?: number;
  /** Replacement text (defaults to SCREENSHOT_OMITTED_PLACEHOLDER). */
  placeholder?: string;
};
25
+
26
/**
 * Replace all but the most recent screenshot images in a list of history items
 * with a short text placeholder.
 *
 * Images are located by `collectItemImageOccurrences`, in item order, so the
 * earliest occurrences are the ones elided. The input array is never mutated:
 * when nothing needs eliding a shallow copy is returned, otherwise a
 * `structuredClone` of the items is rewritten in place.
 *
 * @param items History items to scan.
 * @param options `keepLast` is the number of newest images to leave intact
 *   (default 3; floored and clamped to >= 0; `NaN` falls back to the default).
 *   `placeholder` overrides the replacement text.
 * @returns The resulting items plus the total image count and the number elided.
 */
export function elideStaleScreenshotImages<T extends AgentInputItem>(
  items: readonly T[],
  options: ElideStaleScreenshotsOptions = {},
): ElideStaleScreenshotsResult<T> {
  const defaultKeepLast = 3;
  const requestedKeepLast = options.keepLast ?? defaultKeepLast;
  // NaN would survive Math.floor/Math.max and surface as `elidedCount: NaN`
  // (after a pointless deep clone), so treat it as "not specified".
  const keepLast = Number.isNaN(requestedKeepLast)
    ? defaultKeepLast
    : Math.max(0, Math.floor(requestedKeepLast));
  const placeholder = options.placeholder ?? SCREENSHOT_OMITTED_PLACEHOLDER;

  const occurrences: ImageOccurrence[] = [];
  for (let i = 0; i < items.length; i += 1) {
    collectItemImageOccurrences(items[i], [i], placeholder, occurrences);
  }

  const imageCount = occurrences.length;
  const elidedCount = Math.max(0, imageCount - keepLast);
  if (elidedCount === 0) {
    return { items: items.slice(), imageCount, elidedCount: 0 };
  }

  // Deep-clone before rewriting so the caller's history is left untouched.
  const cloned = structuredClone(items) as T[];
  for (const occurrence of occurrences.slice(0, elidedCount)) {
    setPath(cloned, occurrence.path, occurrence.replacement);
  }
  return { items: cloned, imageCount, elidedCount };
}
48
+
49
/**
 * Record every elidable image found in a single history item.
 *
 * Only computer-call results and function-call results are inspected; user and
 * system messages are explicitly left alone, and every other item type is
 * ignored.
 *
 * @param item The history item (any shape; non-objects are skipped).
 * @param path Path from the history root to `item`.
 * @param placeholder Replacement text for elided images.
 * @param out Accumulator that receives the occurrences found.
 */
function collectItemImageOccurrences(
  item: unknown,
  path: PathSegment[],
  placeholder: string,
  out: ImageOccurrence[],
): void {
  if (!isRecord(item)) {
    return;
  }
  const kind = item.type;
  // Images supplied by the user or the system prompt are never elided.
  if (kind === "message" && (item.role === "user" || item.role === "system")) {
    return;
  }
  switch (kind) {
    case "computer_call_result":
    case "computer_call_output":
      collectComputerOutputImages(item, path, placeholder, out);
      break;
    case "function_call_result":
    case "function_call_output":
      collectToolResultImages(item.output, path.concat("output"), placeholder, out);
      break;
    default:
      break;
  }
}
69
+
70
/**
 * Record the screenshot carried by a computer-call result, if any.
 *
 * The item's `output` must be an object of type `computer_screenshot`; the
 * first of `data`, `image_url`, `imageUrl` holding a base64 image data URL is
 * recorded (at most one occurrence per item), to be replaced by the placeholder.
 *
 * @param item The computer-call result item.
 * @param path Path from the history root to `item`.
 * @param placeholder Replacement text for the image field.
 * @param out Accumulator that receives the occurrence found.
 */
function collectComputerOutputImages(
  item: Record<string, unknown>,
  path: PathSegment[],
  placeholder: string,
  out: ImageOccurrence[],
): void {
  const screenshot = item.output;
  if (isRecord(screenshot) && screenshot.type === "computer_screenshot") {
    const candidateKeys = ["data", "image_url", "imageUrl"];
    const imageKey = candidateKeys.find((candidate) => isImageDataUrl(screenshot[candidate]));
    if (imageKey !== undefined) {
      out.push({ path: [...path, "output", imageKey], replacement: placeholder });
    }
  }
}
87
+
88
/**
 * Walk a tool result and record every base64 image it contains.
 *
 * - A string containing an image data URL is recorded, to be replaced whole by
 *   the placeholder text.
 * - Arrays are walked element by element.
 * - An `input_image` object whose `image`, `imageUrl` or `image_url` holds an
 *   image data URL is recorded, to be replaced whole by an `input_text` part.
 * - Any other object is searched through its `content`, `text` and `output`
 *   properties.
 *
 * @param value The (sub)value of the tool result being inspected.
 * @param path Path from the history root to `value`.
 * @param placeholder Replacement text for elided images.
 * @param out Accumulator that receives the occurrences found.
 */
function collectToolResultImages(
  value: unknown,
  path: PathSegment[],
  placeholder: string,
  out: ImageOccurrence[],
): void {
  if (Array.isArray(value)) {
    value.forEach((entry, index) => {
      collectToolResultImages(entry, [...path, index], placeholder, out);
    });
    return;
  }
  if (typeof value === "string") {
    if (isImageDataUrl(value)) {
      out.push({ path, replacement: placeholder });
    }
    return;
  }
  if (!isRecord(value)) {
    return;
  }

  const record: Record<string, unknown> = value;
  const carriesInlineImage =
    record.type === "input_image" &&
    ["image", "imageUrl", "image_url"].some((imageKey) => isImageDataUrl(record[imageKey]));
  if (carriesInlineImage) {
    // The whole image part is swapped for a text part carrying the placeholder.
    out.push({ path, replacement: { type: "input_text", text: placeholder } });
    return;
  }

  for (const nestedKey of ["content", "text", "output"]) {
    if (nestedKey in record) {
      collectToolResultImages(record[nestedKey], [...path, nestedKey], placeholder, out);
    }
  }
}
123
+
124
/** True when `value` is a string that contains a base64 `data:image/...` URL. */
function isImageDataUrl(value: unknown): value is string {
  if (typeof value !== "string") {
    return false;
  }
  return DATA_IMAGE_BASE64_PATTERN.test(value);
}
127
+
128
/** True for non-null, non-array objects, i.e. values that can be indexed by string keys. */
function isRecord(value: unknown): value is Record<string, unknown> {
  if (value === null || typeof value !== "object") {
    return false;
  }
  return !Array.isArray(value);
}
131
+
132
/**
 * Assign `value` at `path` inside `root`, mutating `root` in place.
 *
 * Every segment but the last is followed through arrays (numeric index) or
 * objects (string key); the last segment names the slot that is overwritten.
 * An empty path is a no-op. Intermediate containers are not validated.
 */
function setPath(root: unknown, path: PathSegment[], value: unknown): void {
  if (path.length === 0) {
    return;
  }
  const parentSegments = path.slice(0, -1);
  const finalSegment = path[path.length - 1]!;

  let container = root;
  for (const segment of parentSegments) {
    // Arrays and plain objects are both indexed by the raw segment.
    container = (container as Record<PathSegment, unknown>)[segment];
  }
  (container as Record<PathSegment, unknown>)[finalSegment] = value;
}
package/src/index.ts CHANGED
@@ -1,5 +1,5 @@
1
1
  import type { ConfiguredModel, ContextCompactionMode, ModelProviderApi, ResolvedModelProvider, Settings } from "@opengeni/config";
2
- import { AGENT_INSTRUCTIONS_CORE_PLACEHOLDER, collectSandboxEnvironment, contextServerCompactThreshold, firstPartyMcpBaseUrl, parseExposedPorts, resolveContextCompactionMode, resolveModelProvider, sandboxLifecycleHookIds } from "@opengeni/config";
2
+ import { AGENT_INSTRUCTIONS_CORE_PLACEHOLDER, collectSandboxEnvironment, contextInputBudgetTokens, contextServerCompactThreshold, firstPartyMcpBaseUrl, parseExposedPorts, resolveContextCompactionMode, resolveModelProvider, sandboxLifecycleHookIds } from "@opengeni/config";
3
3
  import { CAPABILITY_DESCRIPTORS, isClearedRunStateBlob, signDelegatedAccessToken, type Permission, type ReasoningEffort, type ResourceRef, type SessionEventType, type ToolRef } from "@opengeni/contracts";
4
4
  import {
5
5
  Agent,
@@ -82,8 +82,17 @@ import { dirname, isAbsolute, join, posix as posixPath, relative } from "node:pa
82
82
  import { fileURLToPath } from "node:url";
83
83
 
84
84
  import { computerCallNormalizingFetch, normalizeComputerCallActions, sanitizeHistoryItemsForModel } from "./history-sanitizer";
85
+ import { elideStaleScreenshotImages } from "./image-history";
85
86
  import { installCodexToolSearch } from "./codex-tool-search";
86
- import { enforceInputBudget, estimateItemTokens } from "./context-compaction";
87
+ import {
88
+ CompactionNeededError,
89
+ SUMMARY_BUFFER_TOKENS,
90
+ clientCompactionThresholdTokens,
91
+ enforceInputBudget,
92
+ estimateItemTokens,
93
+ estimateTokens,
94
+ renderCompactionPromptInputForChat,
95
+ } from "./context-compaction";
87
96
  import {
88
97
  createSandboxClient,
89
98
  deserializeSandboxSessionStateEnvelope,
@@ -134,22 +143,34 @@ export type { HistoryItem } from "./history-sanitizer";
134
143
  export { OpenAIChatCompletionsModel, OpenAIResponsesModel } from "@openai/agents";
135
144
 
136
145
  export {
137
- planCompaction,
146
+ CompactionNeededError,
147
+ buildCompactionPromptInput,
148
+ buildCompactionReplacementHistory,
149
+ clientCompactionThresholdTokens,
150
+ decideClientCompaction,
138
151
  enforceInputBudget,
139
152
  buildSummaryItem,
140
- buildCompactionMessages,
153
+ findCompactionNeededError,
141
154
  isCompactionSummary,
142
155
  isUserMessage,
143
156
  findKeepBoundary,
144
157
  estimateTokens,
145
158
  estimateItemTokens,
146
- compactionSummaryText,
147
- renderPrefixTranscript,
159
+ renderCompactionPromptInputForChat,
148
160
  COMPACTION_SUMMARY_MARKER,
161
+ COMPACTION_PROMPT,
162
+ COMPACT_USER_MESSAGE_MAX_TOKENS,
163
+ CLIENT_COMPACTION_TRIGGER_FRACTION,
164
+ SUMMARY_BUFFER_TOKENS,
149
165
  SUMMARY_PREFIX,
150
- SUMMARY_INSTRUCTIONS,
166
+ USER_MESSAGE_TRUNCATION_MARKER,
151
167
  } from "./context-compaction";
152
- export type { CompactionItem, CompactionPlan, PlanCompactionInput } from "./context-compaction";
168
+ export type { ClientCompactionDecision, CompactionItem } from "./context-compaction";
169
+ export {
170
+ elideStaleScreenshotImages,
171
+ SCREENSHOT_OMITTED_PLACEHOLDER,
172
+ } from "./image-history";
173
+ export type { ElideStaleScreenshotsOptions, ElideStaleScreenshotsResult } from "./image-history";
153
174
 
154
175
  ensureReadableStreamFrom();
155
176
 
@@ -500,10 +521,10 @@ export function configureOpenAI(settings: Settings): void {
500
521
 
501
522
  /**
502
523
  * Run the compaction summarizer as one plain, tool-less, non-streaming model
503
- * call against the resolved provider. `system`/`user` come from
504
- * buildCompactionMessages. Returns the trimmed summary text, or null on any
524
+ * call against the resolved provider. `input` is the active history plus
525
+ * Codex's checkpoint prompt. Returns the trimmed summary text, or null on any
505
526
  * failure (the caller treats a failed summarize as "skip compaction this turn"
506
- * never fatal). The call deliberately does NOT request reasoning encryption,
527
+ * - never fatal). The call deliberately does NOT request reasoning encryption,
507
528
  * tools, or server-side compaction; it is a self-contained summarize.
508
529
  *
509
530
  * Provider-aware: the summary always runs on the SAME provider that serves the
@@ -517,22 +538,19 @@ export function configureOpenAI(settings: Settings): void {
517
538
  */
518
539
  export async function summarizeForCompaction(
519
540
  settings: Settings,
520
- messages: { system: string; user: string },
541
+ input: Array<Record<string, unknown>>,
521
542
  options: { client?: OpenAI; api?: ModelProviderApi; maxOutputTokens?: number; model?: string } = {},
522
543
  ): Promise<string | null> {
523
544
  const client = options.client ?? buildOpenAIClientFromSettings(settings);
524
545
  const api = options.api ?? "responses";
525
546
  const model = options.model ?? settings.openaiModel;
526
- const maxTokens = options.maxOutputTokens ?? settings.contextSummaryMaxTokens;
547
+ const maxTokens = options.maxOutputTokens ?? SUMMARY_BUFFER_TOKENS;
527
548
  try {
528
549
  if (api === "chat") {
529
550
  const completion = await client.chat.completions.create({
530
551
  model,
531
552
  max_tokens: maxTokens,
532
- messages: [
533
- { role: "system", content: messages.system },
534
- { role: "user", content: messages.user },
535
- ],
553
+ messages: [{ role: "user", content: renderCompactionPromptInputForChat(input) }],
536
554
  } as any);
537
555
  const text = (completion as { choices?: Array<{ message?: { content?: unknown } }> }).choices?.[0]?.message?.content;
538
556
  const trimmed = typeof text === "string" ? text.trim() : "";
@@ -545,10 +563,7 @@ export async function summarizeForCompaction(
545
563
  // built-in path (api "responses"), so gate it on the built-in provider.
546
564
  ...(settings.openaiProvider === "azure" ? {} : { store: false }),
547
565
  max_output_tokens: maxTokens,
548
- input: [
549
- { role: "system", content: messages.system },
550
- { role: "user", content: messages.user },
551
- ],
566
+ input,
552
567
  } as any);
553
568
  const text = extractResponseOutputText(response);
554
569
  const trimmed = text.trim();
@@ -1573,6 +1588,7 @@ export type RunAgentStreamOptions = {
1573
1588
  sandboxClient?: unknown;
1574
1589
  sandboxEnvironment?: Record<string, string>;
1575
1590
  onRuntimeEvent?: (event: NormalizedRuntimeEvent) => Promise<void> | void;
1591
+ contextCompactionSignalTokens?: () => number | null | undefined;
1576
1592
  // OWNERSHIP INVERSION (P1.2): an externally-owned, already-live sandbox
1577
1593
  // session resolved by the per-turn resume-by-id path. When present,
1578
1594
  // runAgentStream does NOT build (or resume, or discard) a client — it threads
@@ -1603,6 +1619,11 @@ export type RunAgentStreamOptions = {
1603
1619
  callModelInputFilter?: CallModelInputFilter;
1604
1620
  };
1605
1621
 
1622
/** Options for the per-call context-robustness model-input filter. */
export type ContextRobustnessFilterOptions = {
  /**
   * Supplies a token count used as the compaction signal. A non-positive or
   * missing value makes the filter fall back to its own estimate of the input.
   */
  contextCompactionSignalTokens?: () => number | null | undefined;
  /** When true (and client compaction is active), the filter throws once the signal exceeds the threshold. */
  throwOnCompactionNeeded?: boolean;
};
1626
+
1606
1627
  // One-shot directive appended to the agent's system prompt on the genesis turn
1607
1628
  // (see buildOpenGeniAgent's genesisTitleHint). Delivered through the
1608
1629
  // authoritative instructions channel so the model reliably obeys; references
@@ -1656,6 +1677,59 @@ export const normalizeComputerCallsFilter: CallModelInputFilter = ({ modelData }
1656
1677
  ) as unknown as AgentInputItem[],
1657
1678
  });
1658
1679
 
1680
+ export function contextRobustnessFilterForSettings(
1681
+ settings: Settings,
1682
+ options: ContextRobustnessFilterOptions = {},
1683
+ ): CallModelInputFilter {
1684
+ const inputBudgetTokens = modelCallBudgetTokens(settings);
1685
+ const clientCompactionMode = resolveContextCompactionMode(settings) === "client";
1686
+ const compactionThresholdTokens = clientCompactionThresholdTokens(settings);
1687
+ return ({ modelData }) => {
1688
+ const images = elideStaleScreenshotImages(modelData.input);
1689
+ if (images.elidedCount > 0) {
1690
+ console.warn(
1691
+ `per-call image history policy elided ${images.elidedCount} older screenshot image(s), keeping the last ${Math.min(3, images.imageCount)} full image(s)`,
1692
+ );
1693
+ }
1694
+ let input = images.items;
1695
+ if (inputBudgetTokens !== undefined) {
1696
+ const guarded = enforceInputBudget(
1697
+ input as unknown as Array<Record<string, unknown>>,
1698
+ inputBudgetTokens,
1699
+ );
1700
+ if (guarded.trimmed) {
1701
+ console.warn(
1702
+ `per-call budget guard trimmed ${guarded.droppedCount} oldest history item(s) to fit input budget (${inputBudgetTokens} tokens); the over-budget model call was NOT sent`,
1703
+ );
1704
+ input = guarded.items as unknown as AgentInputItem[];
1705
+ }
1706
+ }
1707
+ if (clientCompactionMode && options.throwOnCompactionNeeded) {
1708
+ const reported = options.contextCompactionSignalTokens?.();
1709
+ const hasReported = typeof reported === "number" && reported > 0;
1710
+ const signalTokens = hasReported
1711
+ ? reported
1712
+ : estimateTokens(input as unknown as Array<Record<string, unknown>>);
1713
+ if (signalTokens > compactionThresholdTokens) {
1714
+ throw new CompactionNeededError({
1715
+ signalTokens,
1716
+ thresholdTokens: compactionThresholdTokens,
1717
+ signalSource: hasReported ? "provider" : "estimate",
1718
+ });
1719
+ }
1720
+ }
1721
+ return { ...modelData, input };
1722
+ };
1723
+ }
1724
+
1725
+ function modelCallBudgetTokens(settings: Settings): number | undefined {
1726
+ if (resolveContextCompactionMode(settings) !== "client") {
1727
+ return undefined;
1728
+ }
1729
+ const budget = contextInputBudgetTokens(settings);
1730
+ return budget > 0 ? budget : undefined;
1731
+ }
1732
+
1659
1733
  /**
1660
1734
  * Compose a list of callModelInputFilters into one, applied left-to-right so
1661
1735
  * each sees the prior filter's output.
@@ -1674,13 +1748,18 @@ function composeCallModelInputFilters(filters: CallModelInputFilter[]): CallMode
1674
1748
  * The model-input filter applied before every model call. The computer_call
1675
1749
  * action/actions normalizer is ALWAYS on (the Azure endpoint 400s without it);
1676
1750
  * the provider-item-id strip is layered on top when the configured policy
1677
- * selects it.
1751
+ * selects it; the context-robustness guard then elides stale screenshots on
1752
+ * every mode and applies hard budget trimming only on the client-compaction path.
1678
1753
  */
1679
export function callModelInputFilterForSettings(
  settings: Settings,
  options: ContextRobustnessFilterOptions = {},
): CallModelInputFilter | undefined {
  // Order matters: normalize computer calls first, optionally strip provider
  // item ids, then run the context-robustness guard on the result.
  const stripProviderIds = settings.openaiProviderItemIds === "strip";
  const chain: CallModelInputFilter[] = [
    normalizeComputerCallsFilter,
    ...(stripProviderIds ? [stripProviderItemIdsFilter] : []),
    contextRobustnessFilterForSettings(settings, options),
  ];
  return composeCallModelInputFilters(chain);
}
1686
1765
 
@@ -1759,7 +1838,15 @@ export async function runAgentStream(agent: Agent<any, any>, input: PreparedAgen
1759
1838
  // through the client during this run (it is inert for the provided session).
1760
1839
  const decoratedClient = withSandboxLifecycleHooks(resourceClient, ownedHooks, ownedHookContext);
1761
1840
  const ownedFilter = composeCallModelInputFilters(
1762
- [callModelInputFilterForSettings(settings), overrides.callModelInputFilter].filter(
1841
+ [
1842
+ callModelInputFilterForSettings(settings, {
1843
+ throwOnCompactionNeeded: Boolean(overrides.contextCompactionSignalTokens),
1844
+ ...(overrides.contextCompactionSignalTokens
1845
+ ? { contextCompactionSignalTokens: overrides.contextCompactionSignalTokens }
1846
+ : {}),
1847
+ }),
1848
+ overrides.callModelInputFilter,
1849
+ ].filter(
1763
1850
  (f): f is CallModelInputFilter => Boolean(f),
1764
1851
  ),
1765
1852
  );
@@ -1806,23 +1893,31 @@ export async function runAgentStream(agent: Agent<any, any>, input: PreparedAgen
1806
1893
  ?? (prepared.serializedRunStateForSandbox && client
1807
1894
  ? await restoredSandboxSessionState(await RunState.fromString(agent, prepared.serializedRunStateForSandbox), client)
1808
1895
  : undefined);
1809
- // Strip provider item ids first, then apply any per-turn filter (genesis
1810
- // title directive). Composed left-to-right so the directive lands on the
1811
- // already-id-stripped input. A callModelInputFilter only shapes the per-call
1812
- // model input, never the persisted run-state history.
1896
+ // Apply the built-in per-call filters (computer-call normalization, optional
1897
+ // provider-id stripping, image/budget guard), then any per-turn filter
1898
+ // (genesis title directive). A callModelInputFilter only shapes the per-call
1899
+ // model input; the SDK persists filtered clones into its session view, while
1900
+ // OpenGeni's durable conversation truth is still reconciled explicitly below.
1813
1901
  const callModelInputFilter = composeCallModelInputFilters(
1814
- [callModelInputFilterForSettings(settings), overrides.callModelInputFilter].filter(
1902
+ [
1903
+ callModelInputFilterForSettings(settings, {
1904
+ throwOnCompactionNeeded: Boolean(overrides.contextCompactionSignalTokens),
1905
+ ...(overrides.contextCompactionSignalTokens
1906
+ ? { contextCompactionSignalTokens: overrides.contextCompactionSignalTokens }
1907
+ : {}),
1908
+ }),
1909
+ overrides.callModelInputFilter,
1910
+ ].filter(
1815
1911
  (f): f is CallModelInputFilter => Boolean(f),
1816
1912
  ),
1817
1913
  );
1818
1914
  const runOptions: Parameters<typeof run>[2] = {
1819
1915
  stream: true,
1820
1916
  maxTurns: settings.agentMaxModelCallsPerTurn,
1821
- // Strip provider-assigned item ids from every model call (turn-start
1822
- // history replay AND mid-turn follow-ups) so requests never depend on the
1823
- // provider's server-side response store. A stored response can vanish
1824
- // between two calls of the same turn, failing the run with 400 "Item with
1825
- // id 'rs_…' not found"; with the ids gone the request is self-contained.
1917
+ // Built-in per-call guard chain: normalize computer calls, optionally strip
1918
+ // provider ids, elide stale screenshots in every mode, and trim to the input
1919
+ // budget on the client-compaction path. This runs for turn-start replay AND
1920
+ // every mid-turn follow-up.
1826
1921
  callModelInputFilter,
1827
1922
  };
1828
1923
  void settings.disableOpenaiTracing;
@@ -24,11 +24,17 @@ import { DESKTOP_STREAM_PORT } from "@opengeni/contracts";
24
24
  export { DESKTOP_STREAM_PORT };
25
25
  export const STREAM_PORT = DESKTOP_STREAM_PORT;
26
26
 
27
- // The whole-stack launch is bounded by the readiness gates inside the script
28
- // (four loops of 50 * 0.1s = ~5s each, ~20s worst case) PLUS first-boot XFCE/dbus
29
- // + font-cache warm-up on a cold gVisor box. 60s gives headroom over the spike's
30
- // observed ~5-10s warm path without masking a genuine wedge.
31
- export const DISPLAY_STACK_TIMEOUT_MS = 60_000;
27
+ // The whole-stack launch is bounded by the readiness gates inside the up-script
28
+ // (four loops of 50 * 0.1s = ~5s each, ~20s worst case) PLUS the PAINTABLE-FRAME
29
+ // gate we append (up to ~30s of scrot probing) PLUS first-boot XFCE/dbus + font-cache
30
+ // warm-up on a cold gVisor box. 90s gives headroom over the spike's observed ~5-10s
31
+ // warm path AND the cold-box paint warm-up without masking a genuine wedge.
32
+ export const DISPLAY_STACK_TIMEOUT_MS = 90_000;
33
+
34
+ // PAINTABLE-FRAME gate: poll scrot up to this many times, this many seconds apart,
35
+ // waiting for a non-empty frame before declaring the stack "up" (~30s worst case).
36
+ const PAINT_PROBE_ATTEMPTS = 150;
37
+ const PAINT_PROBE_INTERVAL_S = 0.2;
32
38
 
33
39
  /** Desktop geometry for the framebuffer. v1 has no live RANDR: a resolution
34
40
  * change is a full down -> up restart (a separate op). */
@@ -41,15 +47,25 @@ export type DesktopGeometry = {
41
47
  export const DEFAULT_DESKTOP_GEOMETRY: DesktopGeometry = { width: 1280, height: 800, dpi: 96 };
42
48
 
43
49
  /** Thrown when a stage of the launch script failed. exitCode 11/12/13 map to
44
- * Xvfb / x11vnc / websockify respectively (the stage that died). Degradation is
45
- * surfaced as a value to viewers by the caller; this error is for diagnostics. */
50
+ * Xvfb / x11vnc / websockify respectively (the stage that died); 14 is the
51
+ * PAINTABLE-FRAME gate (ports listening but scrot still yields an empty frame:
52
+ * the display is up but not actually painting). Degradation is surfaced as a
53
+ * value to viewers by the caller; this error is for diagnostics. */
46
54
  export class DisplayStackError extends Error {
47
55
  readonly exitCode: number;
48
- readonly stage: "xvfb" | "x11vnc" | "websockify" | "unknown";
56
+ readonly stage: "xvfb" | "x11vnc" | "websockify" | "paint" | "unknown";
49
57
 
50
58
  constructor(exitCode: number, output: string) {
51
59
  const stage =
52
- exitCode === 11 ? "xvfb" : exitCode === 12 ? "x11vnc" : exitCode === 13 ? "websockify" : "unknown";
60
+ exitCode === 11
61
+ ? "xvfb"
62
+ : exitCode === 12
63
+ ? "x11vnc"
64
+ : exitCode === 13
65
+ ? "websockify"
66
+ : exitCode === 14
67
+ ? "paint"
68
+ : "unknown";
53
69
  super(`desktop display stack failed at stage "${stage}" (exit ${exitCode})${output ? `:\n${output}` : ""}`);
54
70
  this.name = "DisplayStackError";
55
71
  this.exitCode = exitCode;
@@ -125,15 +141,41 @@ export function buildDisplayStackScript(options: EnsureDisplayStackOptions = {})
125
141
  // flock -w bounds the wait so a wedged holder can't deadlock the caller; the
126
142
  // up-script itself ALSO takes the same lock (belt + braces) so this works even
127
143
  // against an older image that predates the wrapper.
128
- return (
144
+ //
145
+ // PAINTABLE-FRAME GATE (the completion criterion): the up-script's readiness gates
146
+ // only assert that Xvfb answers xdpyinfo and that x11vnc:5900 + websockify:PORT are
147
+ // LISTENING — NOT that the display actually PAINTS. On a stone-cold gVisor box (the
148
+ // machine→sandbox swap-recovery turn always hits one), Xvfb can answer and the VNC
149
+ // ports can bind seconds BEFORE the root window / XFCE compositor is drawable, so a
150
+ // scrot right after the `OPENGENI_DESKTOP_UP` marker yields a ZERO-BYTE frame — which
151
+ // is exactly the empty screenshot that 400s the model and blanks the human viewer.
152
+ // We therefore chain a real scrot probe as the completion gate: after the up-script
153
+ // reports success, poll scrot until it produces a NON-EMPTY frame (bounded ~30s), and
154
+ // only THEN let the command exit 0. If it never paints we exit 14 so the caller sees a
155
+ // typed DisplayStackError("paint") — an HONEST failure the worker can degrade + log,
156
+ // rather than a false "up" that hands the model an empty image. `-ac` on Xvfb disables
157
+ // access control so this root-side scrot reaches :0. Runs on a pre-check hit too (cheap
158
+ // — an already-up display paints on the first probe). Lives in the runtime-built script
159
+ // (not the baked image up-script) so it ships with the worker/api, no image rebuild.
160
+ const bringUp =
129
161
  `if nc -z 127.0.0.1 ${port} >/dev/null 2>&1 && nc -z 127.0.0.1 5900 >/dev/null 2>&1; then ` +
130
162
  `echo "OPENGENI_DESKTOP_UP port=${port} geometry=${geometry.width}x${geometry.height} dpi=${geometry.dpi} (precheck)"; ` +
131
163
  `else ` +
132
164
  `mkdir -p /tmp/opengeni-desktop && ` +
133
165
  `flock -w 45 /tmp/opengeni-desktop/up.outer.lock ` +
134
166
  `env ${env} opengeni-desktop-up; ` +
135
- `fi`
136
- );
167
+ `fi`;
168
+ const paintProbe =
169
+ `p=/tmp/opengeni-desktop/paint-probe.png; ` +
170
+ `for i in $(seq 1 ${PAINT_PROBE_ATTEMPTS}); do ` +
171
+ `if DISPLAY=:0 scrot -o "$p" >/dev/null 2>&1 && [ -s "$p" ]; then rm -f "$p"; break; fi; ` +
172
+ `rm -f "$p"; ` +
173
+ // NOTE: NOT_PAINTING goes to STDOUT (not stderr): Modal is execCommand-only, so the
174
+ // caller infers the outcome by string-matching the output — stdout is always captured.
175
+ `if [ "$i" = "${PAINT_PROBE_ATTEMPTS}" ]; then echo "OPENGENI_DESKTOP_NOT_PAINTING scrot empty after warmup"; exit 14; fi; ` +
176
+ `sleep ${PAINT_PROBE_INTERVAL_S}; ` +
177
+ `done`;
178
+ return `mkdir -p /tmp/opengeni-desktop; { ${bringUp} ; } && { ${paintProbe} ; }`;
137
179
  }
138
180
 
139
181
  function execResultOutput(result: ExecResultLike | string): string {
@@ -157,6 +199,13 @@ function execResultExitCode(result: ExecResultLike | string): number | null {
157
199
  // bare string), we infer success from the OPENGENI_DESKTOP_UP marker and infer
158
200
  // the failing stage from the stage-failure message the script prints to stderr.
159
201
  function inferExitFromOutput(output: string): number {
202
+ // Check the PAINTABLE-FRAME failure FIRST: on that path the up-script already
203
+ // printed OPENGENI_DESKTOP_UP (bring-up succeeded) and THEN the paint gate failed,
204
+ // so both markers are present — the NOT_PAINTING one is the authoritative outcome.
205
+ // (Modal is execCommand-only, so this string-inference path is the live one.)
206
+ if (/OPENGENI_DESKTOP_NOT_PAINTING/.test(output)) {
207
+ return 14;
208
+ }
160
209
  if (/OPENGENI_DESKTOP_UP\b/.test(output)) {
161
210
  return 0;
162
211
  }
@@ -67,10 +67,16 @@ const SCROLL_NOTCH_PIXELS = 100;
67
67
  const SCROLL_MAX_CLICKS = 15;
68
68
  // screenshot() never hands the model an empty image_url (the SDK turns "" into
69
69
  // `image_url: ''`, which the model API 400s). A cold/not-yet-painting :0 can yield
70
- // a zero-byte frame on the first scrot; bounded retries with a short pause let a
71
- // momentarily-unpainted-but-live display self-heal before we FAIL LOUD.
72
- const SCREENSHOT_MAX_ATTEMPTS = 3;
73
- const SCREENSHOT_RETRY_DELAY_MS = 400;
70
+ // zero-byte frames for the WHOLE warm-up window of a freshly cold-booted box — Xvfb
71
+ // + XFCE + dbus + font-cache under gVisor routinely take 20s+, and the recovery path
72
+ // after a machine→sandbox swap ALWAYS hits a stone-cold Modal box on its first turn.
73
+ // So we retry across a bounded WALL-CLOCK budget (not a tiny fixed attempt count) with
74
+ // a short pause between tries, so that first post-cold / post-swap screenshot self-heals
75
+ // as the display warms — then FAIL LOUD once the budget is genuinely spent (a display
76
+ // that is dead, not merely warming). ~800ms of retries (the prior 3×400ms) was far too
77
+ // short to ride out a cold gVisor XFCE boot, so the turn failed loud on a transient.
78
+ const SCREENSHOT_WARMUP_BUDGET_MS = 30_000;
79
+ const SCREENSHOT_RETRY_DELAY_MS = 750;
74
80
 
75
81
  export type SandboxComputerOptions = {
76
82
  display?: string; // ":0"
@@ -79,6 +85,11 @@ export type SandboxComputerOptions = {
79
85
  typeDelayMs?: number; // xdotool type --delay (default 12ms)
80
86
  readOnly?: boolean; // when true, every WRITE action throws ComputerReadOnlyError
81
87
  screenshotTmpDir?: string; // "/tmp"
88
+ // How long screenshot() keeps retrying an empty (still-warming) frame before it
89
+ // FAILS LOUD, and the pause between tries. Defaults to the cold-boot warm-up budget;
90
+ // exposed mainly so tests can shrink it (a real caller wants the full budget).
91
+ screenshotWarmupBudgetMs?: number;
92
+ screenshotRetryDelayMs?: number;
82
93
  };
83
94
 
84
95
  // X keysym map for keypress(): model key names → xdotool keysyms.
@@ -144,6 +155,8 @@ export class SandboxComputer implements Computer {
144
155
  private readonly typeDelayMs: number;
145
156
  private readonly readOnly: boolean;
146
157
  private readonly tmp: string;
158
+ private readonly screenshotWarmupBudgetMs: number;
159
+ private readonly screenshotRetryDelayMs: number;
147
160
 
148
161
  constructor(session: SandboxSessionLike, opts: SandboxComputerOptions = {}) {
149
162
  this.session = session as unknown as ComputerSession;
@@ -155,6 +168,8 @@ export class SandboxComputer implements Computer {
155
168
  this.typeDelayMs = opts.typeDelayMs ?? 12;
156
169
  this.readOnly = opts.readOnly ?? false;
157
170
  this.tmp = opts.screenshotTmpDir ?? "/tmp";
171
+ this.screenshotWarmupBudgetMs = opts.screenshotWarmupBudgetMs ?? SCREENSHOT_WARMUP_BUDGET_MS;
172
+ this.screenshotRetryDelayMs = opts.screenshotRetryDelayMs ?? SCREENSHOT_RETRY_DELAY_MS;
158
173
  }
159
174
 
160
175
  /** Rebind to a freshly resumed-by-id session after a box rollover / re-establish. */
@@ -231,17 +246,23 @@ export class SandboxComputer implements Computer {
231
246
  // but momentarily not painting (XFCE/dbus still warming) recovers without
232
247
  // failing the turn.
233
248
  let lastError: unknown;
234
- for (let attempt = 0; attempt < SCREENSHOT_MAX_ATTEMPTS; attempt++) {
249
+ const deadline = Date.now() + this.screenshotWarmupBudgetMs;
250
+ let attempt = 0;
251
+ // Retry across a WALL-CLOCK budget (not a fixed count): a stone-cold box on the
252
+ // first post-swap / post-cold turn can take 20s+ to paint, and a zero-byte frame
253
+ // is a KNOWN transient during that warm-up — not a reason to fail the turn.
254
+ while (true) {
235
255
  if (attempt > 0) {
236
- await new Promise((r) => setTimeout(r, SCREENSHOT_RETRY_DELAY_MS));
256
+ await new Promise((r) => setTimeout(r, this.screenshotRetryDelayMs));
237
257
  }
258
+ attempt++;
238
259
  const f = `${this.tmp}/og-shot-${Date.now()}-${Math.random().toString(36).slice(2)}.png`;
239
260
  try {
240
261
  await this.x(`scrot --pointer --overwrite ${f}`);
241
262
  const bytes = await this.readScreenshotBytes(f);
242
263
  if (bytes.length === 0) {
243
264
  // A cold/not-yet-painting :0 yields a zero-byte frame. Retry rather than
244
- // hand the model an empty image_url; throw on the final attempt.
265
+ // hand the model an empty image_url; throw once the budget is spent.
245
266
  throw new ComputerUnavailableError("scrot produced an empty screenshot (display not up?)");
246
267
  }
247
268
  return Buffer.from(bytes).toString("base64");
@@ -252,9 +273,15 @@ export class SandboxComputer implements Computer {
252
273
  // screenshot result.
253
274
  await this.x(`rm -f ${f}`).catch(() => undefined);
254
275
  }
276
+ // Stop once the warm-up budget is spent — the NEXT sleep would push us past it.
277
+ if (Date.now() + this.screenshotRetryDelayMs >= deadline) {
278
+ break;
279
+ }
255
280
  }
256
- // Exhausted retries: FAIL LOUD. A clear throw is the only acceptable outcome —
257
- // returning "" here would surface to the model as an invalid empty image_url.
281
+ // Exhausted the warm-up budget: FAIL LOUD. A clear throw is the only acceptable
282
+ // outcome — returning "" here would surface to the model as an invalid empty
283
+ // image_url. Reaching here means the display was still dead after ~30s, not merely
284
+ // warming, so a hard action failure is correct.
258
285
  if (lastError instanceof Error) {
259
286
  throw lastError;
260
287
  }