@opengeni/runtime 0.2.1 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-2PO56VAL.js → chunk-KNW7AMQB.js} +11 -4
- package/dist/chunk-KNW7AMQB.js.map +1 -0
- package/dist/index.d.ts +113 -177
- package/dist/index.js +371 -171
- package/dist/index.js.map +1 -1
- package/dist/sandbox/index.d.ts +6 -4
- package/dist/sandbox/index.js +1 -1
- package/package.json +5 -5
- package/src/context-compaction.ts +217 -348
- package/src/image-history.ts +149 -0
- package/src/index.ts +184 -60
- package/src/sandbox/display-stack.ts +61 -12
- package/src/sandbox-computer.ts +90 -18
- package/dist/chunk-2PO56VAL.js.map +0 -1
|
@@ -24,11 +24,17 @@ import { DESKTOP_STREAM_PORT } from "@opengeni/contracts";
|
|
|
24
24
|
export { DESKTOP_STREAM_PORT };
|
|
25
25
|
export const STREAM_PORT = DESKTOP_STREAM_PORT;
|
|
26
26
|
|
|
27
|
-
// The whole-stack launch is bounded by the readiness gates inside the script
|
|
28
|
-
// (four loops of 50 * 0.1s = ~5s each, ~20s worst case) PLUS
|
|
29
|
-
//
|
|
30
|
-
//
|
|
31
|
-
|
|
27
|
+
// The whole-stack launch is bounded by the readiness gates inside the up-script
|
|
28
|
+
// (four loops of 50 * 0.1s = ~5s each, ~20s worst case) PLUS the PAINTABLE-FRAME
|
|
29
|
+
// gate we append (up to ~30s of scrot probing) PLUS first-boot XFCE/dbus + font-cache
|
|
30
|
+
// warm-up on a cold gVisor box. 90s gives headroom over the spike's observed ~5-10s
|
|
31
|
+
// warm path AND the cold-box paint warm-up without masking a genuine wedge.
|
|
32
|
+
export const DISPLAY_STACK_TIMEOUT_MS = 90_000;
|
|
33
|
+
|
|
34
|
+
// PAINTABLE-FRAME gate: poll scrot up to this many times, this many seconds apart,
|
|
35
|
+
// waiting for a non-empty frame before declaring the stack "up" (~30s worst case).
|
|
36
|
+
const PAINT_PROBE_ATTEMPTS = 150;
|
|
37
|
+
const PAINT_PROBE_INTERVAL_S = 0.2;
|
|
32
38
|
|
|
33
39
|
/** Desktop geometry for the framebuffer. v1 has no live RANDR: a resolution
|
|
34
40
|
* change is a full down -> up restart (a separate op). */
|
|
@@ -41,15 +47,25 @@ export type DesktopGeometry = {
|
|
|
41
47
|
export const DEFAULT_DESKTOP_GEOMETRY: DesktopGeometry = { width: 1280, height: 800, dpi: 96 };
|
|
42
48
|
|
|
43
49
|
/** Thrown when a stage of the launch script failed. exitCode 11/12/13 map to
|
|
44
|
-
* Xvfb / x11vnc / websockify respectively (the stage that died)
|
|
45
|
-
*
|
|
50
|
+
* Xvfb / x11vnc / websockify respectively (the stage that died); 14 is the
|
|
51
|
+
* PAINTABLE-FRAME gate (ports listening but scrot still yields an empty frame —
|
|
52
|
+
* the display is up but not actually painting). Degradation is surfaced as a
|
|
53
|
+
* value to viewers by the caller; this error is for diagnostics. */
|
|
46
54
|
export class DisplayStackError extends Error {
|
|
47
55
|
readonly exitCode: number;
|
|
48
|
-
readonly stage: "xvfb" | "x11vnc" | "websockify" | "unknown";
|
|
56
|
+
readonly stage: "xvfb" | "x11vnc" | "websockify" | "paint" | "unknown";
|
|
49
57
|
|
|
50
58
|
constructor(exitCode: number, output: string) {
|
|
51
59
|
const stage =
|
|
52
|
-
exitCode === 11
|
|
60
|
+
exitCode === 11
|
|
61
|
+
? "xvfb"
|
|
62
|
+
: exitCode === 12
|
|
63
|
+
? "x11vnc"
|
|
64
|
+
: exitCode === 13
|
|
65
|
+
? "websockify"
|
|
66
|
+
: exitCode === 14
|
|
67
|
+
? "paint"
|
|
68
|
+
: "unknown";
|
|
53
69
|
super(`desktop display stack failed at stage "${stage}" (exit ${exitCode})${output ? `:\n${output}` : ""}`);
|
|
54
70
|
this.name = "DisplayStackError";
|
|
55
71
|
this.exitCode = exitCode;
|
|
@@ -125,15 +141,41 @@ export function buildDisplayStackScript(options: EnsureDisplayStackOptions = {})
|
|
|
125
141
|
// flock -w bounds the wait so a wedged holder can't deadlock the caller; the
|
|
126
142
|
// up-script itself ALSO takes the same lock (belt + braces) so this works even
|
|
127
143
|
// against an older image that predates the wrapper.
|
|
128
|
-
|
|
144
|
+
//
|
|
145
|
+
// PAINTABLE-FRAME GATE (the completion criterion): the up-script's readiness gates
|
|
146
|
+
// only assert that Xvfb answers xdpyinfo and that x11vnc:5900 + websockify:PORT are
|
|
147
|
+
// LISTENING — NOT that the display actually PAINTS. On a stone-cold gVisor box (the
|
|
148
|
+
// machine→sandbox swap-recovery turn always hits one), Xvfb can answer and the VNC
|
|
149
|
+
// ports can bind seconds BEFORE the root window / XFCE compositor is drawable, so a
|
|
150
|
+
// scrot right after the `OPENGENI_DESKTOP_UP` marker yields a ZERO-BYTE frame — which
|
|
151
|
+
// is exactly the empty screenshot that 400s the model and blanks the human viewer.
|
|
152
|
+
// We therefore chain a real scrot probe as the completion gate: after the up-script
|
|
153
|
+
// reports success, poll scrot until it produces a NON-EMPTY frame (bounded ~30s), and
|
|
154
|
+
// only THEN let the command exit 0. If it never paints we exit 14 so the caller sees a
|
|
155
|
+
// typed DisplayStackError("paint") — an HONEST failure the worker can degrade + log,
|
|
156
|
+
// rather than a false "up" that hands the model an empty image. `-ac` on Xvfb disables
|
|
157
|
+
// access control so this root-side scrot reaches :0. Runs on a pre-check hit too (cheap
|
|
158
|
+
// — an already-up display paints on the first probe). Lives in the runtime-built script
|
|
159
|
+
// (not the baked image up-script) so it ships with the worker/api, no image rebuild.
|
|
160
|
+
const bringUp =
|
|
129
161
|
`if nc -z 127.0.0.1 ${port} >/dev/null 2>&1 && nc -z 127.0.0.1 5900 >/dev/null 2>&1; then ` +
|
|
130
162
|
`echo "OPENGENI_DESKTOP_UP port=${port} geometry=${geometry.width}x${geometry.height} dpi=${geometry.dpi} (precheck)"; ` +
|
|
131
163
|
`else ` +
|
|
132
164
|
`mkdir -p /tmp/opengeni-desktop && ` +
|
|
133
165
|
`flock -w 45 /tmp/opengeni-desktop/up.outer.lock ` +
|
|
134
166
|
`env ${env} opengeni-desktop-up; ` +
|
|
135
|
-
`fi
|
|
136
|
-
|
|
167
|
+
`fi`;
|
|
168
|
+
const paintProbe =
|
|
169
|
+
`p=/tmp/opengeni-desktop/paint-probe.png; ` +
|
|
170
|
+
`for i in $(seq 1 ${PAINT_PROBE_ATTEMPTS}); do ` +
|
|
171
|
+
`if DISPLAY=:0 scrot -o "$p" >/dev/null 2>&1 && [ -s "$p" ]; then rm -f "$p"; break; fi; ` +
|
|
172
|
+
`rm -f "$p"; ` +
|
|
173
|
+
// NOTE: NOT_PAINTING goes to STDOUT (not stderr): Modal is execCommand-only, so the
|
|
174
|
+
// caller infers the outcome by string-matching the output — stdout is always captured.
|
|
175
|
+
`if [ "$i" = "${PAINT_PROBE_ATTEMPTS}" ]; then echo "OPENGENI_DESKTOP_NOT_PAINTING scrot empty after warmup"; exit 14; fi; ` +
|
|
176
|
+
`sleep ${PAINT_PROBE_INTERVAL_S}; ` +
|
|
177
|
+
`done`;
|
|
178
|
+
return `mkdir -p /tmp/opengeni-desktop; { ${bringUp} ; } && { ${paintProbe} ; }`;
|
|
137
179
|
}
|
|
138
180
|
|
|
139
181
|
function execResultOutput(result: ExecResultLike | string): string {
|
|
@@ -157,6 +199,13 @@ function execResultExitCode(result: ExecResultLike | string): number | null {
|
|
|
157
199
|
// bare string), we infer success from the OPENGENI_DESKTOP_UP marker and infer
|
|
158
200
|
// the failing stage from the stage-failure message the script prints to stderr.
|
|
159
201
|
function inferExitFromOutput(output: string): number {
|
|
202
|
+
// Check the PAINTABLE-FRAME failure FIRST: on that path the up-script already
|
|
203
|
+
// printed OPENGENI_DESKTOP_UP (bring-up succeeded) and THEN the paint gate failed,
|
|
204
|
+
// so both markers are present — the NOT_PAINTING one is the authoritative outcome.
|
|
205
|
+
// (Modal is execCommand-only, so this string-inference path is the live one.)
|
|
206
|
+
if (/OPENGENI_DESKTOP_NOT_PAINTING/.test(output)) {
|
|
207
|
+
return 14;
|
|
208
|
+
}
|
|
160
209
|
if (/OPENGENI_DESKTOP_UP\b/.test(output)) {
|
|
161
210
|
return 0;
|
|
162
211
|
}
|
package/src/sandbox-computer.ts
CHANGED
|
@@ -67,10 +67,16 @@ const SCROLL_NOTCH_PIXELS = 100;
|
|
|
67
67
|
const SCROLL_MAX_CLICKS = 15;
|
|
68
68
|
// screenshot() never hands the model an empty image_url (the SDK turns "" into
|
|
69
69
|
// `image_url: ''`, which the model API 400s). A cold/not-yet-painting :0 can yield
|
|
70
|
-
//
|
|
71
|
-
//
|
|
72
|
-
|
|
73
|
-
|
|
70
|
+
// zero-byte frames for the WHOLE warm-up window of a freshly cold-booted box — Xvfb
|
|
71
|
+
// + XFCE + dbus + font-cache under gVisor routinely take 20s+, and the recovery path
|
|
72
|
+
// after a machine→sandbox swap ALWAYS hits a stone-cold Modal box on its first turn.
|
|
73
|
+
// So we retry across a bounded WALL-CLOCK budget (not a tiny fixed attempt count) with
|
|
74
|
+
// a short pause between tries, so that first post-cold / post-swap screenshot self-heals
|
|
75
|
+
// as the display warms — then FAIL LOUD once the budget is genuinely spent (a display
|
|
76
|
+
// that is dead, not merely warming). ~800ms of retries (the prior 3×400ms) was far too
|
|
77
|
+
// short to ride out a cold gVisor XFCE boot, so the turn failed loud on a transient.
|
|
78
|
+
const SCREENSHOT_WARMUP_BUDGET_MS = 30_000;
|
|
79
|
+
const SCREENSHOT_RETRY_DELAY_MS = 750;
|
|
74
80
|
|
|
75
81
|
export type SandboxComputerOptions = {
|
|
76
82
|
display?: string; // ":0"
|
|
@@ -79,6 +85,11 @@ export type SandboxComputerOptions = {
|
|
|
79
85
|
typeDelayMs?: number; // xdotool type --delay (default 12ms)
|
|
80
86
|
readOnly?: boolean; // when true, every WRITE action throws ComputerReadOnlyError
|
|
81
87
|
screenshotTmpDir?: string; // "/tmp"
|
|
88
|
+
// How long screenshot() keeps retrying an empty (still-warming) frame before it
|
|
89
|
+
// FAILS LOUD, and the pause between tries. Defaults to the cold-boot warm-up budget;
|
|
90
|
+
// exposed mainly so tests can shrink it (a real caller wants the full budget).
|
|
91
|
+
screenshotWarmupBudgetMs?: number;
|
|
92
|
+
screenshotRetryDelayMs?: number;
|
|
82
93
|
};
|
|
83
94
|
|
|
84
95
|
// X keysym map for keypress(): model key names → xdotool keysyms.
|
|
@@ -144,6 +155,8 @@ export class SandboxComputer implements Computer {
|
|
|
144
155
|
private readonly typeDelayMs: number;
|
|
145
156
|
private readonly readOnly: boolean;
|
|
146
157
|
private readonly tmp: string;
|
|
158
|
+
private readonly screenshotWarmupBudgetMs: number;
|
|
159
|
+
private readonly screenshotRetryDelayMs: number;
|
|
147
160
|
|
|
148
161
|
constructor(session: SandboxSessionLike, opts: SandboxComputerOptions = {}) {
|
|
149
162
|
this.session = session as unknown as ComputerSession;
|
|
@@ -155,6 +168,8 @@ export class SandboxComputer implements Computer {
|
|
|
155
168
|
this.typeDelayMs = opts.typeDelayMs ?? 12;
|
|
156
169
|
this.readOnly = opts.readOnly ?? false;
|
|
157
170
|
this.tmp = opts.screenshotTmpDir ?? "/tmp";
|
|
171
|
+
this.screenshotWarmupBudgetMs = opts.screenshotWarmupBudgetMs ?? SCREENSHOT_WARMUP_BUDGET_MS;
|
|
172
|
+
this.screenshotRetryDelayMs = opts.screenshotRetryDelayMs ?? SCREENSHOT_RETRY_DELAY_MS;
|
|
158
173
|
}
|
|
159
174
|
|
|
160
175
|
/** Rebind to a freshly resumed-by-id session after a box rollover / re-establish. */
|
|
@@ -231,17 +246,23 @@ export class SandboxComputer implements Computer {
|
|
|
231
246
|
// but momentarily not painting (XFCE/dbus still warming) recovers without
|
|
232
247
|
// failing the turn.
|
|
233
248
|
let lastError: unknown;
|
|
234
|
-
|
|
249
|
+
const deadline = Date.now() + this.screenshotWarmupBudgetMs;
|
|
250
|
+
let attempt = 0;
|
|
251
|
+
// Retry across a WALL-CLOCK budget (not a fixed count): a stone-cold box on the
|
|
252
|
+
// first post-swap / post-cold turn can take 20s+ to paint, and a zero-byte frame
|
|
253
|
+
// is a KNOWN transient during that warm-up — not a reason to fail the turn.
|
|
254
|
+
while (true) {
|
|
235
255
|
if (attempt > 0) {
|
|
236
|
-
await new Promise((r) => setTimeout(r,
|
|
256
|
+
await new Promise((r) => setTimeout(r, this.screenshotRetryDelayMs));
|
|
237
257
|
}
|
|
258
|
+
attempt++;
|
|
238
259
|
const f = `${this.tmp}/og-shot-${Date.now()}-${Math.random().toString(36).slice(2)}.png`;
|
|
239
260
|
try {
|
|
240
261
|
await this.x(`scrot --pointer --overwrite ${f}`);
|
|
241
262
|
const bytes = await this.readScreenshotBytes(f);
|
|
242
263
|
if (bytes.length === 0) {
|
|
243
264
|
// A cold/not-yet-painting :0 yields a zero-byte frame. Retry rather than
|
|
244
|
-
// hand the model an empty image_url; throw
|
|
265
|
+
// hand the model an empty image_url; throw once the budget is spent.
|
|
245
266
|
throw new ComputerUnavailableError("scrot produced an empty screenshot (display not up?)");
|
|
246
267
|
}
|
|
247
268
|
return Buffer.from(bytes).toString("base64");
|
|
@@ -252,9 +273,15 @@ export class SandboxComputer implements Computer {
|
|
|
252
273
|
// screenshot result.
|
|
253
274
|
await this.x(`rm -f ${f}`).catch(() => undefined);
|
|
254
275
|
}
|
|
276
|
+
// Stop once the warm-up budget is spent — the NEXT sleep would push us past it.
|
|
277
|
+
if (Date.now() + this.screenshotRetryDelayMs >= deadline) {
|
|
278
|
+
break;
|
|
279
|
+
}
|
|
255
280
|
}
|
|
256
|
-
// Exhausted
|
|
257
|
-
// returning "" here would surface to the model as an invalid empty
|
|
281
|
+
// Exhausted the warm-up budget: FAIL LOUD. A clear throw is the only acceptable
|
|
282
|
+
// outcome — returning "" here would surface to the model as an invalid empty
|
|
283
|
+
// image_url. Reaching here means the display was still dead after ~30s, not merely
|
|
284
|
+
// warming, so a hard action failure is correct.
|
|
258
285
|
if (lastError instanceof Error) {
|
|
259
286
|
throw lastError;
|
|
260
287
|
}
|
|
@@ -761,6 +788,25 @@ export function computerFunctionTools(
|
|
|
761
788
|
|
|
762
789
|
// ── The capability (the SDK seam) ────────────────────────────────────────────
|
|
763
790
|
|
|
791
|
+
/**
|
|
792
|
+
* EXPLICIT tool-transport selection, decided by the caller that knows the
|
|
793
|
+
* provider's true wire identity (the worker's model resolution — see agent-turn.ts),
|
|
794
|
+
* NOT inferred from the bound model instance's constructor name. This is the
|
|
795
|
+
* HARDENING seam: `supportsStructuredToolOutputTransport` string-sniffs the
|
|
796
|
+
* constructor for "ChatCompletions", which a wrapped / proxied / minified model
|
|
797
|
+
* instance would defeat — silently handing a chat-completions provider the HOSTED
|
|
798
|
+
* `computer_use_preview` tool it 400s on every turn. When `toolMode` is set, tools()
|
|
799
|
+
* OBEYS it and never consults the sniff:
|
|
800
|
+
* • "hosted" → the single hosted `computer_use_preview` tool (Responses backends).
|
|
801
|
+
* • "function-image" → the FUNCTION `computer_*` tools with screenshots delivered as a
|
|
802
|
+
* structured `{type:'image'}` output (the codex/ChatGPT backend,
|
|
803
|
+
* which rejects hosted tool types but SEES structured image results).
|
|
804
|
+
* • "function-text" → the FUNCTION tools with screenshots rendered as a text
|
|
805
|
+
* `data:…;base64` URL (chat-completions providers, which can't read
|
|
806
|
+
* structured image tool results).
|
|
807
|
+
*/
|
|
808
|
+
export type ComputerToolMode = "hosted" | "function-image" | "function-text";
|
|
809
|
+
|
|
764
810
|
export type ComputerUseArgs = {
|
|
765
811
|
dimensions?: [number, number];
|
|
766
812
|
readOnly?: boolean;
|
|
@@ -771,8 +817,14 @@ export type ComputerUseArgs = {
|
|
|
771
817
|
// `input_image` content item inside the function_call_output) instead of the text
|
|
772
818
|
// data-URL string. Only the codex/ChatGPT backend can read structured image tool
|
|
773
819
|
// results; chat-completions providers cannot, so this stays OFF (text rendering)
|
|
774
|
-
// by default and is turned on only on the codex path (see index.ts).
|
|
820
|
+
// by default and is turned on only on the codex path (see index.ts). Ignored when
|
|
821
|
+
// `toolMode` is set (the mode carries its own image-delivery choice).
|
|
775
822
|
imageFunctionResults?: boolean;
|
|
823
|
+
// EXPLICIT transport selection (see {@link ComputerToolMode}). When present, tools()
|
|
824
|
+
// obeys it directly — the constructor-name sniff is NOT consulted. When ABSENT, the
|
|
825
|
+
// legacy sniff behaviour is preserved byte-for-byte (back-compat for any embedder
|
|
826
|
+
// that constructs the capability without threading a mode).
|
|
827
|
+
toolMode?: ComputerToolMode;
|
|
776
828
|
};
|
|
777
829
|
|
|
778
830
|
export function computerUse(args: ComputerUseArgs = {}): ComputerUseCapability {
|
|
@@ -820,16 +872,36 @@ export class ComputerUseCapability extends Capability {
|
|
|
820
872
|
// The SDK base exposes the bound runAs as a protected field.
|
|
821
873
|
...(typeof this._runAs === "string" ? { runAs: this._runAs } : {}),
|
|
822
874
|
});
|
|
823
|
-
//
|
|
824
|
-
//
|
|
875
|
+
// HARDENING: when the caller declares an EXPLICIT toolMode, obey it and NEVER
|
|
876
|
+
// consult `supportsStructuredToolOutputTransport` — tool selection must not
|
|
877
|
+
// depend on the model instance's constructor name (a wrapped/proxied/minified
|
|
878
|
+
// instance would defeat the "ChatCompletions" string-sniff and silently hand a
|
|
879
|
+
// chat-completions provider the hosted tool it 400s on). The mode is decided by
|
|
880
|
+
// the worker, where provider identity is authoritative (see agent-turn.ts).
|
|
881
|
+
switch (this.args.toolMode) {
|
|
882
|
+
case "hosted":
|
|
883
|
+
return [this.hostedComputerTool(computer)];
|
|
884
|
+
case "function-image":
|
|
885
|
+
return computerFunctionTools(computer, this.args.readOnly ?? false, this.args.needsApproval, true);
|
|
886
|
+
case "function-text":
|
|
887
|
+
return computerFunctionTools(computer, this.args.readOnly ?? false, this.args.needsApproval, false);
|
|
888
|
+
case undefined:
|
|
889
|
+
break; // fall through to the legacy sniff (back-compat), preserved byte-for-byte
|
|
890
|
+
}
|
|
891
|
+
// Legacy (no toolMode): structured transport keeps the HOSTED computer tool
|
|
892
|
+
// (unchanged); the codex / text backend gets the FUNCTION tools it can call.
|
|
825
893
|
if (supportsStructuredToolOutputTransport(this._modelInstance)) {
|
|
826
|
-
return [
|
|
827
|
-
computerTool({
|
|
828
|
-
computer,
|
|
829
|
-
...(this.args.needsApproval !== undefined ? { needsApproval: this.args.needsApproval as never } : {}),
|
|
830
|
-
}) as unknown as Tool<unknown>,
|
|
831
|
-
];
|
|
894
|
+
return [this.hostedComputerTool(computer)];
|
|
832
895
|
}
|
|
833
896
|
return computerFunctionTools(computer, this.args.readOnly ?? false, this.args.needsApproval, this.args.imageFunctionResults ?? false);
|
|
834
897
|
}
|
|
898
|
+
|
|
899
|
+
/** The single HOSTED `computer_use_preview` tool bound to `computer` — identical
|
|
900
|
+
* construction for the explicit "hosted" mode and the legacy structured-sniff path. */
|
|
901
|
+
private hostedComputerTool(computer: Computer): Tool<unknown> {
|
|
902
|
+
return computerTool({
|
|
903
|
+
computer,
|
|
904
|
+
...(this.args.needsApproval !== undefined ? { needsApproval: this.args.needsApproval as never } : {}),
|
|
905
|
+
}) as unknown as Tool<unknown>;
|
|
906
|
+
}
|
|
835
907
|
}
|