@opengeni/runtime 0.2.2 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-2PO56VAL.js → chunk-KNW7AMQB.js} +11 -4
- package/dist/chunk-KNW7AMQB.js.map +1 -0
- package/dist/index.d.ts +89 -177
- package/dist/index.js +346 -156
- package/dist/index.js.map +1 -1
- package/dist/sandbox/index.d.ts +6 -4
- package/dist/sandbox/index.js +1 -1
- package/package.json +3 -3
- package/src/context-compaction.ts +217 -348
- package/src/image-history.ts +149 -0
- package/src/index.ts +129 -34
- package/src/sandbox/display-stack.ts +61 -12
- package/src/sandbox-computer.ts +36 -9
- package/dist/chunk-2PO56VAL.js.map +0 -1
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
import type { AgentInputItem } from "@openai/agents";
|
|
2
|
+
|
|
3
|
+
/**
 * Text substituted for a screenshot image that has been elided from the
 * model input. Used as the default `placeholder` by the elision routine.
 */
export const SCREENSHOT_OMITTED_PLACEHOLDER = "[screenshot omitted: an older desktop frame — the full image remains in the session event log]";

/**
 * Detects a base64 `data:image/...` URL. The pattern is unanchored and
 * case-insensitive, so it matches a data URL appearing anywhere in a string.
 * It carries no `g`/`y` flag, so `.test()` is stateless across calls.
 */
const DATA_IMAGE_BASE64_PATTERN = /data:image\/[a-z0-9.+-]+;base64,[a-z0-9+/=_-]+/i;
|
|
7
|
+
|
|
8
|
+
/** One step into a nested value: an object key or an array index. */
type PathSegment = string | number;

/**
 * A located image inside the history: where it lives (`path`, starting from
 * the items array) and the value that should be written there if the image
 * is elided.
 */
type ImageOccurrence = {
  path: Array<PathSegment>;
  replacement: unknown;
};
|
|
14
|
+
|
|
15
|
+
/** Outcome of a screenshot-elision pass over a history. */
export type ElideStaleScreenshotsResult<T> = {
  /** The history to send to the model (a copy; never the caller's array). */
  items: Array<T>;
  /** Total number of screenshot images found in the history. */
  imageCount: number;
  /** How many of those images were replaced by the placeholder. */
  elidedCount: number;
};

/** Tuning knobs for a screenshot-elision pass. */
export type ElideStaleScreenshotsOptions = {
  /** Number of most-recent images to leave intact. */
  keepLast?: number;
  /** Text written in place of each elided image. */
  placeholder?: string;
};
|
|
25
|
+
|
|
26
|
+
/**
 * Replace all but the most recent screenshot images in a history with a short
 * text placeholder, so old frames stop consuming model-input space.
 *
 * Images are discovered in item order (and, within an item, in traversal
 * order); the LAST `keepLast` occurrences are left untouched and every earlier
 * one is overwritten with its recorded replacement.
 *
 * The input array and its items are never mutated: when nothing needs eliding
 * a shallow copy of the array is returned, otherwise a `structuredClone` of
 * the whole history is edited and returned.
 *
 * @param items   History items to scan.
 * @param options `keepLast` (default 3; negative values clamp to 0, fractions
 *                are floored, `NaN` falls back to the default) and
 *                `placeholder` (default `SCREENSHOT_OMITTED_PLACEHOLDER`).
 * @returns The (possibly rewritten) items plus the number of images found and
 *          the number elided.
 */
export function elideStaleScreenshotImages<T extends AgentInputItem>(
  items: readonly T[],
  options: ElideStaleScreenshotsOptions = {},
): ElideStaleScreenshotsResult<T> {
  const requestedKeep = options.keepLast ?? 3;
  // NaN would poison every comparison below (elidedCount would come out as NaN,
  // skipping the fast path and reporting a NaN count), so treat it as "unset".
  const keepLast = Number.isNaN(requestedKeep) ? 3 : Math.max(0, Math.floor(requestedKeep));
  const placeholder = options.placeholder ?? SCREENSHOT_OMITTED_PLACEHOLDER;

  const occurrences: ImageOccurrence[] = [];
  for (let i = 0; i < items.length; i += 1) {
    collectItemImageOccurrences(items[i], [i], placeholder, occurrences);
  }

  const elidedCount = Math.max(0, occurrences.length - keepLast);
  if (elidedCount === 0) {
    // Nothing to rewrite: hand back a fresh array without paying for a deep clone.
    return { items: items.slice(), imageCount: occurrences.length, elidedCount: 0 };
  }

  // Paths were recorded against `items`; the clone has the same shape, so the
  // same paths address the same positions in it.
  const cloned = structuredClone(items) as T[];
  for (const occurrence of occurrences.slice(0, elidedCount)) {
    setPath(cloned, occurrence.path, occurrence.replacement);
  }
  return { items: cloned, imageCount: occurrences.length, elidedCount };
}
|
|
48
|
+
|
|
49
|
+
/**
 * Record every elidable image held by a single history item.
 *
 * Non-object items and user/system messages are ignored. Computer-call
 * results are inspected for a screenshot payload; function-call results have
 * their `output` searched recursively. Any other item type contributes nothing.
 *
 * @param item        The history item to inspect.
 * @param path        Location of `item` relative to the history root.
 * @param placeholder Text to substitute for an elided image.
 * @param out         Accumulator that found occurrences are appended to.
 */
function collectItemImageOccurrences(
  item: unknown,
  path: PathSegment[],
  placeholder: string,
  out: ImageOccurrence[],
): void {
  if (!isRecord(item)) {
    return;
  }

  const kind = item.type;
  const isUserOrSystemMessage = kind === "message" && (item.role === "user" || item.role === "system");
  if (isUserOrSystemMessage) {
    return;
  }

  switch (kind) {
    case "computer_call_result":
    case "computer_call_output":
      collectComputerOutputImages(item, path, placeholder, out);
      break;
    case "function_call_result":
    case "function_call_output":
      collectToolResultImages(item.output, path.concat("output"), placeholder, out);
      break;
    default:
      break;
  }
}
|
|
69
|
+
|
|
70
|
+
/**
 * Record the screenshot image carried by a computer-call result, if any.
 *
 * Only an `output` object typed `computer_screenshot` is considered. The first
 * of `data`, `image_url`, `imageUrl` that holds an image data URL is recorded
 * (at most one occurrence per item); its replacement is the placeholder string.
 *
 * @param item        The computer-call result item.
 * @param path        Location of `item` relative to the history root.
 * @param placeholder Text to substitute for the image.
 * @param out         Accumulator that the occurrence is appended to.
 */
function collectComputerOutputImages(
  item: Record<string, unknown>,
  path: PathSegment[],
  placeholder: string,
  out: ImageOccurrence[],
): void {
  const screenshot = item.output;
  if (!isRecord(screenshot)) {
    return;
  }
  if (screenshot.type !== "computer_screenshot") {
    return;
  }

  const imageKey = ["data", "image_url", "imageUrl"].find((candidate) => isImageDataUrl(screenshot[candidate]));
  if (imageKey !== undefined) {
    out.push({ path: [...path, "output", imageKey], replacement: placeholder });
  }
}
|
|
87
|
+
|
|
88
|
+
/**
 * Recursively search a tool result for image data URLs.
 *
 * - A string containing an image data URL is recorded with the placeholder
 *   string as its replacement.
 * - Arrays are searched element by element.
 * - An object typed `input_image` whose `image`, `imageUrl` or `image_url`
 *   holds an image data URL is recorded as a whole, to be replaced by an
 *   `input_text` part carrying the placeholder.
 * - Any other object (including an `input_image` without a data URL) has its
 *   `content`, `text` and `output` properties searched when present.
 *
 * @param value       The value to search.
 * @param path        Location of `value` relative to the history root.
 * @param placeholder Text to substitute for an elided image.
 * @param out         Accumulator that found occurrences are appended to.
 */
function collectToolResultImages(
  value: unknown,
  path: PathSegment[],
  placeholder: string,
  out: ImageOccurrence[],
): void {
  if (typeof value === "string") {
    if (isImageDataUrl(value)) {
      out.push({ path, replacement: placeholder });
    }
    return;
  }

  if (Array.isArray(value)) {
    for (const [index, entry] of value.entries()) {
      collectToolResultImages(entry, [...path, index], placeholder, out);
    }
    return;
  }

  if (!isRecord(value)) {
    return;
  }

  if (value.type === "input_image") {
    const holdsDataUrl = ["image", "imageUrl", "image_url"].some((field) => isImageDataUrl(value[field]));
    if (holdsDataUrl) {
      out.push({ path, replacement: { type: "input_text", text: placeholder } });
      return;
    }
  }

  const nestedKeys = ["content", "text", "output"].filter((field) => field in value);
  for (const field of nestedKeys) {
    collectToolResultImages(value[field], [...path, field], placeholder, out);
  }
}
|
|
123
|
+
|
|
124
|
+
/** True when `value` is a string that contains a base64 image data URL. */
function isImageDataUrl(value: unknown): value is string {
  if (typeof value !== "string") {
    return false;
  }
  return DATA_IMAGE_BASE64_PATTERN.test(value);
}
|
|
127
|
+
|
|
128
|
+
/** True for any non-null object that is not an array. */
function isRecord(value: unknown): value is Record<string, unknown> {
  if (value === null || typeof value !== "object") {
    return false;
  }
  return !Array.isArray(value);
}
|
|
131
|
+
|
|
132
|
+
/**
 * Write `value` at `path` inside `root`, mutating `root` in place.
 *
 * Every segment but the last is followed to reach the containing object or
 * array; the last segment is the key/index that is assigned. An empty path is
 * a no-op. The path is assumed to exist — a missing intermediate container
 * surfaces as the usual TypeError from the property access.
 */
function setPath(root: unknown, path: PathSegment[], value: unknown): void {
  if (path.length === 0) {
    return;
  }

  const parentSegments = path.slice(0, -1);
  const leaf = path[path.length - 1]!;

  let container: unknown = root;
  for (const segment of parentSegments) {
    container = (container as Record<PathSegment, unknown>)[segment];
  }

  (container as Record<PathSegment, unknown>)[leaf] = value;
}
|
package/src/index.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import type { ConfiguredModel, ContextCompactionMode, ModelProviderApi, ResolvedModelProvider, Settings } from "@opengeni/config";
|
|
2
|
-
import { AGENT_INSTRUCTIONS_CORE_PLACEHOLDER, collectSandboxEnvironment, contextServerCompactThreshold, firstPartyMcpBaseUrl, parseExposedPorts, resolveContextCompactionMode, resolveModelProvider, sandboxLifecycleHookIds } from "@opengeni/config";
|
|
2
|
+
import { AGENT_INSTRUCTIONS_CORE_PLACEHOLDER, collectSandboxEnvironment, contextInputBudgetTokens, contextServerCompactThreshold, firstPartyMcpBaseUrl, parseExposedPorts, resolveContextCompactionMode, resolveModelProvider, sandboxLifecycleHookIds } from "@opengeni/config";
|
|
3
3
|
import { CAPABILITY_DESCRIPTORS, isClearedRunStateBlob, signDelegatedAccessToken, type Permission, type ReasoningEffort, type ResourceRef, type SessionEventType, type ToolRef } from "@opengeni/contracts";
|
|
4
4
|
import {
|
|
5
5
|
Agent,
|
|
@@ -82,8 +82,17 @@ import { dirname, isAbsolute, join, posix as posixPath, relative } from "node:pa
|
|
|
82
82
|
import { fileURLToPath } from "node:url";
|
|
83
83
|
|
|
84
84
|
import { computerCallNormalizingFetch, normalizeComputerCallActions, sanitizeHistoryItemsForModel } from "./history-sanitizer";
|
|
85
|
+
import { elideStaleScreenshotImages } from "./image-history";
|
|
85
86
|
import { installCodexToolSearch } from "./codex-tool-search";
|
|
86
|
-
import {
|
|
87
|
+
import {
|
|
88
|
+
CompactionNeededError,
|
|
89
|
+
SUMMARY_BUFFER_TOKENS,
|
|
90
|
+
clientCompactionThresholdTokens,
|
|
91
|
+
enforceInputBudget,
|
|
92
|
+
estimateItemTokens,
|
|
93
|
+
estimateTokens,
|
|
94
|
+
renderCompactionPromptInputForChat,
|
|
95
|
+
} from "./context-compaction";
|
|
87
96
|
import {
|
|
88
97
|
createSandboxClient,
|
|
89
98
|
deserializeSandboxSessionStateEnvelope,
|
|
@@ -134,22 +143,34 @@ export type { HistoryItem } from "./history-sanitizer";
|
|
|
134
143
|
export { OpenAIChatCompletionsModel, OpenAIResponsesModel } from "@openai/agents";
|
|
135
144
|
|
|
136
145
|
export {
|
|
137
|
-
|
|
146
|
+
CompactionNeededError,
|
|
147
|
+
buildCompactionPromptInput,
|
|
148
|
+
buildCompactionReplacementHistory,
|
|
149
|
+
clientCompactionThresholdTokens,
|
|
150
|
+
decideClientCompaction,
|
|
138
151
|
enforceInputBudget,
|
|
139
152
|
buildSummaryItem,
|
|
140
|
-
|
|
153
|
+
findCompactionNeededError,
|
|
141
154
|
isCompactionSummary,
|
|
142
155
|
isUserMessage,
|
|
143
156
|
findKeepBoundary,
|
|
144
157
|
estimateTokens,
|
|
145
158
|
estimateItemTokens,
|
|
146
|
-
|
|
147
|
-
renderPrefixTranscript,
|
|
159
|
+
renderCompactionPromptInputForChat,
|
|
148
160
|
COMPACTION_SUMMARY_MARKER,
|
|
161
|
+
COMPACTION_PROMPT,
|
|
162
|
+
COMPACT_USER_MESSAGE_MAX_TOKENS,
|
|
163
|
+
CLIENT_COMPACTION_TRIGGER_FRACTION,
|
|
164
|
+
SUMMARY_BUFFER_TOKENS,
|
|
149
165
|
SUMMARY_PREFIX,
|
|
150
|
-
|
|
166
|
+
USER_MESSAGE_TRUNCATION_MARKER,
|
|
151
167
|
} from "./context-compaction";
|
|
152
|
-
export type {
|
|
168
|
+
export type { ClientCompactionDecision, CompactionItem } from "./context-compaction";
|
|
169
|
+
export {
|
|
170
|
+
elideStaleScreenshotImages,
|
|
171
|
+
SCREENSHOT_OMITTED_PLACEHOLDER,
|
|
172
|
+
} from "./image-history";
|
|
173
|
+
export type { ElideStaleScreenshotsOptions, ElideStaleScreenshotsResult } from "./image-history";
|
|
153
174
|
|
|
154
175
|
ensureReadableStreamFrom();
|
|
155
176
|
|
|
@@ -500,10 +521,10 @@ export function configureOpenAI(settings: Settings): void {
|
|
|
500
521
|
|
|
501
522
|
/**
|
|
502
523
|
* Run the compaction summarizer as one plain, tool-less, non-streaming model
|
|
503
|
-
* call against the resolved provider. `
|
|
504
|
-
*
|
|
524
|
+
* call against the resolved provider. `input` is the active history plus
|
|
525
|
+
* Codex's checkpoint prompt. Returns the trimmed summary text, or null on any
|
|
505
526
|
* failure (the caller treats a failed summarize as "skip compaction this turn"
|
|
506
|
-
*
|
|
527
|
+
* - never fatal). The call deliberately does NOT request reasoning encryption,
|
|
507
528
|
* tools, or server-side compaction; it is a self-contained summarize.
|
|
508
529
|
*
|
|
509
530
|
* Provider-aware: the summary always runs on the SAME provider that serves the
|
|
@@ -517,22 +538,19 @@ export function configureOpenAI(settings: Settings): void {
|
|
|
517
538
|
*/
|
|
518
539
|
export async function summarizeForCompaction(
|
|
519
540
|
settings: Settings,
|
|
520
|
-
|
|
541
|
+
input: Array<Record<string, unknown>>,
|
|
521
542
|
options: { client?: OpenAI; api?: ModelProviderApi; maxOutputTokens?: number; model?: string } = {},
|
|
522
543
|
): Promise<string | null> {
|
|
523
544
|
const client = options.client ?? buildOpenAIClientFromSettings(settings);
|
|
524
545
|
const api = options.api ?? "responses";
|
|
525
546
|
const model = options.model ?? settings.openaiModel;
|
|
526
|
-
const maxTokens = options.maxOutputTokens ??
|
|
547
|
+
const maxTokens = options.maxOutputTokens ?? SUMMARY_BUFFER_TOKENS;
|
|
527
548
|
try {
|
|
528
549
|
if (api === "chat") {
|
|
529
550
|
const completion = await client.chat.completions.create({
|
|
530
551
|
model,
|
|
531
552
|
max_tokens: maxTokens,
|
|
532
|
-
messages: [
|
|
533
|
-
{ role: "system", content: messages.system },
|
|
534
|
-
{ role: "user", content: messages.user },
|
|
535
|
-
],
|
|
553
|
+
messages: [{ role: "user", content: renderCompactionPromptInputForChat(input) }],
|
|
536
554
|
} as any);
|
|
537
555
|
const text = (completion as { choices?: Array<{ message?: { content?: unknown } }> }).choices?.[0]?.message?.content;
|
|
538
556
|
const trimmed = typeof text === "string" ? text.trim() : "";
|
|
@@ -545,10 +563,7 @@ export async function summarizeForCompaction(
|
|
|
545
563
|
// built-in path (api "responses"), so gate it on the built-in provider.
|
|
546
564
|
...(settings.openaiProvider === "azure" ? {} : { store: false }),
|
|
547
565
|
max_output_tokens: maxTokens,
|
|
548
|
-
input
|
|
549
|
-
{ role: "system", content: messages.system },
|
|
550
|
-
{ role: "user", content: messages.user },
|
|
551
|
-
],
|
|
566
|
+
input,
|
|
552
567
|
} as any);
|
|
553
568
|
const text = extractResponseOutputText(response);
|
|
554
569
|
const trimmed = text.trim();
|
|
@@ -1573,6 +1588,7 @@ export type RunAgentStreamOptions = {
|
|
|
1573
1588
|
sandboxClient?: unknown;
|
|
1574
1589
|
sandboxEnvironment?: Record<string, string>;
|
|
1575
1590
|
onRuntimeEvent?: (event: NormalizedRuntimeEvent) => Promise<void> | void;
|
|
1591
|
+
contextCompactionSignalTokens?: () => number | null | undefined;
|
|
1576
1592
|
// OWNERSHIP INVERSION (P1.2): an externally-owned, already-live sandbox
|
|
1577
1593
|
// session resolved by the per-turn resume-by-id path. When present,
|
|
1578
1594
|
// runAgentStream does NOT build (or resume, or discard) a client — it threads
|
|
@@ -1603,6 +1619,11 @@ export type RunAgentStreamOptions = {
|
|
|
1603
1619
|
callModelInputFilter?: CallModelInputFilter;
|
|
1604
1620
|
};
|
|
1605
1621
|
|
|
1622
|
+
/** Options for the per-call context-robustness model-input filter. */
export type ContextRobustnessFilterOptions = {
  /**
   * Supplies an externally reported token count for the compaction check.
   * A result that is not a positive number is treated as "nothing reported".
   */
  contextCompactionSignalTokens?: () => number | null | undefined;
  /** When true, the filter may throw to signal that compaction is needed. */
  throwOnCompactionNeeded?: boolean;
};
|
|
1626
|
+
|
|
1606
1627
|
// One-shot directive appended to the agent's system prompt on the genesis turn
|
|
1607
1628
|
// (see buildOpenGeniAgent's genesisTitleHint). Delivered through the
|
|
1608
1629
|
// authoritative instructions channel so the model reliably obeys; references
|
|
@@ -1656,6 +1677,59 @@ export const normalizeComputerCallsFilter: CallModelInputFilter = ({ modelData }
|
|
|
1656
1677
|
) as unknown as AgentInputItem[],
|
|
1657
1678
|
});
|
|
1658
1679
|
|
|
1680
|
+
/**
 * Build the per-call model-input filter that keeps the request within context
 * limits. On every model call the returned filter:
 *
 * 1. elides stale screenshot images (in every compaction mode);
 * 2. when an input budget applies (see `modelCallBudgetTokens`), trims the
 *    history through `enforceInputBudget` and logs how many items were dropped;
 * 3. when client compaction is active AND `options.throwOnCompactionNeeded` is
 *    set, compares a token signal against the client compaction threshold and
 *    throws if it is exceeded. The signal is the value returned by
 *    `options.contextCompactionSignalTokens` when that is a positive number
 *    (source `"provider"`), otherwise a local estimate of the filtered input
 *    (source `"estimate"`).
 *
 * @param settings Resolved runtime settings (mode, budget, threshold).
 * @param options  Compaction-signal hook and whether to throw on overflow.
 * @returns A filter producing `modelData` with the guarded `input`.
 * @throws CompactionNeededError from inside the returned filter, per step 3.
 */
export function contextRobustnessFilterForSettings(
  settings: Settings,
  options: ContextRobustnessFilterOptions = {},
): CallModelInputFilter {
  // Settings-derived values are resolved once, not on every model call.
  const inputBudgetTokens = modelCallBudgetTokens(settings);
  const clientCompactionMode = resolveContextCompactionMode(settings) === "client";
  const compactionThresholdTokens = clientCompactionThresholdTokens(settings);
  return ({ modelData }) => {
    const images = elideStaleScreenshotImages(modelData.input);
    if (images.elidedCount > 0) {
      // Derive the kept count from the result instead of repeating the elision
      // policy's default here, so the message stays true if that policy changes.
      const keptCount = images.imageCount - images.elidedCount;
      console.warn(
        `per-call image history policy elided ${images.elidedCount} older screenshot image(s), keeping the last ${keptCount} full image(s)`,
      );
    }
    let input = images.items;
    if (inputBudgetTokens !== undefined) {
      const guarded = enforceInputBudget(
        input as unknown as Array<Record<string, unknown>>,
        inputBudgetTokens,
      );
      if (guarded.trimmed) {
        console.warn(
          `per-call budget guard trimmed ${guarded.droppedCount} oldest history item(s) to fit input budget (${inputBudgetTokens} tokens); the over-budget model call was NOT sent`,
        );
        input = guarded.items as unknown as AgentInputItem[];
      }
    }
    if (clientCompactionMode && options.throwOnCompactionNeeded) {
      const reported = options.contextCompactionSignalTokens?.();
      // Null, undefined, zero and negative reports all mean "no usable signal".
      const hasReported = typeof reported === "number" && reported > 0;
      const signalTokens = hasReported
        ? reported
        : estimateTokens(input as unknown as Array<Record<string, unknown>>);
      if (signalTokens > compactionThresholdTokens) {
        throw new CompactionNeededError({
          signalTokens,
          thresholdTokens: compactionThresholdTokens,
          signalSource: hasReported ? "provider" : "estimate",
        });
      }
    }
    return { ...modelData, input };
  };
}
|
|
1724
|
+
|
|
1725
|
+
/**
 * Resolve the hard per-call input budget, in tokens.
 *
 * A budget applies only when context compaction resolves to `"client"` and
 * the configured budget is positive; in every other case `undefined` is
 * returned, meaning "do not trim".
 */
function modelCallBudgetTokens(settings: Settings): number | undefined {
  const isClientMode = resolveContextCompactionMode(settings) === "client";
  if (!isClientMode) {
    return undefined;
  }

  const configuredBudget = contextInputBudgetTokens(settings);
  if (configuredBudget > 0) {
    return configuredBudget;
  }
  return undefined;
}
|
|
1732
|
+
|
|
1659
1733
|
/**
|
|
1660
1734
|
* Compose a list of callModelInputFilters into one, applied left-to-right so
|
|
1661
1735
|
* each sees the prior filter's output.
|
|
@@ -1674,13 +1748,18 @@ function composeCallModelInputFilters(filters: CallModelInputFilter[]): CallMode
|
|
|
1674
1748
|
* The model-input filter applied before every model call. The computer_call
|
|
1675
1749
|
* action/actions normalizer is ALWAYS on (the Azure endpoint 400s without it);
|
|
1676
1750
|
* the provider-item-id strip is layered on top when the configured policy
|
|
1677
|
-
* selects it
|
|
1751
|
+
* selects it; the context-robustness guard then elides stale screenshots on
|
|
1752
|
+
* every mode and applies hard budget trimming only on the client-compaction path.
|
|
1678
1753
|
*/
|
|
1679
|
-
export function callModelInputFilterForSettings(
|
|
1754
|
+
export function callModelInputFilterForSettings(
  settings: Settings,
  options: ContextRobustnessFilterOptions = {},
): CallModelInputFilter | undefined {
  // Order matters: each filter sees the previous one's output.
  const chain: CallModelInputFilter[] = [
    // Always first: computer_call normalization.
    normalizeComputerCallsFilter,
    // Only when the configured policy asks for provider item ids to be stripped.
    ...(settings.openaiProviderItemIds === "strip" ? [stripProviderItemIdsFilter] : []),
    // Always last: screenshot elision / budget guard.
    contextRobustnessFilterForSettings(settings, options),
  ];
  return composeCallModelInputFilters(chain);
}
|
|
1686
1765
|
|
|
@@ -1759,7 +1838,15 @@ export async function runAgentStream(agent: Agent<any, any>, input: PreparedAgen
|
|
|
1759
1838
|
// through the client during this run (it is inert for the provided session).
|
|
1760
1839
|
const decoratedClient = withSandboxLifecycleHooks(resourceClient, ownedHooks, ownedHookContext);
|
|
1761
1840
|
const ownedFilter = composeCallModelInputFilters(
|
|
1762
|
-
[
|
|
1841
|
+
[
|
|
1842
|
+
callModelInputFilterForSettings(settings, {
|
|
1843
|
+
throwOnCompactionNeeded: Boolean(overrides.contextCompactionSignalTokens),
|
|
1844
|
+
...(overrides.contextCompactionSignalTokens
|
|
1845
|
+
? { contextCompactionSignalTokens: overrides.contextCompactionSignalTokens }
|
|
1846
|
+
: {}),
|
|
1847
|
+
}),
|
|
1848
|
+
overrides.callModelInputFilter,
|
|
1849
|
+
].filter(
|
|
1763
1850
|
(f): f is CallModelInputFilter => Boolean(f),
|
|
1764
1851
|
),
|
|
1765
1852
|
);
|
|
@@ -1806,23 +1893,31 @@ export async function runAgentStream(agent: Agent<any, any>, input: PreparedAgen
|
|
|
1806
1893
|
?? (prepared.serializedRunStateForSandbox && client
|
|
1807
1894
|
? await restoredSandboxSessionState(await RunState.fromString(agent, prepared.serializedRunStateForSandbox), client)
|
|
1808
1895
|
: undefined);
|
|
1809
|
-
//
|
|
1810
|
-
//
|
|
1811
|
-
//
|
|
1812
|
-
// model input
|
|
1896
|
+
// Apply the built-in per-call filters (computer-call normalization, optional
|
|
1897
|
+
// provider-id stripping, image/budget guard), then any per-turn filter
|
|
1898
|
+
// (genesis title directive). A callModelInputFilter only shapes the per-call
|
|
1899
|
+
// model input; the SDK persists filtered clones into its session view, while
|
|
1900
|
+
// OpenGeni's durable conversation truth is still reconciled explicitly below.
|
|
1813
1901
|
const callModelInputFilter = composeCallModelInputFilters(
|
|
1814
|
-
[
|
|
1902
|
+
[
|
|
1903
|
+
callModelInputFilterForSettings(settings, {
|
|
1904
|
+
throwOnCompactionNeeded: Boolean(overrides.contextCompactionSignalTokens),
|
|
1905
|
+
...(overrides.contextCompactionSignalTokens
|
|
1906
|
+
? { contextCompactionSignalTokens: overrides.contextCompactionSignalTokens }
|
|
1907
|
+
: {}),
|
|
1908
|
+
}),
|
|
1909
|
+
overrides.callModelInputFilter,
|
|
1910
|
+
].filter(
|
|
1815
1911
|
(f): f is CallModelInputFilter => Boolean(f),
|
|
1816
1912
|
),
|
|
1817
1913
|
);
|
|
1818
1914
|
const runOptions: Parameters<typeof run>[2] = {
|
|
1819
1915
|
stream: true,
|
|
1820
1916
|
maxTurns: settings.agentMaxModelCallsPerTurn,
|
|
1821
|
-
//
|
|
1822
|
-
//
|
|
1823
|
-
//
|
|
1824
|
-
//
|
|
1825
|
-
// id 'rs_…' not found"; with the ids gone the request is self-contained.
|
|
1917
|
+
// Built-in per-call guard chain: normalize computer calls, optionally strip
|
|
1918
|
+
// provider ids, elide stale screenshots in every mode, and trim to the input
|
|
1919
|
+
// budget on the client-compaction path. This runs for turn-start replay AND
|
|
1920
|
+
// every mid-turn follow-up.
|
|
1826
1921
|
callModelInputFilter,
|
|
1827
1922
|
};
|
|
1828
1923
|
void settings.disableOpenaiTracing;
|
|
@@ -24,11 +24,17 @@ import { DESKTOP_STREAM_PORT } from "@opengeni/contracts";
|
|
|
24
24
|
export { DESKTOP_STREAM_PORT };
|
|
25
25
|
export const STREAM_PORT = DESKTOP_STREAM_PORT;
|
|
26
26
|
|
|
27
|
-
// The whole-stack launch is bounded by the readiness gates inside the script
|
|
28
|
-
// (four loops of 50 * 0.1s = ~5s each, ~20s worst case) PLUS
|
|
29
|
-
//
|
|
30
|
-
//
|
|
31
|
-
|
|
27
|
+
// The whole-stack launch is bounded by the readiness gates inside the up-script
|
|
28
|
+
// (four loops of 50 * 0.1s = ~5s each, ~20s worst case) PLUS the PAINTABLE-FRAME
|
|
29
|
+
// gate we append (up to ~30s of scrot probing) PLUS first-boot XFCE/dbus + font-cache
|
|
30
|
+
// warm-up on a cold gVisor box. 90s gives headroom over the spike's observed ~5-10s
|
|
31
|
+
// warm path AND the cold-box paint warm-up without masking a genuine wedge.
|
|
32
|
+
export const DISPLAY_STACK_TIMEOUT_MS = 90 * 1000;

// PAINTABLE-FRAME gate polling schedule: the number of scrot probes and the
// pause (in seconds) between them. 150 probes x 0.2s is ~30s of waiting for a
// non-empty frame before the stack is declared not painting.
const PAINT_PROBE_ATTEMPTS = 150;
const PAINT_PROBE_INTERVAL_S = 0.2;
|
|
32
38
|
|
|
33
39
|
/** Desktop geometry for the framebuffer. v1 has no live RANDR: a resolution
|
|
34
40
|
* change is a full down -> up restart (a separate op). */
|
|
@@ -41,15 +47,25 @@ export type DesktopGeometry = {
|
|
|
41
47
|
export const DEFAULT_DESKTOP_GEOMETRY: DesktopGeometry = { width: 1280, height: 800, dpi: 96 };
|
|
42
48
|
|
|
43
49
|
/** Thrown when a stage of the launch script failed. exitCode 11/12/13 map to
|
|
44
|
-
* Xvfb / x11vnc / websockify respectively (the stage that died)
|
|
45
|
-
*
|
|
50
|
+
* Xvfb / x11vnc / websockify respectively (the stage that died); 14 is the
|
|
51
|
+
* PAINTABLE-FRAME gate (ports listening but scrot still yields an empty frame —
|
|
52
|
+
* the display is up but not actually painting). Degradation is surfaced as a
|
|
53
|
+
* value to viewers by the caller; this error is for diagnostics. */
|
|
46
54
|
export class DisplayStackError extends Error {
|
|
47
55
|
readonly exitCode: number;
|
|
48
|
-
readonly stage: "xvfb" | "x11vnc" | "websockify" | "unknown";
|
|
56
|
+
readonly stage: "xvfb" | "x11vnc" | "websockify" | "paint" | "unknown";
|
|
49
57
|
|
|
50
58
|
constructor(exitCode: number, output: string) {
|
|
51
59
|
const stage =
|
|
52
|
-
exitCode === 11
|
|
60
|
+
exitCode === 11
|
|
61
|
+
? "xvfb"
|
|
62
|
+
: exitCode === 12
|
|
63
|
+
? "x11vnc"
|
|
64
|
+
: exitCode === 13
|
|
65
|
+
? "websockify"
|
|
66
|
+
: exitCode === 14
|
|
67
|
+
? "paint"
|
|
68
|
+
: "unknown";
|
|
53
69
|
super(`desktop display stack failed at stage "${stage}" (exit ${exitCode})${output ? `:\n${output}` : ""}`);
|
|
54
70
|
this.name = "DisplayStackError";
|
|
55
71
|
this.exitCode = exitCode;
|
|
@@ -125,15 +141,41 @@ export function buildDisplayStackScript(options: EnsureDisplayStackOptions = {})
|
|
|
125
141
|
// flock -w bounds the wait so a wedged holder can't deadlock the caller; the
|
|
126
142
|
// up-script itself ALSO takes the same lock (belt + braces) so this works even
|
|
127
143
|
// against an older image that predates the wrapper.
|
|
128
|
-
|
|
144
|
+
//
|
|
145
|
+
// PAINTABLE-FRAME GATE (the completion criterion): the up-script's readiness gates
|
|
146
|
+
// only assert that Xvfb answers xdpyinfo and that x11vnc:5900 + websockify:PORT are
|
|
147
|
+
// LISTENING — NOT that the display actually PAINTS. On a stone-cold gVisor box (the
|
|
148
|
+
// machine→sandbox swap-recovery turn always hits one), Xvfb can answer and the VNC
|
|
149
|
+
// ports can bind seconds BEFORE the root window / XFCE compositor is drawable, so a
|
|
150
|
+
// scrot right after the `OPENGENI_DESKTOP_UP` marker yields a ZERO-BYTE frame — which
|
|
151
|
+
// is exactly the empty screenshot that 400s the model and blanks the human viewer.
|
|
152
|
+
// We therefore chain a real scrot probe as the completion gate: after the up-script
|
|
153
|
+
// reports success, poll scrot until it produces a NON-EMPTY frame (bounded ~30s), and
|
|
154
|
+
// only THEN let the command exit 0. If it never paints we exit 14 so the caller sees a
|
|
155
|
+
// typed DisplayStackError("paint") — an HONEST failure the worker can degrade + log,
|
|
156
|
+
// rather than a false "up" that hands the model an empty image. `-ac` on Xvfb disables
|
|
157
|
+
// access control so this root-side scrot reaches :0. Runs on a pre-check hit too (cheap
|
|
158
|
+
// — an already-up display paints on the first probe). Lives in the runtime-built script
|
|
159
|
+
// (not the baked image up-script) so it ships with the worker/api, no image rebuild.
|
|
160
|
+
const bringUp =
|
|
129
161
|
`if nc -z 127.0.0.1 ${port} >/dev/null 2>&1 && nc -z 127.0.0.1 5900 >/dev/null 2>&1; then ` +
|
|
130
162
|
`echo "OPENGENI_DESKTOP_UP port=${port} geometry=${geometry.width}x${geometry.height} dpi=${geometry.dpi} (precheck)"; ` +
|
|
131
163
|
`else ` +
|
|
132
164
|
`mkdir -p /tmp/opengeni-desktop && ` +
|
|
133
165
|
`flock -w 45 /tmp/opengeni-desktop/up.outer.lock ` +
|
|
134
166
|
`env ${env} opengeni-desktop-up; ` +
|
|
135
|
-
`fi
|
|
136
|
-
|
|
167
|
+
`fi`;
|
|
168
|
+
const paintProbe =
|
|
169
|
+
`p=/tmp/opengeni-desktop/paint-probe.png; ` +
|
|
170
|
+
`for i in $(seq 1 ${PAINT_PROBE_ATTEMPTS}); do ` +
|
|
171
|
+
`if DISPLAY=:0 scrot -o "$p" >/dev/null 2>&1 && [ -s "$p" ]; then rm -f "$p"; break; fi; ` +
|
|
172
|
+
`rm -f "$p"; ` +
|
|
173
|
+
// NOTE: NOT_PAINTING goes to STDOUT (not stderr): Modal is execCommand-only, so the
|
|
174
|
+
// caller infers the outcome by string-matching the output — stdout is always captured.
|
|
175
|
+
`if [ "$i" = "${PAINT_PROBE_ATTEMPTS}" ]; then echo "OPENGENI_DESKTOP_NOT_PAINTING scrot empty after warmup"; exit 14; fi; ` +
|
|
176
|
+
`sleep ${PAINT_PROBE_INTERVAL_S}; ` +
|
|
177
|
+
`done`;
|
|
178
|
+
return `mkdir -p /tmp/opengeni-desktop; { ${bringUp} ; } && { ${paintProbe} ; }`;
|
|
137
179
|
}
|
|
138
180
|
|
|
139
181
|
function execResultOutput(result: ExecResultLike | string): string {
|
|
@@ -157,6 +199,13 @@ function execResultExitCode(result: ExecResultLike | string): number | null {
|
|
|
157
199
|
// bare string), we infer success from the OPENGENI_DESKTOP_UP marker and infer
|
|
158
200
|
// the failing stage from the stage-failure message the script prints to stderr.
|
|
159
201
|
function inferExitFromOutput(output: string): number {
|
|
202
|
+
// Check the PAINTABLE-FRAME failure FIRST: on that path the up-script already
|
|
203
|
+
// printed OPENGENI_DESKTOP_UP (bring-up succeeded) and THEN the paint gate failed,
|
|
204
|
+
// so both markers are present — the NOT_PAINTING one is the authoritative outcome.
|
|
205
|
+
// (Modal is execCommand-only, so this string-inference path is the live one.)
|
|
206
|
+
if (/OPENGENI_DESKTOP_NOT_PAINTING/.test(output)) {
|
|
207
|
+
return 14;
|
|
208
|
+
}
|
|
160
209
|
if (/OPENGENI_DESKTOP_UP\b/.test(output)) {
|
|
161
210
|
return 0;
|
|
162
211
|
}
|
package/src/sandbox-computer.ts
CHANGED
|
@@ -67,10 +67,16 @@ const SCROLL_NOTCH_PIXELS = 100;
|
|
|
67
67
|
const SCROLL_MAX_CLICKS = 15;
|
|
68
68
|
// screenshot() never hands the model an empty image_url (the SDK turns "" into
|
|
69
69
|
// `image_url: ''`, which the model API 400s). A cold/not-yet-painting :0 can yield
|
|
70
|
-
//
|
|
71
|
-
//
|
|
72
|
-
|
|
73
|
-
|
|
70
|
+
// zero-byte frames for the WHOLE warm-up window of a freshly cold-booted box — Xvfb
|
|
71
|
+
// + XFCE + dbus + font-cache under gVisor routinely take 20s+, and the recovery path
|
|
72
|
+
// after a machine→sandbox swap ALWAYS hits a stone-cold Modal box on its first turn.
|
|
73
|
+
// So we retry across a bounded WALL-CLOCK budget (not a tiny fixed attempt count) with
|
|
74
|
+
// a short pause between tries, so that first post-cold / post-swap screenshot self-heals
|
|
75
|
+
// as the display warms — then FAIL LOUD once the budget is genuinely spent (a display
|
|
76
|
+
// that is dead, not merely warming). ~800ms of retries (the prior 3×400ms) was far too
|
|
77
|
+
// short to ride out a cold gVisor XFCE boot, so the turn failed loud on a transient.
|
|
78
|
+
// Default wall-clock budget for retrying an empty screenshot, in milliseconds.
const SCREENSHOT_WARMUP_BUDGET_MS = 30 * 1000;
// Default pause between screenshot retries, in milliseconds.
const SCREENSHOT_RETRY_DELAY_MS = 750;
|
|
74
80
|
|
|
75
81
|
export type SandboxComputerOptions = {
|
|
76
82
|
display?: string; // ":0"
|
|
@@ -79,6 +85,11 @@ export type SandboxComputerOptions = {
|
|
|
79
85
|
typeDelayMs?: number; // xdotool type --delay (default 12ms)
|
|
80
86
|
readOnly?: boolean; // when true, every WRITE action throws ComputerReadOnlyError
|
|
81
87
|
screenshotTmpDir?: string; // "/tmp"
|
|
88
|
+
// How long screenshot() keeps retrying an empty (still-warming) frame before it
|
|
89
|
+
// FAILS LOUD, and the pause between tries. Defaults to the cold-boot warm-up budget;
|
|
90
|
+
// exposed mainly so tests can shrink it (a real caller wants the full budget).
|
|
91
|
+
screenshotWarmupBudgetMs?: number;
|
|
92
|
+
screenshotRetryDelayMs?: number;
|
|
82
93
|
};
|
|
83
94
|
|
|
84
95
|
// X keysym map for keypress(): model key names → xdotool keysyms.
|
|
@@ -144,6 +155,8 @@ export class SandboxComputer implements Computer {
|
|
|
144
155
|
private readonly typeDelayMs: number;
|
|
145
156
|
private readonly readOnly: boolean;
|
|
146
157
|
private readonly tmp: string;
|
|
158
|
+
private readonly screenshotWarmupBudgetMs: number;
|
|
159
|
+
private readonly screenshotRetryDelayMs: number;
|
|
147
160
|
|
|
148
161
|
constructor(session: SandboxSessionLike, opts: SandboxComputerOptions = {}) {
|
|
149
162
|
this.session = session as unknown as ComputerSession;
|
|
@@ -155,6 +168,8 @@ export class SandboxComputer implements Computer {
|
|
|
155
168
|
this.typeDelayMs = opts.typeDelayMs ?? 12;
|
|
156
169
|
this.readOnly = opts.readOnly ?? false;
|
|
157
170
|
this.tmp = opts.screenshotTmpDir ?? "/tmp";
|
|
171
|
+
this.screenshotWarmupBudgetMs = opts.screenshotWarmupBudgetMs ?? SCREENSHOT_WARMUP_BUDGET_MS;
|
|
172
|
+
this.screenshotRetryDelayMs = opts.screenshotRetryDelayMs ?? SCREENSHOT_RETRY_DELAY_MS;
|
|
158
173
|
}
|
|
159
174
|
|
|
160
175
|
/** Rebind to a freshly resumed-by-id session after a box rollover / re-establish. */
|
|
@@ -231,17 +246,23 @@ export class SandboxComputer implements Computer {
|
|
|
231
246
|
// but momentarily not painting (XFCE/dbus still warming) recovers without
|
|
232
247
|
// failing the turn.
|
|
233
248
|
let lastError: unknown;
|
|
234
|
-
|
|
249
|
+
const deadline = Date.now() + this.screenshotWarmupBudgetMs;
|
|
250
|
+
let attempt = 0;
|
|
251
|
+
// Retry across a WALL-CLOCK budget (not a fixed count): a stone-cold box on the
|
|
252
|
+
// first post-swap / post-cold turn can take 20s+ to paint, and a zero-byte frame
|
|
253
|
+
// is a KNOWN transient during that warm-up — not a reason to fail the turn.
|
|
254
|
+
while (true) {
|
|
235
255
|
if (attempt > 0) {
|
|
236
|
-
await new Promise((r) => setTimeout(r,
|
|
256
|
+
await new Promise((r) => setTimeout(r, this.screenshotRetryDelayMs));
|
|
237
257
|
}
|
|
258
|
+
attempt++;
|
|
238
259
|
const f = `${this.tmp}/og-shot-${Date.now()}-${Math.random().toString(36).slice(2)}.png`;
|
|
239
260
|
try {
|
|
240
261
|
await this.x(`scrot --pointer --overwrite ${f}`);
|
|
241
262
|
const bytes = await this.readScreenshotBytes(f);
|
|
242
263
|
if (bytes.length === 0) {
|
|
243
264
|
// A cold/not-yet-painting :0 yields a zero-byte frame. Retry rather than
|
|
244
|
-
// hand the model an empty image_url; throw
|
|
265
|
+
// hand the model an empty image_url; throw once the budget is spent.
|
|
245
266
|
throw new ComputerUnavailableError("scrot produced an empty screenshot (display not up?)");
|
|
246
267
|
}
|
|
247
268
|
return Buffer.from(bytes).toString("base64");
|
|
@@ -252,9 +273,15 @@ export class SandboxComputer implements Computer {
|
|
|
252
273
|
// screenshot result.
|
|
253
274
|
await this.x(`rm -f ${f}`).catch(() => undefined);
|
|
254
275
|
}
|
|
276
|
+
// Stop once the warm-up budget is spent — the NEXT sleep would push us past it.
|
|
277
|
+
if (Date.now() + this.screenshotRetryDelayMs >= deadline) {
|
|
278
|
+
break;
|
|
279
|
+
}
|
|
255
280
|
}
|
|
256
|
-
// Exhausted
|
|
257
|
-
// returning "" here would surface to the model as an invalid empty
|
|
281
|
+
// Exhausted the warm-up budget: FAIL LOUD. A clear throw is the only acceptable
|
|
282
|
+
// outcome — returning "" here would surface to the model as an invalid empty
|
|
283
|
+
// image_url. Reaching here means the display was still dead after ~30s, not merely
|
|
284
|
+
// warming, so a hard action failure is correct.
|
|
258
285
|
if (lastError instanceof Error) {
|
|
259
286
|
throw lastError;
|
|
260
287
|
}
|