@lobu/worker 7.0.0 → 7.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core/error-handler.d.ts +0 -4
- package/dist/core/error-handler.d.ts.map +1 -1
- package/dist/core/error-handler.js +4 -15
- package/dist/core/error-handler.js.map +1 -1
- package/dist/core/types.d.ts +19 -19
- package/dist/core/types.d.ts.map +1 -1
- package/dist/core/types.js +0 -4
- package/dist/core/types.js.map +1 -1
- package/dist/core/workspace.d.ts +2 -11
- package/dist/core/workspace.d.ts.map +1 -1
- package/dist/core/workspace.js +14 -36
- package/dist/core/workspace.js.map +1 -1
- package/dist/embedded/just-bash-bootstrap.d.ts.map +1 -1
- package/dist/embedded/just-bash-bootstrap.js +34 -4
- package/dist/embedded/just-bash-bootstrap.js.map +1 -1
- package/dist/embedded/mcp-cli-commands.d.ts.map +1 -1
- package/dist/embedded/mcp-cli-commands.js +3 -38
- package/dist/embedded/mcp-cli-commands.js.map +1 -1
- package/dist/gateway/sse-client.d.ts.map +1 -1
- package/dist/gateway/sse-client.js +72 -10
- package/dist/gateway/sse-client.js.map +1 -1
- package/dist/gateway/types.d.ts +2 -0
- package/dist/gateway/types.d.ts.map +1 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +7 -24
- package/dist/index.js.map +1 -1
- package/dist/instructions/builder.d.ts.map +1 -1
- package/dist/instructions/builder.js +2 -1
- package/dist/instructions/builder.js.map +1 -1
- package/dist/openclaw/plugin-loader.d.ts.map +1 -1
- package/dist/openclaw/plugin-loader.js +8 -19
- package/dist/openclaw/plugin-loader.js.map +1 -1
- package/dist/openclaw/processor.d.ts.map +1 -1
- package/dist/openclaw/processor.js +2 -0
- package/dist/openclaw/processor.js.map +1 -1
- package/dist/openclaw/sandbox-leak.d.ts.map +1 -1
- package/dist/openclaw/sandbox-leak.js +1 -6
- package/dist/openclaw/sandbox-leak.js.map +1 -1
- package/dist/openclaw/session-context.d.ts.map +1 -1
- package/dist/openclaw/session-context.js +3 -0
- package/dist/openclaw/session-context.js.map +1 -1
- package/dist/openclaw/tool-policy.d.ts.map +1 -1
- package/dist/openclaw/tool-policy.js +5 -11
- package/dist/openclaw/tool-policy.js.map +1 -1
- package/dist/openclaw/transcript-snapshot.d.ts +88 -0
- package/dist/openclaw/transcript-snapshot.d.ts.map +1 -0
- package/dist/openclaw/transcript-snapshot.js +223 -0
- package/dist/openclaw/transcript-snapshot.js.map +1 -0
- package/dist/openclaw/worker.d.ts +14 -0
- package/dist/openclaw/worker.d.ts.map +1 -1
- package/dist/openclaw/worker.js +147 -10
- package/dist/openclaw/worker.js.map +1 -1
- package/dist/server.d.ts.map +1 -1
- package/dist/server.js +3 -40
- package/dist/server.js.map +1 -1
- package/dist/shared/audio-provider-suggestions.d.ts.map +1 -1
- package/dist/shared/audio-provider-suggestions.js +4 -6
- package/dist/shared/audio-provider-suggestions.js.map +1 -1
- package/dist/shared/tool-implementations.d.ts.map +1 -1
- package/dist/shared/tool-implementations.js +62 -24
- package/dist/shared/tool-implementations.js.map +1 -1
- package/package.json +2 -2
- package/src/__tests__/processor-harden.test.ts +6 -16
- package/src/__tests__/sse-client.test.ts +99 -0
- package/src/__tests__/transcript-snapshot.test.ts +275 -0
- package/src/core/error-handler.ts +5 -20
- package/src/core/types.ts +19 -35
- package/src/core/workspace.ts +22 -45
- package/src/embedded/just-bash-bootstrap.ts +36 -4
- package/src/embedded/mcp-cli-commands.ts +9 -6
- package/src/gateway/sse-client.ts +87 -22
- package/src/gateway/types.ts +15 -0
- package/src/index.ts +8 -26
- package/src/instructions/builder.ts +2 -3
- package/src/openclaw/plugin-loader.ts +15 -19
- package/src/openclaw/processor.ts +1 -0
- package/src/openclaw/sandbox-leak.ts +1 -6
- package/src/openclaw/session-context.ts +3 -0
- package/src/openclaw/tool-policy.ts +5 -12
- package/src/openclaw/transcript-snapshot.ts +238 -0
- package/src/openclaw/worker.ts +167 -13
- package/src/server.ts +1 -5
- package/src/shared/audio-provider-suggestions.ts +4 -6
- package/src/shared/tool-implementations.ts +57 -16
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Per-run snapshot client for OpenClaw's `session.jsonl`.
|
|
3
|
+
*
|
|
4
|
+
* Why this exists: today's PVC-backed `workspaces/` directory is read-write-
|
|
5
|
+
* once, which forces the helm chart to pin `replicaCount: 1` for the
|
|
6
|
+
* gateway/worker. Mirroring the post-run session.jsonl to Postgres lets a
|
|
7
|
+
* second pod hydrate the file on boot and resume the conversation, which is
|
|
8
|
+
* the prerequisite for dropping the PVC (Phase 5, separate PR).
|
|
9
|
+
*
|
|
10
|
+
* Design contract for the next reader:
|
|
11
|
+
* - We do NOT fork or wrap `@mariozechner/pi-coding-agent`'s `SessionManager`.
|
|
12
|
+
* It owns the file on disk; we read it back at terminal time and write
|
|
13
|
+
* the bytes verbatim to PG. The next boot writes those bytes back to
|
|
14
|
+
* disk verbatim before SessionManager.open(), so SessionManager observes
|
|
15
|
+
* a byte-identical file to what it last wrote.
|
|
16
|
+
* - The snapshot is taken in `OpenClawWorker.cleanup()`, but only when the
|
|
17
|
+
* run's terminal status is `completed`; `failed`, `timeout` and `cancelled`
|
|
18
|
+
* runs are never POSTed. Hydrate also filters for `terminal_status='completed'`
|
|
19
|
+
* so a failed run can't poison the next worker with a dangling `tool_use`
|
|
20
|
+
* content block. Older completed snapshots remain readable; the hydrate query takes the latest one.
|
|
21
|
+
* - The worker is sandboxed — no PG access. Two new endpoints live on the
|
|
22
|
+
* existing worker gateway: `GET /worker/transcript/snapshot` for
|
|
23
|
+
* hydrate, `POST /worker/transcript/snapshot` for write. (org, agent,
|
|
24
|
+
* conv) are pulled from the worker JWT on the gateway side, so the
|
|
25
|
+
* worker can't impersonate another conversation.
|
|
26
|
+
* - Phase 5: snapshot mode is the default. `LOBU_SESSION_STORE=file`
|
|
27
|
+
* opts out for legacy/local-dev single-replica deploys. Phase 6
|
|
28
|
+
* drops the env var entirely.
|
|
29
|
+
*
|
|
30
|
+
* Trade-off accepted: a mid-run crash loses the partial transcript for that
|
|
31
|
+
* run. The next attempt re-runs from the previous user message. Tools must
|
|
32
|
+
* be idempotent (or accept user-visible re-execution).
|
|
33
|
+
*/
|
|
34
|
+
|
|
35
|
+
import { promises as fs } from "node:fs";
|
|
36
|
+
import * as path from "node:path";
|
|
37
|
+
import { createLogger } from "@lobu/core";
|
|
38
|
+
|
|
39
|
+
const logger = createLogger("transcript-snapshot");
|
|
40
|
+
|
|
41
|
+
export type TerminalStatus = "completed" | "failed" | "timeout" | "cancelled";
|
|
42
|
+
|
|
43
|
+
/**
 * Connection and location details shared by the snapshot helpers in this
 * module (hydrate, write, clear).
 */
export interface TranscriptSnapshotOptions {
  /**
   * Absolute path of the `session.jsonl` file that SessionManager reads
   * from and writes to.
   */
  sessionFile: string;
  /**
   * Base URL of the gateway, e.g. `http://127.0.0.1:8787/lobu`. The helpers
   * append `/worker/transcript/snapshot` to it.
   */
  gatewayUrl: string;
  /**
   * Worker JWT sent as a bearer token. The gateway derives
   * (org, agent, conv) from it.
   */
  workerToken: string;
}
|
|
51
|
+
|
|
52
|
+
/**
 * Pull the latest `terminal_status='completed'` snapshot for this worker's
 * (org, agent, conv) and write it to `sessionFile`. Must run BEFORE
 * SessionManager.open() so the rehydrated content is visible at open time.
 *
 * The snapshot is written to a sibling temp file, fsynced, and then renamed
 * over `sessionFile`. A crash at any point therefore leaves either the
 * previous file or the complete new one on disk — never a truncated mix.
 *
 * @param opts - Session file location plus gateway URL and worker token.
 * @returns `true` if a snapshot was found and written, `false` if the
 *   gateway answered 404 (no completed snapshot yet, e.g. first turn).
 * @throws Error on any other non-2xx response, and propagates transport,
 *   timeout and filesystem errors — the caller decides whether to fall back
 *   to a fresh session.
 */
export async function hydrateFromSnapshot(
  opts: TranscriptSnapshotOptions
): Promise<boolean> {
  const url = `${opts.gatewayUrl}/worker/transcript/snapshot`;
  const res = await fetch(url, {
    method: "GET",
    headers: { Authorization: `Bearer ${opts.workerToken}` },
    signal: AbortSignal.timeout(30_000),
  });

  // 404 = no completed snapshot for this (org, agent, conv). First turn or
  // every previous attempt failed/timed out. Caller should start fresh.
  if (res.status === 404) {
    return false;
  }
  if (!res.ok) {
    throw new Error(
      `transcript hydrate failed: ${res.status} ${res.statusText}`
    );
  }

  const body = await res.text();
  await fs.mkdir(path.dirname(opts.sessionFile), { recursive: true });

  // Writing straight into `sessionFile` truncates it in place, so a crash
  // mid-write would leave a half-written transcript behind. Stage the bytes
  // in a temp file in the same directory (same filesystem, so the rename is
  // a single atomic replace) and fsync through the handle that wrote them.
  const tmpFile = `${opts.sessionFile}.${process.pid}.hydrate.tmp`;
  try {
    const handle = await fs.open(tmpFile, "w");
    try {
      await handle.writeFile(body, "utf-8");
      await handle.sync();
    } finally {
      await handle.close();
    }
    await fs.rename(tmpFile, opts.sessionFile);
  } catch (err) {
    // Best-effort removal of the staging file; the original error is what
    // the caller needs to see.
    await fs.rm(tmpFile, { force: true }).catch(() => undefined);
    throw err;
  }

  // Report the encoded size: `body.length` counts UTF-16 code units, which
  // under-reports transcripts containing non-ASCII text.
  logger.info(
    `Hydrated session file from snapshot: ${Buffer.byteLength(body, "utf-8")} bytes → ${opts.sessionFile}`
  );
  return true;
}
|
|
103
|
+
|
|
104
|
+
/**
 * Read the session file in full and POST it to the gateway. Called once per
 * worker run at terminal time, from `OpenClawWorker.cleanup()`.
 *
 * Only `completed` runs are POSTed: hydrate filters for
 * `terminal_status='completed'`, so any other status returns early without
 * touching the file or the network.
 *
 * Failure to snapshot is logged but does NOT throw — there's nothing the
 * caller can do beyond what cleanup already does (the worker is exiting).
 * The next attempt will hydrate from the previous successful snapshot.
 *
 * @param opts - Session file location, gateway URL, worker token, the run's
 *   terminal status and the `runId` the snapshot belongs to.
 * @returns Resolves once the POST finished or was skipped; never rejects.
 */
export async function writeSnapshot(
  opts: TranscriptSnapshotOptions & {
    terminalStatus: TerminalStatus;
    /**
     * The runs.id this worker claimed. Sent in the POST body so the route
     * binds the snapshot to the correct run unambiguously; the route then
     * verifies the runId actually belongs to the JWT's (org, agent, conv)
     * tuple before INSERTing. Codex P1#1 on PR #865 — without this, the
     * route fell back to a "latest run for (org, agent, conv)" lookup
     * which raced with the next user message enqueuing a fresh run.
     */
    runId: number;
  }
): Promise<void> {
  // Hydrate filters `terminal_status='completed'` — failed/timeout/cancelled
  // snapshots are never used. POSTing them is pure network waste; the
  // route would store them but no future hydrate would pick them up.
  // Skip at the source so any caller (cleanup() today, future paths
  // tomorrow) stays out of the wasteful write. Codex round 2 quality
  // win C on PR #865.
  if (opts.terminalStatus !== "completed") {
    logger.debug(
      `Skipping snapshot POST: terminal_status='${opts.terminalStatus}' is never read by hydrate`
    );
    return;
  }

  let body: string;
  try {
    body = await fs.readFile(opts.sessionFile, "utf-8");
  } catch (err) {
    // No session file = nothing to snapshot. Common when the worker exits
    // before SessionManager.open() ran (early error path).
    const isMissing =
      err instanceof Error && (err as NodeJS.ErrnoException).code === "ENOENT";
    if (isMissing) {
      logger.debug(`No session file at ${opts.sessionFile}; skipping snapshot`);
      return;
    }
    logger.warn(
      `Failed to read session file for snapshot: ${err instanceof Error ? err.message : String(err)}`
    );
    return;
  }

  if (body.length === 0) {
    logger.debug("Empty session file; skipping snapshot");
    return;
  }

  const url = `${opts.gatewayUrl}/worker/transcript/snapshot`;
  try {
    const res = await fetch(url, {
      method: "POST",
      headers: {
        Authorization: `Bearer ${opts.workerToken}`,
        "Content-Type": "application/json",
      },
      body: JSON.stringify({
        terminalStatus: opts.terminalStatus,
        snapshotJsonl: body,
        runId: opts.runId,
      }),
      // Snapshots can be large (633 KB max measured); 60s timeout covers
      // slow links + PG TOAST writes.
      signal: AbortSignal.timeout(60_000),
    });
    if (!res.ok) {
      // 409 = UNIQUE (org, agent, conv, run_id) collision. Means another
      // pod (or a retry) already wrote this snapshot — benign, drop it.
      if (res.status === 409) {
        logger.info(
          `Snapshot for run already exists (status=${opts.terminalStatus}); skipping duplicate`
        );
        return;
      }
      logger.error(`Snapshot POST failed: ${res.status} ${res.statusText}`);
      return;
    }
    // Log the encoded size: `body.length` counts UTF-16 code units and
    // under-reports any transcript containing non-ASCII text.
    logger.info(
      `Wrote snapshot: ${Buffer.byteLength(body, "utf-8")} bytes, status=${opts.terminalStatus}`
    );
  } catch (err) {
    logger.error(
      `Snapshot POST threw: ${err instanceof Error ? err.message : String(err)}`
    );
    return;
  }
}
|
|
203
|
+
|
|
204
|
+
/**
 * Purge all snapshot rows for this worker's (org, agent, conv). Called
 * by the session-reset path so the next boot doesn't rehydrate the
 * conversation from Postgres after a `/new`. Idempotent — a 404 (nothing
 * stored for this conversation) is treated as success.
 *
 * Failures are logged but not thrown — reset is best-effort; if the
 * purge HTTP call fails the worst case is the next boot hydrates from
 * the previous transcript (the legacy file-mode behaviour). The local
 * session.jsonl unlink is the primary signal; this is the multi-replica
 * complement to it.
 *
 * @param opts - Gateway base URL and worker token used for the DELETE.
 * @returns Resolves once the request finished; never rejects.
 */
export async function clearSnapshots(
  opts: Pick<TranscriptSnapshotOptions, "gatewayUrl" | "workerToken">
): Promise<void> {
  const url = `${opts.gatewayUrl}/worker/transcript/snapshot`;
  try {
    const res = await fetch(url, {
      method: "DELETE",
      headers: { Authorization: `Bearer ${opts.workerToken}` },
      signal: AbortSignal.timeout(30_000),
    });
    // Nothing to delete is the desired end state, not a failure: do not
    // warn about stale history when there is no history to rehydrate.
    if (res.status === 404) {
      logger.debug("No conversation snapshots to purge for session reset");
      return;
    }
    if (!res.ok) {
      logger.warn(
        `Snapshot DELETE failed: ${res.status} ${res.statusText} — next boot may rehydrate stale history`
      );
      return;
    }
    logger.info("Purged conversation snapshots for session reset");
  } catch (err) {
    logger.warn(
      `Snapshot DELETE threw: ${err instanceof Error ? err.message : String(err)} — next boot may rehydrate stale history`
    );
  }
}
|
package/src/openclaw/worker.ts
CHANGED
|
@@ -54,6 +54,12 @@ import {
|
|
|
54
54
|
resolveModelRef,
|
|
55
55
|
} from "./model-resolver";
|
|
56
56
|
import { checkSandboxLeak } from "./sandbox-leak";
|
|
57
|
+
import {
|
|
58
|
+
clearSnapshots,
|
|
59
|
+
hydrateFromSnapshot,
|
|
60
|
+
type TerminalStatus,
|
|
61
|
+
writeSnapshot,
|
|
62
|
+
} from "./transcript-snapshot";
|
|
57
63
|
import {
|
|
58
64
|
loadPlugins,
|
|
59
65
|
runPluginHooks,
|
|
@@ -275,28 +281,40 @@ export class OpenClawWorker implements WorkerExecutor {
|
|
|
275
281
|
public workerTransport: WorkerTransport;
|
|
276
282
|
private config: WorkerConfig;
|
|
277
283
|
private progressProcessor: OpenClawProgressProcessor;
|
|
284
|
+
/**
|
|
285
|
+
* Terminal status for the current run, used by `cleanup()` to discriminate
|
|
286
|
+
* the snapshot row. Defaults to `failed` (pessimistic) so an early crash
|
|
287
|
+
* before any return-path assignment is recorded as a failure, not silently
|
|
288
|
+
* accepted as a completion. Set to `completed` only on the success path
|
|
289
|
+
* in `execute()`. Resets on every `execute()` invocation.
|
|
290
|
+
*/
|
|
291
|
+
private terminalStatus: TerminalStatus = "failed";
|
|
292
|
+
/**
|
|
293
|
+
* Path to the OpenClaw session file for the current run. Captured in
|
|
294
|
+
* `runAISession()` (where SessionManager opens it) so `cleanup()` can
|
|
295
|
+
* read it back for the snapshot without re-deriving the path.
|
|
296
|
+
*/
|
|
297
|
+
private sessionFilePath: string | null = null;
|
|
278
298
|
|
|
279
299
|
constructor(config: WorkerConfig) {
|
|
280
300
|
this.config = config;
|
|
281
301
|
this.workspaceManager = new WorkspaceManager(config.workspace);
|
|
282
302
|
this.progressProcessor = new OpenClawProgressProcessor();
|
|
283
303
|
|
|
284
|
-
// Verify required environment variables
|
|
285
304
|
const gatewayUrl = process.env.DISPATCHER_URL;
|
|
286
305
|
const workerToken = process.env.WORKER_TOKEN;
|
|
287
|
-
|
|
288
306
|
if (!gatewayUrl || !workerToken) {
|
|
289
307
|
throw new Error(
|
|
290
308
|
"DISPATCHER_URL and WORKER_TOKEN environment variables are required"
|
|
291
309
|
);
|
|
292
310
|
}
|
|
293
|
-
|
|
294
311
|
if (!config.teamId) {
|
|
295
312
|
throw new Error("teamId is required for worker initialization");
|
|
296
313
|
}
|
|
297
314
|
if (!config.conversationId) {
|
|
298
315
|
throw new Error("conversationId is required for worker initialization");
|
|
299
316
|
}
|
|
317
|
+
|
|
300
318
|
this.workerTransport = new HttpWorkerTransport({
|
|
301
319
|
gatewayUrl,
|
|
302
320
|
workerToken,
|
|
@@ -316,6 +334,33 @@ export class OpenClawWorker implements WorkerExecutor {
|
|
|
316
334
|
*/
|
|
317
335
|
async execute(): Promise<void> {
|
|
318
336
|
const executeStartTime = Date.now();
|
|
337
|
+
// Reset terminal status for this run. Defaults to `failed` (pessimistic);
|
|
338
|
+
// assigned to `completed` only on the success path below. SESSION_TIMEOUT
|
|
339
|
+
// throws and is reassigned in the catch block.
|
|
340
|
+
this.terminalStatus = "failed";
|
|
341
|
+
|
|
342
|
+
// Fail loud when snapshot mode is enabled but the per-run scope the
|
|
343
|
+
// gateway is supposed to provide hasn't reached this job. A silent
|
|
344
|
+
// skip in cleanup() would hide a configuration bug across many
|
|
345
|
+
// turns; throwing here surfaces it on the first turn and the runs
|
|
346
|
+
// queue's retry path handles re-delivery. Codex round 2 quality
|
|
347
|
+
// win D on PR #865.
|
|
348
|
+
//
|
|
349
|
+
// Phase 5: snapshot is the default; setting LOBU_SESSION_STORE=file
|
|
350
|
+
// opts out (legacy / local-dev path that keeps reading session.jsonl
|
|
351
|
+
// straight off disk without writing to Postgres).
|
|
352
|
+
if (process.env.LOBU_SESSION_STORE !== "file") {
|
|
353
|
+
if (typeof this.config.runId !== "number") {
|
|
354
|
+
throw new Error(
|
|
355
|
+
"Snapshot mode (LOBU_SESSION_STORE != 'file') but WorkerConfig.runId is missing — runs-queue dispatch did not stamp runId on the job payload"
|
|
356
|
+
);
|
|
357
|
+
}
|
|
358
|
+
if (!this.config.runJobToken) {
|
|
359
|
+
throw new Error(
|
|
360
|
+
"Snapshot mode (LOBU_SESSION_STORE != 'file') but WorkerConfig.runJobToken is missing — MessageConsumer did not mint a per-run worker token"
|
|
361
|
+
);
|
|
362
|
+
}
|
|
363
|
+
}
|
|
319
364
|
|
|
320
365
|
try {
|
|
321
366
|
this.progressProcessor.reset();
|
|
@@ -327,13 +372,11 @@ export class OpenClawWorker implements WorkerExecutor {
|
|
|
327
372
|
`[TIMING] Worker execute() started at: ${new Date(executeStartTime).toISOString()}`
|
|
328
373
|
);
|
|
329
374
|
|
|
330
|
-
// Decode user prompt
|
|
331
375
|
const userPrompt = Buffer.from(this.config.userPrompt, "base64").toString(
|
|
332
376
|
"utf-8"
|
|
333
377
|
);
|
|
334
378
|
logger.info(`User prompt: ${userPrompt.substring(0, 100)}...`);
|
|
335
379
|
|
|
336
|
-
// Setup workspace
|
|
337
380
|
logger.info("Setting up workspace...");
|
|
338
381
|
|
|
339
382
|
await Sentry.startSpan(
|
|
@@ -360,13 +403,9 @@ export class OpenClawWorker implements WorkerExecutor {
|
|
|
360
403
|
}
|
|
361
404
|
);
|
|
362
405
|
|
|
363
|
-
// Setup I/O directories for file handling
|
|
364
406
|
await this.setupIODirectories();
|
|
365
|
-
|
|
366
|
-
// Download input files if any
|
|
367
407
|
await this.downloadInputFiles();
|
|
368
408
|
|
|
369
|
-
// Generate custom instructions
|
|
370
409
|
let customInstructions = await generateCustomInstructions(
|
|
371
410
|
[
|
|
372
411
|
new OpenClawCoreInstructionProvider(),
|
|
@@ -385,7 +424,7 @@ export class OpenClawWorker implements WorkerExecutor {
|
|
|
385
424
|
}
|
|
386
425
|
);
|
|
387
426
|
|
|
388
|
-
//
|
|
427
|
+
// Module hooks may modify the system prompt before agent execution.
|
|
389
428
|
try {
|
|
390
429
|
const { onSessionStart } = await import("../modules/lifecycle");
|
|
391
430
|
const moduleContext = await onSessionStart({
|
|
@@ -407,7 +446,6 @@ export class OpenClawWorker implements WorkerExecutor {
|
|
|
407
446
|
// Add file I/O instructions AFTER module hooks so they aren't overwritten
|
|
408
447
|
customInstructions += this.getFileIOInstructions();
|
|
409
448
|
|
|
410
|
-
// Execute AI session
|
|
411
449
|
logger.info(
|
|
412
450
|
`[TIMING] Starting OpenClaw session at: ${new Date().toISOString()}`
|
|
413
451
|
);
|
|
@@ -468,7 +506,6 @@ export class OpenClawWorker implements WorkerExecutor {
|
|
|
468
506
|
}
|
|
469
507
|
);
|
|
470
508
|
|
|
471
|
-
// Collect module data before sending final response
|
|
472
509
|
const { collectModuleData } = await import("../modules/lifecycle");
|
|
473
510
|
const moduleData = await collectModuleData({
|
|
474
511
|
workspaceDir: this.workspaceManager.getCurrentWorkingDirectory(),
|
|
@@ -477,8 +514,11 @@ export class OpenClawWorker implements WorkerExecutor {
|
|
|
477
514
|
});
|
|
478
515
|
this.workerTransport.setModuleData(moduleData);
|
|
479
516
|
|
|
480
|
-
// Handle result
|
|
481
517
|
if (result.success) {
|
|
518
|
+
// Snapshot writer in cleanup() reads this to discriminate the row.
|
|
519
|
+
// Hydrate skips non-completed snapshots, so getting this right is
|
|
520
|
+
// what stops a failed turn from poisoning the next attempt.
|
|
521
|
+
this.terminalStatus = "completed";
|
|
482
522
|
const outputSnapshot = this.progressProcessor.getOutputSnapshot();
|
|
483
523
|
const hintGatewayUrl = process.env.DISPATCHER_URL;
|
|
484
524
|
const hintWorkerToken = process.env.WORKER_TOKEN;
|
|
@@ -532,6 +572,12 @@ export class OpenClawWorker implements WorkerExecutor {
|
|
|
532
572
|
const isTimeout = result.exitCode === 124;
|
|
533
573
|
|
|
534
574
|
if (isTimeout) {
|
|
575
|
+
// Mark the snapshot as `timeout` instead of `failed` so operators
|
|
576
|
+
// can distinguish runaway agents from genuine failures in the
|
|
577
|
+
// dashboard. The catch block below sees `SESSION_TIMEOUT` and
|
|
578
|
+
// keeps this assignment intact (it only forces `failed` on
|
|
579
|
+
// exceptions that aren't already marked).
|
|
580
|
+
this.terminalStatus = "timeout";
|
|
535
581
|
logger.info(
|
|
536
582
|
`Session timed out (exit code 124) - will be retried automatically, not showing error to user`
|
|
537
583
|
);
|
|
@@ -562,6 +608,55 @@ export class OpenClawWorker implements WorkerExecutor {
|
|
|
562
608
|
}
|
|
563
609
|
|
|
564
610
|
async cleanup(): Promise<void> {
  // Mirror the finished run's session.jsonl to Postgres (via the gateway)
  // so the next worker — possibly on another pod — can hydrate from it.
  // Only `completed` runs are mirrored: hydrate filters on
  // `terminal_status='completed'`, so POSTing `failed`/`timeout`/`cancelled`
  // rows would be pure network waste (codex round 2 quality win C on
  // PR #865). Snapshot mode is the Phase 5 default;
  // LOBU_SESSION_STORE=file opts out for legacy/local-dev.
  //
  // By the time cleanup() fires the runs queue has already moved this run
  // to a terminal state (sse-client.ts:865 finally block runs after
  // execute() returns). The POST happens in the worker's dying breath; the
  // gateway-side advisory lock held by the spawner is released when the
  // subprocess exits, so by the next claim's boot this snapshot is the
  // visible "latest" row.
  const snapshotMode = process.env.LOBU_SESSION_STORE !== "file";
  const sessionFile = this.sessionFilePath;

  if (snapshotMode && sessionFile && this.terminalStatus === "completed") {
    const gatewayUrl = process.env.DISPATCHER_URL;
    const runId = this.config.runId;
    // Per-run JWT minted by the gateway's MessageConsumer alongside `runId`.
    // The snapshot route requires `tokenData.runId === body.runId`, so the
    // deployment-lifetime WORKER_TOKEN cannot be used here — it would carry
    // no `runId` and the route would 403. Codex round 2 finding A.
    const runJobToken = this.config.runJobToken;

    if (gatewayUrl) {
      if (runJobToken && typeof runId === "number") {
        await writeSnapshot({
          sessionFile,
          gatewayUrl,
          workerToken: runJobToken,
          terminalStatus: this.terminalStatus,
          runId,
        });
      } else {
        // Per-run scope is missing (legacy direct-enqueue path or a token
        // mint failure on the gateway). Skipping beats writing a
        // mis-attributed row; a later normal runs-queue dispatch will
        // hydrate from the previous completed snapshot.
        const missing =
          typeof runId !== "number"
            ? "WorkerConfig.runId is missing"
            : "WorkerConfig.runJobToken is missing";
        logger.warn(
          `Skipping transcript snapshot: ${missing} (legacy enqueue path)`
        );
      }
    }
  }

  logger.info("Worker cleanup completed");
}
|
|
567
662
|
|
|
@@ -866,12 +961,56 @@ export class OpenClawWorker implements WorkerExecutor {
|
|
|
866
961
|
await fs.mkdir(path.join(workspaceDir, ".openclaw"), { recursive: true });
|
|
867
962
|
|
|
868
963
|
const sessionFile = path.join(workspaceDir, ".openclaw", "session.jsonl");
|
|
964
|
+
// Capture for cleanup() — it reads the file back to write the snapshot
|
|
965
|
+
// at terminal time. Set unconditionally so file-mode opt-outs
|
|
966
|
+
// still get a defined value (snapshot writer no-ops when
|
|
967
|
+
// LOBU_SESSION_STORE=file).
|
|
968
|
+
this.sessionFilePath = sessionFile;
|
|
869
969
|
const providerStateFile = path.join(
|
|
870
970
|
workspaceDir,
|
|
871
971
|
".openclaw",
|
|
872
972
|
"provider.json"
|
|
873
973
|
);
|
|
874
974
|
|
|
975
|
+
// Hydrate from the latest completed Postgres snapshot BEFORE the
|
|
976
|
+
// provider-state check or SessionManager.open(). Phase 5: snapshot
|
|
977
|
+
// mode is the default; LOBU_SESSION_STORE=file opts out and keeps
|
|
978
|
+
// the legacy file-only behaviour for local-dev / single-replica
|
|
979
|
+
// self-hosters.
|
|
980
|
+
//
|
|
981
|
+
// Order matters: hydrate → provider check (may unlink) →
|
|
982
|
+
// SessionManager.open(). The provider-change unlink at line ~925 still
|
|
983
|
+
// does the right thing after hydrate: it drops the file we just wrote
|
|
984
|
+
// and SessionManager creates a fresh one, exactly like a first-turn
|
|
985
|
+
// boot. The next snapshot will have its own run_id, so the historical
|
|
986
|
+
// PG rows remain readable without poisoning the new conversation
|
|
987
|
+
// (hydrate would only resurrect them if a subsequent run completes
|
|
988
|
+
// successfully and overwrites the latest pointer).
|
|
989
|
+
if (process.env.LOBU_SESSION_STORE !== "file") {
|
|
990
|
+
const gatewayUrl = process.env.DISPATCHER_URL;
|
|
991
|
+
const workerToken = process.env.WORKER_TOKEN;
|
|
992
|
+
if (gatewayUrl && workerToken) {
|
|
993
|
+
try {
|
|
994
|
+
await hydrateFromSnapshot({
|
|
995
|
+
sessionFile,
|
|
996
|
+
gatewayUrl,
|
|
997
|
+
workerToken,
|
|
998
|
+
});
|
|
999
|
+
} catch (err) {
|
|
1000
|
+
// Hydrate failure is non-fatal — fall back to whatever's on disk.
|
|
1001
|
+
// Worst case the worker boots without history and the user re-
|
|
1002
|
+
// grounds the conversation. Better than refusing to start.
|
|
1003
|
+
logger.warn(
|
|
1004
|
+
`Snapshot hydrate failed; continuing with local session file: ${err instanceof Error ? err.message : String(err)}`
|
|
1005
|
+
);
|
|
1006
|
+
}
|
|
1007
|
+
} else {
|
|
1008
|
+
logger.warn(
|
|
1009
|
+
"Snapshot mode active (LOBU_SESSION_STORE != 'file') but DISPATCHER_URL or WORKER_TOKEN missing; snapshot disabled"
|
|
1010
|
+
);
|
|
1011
|
+
}
|
|
1012
|
+
}
|
|
1013
|
+
|
|
875
1014
|
// Detect provider change and reset session if needed
|
|
876
1015
|
let sessionSummary: string | undefined;
|
|
877
1016
|
try {
|
|
@@ -1412,6 +1551,21 @@ Use it when the user references past discussions or you need context.`);
|
|
|
1412
1551
|
// File may not exist
|
|
1413
1552
|
}
|
|
1414
1553
|
|
|
1554
|
+
// Also purge the Postgres snapshots for this (org, agent, conv)
|
|
1555
|
+
// — in snapshot mode (the Phase 5 default) the next worker boot
|
|
1556
|
+
// would otherwise rehydrate from the now-flushed conversation
|
|
1557
|
+
// and the user-visible "Starting fresh" would be a lie. Best-
|
|
1558
|
+
// effort: a failure here is logged but doesn't block the reset
|
|
1559
|
+
// since the local unlink already happened and the snapshot
|
|
1560
|
+
// helper is a no-op in file mode.
|
|
1561
|
+
if (process.env.LOBU_SESSION_STORE !== "file") {
|
|
1562
|
+
const gatewayUrl = process.env.DISPATCHER_URL;
|
|
1563
|
+
const workerToken = process.env.WORKER_TOKEN;
|
|
1564
|
+
if (gatewayUrl && workerToken) {
|
|
1565
|
+
await clearSnapshots({ gatewayUrl, workerToken });
|
|
1566
|
+
}
|
|
1567
|
+
}
|
|
1568
|
+
|
|
1415
1569
|
// Send visible confirmation to user
|
|
1416
1570
|
await onProgress({
|
|
1417
1571
|
type: "output",
|
package/src/server.ts
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
* Lightweight Hono server started before SSE gateway connection.
|
|
4
4
|
*/
|
|
5
5
|
|
|
6
|
-
import { readFile } from "node:fs/promises";
|
|
6
|
+
import { readdir, readFile, stat } from "node:fs/promises";
|
|
7
7
|
import { createServer } from "node:http";
|
|
8
8
|
import { join } from "node:path";
|
|
9
9
|
import { getRequestListener } from "@hono/node-server";
|
|
@@ -20,7 +20,6 @@ const logger = createLogger("worker-http");
|
|
|
20
20
|
const app = new Hono();
|
|
21
21
|
|
|
22
22
|
async function findSessionFile(): Promise<string | null> {
|
|
23
|
-
const { readdir, stat } = await import("node:fs/promises");
|
|
24
23
|
const workspaceDir = getOptionalEnv("WORKSPACE_DIR", "/workspace");
|
|
25
24
|
|
|
26
25
|
// Direct path: {WORKSPACE_DIR}/.openclaw/session.jsonl
|
|
@@ -163,10 +162,8 @@ function entryToMessage(entry: SessionEntry): ParsedMessage | null {
|
|
|
163
162
|
return null;
|
|
164
163
|
}
|
|
165
164
|
|
|
166
|
-
// Health check
|
|
167
165
|
app.get("/health", (c) => c.json({ status: "ok" }));
|
|
168
166
|
|
|
169
|
-
// Full session messages with cursor-based pagination
|
|
170
167
|
app.get("/session/messages", async (c) => {
|
|
171
168
|
const cursor = c.req.query("cursor");
|
|
172
169
|
const limit = Math.min(parseInt(c.req.query("limit") || "50", 10), 200);
|
|
@@ -230,7 +227,6 @@ app.get("/session/messages", async (c) => {
|
|
|
230
227
|
}
|
|
231
228
|
});
|
|
232
229
|
|
|
233
|
-
// Session stats
|
|
234
230
|
app.get("/session/stats", async (c) => {
|
|
235
231
|
try {
|
|
236
232
|
const sessionPath = await findSessionFile();
|
|
@@ -5,11 +5,7 @@ interface AudioProviderSuggestions {
|
|
|
5
5
|
usedFallback: boolean;
|
|
6
6
|
}
|
|
7
7
|
|
|
8
|
-
const
|
|
9
|
-
{ id: "chatgpt" },
|
|
10
|
-
{ id: "gemini" },
|
|
11
|
-
{ id: "elevenlabs" },
|
|
12
|
-
] as const;
|
|
8
|
+
const FALLBACK_PROVIDER_IDS = ["chatgpt", "gemini", "elevenlabs"] as const;
|
|
13
9
|
|
|
14
10
|
const KNOWN_PROVIDER_LABELS: Record<string, string> = {
|
|
15
11
|
chatgpt: "ChatGPT/OpenAI",
|
|
@@ -48,7 +44,7 @@ function getFallbackSuggestions(
|
|
|
48
44
|
available: boolean | null
|
|
49
45
|
): AudioProviderSuggestions {
|
|
50
46
|
return {
|
|
51
|
-
providerIds:
|
|
47
|
+
providerIds: [...FALLBACK_PROVIDER_IDS],
|
|
52
48
|
providerDisplayList: "",
|
|
53
49
|
available,
|
|
54
50
|
usedFallback: true,
|
|
@@ -119,6 +115,8 @@ export async function fetchAudioProviderSuggestions(params: {
|
|
|
119
115
|
`${params.gatewayUrl}/internal/audio/capabilities`,
|
|
120
116
|
{
|
|
121
117
|
headers: { Authorization: `Bearer ${params.workerToken}` },
|
|
118
|
+
// Capability probing is best-effort; never block the agent turn on it.
|
|
119
|
+
signal: AbortSignal.timeout(15_000),
|
|
122
120
|
}
|
|
123
121
|
);
|
|
124
122
|
if (!response.ok) {
|