@lobu/worker 7.0.0 → 7.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. package/dist/core/error-handler.d.ts +0 -4
  2. package/dist/core/error-handler.d.ts.map +1 -1
  3. package/dist/core/error-handler.js +4 -15
  4. package/dist/core/error-handler.js.map +1 -1
  5. package/dist/core/types.d.ts +19 -19
  6. package/dist/core/types.d.ts.map +1 -1
  7. package/dist/core/types.js +0 -4
  8. package/dist/core/types.js.map +1 -1
  9. package/dist/core/workspace.d.ts +2 -11
  10. package/dist/core/workspace.d.ts.map +1 -1
  11. package/dist/core/workspace.js +14 -36
  12. package/dist/core/workspace.js.map +1 -1
  13. package/dist/embedded/just-bash-bootstrap.d.ts.map +1 -1
  14. package/dist/embedded/just-bash-bootstrap.js +34 -4
  15. package/dist/embedded/just-bash-bootstrap.js.map +1 -1
  16. package/dist/embedded/mcp-cli-commands.d.ts.map +1 -1
  17. package/dist/embedded/mcp-cli-commands.js +3 -38
  18. package/dist/embedded/mcp-cli-commands.js.map +1 -1
  19. package/dist/gateway/sse-client.d.ts.map +1 -1
  20. package/dist/gateway/sse-client.js +72 -10
  21. package/dist/gateway/sse-client.js.map +1 -1
  22. package/dist/gateway/types.d.ts +2 -0
  23. package/dist/gateway/types.d.ts.map +1 -1
  24. package/dist/index.d.ts.map +1 -1
  25. package/dist/index.js +7 -24
  26. package/dist/index.js.map +1 -1
  27. package/dist/instructions/builder.d.ts.map +1 -1
  28. package/dist/instructions/builder.js +2 -1
  29. package/dist/instructions/builder.js.map +1 -1
  30. package/dist/openclaw/plugin-loader.d.ts.map +1 -1
  31. package/dist/openclaw/plugin-loader.js +8 -19
  32. package/dist/openclaw/plugin-loader.js.map +1 -1
  33. package/dist/openclaw/processor.d.ts.map +1 -1
  34. package/dist/openclaw/processor.js +2 -0
  35. package/dist/openclaw/processor.js.map +1 -1
  36. package/dist/openclaw/sandbox-leak.d.ts.map +1 -1
  37. package/dist/openclaw/sandbox-leak.js +1 -6
  38. package/dist/openclaw/sandbox-leak.js.map +1 -1
  39. package/dist/openclaw/session-context.d.ts.map +1 -1
  40. package/dist/openclaw/session-context.js +3 -0
  41. package/dist/openclaw/session-context.js.map +1 -1
  42. package/dist/openclaw/tool-policy.d.ts.map +1 -1
  43. package/dist/openclaw/tool-policy.js +5 -11
  44. package/dist/openclaw/tool-policy.js.map +1 -1
  45. package/dist/openclaw/transcript-snapshot.d.ts +88 -0
  46. package/dist/openclaw/transcript-snapshot.d.ts.map +1 -0
  47. package/dist/openclaw/transcript-snapshot.js +223 -0
  48. package/dist/openclaw/transcript-snapshot.js.map +1 -0
  49. package/dist/openclaw/worker.d.ts +14 -0
  50. package/dist/openclaw/worker.d.ts.map +1 -1
  51. package/dist/openclaw/worker.js +147 -10
  52. package/dist/openclaw/worker.js.map +1 -1
  53. package/dist/server.d.ts.map +1 -1
  54. package/dist/server.js +3 -40
  55. package/dist/server.js.map +1 -1
  56. package/dist/shared/audio-provider-suggestions.d.ts.map +1 -1
  57. package/dist/shared/audio-provider-suggestions.js +4 -6
  58. package/dist/shared/audio-provider-suggestions.js.map +1 -1
  59. package/dist/shared/tool-implementations.d.ts.map +1 -1
  60. package/dist/shared/tool-implementations.js +62 -24
  61. package/dist/shared/tool-implementations.js.map +1 -1
  62. package/package.json +2 -2
  63. package/src/__tests__/processor-harden.test.ts +6 -16
  64. package/src/__tests__/sse-client.test.ts +99 -0
  65. package/src/__tests__/transcript-snapshot.test.ts +275 -0
  66. package/src/core/error-handler.ts +5 -20
  67. package/src/core/types.ts +19 -35
  68. package/src/core/workspace.ts +22 -45
  69. package/src/embedded/just-bash-bootstrap.ts +36 -4
  70. package/src/embedded/mcp-cli-commands.ts +9 -6
  71. package/src/gateway/sse-client.ts +87 -22
  72. package/src/gateway/types.ts +15 -0
  73. package/src/index.ts +8 -26
  74. package/src/instructions/builder.ts +2 -3
  75. package/src/openclaw/plugin-loader.ts +15 -19
  76. package/src/openclaw/processor.ts +1 -0
  77. package/src/openclaw/sandbox-leak.ts +1 -6
  78. package/src/openclaw/session-context.ts +3 -0
  79. package/src/openclaw/tool-policy.ts +5 -12
  80. package/src/openclaw/transcript-snapshot.ts +238 -0
  81. package/src/openclaw/worker.ts +167 -13
  82. package/src/server.ts +1 -5
  83. package/src/shared/audio-provider-suggestions.ts +4 -6
  84. package/src/shared/tool-implementations.ts +57 -16
@@ -0,0 +1,238 @@
1
+ /**
2
+ * Per-run snapshot client for OpenClaw's `session.jsonl`.
3
+ *
4
+ * Why this exists: today's PVC-backed `workspaces/` directory is read-write-
5
+ * once, which forces the helm chart to pin `replicaCount: 1` for the
6
+ * gateway/worker. Mirroring the post-run session.jsonl to Postgres lets a
7
+ * second pod hydrate the file on boot and resume the conversation, which is
8
+ * the prerequisite for dropping the PVC (Phase 5, separate PR).
9
+ *
10
+ * Design contract for the next reader:
11
+ * - We do NOT fork or wrap `@mariozechner/pi-coding-agent`'s `SessionManager`.
12
+ * It owns the file on disk; we read it back at terminal time and write
13
+ * the bytes verbatim to PG. The next boot writes those bytes back to
14
+ * disk verbatim before SessionManager.open(), so SessionManager observes
15
+ * a byte-identical file to what it last wrote.
16
+ * - `OpenClawWorker.cleanup()` runs on every terminal status (`completed`,
17
+ * `failed`, `timeout`, `cancelled`) but only POSTs a snapshot when the run
18
+ * `completed`. Hydrate also filters for `terminal_status='completed'`, so a
19
+ * failed run can't poison the next worker with a dangling `tool_use` content
20
+ * block. Older completed snapshots remain readable; hydrate takes the latest.
21
+ * - The worker is sandboxed — no PG access. Two new endpoints live on the
22
+ * existing worker gateway: `GET /worker/transcript/snapshot` for
23
+ * hydrate, `POST /worker/transcript/snapshot` for write. (org, agent,
24
+ * conv) are pulled from the worker JWT on the gateway side, so the
25
+ * worker can't impersonate another conversation.
26
+ * - Phase 5: snapshot mode is the default. `LOBU_SESSION_STORE=file`
27
+ * opts out for legacy/local-dev single-replica deploys. Phase 6
28
+ * drops the env var entirely.
29
+ *
30
+ * Trade-off accepted: a mid-run crash loses the partial transcript for that
31
+ * run. The next attempt re-runs from the previous user message. Tools must
32
+ * be idempotent (or accept user-visible re-execution).
33
+ */
34
+
35
+ import { promises as fs } from "node:fs";
36
+ import * as path from "node:path";
37
+ import { createLogger } from "@lobu/core";
38
+
39
+ const logger = createLogger("transcript-snapshot");
40
+
41
+ export type TerminalStatus = "completed" | "failed" | "timeout" | "cancelled";
42
+
43
+ export interface TranscriptSnapshotOptions {
44
+ /** Absolute path to the session.jsonl SessionManager reads/writes. */
45
+ sessionFile: string;
46
+ /** Gateway base URL (e.g. `http://127.0.0.1:8787/lobu`). */
47
+ gatewayUrl: string;
48
+ /** Worker JWT. The gateway pulls (org, agent, conv) from this token. */
49
+ workerToken: string;
50
+ }
51
+
52
+ /**
53
+ * Pull the latest `terminal_status='completed'` snapshot for this worker's
54
+ * (org, agent, conv) and write the bytes to `sessionFile`. Must run BEFORE
55
+ * SessionManager.open() so the rehydrated content is visible at open time.
56
+ *
57
+ * Returns `true` if a snapshot was found and written, `false` if no snapshot
58
+ * exists yet (first turn). Throws on transport errors and on any non-404
59
+ * error response — caller decides whether to fall back to a fresh session.
60
+ */
61
+ export async function hydrateFromSnapshot(
62
+ opts: TranscriptSnapshotOptions
63
+ ): Promise<boolean> {
64
+ const url = `${opts.gatewayUrl}/worker/transcript/snapshot`;
65
+ const res = await fetch(url, {
66
+ method: "GET",
67
+ headers: { Authorization: `Bearer ${opts.workerToken}` },
68
+ signal: AbortSignal.timeout(30_000),
69
+ });
70
+
71
+ // 404 = no completed snapshot for this (org, agent, conv). First turn or
72
+ // every previous attempt failed/timed out. Caller should start fresh.
73
+ if (res.status === 404) {
74
+ return false;
75
+ }
76
+ if (!res.ok) {
77
+ throw new Error(
78
+ `transcript hydrate failed: ${res.status} ${res.statusText}`
79
+ );
80
+ }
81
+
82
+ const body = await res.text();
83
+ await fs.mkdir(path.dirname(opts.sessionFile), { recursive: true });
84
+ // writeFile truncates and rewrites the file in place (not atomic); no
85
+ // partial state is visible to SessionManager.open() in this process
86
+ // because that call runs after this function resolves.
87
+ await fs.writeFile(opts.sessionFile, body, "utf-8");
88
+ // fsync so a pod crash between this return and SessionManager.open()
89
+ // doesn't leave the file half-written. The cost is one extra disk flush
90
+ // on every worker boot — acceptable.
91
+ const handle = await fs.open(opts.sessionFile, "r");
92
+ try {
93
+ await handle.sync();
94
+ } finally {
95
+ await handle.close();
96
+ }
97
+
98
+ logger.info(
99
+ `Hydrated session file from snapshot: ${body.length} bytes → ${opts.sessionFile}`
100
+ );
101
+ return true;
102
+ }
103
+
104
+ /**
105
+ * Read the session file in full and POST it to the gateway. Called once per
106
+ * worker run at terminal time, from `OpenClawWorker.cleanup()`. The
107
+ * `terminal_status` discriminator lets the hydrate path skip failed/timeout
108
+ * snapshots so a dangling `tool_use` doesn't poison the next attempt.
109
+ *
110
+ * Failure to snapshot is logged but does NOT throw — there's nothing the
111
+ * caller can do beyond what cleanup already does (the worker is exiting).
112
+ * The next attempt will hydrate from the previous successful snapshot.
113
+ */
114
+ export async function writeSnapshot(
115
+ opts: TranscriptSnapshotOptions & {
116
+ terminalStatus: TerminalStatus;
117
+ /**
118
+ * The runs.id this worker claimed. Sent in the POST body so the route
119
+ * binds the snapshot to the correct run unambiguously; the route then
120
+ * verifies the runId actually belongs to the JWT's (org, agent, conv)
121
+ * tuple before INSERTing. Codex P1#1 on PR #865 — without this, the
122
+ * route fell back to a "latest run for (org, agent, conv)" lookup
123
+ * which raced with the next user message enqueuing a fresh run.
124
+ */
125
+ runId: number;
126
+ }
127
+ ): Promise<void> {
128
+ // Hydrate filters `terminal_status='completed'` — failed/timeout/cancelled
129
+ // snapshots are never used. POSTing them is pure network waste; the
130
+ // route would store them but no future hydrate would pick them up.
131
+ // Skip at the source so any caller (cleanup() today, future paths
132
+ // tomorrow) stays out of the wasteful write. Codex round 2 quality
133
+ // win C on PR #865.
134
+ if (opts.terminalStatus !== "completed") {
135
+ logger.debug(
136
+ `Skipping snapshot POST: terminal_status='${opts.terminalStatus}' is never read by hydrate`
137
+ );
138
+ return;
139
+ }
140
+
141
+ let body: string;
142
+ try {
143
+ body = await fs.readFile(opts.sessionFile, "utf-8");
144
+ } catch (err) {
145
+ // No session file = nothing to snapshot. Common when the worker exits
146
+ // before SessionManager.open() ran (early error path).
147
+ const isMissing =
148
+ err instanceof Error && (err as NodeJS.ErrnoException).code === "ENOENT";
149
+ if (isMissing) {
150
+ logger.debug(`No session file at ${opts.sessionFile}; skipping snapshot`);
151
+ return;
152
+ }
153
+ logger.warn(
154
+ `Failed to read session file for snapshot: ${err instanceof Error ? err.message : String(err)}`
155
+ );
156
+ return;
157
+ }
158
+
159
+ if (body.length === 0) {
160
+ logger.debug("Empty session file; skipping snapshot");
161
+ return;
162
+ }
163
+
164
+ const url = `${opts.gatewayUrl}/worker/transcript/snapshot`;
165
+ try {
166
+ const res = await fetch(url, {
167
+ method: "POST",
168
+ headers: {
169
+ Authorization: `Bearer ${opts.workerToken}`,
170
+ "Content-Type": "application/json",
171
+ },
172
+ body: JSON.stringify({
173
+ terminalStatus: opts.terminalStatus,
174
+ snapshotJsonl: body,
175
+ runId: opts.runId,
176
+ }),
177
+ // Snapshots can be large (633 KB max measured); 60s timeout covers
178
+ // slow links + PG TOAST writes.
179
+ signal: AbortSignal.timeout(60_000),
180
+ });
181
+ if (!res.ok) {
182
+ // 409 = UNIQUE (org, agent, conv, run_id) collision. Means another
183
+ // pod (or a retry) already wrote this snapshot — benign, drop it.
184
+ if (res.status === 409) {
185
+ logger.info(
186
+ `Snapshot for run already exists (status=${opts.terminalStatus}); skipping duplicate`
187
+ );
188
+ return;
189
+ }
190
+ logger.error(`Snapshot POST failed: ${res.status} ${res.statusText}`);
191
+ return;
192
+ }
193
+ logger.info(
194
+ `Wrote snapshot: ${body.length} bytes, status=${opts.terminalStatus}`
195
+ );
196
+ } catch (err) {
197
+ logger.error(
198
+ `Snapshot POST threw: ${err instanceof Error ? err.message : String(err)}`
199
+ );
200
+ return;
201
+ }
202
+ }
203
+
204
+ /**
205
+ * Purge all snapshot rows for this worker's (org, agent, conv). Called
206
+ * by the session-reset path so the next boot doesn't rehydrate the
207
+ * conversation from Postgres after a `/new`. Never throws; any non-2xx
208
+ * response (including 404) is logged as a warning and otherwise ignored.
209
+ *
210
+ * Failures are logged but not thrown — reset is best-effort; if the
211
+ * purge HTTP call fails the worst case is the next boot hydrates from
212
+ * the previous transcript (the legacy file-mode behaviour). The local
213
+ * session.jsonl unlink is the primary signal; this is the multi-replica
214
+ * complement to it.
215
+ */
216
+ export async function clearSnapshots(
217
+ opts: Pick<TranscriptSnapshotOptions, "gatewayUrl" | "workerToken">
218
+ ): Promise<void> {
219
+ const url = `${opts.gatewayUrl}/worker/transcript/snapshot`;
220
+ try {
221
+ const res = await fetch(url, {
222
+ method: "DELETE",
223
+ headers: { Authorization: `Bearer ${opts.workerToken}` },
224
+ signal: AbortSignal.timeout(30_000),
225
+ });
226
+ if (!res.ok) {
227
+ logger.warn(
228
+ `Snapshot DELETE failed: ${res.status} ${res.statusText} — next boot may rehydrate stale history`
229
+ );
230
+ return;
231
+ }
232
+ logger.info("Purged conversation snapshots for session reset");
233
+ } catch (err) {
234
+ logger.warn(
235
+ `Snapshot DELETE threw: ${err instanceof Error ? err.message : String(err)} — next boot may rehydrate stale history`
236
+ );
237
+ }
238
+ }
@@ -54,6 +54,12 @@ import {
54
54
  resolveModelRef,
55
55
  } from "./model-resolver";
56
56
  import { checkSandboxLeak } from "./sandbox-leak";
57
+ import {
58
+ clearSnapshots,
59
+ hydrateFromSnapshot,
60
+ type TerminalStatus,
61
+ writeSnapshot,
62
+ } from "./transcript-snapshot";
57
63
  import {
58
64
  loadPlugins,
59
65
  runPluginHooks,
@@ -275,28 +281,40 @@ export class OpenClawWorker implements WorkerExecutor {
275
281
  public workerTransport: WorkerTransport;
276
282
  private config: WorkerConfig;
277
283
  private progressProcessor: OpenClawProgressProcessor;
284
+ /**
285
+ * Terminal status for the current run, used by `cleanup()` to discriminate
286
+ * the snapshot row. Defaults to `failed` (pessimistic) so an early crash
287
+ * before any return-path assignment is recorded as a failure, not silently
288
+ * accepted as a completion. Set to `completed` only on the success path
289
+ * in `execute()`. Resets on every `execute()` invocation.
290
+ */
291
+ private terminalStatus: TerminalStatus = "failed";
292
+ /**
293
+ * Path to the OpenClaw session file for the current run. Captured in
294
+ * `runAISession()` (where SessionManager opens it) so `cleanup()` can
295
+ * read it back for the snapshot without re-deriving the path.
296
+ */
297
+ private sessionFilePath: string | null = null;
278
298
 
279
299
  constructor(config: WorkerConfig) {
280
300
  this.config = config;
281
301
  this.workspaceManager = new WorkspaceManager(config.workspace);
282
302
  this.progressProcessor = new OpenClawProgressProcessor();
283
303
 
284
- // Verify required environment variables
285
304
  const gatewayUrl = process.env.DISPATCHER_URL;
286
305
  const workerToken = process.env.WORKER_TOKEN;
287
-
288
306
  if (!gatewayUrl || !workerToken) {
289
307
  throw new Error(
290
308
  "DISPATCHER_URL and WORKER_TOKEN environment variables are required"
291
309
  );
292
310
  }
293
-
294
311
  if (!config.teamId) {
295
312
  throw new Error("teamId is required for worker initialization");
296
313
  }
297
314
  if (!config.conversationId) {
298
315
  throw new Error("conversationId is required for worker initialization");
299
316
  }
317
+
300
318
  this.workerTransport = new HttpWorkerTransport({
301
319
  gatewayUrl,
302
320
  workerToken,
@@ -316,6 +334,33 @@ export class OpenClawWorker implements WorkerExecutor {
316
334
  */
317
335
  async execute(): Promise<void> {
318
336
  const executeStartTime = Date.now();
337
+ // Reset terminal status for this run. Defaults to `failed` (pessimistic);
338
+ // assigned to `completed` only on the success path below. SESSION_TIMEOUT
339
+ // throws and is reassigned in the catch block.
340
+ this.terminalStatus = "failed";
341
+
342
+ // Fail loud when snapshot mode is enabled but the per-run scope the
343
+ // gateway is supposed to provide hasn't reached this job. A silent
344
+ // skip in cleanup() would hide a configuration bug across many
345
+ // turns; throwing here surfaces it on the first turn and the runs
346
+ // queue's retry path handles re-delivery. Codex round 2 quality
347
+ // win D on PR #865.
348
+ //
349
+ // Phase 5: snapshot is the default; setting LOBU_SESSION_STORE=file
350
+ // opts out (legacy / local-dev path that keeps reading session.jsonl
351
+ // straight off disk without writing to Postgres).
352
+ if (process.env.LOBU_SESSION_STORE !== "file") {
353
+ if (typeof this.config.runId !== "number") {
354
+ throw new Error(
355
+ "Snapshot mode (LOBU_SESSION_STORE != 'file') but WorkerConfig.runId is missing — runs-queue dispatch did not stamp runId on the job payload"
356
+ );
357
+ }
358
+ if (!this.config.runJobToken) {
359
+ throw new Error(
360
+ "Snapshot mode (LOBU_SESSION_STORE != 'file') but WorkerConfig.runJobToken is missing — MessageConsumer did not mint a per-run worker token"
361
+ );
362
+ }
363
+ }
319
364
 
320
365
  try {
321
366
  this.progressProcessor.reset();
@@ -327,13 +372,11 @@ export class OpenClawWorker implements WorkerExecutor {
327
372
  `[TIMING] Worker execute() started at: ${new Date(executeStartTime).toISOString()}`
328
373
  );
329
374
 
330
- // Decode user prompt
331
375
  const userPrompt = Buffer.from(this.config.userPrompt, "base64").toString(
332
376
  "utf-8"
333
377
  );
334
378
  logger.info(`User prompt: ${userPrompt.substring(0, 100)}...`);
335
379
 
336
- // Setup workspace
337
380
  logger.info("Setting up workspace...");
338
381
 
339
382
  await Sentry.startSpan(
@@ -360,13 +403,9 @@ export class OpenClawWorker implements WorkerExecutor {
360
403
  }
361
404
  );
362
405
 
363
- // Setup I/O directories for file handling
364
406
  await this.setupIODirectories();
365
-
366
- // Download input files if any
367
407
  await this.downloadInputFiles();
368
408
 
369
- // Generate custom instructions
370
409
  let customInstructions = await generateCustomInstructions(
371
410
  [
372
411
  new OpenClawCoreInstructionProvider(),
@@ -385,7 +424,7 @@ export class OpenClawWorker implements WorkerExecutor {
385
424
  }
386
425
  );
387
426
 
388
- // Call module onSessionStart hooks to allow modules to modify system prompt
427
+ // Module hooks may modify the system prompt before agent execution.
389
428
  try {
390
429
  const { onSessionStart } = await import("../modules/lifecycle");
391
430
  const moduleContext = await onSessionStart({
@@ -407,7 +446,6 @@ export class OpenClawWorker implements WorkerExecutor {
407
446
  // Add file I/O instructions AFTER module hooks so they aren't overwritten
408
447
  customInstructions += this.getFileIOInstructions();
409
448
 
410
- // Execute AI session
411
449
  logger.info(
412
450
  `[TIMING] Starting OpenClaw session at: ${new Date().toISOString()}`
413
451
  );
@@ -468,7 +506,6 @@ export class OpenClawWorker implements WorkerExecutor {
468
506
  }
469
507
  );
470
508
 
471
- // Collect module data before sending final response
472
509
  const { collectModuleData } = await import("../modules/lifecycle");
473
510
  const moduleData = await collectModuleData({
474
511
  workspaceDir: this.workspaceManager.getCurrentWorkingDirectory(),
@@ -477,8 +514,11 @@ export class OpenClawWorker implements WorkerExecutor {
477
514
  });
478
515
  this.workerTransport.setModuleData(moduleData);
479
516
 
480
- // Handle result
481
517
  if (result.success) {
518
+ // Snapshot writer in cleanup() reads this to discriminate the row.
519
+ // Hydrate skips non-completed snapshots, so getting this right is
520
+ // what stops a failed turn from poisoning the next attempt.
521
+ this.terminalStatus = "completed";
482
522
  const outputSnapshot = this.progressProcessor.getOutputSnapshot();
483
523
  const hintGatewayUrl = process.env.DISPATCHER_URL;
484
524
  const hintWorkerToken = process.env.WORKER_TOKEN;
@@ -532,6 +572,12 @@ export class OpenClawWorker implements WorkerExecutor {
532
572
  const isTimeout = result.exitCode === 124;
533
573
 
534
574
  if (isTimeout) {
575
+ // Record `timeout` instead of `failed` so operators can distinguish
576
+ // runaway agents from genuine failures. NOTE: cleanup() only POSTs a
577
+ // snapshot for `completed`, so no `timeout` row is written today. The
578
+ // catch block below sees `SESSION_TIMEOUT` and keeps this assignment
579
+ // intact (it only forces `failed` on exceptions not already marked).
580
+ this.terminalStatus = "timeout";
535
581
  logger.info(
536
582
  `Session timed out (exit code 124) - will be retried automatically, not showing error to user`
537
583
  );
@@ -562,6 +608,55 @@ export class OpenClawWorker implements WorkerExecutor {
562
608
  }
563
609
 
564
610
  async cleanup(): Promise<void> {
611
+ // Snapshot the post-run session.jsonl to Postgres so the next worker
612
+ // (possibly on a different pod) can hydrate from it. Hydrate filters
613
+ // `terminal_status='completed'`, so we ONLY POST on the success path
614
+ // — writing `failed`/`timeout`/`cancelled` rows is pure network
615
+ // waste (codex round 2 quality win C on PR #865). Default-on in
616
+ // Phase 5; LOBU_SESSION_STORE=file opts out for legacy/local-dev.
617
+ //
618
+ // The runs queue has already moved this run to a terminal state by
619
+ // the time cleanup() fires (sse-client.ts:865 finally block runs
620
+ // after execute() returns). We POST in the worker's own dying
621
+ // breath; the gateway-side advisory lock held by the spawner is
622
+ // released when the subprocess exits, so by the next claim's boot
623
+ // this snapshot is the visible "latest" row.
624
+ if (
625
+ process.env.LOBU_SESSION_STORE !== "file" &&
626
+ this.sessionFilePath &&
627
+ this.terminalStatus === "completed"
628
+ ) {
629
+ const gatewayUrl = process.env.DISPATCHER_URL;
630
+ const runId = this.config.runId;
631
+ // Per-run JWT minted by the gateway's MessageConsumer alongside
632
+ // `runId`. The snapshot route requires `tokenData.runId ===
633
+ // body.runId`, so the deployment-lifetime WORKER_TOKEN cannot be
634
+ // used here — it would carry no `runId` and the route would 403.
635
+ // Codex round 2 finding A.
636
+ const runJobToken = this.config.runJobToken;
637
+ if (gatewayUrl && runJobToken && typeof runId === "number") {
638
+ await writeSnapshot({
639
+ sessionFile: this.sessionFilePath,
640
+ gatewayUrl,
641
+ workerToken: runJobToken,
642
+ terminalStatus: this.terminalStatus,
643
+ runId,
644
+ });
645
+ } else if (gatewayUrl) {
646
+ // Missing per-run scope (legacy direct-enqueue path or token
647
+ // mint failure on the gateway). Skip the snapshot rather than
648
+ // risk a mis-attributed row; the next run will hydrate from
649
+ // the previous completed snapshot the next time a normal
650
+ // runs-queue dispatch comes through.
651
+ logger.warn(
652
+ `Skipping transcript snapshot: ${
653
+ typeof runId !== "number"
654
+ ? "WorkerConfig.runId is missing"
655
+ : "WorkerConfig.runJobToken is missing"
656
+ } (legacy enqueue path)`
657
+ );
658
+ }
659
+ }
565
660
  logger.info("Worker cleanup completed");
566
661
  }
567
662
 
@@ -866,12 +961,56 @@ export class OpenClawWorker implements WorkerExecutor {
866
961
  await fs.mkdir(path.join(workspaceDir, ".openclaw"), { recursive: true });
867
962
 
868
963
  const sessionFile = path.join(workspaceDir, ".openclaw", "session.jsonl");
964
+ // Capture for cleanup() — it reads the file back to write the snapshot
965
+ // at terminal time. Set unconditionally so file-mode opt-outs
966
+ // still get a defined value (cleanup() skips the snapshot write when
967
+ // LOBU_SESSION_STORE=file).
968
+ this.sessionFilePath = sessionFile;
869
969
  const providerStateFile = path.join(
870
970
  workspaceDir,
871
971
  ".openclaw",
872
972
  "provider.json"
873
973
  );
874
974
 
975
+ // Hydrate from the latest completed Postgres snapshot BEFORE the
976
+ // provider-state check or SessionManager.open(). Phase 5: snapshot
977
+ // mode is the default; LOBU_SESSION_STORE=file opts out and keeps
978
+ // the legacy file-only behaviour for local-dev / single-replica
979
+ // self-hosters.
980
+ //
981
+ // Order matters: hydrate → provider check (may unlink) →
982
+ // SessionManager.open(). The provider-change unlink at line ~925 still
983
+ // does the right thing after hydrate: it drops the file we just wrote
984
+ // and SessionManager creates a fresh one, exactly like a first-turn
985
+ // boot. The next snapshot will have its own run_id, so the historical
986
+ // PG rows remain readable without poisoning the new conversation
987
+ // (hydrate would only resurrect them if a subsequent run completes
988
+ // successfully and overwrites the latest pointer).
989
+ if (process.env.LOBU_SESSION_STORE !== "file") {
990
+ const gatewayUrl = process.env.DISPATCHER_URL;
991
+ const workerToken = process.env.WORKER_TOKEN;
992
+ if (gatewayUrl && workerToken) {
993
+ try {
994
+ await hydrateFromSnapshot({
995
+ sessionFile,
996
+ gatewayUrl,
997
+ workerToken,
998
+ });
999
+ } catch (err) {
1000
+ // Hydrate failure is non-fatal — fall back to whatever's on disk.
1001
+ // Worst case the worker boots without history and the user re-
1002
+ // grounds the conversation. Better than refusing to start.
1003
+ logger.warn(
1004
+ `Snapshot hydrate failed; continuing with local session file: ${err instanceof Error ? err.message : String(err)}`
1005
+ );
1006
+ }
1007
+ } else {
1008
+ logger.warn(
1009
+ "Snapshot mode active (LOBU_SESSION_STORE != 'file') but DISPATCHER_URL or WORKER_TOKEN missing; snapshot disabled"
1010
+ );
1011
+ }
1012
+ }
1013
+
875
1014
  // Detect provider change and reset session if needed
876
1015
  let sessionSummary: string | undefined;
877
1016
  try {
@@ -1412,6 +1551,21 @@ Use it when the user references past discussions or you need context.`);
1412
1551
  // File may not exist
1413
1552
  }
1414
1553
 
1554
+ // Also purge the Postgres snapshots for this (org, agent, conv)
1555
+ // — in snapshot mode (the Phase 5 default) the next worker boot
1556
+ // would otherwise rehydrate from the now-flushed conversation
1557
+ // and the user-visible "Starting fresh" would be a lie. Best-
1558
+ // effort: a failure here is logged but doesn't block the reset
1559
+ // since the local unlink already happened. The purge is skipped
1560
+ // entirely in file mode.
1561
+ if (process.env.LOBU_SESSION_STORE !== "file") {
1562
+ const gatewayUrl = process.env.DISPATCHER_URL;
1563
+ const workerToken = process.env.WORKER_TOKEN;
1564
+ if (gatewayUrl && workerToken) {
1565
+ await clearSnapshots({ gatewayUrl, workerToken });
1566
+ }
1567
+ }
1568
+
1415
1569
  // Send visible confirmation to user
1416
1570
  await onProgress({
1417
1571
  type: "output",
package/src/server.ts CHANGED
@@ -3,7 +3,7 @@
3
3
  * Lightweight Hono server started before SSE gateway connection.
4
4
  */
5
5
 
6
- import { readFile } from "node:fs/promises";
6
+ import { readdir, readFile, stat } from "node:fs/promises";
7
7
  import { createServer } from "node:http";
8
8
  import { join } from "node:path";
9
9
  import { getRequestListener } from "@hono/node-server";
@@ -20,7 +20,6 @@ const logger = createLogger("worker-http");
20
20
  const app = new Hono();
21
21
 
22
22
  async function findSessionFile(): Promise<string | null> {
23
- const { readdir, stat } = await import("node:fs/promises");
24
23
  const workspaceDir = getOptionalEnv("WORKSPACE_DIR", "/workspace");
25
24
 
26
25
  // Direct path: {WORKSPACE_DIR}/.openclaw/session.jsonl
@@ -163,10 +162,8 @@ function entryToMessage(entry: SessionEntry): ParsedMessage | null {
163
162
  return null;
164
163
  }
165
164
 
166
- // Health check
167
165
  app.get("/health", (c) => c.json({ status: "ok" }));
168
166
 
169
- // Full session messages with cursor-based pagination
170
167
  app.get("/session/messages", async (c) => {
171
168
  const cursor = c.req.query("cursor");
172
169
  const limit = Math.min(parseInt(c.req.query("limit") || "50", 10), 200);
@@ -230,7 +227,6 @@ app.get("/session/messages", async (c) => {
230
227
  }
231
228
  });
232
229
 
233
- // Session stats
234
230
  app.get("/session/stats", async (c) => {
235
231
  try {
236
232
  const sessionPath = await findSessionFile();
@@ -5,11 +5,7 @@ interface AudioProviderSuggestions {
5
5
  usedFallback: boolean;
6
6
  }
7
7
 
8
- const FALLBACK_PROVIDER_ENTRIES = [
9
- { id: "chatgpt" },
10
- { id: "gemini" },
11
- { id: "elevenlabs" },
12
- ] as const;
8
+ const FALLBACK_PROVIDER_IDS = ["chatgpt", "gemini", "elevenlabs"] as const;
13
9
 
14
10
  const KNOWN_PROVIDER_LABELS: Record<string, string> = {
15
11
  chatgpt: "ChatGPT/OpenAI",
@@ -48,7 +44,7 @@ function getFallbackSuggestions(
48
44
  available: boolean | null
49
45
  ): AudioProviderSuggestions {
50
46
  return {
51
- providerIds: FALLBACK_PROVIDER_ENTRIES.map((entry) => entry.id),
47
+ providerIds: [...FALLBACK_PROVIDER_IDS],
52
48
  providerDisplayList: "",
53
49
  available,
54
50
  usedFallback: true,
@@ -119,6 +115,8 @@ export async function fetchAudioProviderSuggestions(params: {
119
115
  `${params.gatewayUrl}/internal/audio/capabilities`,
120
116
  {
121
117
  headers: { Authorization: `Bearer ${params.workerToken}` },
118
+ // Capability probing is best-effort; never block the agent turn on it.
119
+ signal: AbortSignal.timeout(15_000),
122
120
  }
123
121
  );
124
122
  if (!response.ok) {