@lobu/worker 7.1.0 → 7.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -54,6 +54,12 @@ import {
54
54
  resolveModelRef,
55
55
  } from "./model-resolver";
56
56
  import { checkSandboxLeak } from "./sandbox-leak";
57
+ import {
58
+ clearSnapshots,
59
+ hydrateFromSnapshot,
60
+ type TerminalStatus,
61
+ writeSnapshot,
62
+ } from "./transcript-snapshot";
57
63
  import {
58
64
  loadPlugins,
59
65
  runPluginHooks,
@@ -275,6 +281,20 @@ export class OpenClawWorker implements WorkerExecutor {
275
281
  public workerTransport: WorkerTransport;
276
282
  private config: WorkerConfig;
277
283
  private progressProcessor: OpenClawProgressProcessor;
284
+ /**
285
+ * Terminal status for the current run, used by `cleanup()` to discriminate
286
+ * the snapshot row. Defaults to `failed` (pessimistic) so an early crash
287
+ * before any return-path assignment is recorded as a failure, not silently
288
+ * accepted as a completion. Set to `completed` only on the success path
289
+ * in `execute()`. Resets on every `execute()` invocation.
290
+ */
291
+ private terminalStatus: TerminalStatus = "failed";
292
+ /**
293
+ * Path to the OpenClaw session file for the current run. Captured in
294
+ * `runAISession()` (where SessionManager opens it) so `cleanup()` can
295
+ * read it back for the snapshot without re-deriving the path.
296
+ */
297
+ private sessionFilePath: string | null = null;
278
298
 
279
299
  constructor(config: WorkerConfig) {
280
300
  this.config = config;
@@ -314,6 +334,33 @@ export class OpenClawWorker implements WorkerExecutor {
314
334
  */
315
335
  async execute(): Promise<void> {
316
336
  const executeStartTime = Date.now();
337
+ // Reset terminal status for this run. Defaults to `failed` (pessimistic);
338
+ // assigned to `completed` only on the success path below. SESSION_TIMEOUT
339
+ // throws and is reassigned in the catch block.
340
+ this.terminalStatus = "failed";
341
+
342
+ // Fail loud when snapshot mode is enabled but the per-run scope the
343
+ // gateway is supposed to provide hasn't reached this job. A silent
344
+ // skip in cleanup() would hide a configuration bug across many
345
+ // turns; throwing here surfaces it on the first turn and the runs
346
+ // queue's retry path handles re-delivery. Codex round 2 quality
347
+ // win D on PR #865.
348
+ //
349
+ // Phase 5: snapshot is the default; setting LOBU_SESSION_STORE=file
350
+ // opts out (legacy / local-dev path that keeps reading session.jsonl
351
+ // straight off disk without writing to Postgres).
352
+ if (process.env.LOBU_SESSION_STORE !== "file") {
353
+ if (typeof this.config.runId !== "number") {
354
+ throw new Error(
355
+ "Snapshot mode (LOBU_SESSION_STORE != 'file') but WorkerConfig.runId is missing — runs-queue dispatch did not stamp runId on the job payload"
356
+ );
357
+ }
358
+ if (!this.config.runJobToken) {
359
+ throw new Error(
360
+ "Snapshot mode (LOBU_SESSION_STORE != 'file') but WorkerConfig.runJobToken is missing — MessageConsumer did not mint a per-run worker token"
361
+ );
362
+ }
363
+ }
317
364
 
318
365
  try {
319
366
  this.progressProcessor.reset();
@@ -468,6 +515,10 @@ export class OpenClawWorker implements WorkerExecutor {
468
515
  this.workerTransport.setModuleData(moduleData);
469
516
 
470
517
  if (result.success) {
518
+ // Snapshot writer in cleanup() reads this to discriminate the row.
519
+ // Hydrate skips non-completed snapshots, so getting this right is
520
+ // what stops a failed turn from poisoning the next attempt.
521
+ this.terminalStatus = "completed";
471
522
  const outputSnapshot = this.progressProcessor.getOutputSnapshot();
472
523
  const hintGatewayUrl = process.env.DISPATCHER_URL;
473
524
  const hintWorkerToken = process.env.WORKER_TOKEN;
@@ -521,6 +572,12 @@ export class OpenClawWorker implements WorkerExecutor {
521
572
  const isTimeout = result.exitCode === 124;
522
573
 
523
574
  if (isTimeout) {
575
+ // Mark the snapshot as `timeout` instead of `failed` so operators
576
+ // can distinguish runaway agents from genuine failures in the
577
+ // dashboard. The catch block below sees `SESSION_TIMEOUT` and
578
+ // keeps this assignment intact (it only forces `failed` on
579
+ // exceptions that aren't already marked).
580
+ this.terminalStatus = "timeout";
524
581
  logger.info(
525
582
  `Session timed out (exit code 124) - will be retried automatically, not showing error to user`
526
583
  );
@@ -551,6 +608,55 @@ export class OpenClawWorker implements WorkerExecutor {
551
608
  }
552
609
 
553
610
  async cleanup(): Promise<void> {
611
+ // Snapshot the post-run session.jsonl to Postgres so the next worker
612
+ // (possibly on a different pod) can hydrate from it. Hydrate filters
613
+ // `terminal_status='completed'`, so we ONLY POST on the success path
614
+ // — writing `failed`/`timeout`/`cancelled` rows is pure network
615
+ // waste (codex round 2 quality win C on PR #865). Default-on in
616
+ // Phase 5; LOBU_SESSION_STORE=file opts out for legacy/local-dev.
617
+ //
618
+ // The runs queue has already moved this run to a terminal state by
619
+ // the time cleanup() fires (sse-client.ts:865 finally block runs
620
+ // after execute() returns). We POST in the worker's own dying
621
+ // breath; the gateway-side advisory lock held by the spawner is
622
+ // released when the subprocess exits, so by the next claim's boot
623
+ // this snapshot is the visible "latest" row.
624
+ if (
625
+ process.env.LOBU_SESSION_STORE !== "file" &&
626
+ this.sessionFilePath &&
627
+ this.terminalStatus === "completed"
628
+ ) {
629
+ const gatewayUrl = process.env.DISPATCHER_URL;
630
+ const runId = this.config.runId;
631
+ // Per-run JWT minted by the gateway's MessageConsumer alongside
632
+ // `runId`. The snapshot route requires `tokenData.runId ===
633
+ // body.runId`, so the deployment-lifetime WORKER_TOKEN cannot be
634
+ // used here — it would carry no `runId` and the route would 403.
635
+ // Codex round 2 finding A.
636
+ const runJobToken = this.config.runJobToken;
637
+ if (gatewayUrl && runJobToken && typeof runId === "number") {
638
+ await writeSnapshot({
639
+ sessionFile: this.sessionFilePath,
640
+ gatewayUrl,
641
+ workerToken: runJobToken,
642
+ terminalStatus: this.terminalStatus,
643
+ runId,
644
+ });
645
+ } else if (gatewayUrl) {
646
+ // Missing per-run scope (legacy direct-enqueue path or token
647
+ // mint failure on the gateway). Skip the snapshot rather than
648
+ // risk a mis-attributed row; the next run will hydrate from
649
+ // the previous completed snapshot the next time a normal
650
+ // runs-queue dispatch comes through.
651
+ logger.warn(
652
+ `Skipping transcript snapshot: ${
653
+ typeof runId !== "number"
654
+ ? "WorkerConfig.runId is missing"
655
+ : "WorkerConfig.runJobToken is missing"
656
+ } (legacy enqueue path)`
657
+ );
658
+ }
659
+ }
554
660
  logger.info("Worker cleanup completed");
555
661
  }
556
662
 
@@ -855,12 +961,56 @@ export class OpenClawWorker implements WorkerExecutor {
855
961
  await fs.mkdir(path.join(workspaceDir, ".openclaw"), { recursive: true });
856
962
 
857
963
  const sessionFile = path.join(workspaceDir, ".openclaw", "session.jsonl");
964
+ // Capture for cleanup() — it reads the file back to write the snapshot
965
+ // at terminal time. Set unconditionally so file-mode opt-outs
966
+ // still get a defined value (snapshot writer no-ops when
967
+ // LOBU_SESSION_STORE=file).
968
+ this.sessionFilePath = sessionFile;
858
969
  const providerStateFile = path.join(
859
970
  workspaceDir,
860
971
  ".openclaw",
861
972
  "provider.json"
862
973
  );
863
974
 
975
+ // Hydrate from the latest completed Postgres snapshot BEFORE the
976
+ // provider-state check or SessionManager.open(). Phase 5: snapshot
977
+ // mode is the default; LOBU_SESSION_STORE=file opts out and keeps
978
+ // the legacy file-only behaviour for local-dev / single-replica
979
+ // self-hosters.
980
+ //
981
+ // Order matters: hydrate → provider check (may unlink) →
982
+ // SessionManager.open(). The provider-change unlink at line ~925 still
983
+ // does the right thing after hydrate: it drops the file we just wrote
984
+ // and SessionManager creates a fresh one, exactly like a first-turn
985
+ // boot. The next snapshot will have its own run_id, so the historical
986
+ // PG rows remain readable without poisoning the new conversation
987
+ // (hydrate would only resurrect them if a subsequent run completes
988
+ // successfully and overwrites the latest pointer).
989
+ if (process.env.LOBU_SESSION_STORE !== "file") {
990
+ const gatewayUrl = process.env.DISPATCHER_URL;
991
+ const workerToken = process.env.WORKER_TOKEN;
992
+ if (gatewayUrl && workerToken) {
993
+ try {
994
+ await hydrateFromSnapshot({
995
+ sessionFile,
996
+ gatewayUrl,
997
+ workerToken,
998
+ });
999
+ } catch (err) {
1000
+ // Hydrate failure is non-fatal — fall back to whatever's on disk.
1001
+ // Worst case the worker boots without history and the user re-
1002
+ // grounds the conversation. Better than refusing to start.
1003
+ logger.warn(
1004
+ `Snapshot hydrate failed; continuing with local session file: ${err instanceof Error ? err.message : String(err)}`
1005
+ );
1006
+ }
1007
+ } else {
1008
+ logger.warn(
1009
+ "Snapshot mode active (LOBU_SESSION_STORE != 'file') but DISPATCHER_URL or WORKER_TOKEN missing; snapshot disabled"
1010
+ );
1011
+ }
1012
+ }
1013
+
864
1014
  // Detect provider change and reset session if needed
865
1015
  let sessionSummary: string | undefined;
866
1016
  try {
@@ -1401,6 +1551,21 @@ Use it when the user references past discussions or you need context.`);
1401
1551
  // File may not exist
1402
1552
  }
1403
1553
 
1554
+ // Also purge the Postgres snapshots for this (org, agent, conv)
1555
+ // — in snapshot mode (the Phase 5 default) the next worker boot
1556
+ // would otherwise rehydrate from the now-flushed conversation
1557
+ // and the user-visible "Starting fresh" would be a lie. Best-
1558
+ // effort: a failure here is logged but doesn't block the reset
1559
+ // since the local unlink already happened and the snapshot
1560
+ // helper is a no-op in file mode.
1561
+ if (process.env.LOBU_SESSION_STORE !== "file") {
1562
+ const gatewayUrl = process.env.DISPATCHER_URL;
1563
+ const workerToken = process.env.WORKER_TOKEN;
1564
+ if (gatewayUrl && workerToken) {
1565
+ await clearSnapshots({ gatewayUrl, workerToken });
1566
+ }
1567
+ }
1568
+
1404
1569
  // Send visible confirmation to user
1405
1570
  await onProgress({
1406
1571
  type: "output",