@lobu/worker 7.1.0 → 7.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core/types.d.ts +18 -0
- package/dist/core/types.d.ts.map +1 -1
- package/dist/gateway/sse-client.d.ts.map +1 -1
- package/dist/gateway/sse-client.js +28 -2
- package/dist/gateway/sse-client.js.map +1 -1
- package/dist/gateway/types.d.ts +2 -0
- package/dist/gateway/types.d.ts.map +1 -1
- package/dist/openclaw/transcript-snapshot.d.ts +88 -0
- package/dist/openclaw/transcript-snapshot.d.ts.map +1 -0
- package/dist/openclaw/transcript-snapshot.js +223 -0
- package/dist/openclaw/transcript-snapshot.js.map +1 -0
- package/dist/openclaw/worker.d.ts +14 -0
- package/dist/openclaw/worker.d.ts.map +1 -1
- package/dist/openclaw/worker.js +146 -0
- package/dist/openclaw/worker.js.map +1 -1
- package/package.json +2 -2
- package/src/__tests__/sse-client.test.ts +99 -0
- package/src/__tests__/transcript-snapshot.test.ts +275 -0
- package/src/core/types.ts +18 -0
- package/src/gateway/sse-client.ts +42 -14
- package/src/gateway/types.ts +15 -0
- package/src/openclaw/transcript-snapshot.ts +238 -0
- package/src/openclaw/worker.ts +165 -0
package/src/openclaw/worker.ts
CHANGED
|
@@ -54,6 +54,12 @@ import {
|
|
|
54
54
|
resolveModelRef,
|
|
55
55
|
} from "./model-resolver";
|
|
56
56
|
import { checkSandboxLeak } from "./sandbox-leak";
|
|
57
|
+
import {
|
|
58
|
+
clearSnapshots,
|
|
59
|
+
hydrateFromSnapshot,
|
|
60
|
+
type TerminalStatus,
|
|
61
|
+
writeSnapshot,
|
|
62
|
+
} from "./transcript-snapshot";
|
|
57
63
|
import {
|
|
58
64
|
loadPlugins,
|
|
59
65
|
runPluginHooks,
|
|
@@ -275,6 +281,20 @@ export class OpenClawWorker implements WorkerExecutor {
|
|
|
275
281
|
public workerTransport: WorkerTransport;
|
|
276
282
|
private config: WorkerConfig;
|
|
277
283
|
private progressProcessor: OpenClawProgressProcessor;
|
|
284
|
+
/**
|
|
285
|
+
* Terminal status for the current run. `cleanup()` writes a snapshot only
|
|
286
|
+
* when this is `completed`. Defaults to `failed` (pessimistic) so an early crash
|
|
287
|
+
* before any return-path assignment is recorded as a failure, not silently
|
|
288
|
+
* accepted as a completion. Set to `completed` only on the success path
|
|
289
|
+
* in `execute()` and to `timeout` when the session exits with code 124. Resets on every `execute()` invocation.
|
|
290
|
+
*/
|
|
291
|
+
private terminalStatus: TerminalStatus = "failed";
|
|
292
|
+
/**
|
|
293
|
+
* Path to the OpenClaw session file for the current run. Captured in
|
|
294
|
+
* `runAISession()` (where SessionManager opens it) so `cleanup()` can
|
|
295
|
+
* read it back for the snapshot without re-deriving the path.
|
|
296
|
+
*/
|
|
297
|
+
private sessionFilePath: string | null = null;
|
|
278
298
|
|
|
279
299
|
constructor(config: WorkerConfig) {
|
|
280
300
|
this.config = config;
|
|
@@ -314,6 +334,33 @@ export class OpenClawWorker implements WorkerExecutor {
|
|
|
314
334
|
*/
|
|
315
335
|
async execute(): Promise<void> {
|
|
316
336
|
const executeStartTime = Date.now();
|
|
337
|
+
// Reset terminal status for this run. Defaults to `failed` (pessimistic);
|
|
338
|
+
// assigned to `completed` only on the success path below. On exit code 124
|
|
339
|
+
// the timeout branch sets `timeout`; the SESSION_TIMEOUT catch path keeps it.
|
|
340
|
+
this.terminalStatus = "failed";
|
|
341
|
+
|
|
342
|
+
// Fail loud when snapshot mode is enabled but the per-run scope the
|
|
343
|
+
// gateway is supposed to provide hasn't reached this job. A silent
|
|
344
|
+
// skip in cleanup() would hide a configuration bug across many
|
|
345
|
+
// turns; throwing here surfaces it on the first turn and the runs
|
|
346
|
+
// queue's retry path handles re-delivery. Codex round 2 quality
|
|
347
|
+
// win D on PR #865.
|
|
348
|
+
//
|
|
349
|
+
// Phase 5: snapshot is the default; setting LOBU_SESSION_STORE=file
|
|
350
|
+
// opts out (legacy / local-dev path that keeps reading session.jsonl
|
|
351
|
+
// straight off disk without writing to Postgres).
|
|
352
|
+
if (process.env.LOBU_SESSION_STORE !== "file") {
|
|
353
|
+
if (typeof this.config.runId !== "number") {
|
|
354
|
+
throw new Error(
|
|
355
|
+
"Snapshot mode (LOBU_SESSION_STORE != 'file') but WorkerConfig.runId is missing — runs-queue dispatch did not stamp runId on the job payload"
|
|
356
|
+
);
|
|
357
|
+
}
|
|
358
|
+
if (!this.config.runJobToken) {
|
|
359
|
+
throw new Error(
|
|
360
|
+
"Snapshot mode (LOBU_SESSION_STORE != 'file') but WorkerConfig.runJobToken is missing — MessageConsumer did not mint a per-run worker token"
|
|
361
|
+
);
|
|
362
|
+
}
|
|
363
|
+
}
|
|
317
364
|
|
|
318
365
|
try {
|
|
319
366
|
this.progressProcessor.reset();
|
|
@@ -468,6 +515,10 @@ export class OpenClawWorker implements WorkerExecutor {
|
|
|
468
515
|
this.workerTransport.setModuleData(moduleData);
|
|
469
516
|
|
|
470
517
|
if (result.success) {
|
|
518
|
+
// Snapshot writer in cleanup() reads this to discriminate the row.
|
|
519
|
+
// Hydrate skips non-completed snapshots, so getting this right is
|
|
520
|
+
// what stops a failed turn from poisoning the next attempt.
|
|
521
|
+
this.terminalStatus = "completed";
|
|
471
522
|
const outputSnapshot = this.progressProcessor.getOutputSnapshot();
|
|
472
523
|
const hintGatewayUrl = process.env.DISPATCHER_URL;
|
|
473
524
|
const hintWorkerToken = process.env.WORKER_TOKEN;
|
|
@@ -521,6 +572,12 @@ export class OpenClawWorker implements WorkerExecutor {
|
|
|
521
572
|
const isTimeout = result.exitCode === 124;
|
|
522
573
|
|
|
523
574
|
if (isTimeout) {
|
|
575
|
+
// Mark the run as `timeout` instead of `failed` so operators can
|
|
576
|
+
// distinguish runaway agents from genuine failures in the dashboard.
|
|
577
|
+
// NOTE: cleanup() only POSTs `completed` snapshots, so `timeout` is never
|
|
578
|
+
// written as a snapshot row. The catch block below sees `SESSION_TIMEOUT`
|
|
579
|
+
// and keeps this assignment intact (it only forces `failed` on exceptions that aren't already marked).
|
|
580
|
+
this.terminalStatus = "timeout";
|
|
524
581
|
logger.info(
|
|
525
582
|
`Session timed out (exit code 124) - will be retried automatically, not showing error to user`
|
|
526
583
|
);
|
|
@@ -551,6 +608,55 @@ export class OpenClawWorker implements WorkerExecutor {
|
|
|
551
608
|
}
|
|
552
609
|
|
|
553
610
|
async cleanup(): Promise<void> {
|
|
611
|
+
// Snapshot the post-run session.jsonl to Postgres so the next worker
|
|
612
|
+
// (possibly on a different pod) can hydrate from it. Hydrate filters
|
|
613
|
+
// `terminal_status='completed'`, so we ONLY POST on the success path
|
|
614
|
+
// — writing `failed`/`timeout`/`cancelled` rows is pure network
|
|
615
|
+
// waste (codex round 2 quality win C on PR #865). Default-on in
|
|
616
|
+
// Phase 5; LOBU_SESSION_STORE=file opts out for legacy/local-dev.
|
|
617
|
+
//
|
|
618
|
+
// The runs queue has already moved this run to a terminal state by
|
|
619
|
+
// the time cleanup() fires (sse-client.ts:865 finally block runs
|
|
620
|
+
// after execute() returns). We POST in the worker's own dying
|
|
621
|
+
// breath; the gateway-side advisory lock held by the spawner is
|
|
622
|
+
// released when the subprocess exits, so by the next claim's boot
|
|
623
|
+
// this snapshot is the visible "latest" row.
|
|
624
|
+
if (
|
|
625
|
+
process.env.LOBU_SESSION_STORE !== "file" &&
|
|
626
|
+
this.sessionFilePath &&
|
|
627
|
+
this.terminalStatus === "completed"
|
|
628
|
+
) {
|
|
629
|
+
const gatewayUrl = process.env.DISPATCHER_URL;
|
|
630
|
+
const runId = this.config.runId;
|
|
631
|
+
// Per-run JWT minted by the gateway's MessageConsumer alongside
|
|
632
|
+
// `runId`. The snapshot route requires `tokenData.runId ===
|
|
633
|
+
// body.runId`, so the deployment-lifetime WORKER_TOKEN cannot be
|
|
634
|
+
// used here — it would carry no `runId` and the route would 403.
|
|
635
|
+
// Codex round 2 finding A.
|
|
636
|
+
const runJobToken = this.config.runJobToken;
|
|
637
|
+
if (gatewayUrl && runJobToken && typeof runId === "number") {
|
|
638
|
+
await writeSnapshot({
|
|
639
|
+
sessionFile: this.sessionFilePath,
|
|
640
|
+
gatewayUrl,
|
|
641
|
+
workerToken: runJobToken,
|
|
642
|
+
terminalStatus: this.terminalStatus,
|
|
643
|
+
runId,
|
|
644
|
+
});
|
|
645
|
+
} else if (gatewayUrl) {
|
|
646
|
+
// Missing per-run scope (legacy direct-enqueue path or token
|
|
647
|
+
// mint failure on the gateway). Skip the snapshot rather than
|
|
648
|
+
// risk a mis-attributed row; the next normally dispatched run
|
|
649
|
+
// hydrates from the previous completed snapshot. Defensive only:
|
|
650
|
+
// execute() already throws in snapshot mode when either field is missing.
|
|
651
|
+
logger.warn(
|
|
652
|
+
`Skipping transcript snapshot: ${
|
|
653
|
+
typeof runId !== "number"
|
|
654
|
+
? "WorkerConfig.runId is missing"
|
|
655
|
+
: "WorkerConfig.runJobToken is missing"
|
|
656
|
+
} (legacy enqueue path)`
|
|
657
|
+
);
|
|
658
|
+
}
|
|
659
|
+
}
|
|
554
660
|
logger.info("Worker cleanup completed");
|
|
555
661
|
}
|
|
556
662
|
|
|
@@ -855,12 +961,56 @@ export class OpenClawWorker implements WorkerExecutor {
|
|
|
855
961
|
await fs.mkdir(path.join(workspaceDir, ".openclaw"), { recursive: true });
|
|
856
962
|
|
|
857
963
|
const sessionFile = path.join(workspaceDir, ".openclaw", "session.jsonl");
|
|
964
|
+
// Capture for cleanup() — it reads the file back to write the snapshot
|
|
965
|
+
// at terminal time. Set unconditionally so file-mode opt-outs
|
|
966
|
+
// still get a defined value (cleanup() skips the snapshot write when
|
|
967
|
+
// LOBU_SESSION_STORE=file).
|
|
968
|
+
this.sessionFilePath = sessionFile;
|
|
858
969
|
const providerStateFile = path.join(
|
|
859
970
|
workspaceDir,
|
|
860
971
|
".openclaw",
|
|
861
972
|
"provider.json"
|
|
862
973
|
);
|
|
863
974
|
|
|
975
|
+
// Hydrate from the latest completed Postgres snapshot BEFORE the
|
|
976
|
+
// provider-state check or SessionManager.open(). Phase 5: snapshot
|
|
977
|
+
// mode is the default; LOBU_SESSION_STORE=file opts out and keeps
|
|
978
|
+
// the legacy file-only behaviour for local-dev / single-replica
|
|
979
|
+
// self-hosters.
|
|
980
|
+
//
|
|
981
|
+
// Order matters: hydrate → provider check (may unlink) →
|
|
982
|
+
// SessionManager.open(). The provider-change unlink below still
|
|
983
|
+
// does the right thing after hydrate: it drops the file we just wrote
|
|
984
|
+
// and SessionManager creates a fresh one, exactly like a first-turn
|
|
985
|
+
// boot. The next snapshot will have its own run_id, so the historical
|
|
986
|
+
// PG rows remain readable without poisoning the new conversation
|
|
987
|
+
// (hydrate would only resurrect them if a subsequent run completes
|
|
988
|
+
// successfully and overwrites the latest pointer).
|
|
989
|
+
if (process.env.LOBU_SESSION_STORE !== "file") {
|
|
990
|
+
const gatewayUrl = process.env.DISPATCHER_URL;
|
|
991
|
+
const workerToken = process.env.WORKER_TOKEN;
|
|
992
|
+
if (gatewayUrl && workerToken) {
|
|
993
|
+
try {
|
|
994
|
+
await hydrateFromSnapshot({
|
|
995
|
+
sessionFile,
|
|
996
|
+
gatewayUrl,
|
|
997
|
+
workerToken,
|
|
998
|
+
});
|
|
999
|
+
} catch (err) {
|
|
1000
|
+
// Hydrate failure is non-fatal — fall back to whatever's on disk.
|
|
1001
|
+
// Worst case the worker boots without history and the user re-
|
|
1002
|
+
// grounds the conversation. Better than refusing to start.
|
|
1003
|
+
logger.warn(
|
|
1004
|
+
`Snapshot hydrate failed; continuing with local session file: ${err instanceof Error ? err.message : String(err)}`
|
|
1005
|
+
);
|
|
1006
|
+
}
|
|
1007
|
+
} else {
|
|
1008
|
+
logger.warn(
|
|
1009
|
+
"Snapshot mode active (LOBU_SESSION_STORE != 'file') but DISPATCHER_URL or WORKER_TOKEN missing; snapshot disabled"
|
|
1010
|
+
);
|
|
1011
|
+
}
|
|
1012
|
+
}
|
|
1013
|
+
|
|
864
1014
|
// Detect provider change and reset session if needed
|
|
865
1015
|
let sessionSummary: string | undefined;
|
|
866
1016
|
try {
|
|
@@ -1401,6 +1551,21 @@ Use it when the user references past discussions or you need context.`);
|
|
|
1401
1551
|
// File may not exist
|
|
1402
1552
|
}
|
|
1403
1553
|
|
|
1554
|
+
// Also purge the Postgres snapshots for this (org, agent, conv)
|
|
1555
|
+
// — in snapshot mode (the Phase 5 default) the next worker boot
|
|
1556
|
+
// would otherwise rehydrate from the now-flushed conversation
|
|
1557
|
+
// and the user-visible "Starting fresh" would be a lie. Best-
|
|
1558
|
+
// effort: the reset is not blocked because the local unlink already
|
|
1559
|
+
// happened. NOTE(review): nothing here catches a clearSnapshots() rejection,
|
|
1560
|
+
// so this assumes the helper logs and swallows its own errors — confirm. The call is skipped in file mode.
|
|
1561
|
+
if (process.env.LOBU_SESSION_STORE !== "file") {
|
|
1562
|
+
const gatewayUrl = process.env.DISPATCHER_URL;
|
|
1563
|
+
const workerToken = process.env.WORKER_TOKEN;
|
|
1564
|
+
if (gatewayUrl && workerToken) {
|
|
1565
|
+
await clearSnapshots({ gatewayUrl, workerToken });
|
|
1566
|
+
}
|
|
1567
|
+
}
|
|
1568
|
+
|
|
1404
1569
|
// Send visible confirmation to user
|
|
1405
1570
|
await onProgress({
|
|
1406
1571
|
type: "output",
|