@lobu/worker 7.1.0 → 7.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core/types.d.ts +18 -0
- package/dist/core/types.d.ts.map +1 -1
- package/dist/gateway/sse-client.d.ts.map +1 -1
- package/dist/gateway/sse-client.js +28 -2
- package/dist/gateway/sse-client.js.map +1 -1
- package/dist/gateway/types.d.ts +2 -0
- package/dist/gateway/types.d.ts.map +1 -1
- package/dist/openclaw/transcript-snapshot.d.ts +88 -0
- package/dist/openclaw/transcript-snapshot.d.ts.map +1 -0
- package/dist/openclaw/transcript-snapshot.js +223 -0
- package/dist/openclaw/transcript-snapshot.js.map +1 -0
- package/dist/openclaw/worker.d.ts +14 -0
- package/dist/openclaw/worker.d.ts.map +1 -1
- package/dist/openclaw/worker.js +146 -0
- package/dist/openclaw/worker.js.map +1 -1
- package/package.json +2 -2
- package/src/__tests__/sse-client.test.ts +99 -0
- package/src/__tests__/transcript-snapshot.test.ts +275 -0
- package/src/core/types.ts +18 -0
- package/src/gateway/sse-client.ts +42 -14
- package/src/gateway/types.ts +15 -0
- package/src/openclaw/transcript-snapshot.ts +238 -0
- package/src/openclaw/worker.ts +165 -0
|
@@ -0,0 +1,275 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Unit tests for the worker-side per-run snapshot helpers
|
|
3
|
+
* (`hydrateFromSnapshot`, `writeSnapshot`).
|
|
4
|
+
*
|
|
5
|
+
* These exercise the HTTP-client side of the snapshot path against a mocked
|
|
6
|
+
* gateway. Coverage:
|
|
7
|
+
* - Hydrate writes the gateway's bytes verbatim to disk, fsyncs, returns
|
|
8
|
+
* the post-hydrate file size matching the byte_size column contract.
|
|
9
|
+
* - Hydrate handles 404 (no completed snapshot) → returns false, leaves
|
|
10
|
+
* the local file untouched.
|
|
11
|
+
* - Hydrate failures are non-fatal at the caller's discretion (we re-throw,
|
|
12
|
+
* caller logs+continues; behaviour verified in worker.ts but we assert
|
|
13
|
+
* the throw shape here).
|
|
14
|
+
* - writeSnapshot reads the session file, POSTs body, handles 409 (race
|
|
15
|
+
* win), missing file (early-exit worker), and empty file all silently.
|
|
16
|
+
* - The transport layer never throws — `cleanup()` runs in the worker's
|
|
17
|
+
* dying breath and any throw would abort the surrounding `finally`.
|
|
18
|
+
*
|
|
19
|
+
* The gateway test (`packages/server/src/gateway/__tests__/
|
|
20
|
+
* agent-transcript-snapshot.test.ts`) covers the route + PG side; this
|
|
21
|
+
* file is the symmetric client side.
|
|
22
|
+
*/
|
|
23
|
+
|
|
24
|
+
import { promises as fs } from "node:fs";
|
|
25
|
+
import { mkdtemp, rm } from "node:fs/promises";
|
|
26
|
+
import { tmpdir } from "node:os";
|
|
27
|
+
import { join } from "node:path";
|
|
28
|
+
import { afterEach, beforeEach, describe, expect, mock, test } from "bun:test";
|
|
29
|
+
import {
|
|
30
|
+
hydrateFromSnapshot,
|
|
31
|
+
writeSnapshot,
|
|
32
|
+
} from "../openclaw/transcript-snapshot";
|
|
33
|
+
|
|
34
|
+
// Scratch directory for the current test; recreated for every test case.
let tmp: string;
// The genuine global fetch, remembered so each test's stub can be undone.
let originalFetch: typeof globalThis.fetch;

beforeEach(async () => {
  const scratchPrefix = join(tmpdir(), "snapshot-test-");
  tmp = await mkdtemp(scratchPrefix);
  originalFetch = globalThis.fetch;
});

afterEach(async () => {
  // Put the real fetch back before touching the filesystem.
  globalThis.fetch = originalFetch;
  // `force` keeps teardown quiet when a test never created anything.
  await rm(tmp, { force: true, recursive: true });
});
|
|
46
|
+
|
|
47
|
+
/**
 * Replace `globalThis.fetch` with a bun mock that forwards every call to
 * `handler` and resolves with whatever `Response` it returns.
 *
 * @param handler Receives the request URL as a string plus the init object
 *   (an empty object when the caller passed none).
 *
 * The stub is undone by the `afterEach` hook, which restores the fetch
 * captured in `beforeEach`.
 */
function stubFetch(
  handler: (url: string, init: RequestInit) => Response
): void {
  globalThis.fetch = mock(
    async (input: RequestInfo | URL, init?: RequestInit) => {
      // `Request#toString()` is "[object Request]", so each input shape has
      // to be resolved explicitly instead of relying on string coercion.
      const url =
        typeof input === "string"
          ? input
          : input instanceof URL
            ? input.href
            : input.url;
      return handler(url, init ?? {});
    }
  ) as unknown as typeof globalThis.fetch;
}
|
|
57
|
+
|
|
58
|
+
describe("hydrateFromSnapshot", () => {
  // Gateway coordinates shared by every case in this suite.
  const gateway = {
    gatewayUrl: "http://gw.test/lobu",
    workerToken: "test-jwt",
  };

  test("boot-hydrate-fsync: writes bytes verbatim, file size matches body length", async () => {
    const sessionFile = join(tmp, ".openclaw", "session.jsonl");
    const expected =
      `{"type":"session","version":3,"id":"hydrate","timestamp":"2026-05-18T10:00:00Z","cwd":"/w"}\n` +
      `{"type":"message","id":"m1","parentId":null,"timestamp":"2026-05-18T10:00:01Z","message":{"role":"user","content":[{"type":"text","text":"resume"}]}}\n`;

    stubFetch((url, init) => {
      expect(url.endsWith("/worker/transcript/snapshot")).toBe(true);
      expect(init.method).toBe("GET");
      expect((init.headers as Record<string, string>).Authorization).toBe(
        "Bearer test-jwt"
      );
      return new Response(expected, { status: 200 });
    });

    const hydrated = await hydrateFromSnapshot({ sessionFile, ...gateway });
    expect(hydrated).toBe(true);

    // File written + fsynced → stat size matches byte_size we'd compute.
    const stats = await fs.stat(sessionFile);
    expect(stats.size).toBe(Buffer.byteLength(expected, "utf-8"));
    const back = await fs.readFile(sessionFile, "utf-8");
    expect(back).toBe(expected);
  });

  test("returns false on 404 and does not touch the file", async () => {
    const sessionFile = join(tmp, ".openclaw", "session.jsonl");
    stubFetch(() => new Response("", { status: 404 }));

    const hydrated = await hydrateFromSnapshot({ sessionFile, ...gateway });
    expect(hydrated).toBe(false);
    // No file created.
    await expect(fs.stat(sessionFile)).rejects.toThrow();
  });

  test("404 leaves a pre-existing local file byte-identical", async () => {
    // The "untouched" half of the contract: a local transcript that is
    // already on disk must survive a hydrate that finds no snapshot.
    const sessionFile = join(tmp, ".openclaw", "session.jsonl");
    const local = `{"type":"session","id":"local"}\n`;
    await fs.mkdir(join(tmp, ".openclaw"), { recursive: true });
    await fs.writeFile(sessionFile, local, "utf-8");
    stubFetch(() => new Response("", { status: 404 }));

    const hydrated = await hydrateFromSnapshot({ sessionFile, ...gateway });
    expect(hydrated).toBe(false);
    expect(await fs.readFile(sessionFile, "utf-8")).toBe(local);
  });

  test("throws on non-2xx, non-404 — caller logs + continues with local file", async () => {
    const sessionFile = join(tmp, ".openclaw", "session.jsonl");
    stubFetch(() => new Response("boom", { status: 500 }));
    await expect(
      hydrateFromSnapshot({ sessionFile, ...gateway })
    ).rejects.toThrow(/transcript hydrate failed: 500/);
    // The error body must never be written out as a session file.
    await expect(fs.stat(sessionFile)).rejects.toThrow();
  });
});
|
|
121
|
+
|
|
122
|
+
describe("writeSnapshot", () => {
  // Arguments every case shares; each test supplies sessionFile and
  // terminalStatus itself.
  const baseArgs = {
    gatewayUrl: "http://gw.test/lobu",
    workerToken: "test-jwt",
    runId: 42,
  };
  const MINIMAL_SESSION = `{"type":"session"}\n`;

  /** Path of the session file inside the per-test scratch directory. */
  function sessionPath(): string {
    return join(tmp, ".openclaw", "session.jsonl");
  }

  /** Create the session file with `contents` and return its path. */
  async function seedSessionFile(contents: string): Promise<string> {
    const sessionFile = sessionPath();
    await fs.mkdir(join(tmp, ".openclaw"), { recursive: true });
    await fs.writeFile(sessionFile, contents, "utf-8");
    return sessionFile;
  }

  /** Install a fetch stub that answers 200 with `body` and counts its calls. */
  function countingFetch(body: string): { calls: number } {
    const counter = { calls: 0 };
    stubFetch(() => {
      counter.calls++;
      return new Response(body, { status: 200 });
    });
    return counter;
  }

  test("happy path: reads file, POSTs body + terminalStatus, gateway 200", async () => {
    const body =
      `{"type":"session","version":3,"id":"write","timestamp":"2026-05-18T10:00:00Z","cwd":"/w"}\n` +
      `{"type":"message","id":"u1","parentId":null,"timestamp":"2026-05-18T10:00:01Z","message":{"role":"user","content":[{"type":"text","text":"hi"}]}}\n`;
    const sessionFile = await seedSessionFile(body);

    const posted: string[] = [];
    stubFetch((url, init) => {
      expect(url.endsWith("/worker/transcript/snapshot")).toBe(true);
      expect(init.method).toBe("POST");
      const headers = init.headers as Record<string, string>;
      expect(headers.Authorization).toBe("Bearer test-jwt");
      expect(headers["Content-Type"]).toBe("application/json");
      posted.push(init.body as string);
      return new Response('{"id":1}', { status: 200 });
    });

    await writeSnapshot({
      ...baseArgs,
      sessionFile,
      terminalStatus: "completed",
    });
    expect(posted).toHaveLength(1);
    const parsed = JSON.parse(posted[0]);
    expect(parsed.snapshotJsonl).toBe(body);
    expect(parsed.terminalStatus).toBe("completed");
    // P1#1: runId MUST be on the POST body so the route attributes the
    // snapshot to the exact run this worker claimed, not "latest run for
    // (org, agent, conv)".
    expect(parsed.runId).toBe(42);
  });

  test("non-completed terminalStatus is skipped (no POST, no waste)", async () => {
    // Hydrate filters terminal_status='completed' — writing failed/
    // timeout/cancelled rows is pure network waste. Codex round 2
    // quality win C on PR #865. The cleanup() path is also gated on
    // `terminalStatus === "completed"`, but writeSnapshot defends in
    // depth so any future caller can't accidentally write a row that
    // hydrate will never read.
    const sessionFile = await seedSessionFile(MINIMAL_SESSION);
    const counter = countingFetch("{}");

    for (const terminalStatus of ["failed", "timeout", "cancelled"] as const) {
      await writeSnapshot({ ...baseArgs, sessionFile, terminalStatus });
    }
    expect(counter.calls).toBe(0);
  });

  test("race-win-409 is benign — no throw", async () => {
    const sessionFile = await seedSessionFile(MINIMAL_SESSION);
    stubFetch(() => new Response("conflict", { status: 409 }));

    // No throw — cleanup() in the worker's dying breath must never
    // re-throw inside a `finally`.
    await expect(
      writeSnapshot({ ...baseArgs, sessionFile, terminalStatus: "completed" })
    ).resolves.toBeUndefined();
  });

  test("no session file (early-exit worker): silently skips, no fetch", async () => {
    const sessionFile = sessionPath();
    const counter = countingFetch("");

    // Must be "completed": any other status returns at the terminal-status
    // gate before the file is read, so the missing-file branch would never
    // run and this test would pass for the wrong reason.
    await expect(
      writeSnapshot({ ...baseArgs, sessionFile, terminalStatus: "completed" })
    ).resolves.toBeUndefined();
    expect(counter.calls).toBe(0);
  });

  test("empty session file is skipped — never POST an empty snapshot", async () => {
    const sessionFile = await seedSessionFile("");
    const counter = countingFetch("{}");

    await writeSnapshot({
      ...baseArgs,
      sessionFile,
      terminalStatus: "completed",
    });
    expect(counter.calls).toBe(0);
  });

  test("server 500 is logged, not thrown", async () => {
    const sessionFile = await seedSessionFile(MINIMAL_SESSION);
    stubFetch(() => new Response("boom", { status: 500 }));

    // No throw — same invariant as the 409 case. Logs go to pino; we
    // don't assert log content here.
    await expect(
      writeSnapshot({ ...baseArgs, sessionFile, terminalStatus: "completed" })
    ).resolves.toBeUndefined();
  });

  test("fetch throw is caught — cleanup() must never re-throw", async () => {
    const sessionFile = await seedSessionFile(MINIMAL_SESSION);
    globalThis.fetch = (() => {
      throw new Error("ECONNREFUSED");
    }) as unknown as typeof globalThis.fetch;

    // No throw escapes — caller is the cleanup() finally block.
    await expect(
      writeSnapshot({ ...baseArgs, sessionFile, terminalStatus: "completed" })
    ).resolves.toBeUndefined();
  });
});
|
package/src/core/types.ts
CHANGED
|
@@ -28,6 +28,24 @@ export interface WorkerConfig {
|
|
|
28
28
|
workspace: {
|
|
29
29
|
baseDirectory: string;
|
|
30
30
|
};
|
|
31
|
+
/**
|
|
32
|
+
* The runs.id of the row that dispatched this job. Set by the gateway
|
|
33
|
+
* (MessageConsumer stamps it from the runs-queue claim's job.id) so the
|
|
34
|
+
* worker's cleanup() snapshot can attribute itself to the correct run
|
|
35
|
+
* even when a follow-up run for the same conversation has already been
|
|
36
|
+
* enqueued (codex P1#1 on PR #865). Optional for backward-compatibility
|
|
37
|
+
* with legacy direct-enqueue paths that don't go through the runs queue.
|
|
38
|
+
*/
|
|
39
|
+
runId?: number;
|
|
40
|
+
/**
|
|
41
|
+
* Per-run worker JWT bound to `runId`. Set by MessageConsumer at
|
|
42
|
+
* dispatch time and used by cleanup()'s writeSnapshot call as the
|
|
43
|
+
* Authorization bearer — replaces the deployment-lifetime WORKER_TOKEN
|
|
44
|
+
* for the snapshot path so the gateway's route can require token-runId
|
|
45
|
+
* equality with body.runId (codex round 2 finding A on PR #865).
|
|
46
|
+
* When absent (legacy direct-enqueue), the snapshot write is skipped.
|
|
47
|
+
*/
|
|
48
|
+
runJobToken?: string;
|
|
31
49
|
}
|
|
32
50
|
|
|
33
51
|
export interface WorkspaceSetupConfig {
|
|
@@ -81,20 +81,33 @@ const AgentOptionsSchema = z
|
|
|
81
81
|
.passthrough();
|
|
82
82
|
|
|
83
83
|
const JobEventSchema = z.object({
|
|
84
|
-
payload: z
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
84
|
+
payload: z
|
|
85
|
+
.object({
|
|
86
|
+
botId: z.string(),
|
|
87
|
+
userId: z.string(),
|
|
88
|
+
agentId: z.string(),
|
|
89
|
+
conversationId: z.string(),
|
|
90
|
+
platform: z.string(),
|
|
91
|
+
channelId: z.string(),
|
|
92
|
+
messageId: z.string(),
|
|
93
|
+
messageText: z.string(),
|
|
94
|
+
platformMetadata: PlatformMetadataSchema,
|
|
95
|
+
agentOptions: AgentOptionsSchema,
|
|
96
|
+
jobId: z.string().optional(),
|
|
97
|
+
teamId: z.string().optional(), // Optional for WhatsApp (top-level) and Slack (in platformMetadata)
|
|
98
|
+
// Threaded through from MessageConsumer's runs-queue claim. The worker
|
|
99
|
+
// asserts these in snapshot mode (LOBU_SESSION_STORE != "file") — see
|
|
100
|
+
// worker.ts:353-360. The default zod object mode strips unknown keys,
|
|
101
|
+
// which silently dropped these fields and broke every Telegram chat
|
|
102
|
+
// when snapshot mode became the default in PR #871. Declare them
|
|
103
|
+
// explicitly so they survive parsing, and `.passthrough()` keeps any
|
|
104
|
+
// future MessagePayload field (mcpConfig, nixConfig, egressConfig,
|
|
105
|
+
// preApprovedTools, exec* fields, organizationId, networkConfig...)
|
|
106
|
+
// from regressing the same way.
|
|
107
|
+
runId: z.number().optional(),
|
|
108
|
+
runJobToken: z.string().optional(),
|
|
109
|
+
})
|
|
110
|
+
.passthrough(),
|
|
98
111
|
processedIds: z.array(z.string()).optional(),
|
|
99
112
|
});
|
|
100
113
|
|
|
@@ -919,6 +932,21 @@ export class GatewayClient {
|
|
|
919
932
|
workspace: {
|
|
920
933
|
baseDirectory: process.env.WORKSPACE_DIR || "/workspace",
|
|
921
934
|
},
|
|
935
|
+
// Threaded through from MessageConsumer (set from the runs-queue
|
|
936
|
+
// claim's job.id). Used by cleanup() to attribute the snapshot to
|
|
937
|
+
// the correct run; codex P1#1 on PR #865.
|
|
938
|
+
runId:
|
|
939
|
+
typeof payload.runId === "number" && Number.isFinite(payload.runId)
|
|
940
|
+
? payload.runId
|
|
941
|
+
: undefined,
|
|
942
|
+
// Per-run JWT minted by MessageConsumer alongside runId. Worker
|
|
943
|
+
// uses this for the snapshot POST instead of the deployment-
|
|
944
|
+
// lifetime WORKER_TOKEN, so the gateway can enforce
|
|
945
|
+
// tokenData.runId === body.runId — codex round 2 finding A.
|
|
946
|
+
runJobToken:
|
|
947
|
+
typeof payload.runJobToken === "string" && payload.runJobToken
|
|
948
|
+
? payload.runJobToken
|
|
949
|
+
: undefined,
|
|
922
950
|
};
|
|
923
951
|
}
|
|
924
952
|
|
package/src/gateway/types.ts
CHANGED
|
@@ -41,6 +41,21 @@ export interface MessagePayload {
|
|
|
41
41
|
jobId?: string; // Optional job ID from gateway
|
|
42
42
|
teamId?: string; // Optional team ID (WhatsApp uses top-level, Slack uses platformMetadata)
|
|
43
43
|
|
|
44
|
+
// The runs.id of the row that dispatched this job. Set by the gateway
|
|
45
|
+
// MessageConsumer (stamped from the runs-queue claim's job.id) and
|
|
46
|
+
// threaded into WorkerConfig.runId. The worker's cleanup() uses it to
|
|
47
|
+
// attribute the agent_transcript_snapshot row to the correct run —
|
|
48
|
+
// see codex P1#1 on PR #865.
|
|
49
|
+
runId?: number;
|
|
50
|
+
|
|
51
|
+
// Per-run worker JWT bound to `runId` above. Minted by MessageConsumer
|
|
52
|
+
// and threaded into WorkerConfig.runJobToken. The worker uses THIS
|
|
53
|
+
// token (not the deployment-lifetime WORKER_TOKEN) when calling the
|
|
54
|
+
// snapshot endpoint, so the route's `tokenData.runId === body.runId`
|
|
55
|
+
// equality check can reject any cross-run impersonation — codex round
|
|
56
|
+
// 2 finding A on PR #865.
|
|
57
|
+
runJobToken?: string;
|
|
58
|
+
|
|
44
59
|
// Job type (default: "message")
|
|
45
60
|
jobType?: JobType;
|
|
46
61
|
|
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Per-run snapshot client for OpenClaw's `session.jsonl`.
|
|
3
|
+
*
|
|
4
|
+
* Why this exists: today's PVC-backed `workspaces/` directory is read-write-
|
|
5
|
+
* once, which forces the helm chart to pin `replicaCount: 1` for the
|
|
6
|
+
* gateway/worker. Mirroring the post-run session.jsonl to Postgres lets a
|
|
7
|
+
* second pod hydrate the file on boot and resume the conversation, which is
|
|
8
|
+
* the prerequisite for dropping the PVC (Phase 5, separate PR).
|
|
9
|
+
*
|
|
10
|
+
* Design contract for the next reader:
|
|
11
|
+
* - We do NOT fork or wrap `@mariozechner/pi-coding-agent`'s `SessionManager`.
|
|
12
|
+
* It owns the file on disk; we read it back at terminal time and write
|
|
13
|
+
* the bytes verbatim to PG. The next boot writes those bytes back to
|
|
14
|
+
* disk verbatim before SessionManager.open(), so SessionManager observes
|
|
15
|
+
* a byte-identical file to what it last wrote.
|
|
16
|
+
 * - The snapshot is taken in `OpenClawWorker.cleanup()` at terminal time,
 *   but only `completed` runs are persisted: `writeSnapshot` skips
 *   `failed`, `timeout` and `cancelled` because hydrate filters for
 *   `terminal_status='completed'`, so a failed run can't poison the next
 *   worker with a dangling `tool_use` content block. Older completed
 *   snapshots remain readable; the hydrate query takes the latest one.
|
|
21
|
+
* - The worker is sandboxed — no PG access. Two new endpoints live on the
|
|
22
|
+
* existing worker gateway: `GET /worker/transcript/snapshot` for
|
|
23
|
+
* hydrate, `POST /worker/transcript/snapshot` for write. (org, agent,
|
|
24
|
+
* conv) are pulled from the worker JWT on the gateway side, so the
|
|
25
|
+
* worker can't impersonate another conversation.
|
|
26
|
+
* - Phase 5: snapshot mode is the default. `LOBU_SESSION_STORE=file`
|
|
27
|
+
* opts out for legacy/local-dev single-replica deploys. Phase 6
|
|
28
|
+
* drops the env var entirely.
|
|
29
|
+
*
|
|
30
|
+
* Trade-off accepted: a mid-run crash loses the partial transcript for that
|
|
31
|
+
* run. The next attempt re-runs from the previous user message. Tools must
|
|
32
|
+
* be idempotent (or accept user-visible re-execution).
|
|
33
|
+
*/
|
|
34
|
+
|
|
35
|
+
import { promises as fs } from "node:fs";
|
|
36
|
+
import * as path from "node:path";
|
|
37
|
+
import { createLogger } from "@lobu/core";
|
|
38
|
+
|
|
39
|
+
const logger = createLogger("transcript-snapshot");
|
|
40
|
+
|
|
41
|
+
/** Terminal outcome of a worker run, as passed to `writeSnapshot`. */
export type TerminalStatus =
  | "completed"
  | "failed"
  | "timeout"
  | "cancelled";

/** Connection and file settings shared by the snapshot helpers. */
export interface TranscriptSnapshotOptions {
  /** Absolute path of the session.jsonl that SessionManager reads and writes. */
  sessionFile: string;
  /** Base URL of the gateway, e.g. `http://127.0.0.1:8787/lobu`. */
  gatewayUrl: string;
  /** Worker JWT; the gateway derives (org, agent, conv) from this token. */
  workerToken: string;
}
|
|
51
|
+
|
|
52
|
+
/**
 * Fetch the gateway's snapshot for this worker's (org, agent, conv) and write
 * it to `opts.sessionFile`. Must run BEFORE SessionManager.open() so the
 * rehydrated content is visible at open time.
 *
 * The response is handled as raw bytes end to end, so the file on disk is
 * byte-identical to what the gateway returned and its size matches the byte
 * length of the payload.
 *
 * @param opts Session file path, gateway base URL and worker JWT.
 * @returns `true` if a snapshot was found and written; `false` on a 404
 *   (no snapshot yet), in which case the local file is not touched.
 * @throws Error on any other non-2xx response, and propagates fetch or
 *   filesystem errors — the caller decides whether to fall back to a fresh
 *   session.
 */
export async function hydrateFromSnapshot(
  opts: TranscriptSnapshotOptions
): Promise<boolean> {
  const url = `${opts.gatewayUrl}/worker/transcript/snapshot`;
  const res = await fetch(url, {
    method: "GET",
    headers: { Authorization: `Bearer ${opts.workerToken}` },
    signal: AbortSignal.timeout(30_000),
  });

  // 404 = no completed snapshot for this (org, agent, conv). First turn or
  // every previous attempt failed/timed out. Caller should start fresh.
  if (res.status === 404) {
    return false;
  }
  if (!res.ok) {
    throw new Error(
      `transcript hydrate failed: ${res.status} ${res.statusText}`
    );
  }

  // Keep the payload as bytes. Decoding to a string and re-encoding would
  // replace any invalid UTF-8 sequence with U+FFFD and break the verbatim
  // contract.
  const bytes = Buffer.from(await res.arrayBuffer());
  await fs.mkdir(path.dirname(opts.sessionFile), { recursive: true });

  // Write and fsync through the SAME writable handle. Flushing via a
  // separately opened read-only descriptor is not portable, and a single
  // handle guarantees the data is flushed before the function resolves.
  // "w" truncates any existing file; SessionManager.open() only runs after
  // this function resolves, so it never observes a partial write.
  const handle = await fs.open(opts.sessionFile, "w");
  try {
    await handle.writeFile(bytes);
    await handle.sync();
  } finally {
    await handle.close();
  }

  logger.info(
    `Hydrated session file from snapshot: ${bytes.byteLength} bytes → ${opts.sessionFile}`
  );
  return true;
}
|
|
103
|
+
|
|
104
|
+
/**
 * Read the session file in full and POST it to the gateway. Called once per
 * worker run at terminal time, from `OpenClawWorker.cleanup()`.
 *
 * Only `terminalStatus === "completed"` produces a POST; every other status
 * returns immediately because hydrate never reads those rows. A missing or
 * empty session file is skipped as well.
 *
 * This function never throws: read errors, non-2xx responses (including the
 * benign 409 duplicate) and fetch failures are logged and swallowed, because
 * the caller is a cleanup path that can do nothing further with them.
 *
 * @param opts Snapshot options plus the terminal status and the run id that
 *   is sent in the POST body.
 */
export async function writeSnapshot(
  opts: TranscriptSnapshotOptions & {
    terminalStatus: TerminalStatus;
    /**
     * The runs.id this worker claimed. Sent in the POST body so the route
     * binds the snapshot to the correct run unambiguously; the route then
     * verifies the runId actually belongs to the JWT's (org, agent, conv)
     * tuple before INSERTing. Codex P1#1 on PR #865 — without this, the
     * route fell back to a "latest run for (org, agent, conv)" lookup
     * which raced with the next user message enqueuing a fresh run.
     */
    runId: number;
  }
): Promise<void> {
  // Hydrate filters `terminal_status='completed'` — failed/timeout/cancelled
  // snapshots are never used. POSTing them is pure network waste; the
  // route would store them but no future hydrate would pick them up.
  // Skip at the source so any caller (cleanup() today, future paths
  // tomorrow) stays out of the wasteful write. Codex round 2 quality
  // win C on PR #865.
  if (opts.terminalStatus !== "completed") {
    logger.debug(
      `Skipping snapshot POST: terminal_status='${opts.terminalStatus}' is never read by hydrate`
    );
    return;
  }

  let body: string;
  try {
    body = await fs.readFile(opts.sessionFile, "utf-8");
  } catch (err) {
    // No session file = nothing to snapshot. Common when the worker exits
    // before SessionManager.open() ran (early error path).
    const isMissing =
      err instanceof Error && (err as NodeJS.ErrnoException).code === "ENOENT";
    if (isMissing) {
      logger.debug(`No session file at ${opts.sessionFile}; skipping snapshot`);
      return;
    }
    logger.warn(
      `Failed to read session file for snapshot: ${err instanceof Error ? err.message : String(err)}`
    );
    return;
  }

  if (body.length === 0) {
    logger.debug("Empty session file; skipping snapshot");
    return;
  }

  const url = `${opts.gatewayUrl}/worker/transcript/snapshot`;
  try {
    const res = await fetch(url, {
      method: "POST",
      headers: {
        Authorization: `Bearer ${opts.workerToken}`,
        "Content-Type": "application/json",
      },
      body: JSON.stringify({
        terminalStatus: opts.terminalStatus,
        snapshotJsonl: body,
        runId: opts.runId,
      }),
      // Snapshots can be large (633 KB max measured); 60s timeout covers
      // slow links + PG TOAST writes.
      signal: AbortSignal.timeout(60_000),
    });
    if (!res.ok) {
      // 409 = UNIQUE (org, agent, conv, run_id) collision. Means another
      // pod (or a retry) already wrote this snapshot — benign, drop it.
      if (res.status === 409) {
        logger.info(
          `Snapshot for run already exists (status=${opts.terminalStatus}); skipping duplicate`
        );
        return;
      }
      logger.error(`Snapshot POST failed: ${res.status} ${res.statusText}`);
      return;
    }
    // Report the encoded size: `body.length` counts UTF-16 code units, which
    // under-reports any transcript containing non-ASCII text.
    logger.info(
      `Wrote snapshot: ${Buffer.byteLength(body, "utf-8")} bytes, status=${opts.terminalStatus}`
    );
  } catch (err) {
    logger.error(
      `Snapshot POST threw: ${err instanceof Error ? err.message : String(err)}`
    );
    return;
  }
}
|
|
203
|
+
|
|
204
|
+
/**
 * Purge all snapshot rows for this worker's (org, agent, conv) by sending
 * `DELETE /worker/transcript/snapshot`. Called by the session-reset path so
 * the next boot doesn't rehydrate the conversation from Postgres after a
 * `/new`.
 *
 * Idempotent: a 404 is treated as success, since it means there was nothing
 * to purge.
 *
 * Never throws. Any other non-2xx response or a fetch failure is logged as a
 * warning; the worst case is that the next boot hydrates from the previous
 * transcript.
 *
 * @param opts Gateway base URL and worker JWT.
 */
export async function clearSnapshots(
  opts: Pick<TranscriptSnapshotOptions, "gatewayUrl" | "workerToken">
): Promise<void> {
  const url = `${opts.gatewayUrl}/worker/transcript/snapshot`;
  try {
    const res = await fetch(url, {
      method: "DELETE",
      headers: { Authorization: `Bearer ${opts.workerToken}` },
      signal: AbortSignal.timeout(30_000),
    });
    // Nothing stored is the desired end state of a purge, so it must not be
    // reported as a failure that "may rehydrate stale history".
    if (res.status === 404) {
      logger.debug("No conversation snapshots to purge for session reset");
      return;
    }
    if (!res.ok) {
      logger.warn(
        `Snapshot DELETE failed: ${res.status} ${res.statusText} — next boot may rehydrate stale history`
      );
      return;
    }
    logger.info("Purged conversation snapshots for session reset");
  } catch (err) {
    logger.warn(
      `Snapshot DELETE threw: ${err instanceof Error ? err.message : String(err)} — next boot may rehydrate stale history`
    );
  }
}
|