@lobu/worker 7.1.0 → 7.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,275 @@
1
+ /**
2
+ * Unit tests for the worker-side per-run snapshot helpers
3
+ * (`hydrateFromSnapshot`, `writeSnapshot`).
4
+ *
5
+ * These exercise the HTTP-client side of the snapshot path against a mocked
6
+ * gateway. Coverage:
7
+ * - Hydrate writes the gateway's bytes verbatim to disk, fsyncs, returns
8
+ * the post-hydrate file size matching the byte_size column contract.
9
+ * - Hydrate handles 404 (no completed snapshot) → returns false, leaves
10
+ * the local file untouched.
11
+ * - Hydrate failures are non-fatal at the caller's discretion (we re-throw,
12
+ * caller logs+continues; behaviour verified in worker.ts but we assert
13
+ * the throw shape here).
14
+ * - writeSnapshot reads the session file, POSTs body, handles 409 (race
15
+ * win), missing file (early-exit worker), and empty file all silently.
16
+ * - The transport layer never throws — `cleanup()` runs in the worker's
17
+ * dying breath and any throw would abort the surrounding `finally`.
18
+ *
19
+ * The gateway test (`packages/server/src/gateway/__tests__/
20
+ * agent-transcript-snapshot.test.ts`) covers the route + PG side; this
21
+ * file is the symmetric client side.
22
+ */
23
+
24
+ import { promises as fs } from "node:fs";
25
+ import { mkdtemp, rm } from "node:fs/promises";
26
+ import { tmpdir } from "node:os";
27
+ import { join } from "node:path";
28
+ import { afterEach, beforeEach, describe, expect, mock, test } from "bun:test";
29
+ import {
30
+ hydrateFromSnapshot,
31
+ writeSnapshot,
32
+ } from "../openclaw/transcript-snapshot";
33
+
// Scratch directory unique to the running test; created in beforeEach and
// deleted in afterEach so tests never see each other's session files.
let tmp: string;
// The fetch implementation that was installed before the test stubbed it.
let originalFetch: typeof globalThis.fetch;

beforeEach(async () => {
  const prefix = join(tmpdir(), "snapshot-test-");
  tmp = await mkdtemp(prefix);
  originalFetch = globalThis.fetch;
});

afterEach(async () => {
  // Undo any stub installed by the test before removing the scratch dir.
  globalThis.fetch = originalFetch;
  await rm(tmp, { force: true, recursive: true });
});
46
+
/**
 * Replace `globalThis.fetch` with a bun `mock` that forwards every call to
 * `handler` and resolves with whatever `handler` returns.
 *
 * @param handler Receives the request URL as a string plus the `init`
 *   object (an empty object when the caller passed none) and returns the
 *   `Response` the stubbed fetch should resolve with.
 *
 * The stub is undone by the `afterEach` hook, which restores the saved
 * `originalFetch`.
 */
function stubFetch(
  handler: (url: string, init: RequestInit) => Response
): void {
  const fake = mock(async (input: RequestInfo | URL, init?: RequestInit) => {
    // `fetch` accepts a string, a URL, or a Request. Calling toString() on a
    // Request yields "[object Request]", so read its `url` property instead.
    let url: string;
    if (typeof input === "string") {
      url = input;
    } else if (input instanceof URL) {
      url = input.href;
    } else {
      url = input.url;
    }
    return handler(url, init ?? {});
  });
  globalThis.fetch = fake as unknown as typeof globalThis.fetch;
}
57
+
describe("hydrateFromSnapshot", () => {
  // Arguments every hydrate call in this suite shares; only the target
  // file path varies (it lives under the per-test scratch directory).
  const hydrateArgs = (sessionFile: string) => ({
    sessionFile,
    gatewayUrl: "http://gw.test/lobu",
    workerToken: "test-jwt",
  });
  // Resolved lazily because `tmp` is reassigned before every test.
  const sessionPath = (): string => join(tmp, ".openclaw", "session.jsonl");

  test("boot-hydrate-fsync: writes bytes verbatim, file size matches body length", async () => {
    const sessionFile = sessionPath();
    const sessionLine = `{"type":"session","version":3,"id":"hydrate","timestamp":"2026-05-18T10:00:00Z","cwd":"/w"}`;
    const messageLine = `{"type":"message","id":"m1","parentId":null,"timestamp":"2026-05-18T10:00:01Z","message":{"role":"user","content":[{"type":"text","text":"resume"}]}}`;
    const expected = `${sessionLine}\n${messageLine}\n`;

    stubFetch((url, init) => {
      // The request must be an authenticated GET against the snapshot route.
      expect(url.endsWith("/worker/transcript/snapshot")).toBe(true);
      expect(init.method).toBe("GET");
      const headers = init.headers as Record<string, string>;
      expect(headers.Authorization).toBe("Bearer test-jwt");
      return new Response(expected, { status: 200 });
    });

    const hydrated = await hydrateFromSnapshot(hydrateArgs(sessionFile));
    expect(hydrated).toBe(true);

    // On-disk size equals the UTF-8 byte length of the served body, and
    // the content round-trips unchanged.
    const { size } = await fs.stat(sessionFile);
    expect(size).toBe(Buffer.byteLength(expected, "utf-8"));
    expect(await fs.readFile(sessionFile, "utf-8")).toBe(expected);
  });

  test("returns false on 404 and does not touch the file", async () => {
    const sessionFile = sessionPath();
    stubFetch(() => new Response("", { status: 404 }));

    expect(await hydrateFromSnapshot(hydrateArgs(sessionFile))).toBe(false);

    // stat succeeding would mean a file was created; it must not be.
    const exists = await fs.stat(sessionFile).then(
      () => true,
      () => false
    );
    expect(exists).toBe(false);
  });

  test("throws on non-2xx, non-404 — caller logs + continues with local file", async () => {
    const sessionFile = sessionPath();
    stubFetch(() => new Response("boom", { status: 500 }));

    const attempt = hydrateFromSnapshot(hydrateArgs(sessionFile));
    await expect(attempt).rejects.toThrow(/transcript hydrate failed: 500/);
  });
});
121
+
describe("writeSnapshot", () => {
  /** Create `<tmp>/.openclaw/session.jsonl` holding `contents`; returns its path. */
  async function seedSessionFile(contents: string): Promise<string> {
    const dir = join(tmp, ".openclaw");
    await fs.mkdir(dir, { recursive: true });
    const sessionFile = join(dir, "session.jsonl");
    await fs.writeFile(sessionFile, contents, "utf-8");
    return sessionFile;
  }

  /** Options for a run that finished with `terminalStatus: "completed"`. */
  function completedRun(sessionFile: string) {
    return {
      sessionFile,
      gatewayUrl: "http://gw.test/lobu",
      workerToken: "test-jwt",
      terminalStatus: "completed" as const,
      runId: 42,
    };
  }

  /** Stub fetch with `respond` and expose how many times it was invoked. */
  function countingFetch(respond: () => Response): { calls: number } {
    const counter = { calls: 0 };
    stubFetch(() => {
      counter.calls++;
      return respond();
    });
    return counter;
  }

  test("happy path: reads file, POSTs body + terminalStatus, gateway 200", async () => {
    const body =
      `{"type":"session","version":3,"id":"write","timestamp":"2026-05-18T10:00:00Z","cwd":"/w"}\n` +
      `{"type":"message","id":"u1","parentId":null,"timestamp":"2026-05-18T10:00:01Z","message":{"role":"user","content":[{"type":"text","text":"hi"}]}}\n`;
    const sessionFile = await seedSessionFile(body);

    const posted: string[] = [];
    stubFetch((url, init) => {
      expect(url.endsWith("/worker/transcript/snapshot")).toBe(true);
      expect(init.method).toBe("POST");
      posted.push(init.body as string);
      return new Response('{"id":1}', { status: 200 });
    });

    await writeSnapshot(completedRun(sessionFile));

    expect(posted).toHaveLength(1);
    const parsed = JSON.parse(posted[0] ?? "");
    expect(parsed.snapshotJsonl).toBe(body);
    expect(parsed.terminalStatus).toBe("completed");
    // P1#1: runId MUST be on the POST body so the route attributes the
    // snapshot to the exact run this worker claimed, not "latest run for
    // (org, agent, conv)".
    expect(parsed.runId).toBe(42);
  });

  test("non-completed terminalStatus is skipped (no POST, no waste)", async () => {
    // Hydrate filters terminal_status='completed' — writing failed/
    // timeout/cancelled rows is pure network waste. Codex round 2
    // quality win C on PR #865. The cleanup() path is also gated on
    // `terminalStatus === "completed"`, but writeSnapshot defends in
    // depth so any future caller can't accidentally write a row that
    // hydrate will never read.
    const sessionFile = await seedSessionFile(`{"type":"session"}\n`);
    const gateway = countingFetch(() => new Response("{}", { status: 200 }));

    for (const terminalStatus of ["failed", "timeout", "cancelled"] as const) {
      await writeSnapshot({ ...completedRun(sessionFile), terminalStatus });
    }
    expect(gateway.calls).toBe(0);
  });

  test("race-win-409 is benign — no throw", async () => {
    const sessionFile = await seedSessionFile(`{"type":"session"}\n`);
    const gateway = countingFetch(
      () => new Response("conflict", { status: 409 })
    );

    // No throw — cleanup() in the worker's dying breath must never
    // re-throw inside a `finally`.
    await expect(
      writeSnapshot(completedRun(sessionFile))
    ).resolves.toBeUndefined();
    // The 409 must come from a real POST attempt, not an early skip.
    expect(gateway.calls).toBe(1);
  });

  test("no session file (early-exit worker): silently skips, no fetch", async () => {
    const sessionFile = join(tmp, ".openclaw", "session.jsonl");
    const gateway = countingFetch(() => new Response("", { status: 200 }));

    // terminalStatus must be "completed" here: any other status returns at
    // the terminal-status gate before the file is read, so the missing-file
    // branch would never run and this test would pass vacuously.
    await expect(
      writeSnapshot(completedRun(sessionFile))
    ).resolves.toBeUndefined();
    expect(gateway.calls).toBe(0);
  });

  test("empty session file is skipped — never POST an empty snapshot", async () => {
    const sessionFile = await seedSessionFile("");
    const gateway = countingFetch(() => new Response("{}", { status: 200 }));

    await writeSnapshot(completedRun(sessionFile));
    expect(gateway.calls).toBe(0);
  });

  test("server 500 is logged, not thrown", async () => {
    const sessionFile = await seedSessionFile(`{"type":"session"}\n`);
    const gateway = countingFetch(() => new Response("boom", { status: 500 }));

    // No throw — same invariant as the 409 case. Logs go to pino; we
    // don't assert log content here.
    await expect(
      writeSnapshot(completedRun(sessionFile))
    ).resolves.toBeUndefined();
    expect(gateway.calls).toBe(1);
  });

  test("fetch throw is caught — cleanup() must never re-throw", async () => {
    const sessionFile = await seedSessionFile(`{"type":"session"}\n`);
    globalThis.fetch = (() => {
      throw new Error("ECONNREFUSED");
    }) as unknown as typeof globalThis.fetch;

    // No throw escapes — caller is the cleanup() finally block.
    await expect(
      writeSnapshot(completedRun(sessionFile))
    ).resolves.toBeUndefined();
  });
});
package/src/core/types.ts CHANGED
@@ -28,6 +28,24 @@ export interface WorkerConfig {
28
28
  workspace: {
29
29
  baseDirectory: string;
30
30
  };
31
+ /**
32
+ * The runs.id of the row that dispatched this job. Set by the gateway
33
+ * (MessageConsumer stamps it from the runs-queue claim's job.id) so the
34
+ * worker's cleanup() snapshot can attribute itself to the correct run
35
+ * even when a follow-up run for the same conversation has already been
36
+ * enqueued (codex P1#1 on PR #865). Optional for backward-compatibility
37
+ * with legacy direct-enqueue paths that don't go through the runs queue.
38
+ */
39
+ runId?: number;
40
+ /**
41
+ * Per-run worker JWT bound to `runId`. Set by MessageConsumer at
42
+ * dispatch time and used by cleanup()'s writeSnapshot call as the
43
+ * Authorization bearer — replaces the deployment-lifetime WORKER_TOKEN
44
+ * for the snapshot path so the gateway's route can require token-runId
45
+ * equality with body.runId (codex round 2 finding A on PR #865).
46
+ * When absent (legacy direct-enqueue), the snapshot write is skipped.
47
+ */
48
+ runJobToken?: string;
31
49
  }
32
50
 
33
51
  export interface WorkspaceSetupConfig {
@@ -81,20 +81,33 @@ const AgentOptionsSchema = z
81
81
  .passthrough();
82
82
 
83
83
  const JobEventSchema = z.object({
84
- payload: z.object({
85
- botId: z.string(),
86
- userId: z.string(),
87
- agentId: z.string(),
88
- conversationId: z.string(),
89
- platform: z.string(),
90
- channelId: z.string(),
91
- messageId: z.string(),
92
- messageText: z.string(),
93
- platformMetadata: PlatformMetadataSchema,
94
- agentOptions: AgentOptionsSchema,
95
- jobId: z.string().optional(),
96
- teamId: z.string().optional(), // Optional for WhatsApp (top-level) and Slack (in platformMetadata)
97
- }),
84
+ payload: z
85
+ .object({
86
+ botId: z.string(),
87
+ userId: z.string(),
88
+ agentId: z.string(),
89
+ conversationId: z.string(),
90
+ platform: z.string(),
91
+ channelId: z.string(),
92
+ messageId: z.string(),
93
+ messageText: z.string(),
94
+ platformMetadata: PlatformMetadataSchema,
95
+ agentOptions: AgentOptionsSchema,
96
+ jobId: z.string().optional(),
97
+ teamId: z.string().optional(), // Optional for WhatsApp (top-level) and Slack (in platformMetadata)
98
+ // Threaded through from MessageConsumer's runs-queue claim. The worker
99
+ // asserts these in snapshot mode (LOBU_SESSION_STORE != "file") — see
100
+ // worker.ts:353-360. The default zod object mode strips unknown keys,
101
+ // which silently dropped these fields and broke every Telegram chat
102
+ // when snapshot mode became the default in PR #871. Declare them
103
+ // explicitly so they survive parsing, and `.passthrough()` keeps any
104
+ // future MessagePayload field (mcpConfig, nixConfig, egressConfig,
105
+ // preApprovedTools, exec* fields, organizationId, networkConfig...)
106
+ // from regressing the same way.
107
+ runId: z.number().optional(),
108
+ runJobToken: z.string().optional(),
109
+ })
110
+ .passthrough(),
98
111
  processedIds: z.array(z.string()).optional(),
99
112
  });
100
113
 
@@ -919,6 +932,21 @@ export class GatewayClient {
919
932
  workspace: {
920
933
  baseDirectory: process.env.WORKSPACE_DIR || "/workspace",
921
934
  },
935
+ // Threaded through from MessageConsumer (set from the runs-queue
936
+ // claim's job.id). Used by cleanup() to attribute the snapshot to
937
+ // the correct run; codex P1#1 on PR #865.
938
+ runId:
939
+ typeof payload.runId === "number" && Number.isFinite(payload.runId)
940
+ ? payload.runId
941
+ : undefined,
942
+ // Per-run JWT minted by MessageConsumer alongside runId. Worker
943
+ // uses this for the snapshot POST instead of the deployment-
944
+ // lifetime WORKER_TOKEN, so the gateway can enforce
945
+ // tokenData.runId === body.runId — codex round 2 finding A.
946
+ runJobToken:
947
+ typeof payload.runJobToken === "string" && payload.runJobToken
948
+ ? payload.runJobToken
949
+ : undefined,
922
950
  };
923
951
  }
924
952
 
@@ -41,6 +41,21 @@ export interface MessagePayload {
41
41
  jobId?: string; // Optional job ID from gateway
42
42
  teamId?: string; // Optional team ID (WhatsApp uses top-level, Slack uses platformMetadata)
43
43
 
44
+ // The runs.id of the row that dispatched this job. Set by the gateway
45
+ // MessageConsumer (stamped from the runs-queue claim's job.id) and
46
+ // threaded into WorkerConfig.runId. The worker's cleanup() uses it to
47
+ // attribute the agent_transcript_snapshot row to the correct run —
48
+ // see codex P1#1 on PR #865.
49
+ runId?: number;
50
+
51
+ // Per-run worker JWT bound to `runId` above. Minted by MessageConsumer
52
+ // and threaded into WorkerConfig.runJobToken. The worker uses THIS
53
+ // token (not the deployment-lifetime WORKER_TOKEN) when calling the
54
+ // snapshot endpoint, so the route's `tokenData.runId === body.runId`
55
+ // equality check can reject any cross-run impersonation — codex round
56
+ // 2 finding A on PR #865.
57
+ runJobToken?: string;
58
+
44
59
  // Job type (default: "message")
45
60
  jobType?: JobType;
46
61
 
@@ -0,0 +1,238 @@
1
+ /**
2
+ * Per-run snapshot client for OpenClaw's `session.jsonl`.
3
+ *
4
+ * Why this exists: today's PVC-backed `workspaces/` directory is read-write-
5
+ * once, which forces the helm chart to pin `replicaCount: 1` for the
6
+ * gateway/worker. Mirroring the post-run session.jsonl to Postgres lets a
7
+ * second pod hydrate the file on boot and resume the conversation, which is
8
+ * the prerequisite for dropping the PVC (Phase 5, separate PR).
9
+ *
10
+ * Design contract for the next reader:
11
+ * - We do NOT fork or wrap `@mariozechner/pi-coding-agent`'s `SessionManager`.
12
+ * It owns the file on disk; we read it back at terminal time and write
13
+ * the bytes verbatim to PG. The next boot writes those bytes back to
14
+ * disk verbatim before SessionManager.open(), so SessionManager observes
15
+ * a byte-identical file to what it last wrote.
16
+ * - Called from `OpenClawWorker.cleanup()` at terminal time, but only
17
+ * `completed` runs are POSTed (`failed`/`timeout`/`cancelled` are skipped
18
+ * in `writeSnapshot`). Hydrate also filters `terminal_status='completed'`,
19
+ * so a failed run can't poison the next worker with a dangling `tool_use`
20
+ * content block. Older completed snapshots stay readable; hydrate takes the latest.
21
+ * - The worker is sandboxed — no PG access. Two new endpoints live on the
22
+ * existing worker gateway: `GET /worker/transcript/snapshot` for
23
+ * hydrate, `POST /worker/transcript/snapshot` for write. (org, agent,
24
+ * conv) are pulled from the worker JWT on the gateway side, so the
25
+ * worker can't impersonate another conversation.
26
+ * - Phase 5: snapshot mode is the default. `LOBU_SESSION_STORE=file`
27
+ * opts out for legacy/local-dev single-replica deploys. Phase 6
28
+ * drops the env var entirely.
29
+ *
30
+ * Trade-off accepted: a mid-run crash loses the partial transcript for that
31
+ * run. The next attempt re-runs from the previous user message. Tools must
32
+ * be idempotent (or accept user-visible re-execution).
33
+ */
34
+
35
+ import { promises as fs } from "node:fs";
36
+ import * as path from "node:path";
37
+ import { createLogger } from "@lobu/core";
38
+
// Module-scoped logger; every message from the snapshot helpers goes through it.
const logger = createLogger("transcript-snapshot");

/** How a worker run ended. Sent to the gateway as `terminalStatus`. */
export type TerminalStatus =
  | "completed"
  | "failed"
  | "timeout"
  | "cancelled";

/** Inputs shared by the snapshot helpers in this module. */
export interface TranscriptSnapshotOptions {
  /** Absolute path of the session.jsonl that SessionManager reads and writes. */
  sessionFile: string;
  /** Base URL of the gateway, for example `http://127.0.0.1:8787/lobu`. */
  gatewayUrl: string;
  /**
   * Worker JWT, sent as the `Authorization: Bearer` credential. The gateway
   * derives (org, agent, conv) from this token.
   */
  workerToken: string;
}
51
+
/**
 * Fetch the latest `terminal_status='completed'` snapshot for this worker's
 * (org, agent, conv) from the gateway and write it to `opts.sessionFile`.
 * Must run BEFORE SessionManager.open() so the rehydrated content is
 * visible at open time.
 *
 * The response body is written as raw bytes — it is never decoded to a
 * string — so the file on disk is byte-identical to what the gateway sent
 * (no BOM stripping, no replacement of malformed UTF-8 sequences).
 *
 * @param opts Session file path, gateway base URL and worker JWT.
 * @returns `true` if a snapshot was found and written; `false` if the
 *   gateway answered 404 (no completed snapshot yet), in which case the
 *   local file is left untouched.
 * @throws Error `transcript hydrate failed: <status> <statusText>` on any
 *   other non-2xx response. Network, timeout and filesystem errors also
 *   propagate — the caller decides whether to fall back to a fresh session.
 */
export async function hydrateFromSnapshot(
  opts: TranscriptSnapshotOptions
): Promise<boolean> {
  const url = `${opts.gatewayUrl}/worker/transcript/snapshot`;
  const res = await fetch(url, {
    method: "GET",
    headers: { Authorization: `Bearer ${opts.workerToken}` },
    signal: AbortSignal.timeout(30_000),
  });

  // 404 = no completed snapshot for this (org, agent, conv). First turn or
  // every previous attempt failed/timed out. Caller should start fresh.
  if (res.status === 404) {
    return false;
  }
  if (!res.ok) {
    throw new Error(
      `transcript hydrate failed: ${res.status} ${res.statusText}`
    );
  }

  const bytes = new Uint8Array(await res.arrayBuffer());
  await fs.mkdir(path.dirname(opts.sessionFile), { recursive: true });

  // Write and fsync through the SAME write handle. Opening with "w"
  // truncates any existing file; the write is not atomic, but no partial
  // state is visible to SessionManager.open() because that call runs after
  // this function resolves. Syncing via a write handle (rather than a
  // second read-only one) also works on platforms that refuse to flush a
  // read-only descriptor.
  const handle = await fs.open(opts.sessionFile, "w");
  try {
    await handle.writeFile(bytes);
    // Flush to disk so a pod crash between this return and
    // SessionManager.open() doesn't leave the file half-written. The cost
    // is one extra disk flush on every worker boot — acceptable.
    await handle.sync();
  } finally {
    await handle.close();
  }

  logger.info(
    `Hydrated session file from snapshot: ${bytes.byteLength} bytes → ${opts.sessionFile}`
  );
  return true;
}
103
+
/**
 * Read the session file in full and POST it to the gateway. Called at
 * terminal time from `OpenClawWorker.cleanup()`.
 *
 * Only `terminalStatus === "completed"` is ever sent: hydrate filters on
 * `terminal_status='completed'`, so a failed/timeout/cancelled snapshot
 * would never be read back. Those statuses return immediately without
 * touching the file or the network.
 *
 * This function never throws. A missing or unreadable session file, an
 * empty file, a non-2xx response (including 409 for an already-stored run)
 * and a rejected/timed-out fetch are all logged and swallowed — the caller
 * is a cleanup path in a worker that is already exiting.
 *
 * @param opts Session file path, gateway base URL, worker JWT, the run's
 *   terminal status, and the `runId` to attribute the snapshot to.
 */
export async function writeSnapshot(
  opts: TranscriptSnapshotOptions & {
    terminalStatus: TerminalStatus;
    /**
     * The runs.id this worker claimed. Sent in the POST body so the route
     * binds the snapshot to the correct run unambiguously; the route then
     * verifies the runId actually belongs to the JWT's (org, agent, conv)
     * tuple before INSERTing. Codex P1#1 on PR #865 — without this, the
     * route fell back to a "latest run for (org, agent, conv)" lookup
     * which raced with the next user message enqueuing a fresh run.
     */
    runId: number;
  }
): Promise<void> {
  // Skip at the source so any caller (cleanup() today, future paths
  // tomorrow) stays out of the wasteful write. Codex round 2 quality
  // win C on PR #865.
  if (opts.terminalStatus !== "completed") {
    logger.debug(
      `Skipping snapshot POST: terminal_status='${opts.terminalStatus}' is never read by hydrate`
    );
    return;
  }

  let body: string;
  try {
    body = await fs.readFile(opts.sessionFile, "utf-8");
  } catch (err) {
    // No session file = nothing to snapshot. Common when the worker exits
    // before SessionManager.open() ran (early error path).
    const isMissing =
      err instanceof Error && (err as NodeJS.ErrnoException).code === "ENOENT";
    if (isMissing) {
      logger.debug(`No session file at ${opts.sessionFile}; skipping snapshot`);
      return;
    }
    logger.warn(
      `Failed to read session file for snapshot: ${err instanceof Error ? err.message : String(err)}`
    );
    return;
  }

  if (body.length === 0) {
    logger.debug("Empty session file; skipping snapshot");
    return;
  }

  const url = `${opts.gatewayUrl}/worker/transcript/snapshot`;
  try {
    const res = await fetch(url, {
      method: "POST",
      headers: {
        Authorization: `Bearer ${opts.workerToken}`,
        "Content-Type": "application/json",
      },
      body: JSON.stringify({
        terminalStatus: opts.terminalStatus,
        snapshotJsonl: body,
        runId: opts.runId,
      }),
      // Snapshots can be large (633 KB max measured); 60s timeout covers
      // slow links + PG TOAST writes.
      signal: AbortSignal.timeout(60_000),
    });
    if (!res.ok) {
      // 409 = UNIQUE (org, agent, conv, run_id) collision. Means another
      // pod (or a retry) already wrote this snapshot — benign, drop it.
      if (res.status === 409) {
        logger.info(
          `Snapshot for run already exists (status=${opts.terminalStatus}); skipping duplicate`
        );
        return;
      }
      logger.error(`Snapshot POST failed: ${res.status} ${res.statusText}`);
      return;
    }
    // Report the UTF-8 size of the transcript, not the JS string length
    // (UTF-16 code units), so the logged figure really is a byte count.
    logger.info(
      `Wrote snapshot: ${Buffer.byteLength(body, "utf-8")} bytes, status=${opts.terminalStatus}`
    );
  } catch (err) {
    logger.error(
      `Snapshot POST threw: ${err instanceof Error ? err.message : String(err)}`
    );
    return;
  }
}
203
+
/**
 * Ask the gateway to purge all snapshot rows for this worker's
 * (org, agent, conv) by sending `DELETE /worker/transcript/snapshot`.
 * Called by the session-reset path so the next boot doesn't rehydrate the
 * conversation from Postgres after a `/new`.
 *
 * Idempotent: a 404 (nothing to purge) is treated as success.
 *
 * Never throws. Any other non-2xx response, and any fetch rejection or
 * timeout, is logged as a warning — reset is best-effort; if the purge
 * fails the worst case is the next boot hydrates from the previous
 * transcript (the legacy file-mode behaviour). The local session.jsonl
 * unlink is the primary signal; this is the multi-replica complement to it.
 *
 * @param opts Gateway base URL and the worker JWT used as bearer token.
 */
export async function clearSnapshots(
  opts: Pick<TranscriptSnapshotOptions, "gatewayUrl" | "workerToken">
): Promise<void> {
  const url = `${opts.gatewayUrl}/worker/transcript/snapshot`;
  try {
    const res = await fetch(url, {
      method: "DELETE",
      headers: { Authorization: `Bearer ${opts.workerToken}` },
      signal: AbortSignal.timeout(30_000),
    });
    // Nothing stored for this conversation: the desired end state already
    // holds, so don't warn about "stale history".
    if (res.status === 404) {
      logger.info("No conversation snapshots to purge for session reset");
      return;
    }
    if (!res.ok) {
      logger.warn(
        `Snapshot DELETE failed: ${res.status} ${res.statusText} — next boot may rehydrate stale history`
      );
      return;
    }
    logger.info("Purged conversation snapshots for session reset");
  } catch (err) {
    logger.warn(
      `Snapshot DELETE threw: ${err instanceof Error ? err.message : String(err)} — next boot may rehydrate stale history`
    );
  }
}