pi-oracle 0.1.12 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +37 -0
- package/README.md +24 -10
- package/docs/ORACLE_DESIGN.md +583 -0
- package/docs/ORACLE_RECOVERY_DRILL.md +127 -0
- package/extensions/oracle/index.ts +15 -4
- package/extensions/oracle/lib/commands.ts +35 -12
- package/extensions/oracle/lib/config.ts +2 -2
- package/extensions/oracle/lib/jobs.ts +438 -72
- package/extensions/oracle/lib/locks.ts +99 -13
- package/extensions/oracle/lib/poller.ts +223 -38
- package/extensions/oracle/lib/queue.ts +193 -0
- package/extensions/oracle/lib/runtime.ts +69 -15
- package/extensions/oracle/lib/tools.ts +274 -64
- package/extensions/oracle/worker/artifact-heuristics.d.mts +29 -0
- package/extensions/oracle/worker/auth-bootstrap.mjs +2 -72
- package/extensions/oracle/worker/auth-cookie-policy.d.mts +31 -0
- package/extensions/oracle/worker/run-job.mjs +330 -71
- package/extensions/oracle/worker/state-locks.d.mts +45 -0
- package/extensions/oracle/worker/state-locks.mjs +235 -0
- package/package.json +13 -4
- package/prompts/oracle.md +2 -0
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
/**
 * Cookie record as handed to {@link filterImportableAuthCookies}.
 * Only `name` is mandatory; every other attribute may be absent.
 */
export interface ImportedAuthCookie {
  name: string;
  value?: string;
  domain?: string;
  path?: string;
  /** Expiry as a number — NOTE(review): unit (seconds vs. milliseconds) is not visible here; confirm against the implementation. */
  expires?: number;
  httpOnly?: boolean;
  secure?: boolean;
  sameSite?: "Lax" | "Strict" | "None";
}

/**
 * Same shape as {@link ImportedAuthCookie}, but with `value`, `domain`, `path`,
 * `httpOnly` and `secure` guaranteed to be present. `expires` and `sameSite`
 * stay optional.
 */
export interface NormalizedAuthCookie extends ImportedAuthCookie {
  value: string;
  domain: string;
  path: string;
  httpOnly: boolean;
  secure: boolean;
}

/**
 * Splits imported cookies into the ones that are kept and the ones that are dropped.
 *
 * @param cookies Raw imported cookies.
 * @param chatUrl Chat URL the decision is made against (presumably used for domain matching — confirm in the .mjs implementation).
 * @returns `cookies`: the retained, normalized cookies; `dropped`: each rejected cookie paired with a textual reason.
 */
export function filterImportableAuthCookies(
  cookies: ImportedAuthCookie[],
  chatUrl: string,
): {
  cookies: NormalizedAuthCookie[];
  dropped: Array<{ cookie: NormalizedAuthCookie; reason: string }>;
};

/**
 * Returns a cookie list together with a flag telling whether a cookie was synthesized.
 *
 * @param cookies Already-normalized cookies.
 * @param chatUrl Chat URL the account cookie relates to.
 * @returns `cookies`: the resulting list; `synthesized`: whether something was added; `value`: optional associated value
 *          (presumably the account cookie's value — confirm in the implementation).
 */
export function ensureAccountCookie(
  cookies: NormalizedAuthCookie[],
  chatUrl: string,
): { cookies: NormalizedAuthCookie[]; synthesized: boolean; value?: string };
|
|
@@ -1,9 +1,11 @@
|
|
|
1
|
-
import { createHash } from "node:crypto";
|
|
2
|
-
import { existsSync } from "node:fs";
|
|
1
|
+
import { createHash, randomUUID } from "node:crypto";
|
|
2
|
+
import { existsSync, readdirSync, readFileSync } from "node:fs";
|
|
3
3
|
import { appendFile, chmod, mkdir, readFile, rename, rm, stat, writeFile } from "node:fs/promises";
|
|
4
4
|
import { basename, dirname, join } from "node:path";
|
|
5
|
-
import {
|
|
5
|
+
import { fileURLToPath } from "node:url";
|
|
6
|
+
import { spawn, execFileSync } from "node:child_process";
|
|
6
7
|
import { FILE_LABEL_PATTERN_SOURCE, filterStructuralArtifactCandidates, GENERIC_ARTIFACT_LABELS, parseSnapshotEntries } from "./artifact-heuristics.mjs";
|
|
8
|
+
import { createLease, listLeaseMetadata, readLeaseMetadata, releaseLease, withLock } from "./state-locks.mjs";
|
|
7
9
|
|
|
8
10
|
const jobId = process.argv[2];
|
|
9
11
|
if (!jobId) {
|
|
@@ -29,10 +31,9 @@ const MODEL_FAMILY_PREFIX = {
|
|
|
29
31
|
pro: "Pro ",
|
|
30
32
|
};
|
|
31
33
|
|
|
34
|
+
const WORKER_SCRIPT_PATH = fileURLToPath(import.meta.url);
|
|
32
35
|
const DEFAULT_ORACLE_STATE_DIR = "/tmp/pi-oracle-state";
|
|
33
36
|
const ORACLE_STATE_DIR = process.env.PI_ORACLE_STATE_DIR?.trim() || DEFAULT_ORACLE_STATE_DIR;
|
|
34
|
-
const LOCKS_DIR = join(ORACLE_STATE_DIR, "locks");
|
|
35
|
-
const LEASES_DIR = join(ORACLE_STATE_DIR, "leases");
|
|
36
37
|
const SEED_GENERATION_FILE = ".oracle-seed-generation";
|
|
37
38
|
const ARTIFACT_CANDIDATE_STABILITY_TIMEOUT_MS = 15_000;
|
|
38
39
|
const ARTIFACT_CANDIDATE_STABILITY_POLL_MS = 1_500;
|
|
@@ -61,23 +62,6 @@ async function ensurePrivateDir(path) {
|
|
|
61
62
|
await chmod(path, 0o700).catch(() => undefined);
|
|
62
63
|
}
|
|
63
64
|
|
|
64
|
-
function leaseKey(kind, key) {
|
|
65
|
-
return `${kind}-${createHash("sha256").update(key).digest("hex").slice(0, 24)}`;
|
|
66
|
-
}
|
|
67
|
-
|
|
68
|
-
async function readLockProcessPid(path) {
|
|
69
|
-
const metadataPath = join(path, "metadata.json");
|
|
70
|
-
if (!existsSync(metadataPath)) return undefined;
|
|
71
|
-
try {
|
|
72
|
-
const metadata = JSON.parse(await readFile(metadataPath, "utf8"));
|
|
73
|
-
return typeof metadata?.processPid === "number" && Number.isInteger(metadata.processPid) && metadata.processPid > 0
|
|
74
|
-
? metadata.processPid
|
|
75
|
-
: undefined;
|
|
76
|
-
} catch {
|
|
77
|
-
return undefined;
|
|
78
|
-
}
|
|
79
|
-
}
|
|
80
|
-
|
|
81
65
|
function isProcessAlive(pid) {
|
|
82
66
|
try {
|
|
83
67
|
process.kill(pid, 0);
|
|
@@ -88,53 +72,69 @@ function isProcessAlive(pid) {
|
|
|
88
72
|
}
|
|
89
73
|
}
|
|
90
74
|
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
75
|
+
/**
 * Reads the start timestamp of a process by invoking `ps -o lstart= -p <pid>`.
 *
 * The trimmed `ps` output is returned verbatim; callers only compare it for
 * equality to detect PID reuse.
 *
 * @param pid Process id to inspect. Falsy or non-positive values short-circuit.
 * @returns The trimmed `lstart` text, or `undefined` when the pid is invalid,
 *          `ps` prints nothing, or the `ps` invocation fails or times out.
 */
function readProcessStartedAt(pid) {
  if (!pid || pid <= 0) return undefined;
  try {
    const startedAt = execFileSync("ps", ["-o", "lstart=", "-p", String(pid)], {
      encoding: "utf8",
      // Keep `ps` diagnostics out of the worker's own stderr; only stdout is consumed.
      stdio: ["ignore", "pipe", "ignore"],
      // This call is synchronous: never let a wedged `ps` block the event loop forever.
      timeout: 5_000,
    }).trim();
    return startedAt || undefined;
  } catch {
    return undefined;
  }
}
|
|
97
84
|
|
|
98
|
-
async function
|
|
99
|
-
const path = join(LOCKS_DIR, leaseKey(kind, key));
|
|
85
|
+
/**
 * Polls `readProcessStartedAt` until it yields a value or the timeout elapses.
 *
 * @param pid Process id to wait for.
 * @param timeoutMs Maximum time to keep polling, in milliseconds (default 2000).
 * @returns The start timestamp text, or `undefined` if none could be read in time.
 */
async function waitForProcessStartedAt(pid, timeoutMs = 2_000) {
  // An invalid pid (e.g. `child.pid` after a failed spawn) can never resolve;
  // return immediately instead of polling for the whole timeout.
  if (!pid || pid <= 0) return undefined;

  const deadline = Date.now() + timeoutMs;
  while (Date.now() < deadline) {
    const startedAt = readProcessStartedAt(pid);
    if (startedAt) return startedAt;
    await sleep(100);
  }
  // One last attempt after the deadline so a late-appearing process is still seen.
  return readProcessStartedAt(pid);
}
|
|
118
94
|
|
|
119
|
-
async function
|
|
120
|
-
if (!
|
|
121
|
-
|
|
122
|
-
|
|
95
|
+
/**
 * Stops a worker process: SIGTERM first, then SIGKILL if it outlives the grace period.
 *
 * @param pid Process id of the worker. Falsy or non-positive values count as "already gone".
 * @param startedAt Expected start timestamp (as produced by `readProcessStartedAt`); when given,
 *                  it is used to recognise that the pid now belongs to a different process.
 * @param options Optional `termGraceMs` (default 5000) and `killGraceMs` (default 2000).
 * @returns `true` when the worker is no longer observed (or the pid was taken over after signalling);
 *          `false` when the pid did not match `startedAt` up front or the process is still there at the end.
 */
async function terminateWorkerPid(pid, startedAt, options = {}) {
  if (!pid || pid <= 0) return true;

  const initialStartedAt = readProcessStartedAt(pid);
  if (!initialStartedAt) return true;
  // The pid belongs to some other process: do not signal it.
  if (startedAt && initialStartedAt !== startedAt) return false;

  const termGraceMs = options.termGraceMs ?? 5_000;
  const killGraceMs = options.killGraceMs ?? 2_000;

  // True once the process is gone or the pid is observed with a different start time.
  const workerIsGone = () => {
    const observed = readProcessStartedAt(pid);
    if (!observed) return true;
    return startedAt ? observed !== startedAt : false;
  };

  const escalation = [
    ["SIGTERM", termGraceMs],
    ["SIGKILL", killGraceMs],
  ];
  for (const [signal, graceMs] of escalation) {
    try {
      process.kill(pid, signal);
    } catch {
      // Signalling failed; report success only if the process is not alive.
      return !isProcessAlive(pid);
    }
    const deadline = Date.now() + graceMs;
    while (Date.now() < deadline) {
      if (workerIsGone()) return true;
      await sleep(250);
    }
  }

  return workerIsGone();
}
|
|
137
136
|
|
|
137
|
+
|
|
138
138
|
async function secureWriteText(path, content) {
|
|
139
139
|
const tmpPath = `${path}.${process.pid}.${Date.now()}.tmp`;
|
|
140
140
|
await writeFile(tmpPath, content, { encoding: "utf8", mode: 0o600 });
|
|
@@ -156,18 +156,79 @@ async function readJob() {
|
|
|
156
156
|
return readJobUnlocked();
|
|
157
157
|
}
|
|
158
158
|
|
|
159
|
+
/**
 * Builds the directory path of an arbitrary job: `<ORACLE_JOBS_DIR>/oracle-<id>`.
 *
 * @param targetJobId Id of the job whose directory is wanted.
 * @returns The joined directory path.
 */
function getAnyJobDir(targetJobId) {
  const dirName = `oracle-${targetJobId}`;
  return join(ORACLE_JOBS_DIR, dirName);
}
|
|
162
|
+
|
|
163
|
+
/**
 * Builds the path of the `job.json` file inside an arbitrary job's directory.
 *
 * @param targetJobId Id of the job.
 * @returns Path to that job's `job.json`.
 */
function getAnyJobPath(targetJobId) {
  const jobDir = getAnyJobDir(targetJobId);
  return join(jobDir, "job.json");
}
|
|
166
|
+
|
|
167
|
+
/**
 * Synchronously loads and parses the `job.json` of an arbitrary job.
 *
 * @param targetJobId Id of the job to read.
 * @returns The parsed JSON value, or `undefined` when the file does not exist,
 *          cannot be read, or does not contain valid JSON.
 */
function readAnyJob(targetJobId) {
  const jobFile = getAnyJobPath(targetJobId);
  if (existsSync(jobFile)) {
    try {
      const raw = readFileSync(jobFile, "utf8");
      return JSON.parse(raw);
    } catch {
      // Unreadable or malformed job file: treated the same as a missing one.
    }
  }
  return undefined;
}
|
|
176
|
+
|
|
177
|
+
/**
 * Lists every job under `ORACLE_JOBS_DIR` whose status is `"queued"`, oldest first.
 *
 * Ordering: by `queuedAt` (falling back to `createdAt`), then by `createdAt`, then by `id`.
 * All sort keys are coerced to strings so that a job file carrying a non-string
 * timestamp cannot make the comparator throw and abort the whole listing.
 *
 * @returns The parsed queued job records; an empty array when the jobs directory does not exist.
 */
function listQueuedJobs() {
  if (!existsSync(ORACLE_JOBS_DIR)) return [];

  const dirPrefix = "oracle-";
  const queueKey = (job) => String(job?.queuedAt || job?.createdAt || "");
  const createdKey = (job) => String(job?.createdAt || "");
  const idKey = (job) => String(job?.id || "");

  return readdirSync(ORACLE_JOBS_DIR)
    .filter((name) => name.startsWith(dirPrefix))
    .map((name) => readAnyJob(name.slice(dirPrefix.length)))
    .filter((job) => job?.status === "queued")
    .sort(
      (left, right) =>
        queueKey(left).localeCompare(queueKey(right)) ||
        createdKey(left).localeCompare(createdKey(right)) ||
        idKey(left).localeCompare(idKey(right)),
    );
}
|
|
189
|
+
|
|
190
|
+
/**
 * Tells whether a status string denotes a job that is currently in flight.
 *
 * @param status Job status; falsy values are treated as the empty string.
 * @returns `true` for `"preparing"`, `"submitted"` and `"waiting"`, otherwise `false`.
 */
function isActiveJobStatus(status) {
  switch (String(status || "")) {
    case "preparing":
    case "submitted":
    case "waiting":
      return true;
    default:
      return false;
  }
}
|
|
193
|
+
|
|
194
|
+
/**
 * Decides whether a job still occupies capacity for admission purposes.
 *
 * A job blocks admission when its status is active, when `cleanupPending` is exactly `true`,
 * or when it carries a non-empty `cleanupWarnings` array.
 *
 * @param job Job record; may be `undefined`.
 * @returns `true` if the job blocks admission, otherwise `false`.
 */
function jobBlocksAdmission(job) {
  if (isActiveJobStatus(job?.status)) return true;
  if (job?.cleanupPending === true) return true;
  const pendingWarnings = job?.cleanupWarnings;
  return Array.isArray(pendingWarnings) && pendingWarnings.length > 0;
}
|
|
197
|
+
|
|
198
|
+
/**
 * Tells whether a job record shows that a worker has been attached to it.
 *
 * @param job Job record; may be missing.
 * @returns `true` only when the job exists, is not `"queued"`, and has a truthy `workerPid`.
 */
function hasDurableWorkerHandoff(job) {
  return Boolean(job && job.status !== "queued" && job.workerPid);
}
|
|
203
|
+
|
|
204
|
+
/**
 * Read-modify-write of an arbitrary job's `job.json`, performed inside `withLock` for that job.
 *
 * @param targetJobId Id of the job to update.
 * @param mutator Receives the parsed current record and returns the record to persist.
 *                An exception thrown here propagates and nothing is written.
 * @returns The record returned by `mutator`, after it has been written.
 */
async function mutateAnyJob(targetJobId, mutator) {
  const lockMetadata = { processPid: process.pid, action: "mutateJob", targetJobId };

  const applyMutation = async () => {
    const jobFile = getAnyJobPath(targetJobId);
    const before = JSON.parse(await readFile(jobFile, "utf8"));
    const after = mutator(before);
    await secureWriteText(jobFile, `${JSON.stringify(after, null, 2)}\n`);
    return after;
  };

  return withLock(ORACLE_STATE_DIR, "job", targetJobId, lockMetadata, applyMutation);
}
|
|
213
|
+
|
|
214
|
+
/**
 * Overwrites an arbitrary job's `job.json` with the given record, inside `withLock` for that job.
 *
 * @param targetJobId Id of the job whose file is written.
 * @param job Record to serialize (pretty-printed JSON followed by a newline).
 */
async function writeAnyJob(targetJobId, job) {
  const lockMetadata = { processPid: process.pid, action: "writeJob", targetJobId };
  const persist = async () => {
    const serialized = `${JSON.stringify(job, null, 2)}\n`;
    await secureWriteText(getAnyJobPath(targetJobId), serialized);
  };
  await withLock(ORACLE_STATE_DIR, "job", targetJobId, lockMetadata, persist);
}
|
|
219
|
+
|
|
159
220
|
/**
 * Serializes this worker's job record to `jobPath` without taking the job lock.
 *
 * @param job Record to write (pretty-printed JSON followed by a newline).
 */
async function writeJobUnlocked(job) {
  const serialized = `${JSON.stringify(job, null, 2)}\n`;
  await secureWriteText(jobPath, serialized);
}
|
|
162
223
|
|
|
163
224
|
/**
 * Writes this worker's job record while holding the lock for `jobId`.
 *
 * @param job Record to persist via `writeJobUnlocked`.
 */
async function writeJob(job) {
  const lockMetadata = { processPid: process.pid, action: "writeJob" };
  const persist = async () => {
    await writeJobUnlocked(job);
  };
  await withLock(ORACLE_STATE_DIR, "job", jobId, lockMetadata, persist);
}
|
|
168
229
|
|
|
169
230
|
async function mutateJob(mutator) {
|
|
170
|
-
return withLock("job", jobId, { processPid: process.pid, action: "mutateJob" }, async () => {
|
|
231
|
+
return withLock(ORACLE_STATE_DIR, "job", jobId, { processPid: process.pid, action: "mutateJob" }, async () => {
|
|
171
232
|
const job = await readJobUnlocked();
|
|
172
233
|
const next = mutator(job);
|
|
173
234
|
await writeJobUnlocked(next);
|
|
@@ -271,7 +332,7 @@ async function cloneSeedProfileToRuntime(job) {
|
|
|
271
332
|
const seedGenerationPath = join(seedDir, SEED_GENERATION_FILE);
|
|
272
333
|
const seedGeneration = existsSync(seedGenerationPath) ? (await readFile(seedGenerationPath, "utf8")).trim() || undefined : undefined;
|
|
273
334
|
|
|
274
|
-
await withLock("auth", "global", { jobId: job.id, processPid: process.pid, action: "cloneSeedProfile" }, async () => {
|
|
335
|
+
await withLock(ORACLE_STATE_DIR, "auth", "global", { jobId: job.id, processPid: process.pid, action: "cloneSeedProfile" }, async () => {
|
|
275
336
|
await rm(job.runtimeProfileDir, { recursive: true, force: true }).catch(() => undefined);
|
|
276
337
|
await ensurePrivateDir(dirname(job.runtimeProfileDir));
|
|
277
338
|
const cloneArgs = job.config.browser.cloneStrategy === "apfs-clone" ? ["-cR", seedDir, job.runtimeProfileDir] : ["-R", seedDir, job.runtimeProfileDir];
|
|
@@ -282,7 +343,7 @@ async function cloneSeedProfileToRuntime(job) {
|
|
|
282
343
|
}
|
|
283
344
|
|
|
284
345
|
async function cleanupRuntime(job) {
|
|
285
|
-
if (!job || cleaningUpRuntime) return;
|
|
346
|
+
if (!job || cleaningUpRuntime) return [];
|
|
286
347
|
cleaningUpRuntime = true;
|
|
287
348
|
const warnings = [];
|
|
288
349
|
try {
|
|
@@ -291,31 +352,202 @@ async function cleanupRuntime(job) {
|
|
|
291
352
|
warnings.push(message);
|
|
292
353
|
await log(message).catch(() => undefined);
|
|
293
354
|
});
|
|
294
|
-
await releaseLease("conversation", job.conversationId).catch(async (error) => {
|
|
295
|
-
const message = `Conversation lease cleanup warning: ${error instanceof Error ? error.message : String(error)}`;
|
|
296
|
-
warnings.push(message);
|
|
297
|
-
await log(message).catch(() => undefined);
|
|
298
|
-
});
|
|
299
|
-
await releaseLease("runtime", job.runtimeId).catch(async (error) => {
|
|
300
|
-
const message = `Runtime lease cleanup warning: ${error instanceof Error ? error.message : String(error)}`;
|
|
301
|
-
warnings.push(message);
|
|
302
|
-
await log(message).catch(() => undefined);
|
|
303
|
-
});
|
|
304
355
|
await rm(job.runtimeProfileDir, { recursive: true, force: true }).catch(async (error) => {
|
|
305
356
|
const message = `Runtime profile cleanup warning: ${error instanceof Error ? error.message : String(error)}`;
|
|
306
357
|
warnings.push(message);
|
|
307
358
|
await log(message).catch(() => undefined);
|
|
308
359
|
});
|
|
360
|
+
if (warnings.length === 0) {
|
|
361
|
+
await releaseLease(ORACLE_STATE_DIR, "conversation", job.conversationId).catch(async (error) => {
|
|
362
|
+
const message = `Conversation lease cleanup warning: ${error instanceof Error ? error.message : String(error)}`;
|
|
363
|
+
warnings.push(message);
|
|
364
|
+
await log(message).catch(() => undefined);
|
|
365
|
+
});
|
|
366
|
+
await releaseLease(ORACLE_STATE_DIR, "runtime", job.runtimeId).catch(async (error) => {
|
|
367
|
+
const message = `Runtime lease cleanup warning: ${error instanceof Error ? error.message : String(error)}`;
|
|
368
|
+
warnings.push(message);
|
|
369
|
+
await log(message).catch(() => undefined);
|
|
370
|
+
});
|
|
371
|
+
}
|
|
309
372
|
if (warnings.length === 0) {
|
|
310
373
|
await log(`Cleanup summary: runtime ${job.runtimeId} released with no warnings`).catch(() => undefined);
|
|
311
374
|
} else {
|
|
312
375
|
await log(`Cleanup summary: runtime ${job.runtimeId} released with ${warnings.length} warning(s)`).catch(() => undefined);
|
|
313
376
|
}
|
|
377
|
+
return warnings;
|
|
314
378
|
} finally {
|
|
315
379
|
cleaningUpRuntime = false;
|
|
316
380
|
}
|
|
317
381
|
}
|
|
318
382
|
|
|
383
|
+
/**
 * Attempts to reserve a runtime slot for `job`.
 *
 * Every existing runtime lease is inspected: if its owning job no longer blocks admission
 * (or cannot be read), the lease is released best-effort; otherwise it counts against
 * `job.config.browser.maxConcurrentJobs`.
 *
 * @param job Job that needs a runtime lease.
 * @param createdAt Timestamp stored in the new lease's metadata.
 * @returns `false` when the concurrency limit is already reached; `true` after the lease was created.
 */
async function tryAcquireRuntimeLeaseForJob(job, createdAt) {
  let blockingLeaseCount = 0;

  for (const lease of listLeaseMetadata(ORACLE_STATE_DIR, "runtime")) {
    const ownerJob = lease?.jobId ? readAnyJob(lease.jobId) : undefined;
    if (jobBlocksAdmission(ownerJob)) {
      blockingLeaseCount += 1;
    } else {
      // Owner is finished or unknown: reclaim its slot, ignoring release failures.
      await releaseLease(ORACLE_STATE_DIR, "runtime", lease?.runtimeId).catch(() => undefined);
    }
  }

  const capacity = job.config.browser.maxConcurrentJobs;
  if (blockingLeaseCount >= capacity) return false;

  const leaseMetadata = {
    jobId: job.id,
    runtimeId: job.runtimeId,
    runtimeSessionName: job.runtimeSessionName,
    runtimeProfileDir: job.runtimeProfileDir,
    projectId: job.projectId,
    sessionId: job.sessionId,
    createdAt,
  };
  await createLease(ORACLE_STATE_DIR, "runtime", job.runtimeId, leaseMetadata);
  return true;
}
|
|
408
|
+
|
|
409
|
+
/**
 * Attempts to reserve the conversation that `job` targets.
 *
 * @param job Job that wants the conversation lease. Jobs without `conversationId` need none.
 * @param createdAt Timestamp stored in the new lease's metadata.
 * @returns `true` when no lease is needed, the job already holds it, or it was created now;
 *          `false` when another job that still blocks admission holds it.
 */
async function tryAcquireConversationLeaseForJob(job, createdAt) {
  const conversationId = job.conversationId;
  if (!conversationId) return true;

  const holder = await readLeaseMetadata(ORACLE_STATE_DIR, "conversation", conversationId);
  if (holder?.jobId === job.id) return true;

  if (holder) {
    // Held by a different job: only take over when that job no longer blocks admission.
    if (jobBlocksAdmission(readAnyJob(holder.jobId))) return false;
    await releaseLease(ORACLE_STATE_DIR, "conversation", conversationId).catch(() => undefined);
  }

  const leaseMetadata = {
    jobId: job.id,
    conversationId: job.conversationId,
    projectId: job.projectId,
    sessionId: job.sessionId,
    createdAt,
  };
  await createLease(ORACLE_STATE_DIR, "conversation", conversationId, leaseMetadata);
  return true;
}
|
|
429
|
+
|
|
430
|
+
/**
 * Launches a detached worker process (this same script) for the given job.
 *
 * @param targetJobId Id of the job, passed to the worker as its first argument.
 * @returns `pid` of the child, a fresh random `workerNonce`, and `workerStartedAt`
 *          (the start timestamp read back for that pid, possibly `undefined`).
 * @throws Error when the child could not be spawned (no pid was assigned).
 */
async function spawnDetachedWorker(targetJobId) {
  const child = spawn(process.execPath, [WORKER_SCRIPT_PATH, targetJobId], {
    detached: true,
    stdio: "ignore",
  });

  // A failed spawn is reported through an asynchronous 'error' event. Without a
  // listener that event is thrown as an uncaught exception and kills this worker.
  let spawnError;
  child.on("error", (error) => {
    spawnError = error;
  });
  child.unref();

  if (!child.pid) {
    // Give the pending 'error' event a chance to fire so its message can be reported.
    await new Promise((resolve) => setImmediate(resolve));
    const reason = spawnError instanceof Error ? spawnError.message : "no pid assigned";
    throw new Error(`Failed to spawn oracle worker for ${targetJobId}: ${reason}`);
  }

  return {
    pid: child.pid,
    workerNonce: randomUUID(),
    workerStartedAt: await waitForProcessStartedAt(child.pid),
  };
}
|
|
442
|
+
|
|
443
|
+
/**
 * Marks a job as `failed` because it could not be promoted out of the queue.
 *
 * Jobs that are already `complete`, `failed` or `cancelled` are left untouched.
 * This is best-effort: a failure to persist the change never throws, but it is
 * logged so that a job stuck in a non-terminal state can be diagnosed.
 *
 * @param targetJobId Id of the job to fail.
 * @param message Error text stored on the job.
 * @param at ISO timestamp used for `completedAt` / `heartbeatAt` (defaults to now).
 */
async function failQueuedPromotion(targetJobId, message, at = new Date().toISOString()) {
  try {
    await mutateAnyJob(targetJobId, (latest) => {
      if (["complete", "failed", "cancelled"].includes(String(latest.status || ""))) return latest;
      return {
        ...latest,
        ...phasePatch("failed", {
          status: "failed",
          completedAt: at,
          heartbeatAt: at,
          error: message,
        }, at),
      };
    });
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    await log(`Could not mark queued job ${targetJobId} as failed: ${reason}`).catch(() => undefined);
  }
}
|
|
457
|
+
|
|
458
|
+
/**
 * After this worker has cleaned up, promotes queued jobs into running workers.
 *
 * Runs under the global "admission" lock. Queued jobs are visited in `listQueuedJobs()` order:
 *  - a job whose archive file is missing is failed and skipped;
 *  - when no runtime lease can be obtained the whole pass stops;
 *  - when the conversation lease is unavailable the runtime lease is returned and the job is skipped;
 *  - otherwise the job is flipped to "submitted", a detached worker is spawned, and the worker
 *    identity is recorded on the job.
 * If promotion fails part-way, the spawned worker (if any) is terminated, the job is failed, and
 * its runtime is cleaned up; leftover cleanup warnings are recorded on the job and stop the pass.
 * Errors escaping the locked section are logged, never thrown.
 */
async function promoteQueuedJobsAfterCleanup() {
  const terminalStatuses = ["complete", "failed", "cancelled"];
  const describeError = (value) => (value instanceof Error ? value.message : String(value));
  const lockMetadata = { processPid: process.pid, source: "worker_cleanup_promoter", jobId };

  const promoteAll = async () => {
    for (const listed of listQueuedJobs()) {
      // Re-read: the job may have changed since the directory was listed.
      const candidate = readAnyJob(listed.id);
      if (candidate?.status !== "queued") continue;

      let worker;
      const promotedAt = new Date().toISOString();

      if (!existsSync(candidate.archivePath)) {
        await failQueuedPromotion(candidate.id, `Queued oracle archive is missing: ${candidate.archivePath}`, promotedAt);
        continue;
      }

      // No runtime capacity: later jobs cannot be admitted either.
      if (!(await tryAcquireRuntimeLeaseForJob(candidate, promotedAt))) break;

      if (!(await tryAcquireConversationLeaseForJob(candidate, promotedAt))) {
        await releaseLease(ORACLE_STATE_DIR, "runtime", candidate.runtimeId).catch(() => undefined);
        continue;
      }

      try {
        await mutateAnyJob(candidate.id, (latest) => {
          if (latest.status !== "queued") {
            throw new Error(`Queued job ${latest.id} changed state during cleanup promotion (${latest.status})`);
          }
          const submittedPatch = phasePatch("submitted", {
            status: "submitted",
            submittedAt: latest.submittedAt || promotedAt,
          }, promotedAt);
          return { ...latest, ...submittedPatch };
        });

        worker = await spawnDetachedWorker(candidate.id);

        await mutateAnyJob(candidate.id, (latest) => {
          // Values already recorded on a handed-off job win over ours.
          const keepExisting = hasDurableWorkerHandoff(latest);
          return {
            ...latest,
            workerPid: (keepExisting && latest.workerPid) || worker.pid,
            workerNonce: (keepExisting && latest.workerNonce) || worker.workerNonce,
            workerStartedAt: (keepExisting && latest.workerStartedAt) || worker.workerStartedAt,
          };
        });
      } catch (promotionError) {
        const latest = readAnyJob(candidate.id);
        if (hasDurableWorkerHandoff(latest)) {
          await log(`Queued promotion handoff already durable for ${candidate.id}; leaving active job intact`).catch(() => undefined);
          continue;
        }

        if (worker) {
          await terminateWorkerPid(worker.pid, worker.workerStartedAt).catch(() => undefined);
        }

        const failedAt = new Date().toISOString();
        const alreadyTerminal = terminalStatuses.includes(String(latest?.status || ""));
        if (latest && !alreadyTerminal) {
          await failQueuedPromotion(candidate.id, describeError(promotionError), failedAt);
        }

        if (!worker) {
          // Nothing was spawned, so only the leases taken above need to be returned.
          await releaseLease(ORACLE_STATE_DIR, "conversation", candidate.conversationId).catch(() => undefined);
          await releaseLease(ORACLE_STATE_DIR, "runtime", candidate.runtimeId).catch(() => undefined);
          continue;
        }

        let cleanupWarnings = [];
        try {
          cleanupWarnings = await cleanupRuntime(candidate);
        } catch (cleanupError) {
          const message = `Cleanup-driven promotion teardown warning for ${candidate.id}: ${cleanupError instanceof Error ? cleanupError.message : String(cleanupError)}`;
          cleanupWarnings = [message];
          await log(message).catch(() => undefined);
        }

        if (cleanupWarnings.length > 0) {
          await mutateAnyJob(candidate.id, (job) => ({
            ...job,
            cleanupWarnings: [...(job.cleanupWarnings || []), ...cleanupWarnings],
            lastCleanupAt: failedAt,
            error: [job.error, ...cleanupWarnings].filter(Boolean).join("\n"),
          })).catch(() => undefined);
          await log(`Stopping queued cleanup promotion after ${candidate.id} because teardown left ${cleanupWarnings.length} warning(s)`).catch(() => undefined);
          break;
        }
      }
    }
  };

  await withLock(ORACLE_STATE_DIR, "admission", "global", lockMetadata, promoteAll).catch(async (error) => {
    await log(`Queued cleanup promotion warning: ${error instanceof Error ? error.message : String(error)}`).catch(() => undefined);
  });
}
|
|
550
|
+
|
|
319
551
|
function browserBaseArgs(job, options = {}) {
|
|
320
552
|
const args = ["--session", job.runtimeSessionName];
|
|
321
553
|
if (options.withLaunchOptions) {
|
|
@@ -1535,6 +1767,7 @@ async function run() {
|
|
|
1535
1767
|
responsePath: currentJob.responsePath,
|
|
1536
1768
|
responseFormat: "text/plain",
|
|
1537
1769
|
artifactFailureCount,
|
|
1770
|
+
cleanupPending: true,
|
|
1538
1771
|
}),
|
|
1539
1772
|
{ force: true },
|
|
1540
1773
|
);
|
|
@@ -1551,13 +1784,39 @@ async function run() {
|
|
|
1551
1784
|
status: "failed",
|
|
1552
1785
|
completedAt: new Date().toISOString(),
|
|
1553
1786
|
error: message,
|
|
1787
|
+
cleanupPending: true,
|
|
1554
1788
|
}),
|
|
1555
1789
|
{ force: true },
|
|
1556
1790
|
);
|
|
1557
1791
|
process.exitCode = 1;
|
|
1558
1792
|
}
|
|
1559
1793
|
} finally {
|
|
1560
|
-
|
|
1794
|
+
let cleanupWarnings = [];
|
|
1795
|
+
try {
|
|
1796
|
+
cleanupWarnings = await cleanupRuntime(currentJob);
|
|
1797
|
+
} catch (error) {
|
|
1798
|
+
cleanupWarnings = [`Runtime cleanup failed before queued promotion: ${error instanceof Error ? error.message : String(error)}`];
|
|
1799
|
+
await log(cleanupWarnings[0]).catch(() => undefined);
|
|
1800
|
+
}
|
|
1801
|
+
if (currentJob?.id) {
|
|
1802
|
+
const cleanupAt = new Date().toISOString();
|
|
1803
|
+
await mutateJob((job) => ({
|
|
1804
|
+
...job,
|
|
1805
|
+
cleanupPending: false,
|
|
1806
|
+
...(cleanupWarnings.length > 0
|
|
1807
|
+
? {
|
|
1808
|
+
cleanupWarnings: [...(job.cleanupWarnings || []), ...cleanupWarnings],
|
|
1809
|
+
lastCleanupAt: cleanupAt,
|
|
1810
|
+
error: [job.error, ...cleanupWarnings].filter(Boolean).join("\n"),
|
|
1811
|
+
}
|
|
1812
|
+
: { lastCleanupAt: cleanupAt }),
|
|
1813
|
+
})).catch(() => undefined);
|
|
1814
|
+
}
|
|
1815
|
+
if (cleanupWarnings.length === 0) {
|
|
1816
|
+
await promoteQueuedJobsAfterCleanup().catch(() => undefined);
|
|
1817
|
+
} else {
|
|
1818
|
+
await log(`Skipping queued promotion because runtime cleanup left ${cleanupWarnings.length} warning(s)`).catch(() => undefined);
|
|
1819
|
+
}
|
|
1561
1820
|
}
|
|
1562
1821
|
}
|
|
1563
1822
|
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
/** Grace period in milliseconds — NOTE(review): exact use is defined in state-locks.mjs, which is not visible here. */
export const ORACLE_METADATA_WRITE_GRACE_MS: number;

/**
 * Acquires the lock identified by `(kind, key)` under `stateDir`.
 *
 * @param metadata Arbitrary data associated with the lock.
 * @param timeoutMs Optional wait limit (presumably milliseconds — confirm in the implementation).
 * @returns A string handle; presumably the value later passed to {@link releaseLock}.
 */
export function acquireLock(
  stateDir: string,
  kind: string,
  key: string,
  metadata: unknown,
  timeoutMs?: number,
): Promise<string>;

/** Releases a lock by its path; `undefined` is accepted. */
export function releaseLock(path: string | undefined): Promise<void>;

/**
 * Runs `fn` in the context of the `(kind, key)` lock and resolves with `fn`'s result.
 *
 * @param metadata Arbitrary data associated with the lock.
 * @param timeoutMs Optional wait limit for obtaining the lock.
 */
export function withLock<T>(
  stateDir: string,
  kind: string,
  key: string,
  metadata: unknown,
  fn: () => Promise<T>,
  timeoutMs?: number,
): Promise<T>;

/**
 * Creates the lease identified by `(kind, key)` under `stateDir` with the given metadata.
 *
 * @returns A string (presumably the lease path — confirm in the implementation).
 */
export function createLease(
  stateDir: string,
  kind: string,
  key: string,
  metadata: unknown,
  timeoutMs?: number,
): Promise<string>;

/** Writes metadata for the lease identified by `(kind, key)`; resolves with a string. */
export function writeLeaseMetadata(
  stateDir: string,
  kind: string,
  key: string,
  metadata: unknown,
): Promise<string>;

/** Reads the metadata of the lease identified by `(kind, key)`; resolves with `undefined` when none is available. */
export function readLeaseMetadata<T = unknown>(
  stateDir: string,
  kind: string,
  key: string,
): Promise<T | undefined>;

/** Synchronously returns the metadata of all leases of the given `kind`. */
export function listLeaseMetadata<T = unknown>(stateDir: string, kind: string): T[];

/** Releases the lease identified by `(kind, key)`; `key` may be `undefined`. */
export function releaseLease(
  stateDir: string,
  kind: string,
  key: string | undefined,
): Promise<void>;
|