pi-oracle 0.1.12 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +37 -0
- package/README.md +24 -10
- package/docs/ORACLE_DESIGN.md +583 -0
- package/docs/ORACLE_RECOVERY_DRILL.md +127 -0
- package/extensions/oracle/index.ts +15 -4
- package/extensions/oracle/lib/commands.ts +35 -12
- package/extensions/oracle/lib/config.ts +2 -2
- package/extensions/oracle/lib/jobs.ts +438 -72
- package/extensions/oracle/lib/locks.ts +99 -13
- package/extensions/oracle/lib/poller.ts +223 -38
- package/extensions/oracle/lib/queue.ts +193 -0
- package/extensions/oracle/lib/runtime.ts +69 -15
- package/extensions/oracle/lib/tools.ts +274 -64
- package/extensions/oracle/worker/artifact-heuristics.d.mts +29 -0
- package/extensions/oracle/worker/auth-bootstrap.mjs +2 -72
- package/extensions/oracle/worker/auth-cookie-policy.d.mts +31 -0
- package/extensions/oracle/worker/run-job.mjs +330 -71
- package/extensions/oracle/worker/state-locks.d.mts +45 -0
- package/extensions/oracle/worker/state-locks.mjs +235 -0
- package/package.json +13 -4
- package/prompts/oracle.md +2 -0
|
@@ -5,11 +5,12 @@ import { chmod, mkdir, readFile, rename, rm, writeFile } from "node:fs/promises"
|
|
|
5
5
|
import { join, resolve } from "node:path";
|
|
6
6
|
import type { ExtensionContext } from "@mariozechner/pi-coding-agent";
|
|
7
7
|
import type { OracleConfig, OracleEffort, OracleModelFamily } from "./config.js";
|
|
8
|
-
import { withJobLock } from "./locks.js";
|
|
9
|
-
import { cleanupRuntimeArtifacts, getProjectId, getSessionId, parseConversationId, type OracleCleanupReport } from "./runtime.js";
|
|
8
|
+
import { withJobLock, withLock } from "./locks.js";
|
|
9
|
+
import { cleanupRuntimeArtifacts, getProjectId, getSessionId, parseConversationId, requirePersistedSessionFile, type OracleCleanupReport } from "./runtime.js";
|
|
10
10
|
|
|
11
|
-
export type OracleJobStatus = "preparing" | "submitted" | "waiting" | "complete" | "failed" | "cancelled";
|
|
11
|
+
export type OracleJobStatus = "queued" | "preparing" | "submitted" | "waiting" | "complete" | "failed" | "cancelled";
|
|
12
12
|
export type OracleJobPhase =
|
|
13
|
+
| "queued"
|
|
13
14
|
| "submitted"
|
|
14
15
|
| "cloning_runtime"
|
|
15
16
|
| "launching_browser"
|
|
@@ -25,9 +26,14 @@ export type OracleJobPhase =
|
|
|
25
26
|
| "cancelled";
|
|
26
27
|
|
|
27
28
|
export const ACTIVE_ORACLE_JOB_STATUSES: OracleJobStatus[] = ["preparing", "submitted", "waiting"];
|
|
29
|
+
export const OPEN_ORACLE_JOB_STATUSES: OracleJobStatus[] = ["queued", ...ACTIVE_ORACLE_JOB_STATUSES];
|
|
30
|
+
export const TERMINAL_ORACLE_JOB_STATUSES: OracleJobStatus[] = ["complete", "failed", "cancelled"];
|
|
28
31
|
export const ORACLE_MISSING_WORKER_GRACE_MS = 30_000;
|
|
29
32
|
export const ORACLE_STALE_HEARTBEAT_MS = 3 * 60 * 1000;
|
|
30
33
|
export const ORACLE_NOTIFICATION_CLAIM_TTL_MS = 60_000;
|
|
34
|
+
export const ORACLE_WAKEUP_MAX_ATTEMPTS = 3;
|
|
35
|
+
export const ORACLE_WAKEUP_RETRY_DELAYS_MS = [0, 15_000, 60_000] as const;
|
|
36
|
+
export const ORACLE_WAKEUP_POST_SEND_RETENTION_MS = 2 * 60 * 1000;
|
|
31
37
|
const ORACLE_COMPLETE_JOB_RETENTION_MS = 14 * 24 * 60 * 60 * 1000;
|
|
32
38
|
const ORACLE_FAILED_JOB_RETENTION_MS = 30 * 24 * 60 * 60 * 1000;
|
|
33
39
|
export const DEFAULT_ORACLE_JOBS_DIR = "/tmp";
|
|
@@ -38,6 +44,32 @@ export function isActiveOracleJob(job: Pick<OracleJob, "status">): boolean {
|
|
|
38
44
|
return ACTIVE_ORACLE_JOB_STATUSES.includes(job.status);
|
|
39
45
|
}
|
|
40
46
|
|
|
47
|
+
export function isOpenOracleJob(job: Pick<OracleJob, "status">): boolean {
|
|
48
|
+
return OPEN_ORACLE_JOB_STATUSES.includes(job.status);
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
export function isTerminalOracleJob(job: Pick<OracleJob, "status">): boolean {
|
|
52
|
+
return TERMINAL_ORACLE_JOB_STATUSES.includes(job.status);
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
export function shouldAdvanceQueueAfterCancellation(job: Pick<OracleJob, "status" | "cleanupWarnings" | "cleanupPending">): boolean {
|
|
56
|
+
return job.status === "cancelled" && !job.cleanupPending && !job.cleanupWarnings?.length;
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
export function hasDurableWorkerHandoff(
|
|
60
|
+
job: Pick<OracleJob, "status" | "phase" | "workerPid" | "workerStartedAt" | "heartbeatAt">,
|
|
61
|
+
): boolean {
|
|
62
|
+
if (job.status === "queued") return false;
|
|
63
|
+
if (job.workerPid) return true;
|
|
64
|
+
return false;
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
export function hasPersistedOriginSession(
|
|
68
|
+
job: Pick<OracleJob, "originSessionFile" | "sessionId">,
|
|
69
|
+
): job is Pick<OracleJob, "originSessionFile" | "sessionId"> & { originSessionFile: string } {
|
|
70
|
+
return typeof job.originSessionFile === "string" && job.originSessionFile.length > 0 && job.sessionId === job.originSessionFile;
|
|
71
|
+
}
|
|
72
|
+
|
|
41
73
|
function readProcessStartedAt(pid: number | undefined): string | undefined {
|
|
42
74
|
if (!pid || pid <= 0) return undefined;
|
|
43
75
|
try {
|
|
@@ -86,6 +118,7 @@ export interface OracleJob {
|
|
|
86
118
|
phase: OracleJobPhase;
|
|
87
119
|
phaseAt: string;
|
|
88
120
|
createdAt: string;
|
|
121
|
+
queuedAt?: string;
|
|
89
122
|
submittedAt?: string;
|
|
90
123
|
completedAt?: string;
|
|
91
124
|
heartbeatAt?: string;
|
|
@@ -108,6 +141,12 @@ export interface OracleJob {
|
|
|
108
141
|
archiveSha256?: string;
|
|
109
142
|
archiveDeletedAfterUpload: boolean;
|
|
110
143
|
notifiedAt?: string;
|
|
144
|
+
notificationEntryId?: string;
|
|
145
|
+
notificationSessionKey?: string;
|
|
146
|
+
notificationSessionFile?: string;
|
|
147
|
+
wakeupAttemptCount?: number;
|
|
148
|
+
wakeupLastRequestedAt?: string;
|
|
149
|
+
wakeupSettledAt?: string;
|
|
111
150
|
notifyClaimedAt?: string;
|
|
112
151
|
notifyClaimedBy?: string;
|
|
113
152
|
artifactFailureCount?: number;
|
|
@@ -126,6 +165,7 @@ export interface OracleJob {
|
|
|
126
165
|
config: OracleConfig;
|
|
127
166
|
cleanupWarnings?: string[];
|
|
128
167
|
lastCleanupAt?: string;
|
|
168
|
+
cleanupPending?: boolean;
|
|
129
169
|
}
|
|
130
170
|
|
|
131
171
|
export interface OracleSubmitInput {
|
|
@@ -214,6 +254,34 @@ export async function updateJob(id: string, mutate: (job: OracleJob) => OracleJo
|
|
|
214
254
|
});
|
|
215
255
|
}
|
|
216
256
|
|
|
257
|
+
export async function appendCleanupWarnings(jobId: string, warnings: string[], at = new Date().toISOString()): Promise<OracleJob | undefined> {
|
|
258
|
+
if (warnings.length === 0) return readJob(jobId);
|
|
259
|
+
try {
|
|
260
|
+
return await updateJob(jobId, (job) => ({
|
|
261
|
+
...job,
|
|
262
|
+
cleanupPending: false,
|
|
263
|
+
cleanupWarnings: Array.from(new Set([...(job.cleanupWarnings || []), ...warnings])),
|
|
264
|
+
lastCleanupAt: at,
|
|
265
|
+
error: [job.error, ...warnings].filter(Boolean).join("\n"),
|
|
266
|
+
}));
|
|
267
|
+
} catch {
|
|
268
|
+
return readJob(jobId);
|
|
269
|
+
}
|
|
270
|
+
}
|
|
271
|
+
|
|
272
|
+
export async function clearCleanupPending(jobId: string, at = new Date().toISOString()): Promise<OracleJob | undefined> {
|
|
273
|
+
try {
|
|
274
|
+
return await updateJob(jobId, (job) => ({
|
|
275
|
+
...job,
|
|
276
|
+
cleanupPending: false,
|
|
277
|
+
cleanupWarnings: undefined,
|
|
278
|
+
lastCleanupAt: at,
|
|
279
|
+
}));
|
|
280
|
+
} catch {
|
|
281
|
+
return readJob(jobId);
|
|
282
|
+
}
|
|
283
|
+
}
|
|
284
|
+
|
|
217
285
|
function sleep(ms: number): Promise<void> {
|
|
218
286
|
return new Promise((resolve) => setTimeout(resolve, ms));
|
|
219
287
|
}
|
|
@@ -224,6 +292,39 @@ function parseTimestamp(value: string | undefined): number | undefined {
|
|
|
224
292
|
return Number.isFinite(parsed) ? parsed : undefined;
|
|
225
293
|
}
|
|
226
294
|
|
|
295
|
+
function notificationClaimIsOwnedBy(job: Pick<OracleJob, "notifyClaimedAt" | "notifyClaimedBy">, claimedBy: string, now = Date.now()): boolean {
|
|
296
|
+
if (job.notifyClaimedBy !== claimedBy) return false;
|
|
297
|
+
const claimedAtMs = parseTimestamp(job.notifyClaimedAt);
|
|
298
|
+
if (claimedAtMs === undefined) return false;
|
|
299
|
+
return now - claimedAtMs < ORACLE_NOTIFICATION_CLAIM_TTL_MS;
|
|
300
|
+
}
|
|
301
|
+
|
|
302
|
+
function notificationClaimIsLive(job: Pick<OracleJob, "notifyClaimedAt" | "notifyClaimedBy">, now = Date.now()): boolean {
|
|
303
|
+
if (!job.notifyClaimedBy) return false;
|
|
304
|
+
const claimedAtMs = parseTimestamp(job.notifyClaimedAt);
|
|
305
|
+
if (claimedAtMs === undefined) return false;
|
|
306
|
+
return now - claimedAtMs < ORACLE_NOTIFICATION_CLAIM_TTL_MS;
|
|
307
|
+
}
|
|
308
|
+
|
|
309
|
+
function wakeupRetentionGraceIsActive(job: Pick<OracleJob, "wakeupLastRequestedAt">, now = Date.now()): boolean {
|
|
310
|
+
const lastRequestedAtMs = parseTimestamp(job.wakeupLastRequestedAt);
|
|
311
|
+
if (lastRequestedAtMs === undefined) return false;
|
|
312
|
+
return now - lastRequestedAtMs < ORACLE_WAKEUP_POST_SEND_RETENTION_MS;
|
|
313
|
+
}
|
|
314
|
+
|
|
315
|
+
export function getWakeupRetryDelayMs(attemptCount: number): number {
|
|
316
|
+
return ORACLE_WAKEUP_RETRY_DELAYS_MS[Math.min(attemptCount, ORACLE_WAKEUP_RETRY_DELAYS_MS.length - 1)] ?? ORACLE_WAKEUP_RETRY_DELAYS_MS[ORACLE_WAKEUP_RETRY_DELAYS_MS.length - 1];
|
|
317
|
+
}
|
|
318
|
+
|
|
319
|
+
export function shouldRequestWakeup(job: Pick<OracleJob, "wakeupAttemptCount" | "wakeupLastRequestedAt" | "wakeupSettledAt">, now = Date.now()): boolean {
|
|
320
|
+
if (job.wakeupSettledAt) return false;
|
|
321
|
+
const attempts = job.wakeupAttemptCount ?? 0;
|
|
322
|
+
if (attempts >= ORACLE_WAKEUP_MAX_ATTEMPTS) return false;
|
|
323
|
+
const lastRequestedAtMs = parseTimestamp(job.wakeupLastRequestedAt);
|
|
324
|
+
if (lastRequestedAtMs === undefined) return true;
|
|
325
|
+
return now - lastRequestedAtMs >= getWakeupRetryDelayMs(attempts);
|
|
326
|
+
}
|
|
327
|
+
|
|
227
328
|
export function withJobPhase<T extends Pick<OracleJob, "phase" | "phaseAt">>(
|
|
228
329
|
phase: OracleJobPhase,
|
|
229
330
|
patch?: Omit<Partial<OracleJob>, "phase" | "phaseAt">,
|
|
@@ -237,7 +338,7 @@ export function withJobPhase<T extends Pick<OracleJob, "phase" | "phaseAt">>(
|
|
|
237
338
|
}
|
|
238
339
|
|
|
239
340
|
function isTerminalOracleJobStatus(status: OracleJobStatus): boolean {
|
|
240
|
-
return status
|
|
341
|
+
return TERMINAL_ORACLE_JOB_STATUSES.includes(status);
|
|
241
342
|
}
|
|
242
343
|
|
|
243
344
|
export async function terminateWorkerPid(
|
|
@@ -312,9 +413,41 @@ export function getStaleOracleJobReason(job: OracleJob, now = Date.now()): strin
|
|
|
312
413
|
return undefined;
|
|
313
414
|
}
|
|
314
415
|
|
|
416
|
+
function getTerminalCleanupStaleReason(job: Pick<OracleJob, "status" | "cleanupPending" | "cleanupWarnings" | "lastCleanupAt" | "heartbeatAt" | "completedAt" | "phaseAt" | "createdAt" | "workerPid" | "workerStartedAt">, now = Date.now()): string | undefined {
|
|
417
|
+
if (!isTerminalOracleJob(job)) return undefined;
|
|
418
|
+
if (!job.cleanupPending && !job.cleanupWarnings?.length) return undefined;
|
|
419
|
+
|
|
420
|
+
const baselineMs =
|
|
421
|
+
parseTimestamp(job.lastCleanupAt) ??
|
|
422
|
+
parseTimestamp(job.heartbeatAt) ??
|
|
423
|
+
parseTimestamp(job.completedAt) ??
|
|
424
|
+
parseTimestamp(job.phaseAt) ??
|
|
425
|
+
parseTimestamp(job.createdAt);
|
|
426
|
+
if (baselineMs === undefined) return "Oracle terminal cleanup has no valid timestamps";
|
|
427
|
+
if (!job.workerPid) return undefined;
|
|
428
|
+
|
|
429
|
+
const currentStartedAt = readProcessStartedAt(job.workerPid);
|
|
430
|
+
if (!currentStartedAt) {
|
|
431
|
+
return `Oracle terminal cleanup worker PID ${job.workerPid} is no longer running`;
|
|
432
|
+
}
|
|
433
|
+
|
|
434
|
+
if (job.workerStartedAt && currentStartedAt !== job.workerStartedAt) {
|
|
435
|
+
return `Oracle terminal cleanup worker PID ${job.workerPid} no longer matches the recorded process identity`;
|
|
436
|
+
}
|
|
437
|
+
|
|
438
|
+
if (now - baselineMs > ORACLE_STALE_HEARTBEAT_MS) {
|
|
439
|
+
return `Oracle terminal cleanup is stale (${Math.round((now - baselineMs) / 1000)}s since last update)`;
|
|
440
|
+
}
|
|
441
|
+
|
|
442
|
+
return undefined;
|
|
443
|
+
}
|
|
444
|
+
|
|
315
445
|
export async function cleanupJobResources(
|
|
316
|
-
job: Pick<OracleJob, "runtimeId" | "runtimeProfileDir" | "runtimeSessionName" | "conversationId">,
|
|
446
|
+
job: Pick<OracleJob, "submittedAt" | "runtimeId" | "runtimeProfileDir" | "runtimeSessionName" | "conversationId">,
|
|
317
447
|
): Promise<OracleCleanupReport> {
|
|
448
|
+
if (!job.submittedAt) {
|
|
449
|
+
return { attempted: [], warnings: [] };
|
|
450
|
+
}
|
|
318
451
|
return cleanupRuntimeArtifacts({
|
|
319
452
|
runtimeId: job.runtimeId,
|
|
320
453
|
runtimeProfileDir: job.runtimeProfileDir,
|
|
@@ -330,15 +463,18 @@ function getCleanupRetentionMs(job: OracleJob): { complete: number; failed: numb
|
|
|
330
463
|
};
|
|
331
464
|
}
|
|
332
465
|
|
|
333
|
-
function shouldPruneTerminalJob(job: OracleJob, now = Date.now()): boolean {
|
|
466
|
+
export function shouldPruneTerminalJob(job: OracleJob, now = Date.now()): boolean {
|
|
334
467
|
if (!isTerminalOracleJobStatus(job.status)) return false;
|
|
468
|
+
if (job.cleanupPending || job.cleanupWarnings?.length) return false;
|
|
469
|
+
if (notificationClaimIsLive(job, now)) return false;
|
|
470
|
+
if (wakeupRetentionGraceIsActive(job, now)) return false;
|
|
335
471
|
const completedMs = parseTimestamp(job.completedAt) ?? parseTimestamp(job.createdAt);
|
|
336
472
|
if (completedMs === undefined) return false;
|
|
337
473
|
const ageMs = now - completedMs;
|
|
338
474
|
|
|
339
475
|
const retention = getCleanupRetentionMs(job);
|
|
340
476
|
|
|
341
|
-
if (
|
|
477
|
+
if (job.status === "complete" || job.status === "cancelled") {
|
|
342
478
|
return ageMs >= retention.complete;
|
|
343
479
|
}
|
|
344
480
|
|
|
@@ -350,10 +486,54 @@ function shouldPruneTerminalJob(job: OracleJob, now = Date.now()): boolean {
|
|
|
350
486
|
}
|
|
351
487
|
|
|
352
488
|
export async function removeTerminalOracleJob(job: OracleJob): Promise<{ removed: boolean; cleanupReport: OracleCleanupReport }> {
|
|
353
|
-
if (
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
489
|
+
if (!isTerminalOracleJob(job)) return { removed: false, cleanupReport: { attempted: [], warnings: [] } };
|
|
490
|
+
|
|
491
|
+
return withJobLock(job.id, { processPid: process.pid, action: "removeTerminalOracleJob" }, async () => {
|
|
492
|
+
const current = readJob(job.id);
|
|
493
|
+
if (!current) return { removed: true, cleanupReport: { attempted: [], warnings: [] } };
|
|
494
|
+
if (!isTerminalOracleJob(current)) return { removed: false, cleanupReport: { attempted: [], warnings: [] } };
|
|
495
|
+
if (notificationClaimIsLive(current)) {
|
|
496
|
+
return {
|
|
497
|
+
removed: false,
|
|
498
|
+
cleanupReport: {
|
|
499
|
+
attempted: [],
|
|
500
|
+
warnings: [`Refusing to remove terminal oracle job ${current.id} while a notification delivery is in flight.`],
|
|
501
|
+
},
|
|
502
|
+
};
|
|
503
|
+
}
|
|
504
|
+
if (wakeupRetentionGraceIsActive(current)) {
|
|
505
|
+
return {
|
|
506
|
+
removed: false,
|
|
507
|
+
cleanupReport: {
|
|
508
|
+
attempted: [],
|
|
509
|
+
warnings: [`Refusing to remove terminal oracle job ${current.id} because its wake-up delivery is still within the post-send retention grace window.`],
|
|
510
|
+
},
|
|
511
|
+
};
|
|
512
|
+
}
|
|
513
|
+
if (current.workerPid && isWorkerProcessAlive(current.workerPid, current.workerStartedAt)) {
|
|
514
|
+
return {
|
|
515
|
+
removed: false,
|
|
516
|
+
cleanupReport: {
|
|
517
|
+
attempted: [],
|
|
518
|
+
warnings: [`Refusing to remove terminal oracle job ${current.id} while worker PID ${current.workerPid} is still live.`],
|
|
519
|
+
},
|
|
520
|
+
};
|
|
521
|
+
}
|
|
522
|
+
|
|
523
|
+
const cleanupReport = await cleanupJobResources(current);
|
|
524
|
+
if (cleanupReport.warnings.length > 0) {
|
|
525
|
+
await writeJobUnlocked({
|
|
526
|
+
...current,
|
|
527
|
+
cleanupPending: false,
|
|
528
|
+
cleanupWarnings: [...(current.cleanupWarnings || []), ...cleanupReport.warnings],
|
|
529
|
+
lastCleanupAt: new Date().toISOString(),
|
|
530
|
+
error: [current.error, ...cleanupReport.warnings].filter(Boolean).join("\n"),
|
|
531
|
+
});
|
|
532
|
+
return { removed: false, cleanupReport };
|
|
533
|
+
}
|
|
534
|
+
await rm(getJobDir(current.id), { recursive: true, force: true });
|
|
535
|
+
return { removed: true, cleanupReport };
|
|
536
|
+
});
|
|
357
537
|
}
|
|
358
538
|
|
|
359
539
|
export async function pruneTerminalOracleJobs(now = Date.now()): Promise<string[]> {
|
|
@@ -374,43 +554,108 @@ export async function pruneTerminalOracleJobs(now = Date.now()): Promise<string[
|
|
|
374
554
|
export async function reconcileStaleOracleJobs(): Promise<OracleJob[]> {
|
|
375
555
|
const repaired: OracleJob[] = [];
|
|
376
556
|
const now = Date.now();
|
|
557
|
+
const recoveredAt = new Date(now).toISOString();
|
|
377
558
|
|
|
378
559
|
for (const jobDir of listOracleJobDirs()) {
|
|
379
560
|
const job = readJob(jobDir);
|
|
380
561
|
if (!job) continue;
|
|
562
|
+
|
|
563
|
+
if (isTerminalOracleJob(job) && (job.cleanupPending || job.cleanupWarnings?.length)) {
|
|
564
|
+
let cleanupTarget: OracleJob | undefined;
|
|
565
|
+
let blockedWarning: string | undefined;
|
|
566
|
+
|
|
567
|
+
await withJobLock(job.id, { processPid: process.pid, action: "reconcileTerminalCleanupJob" }, async () => {
|
|
568
|
+
const current = readJob(job.id);
|
|
569
|
+
if (!current || !isTerminalOracleJob(current) || (!current.cleanupPending && !current.cleanupWarnings?.length)) return;
|
|
570
|
+
|
|
571
|
+
if (current.workerPid && isWorkerProcessAlive(current.workerPid, current.workerStartedAt)) {
|
|
572
|
+
const staleCleanupReason = getTerminalCleanupStaleReason(current, now);
|
|
573
|
+
if (!staleCleanupReason) return;
|
|
574
|
+
const terminated = await terminateWorkerPid(current.workerPid, current.workerStartedAt);
|
|
575
|
+
if (!terminated) {
|
|
576
|
+
blockedWarning = `Oracle terminal cleanup is blocked because worker PID ${current.workerPid} could not be terminated safely after ${staleCleanupReason}.`;
|
|
577
|
+
return;
|
|
578
|
+
}
|
|
579
|
+
}
|
|
580
|
+
|
|
581
|
+
cleanupTarget = current;
|
|
582
|
+
});
|
|
583
|
+
|
|
584
|
+
if (blockedWarning) {
|
|
585
|
+
const blocked = await appendCleanupWarnings(job.id, [blockedWarning], recoveredAt);
|
|
586
|
+
if (blocked) repaired.push(blocked);
|
|
587
|
+
continue;
|
|
588
|
+
}
|
|
589
|
+
if (!cleanupTarget) continue;
|
|
590
|
+
|
|
591
|
+
const cleanupReport = await cleanupJobResources(cleanupTarget);
|
|
592
|
+
if (cleanupReport.warnings.length > 0) {
|
|
593
|
+
const withWarnings = await appendCleanupWarnings(job.id, cleanupReport.warnings, recoveredAt);
|
|
594
|
+
if (withWarnings) repaired.push(withWarnings);
|
|
595
|
+
} else {
|
|
596
|
+
const recoveredJob = await clearCleanupPending(job.id, recoveredAt);
|
|
597
|
+
if (recoveredJob) repaired.push(recoveredJob);
|
|
598
|
+
}
|
|
599
|
+
continue;
|
|
600
|
+
}
|
|
601
|
+
|
|
381
602
|
const staleReason = getStaleOracleJobReason(job, now);
|
|
382
603
|
if (!staleReason) continue;
|
|
383
604
|
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
605
|
+
let terminated = false;
|
|
606
|
+
let transitioned = false;
|
|
607
|
+
let repairedJob: OracleJob | undefined;
|
|
608
|
+
|
|
609
|
+
await withJobLock(job.id, { processPid: process.pid, action: "reconcileStaleOracleJob" }, async () => {
|
|
610
|
+
const current = readJob(job.id);
|
|
611
|
+
if (!current) return;
|
|
612
|
+
const currentStaleReason = getStaleOracleJobReason(current, now);
|
|
613
|
+
if (!currentStaleReason) return;
|
|
614
|
+
|
|
615
|
+
terminated = await terminateWorkerPid(current.workerPid, current.workerStartedAt);
|
|
616
|
+
transitioned = true;
|
|
617
|
+
const suffix = current.workerPid
|
|
618
|
+
? terminated
|
|
619
|
+
? ` Terminated stale worker PID ${current.workerPid}.`
|
|
620
|
+
: ` Failed to terminate stale worker PID ${current.workerPid}.`
|
|
621
|
+
: "";
|
|
622
|
+
repairedJob = {
|
|
623
|
+
...current,
|
|
624
|
+
...withJobPhase("failed", {
|
|
625
|
+
status: "failed",
|
|
626
|
+
completedAt: recoveredAt,
|
|
627
|
+
heartbeatAt: recoveredAt,
|
|
628
|
+
notifyClaimedAt: undefined,
|
|
629
|
+
notifyClaimedBy: undefined,
|
|
630
|
+
cleanupPending: terminated,
|
|
631
|
+
error: current.error
|
|
632
|
+
? `${current.error}\nRecovered stale job: ${currentStaleReason}.${suffix}`.trim()
|
|
633
|
+
: `Recovered stale job: ${currentStaleReason}.${suffix}`.trim(),
|
|
634
|
+
}, recoveredAt),
|
|
635
|
+
};
|
|
636
|
+
await writeJobUnlocked(repairedJob);
|
|
637
|
+
});
|
|
638
|
+
|
|
639
|
+
if (!transitioned || !repairedJob || !isTerminalOracleJob(repairedJob)) continue;
|
|
640
|
+
|
|
641
|
+
if (!terminated) {
|
|
642
|
+
const cleanupWarnings = [
|
|
643
|
+
`Oracle runtime cleanup is blocked because worker PID ${job.workerPid ?? "unknown"} could not be terminated safely.`,
|
|
644
|
+
];
|
|
645
|
+
const blocked = await appendCleanupWarnings(repairedJob.id, cleanupWarnings, recoveredAt);
|
|
646
|
+
repaired.push(blocked ?? repairedJob);
|
|
647
|
+
continue;
|
|
648
|
+
}
|
|
390
649
|
|
|
391
|
-
const repairedJob = await updateJob(job.id, (current) => ({
|
|
392
|
-
...current,
|
|
393
|
-
...withJobPhase("failed", {
|
|
394
|
-
status: "failed",
|
|
395
|
-
completedAt: new Date(now).toISOString(),
|
|
396
|
-
heartbeatAt: new Date(now).toISOString(),
|
|
397
|
-
notifyClaimedAt: undefined,
|
|
398
|
-
notifyClaimedBy: undefined,
|
|
399
|
-
error: current.error
|
|
400
|
-
? `${current.error}\nRecovered stale job: ${staleReason}.${suffix}`.trim()
|
|
401
|
-
: `Recovered stale job: ${staleReason}.${suffix}`.trim(),
|
|
402
|
-
}, new Date(now).toISOString()),
|
|
403
|
-
}));
|
|
404
650
|
const cleanupReport = await cleanupJobResources(repairedJob);
|
|
405
651
|
if (cleanupReport.warnings.length > 0) {
|
|
406
|
-
await
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
lastCleanupAt: new Date(now).toISOString(),
|
|
410
|
-
error: [current.error, ...cleanupReport.warnings].filter(Boolean).join("\n"),
|
|
411
|
-
}));
|
|
652
|
+
const withWarnings = await appendCleanupWarnings(repairedJob.id, cleanupReport.warnings, recoveredAt);
|
|
653
|
+
repaired.push(withWarnings ?? repairedJob);
|
|
654
|
+
continue;
|
|
412
655
|
}
|
|
413
|
-
|
|
656
|
+
|
|
657
|
+
const finalized = await clearCleanupPending(repairedJob.id, recoveredAt);
|
|
658
|
+
repaired.push(finalized ?? repairedJob);
|
|
414
659
|
}
|
|
415
660
|
|
|
416
661
|
return repaired;
|
|
@@ -427,6 +672,10 @@ export async function tryClaimNotification(jobId: string, claimedBy: string, now
|
|
|
427
672
|
if (!current) return undefined;
|
|
428
673
|
if (!isTerminalOracleJobStatus(current.status)) return undefined;
|
|
429
674
|
if (current.notifiedAt) return undefined;
|
|
675
|
+
if (!hasPersistedOriginSession(current)) return undefined;
|
|
676
|
+
const nowMs = parseTimestamp(now) ?? Date.now();
|
|
677
|
+
if (shouldPruneTerminalJob(current, nowMs)) return undefined;
|
|
678
|
+
if (!shouldRequestWakeup(current, nowMs)) return undefined;
|
|
430
679
|
|
|
431
680
|
const claimedAtMs = parseTimestamp(current.notifyClaimedAt);
|
|
432
681
|
const claimIsLive =
|
|
@@ -446,13 +695,50 @@ export async function tryClaimNotification(jobId: string, claimedBy: string, now
|
|
|
446
695
|
});
|
|
447
696
|
}
|
|
448
697
|
|
|
449
|
-
export async function
|
|
698
|
+
export async function recordNotificationTarget(
|
|
699
|
+
jobId: string,
|
|
700
|
+
claimedBy: string,
|
|
701
|
+
options: { notificationSessionKey: string; notificationSessionFile?: string },
|
|
702
|
+
): Promise<OracleJob> {
|
|
703
|
+
return withJobLock(jobId, { processPid: process.pid, action: "recordNotificationTarget", claimedBy }, async () => {
|
|
704
|
+
const current = readJob(jobId);
|
|
705
|
+
if (!current) throw new Error(`Oracle job not found: ${jobId}`);
|
|
706
|
+
if (current.notifiedAt) return current;
|
|
707
|
+
if (!notificationClaimIsOwnedBy(current, claimedBy)) {
|
|
708
|
+
throw new Error(`Oracle notification claim is not owned by ${claimedBy}: ${jobId}`);
|
|
709
|
+
}
|
|
710
|
+
const next: OracleJob = {
|
|
711
|
+
...current,
|
|
712
|
+
notificationSessionKey: options.notificationSessionKey,
|
|
713
|
+
notificationSessionFile: options.notificationSessionFile,
|
|
714
|
+
};
|
|
715
|
+
await writeJobUnlocked(next);
|
|
716
|
+
return next;
|
|
717
|
+
});
|
|
718
|
+
}
|
|
719
|
+
|
|
720
|
+
export async function markJobNotified(
|
|
721
|
+
jobId: string,
|
|
722
|
+
claimedBy: string,
|
|
723
|
+
options?: { at?: string; notificationEntryId?: string; notificationSessionKey?: string; notificationSessionFile?: string },
|
|
724
|
+
): Promise<OracleJob> {
|
|
725
|
+
const at = options?.at ?? new Date().toISOString();
|
|
450
726
|
return withJobLock(jobId, { processPid: process.pid, action: "markJobNotified", claimedBy }, async () => {
|
|
451
727
|
const current = readJob(jobId);
|
|
452
728
|
if (!current) throw new Error(`Oracle job not found: ${jobId}`);
|
|
729
|
+
if (current.notifiedAt) return current;
|
|
730
|
+
if (!notificationClaimIsOwnedBy(current, claimedBy)) {
|
|
731
|
+
throw new Error(`Oracle notification claim is not owned by ${claimedBy}: ${jobId}`);
|
|
732
|
+
}
|
|
453
733
|
const next: OracleJob = {
|
|
454
734
|
...current,
|
|
455
|
-
notifiedAt:
|
|
735
|
+
notifiedAt: at,
|
|
736
|
+
notificationEntryId: options?.notificationEntryId ?? current.notificationEntryId,
|
|
737
|
+
notificationSessionKey: options?.notificationSessionKey ?? current.notificationSessionKey,
|
|
738
|
+
notificationSessionFile: options?.notificationSessionFile ?? current.notificationSessionFile,
|
|
739
|
+
wakeupAttemptCount: 0,
|
|
740
|
+
wakeupLastRequestedAt: undefined,
|
|
741
|
+
wakeupSettledAt: undefined,
|
|
456
742
|
notifyClaimedAt: undefined,
|
|
457
743
|
notifyClaimedBy: undefined,
|
|
458
744
|
};
|
|
@@ -476,33 +762,109 @@ export async function releaseNotificationClaim(jobId: string, claimedBy: string)
|
|
|
476
762
|
});
|
|
477
763
|
}
|
|
478
764
|
|
|
765
|
+
export async function noteWakeupRequested(jobId: string, at = new Date().toISOString()): Promise<OracleJob | undefined> {
|
|
766
|
+
try {
|
|
767
|
+
return await updateJob(jobId, (job) => ({
|
|
768
|
+
...job,
|
|
769
|
+
wakeupAttemptCount: (job.wakeupAttemptCount ?? 0) + 1,
|
|
770
|
+
wakeupLastRequestedAt: at,
|
|
771
|
+
}));
|
|
772
|
+
} catch {
|
|
773
|
+
return readJob(jobId);
|
|
774
|
+
}
|
|
775
|
+
}
|
|
776
|
+
|
|
777
|
+
export async function markWakeupSettled(jobId: string, at = new Date().toISOString()): Promise<OracleJob | undefined> {
|
|
778
|
+
try {
|
|
779
|
+
return await updateJob(jobId, (job) => ({
|
|
780
|
+
...job,
|
|
781
|
+
wakeupSettledAt: job.wakeupSettledAt ?? at,
|
|
782
|
+
}));
|
|
783
|
+
} catch {
|
|
784
|
+
return readJob(jobId);
|
|
785
|
+
}
|
|
786
|
+
}
|
|
787
|
+
|
|
479
788
|
export async function cancelOracleJob(id: string, reason = "Cancelled by user"): Promise<OracleJob> {
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
789
|
+
return withLock("admission", "global", { processPid: process.pid, action: "cancelOracleJob", jobId: id }, async () => {
|
|
790
|
+
const current = readJob(id);
|
|
791
|
+
if (!current) throw new Error(`Oracle job not found: ${id}`);
|
|
792
|
+
if (!isOpenOracleJob(current)) return current;
|
|
793
|
+
|
|
794
|
+
const now = new Date().toISOString();
|
|
795
|
+
if (current.status === "queued") {
|
|
796
|
+
const cancelled = await updateJob(id, (job) => ({
|
|
797
|
+
...job,
|
|
798
|
+
...withJobPhase("cancelled", {
|
|
799
|
+
status: "cancelled",
|
|
800
|
+
completedAt: now,
|
|
801
|
+
heartbeatAt: now,
|
|
802
|
+
notifyClaimedAt: undefined,
|
|
803
|
+
notifyClaimedBy: undefined,
|
|
804
|
+
error: reason,
|
|
805
|
+
}, now),
|
|
806
|
+
}));
|
|
807
|
+
|
|
808
|
+
const cleanupWarnings: string[] = [];
|
|
809
|
+
await rm(cancelled.archivePath, { force: true }).catch((error: Error) => {
|
|
810
|
+
cleanupWarnings.push(`Failed to remove queued archive ${cancelled.archivePath}: ${error.message}`);
|
|
811
|
+
});
|
|
812
|
+
if (cleanupWarnings.length === 0) return cancelled;
|
|
813
|
+
|
|
814
|
+
return updateJob(id, (job) => ({
|
|
815
|
+
...job,
|
|
816
|
+
cleanupWarnings: [...(job.cleanupWarnings || []), ...cleanupWarnings],
|
|
817
|
+
lastCleanupAt: now,
|
|
818
|
+
error: [job.error, ...cleanupWarnings].filter(Boolean).join("\n"),
|
|
819
|
+
}));
|
|
820
|
+
}
|
|
499
821
|
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
822
|
+
const terminated = await terminateWorkerPid(current.workerPid, current.workerStartedAt);
|
|
823
|
+
let transitioned = false;
|
|
824
|
+
const cancelled = await updateJob(id, (job) => {
|
|
825
|
+
if (isTerminalOracleJob(job)) return job;
|
|
826
|
+
transitioned = true;
|
|
827
|
+
return {
|
|
828
|
+
...job,
|
|
829
|
+
...withJobPhase(terminated ? "cancelled" : "failed", {
|
|
830
|
+
status: terminated ? "cancelled" : "failed",
|
|
831
|
+
completedAt: now,
|
|
832
|
+
heartbeatAt: now,
|
|
833
|
+
notifyClaimedAt: undefined,
|
|
834
|
+
notifyClaimedBy: undefined,
|
|
835
|
+
cleanupPending: terminated,
|
|
836
|
+
error: terminated ? reason : `${reason}; worker PID ${job.workerPid ?? "unknown"} did not exit`,
|
|
837
|
+
}, now),
|
|
838
|
+
};
|
|
839
|
+
});
|
|
840
|
+
if (!transitioned) return cancelled;
|
|
841
|
+
|
|
842
|
+
if (!terminated) {
|
|
843
|
+
const cleanupWarnings = [
|
|
844
|
+
`Oracle runtime cleanup is blocked because worker PID ${current.workerPid ?? "unknown"} could not be terminated safely.`,
|
|
845
|
+
];
|
|
846
|
+
return updateJob(id, (job) => ({
|
|
847
|
+
...job,
|
|
848
|
+
cleanupWarnings: [...(job.cleanupWarnings || []), ...cleanupWarnings],
|
|
849
|
+
lastCleanupAt: now,
|
|
850
|
+
error: [job.error, ...cleanupWarnings].filter(Boolean).join("\n"),
|
|
851
|
+
}));
|
|
852
|
+
}
|
|
853
|
+
|
|
854
|
+
const cleanupReport = await cleanupJobResources(cancelled);
|
|
855
|
+
if (cleanupReport.warnings.length === 0) {
|
|
856
|
+
const finalized = await clearCleanupPending(id, now);
|
|
857
|
+
return finalized ?? cancelled;
|
|
858
|
+
}
|
|
859
|
+
|
|
860
|
+
return updateJob(id, (job) => ({
|
|
861
|
+
...job,
|
|
862
|
+
cleanupPending: false,
|
|
863
|
+
cleanupWarnings: [...(job.cleanupWarnings || []), ...cleanupReport.warnings],
|
|
864
|
+
lastCleanupAt: now,
|
|
865
|
+
error: [job.error, ...cleanupReport.warnings].filter(Boolean).join("\n"),
|
|
866
|
+
}));
|
|
867
|
+
});
|
|
506
868
|
}
|
|
507
869
|
|
|
508
870
|
export async function createJob(
|
|
@@ -512,6 +874,7 @@ export async function createJob(
|
|
|
512
874
|
originSessionFile: string | undefined,
|
|
513
875
|
config: OracleConfig,
|
|
514
876
|
runtime: OracleRuntimeAllocation,
|
|
877
|
+
options?: { initialState?: "queued" | "submitted"; createdAt?: string },
|
|
515
878
|
): Promise<OracleJob> {
|
|
516
879
|
const jobDir = getJobDir(id);
|
|
517
880
|
const logsDir = join(jobDir, "logs");
|
|
@@ -522,7 +885,8 @@ export async function createJob(
|
|
|
522
885
|
const reasoningPath = join(jobDir, "reasoning.md");
|
|
523
886
|
const artifactsManifestPath = join(jobDir, "artifacts.json");
|
|
524
887
|
const projectId = getProjectId(cwd);
|
|
525
|
-
const
|
|
888
|
+
const sessionFile = requirePersistedSessionFile(originSessionFile, "create oracle jobs");
|
|
889
|
+
const sessionId = getSessionId(sessionFile, projectId);
|
|
526
890
|
const conversationId = parseConversationId(input.chatUrl);
|
|
527
891
|
|
|
528
892
|
await mkdir(jobDir, { recursive: true, mode: 0o700 });
|
|
@@ -534,22 +898,24 @@ export async function createJob(
|
|
|
534
898
|
await writeFile(promptPath, input.prompt, { encoding: "utf8", mode: 0o600 });
|
|
535
899
|
await chmod(promptPath, 0o600).catch(() => undefined);
|
|
536
900
|
|
|
537
|
-
const
|
|
901
|
+
const createdAt = options?.createdAt ?? new Date().toISOString();
|
|
902
|
+
const initialState = options?.initialState ?? "submitted";
|
|
538
903
|
const normalizedEffort = input.modelFamily === "instant" ? undefined : (input.effort ?? config.defaults.effort);
|
|
539
904
|
const normalizedAutoSwitchToThinking = input.modelFamily === "instant"
|
|
540
905
|
? (input.autoSwitchToThinking ?? config.defaults.autoSwitchToThinking)
|
|
541
906
|
: false;
|
|
542
907
|
const job: OracleJob = {
|
|
543
908
|
id,
|
|
544
|
-
status:
|
|
545
|
-
phase:
|
|
546
|
-
phaseAt:
|
|
547
|
-
createdAt
|
|
548
|
-
|
|
909
|
+
status: initialState,
|
|
910
|
+
phase: initialState,
|
|
911
|
+
phaseAt: createdAt,
|
|
912
|
+
createdAt,
|
|
913
|
+
queuedAt: initialState === "queued" ? createdAt : undefined,
|
|
914
|
+
submittedAt: initialState === "submitted" ? createdAt : undefined,
|
|
549
915
|
cwd,
|
|
550
916
|
projectId,
|
|
551
917
|
sessionId,
|
|
552
|
-
originSessionFile,
|
|
918
|
+
originSessionFile: sessionFile,
|
|
553
919
|
requestSource: input.requestSource,
|
|
554
920
|
chatModelFamily: input.modelFamily,
|
|
555
921
|
effort: normalizedEffort,
|