pi-oracle 0.1.12 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +46 -0
- package/README.md +26 -10
- package/docs/ORACLE_DESIGN.md +593 -0
- package/docs/ORACLE_RECOVERY_DRILL.md +127 -0
- package/extensions/oracle/index.ts +15 -4
- package/extensions/oracle/lib/commands.ts +39 -12
- package/extensions/oracle/lib/config.ts +2 -2
- package/extensions/oracle/lib/jobs.ts +510 -73
- package/extensions/oracle/lib/locks.ts +99 -13
- package/extensions/oracle/lib/poller.ts +224 -38
- package/extensions/oracle/lib/queue.ts +193 -0
- package/extensions/oracle/lib/runtime.ts +70 -16
- package/extensions/oracle/lib/tools.ts +313 -64
- package/extensions/oracle/worker/artifact-heuristics.d.mts +29 -0
- package/extensions/oracle/worker/auth-bootstrap.mjs +2 -72
- package/extensions/oracle/worker/auth-cookie-policy.d.mts +31 -0
- package/extensions/oracle/worker/run-job.mjs +330 -71
- package/extensions/oracle/worker/state-locks.d.mts +45 -0
- package/extensions/oracle/worker/state-locks.mjs +235 -0
- package/package.json +13 -4
- package/prompts/oracle.md +2 -0
|
@@ -5,11 +5,12 @@ import { chmod, mkdir, readFile, rename, rm, writeFile } from "node:fs/promises"
|
|
|
5
5
|
import { join, resolve } from "node:path";
|
|
6
6
|
import type { ExtensionContext } from "@mariozechner/pi-coding-agent";
|
|
7
7
|
import type { OracleConfig, OracleEffort, OracleModelFamily } from "./config.js";
|
|
8
|
-
import { withJobLock } from "./locks.js";
|
|
9
|
-
import { cleanupRuntimeArtifacts, getProjectId, getSessionId, parseConversationId, type OracleCleanupReport } from "./runtime.js";
|
|
8
|
+
import { withJobLock, withLock } from "./locks.js";
|
|
9
|
+
import { cleanupRuntimeArtifacts, getProjectId, getSessionId, parseConversationId, requirePersistedSessionFile, type OracleCleanupReport } from "./runtime.js";
|
|
10
10
|
|
|
11
|
-
export type OracleJobStatus = "preparing" | "submitted" | "waiting" | "complete" | "failed" | "cancelled";
|
|
11
|
+
export type OracleJobStatus = "queued" | "preparing" | "submitted" | "waiting" | "complete" | "failed" | "cancelled";
|
|
12
12
|
export type OracleJobPhase =
|
|
13
|
+
| "queued"
|
|
13
14
|
| "submitted"
|
|
14
15
|
| "cloning_runtime"
|
|
15
16
|
| "launching_browser"
|
|
@@ -24,10 +25,17 @@ export type OracleJobPhase =
|
|
|
24
25
|
| "failed"
|
|
25
26
|
| "cancelled";
|
|
26
27
|
|
|
28
|
+
export type OracleWakeupSettlementSource = "oracle_read" | "oracle_status";
|
|
29
|
+
|
|
27
30
|
export const ACTIVE_ORACLE_JOB_STATUSES: OracleJobStatus[] = ["preparing", "submitted", "waiting"];
|
|
31
|
+
export const OPEN_ORACLE_JOB_STATUSES: OracleJobStatus[] = ["queued", ...ACTIVE_ORACLE_JOB_STATUSES];
|
|
32
|
+
export const TERMINAL_ORACLE_JOB_STATUSES: OracleJobStatus[] = ["complete", "failed", "cancelled"];
|
|
28
33
|
export const ORACLE_MISSING_WORKER_GRACE_MS = 30_000;
|
|
29
34
|
export const ORACLE_STALE_HEARTBEAT_MS = 3 * 60 * 1000;
|
|
30
35
|
export const ORACLE_NOTIFICATION_CLAIM_TTL_MS = 60_000;
|
|
36
|
+
export const ORACLE_WAKEUP_MAX_ATTEMPTS = 3;
|
|
37
|
+
export const ORACLE_WAKEUP_RETRY_DELAYS_MS = [0, 15_000, 60_000] as const;
|
|
38
|
+
export const ORACLE_WAKEUP_POST_SEND_RETENTION_MS = 2 * 60 * 1000;
|
|
31
39
|
const ORACLE_COMPLETE_JOB_RETENTION_MS = 14 * 24 * 60 * 60 * 1000;
|
|
32
40
|
const ORACLE_FAILED_JOB_RETENTION_MS = 30 * 24 * 60 * 60 * 1000;
|
|
33
41
|
export const DEFAULT_ORACLE_JOBS_DIR = "/tmp";
|
|
@@ -38,6 +46,36 @@ export function isActiveOracleJob(job: Pick<OracleJob, "status">): boolean {
|
|
|
38
46
|
return ACTIVE_ORACLE_JOB_STATUSES.includes(job.status);
|
|
39
47
|
}
|
|
40
48
|
|
|
49
|
+
/**
 * Reports whether a job is still open.
 *
 * @param job - Any object carrying the job's `status`.
 * @returns `true` when `job.status` is listed in `OPEN_ORACLE_JOB_STATUSES`
 *   (the queued status plus every active status).
 */
export function isOpenOracleJob(job: Pick<OracleJob, "status">): boolean {
  return OPEN_ORACLE_JOB_STATUSES.includes(job.status);
}
|
|
52
|
+
|
|
53
|
+
/**
 * Reports whether a job has reached a terminal status.
 *
 * @param job - Any object carrying the job's `status`.
 * @returns `true` when `job.status` is listed in `TERMINAL_ORACLE_JOB_STATUSES`.
 */
export function isTerminalOracleJob(job: Pick<OracleJob, "status">): boolean {
  return TERMINAL_ORACLE_JOB_STATUSES.includes(job.status);
}
|
|
56
|
+
|
|
57
|
+
/**
 * Decides whether a cancelled job is clean enough for the queue to move on.
 *
 * @param job - Status and cleanup bookkeeping of the job.
 * @returns `true` only when the job is `cancelled`, has no cleanup pending,
 *   and carries no cleanup warnings.
 */
export function shouldAdvanceQueueAfterCancellation(job: Pick<OracleJob, "status" | "cleanupWarnings" | "cleanupPending">): boolean {
  if (job.status !== "cancelled") return false;
  if (job.cleanupPending) return false;
  return !job.cleanupWarnings?.length;
}
|
|
60
|
+
|
|
61
|
+
/**
 * Reports whether a job that was never submitted still references an archive path.
 *
 * @param job - Submission and archive bookkeeping of the job.
 * @returns `true` when the job has no `submittedAt`, the archive was not
 *   deleted after upload, and `archivePath` is a non-empty string.
 */
export function hasRetainedPreSubmitArchive(job: Pick<OracleJob, "submittedAt" | "archiveDeletedAfterUpload" | "archivePath">): boolean {
  if (job.submittedAt || job.archiveDeletedAfterUpload) return false;
  return typeof job.archivePath === "string" && job.archivePath.length > 0;
}
|
|
64
|
+
|
|
65
|
+
/**
 * Reports whether a job has been handed off to a worker process.
 *
 * A queued job never counts. For any other status, a truthy `workerPid` is
 * the only evidence consulted; the remaining picked fields are accepted in
 * the parameter type but are not read here.
 *
 * @param job - Status and worker bookkeeping of the job.
 * @returns `true` when the job is not queued and has a worker PID recorded.
 */
export function hasDurableWorkerHandoff(
  job: Pick<OracleJob, "status" | "phase" | "workerPid" | "workerStartedAt" | "heartbeatAt">,
): boolean {
  return job.status !== "queued" && Boolean(job.workerPid);
}
|
|
72
|
+
|
|
73
|
+
/**
 * Type guard: the job records a non-empty origin session file, and its
 * `sessionId` is that very same string.
 *
 * @param job - Origin session bookkeeping of the job.
 * @returns `true` (narrowing `originSessionFile` to `string`) when both
 *   conditions hold.
 */
export function hasPersistedOriginSession(
  job: Pick<OracleJob, "originSessionFile" | "sessionId">,
): job is Pick<OracleJob, "originSessionFile" | "sessionId"> & { originSessionFile: string } {
  const originFile = job.originSessionFile;
  if (typeof originFile !== "string" || originFile.length === 0) return false;
  return job.sessionId === originFile;
}
|
|
78
|
+
|
|
41
79
|
function readProcessStartedAt(pid: number | undefined): string | undefined {
|
|
42
80
|
if (!pid || pid <= 0) return undefined;
|
|
43
81
|
try {
|
|
@@ -86,6 +124,7 @@ export interface OracleJob {
|
|
|
86
124
|
phase: OracleJobPhase;
|
|
87
125
|
phaseAt: string;
|
|
88
126
|
createdAt: string;
|
|
127
|
+
queuedAt?: string;
|
|
89
128
|
submittedAt?: string;
|
|
90
129
|
completedAt?: string;
|
|
91
130
|
heartbeatAt?: string;
|
|
@@ -108,6 +147,20 @@ export interface OracleJob {
|
|
|
108
147
|
archiveSha256?: string;
|
|
109
148
|
archiveDeletedAfterUpload: boolean;
|
|
110
149
|
notifiedAt?: string;
|
|
150
|
+
notificationEntryId?: string;
|
|
151
|
+
notificationSessionKey?: string;
|
|
152
|
+
notificationSessionFile?: string;
|
|
153
|
+
wakeupAttemptCount?: number;
|
|
154
|
+
wakeupLastRequestedAt?: string;
|
|
155
|
+
wakeupSettledAt?: string;
|
|
156
|
+
wakeupSettledSource?: OracleWakeupSettlementSource;
|
|
157
|
+
wakeupSettledSessionFile?: string;
|
|
158
|
+
wakeupSettledSessionKey?: string;
|
|
159
|
+
wakeupSettledBeforeFirstAttempt?: boolean;
|
|
160
|
+
wakeupObservedAt?: string;
|
|
161
|
+
wakeupObservedSource?: OracleWakeupSettlementSource;
|
|
162
|
+
wakeupObservedSessionFile?: string;
|
|
163
|
+
wakeupObservedSessionKey?: string;
|
|
111
164
|
notifyClaimedAt?: string;
|
|
112
165
|
notifyClaimedBy?: string;
|
|
113
166
|
artifactFailureCount?: number;
|
|
@@ -126,6 +179,7 @@ export interface OracleJob {
|
|
|
126
179
|
config: OracleConfig;
|
|
127
180
|
cleanupWarnings?: string[];
|
|
128
181
|
lastCleanupAt?: string;
|
|
182
|
+
cleanupPending?: boolean;
|
|
129
183
|
}
|
|
130
184
|
|
|
131
185
|
export interface OracleSubmitInput {
|
|
@@ -214,6 +268,34 @@ export async function updateJob(id: string, mutate: (job: OracleJob) => OracleJo
|
|
|
214
268
|
});
|
|
215
269
|
}
|
|
216
270
|
|
|
271
|
+
/**
 * Records cleanup warnings on a job and clears its `cleanupPending` flag.
 *
 * Warnings already present in `cleanupWarnings` are not recorded twice, and
 * only genuinely new warnings are appended to `error`. This keeps the error
 * text from growing when the same warning is reported on every retry.
 *
 * @param jobId - Identifier of the job to update.
 * @param warnings - Warning messages to record; an empty list is a no-op read.
 * @param at - ISO timestamp stored as `lastCleanupAt` (defaults to now).
 * @returns The updated job, or whatever `readJob` yields when there is
 *   nothing to record or the update throws.
 */
export async function appendCleanupWarnings(jobId: string, warnings: string[], at = new Date().toISOString()): Promise<OracleJob | undefined> {
  if (warnings.length === 0) return readJob(jobId);
  try {
    return await updateJob(jobId, (job) => {
      const known = new Set(job.cleanupWarnings ?? []);
      const fresh = Array.from(new Set(warnings)).filter((warning) => !known.has(warning));
      return {
        ...job,
        cleanupPending: false,
        cleanupWarnings: [...known, ...fresh],
        lastCleanupAt: at,
        // Leave the error untouched when every warning was already recorded.
        error: fresh.length > 0 ? [job.error, ...fresh].filter(Boolean).join("\n") : job.error,
      };
    });
  } catch {
    // Best effort: an update failure must not break the caller's cleanup flow.
    return readJob(jobId);
  }
}
|
|
285
|
+
|
|
286
|
+
/**
 * Marks a job's cleanup as finished: clears `cleanupPending`, drops any
 * recorded `cleanupWarnings`, and stamps `lastCleanupAt`.
 *
 * @param jobId - Identifier of the job to update.
 * @param at - ISO timestamp stored as `lastCleanupAt` (defaults to now).
 * @returns The updated job, or whatever `readJob` yields when the update throws.
 */
export async function clearCleanupPending(jobId: string, at = new Date().toISOString()): Promise<OracleJob | undefined> {
  try {
    return await updateJob(jobId, (job) => ({
      ...job,
      cleanupPending: false,
      cleanupWarnings: undefined,
      lastCleanupAt: at,
    }));
  } catch {
    // Best effort: fall back to the job as currently stored.
    return readJob(jobId);
  }
}
|
|
298
|
+
|
|
217
299
|
function sleep(ms: number): Promise<void> {
|
|
218
300
|
return new Promise((resolve) => setTimeout(resolve, ms));
|
|
219
301
|
}
|
|
@@ -224,6 +306,39 @@ function parseTimestamp(value: string | undefined): number | undefined {
|
|
|
224
306
|
return Number.isFinite(parsed) ? parsed : undefined;
|
|
225
307
|
}
|
|
226
308
|
|
|
309
|
+
/**
 * Reports whether `claimedBy` currently holds an unexpired notification claim.
 *
 * @param job - Claim bookkeeping of the job.
 * @param claimedBy - Identity expected in `notifyClaimedBy`.
 * @param now - Current time in epoch milliseconds (defaults to `Date.now()`).
 * @returns `true` when the claim belongs to `claimedBy`, its timestamp
 *   parses, and it is younger than `ORACLE_NOTIFICATION_CLAIM_TTL_MS`.
 */
function notificationClaimIsOwnedBy(job: Pick<OracleJob, "notifyClaimedAt" | "notifyClaimedBy">, claimedBy: string, now = Date.now()): boolean {
  if (job.notifyClaimedBy !== claimedBy) return false;
  const claimedAtMs = parseTimestamp(job.notifyClaimedAt);
  return claimedAtMs !== undefined && now - claimedAtMs < ORACLE_NOTIFICATION_CLAIM_TTL_MS;
}
|
|
315
|
+
|
|
316
|
+
/**
 * Reports whether anyone currently holds an unexpired notification claim.
 *
 * @param job - Claim bookkeeping of the job.
 * @param now - Current time in epoch milliseconds (defaults to `Date.now()`).
 * @returns `true` when a claimant is recorded, the claim timestamp parses,
 *   and the claim is younger than `ORACLE_NOTIFICATION_CLAIM_TTL_MS`.
 */
function notificationClaimIsLive(job: Pick<OracleJob, "notifyClaimedAt" | "notifyClaimedBy">, now = Date.now()): boolean {
  if (!job.notifyClaimedBy) return false;
  const claimedAtMs = parseTimestamp(job.notifyClaimedAt);
  return claimedAtMs !== undefined && now - claimedAtMs < ORACLE_NOTIFICATION_CLAIM_TTL_MS;
}
|
|
322
|
+
|
|
323
|
+
/**
 * Reports whether the most recent wake-up request is still inside the
 * post-send retention window.
 *
 * @param job - Wake-up bookkeeping of the job.
 * @param now - Current time in epoch milliseconds (defaults to `Date.now()`).
 * @returns `true` when `wakeupLastRequestedAt` parses and lies less than
 *   `ORACLE_WAKEUP_POST_SEND_RETENTION_MS` before `now`.
 */
function wakeupRetentionGraceIsActive(job: Pick<OracleJob, "wakeupLastRequestedAt">, now = Date.now()): boolean {
  const lastRequestedAtMs = parseTimestamp(job.wakeupLastRequestedAt);
  return lastRequestedAtMs !== undefined && now - lastRequestedAtMs < ORACLE_WAKEUP_POST_SEND_RETENTION_MS;
}
|
|
328
|
+
|
|
329
|
+
/**
 * Looks up the delay to wait before the next wake-up attempt.
 *
 * The attempt count is truncated to an integer and clamped to the bounds of
 * `ORACLE_WAKEUP_RETRY_DELAYS_MS`. Counts past the end reuse the last entry;
 * negative or NaN counts use the first entry instead of falling through to
 * the longest delay.
 *
 * @param attemptCount - Number of wake-up attempts already made.
 * @returns The delay in milliseconds for that attempt count.
 */
export function getWakeupRetryDelayMs(attemptCount: number): number {
  const lastIndex = ORACLE_WAKEUP_RETRY_DELAYS_MS.length - 1;
  // `|| 0` maps NaN (and -0) to 0; Infinity survives and is clamped to lastIndex.
  const index = Math.max(0, Math.min(Math.trunc(attemptCount) || 0, lastIndex));
  return ORACLE_WAKEUP_RETRY_DELAYS_MS[index] ?? 0;
}
|
|
332
|
+
|
|
333
|
+
/**
 * Decides whether another wake-up request should be issued for a job.
 *
 * @param job - Wake-up bookkeeping of the job.
 * @param now - Current time in epoch milliseconds (defaults to `Date.now()`).
 * @returns `false` once the wake-up is settled or `ORACLE_WAKEUP_MAX_ATTEMPTS`
 *   is reached; `true` when no parseable previous request exists; otherwise
 *   whether the retry delay for the current attempt count has elapsed.
 */
export function shouldRequestWakeup(job: Pick<OracleJob, "wakeupAttemptCount" | "wakeupLastRequestedAt" | "wakeupSettledAt">, now = Date.now()): boolean {
  if (job.wakeupSettledAt) return false;

  const attempts = job.wakeupAttemptCount ?? 0;
  if (attempts >= ORACLE_WAKEUP_MAX_ATTEMPTS) return false;

  const lastRequestedAtMs = parseTimestamp(job.wakeupLastRequestedAt);
  if (lastRequestedAtMs === undefined) return true;

  const elapsedMs = now - lastRequestedAtMs;
  return elapsedMs >= getWakeupRetryDelayMs(attempts);
}
|
|
341
|
+
|
|
227
342
|
export function withJobPhase<T extends Pick<OracleJob, "phase" | "phaseAt">>(
|
|
228
343
|
phase: OracleJobPhase,
|
|
229
344
|
patch?: Omit<Partial<OracleJob>, "phase" | "phaseAt">,
|
|
@@ -237,7 +352,7 @@ export function withJobPhase<T extends Pick<OracleJob, "phase" | "phaseAt">>(
|
|
|
237
352
|
}
|
|
238
353
|
|
|
239
354
|
function isTerminalOracleJobStatus(status: OracleJobStatus): boolean {
|
|
240
|
-
return status
|
|
355
|
+
return TERMINAL_ORACLE_JOB_STATUSES.includes(status);
|
|
241
356
|
}
|
|
242
357
|
|
|
243
358
|
export async function terminateWorkerPid(
|
|
@@ -312,15 +427,62 @@ export function getStaleOracleJobReason(job: OracleJob, now = Date.now()): strin
|
|
|
312
427
|
return undefined;
|
|
313
428
|
}
|
|
314
429
|
|
|
430
|
+
/**
 * Explains why the cleanup of a terminal job should be considered stale.
 *
 * Only terminal jobs with cleanup pending or with recorded cleanup warnings
 * are examined. The age baseline is the first parseable timestamp among
 * `lastCleanupAt`, `heartbeatAt`, `completedAt`, `phaseAt`, and `createdAt`.
 *
 * @param job - Status, cleanup, timestamp, and worker bookkeeping of the job.
 * @param now - Current time in epoch milliseconds (defaults to `Date.now()`).
 * @returns A human-readable reason, or `undefined` when the cleanup is not
 *   stale (including when no worker PID is recorded).
 */
function getTerminalCleanupStaleReason(job: Pick<OracleJob, "status" | "cleanupPending" | "cleanupWarnings" | "lastCleanupAt" | "heartbeatAt" | "completedAt" | "phaseAt" | "createdAt" | "workerPid" | "workerStartedAt">, now = Date.now()): string | undefined {
  if (!isTerminalOracleJob(job)) return undefined;
  if (!job.cleanupPending && !job.cleanupWarnings?.length) return undefined;

  const baselineMs =
    parseTimestamp(job.lastCleanupAt) ??
    parseTimestamp(job.heartbeatAt) ??
    parseTimestamp(job.completedAt) ??
    parseTimestamp(job.phaseAt) ??
    parseTimestamp(job.createdAt);
  if (baselineMs === undefined) return "Oracle terminal cleanup has no valid timestamps";
  if (!job.workerPid) return undefined;

  // Compare the live process identity against what was recorded for the worker.
  const currentStartedAt = readProcessStartedAt(job.workerPid);
  if (!currentStartedAt) {
    return `Oracle terminal cleanup worker PID ${job.workerPid} is no longer running`;
  }

  if (job.workerStartedAt && currentStartedAt !== job.workerStartedAt) {
    return `Oracle terminal cleanup worker PID ${job.workerPid} no longer matches the recorded process identity`;
  }

  const ageMs = now - baselineMs;
  if (ageMs > ORACLE_STALE_HEARTBEAT_MS) {
    return `Oracle terminal cleanup is stale (${Math.round(ageMs / 1000)}s since last update)`;
  }

  return undefined;
}
|
|
458
|
+
|
|
315
459
|
export async function cleanupJobResources(
|
|
316
|
-
job: Pick<OracleJob, "runtimeId" | "runtimeProfileDir" | "runtimeSessionName" | "conversationId">,
|
|
460
|
+
job: Pick<OracleJob, "submittedAt" | "runtimeId" | "runtimeProfileDir" | "runtimeSessionName" | "conversationId" | "archivePath" | "archiveDeletedAfterUpload">,
|
|
317
461
|
): Promise<OracleCleanupReport> {
|
|
318
|
-
|
|
462
|
+
const report: OracleCleanupReport = { attempted: [], warnings: [] };
|
|
463
|
+
|
|
464
|
+
if (hasRetainedPreSubmitArchive(job)) {
|
|
465
|
+
report.attempted.push("queuedArchive");
|
|
466
|
+
await rm(job.archivePath, { force: true }).catch((error: Error) => {
|
|
467
|
+
report.warnings.push(`Failed to remove queued archive ${job.archivePath}: ${error.message}`);
|
|
468
|
+
});
|
|
469
|
+
}
|
|
470
|
+
|
|
471
|
+
if (!job.submittedAt) {
|
|
472
|
+
return report;
|
|
473
|
+
}
|
|
474
|
+
|
|
475
|
+
const runtimeReport = await cleanupRuntimeArtifacts({
|
|
319
476
|
runtimeId: job.runtimeId,
|
|
320
477
|
runtimeProfileDir: job.runtimeProfileDir,
|
|
321
478
|
runtimeSessionName: job.runtimeSessionName,
|
|
322
479
|
conversationId: job.conversationId,
|
|
323
480
|
});
|
|
481
|
+
|
|
482
|
+
return {
|
|
483
|
+
attempted: [...report.attempted, ...runtimeReport.attempted],
|
|
484
|
+
warnings: [...report.warnings, ...runtimeReport.warnings],
|
|
485
|
+
};
|
|
324
486
|
}
|
|
325
487
|
|
|
326
488
|
function getCleanupRetentionMs(job: OracleJob): { complete: number; failed: number } {
|
|
@@ -330,15 +492,18 @@ function getCleanupRetentionMs(job: OracleJob): { complete: number; failed: numb
|
|
|
330
492
|
};
|
|
331
493
|
}
|
|
332
494
|
|
|
333
|
-
function shouldPruneTerminalJob(job: OracleJob, now = Date.now()): boolean {
|
|
495
|
+
export function shouldPruneTerminalJob(job: OracleJob, now = Date.now()): boolean {
|
|
334
496
|
if (!isTerminalOracleJobStatus(job.status)) return false;
|
|
497
|
+
if (job.cleanupPending || job.cleanupWarnings?.length) return false;
|
|
498
|
+
if (notificationClaimIsLive(job, now)) return false;
|
|
499
|
+
if (wakeupRetentionGraceIsActive(job, now)) return false;
|
|
335
500
|
const completedMs = parseTimestamp(job.completedAt) ?? parseTimestamp(job.createdAt);
|
|
336
501
|
if (completedMs === undefined) return false;
|
|
337
502
|
const ageMs = now - completedMs;
|
|
338
503
|
|
|
339
504
|
const retention = getCleanupRetentionMs(job);
|
|
340
505
|
|
|
341
|
-
if (
|
|
506
|
+
if (job.status === "complete" || job.status === "cancelled") {
|
|
342
507
|
return ageMs >= retention.complete;
|
|
343
508
|
}
|
|
344
509
|
|
|
@@ -350,10 +515,54 @@ function shouldPruneTerminalJob(job: OracleJob, now = Date.now()): boolean {
|
|
|
350
515
|
}
|
|
351
516
|
|
|
352
517
|
export async function removeTerminalOracleJob(job: OracleJob): Promise<{ removed: boolean; cleanupReport: OracleCleanupReport }> {
|
|
353
|
-
if (
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
518
|
+
if (!isTerminalOracleJob(job)) return { removed: false, cleanupReport: { attempted: [], warnings: [] } };
|
|
519
|
+
|
|
520
|
+
return withJobLock(job.id, { processPid: process.pid, action: "removeTerminalOracleJob" }, async () => {
|
|
521
|
+
const current = readJob(job.id);
|
|
522
|
+
if (!current) return { removed: true, cleanupReport: { attempted: [], warnings: [] } };
|
|
523
|
+
if (!isTerminalOracleJob(current)) return { removed: false, cleanupReport: { attempted: [], warnings: [] } };
|
|
524
|
+
if (notificationClaimIsLive(current)) {
|
|
525
|
+
return {
|
|
526
|
+
removed: false,
|
|
527
|
+
cleanupReport: {
|
|
528
|
+
attempted: [],
|
|
529
|
+
warnings: [`Refusing to remove terminal oracle job ${current.id} while a notification delivery is in flight.`],
|
|
530
|
+
},
|
|
531
|
+
};
|
|
532
|
+
}
|
|
533
|
+
if (wakeupRetentionGraceIsActive(current)) {
|
|
534
|
+
return {
|
|
535
|
+
removed: false,
|
|
536
|
+
cleanupReport: {
|
|
537
|
+
attempted: [],
|
|
538
|
+
warnings: [`Refusing to remove terminal oracle job ${current.id} because its wake-up delivery is still within the post-send retention grace window.`],
|
|
539
|
+
},
|
|
540
|
+
};
|
|
541
|
+
}
|
|
542
|
+
if (current.workerPid && isWorkerProcessAlive(current.workerPid, current.workerStartedAt)) {
|
|
543
|
+
return {
|
|
544
|
+
removed: false,
|
|
545
|
+
cleanupReport: {
|
|
546
|
+
attempted: [],
|
|
547
|
+
warnings: [`Refusing to remove terminal oracle job ${current.id} while worker PID ${current.workerPid} is still live.`],
|
|
548
|
+
},
|
|
549
|
+
};
|
|
550
|
+
}
|
|
551
|
+
|
|
552
|
+
const cleanupReport = await cleanupJobResources(current);
|
|
553
|
+
if (cleanupReport.warnings.length > 0) {
|
|
554
|
+
await writeJobUnlocked({
|
|
555
|
+
...current,
|
|
556
|
+
cleanupPending: false,
|
|
557
|
+
cleanupWarnings: [...(current.cleanupWarnings || []), ...cleanupReport.warnings],
|
|
558
|
+
lastCleanupAt: new Date().toISOString(),
|
|
559
|
+
error: [current.error, ...cleanupReport.warnings].filter(Boolean).join("\n"),
|
|
560
|
+
});
|
|
561
|
+
return { removed: false, cleanupReport };
|
|
562
|
+
}
|
|
563
|
+
await rm(getJobDir(current.id), { recursive: true, force: true });
|
|
564
|
+
return { removed: true, cleanupReport };
|
|
565
|
+
});
|
|
357
566
|
}
|
|
358
567
|
|
|
359
568
|
export async function pruneTerminalOracleJobs(now = Date.now()): Promise<string[]> {
|
|
@@ -374,43 +583,108 @@ export async function pruneTerminalOracleJobs(now = Date.now()): Promise<string[
|
|
|
374
583
|
export async function reconcileStaleOracleJobs(): Promise<OracleJob[]> {
|
|
375
584
|
const repaired: OracleJob[] = [];
|
|
376
585
|
const now = Date.now();
|
|
586
|
+
const recoveredAt = new Date(now).toISOString();
|
|
377
587
|
|
|
378
588
|
for (const jobDir of listOracleJobDirs()) {
|
|
379
589
|
const job = readJob(jobDir);
|
|
380
590
|
if (!job) continue;
|
|
591
|
+
|
|
592
|
+
if (isTerminalOracleJob(job) && (job.cleanupPending || job.cleanupWarnings?.length)) {
|
|
593
|
+
let cleanupTarget: OracleJob | undefined;
|
|
594
|
+
let blockedWarning: string | undefined;
|
|
595
|
+
|
|
596
|
+
await withJobLock(job.id, { processPid: process.pid, action: "reconcileTerminalCleanupJob" }, async () => {
|
|
597
|
+
const current = readJob(job.id);
|
|
598
|
+
if (!current || !isTerminalOracleJob(current) || (!current.cleanupPending && !current.cleanupWarnings?.length)) return;
|
|
599
|
+
|
|
600
|
+
if (current.workerPid && isWorkerProcessAlive(current.workerPid, current.workerStartedAt)) {
|
|
601
|
+
const staleCleanupReason = getTerminalCleanupStaleReason(current, now);
|
|
602
|
+
if (!staleCleanupReason) return;
|
|
603
|
+
const terminated = await terminateWorkerPid(current.workerPid, current.workerStartedAt);
|
|
604
|
+
if (!terminated) {
|
|
605
|
+
blockedWarning = `Oracle terminal cleanup is blocked because worker PID ${current.workerPid} could not be terminated safely after ${staleCleanupReason}.`;
|
|
606
|
+
return;
|
|
607
|
+
}
|
|
608
|
+
}
|
|
609
|
+
|
|
610
|
+
cleanupTarget = current;
|
|
611
|
+
});
|
|
612
|
+
|
|
613
|
+
if (blockedWarning) {
|
|
614
|
+
const blocked = await appendCleanupWarnings(job.id, [blockedWarning], recoveredAt);
|
|
615
|
+
if (blocked) repaired.push(blocked);
|
|
616
|
+
continue;
|
|
617
|
+
}
|
|
618
|
+
if (!cleanupTarget) continue;
|
|
619
|
+
|
|
620
|
+
const cleanupReport = await cleanupJobResources(cleanupTarget);
|
|
621
|
+
if (cleanupReport.warnings.length > 0) {
|
|
622
|
+
const withWarnings = await appendCleanupWarnings(job.id, cleanupReport.warnings, recoveredAt);
|
|
623
|
+
if (withWarnings) repaired.push(withWarnings);
|
|
624
|
+
} else {
|
|
625
|
+
const recoveredJob = await clearCleanupPending(job.id, recoveredAt);
|
|
626
|
+
if (recoveredJob) repaired.push(recoveredJob);
|
|
627
|
+
}
|
|
628
|
+
continue;
|
|
629
|
+
}
|
|
630
|
+
|
|
381
631
|
const staleReason = getStaleOracleJobReason(job, now);
|
|
382
632
|
if (!staleReason) continue;
|
|
383
633
|
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
634
|
+
let terminated = false;
|
|
635
|
+
let transitioned = false;
|
|
636
|
+
let repairedJob: OracleJob | undefined;
|
|
637
|
+
|
|
638
|
+
await withJobLock(job.id, { processPid: process.pid, action: "reconcileStaleOracleJob" }, async () => {
|
|
639
|
+
const current = readJob(job.id);
|
|
640
|
+
if (!current) return;
|
|
641
|
+
const currentStaleReason = getStaleOracleJobReason(current, now);
|
|
642
|
+
if (!currentStaleReason) return;
|
|
643
|
+
|
|
644
|
+
terminated = await terminateWorkerPid(current.workerPid, current.workerStartedAt);
|
|
645
|
+
transitioned = true;
|
|
646
|
+
const suffix = current.workerPid
|
|
647
|
+
? terminated
|
|
648
|
+
? ` Terminated stale worker PID ${current.workerPid}.`
|
|
649
|
+
: ` Failed to terminate stale worker PID ${current.workerPid}.`
|
|
650
|
+
: "";
|
|
651
|
+
repairedJob = {
|
|
652
|
+
...current,
|
|
653
|
+
...withJobPhase("failed", {
|
|
654
|
+
status: "failed",
|
|
655
|
+
completedAt: recoveredAt,
|
|
656
|
+
heartbeatAt: recoveredAt,
|
|
657
|
+
notifyClaimedAt: undefined,
|
|
658
|
+
notifyClaimedBy: undefined,
|
|
659
|
+
cleanupPending: terminated,
|
|
660
|
+
error: current.error
|
|
661
|
+
? `${current.error}\nRecovered stale job: ${currentStaleReason}.${suffix}`.trim()
|
|
662
|
+
: `Recovered stale job: ${currentStaleReason}.${suffix}`.trim(),
|
|
663
|
+
}, recoveredAt),
|
|
664
|
+
};
|
|
665
|
+
await writeJobUnlocked(repairedJob);
|
|
666
|
+
});
|
|
667
|
+
|
|
668
|
+
if (!transitioned || !repairedJob || !isTerminalOracleJob(repairedJob)) continue;
|
|
669
|
+
|
|
670
|
+
if (!terminated) {
|
|
671
|
+
const cleanupWarnings = [
|
|
672
|
+
`Oracle runtime cleanup is blocked because worker PID ${job.workerPid ?? "unknown"} could not be terminated safely.`,
|
|
673
|
+
];
|
|
674
|
+
const blocked = await appendCleanupWarnings(repairedJob.id, cleanupWarnings, recoveredAt);
|
|
675
|
+
repaired.push(blocked ?? repairedJob);
|
|
676
|
+
continue;
|
|
677
|
+
}
|
|
390
678
|
|
|
391
|
-
const repairedJob = await updateJob(job.id, (current) => ({
|
|
392
|
-
...current,
|
|
393
|
-
...withJobPhase("failed", {
|
|
394
|
-
status: "failed",
|
|
395
|
-
completedAt: new Date(now).toISOString(),
|
|
396
|
-
heartbeatAt: new Date(now).toISOString(),
|
|
397
|
-
notifyClaimedAt: undefined,
|
|
398
|
-
notifyClaimedBy: undefined,
|
|
399
|
-
error: current.error
|
|
400
|
-
? `${current.error}\nRecovered stale job: ${staleReason}.${suffix}`.trim()
|
|
401
|
-
: `Recovered stale job: ${staleReason}.${suffix}`.trim(),
|
|
402
|
-
}, new Date(now).toISOString()),
|
|
403
|
-
}));
|
|
404
679
|
const cleanupReport = await cleanupJobResources(repairedJob);
|
|
405
680
|
if (cleanupReport.warnings.length > 0) {
|
|
406
|
-
await
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
lastCleanupAt: new Date(now).toISOString(),
|
|
410
|
-
error: [current.error, ...cleanupReport.warnings].filter(Boolean).join("\n"),
|
|
411
|
-
}));
|
|
681
|
+
const withWarnings = await appendCleanupWarnings(repairedJob.id, cleanupReport.warnings, recoveredAt);
|
|
682
|
+
repaired.push(withWarnings ?? repairedJob);
|
|
683
|
+
continue;
|
|
412
684
|
}
|
|
413
|
-
|
|
685
|
+
|
|
686
|
+
const finalized = await clearCleanupPending(repairedJob.id, recoveredAt);
|
|
687
|
+
repaired.push(finalized ?? repairedJob);
|
|
414
688
|
}
|
|
415
689
|
|
|
416
690
|
return repaired;
|
|
@@ -427,6 +701,10 @@ export async function tryClaimNotification(jobId: string, claimedBy: string, now
|
|
|
427
701
|
if (!current) return undefined;
|
|
428
702
|
if (!isTerminalOracleJobStatus(current.status)) return undefined;
|
|
429
703
|
if (current.notifiedAt) return undefined;
|
|
704
|
+
if (!hasPersistedOriginSession(current)) return undefined;
|
|
705
|
+
const nowMs = parseTimestamp(now) ?? Date.now();
|
|
706
|
+
if (shouldPruneTerminalJob(current, nowMs)) return undefined;
|
|
707
|
+
if (!shouldRequestWakeup(current, nowMs)) return undefined;
|
|
430
708
|
|
|
431
709
|
const claimedAtMs = parseTimestamp(current.notifyClaimedAt);
|
|
432
710
|
const claimIsLive =
|
|
@@ -446,13 +724,50 @@ export async function tryClaimNotification(jobId: string, claimedBy: string, now
|
|
|
446
724
|
});
|
|
447
725
|
}
|
|
448
726
|
|
|
449
|
-
export async function
|
|
727
|
+
/**
 * Stores which session a notification is being delivered to, under the job lock.
 *
 * @param jobId - Identifier of the job to update.
 * @param claimedBy - Identity that must hold the live notification claim.
 * @param options - Session key (required) and session file (optional) to record.
 * @returns The updated job, or the stored job unchanged when it has already
 *   been notified.
 * @throws Error when the job does not exist, or when `claimedBy` does not
 *   own an unexpired notification claim on it.
 */
export async function recordNotificationTarget(
  jobId: string,
  claimedBy: string,
  options: { notificationSessionKey: string; notificationSessionFile?: string },
): Promise<OracleJob> {
  return withJobLock(jobId, { processPid: process.pid, action: "recordNotificationTarget", claimedBy }, async () => {
    const current = readJob(jobId);
    if (!current) throw new Error(`Oracle job not found: ${jobId}`);
    // Already delivered: nothing left to record.
    if (current.notifiedAt) return current;
    if (!notificationClaimIsOwnedBy(current, claimedBy)) {
      throw new Error(`Oracle notification claim is not owned by ${claimedBy}: ${jobId}`);
    }

    const { notificationSessionKey, notificationSessionFile } = options;
    const next: OracleJob = { ...current, notificationSessionKey, notificationSessionFile };
    await writeJobUnlocked(next);
    return next;
  });
}
|
|
748
|
+
|
|
749
|
+
export async function markJobNotified(
|
|
750
|
+
jobId: string,
|
|
751
|
+
claimedBy: string,
|
|
752
|
+
options?: { at?: string; notificationEntryId?: string; notificationSessionKey?: string; notificationSessionFile?: string },
|
|
753
|
+
): Promise<OracleJob> {
|
|
754
|
+
const at = options?.at ?? new Date().toISOString();
|
|
450
755
|
return withJobLock(jobId, { processPid: process.pid, action: "markJobNotified", claimedBy }, async () => {
|
|
451
756
|
const current = readJob(jobId);
|
|
452
757
|
if (!current) throw new Error(`Oracle job not found: ${jobId}`);
|
|
758
|
+
if (current.notifiedAt) return current;
|
|
759
|
+
if (!notificationClaimIsOwnedBy(current, claimedBy)) {
|
|
760
|
+
throw new Error(`Oracle notification claim is not owned by ${claimedBy}: ${jobId}`);
|
|
761
|
+
}
|
|
453
762
|
const next: OracleJob = {
|
|
454
763
|
...current,
|
|
455
|
-
notifiedAt:
|
|
764
|
+
notifiedAt: at,
|
|
765
|
+
notificationEntryId: options?.notificationEntryId ?? current.notificationEntryId,
|
|
766
|
+
notificationSessionKey: options?.notificationSessionKey ?? current.notificationSessionKey,
|
|
767
|
+
notificationSessionFile: options?.notificationSessionFile ?? current.notificationSessionFile,
|
|
768
|
+
wakeupAttemptCount: 0,
|
|
769
|
+
wakeupLastRequestedAt: undefined,
|
|
770
|
+
wakeupSettledAt: undefined,
|
|
456
771
|
notifyClaimedAt: undefined,
|
|
457
772
|
notifyClaimedBy: undefined,
|
|
458
773
|
};
|
|
@@ -476,33 +791,151 @@ export async function releaseNotificationClaim(jobId: string, claimedBy: string)
|
|
|
476
791
|
});
|
|
477
792
|
}
|
|
478
793
|
|
|
794
|
+
/**
 * Records that a wake-up request was issued: bumps `wakeupAttemptCount` by
 * one (starting from zero when unset) and stamps `wakeupLastRequestedAt`.
 *
 * @param jobId - Identifier of the job to update.
 * @param at - ISO timestamp of the request (defaults to now).
 * @returns The updated job, or whatever `readJob` yields when the update throws.
 */
export async function noteWakeupRequested(jobId: string, at = new Date().toISOString()): Promise<OracleJob | undefined> {
  try {
    return await updateJob(jobId, (job) => {
      const previousAttempts = job.wakeupAttemptCount ?? 0;
      return { ...job, wakeupAttemptCount: previousAttempts + 1, wakeupLastRequestedAt: at };
    });
  } catch {
    // Best effort: fall back to the job as currently stored.
    return readJob(jobId);
  }
}
|
|
805
|
+
|
|
806
|
+
/**
 * Builds the `<projectId>::<sessionId>` key for a session.
 *
 * @param sessionFile - Session file passed to `getSessionId`.
 * @param cwd - Working directory passed to `getProjectId`.
 * @returns The composed key, or `undefined` when either input is missing or empty.
 */
function getWakeupSessionKey(sessionFile: string | undefined, cwd: string | undefined): string | undefined {
  if (!sessionFile || !cwd) return undefined;
  const projectId = getProjectId(cwd);
  const sessionId = getSessionId(sessionFile, projectId);
  return `${projectId}::${sessionId}`;
}
|
|
811
|
+
|
|
812
|
+
/**
 * Marks a job's wake-up as settled, or merely observed, depending on state.
 *
 * - Already settled: keeps the existing settlement and only fills in
 *   settlement fields that are still unset.
 * - No wake-up attempt made yet and `allowBeforeFirstAttempt` not set:
 *   records the first observation (`wakeupObserved*`) without settling.
 * - Otherwise: writes the settlement timestamp, source, session file,
 *   session key, and whether it happened before the first attempt.
 *
 * @param jobId - Identifier of the job to update.
 * @param options - Settlement source, optional session file / cwd used to
 *   derive the session key, optional timestamp (defaults to now), and the
 *   opt-in to settle before any wake-up attempt was made.
 * @returns The updated job, or whatever `readJob` yields when the update throws.
 */
export async function markWakeupSettled(
  jobId: string,
  options: {
    source: OracleWakeupSettlementSource;
    sessionFile?: string;
    cwd?: string;
    at?: string;
    allowBeforeFirstAttempt?: boolean;
  },
): Promise<OracleJob | undefined> {
  const at = options.at ?? new Date().toISOString();
  const sessionKey = getWakeupSessionKey(options.sessionFile, options.cwd);

  try {
    return await updateJob(jobId, (job) => {
      const beforeFirstAttempt = !job.wakeupLastRequestedAt && (job.wakeupAttemptCount ?? 0) === 0;

      if (job.wakeupSettledAt) {
        // Never overwrite an existing settlement; only backfill missing details.
        return {
          ...job,
          wakeupSettledSource: job.wakeupSettledSource ?? options.source,
          wakeupSettledSessionFile: job.wakeupSettledSessionFile ?? options.sessionFile,
          wakeupSettledSessionKey: job.wakeupSettledSessionKey ?? sessionKey,
          wakeupSettledBeforeFirstAttempt: job.wakeupSettledBeforeFirstAttempt ?? beforeFirstAttempt,
        };
      }

      if (beforeFirstAttempt && !options.allowBeforeFirstAttempt) {
        // Keep the earliest observation; later ones do not replace it.
        return {
          ...job,
          wakeupObservedAt: job.wakeupObservedAt ?? at,
          wakeupObservedSource: job.wakeupObservedSource ?? options.source,
          wakeupObservedSessionFile: job.wakeupObservedSessionFile ?? options.sessionFile,
          wakeupObservedSessionKey: job.wakeupObservedSessionKey ?? sessionKey,
        };
      }

      return {
        ...job,
        wakeupSettledAt: at,
        wakeupSettledSource: options.source,
        wakeupSettledSessionFile: options.sessionFile,
        wakeupSettledSessionKey: sessionKey,
        wakeupSettledBeforeFirstAttempt: beforeFirstAttempt,
      };
    });
  } catch {
    // Best effort: fall back to the job as currently stored.
    return readJob(jobId);
  }
}
|
|
861
|
+
|
|
479
862
|
export async function cancelOracleJob(id: string, reason = "Cancelled by user"): Promise<OracleJob> {
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
863
|
+
return withLock("admission", "global", { processPid: process.pid, action: "cancelOracleJob", jobId: id }, async () => {
|
|
864
|
+
const current = readJob(id);
|
|
865
|
+
if (!current) throw new Error(`Oracle job not found: ${id}`);
|
|
866
|
+
if (!isOpenOracleJob(current)) return current;
|
|
867
|
+
|
|
868
|
+
const now = new Date().toISOString();
|
|
869
|
+
if (current.status === "queued") {
|
|
870
|
+
const cancelled = await updateJob(id, (job) => ({
|
|
871
|
+
...job,
|
|
872
|
+
...withJobPhase("cancelled", {
|
|
873
|
+
status: "cancelled",
|
|
874
|
+
completedAt: now,
|
|
875
|
+
heartbeatAt: now,
|
|
876
|
+
notifyClaimedAt: undefined,
|
|
877
|
+
notifyClaimedBy: undefined,
|
|
878
|
+
error: reason,
|
|
879
|
+
}, now),
|
|
880
|
+
}));
|
|
881
|
+
|
|
882
|
+
const cleanupReport = await cleanupJobResources(cancelled);
|
|
883
|
+
if (cleanupReport.warnings.length === 0) return cancelled;
|
|
884
|
+
|
|
885
|
+
return updateJob(id, (job) => ({
|
|
886
|
+
...job,
|
|
887
|
+
cleanupWarnings: [...(job.cleanupWarnings || []), ...cleanupReport.warnings],
|
|
888
|
+
lastCleanupAt: now,
|
|
889
|
+
error: [job.error, ...cleanupReport.warnings].filter(Boolean).join("\n"),
|
|
890
|
+
}));
|
|
891
|
+
}
|
|
892
|
+
|
|
893
|
+
const terminated = await terminateWorkerPid(current.workerPid, current.workerStartedAt);
|
|
894
|
+
let transitioned = false;
|
|
895
|
+
const cancelled = await updateJob(id, (job) => {
|
|
896
|
+
if (isTerminalOracleJob(job)) return job;
|
|
897
|
+
transitioned = true;
|
|
898
|
+
return {
|
|
899
|
+
...job,
|
|
900
|
+
...withJobPhase(terminated ? "cancelled" : "failed", {
|
|
901
|
+
status: terminated ? "cancelled" : "failed",
|
|
902
|
+
completedAt: now,
|
|
903
|
+
heartbeatAt: now,
|
|
904
|
+
notifyClaimedAt: undefined,
|
|
905
|
+
notifyClaimedBy: undefined,
|
|
906
|
+
cleanupPending: terminated,
|
|
907
|
+
error: terminated ? reason : `${reason}; worker PID ${job.workerPid ?? "unknown"} did not exit`,
|
|
908
|
+
}, now),
|
|
909
|
+
};
|
|
910
|
+
});
|
|
911
|
+
if (!transitioned) return cancelled;
|
|
912
|
+
|
|
913
|
+
if (!terminated) {
|
|
914
|
+
const cleanupWarnings = [
|
|
915
|
+
`Oracle runtime cleanup is blocked because worker PID ${current.workerPid ?? "unknown"} could not be terminated safely.`,
|
|
916
|
+
];
|
|
917
|
+
return updateJob(id, (job) => ({
|
|
918
|
+
...job,
|
|
919
|
+
cleanupWarnings: [...(job.cleanupWarnings || []), ...cleanupWarnings],
|
|
920
|
+
lastCleanupAt: now,
|
|
921
|
+
error: [job.error, ...cleanupWarnings].filter(Boolean).join("\n"),
|
|
922
|
+
}));
|
|
923
|
+
}
|
|
499
924
|
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
925
|
+
const cleanupReport = await cleanupJobResources(cancelled);
|
|
926
|
+
if (cleanupReport.warnings.length === 0) {
|
|
927
|
+
const finalized = await clearCleanupPending(id, now);
|
|
928
|
+
return finalized ?? cancelled;
|
|
929
|
+
}
|
|
930
|
+
|
|
931
|
+
return updateJob(id, (job) => ({
|
|
932
|
+
...job,
|
|
933
|
+
cleanupPending: false,
|
|
934
|
+
cleanupWarnings: [...(job.cleanupWarnings || []), ...cleanupReport.warnings],
|
|
935
|
+
lastCleanupAt: now,
|
|
936
|
+
error: [job.error, ...cleanupReport.warnings].filter(Boolean).join("\n"),
|
|
937
|
+
}));
|
|
938
|
+
});
|
|
506
939
|
}
|
|
507
940
|
|
|
508
941
|
export async function createJob(
|
|
@@ -512,6 +945,7 @@ export async function createJob(
|
|
|
512
945
|
originSessionFile: string | undefined,
|
|
513
946
|
config: OracleConfig,
|
|
514
947
|
runtime: OracleRuntimeAllocation,
|
|
948
|
+
options?: { initialState?: "queued" | "submitted"; createdAt?: string },
|
|
515
949
|
): Promise<OracleJob> {
|
|
516
950
|
const jobDir = getJobDir(id);
|
|
517
951
|
const logsDir = join(jobDir, "logs");
|
|
@@ -522,7 +956,8 @@ export async function createJob(
|
|
|
522
956
|
const reasoningPath = join(jobDir, "reasoning.md");
|
|
523
957
|
const artifactsManifestPath = join(jobDir, "artifacts.json");
|
|
524
958
|
const projectId = getProjectId(cwd);
|
|
525
|
-
const
|
|
959
|
+
const sessionFile = requirePersistedSessionFile(originSessionFile, "create oracle jobs");
|
|
960
|
+
const sessionId = getSessionId(sessionFile, projectId);
|
|
526
961
|
const conversationId = parseConversationId(input.chatUrl);
|
|
527
962
|
|
|
528
963
|
await mkdir(jobDir, { recursive: true, mode: 0o700 });
|
|
@@ -534,22 +969,24 @@ export async function createJob(
|
|
|
534
969
|
await writeFile(promptPath, input.prompt, { encoding: "utf8", mode: 0o600 });
|
|
535
970
|
await chmod(promptPath, 0o600).catch(() => undefined);
|
|
536
971
|
|
|
537
|
-
const
|
|
972
|
+
const createdAt = options?.createdAt ?? new Date().toISOString();
|
|
973
|
+
const initialState = options?.initialState ?? "submitted";
|
|
538
974
|
const normalizedEffort = input.modelFamily === "instant" ? undefined : (input.effort ?? config.defaults.effort);
|
|
539
975
|
const normalizedAutoSwitchToThinking = input.modelFamily === "instant"
|
|
540
976
|
? (input.autoSwitchToThinking ?? config.defaults.autoSwitchToThinking)
|
|
541
977
|
: false;
|
|
542
978
|
const job: OracleJob = {
|
|
543
979
|
id,
|
|
544
|
-
status:
|
|
545
|
-
phase:
|
|
546
|
-
phaseAt:
|
|
547
|
-
createdAt
|
|
548
|
-
|
|
980
|
+
status: initialState,
|
|
981
|
+
phase: initialState,
|
|
982
|
+
phaseAt: createdAt,
|
|
983
|
+
createdAt,
|
|
984
|
+
queuedAt: initialState === "queued" ? createdAt : undefined,
|
|
985
|
+
submittedAt: initialState === "submitted" ? createdAt : undefined,
|
|
549
986
|
cwd,
|
|
550
987
|
projectId,
|
|
551
988
|
sessionId,
|
|
552
|
-
originSessionFile,
|
|
989
|
+
originSessionFile: sessionFile,
|
|
553
990
|
requestSource: input.requestSource,
|
|
554
991
|
chatModelFamily: input.modelFamily,
|
|
555
992
|
effort: normalizedEffort,
|