pi-oracle 0.1.12 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,11 +5,12 @@ import { chmod, mkdir, readFile, rename, rm, writeFile } from "node:fs/promises"
5
5
  import { join, resolve } from "node:path";
6
6
  import type { ExtensionContext } from "@mariozechner/pi-coding-agent";
7
7
  import type { OracleConfig, OracleEffort, OracleModelFamily } from "./config.js";
8
- import { withJobLock } from "./locks.js";
9
- import { cleanupRuntimeArtifacts, getProjectId, getSessionId, parseConversationId, type OracleCleanupReport } from "./runtime.js";
8
+ import { withJobLock, withLock } from "./locks.js";
9
+ import { cleanupRuntimeArtifacts, getProjectId, getSessionId, parseConversationId, requirePersistedSessionFile, type OracleCleanupReport } from "./runtime.js";
10
10
 
11
- export type OracleJobStatus = "preparing" | "submitted" | "waiting" | "complete" | "failed" | "cancelled";
11
+ export type OracleJobStatus = "queued" | "preparing" | "submitted" | "waiting" | "complete" | "failed" | "cancelled";
12
12
  export type OracleJobPhase =
13
+ | "queued"
13
14
  | "submitted"
14
15
  | "cloning_runtime"
15
16
  | "launching_browser"
@@ -24,10 +25,17 @@ export type OracleJobPhase =
24
25
  | "failed"
25
26
  | "cancelled";
26
27
 
28
+ export type OracleWakeupSettlementSource = "oracle_read" | "oracle_status";
29
+
27
30
  export const ACTIVE_ORACLE_JOB_STATUSES: OracleJobStatus[] = ["preparing", "submitted", "waiting"];
31
+ export const OPEN_ORACLE_JOB_STATUSES: OracleJobStatus[] = ["queued", ...ACTIVE_ORACLE_JOB_STATUSES];
32
+ export const TERMINAL_ORACLE_JOB_STATUSES: OracleJobStatus[] = ["complete", "failed", "cancelled"];
28
33
  export const ORACLE_MISSING_WORKER_GRACE_MS = 30_000;
29
34
  export const ORACLE_STALE_HEARTBEAT_MS = 3 * 60 * 1000;
30
35
  export const ORACLE_NOTIFICATION_CLAIM_TTL_MS = 60_000;
36
+ export const ORACLE_WAKEUP_MAX_ATTEMPTS = 3;
37
+ export const ORACLE_WAKEUP_RETRY_DELAYS_MS = [0, 15_000, 60_000] as const;
38
+ export const ORACLE_WAKEUP_POST_SEND_RETENTION_MS = 2 * 60 * 1000;
31
39
  const ORACLE_COMPLETE_JOB_RETENTION_MS = 14 * 24 * 60 * 60 * 1000;
32
40
  const ORACLE_FAILED_JOB_RETENTION_MS = 30 * 24 * 60 * 60 * 1000;
33
41
  export const DEFAULT_ORACLE_JOBS_DIR = "/tmp";
@@ -38,6 +46,36 @@ export function isActiveOracleJob(job: Pick<OracleJob, "status">): boolean {
38
46
  return ACTIVE_ORACLE_JOB_STATUSES.includes(job.status);
39
47
  }
40
48
 
49
+ export function isOpenOracleJob(job: Pick<OracleJob, "status">): boolean {
50
+ return OPEN_ORACLE_JOB_STATUSES.includes(job.status);
51
+ }
52
+
53
+ export function isTerminalOracleJob(job: Pick<OracleJob, "status">): boolean {
54
+ return TERMINAL_ORACLE_JOB_STATUSES.includes(job.status);
55
+ }
56
+
57
+ export function shouldAdvanceQueueAfterCancellation(job: Pick<OracleJob, "status" | "cleanupWarnings" | "cleanupPending">): boolean {
58
+ return job.status === "cancelled" && !job.cleanupPending && !job.cleanupWarnings?.length;
59
+ }
60
+
61
+ export function hasRetainedPreSubmitArchive(job: Pick<OracleJob, "submittedAt" | "archiveDeletedAfterUpload" | "archivePath">): boolean {
62
+ return !job.submittedAt && !job.archiveDeletedAfterUpload && typeof job.archivePath === "string" && job.archivePath.length > 0;
63
+ }
64
+
65
+ export function hasDurableWorkerHandoff(
66
+ job: Pick<OracleJob, "status" | "phase" | "workerPid" | "workerStartedAt" | "heartbeatAt">,
67
+ ): boolean {
68
+ if (job.status === "queued") return false;
69
+ if (job.workerPid) return true;
70
+ return false;
71
+ }
72
+
73
+ export function hasPersistedOriginSession(
74
+ job: Pick<OracleJob, "originSessionFile" | "sessionId">,
75
+ ): job is Pick<OracleJob, "originSessionFile" | "sessionId"> & { originSessionFile: string } {
76
+ return typeof job.originSessionFile === "string" && job.originSessionFile.length > 0 && job.sessionId === job.originSessionFile;
77
+ }
78
+
41
79
  function readProcessStartedAt(pid: number | undefined): string | undefined {
42
80
  if (!pid || pid <= 0) return undefined;
43
81
  try {
@@ -86,6 +124,7 @@ export interface OracleJob {
86
124
  phase: OracleJobPhase;
87
125
  phaseAt: string;
88
126
  createdAt: string;
127
+ queuedAt?: string;
89
128
  submittedAt?: string;
90
129
  completedAt?: string;
91
130
  heartbeatAt?: string;
@@ -108,6 +147,20 @@ export interface OracleJob {
108
147
  archiveSha256?: string;
109
148
  archiveDeletedAfterUpload: boolean;
110
149
  notifiedAt?: string;
150
+ notificationEntryId?: string;
151
+ notificationSessionKey?: string;
152
+ notificationSessionFile?: string;
153
+ wakeupAttemptCount?: number;
154
+ wakeupLastRequestedAt?: string;
155
+ wakeupSettledAt?: string;
156
+ wakeupSettledSource?: OracleWakeupSettlementSource;
157
+ wakeupSettledSessionFile?: string;
158
+ wakeupSettledSessionKey?: string;
159
+ wakeupSettledBeforeFirstAttempt?: boolean;
160
+ wakeupObservedAt?: string;
161
+ wakeupObservedSource?: OracleWakeupSettlementSource;
162
+ wakeupObservedSessionFile?: string;
163
+ wakeupObservedSessionKey?: string;
111
164
  notifyClaimedAt?: string;
112
165
  notifyClaimedBy?: string;
113
166
  artifactFailureCount?: number;
@@ -126,6 +179,7 @@ export interface OracleJob {
126
179
  config: OracleConfig;
127
180
  cleanupWarnings?: string[];
128
181
  lastCleanupAt?: string;
182
+ cleanupPending?: boolean;
129
183
  }
130
184
 
131
185
  export interface OracleSubmitInput {
@@ -214,6 +268,34 @@ export async function updateJob(id: string, mutate: (job: OracleJob) => OracleJo
214
268
  });
215
269
  }
216
270
 
271
+ export async function appendCleanupWarnings(jobId: string, warnings: string[], at = new Date().toISOString()): Promise<OracleJob | undefined> {
272
+ if (warnings.length === 0) return readJob(jobId);
273
+ try {
274
+ return await updateJob(jobId, (job) => ({
275
+ ...job,
276
+ cleanupPending: false,
277
+ cleanupWarnings: Array.from(new Set([...(job.cleanupWarnings || []), ...warnings])),
278
+ lastCleanupAt: at,
279
+ error: [job.error, ...warnings].filter(Boolean).join("\n"),
280
+ }));
281
+ } catch {
282
+ return readJob(jobId);
283
+ }
284
+ }
285
+
286
+ export async function clearCleanupPending(jobId: string, at = new Date().toISOString()): Promise<OracleJob | undefined> {
287
+ try {
288
+ return await updateJob(jobId, (job) => ({
289
+ ...job,
290
+ cleanupPending: false,
291
+ cleanupWarnings: undefined,
292
+ lastCleanupAt: at,
293
+ }));
294
+ } catch {
295
+ return readJob(jobId);
296
+ }
297
+ }
298
+
217
299
  function sleep(ms: number): Promise<void> {
218
300
  return new Promise((resolve) => setTimeout(resolve, ms));
219
301
  }
@@ -224,6 +306,39 @@ function parseTimestamp(value: string | undefined): number | undefined {
224
306
  return Number.isFinite(parsed) ? parsed : undefined;
225
307
  }
226
308
 
309
+ function notificationClaimIsOwnedBy(job: Pick<OracleJob, "notifyClaimedAt" | "notifyClaimedBy">, claimedBy: string, now = Date.now()): boolean {
310
+ if (job.notifyClaimedBy !== claimedBy) return false;
311
+ const claimedAtMs = parseTimestamp(job.notifyClaimedAt);
312
+ if (claimedAtMs === undefined) return false;
313
+ return now - claimedAtMs < ORACLE_NOTIFICATION_CLAIM_TTL_MS;
314
+ }
315
+
316
+ function notificationClaimIsLive(job: Pick<OracleJob, "notifyClaimedAt" | "notifyClaimedBy">, now = Date.now()): boolean {
317
+ if (!job.notifyClaimedBy) return false;
318
+ const claimedAtMs = parseTimestamp(job.notifyClaimedAt);
319
+ if (claimedAtMs === undefined) return false;
320
+ return now - claimedAtMs < ORACLE_NOTIFICATION_CLAIM_TTL_MS;
321
+ }
322
+
323
+ function wakeupRetentionGraceIsActive(job: Pick<OracleJob, "wakeupLastRequestedAt">, now = Date.now()): boolean {
324
+ const lastRequestedAtMs = parseTimestamp(job.wakeupLastRequestedAt);
325
+ if (lastRequestedAtMs === undefined) return false;
326
+ return now - lastRequestedAtMs < ORACLE_WAKEUP_POST_SEND_RETENTION_MS;
327
+ }
328
+
329
+ export function getWakeupRetryDelayMs(attemptCount: number): number {
330
+ return ORACLE_WAKEUP_RETRY_DELAYS_MS[Math.min(attemptCount, ORACLE_WAKEUP_RETRY_DELAYS_MS.length - 1)] ?? ORACLE_WAKEUP_RETRY_DELAYS_MS[ORACLE_WAKEUP_RETRY_DELAYS_MS.length - 1];
331
+ }
332
+
333
+ export function shouldRequestWakeup(job: Pick<OracleJob, "wakeupAttemptCount" | "wakeupLastRequestedAt" | "wakeupSettledAt">, now = Date.now()): boolean {
334
+ if (job.wakeupSettledAt) return false;
335
+ const attempts = job.wakeupAttemptCount ?? 0;
336
+ if (attempts >= ORACLE_WAKEUP_MAX_ATTEMPTS) return false;
337
+ const lastRequestedAtMs = parseTimestamp(job.wakeupLastRequestedAt);
338
+ if (lastRequestedAtMs === undefined) return true;
339
+ return now - lastRequestedAtMs >= getWakeupRetryDelayMs(attempts);
340
+ }
341
+
227
342
  export function withJobPhase<T extends Pick<OracleJob, "phase" | "phaseAt">>(
228
343
  phase: OracleJobPhase,
229
344
  patch?: Omit<Partial<OracleJob>, "phase" | "phaseAt">,
@@ -237,7 +352,7 @@ export function withJobPhase<T extends Pick<OracleJob, "phase" | "phaseAt">>(
237
352
  }
238
353
 
239
354
  function isTerminalOracleJobStatus(status: OracleJobStatus): boolean {
240
- return status === "complete" || status === "failed" || status === "cancelled";
355
+ return TERMINAL_ORACLE_JOB_STATUSES.includes(status);
241
356
  }
242
357
 
243
358
  export async function terminateWorkerPid(
@@ -312,15 +427,62 @@ export function getStaleOracleJobReason(job: OracleJob, now = Date.now()): strin
312
427
  return undefined;
313
428
  }
314
429
 
430
+ function getTerminalCleanupStaleReason(job: Pick<OracleJob, "status" | "cleanupPending" | "cleanupWarnings" | "lastCleanupAt" | "heartbeatAt" | "completedAt" | "phaseAt" | "createdAt" | "workerPid" | "workerStartedAt">, now = Date.now()): string | undefined {
431
+ if (!isTerminalOracleJob(job)) return undefined;
432
+ if (!job.cleanupPending && !job.cleanupWarnings?.length) return undefined;
433
+
434
+ const baselineMs =
435
+ parseTimestamp(job.lastCleanupAt) ??
436
+ parseTimestamp(job.heartbeatAt) ??
437
+ parseTimestamp(job.completedAt) ??
438
+ parseTimestamp(job.phaseAt) ??
439
+ parseTimestamp(job.createdAt);
440
+ if (baselineMs === undefined) return "Oracle terminal cleanup has no valid timestamps";
441
+ if (!job.workerPid) return undefined;
442
+
443
+ const currentStartedAt = readProcessStartedAt(job.workerPid);
444
+ if (!currentStartedAt) {
445
+ return `Oracle terminal cleanup worker PID ${job.workerPid} is no longer running`;
446
+ }
447
+
448
+ if (job.workerStartedAt && currentStartedAt !== job.workerStartedAt) {
449
+ return `Oracle terminal cleanup worker PID ${job.workerPid} no longer matches the recorded process identity`;
450
+ }
451
+
452
+ if (now - baselineMs > ORACLE_STALE_HEARTBEAT_MS) {
453
+ return `Oracle terminal cleanup is stale (${Math.round((now - baselineMs) / 1000)}s since last update)`;
454
+ }
455
+
456
+ return undefined;
457
+ }
458
+
315
459
  export async function cleanupJobResources(
316
- job: Pick<OracleJob, "runtimeId" | "runtimeProfileDir" | "runtimeSessionName" | "conversationId">,
460
+ job: Pick<OracleJob, "submittedAt" | "runtimeId" | "runtimeProfileDir" | "runtimeSessionName" | "conversationId" | "archivePath" | "archiveDeletedAfterUpload">,
317
461
  ): Promise<OracleCleanupReport> {
318
- return cleanupRuntimeArtifacts({
462
+ const report: OracleCleanupReport = { attempted: [], warnings: [] };
463
+
464
+ if (hasRetainedPreSubmitArchive(job)) {
465
+ report.attempted.push("queuedArchive");
466
+ await rm(job.archivePath, { force: true }).catch((error: Error) => {
467
+ report.warnings.push(`Failed to remove queued archive ${job.archivePath}: ${error.message}`);
468
+ });
469
+ }
470
+
471
+ if (!job.submittedAt) {
472
+ return report;
473
+ }
474
+
475
+ const runtimeReport = await cleanupRuntimeArtifacts({
319
476
  runtimeId: job.runtimeId,
320
477
  runtimeProfileDir: job.runtimeProfileDir,
321
478
  runtimeSessionName: job.runtimeSessionName,
322
479
  conversationId: job.conversationId,
323
480
  });
481
+
482
+ return {
483
+ attempted: [...report.attempted, ...runtimeReport.attempted],
484
+ warnings: [...report.warnings, ...runtimeReport.warnings],
485
+ };
324
486
  }
325
487
 
326
488
  function getCleanupRetentionMs(job: OracleJob): { complete: number; failed: number } {
@@ -330,15 +492,18 @@ function getCleanupRetentionMs(job: OracleJob): { complete: number; failed: numb
330
492
  };
331
493
  }
332
494
 
333
- function shouldPruneTerminalJob(job: OracleJob, now = Date.now()): boolean {
495
+ export function shouldPruneTerminalJob(job: OracleJob, now = Date.now()): boolean {
334
496
  if (!isTerminalOracleJobStatus(job.status)) return false;
497
+ if (job.cleanupPending || job.cleanupWarnings?.length) return false;
498
+ if (notificationClaimIsLive(job, now)) return false;
499
+ if (wakeupRetentionGraceIsActive(job, now)) return false;
335
500
  const completedMs = parseTimestamp(job.completedAt) ?? parseTimestamp(job.createdAt);
336
501
  if (completedMs === undefined) return false;
337
502
  const ageMs = now - completedMs;
338
503
 
339
504
  const retention = getCleanupRetentionMs(job);
340
505
 
341
- if ((job.status === "complete" || job.status === "cancelled") && job.notifiedAt) {
506
+ if (job.status === "complete" || job.status === "cancelled") {
342
507
  return ageMs >= retention.complete;
343
508
  }
344
509
 
@@ -350,10 +515,54 @@ function shouldPruneTerminalJob(job: OracleJob, now = Date.now()): boolean {
350
515
  }
351
516
 
352
517
  export async function removeTerminalOracleJob(job: OracleJob): Promise<{ removed: boolean; cleanupReport: OracleCleanupReport }> {
353
- if (isActiveOracleJob(job)) return { removed: false, cleanupReport: { attempted: [], warnings: [] } };
354
- const cleanupReport = await cleanupJobResources(job);
355
- await rm(getJobDir(job.id), { recursive: true, force: true });
356
- return { removed: true, cleanupReport };
518
+ if (!isTerminalOracleJob(job)) return { removed: false, cleanupReport: { attempted: [], warnings: [] } };
519
+
520
+ return withJobLock(job.id, { processPid: process.pid, action: "removeTerminalOracleJob" }, async () => {
521
+ const current = readJob(job.id);
522
+ if (!current) return { removed: true, cleanupReport: { attempted: [], warnings: [] } };
523
+ if (!isTerminalOracleJob(current)) return { removed: false, cleanupReport: { attempted: [], warnings: [] } };
524
+ if (notificationClaimIsLive(current)) {
525
+ return {
526
+ removed: false,
527
+ cleanupReport: {
528
+ attempted: [],
529
+ warnings: [`Refusing to remove terminal oracle job ${current.id} while a notification delivery is in flight.`],
530
+ },
531
+ };
532
+ }
533
+ if (wakeupRetentionGraceIsActive(current)) {
534
+ return {
535
+ removed: false,
536
+ cleanupReport: {
537
+ attempted: [],
538
+ warnings: [`Refusing to remove terminal oracle job ${current.id} because its wake-up delivery is still within the post-send retention grace window.`],
539
+ },
540
+ };
541
+ }
542
+ if (current.workerPid && isWorkerProcessAlive(current.workerPid, current.workerStartedAt)) {
543
+ return {
544
+ removed: false,
545
+ cleanupReport: {
546
+ attempted: [],
547
+ warnings: [`Refusing to remove terminal oracle job ${current.id} while worker PID ${current.workerPid} is still live.`],
548
+ },
549
+ };
550
+ }
551
+
552
+ const cleanupReport = await cleanupJobResources(current);
553
+ if (cleanupReport.warnings.length > 0) {
554
+ await writeJobUnlocked({
555
+ ...current,
556
+ cleanupPending: false,
557
+ cleanupWarnings: [...(current.cleanupWarnings || []), ...cleanupReport.warnings],
558
+ lastCleanupAt: new Date().toISOString(),
559
+ error: [current.error, ...cleanupReport.warnings].filter(Boolean).join("\n"),
560
+ });
561
+ return { removed: false, cleanupReport };
562
+ }
563
+ await rm(getJobDir(current.id), { recursive: true, force: true });
564
+ return { removed: true, cleanupReport };
565
+ });
357
566
  }
358
567
 
359
568
  export async function pruneTerminalOracleJobs(now = Date.now()): Promise<string[]> {
@@ -374,43 +583,108 @@ export async function pruneTerminalOracleJobs(now = Date.now()): Promise<string[
374
583
  export async function reconcileStaleOracleJobs(): Promise<OracleJob[]> {
375
584
  const repaired: OracleJob[] = [];
376
585
  const now = Date.now();
586
+ const recoveredAt = new Date(now).toISOString();
377
587
 
378
588
  for (const jobDir of listOracleJobDirs()) {
379
589
  const job = readJob(jobDir);
380
590
  if (!job) continue;
591
+
592
+ if (isTerminalOracleJob(job) && (job.cleanupPending || job.cleanupWarnings?.length)) {
593
+ let cleanupTarget: OracleJob | undefined;
594
+ let blockedWarning: string | undefined;
595
+
596
+ await withJobLock(job.id, { processPid: process.pid, action: "reconcileTerminalCleanupJob" }, async () => {
597
+ const current = readJob(job.id);
598
+ if (!current || !isTerminalOracleJob(current) || (!current.cleanupPending && !current.cleanupWarnings?.length)) return;
599
+
600
+ if (current.workerPid && isWorkerProcessAlive(current.workerPid, current.workerStartedAt)) {
601
+ const staleCleanupReason = getTerminalCleanupStaleReason(current, now);
602
+ if (!staleCleanupReason) return;
603
+ const terminated = await terminateWorkerPid(current.workerPid, current.workerStartedAt);
604
+ if (!terminated) {
605
+ blockedWarning = `Oracle terminal cleanup is blocked because worker PID ${current.workerPid} could not be terminated safely after ${staleCleanupReason}.`;
606
+ return;
607
+ }
608
+ }
609
+
610
+ cleanupTarget = current;
611
+ });
612
+
613
+ if (blockedWarning) {
614
+ const blocked = await appendCleanupWarnings(job.id, [blockedWarning], recoveredAt);
615
+ if (blocked) repaired.push(blocked);
616
+ continue;
617
+ }
618
+ if (!cleanupTarget) continue;
619
+
620
+ const cleanupReport = await cleanupJobResources(cleanupTarget);
621
+ if (cleanupReport.warnings.length > 0) {
622
+ const withWarnings = await appendCleanupWarnings(job.id, cleanupReport.warnings, recoveredAt);
623
+ if (withWarnings) repaired.push(withWarnings);
624
+ } else {
625
+ const recoveredJob = await clearCleanupPending(job.id, recoveredAt);
626
+ if (recoveredJob) repaired.push(recoveredJob);
627
+ }
628
+ continue;
629
+ }
630
+
381
631
  const staleReason = getStaleOracleJobReason(job, now);
382
632
  if (!staleReason) continue;
383
633
 
384
- const terminated = await terminateWorkerPid(job.workerPid, job.workerStartedAt);
385
- const suffix = job.workerPid
386
- ? terminated
387
- ? ` Terminated stale worker PID ${job.workerPid}.`
388
- : ` Failed to terminate stale worker PID ${job.workerPid}.`
389
- : "";
634
+ let terminated = false;
635
+ let transitioned = false;
636
+ let repairedJob: OracleJob | undefined;
637
+
638
+ await withJobLock(job.id, { processPid: process.pid, action: "reconcileStaleOracleJob" }, async () => {
639
+ const current = readJob(job.id);
640
+ if (!current) return;
641
+ const currentStaleReason = getStaleOracleJobReason(current, now);
642
+ if (!currentStaleReason) return;
643
+
644
+ terminated = await terminateWorkerPid(current.workerPid, current.workerStartedAt);
645
+ transitioned = true;
646
+ const suffix = current.workerPid
647
+ ? terminated
648
+ ? ` Terminated stale worker PID ${current.workerPid}.`
649
+ : ` Failed to terminate stale worker PID ${current.workerPid}.`
650
+ : "";
651
+ repairedJob = {
652
+ ...current,
653
+ ...withJobPhase("failed", {
654
+ status: "failed",
655
+ completedAt: recoveredAt,
656
+ heartbeatAt: recoveredAt,
657
+ notifyClaimedAt: undefined,
658
+ notifyClaimedBy: undefined,
659
+ cleanupPending: terminated,
660
+ error: current.error
661
+ ? `${current.error}\nRecovered stale job: ${currentStaleReason}.${suffix}`.trim()
662
+ : `Recovered stale job: ${currentStaleReason}.${suffix}`.trim(),
663
+ }, recoveredAt),
664
+ };
665
+ await writeJobUnlocked(repairedJob);
666
+ });
667
+
668
+ if (!transitioned || !repairedJob || !isTerminalOracleJob(repairedJob)) continue;
669
+
670
+ if (!terminated) {
671
+ const cleanupWarnings = [
672
+ `Oracle runtime cleanup is blocked because worker PID ${job.workerPid ?? "unknown"} could not be terminated safely.`,
673
+ ];
674
+ const blocked = await appendCleanupWarnings(repairedJob.id, cleanupWarnings, recoveredAt);
675
+ repaired.push(blocked ?? repairedJob);
676
+ continue;
677
+ }
390
678
 
391
- const repairedJob = await updateJob(job.id, (current) => ({
392
- ...current,
393
- ...withJobPhase("failed", {
394
- status: "failed",
395
- completedAt: new Date(now).toISOString(),
396
- heartbeatAt: new Date(now).toISOString(),
397
- notifyClaimedAt: undefined,
398
- notifyClaimedBy: undefined,
399
- error: current.error
400
- ? `${current.error}\nRecovered stale job: ${staleReason}.${suffix}`.trim()
401
- : `Recovered stale job: ${staleReason}.${suffix}`.trim(),
402
- }, new Date(now).toISOString()),
403
- }));
404
679
  const cleanupReport = await cleanupJobResources(repairedJob);
405
680
  if (cleanupReport.warnings.length > 0) {
406
- await updateJob(repairedJob.id, (current) => ({
407
- ...current,
408
- cleanupWarnings: [...(current.cleanupWarnings || []), ...cleanupReport.warnings],
409
- lastCleanupAt: new Date(now).toISOString(),
410
- error: [current.error, ...cleanupReport.warnings].filter(Boolean).join("\n"),
411
- }));
681
+ const withWarnings = await appendCleanupWarnings(repairedJob.id, cleanupReport.warnings, recoveredAt);
682
+ repaired.push(withWarnings ?? repairedJob);
683
+ continue;
412
684
  }
413
- repaired.push(repairedJob);
685
+
686
+ const finalized = await clearCleanupPending(repairedJob.id, recoveredAt);
687
+ repaired.push(finalized ?? repairedJob);
414
688
  }
415
689
 
416
690
  return repaired;
@@ -427,6 +701,10 @@ export async function tryClaimNotification(jobId: string, claimedBy: string, now
427
701
  if (!current) return undefined;
428
702
  if (!isTerminalOracleJobStatus(current.status)) return undefined;
429
703
  if (current.notifiedAt) return undefined;
704
+ if (!hasPersistedOriginSession(current)) return undefined;
705
+ const nowMs = parseTimestamp(now) ?? Date.now();
706
+ if (shouldPruneTerminalJob(current, nowMs)) return undefined;
707
+ if (!shouldRequestWakeup(current, nowMs)) return undefined;
430
708
 
431
709
  const claimedAtMs = parseTimestamp(current.notifyClaimedAt);
432
710
  const claimIsLive =
@@ -446,13 +724,50 @@ export async function tryClaimNotification(jobId: string, claimedBy: string, now
446
724
  });
447
725
  }
448
726
 
449
- export async function markJobNotified(jobId: string, claimedBy: string, at = new Date().toISOString()): Promise<OracleJob> {
727
+ export async function recordNotificationTarget(
728
+ jobId: string,
729
+ claimedBy: string,
730
+ options: { notificationSessionKey: string; notificationSessionFile?: string },
731
+ ): Promise<OracleJob> {
732
+ return withJobLock(jobId, { processPid: process.pid, action: "recordNotificationTarget", claimedBy }, async () => {
733
+ const current = readJob(jobId);
734
+ if (!current) throw new Error(`Oracle job not found: ${jobId}`);
735
+ if (current.notifiedAt) return current;
736
+ if (!notificationClaimIsOwnedBy(current, claimedBy)) {
737
+ throw new Error(`Oracle notification claim is not owned by ${claimedBy}: ${jobId}`);
738
+ }
739
+ const next: OracleJob = {
740
+ ...current,
741
+ notificationSessionKey: options.notificationSessionKey,
742
+ notificationSessionFile: options.notificationSessionFile,
743
+ };
744
+ await writeJobUnlocked(next);
745
+ return next;
746
+ });
747
+ }
748
+
749
+ export async function markJobNotified(
750
+ jobId: string,
751
+ claimedBy: string,
752
+ options?: { at?: string; notificationEntryId?: string; notificationSessionKey?: string; notificationSessionFile?: string },
753
+ ): Promise<OracleJob> {
754
+ const at = options?.at ?? new Date().toISOString();
450
755
  return withJobLock(jobId, { processPid: process.pid, action: "markJobNotified", claimedBy }, async () => {
451
756
  const current = readJob(jobId);
452
757
  if (!current) throw new Error(`Oracle job not found: ${jobId}`);
758
+ if (current.notifiedAt) return current;
759
+ if (!notificationClaimIsOwnedBy(current, claimedBy)) {
760
+ throw new Error(`Oracle notification claim is not owned by ${claimedBy}: ${jobId}`);
761
+ }
453
762
  const next: OracleJob = {
454
763
  ...current,
455
- notifiedAt: current.notifiedAt || at,
764
+ notifiedAt: at,
765
+ notificationEntryId: options?.notificationEntryId ?? current.notificationEntryId,
766
+ notificationSessionKey: options?.notificationSessionKey ?? current.notificationSessionKey,
767
+ notificationSessionFile: options?.notificationSessionFile ?? current.notificationSessionFile,
768
+ wakeupAttemptCount: 0,
769
+ wakeupLastRequestedAt: undefined,
770
+ wakeupSettledAt: undefined,
456
771
  notifyClaimedAt: undefined,
457
772
  notifyClaimedBy: undefined,
458
773
  };
@@ -476,33 +791,151 @@ export async function releaseNotificationClaim(jobId: string, claimedBy: string)
476
791
  });
477
792
  }
478
793
 
794
+ export async function noteWakeupRequested(jobId: string, at = new Date().toISOString()): Promise<OracleJob | undefined> {
795
+ try {
796
+ return await updateJob(jobId, (job) => ({
797
+ ...job,
798
+ wakeupAttemptCount: (job.wakeupAttemptCount ?? 0) + 1,
799
+ wakeupLastRequestedAt: at,
800
+ }));
801
+ } catch {
802
+ return readJob(jobId);
803
+ }
804
+ }
805
+
806
+ function getWakeupSessionKey(sessionFile: string | undefined, cwd: string | undefined): string | undefined {
807
+ if (!sessionFile || !cwd) return undefined;
808
+ const projectId = getProjectId(cwd);
809
+ return `${projectId}::${getSessionId(sessionFile, projectId)}`;
810
+ }
811
+
812
+ export async function markWakeupSettled(
813
+ jobId: string,
814
+ options: {
815
+ source: OracleWakeupSettlementSource;
816
+ sessionFile?: string;
817
+ cwd?: string;
818
+ at?: string;
819
+ allowBeforeFirstAttempt?: boolean;
820
+ },
821
+ ): Promise<OracleJob | undefined> {
822
+ const at = options.at ?? new Date().toISOString();
823
+ const sessionKey = getWakeupSessionKey(options.sessionFile, options.cwd);
824
+
825
+ try {
826
+ return await updateJob(jobId, (job) => {
827
+ const beforeFirstAttempt = !job.wakeupLastRequestedAt && (job.wakeupAttemptCount ?? 0) === 0;
828
+ if (job.wakeupSettledAt) {
829
+ return {
830
+ ...job,
831
+ wakeupSettledSource: job.wakeupSettledSource ?? options.source,
832
+ wakeupSettledSessionFile: job.wakeupSettledSessionFile ?? options.sessionFile,
833
+ wakeupSettledSessionKey: job.wakeupSettledSessionKey ?? sessionKey,
834
+ wakeupSettledBeforeFirstAttempt: job.wakeupSettledBeforeFirstAttempt ?? beforeFirstAttempt,
835
+ };
836
+ }
837
+
838
+ if (beforeFirstAttempt && !options.allowBeforeFirstAttempt) {
839
+ return {
840
+ ...job,
841
+ wakeupObservedAt: job.wakeupObservedAt ?? at,
842
+ wakeupObservedSource: job.wakeupObservedSource ?? options.source,
843
+ wakeupObservedSessionFile: job.wakeupObservedSessionFile ?? options.sessionFile,
844
+ wakeupObservedSessionKey: job.wakeupObservedSessionKey ?? sessionKey,
845
+ };
846
+ }
847
+
848
+ return {
849
+ ...job,
850
+ wakeupSettledAt: at,
851
+ wakeupSettledSource: options.source,
852
+ wakeupSettledSessionFile: options.sessionFile,
853
+ wakeupSettledSessionKey: sessionKey,
854
+ wakeupSettledBeforeFirstAttempt: beforeFirstAttempt,
855
+ };
856
+ });
857
+ } catch {
858
+ return readJob(jobId);
859
+ }
860
+ }
861
+
479
862
  export async function cancelOracleJob(id: string, reason = "Cancelled by user"): Promise<OracleJob> {
480
- const current = readJob(id);
481
- if (!current) throw new Error(`Oracle job not found: ${id}`);
482
- if (!isActiveOracleJob(current)) return current;
483
-
484
- const terminated = await terminateWorkerPid(current.workerPid, current.workerStartedAt);
485
- const now = new Date().toISOString();
486
- const cancelled = await updateJob(id, (job) => ({
487
- ...job,
488
- ...withJobPhase(terminated ? "cancelled" : "failed", {
489
- status: terminated ? "cancelled" : "failed",
490
- completedAt: now,
491
- heartbeatAt: now,
492
- notifyClaimedAt: undefined,
493
- notifyClaimedBy: undefined,
494
- error: terminated ? reason : `${reason}; worker PID ${job.workerPid ?? "unknown"} did not exit`,
495
- }, now),
496
- }));
497
- const cleanupReport = await cleanupJobResources(cancelled);
498
- if (cleanupReport.warnings.length === 0) return cancelled;
863
+ return withLock("admission", "global", { processPid: process.pid, action: "cancelOracleJob", jobId: id }, async () => {
864
+ const current = readJob(id);
865
+ if (!current) throw new Error(`Oracle job not found: ${id}`);
866
+ if (!isOpenOracleJob(current)) return current;
867
+
868
+ const now = new Date().toISOString();
869
+ if (current.status === "queued") {
870
+ const cancelled = await updateJob(id, (job) => ({
871
+ ...job,
872
+ ...withJobPhase("cancelled", {
873
+ status: "cancelled",
874
+ completedAt: now,
875
+ heartbeatAt: now,
876
+ notifyClaimedAt: undefined,
877
+ notifyClaimedBy: undefined,
878
+ error: reason,
879
+ }, now),
880
+ }));
881
+
882
+ const cleanupReport = await cleanupJobResources(cancelled);
883
+ if (cleanupReport.warnings.length === 0) return cancelled;
884
+
885
+ return updateJob(id, (job) => ({
886
+ ...job,
887
+ cleanupWarnings: [...(job.cleanupWarnings || []), ...cleanupReport.warnings],
888
+ lastCleanupAt: now,
889
+ error: [job.error, ...cleanupReport.warnings].filter(Boolean).join("\n"),
890
+ }));
891
+ }
892
+
893
+ const terminated = await terminateWorkerPid(current.workerPid, current.workerStartedAt);
894
+ let transitioned = false;
895
+ const cancelled = await updateJob(id, (job) => {
896
+ if (isTerminalOracleJob(job)) return job;
897
+ transitioned = true;
898
+ return {
899
+ ...job,
900
+ ...withJobPhase(terminated ? "cancelled" : "failed", {
901
+ status: terminated ? "cancelled" : "failed",
902
+ completedAt: now,
903
+ heartbeatAt: now,
904
+ notifyClaimedAt: undefined,
905
+ notifyClaimedBy: undefined,
906
+ cleanupPending: terminated,
907
+ error: terminated ? reason : `${reason}; worker PID ${job.workerPid ?? "unknown"} did not exit`,
908
+ }, now),
909
+ };
910
+ });
911
+ if (!transitioned) return cancelled;
912
+
913
+ if (!terminated) {
914
+ const cleanupWarnings = [
915
+ `Oracle runtime cleanup is blocked because worker PID ${current.workerPid ?? "unknown"} could not be terminated safely.`,
916
+ ];
917
+ return updateJob(id, (job) => ({
918
+ ...job,
919
+ cleanupWarnings: [...(job.cleanupWarnings || []), ...cleanupWarnings],
920
+ lastCleanupAt: now,
921
+ error: [job.error, ...cleanupWarnings].filter(Boolean).join("\n"),
922
+ }));
923
+ }
499
924
 
500
- return updateJob(id, (job) => ({
501
- ...job,
502
- cleanupWarnings: [...(job.cleanupWarnings || []), ...cleanupReport.warnings],
503
- lastCleanupAt: now,
504
- error: [job.error, ...cleanupReport.warnings].filter(Boolean).join("\n"),
505
- }));
925
+ const cleanupReport = await cleanupJobResources(cancelled);
926
+ if (cleanupReport.warnings.length === 0) {
927
+ const finalized = await clearCleanupPending(id, now);
928
+ return finalized ?? cancelled;
929
+ }
930
+
931
+ return updateJob(id, (job) => ({
932
+ ...job,
933
+ cleanupPending: false,
934
+ cleanupWarnings: [...(job.cleanupWarnings || []), ...cleanupReport.warnings],
935
+ lastCleanupAt: now,
936
+ error: [job.error, ...cleanupReport.warnings].filter(Boolean).join("\n"),
937
+ }));
938
+ });
506
939
  }
507
940
 
508
941
  export async function createJob(
@@ -512,6 +945,7 @@ export async function createJob(
512
945
  originSessionFile: string | undefined,
513
946
  config: OracleConfig,
514
947
  runtime: OracleRuntimeAllocation,
948
+ options?: { initialState?: "queued" | "submitted"; createdAt?: string },
515
949
  ): Promise<OracleJob> {
516
950
  const jobDir = getJobDir(id);
517
951
  const logsDir = join(jobDir, "logs");
@@ -522,7 +956,8 @@ export async function createJob(
522
956
  const reasoningPath = join(jobDir, "reasoning.md");
523
957
  const artifactsManifestPath = join(jobDir, "artifacts.json");
524
958
  const projectId = getProjectId(cwd);
525
- const sessionId = getSessionId(originSessionFile, projectId);
959
+ const sessionFile = requirePersistedSessionFile(originSessionFile, "create oracle jobs");
960
+ const sessionId = getSessionId(sessionFile, projectId);
526
961
  const conversationId = parseConversationId(input.chatUrl);
527
962
 
528
963
  await mkdir(jobDir, { recursive: true, mode: 0o700 });
@@ -534,22 +969,24 @@ export async function createJob(
534
969
  await writeFile(promptPath, input.prompt, { encoding: "utf8", mode: 0o600 });
535
970
  await chmod(promptPath, 0o600).catch(() => undefined);
536
971
 
537
- const now = new Date().toISOString();
972
+ const createdAt = options?.createdAt ?? new Date().toISOString();
973
+ const initialState = options?.initialState ?? "submitted";
538
974
  const normalizedEffort = input.modelFamily === "instant" ? undefined : (input.effort ?? config.defaults.effort);
539
975
  const normalizedAutoSwitchToThinking = input.modelFamily === "instant"
540
976
  ? (input.autoSwitchToThinking ?? config.defaults.autoSwitchToThinking)
541
977
  : false;
542
978
  const job: OracleJob = {
543
979
  id,
544
- status: "submitted",
545
- phase: "submitted",
546
- phaseAt: now,
547
- createdAt: now,
548
- submittedAt: now,
980
+ status: initialState,
981
+ phase: initialState,
982
+ phaseAt: createdAt,
983
+ createdAt,
984
+ queuedAt: initialState === "queued" ? createdAt : undefined,
985
+ submittedAt: initialState === "submitted" ? createdAt : undefined,
549
986
  cwd,
550
987
  projectId,
551
988
  sessionId,
552
- originSessionFile,
989
+ originSessionFile: sessionFile,
553
990
  requestSource: input.requestSource,
554
991
  chatModelFamily: input.modelFamily,
555
992
  effort: normalizedEffort,