pi-oracle 0.1.12 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,11 +5,12 @@ import { chmod, mkdir, readFile, rename, rm, writeFile } from "node:fs/promises"
5
5
  import { join, resolve } from "node:path";
6
6
  import type { ExtensionContext } from "@mariozechner/pi-coding-agent";
7
7
  import type { OracleConfig, OracleEffort, OracleModelFamily } from "./config.js";
8
- import { withJobLock } from "./locks.js";
9
- import { cleanupRuntimeArtifacts, getProjectId, getSessionId, parseConversationId, type OracleCleanupReport } from "./runtime.js";
8
+ import { withJobLock, withLock } from "./locks.js";
9
+ import { cleanupRuntimeArtifacts, getProjectId, getSessionId, parseConversationId, requirePersistedSessionFile, type OracleCleanupReport } from "./runtime.js";
10
10
 
11
- export type OracleJobStatus = "preparing" | "submitted" | "waiting" | "complete" | "failed" | "cancelled";
11
+ export type OracleJobStatus = "queued" | "preparing" | "submitted" | "waiting" | "complete" | "failed" | "cancelled";
12
12
  export type OracleJobPhase =
13
+ | "queued"
13
14
  | "submitted"
14
15
  | "cloning_runtime"
15
16
  | "launching_browser"
@@ -25,9 +26,14 @@ export type OracleJobPhase =
25
26
  | "cancelled";
26
27
 
27
28
  export const ACTIVE_ORACLE_JOB_STATUSES: OracleJobStatus[] = ["preparing", "submitted", "waiting"];
29
+ export const OPEN_ORACLE_JOB_STATUSES: OracleJobStatus[] = ["queued", ...ACTIVE_ORACLE_JOB_STATUSES];
30
+ export const TERMINAL_ORACLE_JOB_STATUSES: OracleJobStatus[] = ["complete", "failed", "cancelled"];
28
31
  export const ORACLE_MISSING_WORKER_GRACE_MS = 30_000;
29
32
  export const ORACLE_STALE_HEARTBEAT_MS = 3 * 60 * 1000;
30
33
  export const ORACLE_NOTIFICATION_CLAIM_TTL_MS = 60_000;
34
+ export const ORACLE_WAKEUP_MAX_ATTEMPTS = 3;
35
+ export const ORACLE_WAKEUP_RETRY_DELAYS_MS = [0, 15_000, 60_000] as const;
36
+ export const ORACLE_WAKEUP_POST_SEND_RETENTION_MS = 2 * 60 * 1000;
31
37
  const ORACLE_COMPLETE_JOB_RETENTION_MS = 14 * 24 * 60 * 60 * 1000;
32
38
  const ORACLE_FAILED_JOB_RETENTION_MS = 30 * 24 * 60 * 60 * 1000;
33
39
  export const DEFAULT_ORACLE_JOBS_DIR = "/tmp";
@@ -38,6 +44,32 @@ export function isActiveOracleJob(job: Pick<OracleJob, "status">): boolean {
38
44
  return ACTIVE_ORACLE_JOB_STATUSES.includes(job.status);
39
45
  }
40
46
 
47
+ export function isOpenOracleJob(job: Pick<OracleJob, "status">): boolean {
48
+ return OPEN_ORACLE_JOB_STATUSES.includes(job.status);
49
+ }
50
+
51
+ export function isTerminalOracleJob(job: Pick<OracleJob, "status">): boolean {
52
+ return TERMINAL_ORACLE_JOB_STATUSES.includes(job.status);
53
+ }
54
+
55
+ export function shouldAdvanceQueueAfterCancellation(job: Pick<OracleJob, "status" | "cleanupWarnings" | "cleanupPending">): boolean {
56
+ return job.status === "cancelled" && !job.cleanupPending && !job.cleanupWarnings?.length;
57
+ }
58
+
59
+ export function hasDurableWorkerHandoff(
60
+ job: Pick<OracleJob, "status" | "phase" | "workerPid" | "workerStartedAt" | "heartbeatAt">,
61
+ ): boolean {
62
+ if (job.status === "queued") return false;
63
+ if (job.workerPid) return true;
64
+ return false;
65
+ }
66
+
67
+ export function hasPersistedOriginSession(
68
+ job: Pick<OracleJob, "originSessionFile" | "sessionId">,
69
+ ): job is Pick<OracleJob, "originSessionFile" | "sessionId"> & { originSessionFile: string } {
70
+ return typeof job.originSessionFile === "string" && job.originSessionFile.length > 0 && job.sessionId === job.originSessionFile;
71
+ }
72
+
41
73
  function readProcessStartedAt(pid: number | undefined): string | undefined {
42
74
  if (!pid || pid <= 0) return undefined;
43
75
  try {
@@ -86,6 +118,7 @@ export interface OracleJob {
86
118
  phase: OracleJobPhase;
87
119
  phaseAt: string;
88
120
  createdAt: string;
121
+ queuedAt?: string;
89
122
  submittedAt?: string;
90
123
  completedAt?: string;
91
124
  heartbeatAt?: string;
@@ -108,6 +141,12 @@ export interface OracleJob {
108
141
  archiveSha256?: string;
109
142
  archiveDeletedAfterUpload: boolean;
110
143
  notifiedAt?: string;
144
+ notificationEntryId?: string;
145
+ notificationSessionKey?: string;
146
+ notificationSessionFile?: string;
147
+ wakeupAttemptCount?: number;
148
+ wakeupLastRequestedAt?: string;
149
+ wakeupSettledAt?: string;
111
150
  notifyClaimedAt?: string;
112
151
  notifyClaimedBy?: string;
113
152
  artifactFailureCount?: number;
@@ -126,6 +165,7 @@ export interface OracleJob {
126
165
  config: OracleConfig;
127
166
  cleanupWarnings?: string[];
128
167
  lastCleanupAt?: string;
168
+ cleanupPending?: boolean;
129
169
  }
130
170
 
131
171
  export interface OracleSubmitInput {
@@ -214,6 +254,34 @@ export async function updateJob(id: string, mutate: (job: OracleJob) => OracleJo
214
254
  });
215
255
  }
216
256
 
257
+ export async function appendCleanupWarnings(jobId: string, warnings: string[], at = new Date().toISOString()): Promise<OracleJob | undefined> {
258
+ if (warnings.length === 0) return readJob(jobId);
259
+ try {
260
+ return await updateJob(jobId, (job) => ({
261
+ ...job,
262
+ cleanupPending: false,
263
+ cleanupWarnings: Array.from(new Set([...(job.cleanupWarnings || []), ...warnings])),
264
+ lastCleanupAt: at,
265
+ error: [job.error, ...warnings].filter(Boolean).join("\n"),
266
+ }));
267
+ } catch {
268
+ return readJob(jobId);
269
+ }
270
+ }
271
+
272
+ export async function clearCleanupPending(jobId: string, at = new Date().toISOString()): Promise<OracleJob | undefined> {
273
+ try {
274
+ return await updateJob(jobId, (job) => ({
275
+ ...job,
276
+ cleanupPending: false,
277
+ cleanupWarnings: undefined,
278
+ lastCleanupAt: at,
279
+ }));
280
+ } catch {
281
+ return readJob(jobId);
282
+ }
283
+ }
284
+
217
285
  function sleep(ms: number): Promise<void> {
218
286
  return new Promise((resolve) => setTimeout(resolve, ms));
219
287
  }
@@ -224,6 +292,39 @@ function parseTimestamp(value: string | undefined): number | undefined {
224
292
  return Number.isFinite(parsed) ? parsed : undefined;
225
293
  }
226
294
 
295
+ function notificationClaimIsOwnedBy(job: Pick<OracleJob, "notifyClaimedAt" | "notifyClaimedBy">, claimedBy: string, now = Date.now()): boolean {
296
+ if (job.notifyClaimedBy !== claimedBy) return false;
297
+ const claimedAtMs = parseTimestamp(job.notifyClaimedAt);
298
+ if (claimedAtMs === undefined) return false;
299
+ return now - claimedAtMs < ORACLE_NOTIFICATION_CLAIM_TTL_MS;
300
+ }
301
+
302
+ function notificationClaimIsLive(job: Pick<OracleJob, "notifyClaimedAt" | "notifyClaimedBy">, now = Date.now()): boolean {
303
+ if (!job.notifyClaimedBy) return false;
304
+ const claimedAtMs = parseTimestamp(job.notifyClaimedAt);
305
+ if (claimedAtMs === undefined) return false;
306
+ return now - claimedAtMs < ORACLE_NOTIFICATION_CLAIM_TTL_MS;
307
+ }
308
+
309
+ function wakeupRetentionGraceIsActive(job: Pick<OracleJob, "wakeupLastRequestedAt">, now = Date.now()): boolean {
310
+ const lastRequestedAtMs = parseTimestamp(job.wakeupLastRequestedAt);
311
+ if (lastRequestedAtMs === undefined) return false;
312
+ return now - lastRequestedAtMs < ORACLE_WAKEUP_POST_SEND_RETENTION_MS;
313
+ }
314
+
315
+ export function getWakeupRetryDelayMs(attemptCount: number): number {
316
+ return ORACLE_WAKEUP_RETRY_DELAYS_MS[Math.min(attemptCount, ORACLE_WAKEUP_RETRY_DELAYS_MS.length - 1)] ?? ORACLE_WAKEUP_RETRY_DELAYS_MS[ORACLE_WAKEUP_RETRY_DELAYS_MS.length - 1];
317
+ }
318
+
319
+ export function shouldRequestWakeup(job: Pick<OracleJob, "wakeupAttemptCount" | "wakeupLastRequestedAt" | "wakeupSettledAt">, now = Date.now()): boolean {
320
+ if (job.wakeupSettledAt) return false;
321
+ const attempts = job.wakeupAttemptCount ?? 0;
322
+ if (attempts >= ORACLE_WAKEUP_MAX_ATTEMPTS) return false;
323
+ const lastRequestedAtMs = parseTimestamp(job.wakeupLastRequestedAt);
324
+ if (lastRequestedAtMs === undefined) return true;
325
+ return now - lastRequestedAtMs >= getWakeupRetryDelayMs(attempts);
326
+ }
327
+
227
328
  export function withJobPhase<T extends Pick<OracleJob, "phase" | "phaseAt">>(
228
329
  phase: OracleJobPhase,
229
330
  patch?: Omit<Partial<OracleJob>, "phase" | "phaseAt">,
@@ -237,7 +338,7 @@ export function withJobPhase<T extends Pick<OracleJob, "phase" | "phaseAt">>(
237
338
  }
238
339
 
239
340
  function isTerminalOracleJobStatus(status: OracleJobStatus): boolean {
240
- return status === "complete" || status === "failed" || status === "cancelled";
341
+ return TERMINAL_ORACLE_JOB_STATUSES.includes(status);
241
342
  }
242
343
 
243
344
  export async function terminateWorkerPid(
@@ -312,9 +413,41 @@ export function getStaleOracleJobReason(job: OracleJob, now = Date.now()): strin
312
413
  return undefined;
313
414
  }
314
415
 
416
+ function getTerminalCleanupStaleReason(job: Pick<OracleJob, "status" | "cleanupPending" | "cleanupWarnings" | "lastCleanupAt" | "heartbeatAt" | "completedAt" | "phaseAt" | "createdAt" | "workerPid" | "workerStartedAt">, now = Date.now()): string | undefined {
417
+ if (!isTerminalOracleJob(job)) return undefined;
418
+ if (!job.cleanupPending && !job.cleanupWarnings?.length) return undefined;
419
+
420
+ const baselineMs =
421
+ parseTimestamp(job.lastCleanupAt) ??
422
+ parseTimestamp(job.heartbeatAt) ??
423
+ parseTimestamp(job.completedAt) ??
424
+ parseTimestamp(job.phaseAt) ??
425
+ parseTimestamp(job.createdAt);
426
+ if (baselineMs === undefined) return "Oracle terminal cleanup has no valid timestamps";
427
+ if (!job.workerPid) return undefined;
428
+
429
+ const currentStartedAt = readProcessStartedAt(job.workerPid);
430
+ if (!currentStartedAt) {
431
+ return `Oracle terminal cleanup worker PID ${job.workerPid} is no longer running`;
432
+ }
433
+
434
+ if (job.workerStartedAt && currentStartedAt !== job.workerStartedAt) {
435
+ return `Oracle terminal cleanup worker PID ${job.workerPid} no longer matches the recorded process identity`;
436
+ }
437
+
438
+ if (now - baselineMs > ORACLE_STALE_HEARTBEAT_MS) {
439
+ return `Oracle terminal cleanup is stale (${Math.round((now - baselineMs) / 1000)}s since last update)`;
440
+ }
441
+
442
+ return undefined;
443
+ }
444
+
315
445
  export async function cleanupJobResources(
316
- job: Pick<OracleJob, "runtimeId" | "runtimeProfileDir" | "runtimeSessionName" | "conversationId">,
446
+ job: Pick<OracleJob, "submittedAt" | "runtimeId" | "runtimeProfileDir" | "runtimeSessionName" | "conversationId">,
317
447
  ): Promise<OracleCleanupReport> {
448
+ if (!job.submittedAt) {
449
+ return { attempted: [], warnings: [] };
450
+ }
318
451
  return cleanupRuntimeArtifacts({
319
452
  runtimeId: job.runtimeId,
320
453
  runtimeProfileDir: job.runtimeProfileDir,
@@ -330,15 +463,18 @@ function getCleanupRetentionMs(job: OracleJob): { complete: number; failed: numb
330
463
  };
331
464
  }
332
465
 
333
- function shouldPruneTerminalJob(job: OracleJob, now = Date.now()): boolean {
466
+ export function shouldPruneTerminalJob(job: OracleJob, now = Date.now()): boolean {
334
467
  if (!isTerminalOracleJobStatus(job.status)) return false;
468
+ if (job.cleanupPending || job.cleanupWarnings?.length) return false;
469
+ if (notificationClaimIsLive(job, now)) return false;
470
+ if (wakeupRetentionGraceIsActive(job, now)) return false;
335
471
  const completedMs = parseTimestamp(job.completedAt) ?? parseTimestamp(job.createdAt);
336
472
  if (completedMs === undefined) return false;
337
473
  const ageMs = now - completedMs;
338
474
 
339
475
  const retention = getCleanupRetentionMs(job);
340
476
 
341
- if ((job.status === "complete" || job.status === "cancelled") && job.notifiedAt) {
477
+ if (job.status === "complete" || job.status === "cancelled") {
342
478
  return ageMs >= retention.complete;
343
479
  }
344
480
 
@@ -350,10 +486,54 @@ function shouldPruneTerminalJob(job: OracleJob, now = Date.now()): boolean {
350
486
  }
351
487
 
352
488
  export async function removeTerminalOracleJob(job: OracleJob): Promise<{ removed: boolean; cleanupReport: OracleCleanupReport }> {
353
- if (isActiveOracleJob(job)) return { removed: false, cleanupReport: { attempted: [], warnings: [] } };
354
- const cleanupReport = await cleanupJobResources(job);
355
- await rm(getJobDir(job.id), { recursive: true, force: true });
356
- return { removed: true, cleanupReport };
489
+ if (!isTerminalOracleJob(job)) return { removed: false, cleanupReport: { attempted: [], warnings: [] } };
490
+
491
+ return withJobLock(job.id, { processPid: process.pid, action: "removeTerminalOracleJob" }, async () => {
492
+ const current = readJob(job.id);
493
+ if (!current) return { removed: true, cleanupReport: { attempted: [], warnings: [] } };
494
+ if (!isTerminalOracleJob(current)) return { removed: false, cleanupReport: { attempted: [], warnings: [] } };
495
+ if (notificationClaimIsLive(current)) {
496
+ return {
497
+ removed: false,
498
+ cleanupReport: {
499
+ attempted: [],
500
+ warnings: [`Refusing to remove terminal oracle job ${current.id} while a notification delivery is in flight.`],
501
+ },
502
+ };
503
+ }
504
+ if (wakeupRetentionGraceIsActive(current)) {
505
+ return {
506
+ removed: false,
507
+ cleanupReport: {
508
+ attempted: [],
509
+ warnings: [`Refusing to remove terminal oracle job ${current.id} because its wake-up delivery is still within the post-send retention grace window.`],
510
+ },
511
+ };
512
+ }
513
+ if (current.workerPid && isWorkerProcessAlive(current.workerPid, current.workerStartedAt)) {
514
+ return {
515
+ removed: false,
516
+ cleanupReport: {
517
+ attempted: [],
518
+ warnings: [`Refusing to remove terminal oracle job ${current.id} while worker PID ${current.workerPid} is still live.`],
519
+ },
520
+ };
521
+ }
522
+
523
+ const cleanupReport = await cleanupJobResources(current);
524
+ if (cleanupReport.warnings.length > 0) {
525
+ await writeJobUnlocked({
526
+ ...current,
527
+ cleanupPending: false,
528
+ cleanupWarnings: [...(current.cleanupWarnings || []), ...cleanupReport.warnings],
529
+ lastCleanupAt: new Date().toISOString(),
530
+ error: [current.error, ...cleanupReport.warnings].filter(Boolean).join("\n"),
531
+ });
532
+ return { removed: false, cleanupReport };
533
+ }
534
+ await rm(getJobDir(current.id), { recursive: true, force: true });
535
+ return { removed: true, cleanupReport };
536
+ });
357
537
  }
358
538
 
359
539
  export async function pruneTerminalOracleJobs(now = Date.now()): Promise<string[]> {
@@ -374,43 +554,108 @@ export async function pruneTerminalOracleJobs(now = Date.now()): Promise<string[
374
554
  export async function reconcileStaleOracleJobs(): Promise<OracleJob[]> {
375
555
  const repaired: OracleJob[] = [];
376
556
  const now = Date.now();
557
+ const recoveredAt = new Date(now).toISOString();
377
558
 
378
559
  for (const jobDir of listOracleJobDirs()) {
379
560
  const job = readJob(jobDir);
380
561
  if (!job) continue;
562
+
563
+ if (isTerminalOracleJob(job) && (job.cleanupPending || job.cleanupWarnings?.length)) {
564
+ let cleanupTarget: OracleJob | undefined;
565
+ let blockedWarning: string | undefined;
566
+
567
+ await withJobLock(job.id, { processPid: process.pid, action: "reconcileTerminalCleanupJob" }, async () => {
568
+ const current = readJob(job.id);
569
+ if (!current || !isTerminalOracleJob(current) || (!current.cleanupPending && !current.cleanupWarnings?.length)) return;
570
+
571
+ if (current.workerPid && isWorkerProcessAlive(current.workerPid, current.workerStartedAt)) {
572
+ const staleCleanupReason = getTerminalCleanupStaleReason(current, now);
573
+ if (!staleCleanupReason) return;
574
+ const terminated = await terminateWorkerPid(current.workerPid, current.workerStartedAt);
575
+ if (!terminated) {
576
+ blockedWarning = `Oracle terminal cleanup is blocked because worker PID ${current.workerPid} could not be terminated safely after ${staleCleanupReason}.`;
577
+ return;
578
+ }
579
+ }
580
+
581
+ cleanupTarget = current;
582
+ });
583
+
584
+ if (blockedWarning) {
585
+ const blocked = await appendCleanupWarnings(job.id, [blockedWarning], recoveredAt);
586
+ if (blocked) repaired.push(blocked);
587
+ continue;
588
+ }
589
+ if (!cleanupTarget) continue;
590
+
591
+ const cleanupReport = await cleanupJobResources(cleanupTarget);
592
+ if (cleanupReport.warnings.length > 0) {
593
+ const withWarnings = await appendCleanupWarnings(job.id, cleanupReport.warnings, recoveredAt);
594
+ if (withWarnings) repaired.push(withWarnings);
595
+ } else {
596
+ const recoveredJob = await clearCleanupPending(job.id, recoveredAt);
597
+ if (recoveredJob) repaired.push(recoveredJob);
598
+ }
599
+ continue;
600
+ }
601
+
381
602
  const staleReason = getStaleOracleJobReason(job, now);
382
603
  if (!staleReason) continue;
383
604
 
384
- const terminated = await terminateWorkerPid(job.workerPid, job.workerStartedAt);
385
- const suffix = job.workerPid
386
- ? terminated
387
- ? ` Terminated stale worker PID ${job.workerPid}.`
388
- : ` Failed to terminate stale worker PID ${job.workerPid}.`
389
- : "";
605
+ let terminated = false;
606
+ let transitioned = false;
607
+ let repairedJob: OracleJob | undefined;
608
+
609
+ await withJobLock(job.id, { processPid: process.pid, action: "reconcileStaleOracleJob" }, async () => {
610
+ const current = readJob(job.id);
611
+ if (!current) return;
612
+ const currentStaleReason = getStaleOracleJobReason(current, now);
613
+ if (!currentStaleReason) return;
614
+
615
+ terminated = await terminateWorkerPid(current.workerPid, current.workerStartedAt);
616
+ transitioned = true;
617
+ const suffix = current.workerPid
618
+ ? terminated
619
+ ? ` Terminated stale worker PID ${current.workerPid}.`
620
+ : ` Failed to terminate stale worker PID ${current.workerPid}.`
621
+ : "";
622
+ repairedJob = {
623
+ ...current,
624
+ ...withJobPhase("failed", {
625
+ status: "failed",
626
+ completedAt: recoveredAt,
627
+ heartbeatAt: recoveredAt,
628
+ notifyClaimedAt: undefined,
629
+ notifyClaimedBy: undefined,
630
+ cleanupPending: terminated,
631
+ error: current.error
632
+ ? `${current.error}\nRecovered stale job: ${currentStaleReason}.${suffix}`.trim()
633
+ : `Recovered stale job: ${currentStaleReason}.${suffix}`.trim(),
634
+ }, recoveredAt),
635
+ };
636
+ await writeJobUnlocked(repairedJob);
637
+ });
638
+
639
+ if (!transitioned || !repairedJob || !isTerminalOracleJob(repairedJob)) continue;
640
+
641
+ if (!terminated) {
642
+ const cleanupWarnings = [
643
+ `Oracle runtime cleanup is blocked because worker PID ${job.workerPid ?? "unknown"} could not be terminated safely.`,
644
+ ];
645
+ const blocked = await appendCleanupWarnings(repairedJob.id, cleanupWarnings, recoveredAt);
646
+ repaired.push(blocked ?? repairedJob);
647
+ continue;
648
+ }
390
649
 
391
- const repairedJob = await updateJob(job.id, (current) => ({
392
- ...current,
393
- ...withJobPhase("failed", {
394
- status: "failed",
395
- completedAt: new Date(now).toISOString(),
396
- heartbeatAt: new Date(now).toISOString(),
397
- notifyClaimedAt: undefined,
398
- notifyClaimedBy: undefined,
399
- error: current.error
400
- ? `${current.error}\nRecovered stale job: ${staleReason}.${suffix}`.trim()
401
- : `Recovered stale job: ${staleReason}.${suffix}`.trim(),
402
- }, new Date(now).toISOString()),
403
- }));
404
650
  const cleanupReport = await cleanupJobResources(repairedJob);
405
651
  if (cleanupReport.warnings.length > 0) {
406
- await updateJob(repairedJob.id, (current) => ({
407
- ...current,
408
- cleanupWarnings: [...(current.cleanupWarnings || []), ...cleanupReport.warnings],
409
- lastCleanupAt: new Date(now).toISOString(),
410
- error: [current.error, ...cleanupReport.warnings].filter(Boolean).join("\n"),
411
- }));
652
+ const withWarnings = await appendCleanupWarnings(repairedJob.id, cleanupReport.warnings, recoveredAt);
653
+ repaired.push(withWarnings ?? repairedJob);
654
+ continue;
412
655
  }
413
- repaired.push(repairedJob);
656
+
657
+ const finalized = await clearCleanupPending(repairedJob.id, recoveredAt);
658
+ repaired.push(finalized ?? repairedJob);
414
659
  }
415
660
 
416
661
  return repaired;
@@ -427,6 +672,10 @@ export async function tryClaimNotification(jobId: string, claimedBy: string, now
427
672
  if (!current) return undefined;
428
673
  if (!isTerminalOracleJobStatus(current.status)) return undefined;
429
674
  if (current.notifiedAt) return undefined;
675
+ if (!hasPersistedOriginSession(current)) return undefined;
676
+ const nowMs = parseTimestamp(now) ?? Date.now();
677
+ if (shouldPruneTerminalJob(current, nowMs)) return undefined;
678
+ if (!shouldRequestWakeup(current, nowMs)) return undefined;
430
679
 
431
680
  const claimedAtMs = parseTimestamp(current.notifyClaimedAt);
432
681
  const claimIsLive =
@@ -446,13 +695,50 @@ export async function tryClaimNotification(jobId: string, claimedBy: string, now
446
695
  });
447
696
  }
448
697
 
449
- export async function markJobNotified(jobId: string, claimedBy: string, at = new Date().toISOString()): Promise<OracleJob> {
698
+ export async function recordNotificationTarget(
699
+ jobId: string,
700
+ claimedBy: string,
701
+ options: { notificationSessionKey: string; notificationSessionFile?: string },
702
+ ): Promise<OracleJob> {
703
+ return withJobLock(jobId, { processPid: process.pid, action: "recordNotificationTarget", claimedBy }, async () => {
704
+ const current = readJob(jobId);
705
+ if (!current) throw new Error(`Oracle job not found: ${jobId}`);
706
+ if (current.notifiedAt) return current;
707
+ if (!notificationClaimIsOwnedBy(current, claimedBy)) {
708
+ throw new Error(`Oracle notification claim is not owned by ${claimedBy}: ${jobId}`);
709
+ }
710
+ const next: OracleJob = {
711
+ ...current,
712
+ notificationSessionKey: options.notificationSessionKey,
713
+ notificationSessionFile: options.notificationSessionFile,
714
+ };
715
+ await writeJobUnlocked(next);
716
+ return next;
717
+ });
718
+ }
719
+
720
+ export async function markJobNotified(
721
+ jobId: string,
722
+ claimedBy: string,
723
+ options?: { at?: string; notificationEntryId?: string; notificationSessionKey?: string; notificationSessionFile?: string },
724
+ ): Promise<OracleJob> {
725
+ const at = options?.at ?? new Date().toISOString();
450
726
  return withJobLock(jobId, { processPid: process.pid, action: "markJobNotified", claimedBy }, async () => {
451
727
  const current = readJob(jobId);
452
728
  if (!current) throw new Error(`Oracle job not found: ${jobId}`);
729
+ if (current.notifiedAt) return current;
730
+ if (!notificationClaimIsOwnedBy(current, claimedBy)) {
731
+ throw new Error(`Oracle notification claim is not owned by ${claimedBy}: ${jobId}`);
732
+ }
453
733
  const next: OracleJob = {
454
734
  ...current,
455
- notifiedAt: current.notifiedAt || at,
735
+ notifiedAt: at,
736
+ notificationEntryId: options?.notificationEntryId ?? current.notificationEntryId,
737
+ notificationSessionKey: options?.notificationSessionKey ?? current.notificationSessionKey,
738
+ notificationSessionFile: options?.notificationSessionFile ?? current.notificationSessionFile,
739
+ wakeupAttemptCount: 0,
740
+ wakeupLastRequestedAt: undefined,
741
+ wakeupSettledAt: undefined,
456
742
  notifyClaimedAt: undefined,
457
743
  notifyClaimedBy: undefined,
458
744
  };
@@ -476,33 +762,109 @@ export async function releaseNotificationClaim(jobId: string, claimedBy: string)
476
762
  });
477
763
  }
478
764
 
765
+ export async function noteWakeupRequested(jobId: string, at = new Date().toISOString()): Promise<OracleJob | undefined> {
766
+ try {
767
+ return await updateJob(jobId, (job) => ({
768
+ ...job,
769
+ wakeupAttemptCount: (job.wakeupAttemptCount ?? 0) + 1,
770
+ wakeupLastRequestedAt: at,
771
+ }));
772
+ } catch {
773
+ return readJob(jobId);
774
+ }
775
+ }
776
+
777
+ export async function markWakeupSettled(jobId: string, at = new Date().toISOString()): Promise<OracleJob | undefined> {
778
+ try {
779
+ return await updateJob(jobId, (job) => ({
780
+ ...job,
781
+ wakeupSettledAt: job.wakeupSettledAt ?? at,
782
+ }));
783
+ } catch {
784
+ return readJob(jobId);
785
+ }
786
+ }
787
+
479
788
  export async function cancelOracleJob(id: string, reason = "Cancelled by user"): Promise<OracleJob> {
480
- const current = readJob(id);
481
- if (!current) throw new Error(`Oracle job not found: ${id}`);
482
- if (!isActiveOracleJob(current)) return current;
483
-
484
- const terminated = await terminateWorkerPid(current.workerPid, current.workerStartedAt);
485
- const now = new Date().toISOString();
486
- const cancelled = await updateJob(id, (job) => ({
487
- ...job,
488
- ...withJobPhase(terminated ? "cancelled" : "failed", {
489
- status: terminated ? "cancelled" : "failed",
490
- completedAt: now,
491
- heartbeatAt: now,
492
- notifyClaimedAt: undefined,
493
- notifyClaimedBy: undefined,
494
- error: terminated ? reason : `${reason}; worker PID ${job.workerPid ?? "unknown"} did not exit`,
495
- }, now),
496
- }));
497
- const cleanupReport = await cleanupJobResources(cancelled);
498
- if (cleanupReport.warnings.length === 0) return cancelled;
789
+ return withLock("admission", "global", { processPid: process.pid, action: "cancelOracleJob", jobId: id }, async () => {
790
+ const current = readJob(id);
791
+ if (!current) throw new Error(`Oracle job not found: ${id}`);
792
+ if (!isOpenOracleJob(current)) return current;
793
+
794
+ const now = new Date().toISOString();
795
+ if (current.status === "queued") {
796
+ const cancelled = await updateJob(id, (job) => ({
797
+ ...job,
798
+ ...withJobPhase("cancelled", {
799
+ status: "cancelled",
800
+ completedAt: now,
801
+ heartbeatAt: now,
802
+ notifyClaimedAt: undefined,
803
+ notifyClaimedBy: undefined,
804
+ error: reason,
805
+ }, now),
806
+ }));
807
+
808
+ const cleanupWarnings: string[] = [];
809
+ await rm(cancelled.archivePath, { force: true }).catch((error: Error) => {
810
+ cleanupWarnings.push(`Failed to remove queued archive ${cancelled.archivePath}: ${error.message}`);
811
+ });
812
+ if (cleanupWarnings.length === 0) return cancelled;
813
+
814
+ return updateJob(id, (job) => ({
815
+ ...job,
816
+ cleanupWarnings: [...(job.cleanupWarnings || []), ...cleanupWarnings],
817
+ lastCleanupAt: now,
818
+ error: [job.error, ...cleanupWarnings].filter(Boolean).join("\n"),
819
+ }));
820
+ }
499
821
 
500
- return updateJob(id, (job) => ({
501
- ...job,
502
- cleanupWarnings: [...(job.cleanupWarnings || []), ...cleanupReport.warnings],
503
- lastCleanupAt: now,
504
- error: [job.error, ...cleanupReport.warnings].filter(Boolean).join("\n"),
505
- }));
822
+ const terminated = await terminateWorkerPid(current.workerPid, current.workerStartedAt);
823
+ let transitioned = false;
824
+ const cancelled = await updateJob(id, (job) => {
825
+ if (isTerminalOracleJob(job)) return job;
826
+ transitioned = true;
827
+ return {
828
+ ...job,
829
+ ...withJobPhase(terminated ? "cancelled" : "failed", {
830
+ status: terminated ? "cancelled" : "failed",
831
+ completedAt: now,
832
+ heartbeatAt: now,
833
+ notifyClaimedAt: undefined,
834
+ notifyClaimedBy: undefined,
835
+ cleanupPending: terminated,
836
+ error: terminated ? reason : `${reason}; worker PID ${job.workerPid ?? "unknown"} did not exit`,
837
+ }, now),
838
+ };
839
+ });
840
+ if (!transitioned) return cancelled;
841
+
842
+ if (!terminated) {
843
+ const cleanupWarnings = [
844
+ `Oracle runtime cleanup is blocked because worker PID ${current.workerPid ?? "unknown"} could not be terminated safely.`,
845
+ ];
846
+ return updateJob(id, (job) => ({
847
+ ...job,
848
+ cleanupWarnings: [...(job.cleanupWarnings || []), ...cleanupWarnings],
849
+ lastCleanupAt: now,
850
+ error: [job.error, ...cleanupWarnings].filter(Boolean).join("\n"),
851
+ }));
852
+ }
853
+
854
+ const cleanupReport = await cleanupJobResources(cancelled);
855
+ if (cleanupReport.warnings.length === 0) {
856
+ const finalized = await clearCleanupPending(id, now);
857
+ return finalized ?? cancelled;
858
+ }
859
+
860
+ return updateJob(id, (job) => ({
861
+ ...job,
862
+ cleanupPending: false,
863
+ cleanupWarnings: [...(job.cleanupWarnings || []), ...cleanupReport.warnings],
864
+ lastCleanupAt: now,
865
+ error: [job.error, ...cleanupReport.warnings].filter(Boolean).join("\n"),
866
+ }));
867
+ });
506
868
  }
507
869
 
508
870
  export async function createJob(
@@ -512,6 +874,7 @@ export async function createJob(
512
874
  originSessionFile: string | undefined,
513
875
  config: OracleConfig,
514
876
  runtime: OracleRuntimeAllocation,
877
+ options?: { initialState?: "queued" | "submitted"; createdAt?: string },
515
878
  ): Promise<OracleJob> {
516
879
  const jobDir = getJobDir(id);
517
880
  const logsDir = join(jobDir, "logs");
@@ -522,7 +885,8 @@ export async function createJob(
522
885
  const reasoningPath = join(jobDir, "reasoning.md");
523
886
  const artifactsManifestPath = join(jobDir, "artifacts.json");
524
887
  const projectId = getProjectId(cwd);
525
- const sessionId = getSessionId(originSessionFile, projectId);
888
+ const sessionFile = requirePersistedSessionFile(originSessionFile, "create oracle jobs");
889
+ const sessionId = getSessionId(sessionFile, projectId);
526
890
  const conversationId = parseConversationId(input.chatUrl);
527
891
 
528
892
  await mkdir(jobDir, { recursive: true, mode: 0o700 });
@@ -534,22 +898,24 @@ export async function createJob(
534
898
  await writeFile(promptPath, input.prompt, { encoding: "utf8", mode: 0o600 });
535
899
  await chmod(promptPath, 0o600).catch(() => undefined);
536
900
 
537
- const now = new Date().toISOString();
901
+ const createdAt = options?.createdAt ?? new Date().toISOString();
902
+ const initialState = options?.initialState ?? "submitted";
538
903
  const normalizedEffort = input.modelFamily === "instant" ? undefined : (input.effort ?? config.defaults.effort);
539
904
  const normalizedAutoSwitchToThinking = input.modelFamily === "instant"
540
905
  ? (input.autoSwitchToThinking ?? config.defaults.autoSwitchToThinking)
541
906
  : false;
542
907
  const job: OracleJob = {
543
908
  id,
544
- status: "submitted",
545
- phase: "submitted",
546
- phaseAt: now,
547
- createdAt: now,
548
- submittedAt: now,
909
+ status: initialState,
910
+ phase: initialState,
911
+ phaseAt: createdAt,
912
+ createdAt,
913
+ queuedAt: initialState === "queued" ? createdAt : undefined,
914
+ submittedAt: initialState === "submitted" ? createdAt : undefined,
549
915
  cwd,
550
916
  projectId,
551
917
  sessionId,
552
- originSessionFile,
918
+ originSessionFile: sessionFile,
553
919
  requestSource: input.requestSource,
554
920
  chatModelFamily: input.modelFamily,
555
921
  effort: normalizedEffort,