patchrelay 0.76.0 → 0.78.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,10 +2,11 @@ import { CLEARED_FAILURE_PROVENANCE } from "./failure-provenance.js";
2
2
  import { buildStageReport, countEventMethods } from "./run-reporting.js";
3
3
  import { buildRunCompletedActivity, buildRunFailureActivity } from "./linear-session-reporting.js";
4
4
  import { handleNoPrCompletionCheck } from "./no-pr-completion-check.js";
5
- import { resolveCompletedRunState } from "./run-completion-policy.js";
5
+ import { resolvePostRunFactoryState } from "./run-completion-policy.js";
6
6
  import { computeChangeIdentityFromWorktree } from "./change-identity.js";
7
7
  import { inspectGitWorktreeStatus, isRepairRunType } from "./git-worktree-status.js";
8
8
  import { buildRunOutcomeSummary } from "./run-outcome-summary.js";
9
+ import { settleRun } from "./run-settlement.js";
9
10
  const WRITER = "run-finalizer";
10
11
  function parseEventJson(eventJson) {
11
12
  if (!eventJson)
@@ -376,7 +377,9 @@ export class RunFinalizer {
376
377
  }
377
378
  const verifiedRepairError = await this.completionPolicy.verifyReactiveRunAdvancedBranch(run, freshIssue);
378
379
  if (verifiedRepairError) {
379
- const holdState = params.resolveRecoverableRunState(freshIssue) ?? "failed";
380
+ // The run failed verification — it did not do its work, so resolve
381
+ // the hold state from GitHub truth like any other recovery path.
382
+ const holdState = resolvePostRunFactoryState(freshIssue, run, { outcome: "recovered" }) ?? "failed";
380
383
  this.failRunAndClear(run, verifiedRepairError, holdState);
381
384
  this.syncFailureOutcome({
382
385
  run,
@@ -446,23 +449,22 @@ export class RunFinalizer {
446
449
  // any git error returns undefined and we leave the cache as-is.
447
450
  this.maybeUpdateLastPublishedIdentity(run, refreshedIssue);
448
451
  const postRunFollowUp = await this.completionPolicy.resolvePostRunFollowUp(run, refreshedIssue);
449
- const postRunState = postRunFollowUp?.factoryState ?? resolveCompletedRunState(refreshedIssue, run);
452
+ const postRunState = postRunFollowUp?.factoryState ?? resolvePostRunFactoryState(refreshedIssue, run);
450
453
  const outcomeSummary = this.buildOutcomeSummary({
451
454
  run,
452
455
  issue: refreshedIssue,
453
456
  postRunState,
454
457
  latestAssistantSummary: report.assistantMessages.at(-1),
455
458
  });
456
- // `refreshedIssue` was read before several async policy checks; a
457
- // version conflict here means a webhook landed mid-finalize. Re-resolve
458
- // the post-run state from the fresh row so we never regress it (e.g.
459
- // the PR merged while we were verifying the publish).
459
+ // `refreshedIssue` was read before several async policy checks; a webhook
460
+ // may have landed mid-finalize. settleRun re-reads the row inside its
461
+ // transaction and resolves the post-run state from that fresh truth, so
462
+ // we never regress it (e.g. the PR merged while we were verifying the
463
+ // publish). settleRun also owns the slot clear (plan §B1): it refuses to
464
+ // touch a slot that no longer points at this run.
460
465
  const buildCompletionUpdate = (record) => {
461
- const state = postRunFollowUp?.factoryState ?? resolveCompletedRunState(record, run);
466
+ const state = postRunFollowUp?.factoryState ?? resolvePostRunFactoryState(record, run);
462
467
  return {
463
- projectId: run.projectId,
464
- linearIssueId: run.linearIssueId,
465
- activeRunId: null,
466
468
  ...(state ? { factoryState: state } : {}),
467
469
  pendingRunType: null,
468
470
  pendingRunContextJson: null,
@@ -472,18 +474,17 @@ export class RunFinalizer {
472
474
  };
473
475
  };
474
476
  const completed = this.withHeldLease(run.projectId, run.linearIssueId, (lease) => {
475
- this.db.runs.finishRun(run.id, this.buildCompletedRunUpdate({
476
- threadId,
477
- ...(params.completedTurnId ? { completedTurnId: params.completedTurnId } : {}),
478
- report,
479
- outcomeSummary,
480
- }));
481
- this.db.issueSessions.commitIssueState({
482
- writer: WRITER,
477
+ settleRun({
478
+ db: this.db,
479
+ run,
480
+ finish: this.buildCompletedRunUpdate({
481
+ threadId,
482
+ ...(params.completedTurnId ? { completedTurnId: params.completedTurnId } : {}),
483
+ report,
484
+ outcomeSummary,
485
+ }),
483
486
  lease,
484
- expectedVersion: refreshedIssue.version,
485
- update: buildCompletionUpdate(refreshedIssue),
486
- onConflict: (current) => buildCompletionUpdate(current),
487
+ buildIssueUpdate: buildCompletionUpdate,
487
488
  });
488
489
  if (postRunFollowUp) {
489
490
  return this.appendWakeEventWithLease(lease, issue, postRunFollowUp.pendingRunType, postRunFollowUp.context, "post_run");
@@ -219,6 +219,11 @@ export class RunLauncher {
219
219
  update: { projectId: params.project.id, linearIssueId: params.issue.linearIssueId, threadId },
220
220
  });
221
221
  }
222
+ // Plan §B5: persist the thread id on the run row BEFORE startTurn is
223
+ // awaited, so a turn/completed notification arriving while the turn is
224
+ // starting can already resolve the run by thread id. The orchestrator
225
+ // re-records it (with the turn id) after the launch returns.
226
+ this.recordRunThread(params, threadId, parentThreadId);
222
227
  this.db.runs.updateLaunchPhase(params.run.id, "thread_started");
223
228
  try {
224
229
  const turn = await this.codex.startTurn({ threadId, cwd: params.worktreePath, input: params.prompt });
@@ -237,6 +242,9 @@ export class RunLauncher {
237
242
  lease: { projectId: params.project.id, linearIssueId: params.issue.linearIssueId, leaseId: params.leaseId },
238
243
  update: { projectId: params.project.id, linearIssueId: params.issue.linearIssueId, threadId },
239
244
  });
245
+ // Plan §B5: re-point the run row at the fresh thread before the
246
+ // retried startTurn, for the same notification race.
247
+ this.recordRunThread(params, threadId, parentThreadId);
240
248
  const turn = await this.codex.startTurn({ threadId, cwd: params.worktreePath, input: params.prompt });
241
249
  turnId = turn.turnId;
242
250
  this.db.runs.updateLaunchPhase(params.run.id, "turn_started");
@@ -285,6 +293,19 @@ export class RunLauncher {
285
293
  throw error;
286
294
  }
287
295
  }
296
+ // Persist the Codex thread id on the run row under the launch lease.
297
+ // Losing the lease here aborts the launch the same way assertLaunchLease
298
+ // does — the run row must not be touched by a worker that no longer owns
299
+ // the session.
300
+ recordRunThread(params, threadId, parentThreadId) {
301
+ const recorded = this.db.issueSessions.updateRunThreadWithLease({ projectId: params.project.id, linearIssueId: params.issue.linearIssueId, leaseId: params.leaseId }, params.run.id, { threadId, ...(parentThreadId ? { parentThreadId } : {}) });
302
+ if (recorded)
303
+ return;
304
+ const error = new Error("Lost issue-session lease while recording the Codex thread id");
305
+ error.name = "IssueSessionLeaseLostError";
306
+ this.logger.warn({ runId: params.run.id, issueId: params.issue.linearIssueId }, "Aborting run launch after losing issue-session lease while recording the Codex thread id");
307
+ throw error;
308
+ }
288
309
  async setInitialImplementationGoal(threadId, issue) {
289
310
  const goalSetter = this.codex.setThreadGoal;
290
311
  if (typeof goalSetter !== "function") {
@@ -1,6 +1,5 @@
1
1
  import { buildRunFailureActivity } from "./linear-session-reporting.js";
2
2
  import { extractTurnId, resolveRunCompletionStatus } from "./run-reporting.js";
3
- import { resolveRecoverablePostRunState } from "./interrupted-run-recovery.js";
4
3
  import { resolveFailureFactoryState } from "./reactive-pr-state.js";
5
4
  const WRITER = "run-notification-handler";
6
5
  const DEFAULT_PUBLISH_COMMAND_TIMEOUT_MS = 10 * 60 * 1000;
@@ -146,7 +145,6 @@ export class RunNotificationHandler {
146
145
  thread,
147
146
  threadId,
148
147
  ...(completedTurnId ? { completedTurnId } : {}),
149
- resolveRecoverableRunState: resolveRecoverablePostRunState,
150
148
  });
151
149
  this.activeThreadId = undefined;
152
150
  }
@@ -9,16 +9,16 @@ import { IdleIssueReconciler } from "./idle-reconciliation.js";
9
9
  import { LinearSessionSync } from "./linear-session-sync.js";
10
10
  import { recoverLinearAgentActivityContext } from "./linear-agent-activity-recovery.js";
11
11
  import { IssueSessionLeaseService } from "./issue-session-lease-service.js";
12
- import { InterruptedRunRecovery } from "./interrupted-run-recovery.js";
13
12
  import { RunCompletionPolicy } from "./run-completion-policy.js";
13
+ import { RunFailurePolicy } from "./run-failure-policy.js";
14
14
  import { RunFinalizer } from "./run-finalizer.js";
15
15
  import { RunLauncher } from "./run-launcher.js";
16
16
  import { RunNotificationHandler } from "./run-notification-handler.js";
17
17
  import { RunReconciler } from "./run-reconciler.js";
18
- import { RunRecoveryService } from "./run-recovery-service.js";
19
18
  import { RunWakePlanner } from "./run-wake-planner.js";
20
19
  import { WakeDispatcher } from "./wake-dispatcher.js";
21
- import { getRemainingZombieRecoveryDelayMs } from "./zombie-recovery.js";
20
+ import { settleRun } from "./run-settlement.js";
21
+ import { getRemainingZombieRecoveryDelayMs } from "./run-budgets.js";
22
22
  import { classifyIssue } from "./issue-class.js";
23
23
  import { buildIssueTriageHash, IssueTriageService } from "./issue-triage.js";
24
24
  import { loadConfig } from "./config.js";
@@ -27,10 +27,6 @@ import { emitTelemetry, noopTelemetry } from "./telemetry.js";
27
27
  import { LinearIssueProjectionService } from "./linear-issue-projection.js";
28
28
  import { RunAdmissionController } from "./run-admission-controller.js";
29
29
  const WRITER = "run-orchestrator";
30
- // A terminal run must hold the active slot for at least this long before
31
- // the orchestrator force-clears it, so we never race the normal
32
- // notification-driven finalize that runs within seconds of completion.
33
- const DANGLING_ACTIVE_RUN_MIN_AGE_MS = 2 * 60_000;
34
30
  function lowerCaseFirst(value) {
35
31
  return value ? `${value.slice(0, 1).toLowerCase()}${value.slice(1)}` : value;
36
32
  }
@@ -62,9 +58,8 @@ export class RunOrchestrator {
62
58
  leaseService;
63
59
  runFinalizer;
64
60
  runLauncher;
65
- runRecovery;
61
+ runFailurePolicy;
66
62
  runWakePlanner;
67
- interruptedRunRecovery;
68
63
  runCompletionPolicy;
69
64
  completionCheck;
70
65
  issueTriage;
@@ -86,7 +81,6 @@ export class RunOrchestrator {
86
81
  recoveryPorts = {
87
82
  failRunAndClear: (run, message, nextState) => this.failRunAndClear(run, message, nextState),
88
83
  restoreIdleWorktree: (issue) => this.restoreIdleWorktree(issue),
89
- recoverOrEscalate: (issue, runType, reason) => this.recoverOrEscalate(issue, runType, reason),
90
84
  };
91
85
  activeSessionLeases;
92
86
  botIdentity;
@@ -138,9 +132,8 @@ export class RunOrchestrator {
138
132
  this.runFinalizer = new RunFinalizer(db, logger, this.linearSync, this.wakeDispatcher, this.leasePorts.withHeldLease, this.leasePorts.releaseLease, (lease, issue, runType, context, dedupeScope) => this.appendWakeEventWithLease(lease, issue, runType, context, dedupeScope), this.recoveryPorts.failRunAndClear, this.runCompletionPolicy, this.completionCheck, feed);
139
133
  this.runLauncher = new RunLauncher(config, db, codex, logger, this.worktreeManager);
140
134
  this.runNotificationHandler = new RunNotificationHandler(config, db, logger, this.linearSync, this.runFinalizer, this.threadPorts.readThreadWithRetry, this.leasePorts.withHeldLease, this.leasePorts.heartbeatLease, this.leasePorts.releaseLease, feed, { interruptTurn: (options) => codex.interruptTurn(options) });
141
- this.runRecovery = new RunRecoveryService(db, logger, this.linearSync, this.leasePorts.withHeldLease, this.leasePorts.getHeldLease, (lease, issue, runType, context, dedupeScope) => this.appendWakeEventWithLease(lease, issue, runType, context, dedupeScope), this.leasePorts.releaseLease, (projectId, issueId) => this.enqueueIssue(projectId, issueId), feed);
142
- this.interruptedRunRecovery = new InterruptedRunRecovery(db, logger, this.linearSync, this.leasePorts.withHeldLease, this.leasePorts.releaseLease, this.recoveryPorts.failRunAndClear, this.recoveryPorts.restoreIdleWorktree, this.runCompletionPolicy, (projectId, issueId) => this.enqueueIssue(projectId, issueId), feed);
143
- this.runReconciler = new RunReconciler(db, logger, linearProvider, this.linearSync, this.interruptedRunRecovery, this.runFinalizer, this.leasePorts.withHeldLease, this.leasePorts.releaseLease, this.threadPorts.readThreadWithRetry, this.recoveryPorts.recoverOrEscalate, (projectId) => this.config.projects.find((project) => project.id === projectId)?.github?.repoFullName, feed);
135
+ this.runFailurePolicy = new RunFailurePolicy(db, logger, this.linearSync, this.leasePorts.withHeldLease, this.leasePorts.releaseLease, (lease, issue, runType, context, dedupeScope) => this.appendWakeEventWithLease(lease, issue, runType, context, dedupeScope), this.wakeDispatcher, this.recoveryPorts.restoreIdleWorktree, this.runCompletionPolicy, (projectId) => this.config.projects.find((project) => project.id === projectId), feed);
136
+ this.runReconciler = new RunReconciler(db, logger, linearProvider, this.linearSync, this.runFailurePolicy, this.runFinalizer, this.leasePorts.withHeldLease, this.leasePorts.releaseLease, this.threadPorts.readThreadWithRetry, (projectId) => this.config.projects.find((project) => project.id === projectId)?.github?.repoFullName, feed, telemetry);
144
137
  this.runWakePlanner = new RunWakePlanner(db);
145
138
  this.linearIssueProjection = new LinearIssueProjectionService(db, linearProvider, logger);
146
139
  this.runAdmission = new RunAdmissionController(db, this.linearIssueProjection);
@@ -589,10 +582,10 @@ export class RunOrchestrator {
589
582
  for (const run of this.db.runs.listRunningRuns()) {
590
583
  await this.reconcileRun(run);
591
584
  }
592
- // Free any issue whose active slot is pinned to an already-terminal
585
+ // Settle any issue whose active slot is pinned to an already-terminal
593
586
  // run (post-run finalize interrupted by restart). Must run before the
594
587
  // idle reconciler so the freed issue is routed in this same pass.
595
- this.finalizeDanglingActiveRuns();
588
+ this.settleDanglingActiveRuns();
596
589
  // Preemptively detect stuck merge-queue PRs (conflicts visible on
597
590
  // GitHub) and dispatch queue_repair before the Steward evicts.
598
591
  await this.queueHealthMonitor.reconcile();
@@ -605,70 +598,35 @@ export class RunOrchestrator {
605
598
  advanceIdleIssue(issue, newState, options) {
606
599
  this.idleReconciler.advanceIdleIssue(issue, newState, options);
607
600
  }
608
- /**
609
- * After a zombie/stale run is cleared, decide whether to re-enqueue
610
- * or escalate. Checks: PR already merged → done; budget exhausted →
611
- * escalate; backoff delay not elapsed → skip.
612
- */
613
- recoverOrEscalate(issue, runType, reason) {
614
- this.runRecovery.recoverOrEscalate({
615
- issue,
616
- runType,
617
- reason,
618
- isRequestedChangesRunType,
619
- });
620
- }
621
- // Clear a dangling active slot: an issue still pointing at an
601
+ // Settle a dangling active slot: an issue still pointing at an
622
602
  // already-terminal run via `activeRunId`. The post-run finalize was
623
603
  // interrupted (almost always a restart between marking the run
624
604
  // terminal and clearing the slot), so the run can never drive the
625
605
  // session forward, yet every idle/recovery pass skips the issue
626
- // because `activeRunId` is set. We re-read under the issue-session
627
- // lease and null the slot; the idle reconciler then routes the issue
628
- // from GitHub truth (e.g. a missed changes_requested → review_fix).
629
- finalizeDanglingActiveRuns() {
606
+ // because `activeRunId` is set. settleRun is idempotent and its slot
607
+ // clear is a predicate-guarded versioned commit, so no age gate is
608
+ // needed — it cannot destructively race the notification finalizer.
609
+ // The idle reconciler then routes the issue from GitHub truth (e.g. a
610
+ // missed changes_requested → review_fix).
611
+ settleDanglingActiveRuns() {
630
612
  for (const issue of this.db.issues.listIssuesWithTerminalActiveRun()) {
631
613
  if (issue.activeRunId === undefined)
632
614
  continue;
633
615
  const run = this.db.runs.getRunById(issue.activeRunId);
634
- // The query already filters to terminal runs; this guards against a
635
- // race where the run advanced back to active between query and read.
636
- if (!run || run.status === "running" || run.status === "queued")
637
- continue;
638
- // Hold off until the run has been terminal long enough that the
639
- // normal notification-driven finalize has demonstrably not run —
640
- // avoids racing a live completion that is milliseconds from clearing
641
- // the slot itself.
642
- const endedAtMs = run.endedAt ? Date.parse(run.endedAt) : Number.NaN;
643
- if (Number.isFinite(endedAtMs) && Date.now() - endedAtMs < DANGLING_ACTIVE_RUN_MIN_AGE_MS)
616
+ if (!run)
644
617
  continue;
645
618
  const lease = this.claimLeaseForReconciliation(run.projectId, run.linearIssueId);
646
- // "skip" → a live lease owns the session (a real run is in flight);
647
- // leave it alone. "owned" → an outer local scope holds it, so we
648
- // must not release it here.
619
+ // "skip" → a live lease owns the session (a worker is mid-finalize or
620
+ // mid-launch); settleRun could not corrupt its writes, but deferring
621
+ // lets the owner land its richer post-run state first. "owned" → an
622
+ // outer local scope holds it, so we must not release it here.
649
623
  if (lease === "skip")
650
624
  continue;
651
625
  try {
652
- const cleared = this.withHeldIssueSessionLease(run.projectId, run.linearIssueId, (held) => {
653
- const fresh = this.db.issues.getIssue(run.projectId, run.linearIssueId);
654
- if (!fresh || fresh.activeRunId !== run.id)
655
- return false;
656
- const danglingClear = {
657
- projectId: run.projectId,
658
- linearIssueId: run.linearIssueId,
659
- activeRunId: null,
660
- };
661
- const commit = this.db.issueSessions.commitIssueState({
662
- writer: WRITER,
663
- lease: held,
664
- expectedVersion: fresh.version,
665
- update: danglingClear,
666
- // Never clear a slot a concurrent writer re-pointed elsewhere.
667
- onConflict: (current) => (current.activeRunId === run.id ? danglingClear : undefined),
668
- });
669
- return commit.outcome === "applied";
670
- });
671
- if (cleared) {
626
+ // No `finish` outcome: the run is already terminal, and settleRun
627
+ // leaves a run that raced back to non-terminal status untouched.
628
+ const settled = this.withHeldIssueSessionLease(run.projectId, run.linearIssueId, (held) => settleRun({ db: this.db, run, lease: held }));
629
+ if (settled?.slotCleared) {
672
630
  this.logger.warn({ issueKey: issue.issueKey, runId: run.id, runType: run.runType, runStatus: run.status }, "Cleared dangling active-run slot left by a terminal run; idle reconcile will resume the issue");
673
631
  this.feed?.publish({
674
632
  level: "warn",
@@ -701,14 +659,14 @@ export class RunOrchestrator {
701
659
  }
702
660
  // ─── Internal helpers ─────────────────────────────────────────────
703
661
  escalate(issue, runType, reason) {
704
- this.runRecovery.escalate({
662
+ this.runFailurePolicy.escalate({
705
663
  issue,
706
664
  runType,
707
665
  reason,
708
666
  });
709
667
  }
710
668
  failRunAndClear(run, message, nextState = "failed") {
711
- this.runRecovery.failRunAndClear({
669
+ this.runFailurePolicy.failRunAndClear({
712
670
  run,
713
671
  message,
714
672
  nextState,
@@ -3,10 +3,10 @@ import { TERMINAL_STATES } from "./factory-state.js";
3
3
  import { resolveAuthoritativeLinearStopState } from "./linear-workflow.js";
4
4
  import { buildRunFailureActivity } from "./linear-session-reporting.js";
5
5
  import { getThreadTurns } from "./codex-thread-utils.js";
6
- import { resolveRecoverablePostRunState } from "./interrupted-run-recovery.js";
7
6
  import { resolveEffectiveActiveRun } from "./effective-active-run.js";
8
7
  import { isThreadMaterializingError } from "./codex-thread-errors.js";
9
8
  import { fetchPullRequestSnapshot } from "./reconcile-pr-fetch.js";
9
+ import { emitTelemetry, noopTelemetry } from "./telemetry.js";
10
10
  const THREAD_MATERIALIZATION_GRACE_MS = 10 * 60_000;
11
11
  const WRITER = "run-reconciler";
12
12
  function isWithinThreadMaterializationGrace(run, nowMs = Date.now()) {
@@ -20,27 +20,27 @@ export class RunReconciler {
20
20
  logger;
21
21
  linearProvider;
22
22
  linearSync;
23
- interruptedRunRecovery;
23
+ failurePolicy;
24
24
  runFinalizer;
25
25
  withHeldLease;
26
26
  releaseLease;
27
27
  readThreadWithRetry;
28
- recoverOrEscalate;
29
28
  resolveRepoFullName;
30
29
  feed;
31
- constructor(db, logger, linearProvider, linearSync, interruptedRunRecovery, runFinalizer, withHeldLease, releaseLease, readThreadWithRetry, recoverOrEscalate, resolveRepoFullName = () => undefined, feed) {
30
+ telemetry;
31
+ constructor(db, logger, linearProvider, linearSync, failurePolicy, runFinalizer, withHeldLease, releaseLease, readThreadWithRetry, resolveRepoFullName = () => undefined, feed, telemetry = noopTelemetry) {
32
32
  this.db = db;
33
33
  this.logger = logger;
34
34
  this.linearProvider = linearProvider;
35
35
  this.linearSync = linearSync;
36
- this.interruptedRunRecovery = interruptedRunRecovery;
36
+ this.failurePolicy = failurePolicy;
37
37
  this.runFinalizer = runFinalizer;
38
38
  this.withHeldLease = withHeldLease;
39
39
  this.releaseLease = releaseLease;
40
40
  this.readThreadWithRetry = readThreadWithRetry;
41
- this.recoverOrEscalate = recoverOrEscalate;
42
41
  this.resolveRepoFullName = resolveRepoFullName;
43
42
  this.feed = feed;
43
+ this.telemetry = telemetry;
44
44
  }
45
45
  async reconcile(params) {
46
46
  const { run, issue, recoveryLease } = params;
@@ -67,6 +67,19 @@ export class RunReconciler {
67
67
  if (commit?.outcome === "applied") {
68
68
  effectiveIssue = commit.issue;
69
69
  this.logger.info({ issueKey: effectiveIssue.issueKey, runId: run.id, runType: run.runType }, "Reattached detached active run during reconciliation");
70
+ // Plan §B5: with settleRun idempotent and the launcher persisting the
71
+ // thread id before startTurn, this reattachment should never fire.
72
+ // Telemetry observes it for one release before the block is deleted.
73
+ emitTelemetry(this.telemetry, {
74
+ type: "health.invariant",
75
+ invariant: "detached_active_run",
76
+ status: "repaired",
77
+ projectId: run.projectId,
78
+ linearIssueId: run.linearIssueId,
79
+ ...(effectiveIssue.issueKey ? { issueKey: effectiveIssue.issueKey } : {}),
80
+ runId: run.id,
81
+ detail: `Reattached detached active ${run.runType} run during reconciliation`,
82
+ });
70
83
  }
71
84
  else if (commit?.outcome === "conflict_skipped" && commit.issue) {
72
85
  effectiveIssue = commit.issue;
@@ -111,19 +124,14 @@ export class RunReconciler {
111
124
  return;
112
125
  }
113
126
  this.logger.warn({ issueKey: effectiveIssue.issueKey, runId: run.id, runType: run.runType }, "Zombie run detected (no thread)");
114
- const zombieClear = { projectId: run.projectId, linearIssueId: run.linearIssueId, activeRunId: null };
115
- this.withHeldLease(run.projectId, run.linearIssueId, () => {
116
- const commit = this.db.issueSessions.commitIssueState({
117
- writer: WRITER,
118
- expectedVersion: effectiveIssue.version,
119
- update: zombieClear,
120
- onConflict: (current) => (current.activeRunId === run.id ? zombieClear : undefined),
121
- });
122
- if (commit.outcome !== "applied")
123
- return;
124
- this.db.runs.finishRun(run.id, { status: "failed", failureReason: "Zombie: never started (no thread after restart)" });
127
+ // Detection only — the failure policy settles the run and decides
128
+ // retry vs escalate (plan §B4).
129
+ this.failurePolicy.settleStrandedRunAndRecover({
130
+ run,
131
+ issue: effectiveIssue,
132
+ reason: "zombie",
133
+ failureReason: "Zombie: never started (no thread after restart)",
125
134
  });
126
- this.recoverOrEscalate(effectiveIssue, run.runType, "zombie");
127
135
  const recoveredIssue = this.db.issues.getIssue(run.projectId, run.linearIssueId) ?? effectiveIssue;
128
136
  void this.linearSync.emitActivity(recoveredIssue, buildRunFailureActivity(run.runType, "The Codex turn never started before PatchRelay restarted."));
129
137
  void this.linearSync.syncSession(recoveredIssue, { activeRunType: run.runType });
@@ -144,19 +152,14 @@ export class RunReconciler {
144
152
  return;
145
153
  }
146
154
  this.logger.warn({ issueKey: effectiveIssue.issueKey, runId: run.id, runType: run.runType, threadId: run.threadId }, "Stale thread during reconciliation");
147
- const staleClear = { projectId: run.projectId, linearIssueId: run.linearIssueId, activeRunId: null };
148
- this.withHeldLease(run.projectId, run.linearIssueId, () => {
149
- const commit = this.db.issueSessions.commitIssueState({
150
- writer: WRITER,
151
- expectedVersion: effectiveIssue.version,
152
- update: staleClear,
153
- onConflict: (current) => (current.activeRunId === run.id ? staleClear : undefined),
154
- });
155
- if (commit.outcome !== "applied")
156
- return;
157
- this.db.runs.finishRun(run.id, { status: "failed", failureReason: "Stale thread after restart" });
155
+ // Detection only — the failure policy settles the run and decides
156
+ // retry vs escalate (plan §B4).
157
+ this.failurePolicy.settleStrandedRunAndRecover({
158
+ run,
159
+ issue: effectiveIssue,
160
+ reason: "stale_thread",
161
+ failureReason: "Stale thread after restart",
158
162
  });
159
- this.recoverOrEscalate(effectiveIssue, run.runType, "stale_thread");
160
163
  const recoveredIssue = this.db.issues.getIssue(run.projectId, run.linearIssueId) ?? effectiveIssue;
161
164
  void this.linearSync.emitActivity(recoveredIssue, buildRunFailureActivity(run.runType, "PatchRelay lost the active Codex thread after restart and needs to recover."));
162
165
  void this.linearSync.syncSession(recoveredIssue, { activeRunType: run.runType });
@@ -207,7 +210,7 @@ export class RunReconciler {
207
210
  }
208
211
  const latestTurn = getThreadTurns(thread).at(-1);
209
212
  if (latestTurn?.status === "interrupted") {
210
- await this.interruptedRunRecovery.handle(run, effectiveIssue);
213
+ await this.failurePolicy.handleInterruptedRun(run, effectiveIssue);
211
214
  return;
212
215
  }
213
216
  if (latestTurn?.status === "completed") {
@@ -218,7 +221,6 @@ export class RunReconciler {
218
221
  thread,
219
222
  threadId: run.threadId,
220
223
  ...(latestTurn.id ? { completedTurnId: latestTurn.id } : {}),
221
- resolveRecoverableRunState: resolveRecoverablePostRunState,
222
224
  });
223
225
  return;
224
226
  }
@@ -0,0 +1,57 @@
1
+ const WRITER = "run-settlement";
2
+ const TERMINAL_RUN_STATUSES = new Set(["completed", "failed", "released", "superseded"]);
3
+ export function isTerminalRunStatus(status) {
4
+ return TERMINAL_RUN_STATUSES.has(status);
5
+ }
6
+ // Phase B1 (core simplification plan): the fast, transactional, idempotent
7
+ // half of run finalization. One transaction marks the run terminal and
8
+ // clears the issue's active slot — the two writes whose separation caused
9
+ // the dangling-active-run freeze (PR #566): a restart landing between them
10
+ // left `activeRunId` pointing at a terminal run forever, hiding the issue
11
+ // from every idle/recovery pass. Safe to call from both the notification
12
+ // finalizer and reconciliation at any time:
13
+ // - already-terminal run → finishRun skipped;
14
+ // - slot already cleared or re-pointed at another run → issue untouched;
15
+ // - non-terminal run with no `finish` outcome → full no-op.
16
+ export function settleRun(params) {
17
+ const { db, run } = params;
18
+ return db.transaction(() => {
19
+ const freshRun = db.runs.getRunById(run.id);
20
+ if (!freshRun) {
21
+ return { runFinished: false, slotCleared: false, issue: db.issues.getIssue(run.projectId, run.linearIssueId) };
22
+ }
23
+ let runFinished = false;
24
+ if (!isTerminalRunStatus(freshRun.status)) {
25
+ if (!params.finish) {
26
+ return { runFinished: false, slotCleared: false, issue: db.issues.getIssue(run.projectId, run.linearIssueId) };
27
+ }
28
+ db.runs.finishRun(run.id, params.finish);
29
+ runFinished = true;
30
+ }
31
+ const current = db.issues.getIssue(run.projectId, run.linearIssueId);
32
+ if (!current || current.activeRunId !== run.id) {
33
+ return { runFinished, slotCleared: false, issue: current };
34
+ }
35
+ const buildUpdate = (record) => ({
36
+ projectId: run.projectId,
37
+ linearIssueId: run.linearIssueId,
38
+ ...params.buildIssueUpdate?.(record),
39
+ // After the caller-provided fields so nothing can override the clear.
40
+ activeRunId: null,
41
+ });
42
+ const commit = db.issueSessions.commitIssueState({
43
+ writer: WRITER,
44
+ ...(params.lease ? { lease: params.lease } : {}),
45
+ expectedVersion: current.version,
46
+ update: buildUpdate(current),
47
+ // The read above happened inside this same transaction, so a version
48
+ // conflict cannot normally occur; the predicate keeps the invariant
49
+ // explicit: never clear a slot that was re-pointed at another run.
50
+ onConflict: (fresh) => (fresh.activeRunId === run.id ? buildUpdate(fresh) : undefined),
51
+ });
52
+ if (commit.outcome !== "applied") {
53
+ return { runFinished, slotCleared: false, issue: commit.outcome === "conflict_skipped" ? commit.issue : current };
54
+ }
55
+ return { runFinished, slotCleared: true, issue: commit.issue };
56
+ });
57
+ }
package/dist/service.js CHANGED
@@ -14,6 +14,7 @@ import { ServiceStartupRecovery } from "./service-startup-recovery.js";
14
14
  import { WakeDispatcher } from "./wake-dispatcher.js";
15
15
  import { WebhookHandler } from "./webhook-handler.js";
16
16
  import { acceptIncomingWebhook } from "./service-webhooks.js";
17
+ import { ABANDONED_PENDING_WEBHOOK_AGE_MS } from "./db/webhook-event-store.js";
17
18
  import { runWebhookEventRetention } from "./event-retention.js";
18
19
  import { parseStringArray, TrackedIssueListQuery } from "./tracked-issue-list-query.js";
19
20
  import { AgentInputService } from "./agent-input-service.js";
@@ -103,6 +104,7 @@ export class PatchRelayService {
103
104
  }
104
105
  async start() {
105
106
  this.db.issueSessions.releaseExpiredIssueSessionLeases();
107
+ this.sweepAbandonedWebhookEvents();
106
108
  const repairedInstallations = this.db.linearInstallations.repairProjectInstallations(this.config.projects.map((project) => project.id));
107
109
  for (const repair of repairedInstallations) {
108
110
  this.logger.info({ projectId: repair.projectId, installationId: repair.installationId, reason: repair.reason }, "Repaired Linear project installation link");
@@ -287,6 +289,26 @@ export class PatchRelayService {
287
289
  getReadiness() {
288
290
  return this.runtime.getReadiness();
289
291
  }
292
+ // Core simplification plan §C2: webhook_events is a dedupe + forensics log,
293
+ // not a replay queue. A row stuck at 'pending' means a crash or restart
294
+ // interrupted processing; the event will never be replayed (recovery is
295
+ // re-derivation from GitHub/Linear via reconciliation), so mark it
296
+ // 'abandoned' — making it archiveable — and surface the count to the
297
+ // operator, because every abandoned row is a crash worth seeing.
298
+ sweepAbandonedWebhookEvents() {
299
+ const cutoffIso = new Date(Date.now() - ABANDONED_PENDING_WEBHOOK_AGE_MS).toISOString();
300
+ const abandoned = this.db.webhookEvents.markAbandonedPendingEventsBefore(cutoffIso);
301
+ if (abandoned === 0)
302
+ return;
303
+ this.logger.warn({ abandoned, cutoffIso }, "Marked stale pending webhook events as abandoned at startup");
304
+ this.feed.publish({
305
+ level: "warn",
306
+ kind: "webhook",
307
+ status: "abandoned_events",
308
+ summary: `Startup: marked ${abandoned} stale pending webhook event(s) as abandoned`,
309
+ detail: "Processing was interrupted (crash/restart). State recovers via reconciliation; the rows stay archiveable for forensics.",
310
+ });
311
+ }
290
312
  scheduleEventRetention(delayMs = 24 * 60 * 60 * 1000) {
291
313
  if (this.eventRetentionTimer !== undefined) {
292
314
  clearTimeout(this.eventRetentionTimer);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "patchrelay",
3
- "version": "0.76.0",
3
+ "version": "0.78.0",
4
4
  "license": "MIT",
5
5
  "type": "module",
6
6
  "repository": {