patchrelay 0.76.0 → 0.78.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/build-info.json +3 -3
- package/dist/db/webhook-event-store.js +22 -0
- package/dist/failure-provenance.js +40 -0
- package/dist/github-webhook-late-publication-guard.js +49 -15
- package/dist/github-webhook-policy.js +36 -25
- package/dist/github-webhook-sequence-backstop.js +45 -2
- package/dist/github-webhook-state-projector.js +5 -12
- package/dist/idle-reconciliation.js +63 -38
- package/dist/pr-facts-derivation.js +81 -0
- package/dist/run-budgets.js +40 -6
- package/dist/run-completion-policy.js +50 -9
- package/dist/run-failure-policy.js +463 -0
- package/dist/run-finalizer.js +23 -22
- package/dist/run-launcher.js +21 -0
- package/dist/run-notification-handler.js +0 -2
- package/dist/run-orchestrator.js +26 -68
- package/dist/run-reconciler.js +34 -32
- package/dist/run-settlement.js +57 -0
- package/dist/service.js +22 -0
- package/package.json +1 -1
- package/dist/interrupted-run-recovery.js +0 -240
- package/dist/run-recovery-service.js +0 -239
- package/dist/zombie-recovery.js +0 -13
package/dist/run-finalizer.js
CHANGED
|
@@ -2,10 +2,11 @@ import { CLEARED_FAILURE_PROVENANCE } from "./failure-provenance.js";
|
|
|
2
2
|
import { buildStageReport, countEventMethods } from "./run-reporting.js";
|
|
3
3
|
import { buildRunCompletedActivity, buildRunFailureActivity } from "./linear-session-reporting.js";
|
|
4
4
|
import { handleNoPrCompletionCheck } from "./no-pr-completion-check.js";
|
|
5
|
-
import {
|
|
5
|
+
import { resolvePostRunFactoryState } from "./run-completion-policy.js";
|
|
6
6
|
import { computeChangeIdentityFromWorktree } from "./change-identity.js";
|
|
7
7
|
import { inspectGitWorktreeStatus, isRepairRunType } from "./git-worktree-status.js";
|
|
8
8
|
import { buildRunOutcomeSummary } from "./run-outcome-summary.js";
|
|
9
|
+
import { settleRun } from "./run-settlement.js";
|
|
9
10
|
const WRITER = "run-finalizer";
|
|
10
11
|
function parseEventJson(eventJson) {
|
|
11
12
|
if (!eventJson)
|
|
@@ -376,7 +377,9 @@ export class RunFinalizer {
|
|
|
376
377
|
}
|
|
377
378
|
const verifiedRepairError = await this.completionPolicy.verifyReactiveRunAdvancedBranch(run, freshIssue);
|
|
378
379
|
if (verifiedRepairError) {
|
|
379
|
-
|
|
380
|
+
// The run failed verification — it did not do its work, so resolve
|
|
381
|
+
// the hold state from GitHub truth like any other recovery path.
|
|
382
|
+
const holdState = resolvePostRunFactoryState(freshIssue, run, { outcome: "recovered" }) ?? "failed";
|
|
380
383
|
this.failRunAndClear(run, verifiedRepairError, holdState);
|
|
381
384
|
this.syncFailureOutcome({
|
|
382
385
|
run,
|
|
@@ -446,23 +449,22 @@ export class RunFinalizer {
|
|
|
446
449
|
// any git error returns undefined and we leave the cache as-is.
|
|
447
450
|
this.maybeUpdateLastPublishedIdentity(run, refreshedIssue);
|
|
448
451
|
const postRunFollowUp = await this.completionPolicy.resolvePostRunFollowUp(run, refreshedIssue);
|
|
449
|
-
const postRunState = postRunFollowUp?.factoryState ??
|
|
452
|
+
const postRunState = postRunFollowUp?.factoryState ?? resolvePostRunFactoryState(refreshedIssue, run);
|
|
450
453
|
const outcomeSummary = this.buildOutcomeSummary({
|
|
451
454
|
run,
|
|
452
455
|
issue: refreshedIssue,
|
|
453
456
|
postRunState,
|
|
454
457
|
latestAssistantSummary: report.assistantMessages.at(-1),
|
|
455
458
|
});
|
|
456
|
-
// `refreshedIssue` was read before several async policy checks; a
|
|
457
|
-
//
|
|
458
|
-
// the post-run state from
|
|
459
|
-
// the PR merged while we were verifying the
|
|
459
|
+
// `refreshedIssue` was read before several async policy checks; a webhook
|
|
460
|
+
// may have landed mid-finalize. settleRun re-reads the row inside its
|
|
461
|
+
// transaction and resolves the post-run state from that fresh truth, so
|
|
462
|
+
// we never regress it (e.g. the PR merged while we were verifying the
|
|
463
|
+
// publish). settleRun also owns the slot clear (plan §B1): it refuses to
|
|
464
|
+
// touch a slot that no longer points at this run.
|
|
460
465
|
const buildCompletionUpdate = (record) => {
|
|
461
|
-
const state = postRunFollowUp?.factoryState ??
|
|
466
|
+
const state = postRunFollowUp?.factoryState ?? resolvePostRunFactoryState(record, run);
|
|
462
467
|
return {
|
|
463
|
-
projectId: run.projectId,
|
|
464
|
-
linearIssueId: run.linearIssueId,
|
|
465
|
-
activeRunId: null,
|
|
466
468
|
...(state ? { factoryState: state } : {}),
|
|
467
469
|
pendingRunType: null,
|
|
468
470
|
pendingRunContextJson: null,
|
|
@@ -472,18 +474,17 @@ export class RunFinalizer {
|
|
|
472
474
|
};
|
|
473
475
|
};
|
|
474
476
|
const completed = this.withHeldLease(run.projectId, run.linearIssueId, (lease) => {
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
477
|
+
settleRun({
|
|
478
|
+
db: this.db,
|
|
479
|
+
run,
|
|
480
|
+
finish: this.buildCompletedRunUpdate({
|
|
481
|
+
threadId,
|
|
482
|
+
...(params.completedTurnId ? { completedTurnId: params.completedTurnId } : {}),
|
|
483
|
+
report,
|
|
484
|
+
outcomeSummary,
|
|
485
|
+
}),
|
|
483
486
|
lease,
|
|
484
|
-
|
|
485
|
-
update: buildCompletionUpdate(refreshedIssue),
|
|
486
|
-
onConflict: (current) => buildCompletionUpdate(current),
|
|
487
|
+
buildIssueUpdate: buildCompletionUpdate,
|
|
487
488
|
});
|
|
488
489
|
if (postRunFollowUp) {
|
|
489
490
|
return this.appendWakeEventWithLease(lease, issue, postRunFollowUp.pendingRunType, postRunFollowUp.context, "post_run");
|
package/dist/run-launcher.js
CHANGED
|
@@ -219,6 +219,11 @@ export class RunLauncher {
|
|
|
219
219
|
update: { projectId: params.project.id, linearIssueId: params.issue.linearIssueId, threadId },
|
|
220
220
|
});
|
|
221
221
|
}
|
|
222
|
+
// Plan §B5: persist the thread id on the run row BEFORE startTurn is
|
|
223
|
+
// awaited, so a turn/completed notification arriving while the turn is
|
|
224
|
+
// starting can already resolve the run by thread id. The orchestrator
|
|
225
|
+
// re-records it (with the turn id) after the launch returns.
|
|
226
|
+
this.recordRunThread(params, threadId, parentThreadId);
|
|
222
227
|
this.db.runs.updateLaunchPhase(params.run.id, "thread_started");
|
|
223
228
|
try {
|
|
224
229
|
const turn = await this.codex.startTurn({ threadId, cwd: params.worktreePath, input: params.prompt });
|
|
@@ -237,6 +242,9 @@ export class RunLauncher {
|
|
|
237
242
|
lease: { projectId: params.project.id, linearIssueId: params.issue.linearIssueId, leaseId: params.leaseId },
|
|
238
243
|
update: { projectId: params.project.id, linearIssueId: params.issue.linearIssueId, threadId },
|
|
239
244
|
});
|
|
245
|
+
// Plan §B5: re-point the run row at the fresh thread before the
|
|
246
|
+
// retried startTurn, for the same notification race.
|
|
247
|
+
this.recordRunThread(params, threadId, parentThreadId);
|
|
240
248
|
const turn = await this.codex.startTurn({ threadId, cwd: params.worktreePath, input: params.prompt });
|
|
241
249
|
turnId = turn.turnId;
|
|
242
250
|
this.db.runs.updateLaunchPhase(params.run.id, "turn_started");
|
|
@@ -285,6 +293,19 @@ export class RunLauncher {
|
|
|
285
293
|
throw error;
|
|
286
294
|
}
|
|
287
295
|
}
|
|
296
|
+
// Persist the Codex thread id on the run row under the launch lease.
|
|
297
|
+
// Losing the lease here aborts the launch the same way assertLaunchLease
|
|
298
|
+
// does — the run row must not be touched by a worker that no longer owns
|
|
299
|
+
// the session.
|
|
300
|
+
recordRunThread(params, threadId, parentThreadId) {
|
|
301
|
+
const recorded = this.db.issueSessions.updateRunThreadWithLease({ projectId: params.project.id, linearIssueId: params.issue.linearIssueId, leaseId: params.leaseId }, params.run.id, { threadId, ...(parentThreadId ? { parentThreadId } : {}) });
|
|
302
|
+
if (recorded)
|
|
303
|
+
return;
|
|
304
|
+
const error = new Error("Lost issue-session lease while recording the Codex thread id");
|
|
305
|
+
error.name = "IssueSessionLeaseLostError";
|
|
306
|
+
this.logger.warn({ runId: params.run.id, issueId: params.issue.linearIssueId }, "Aborting run launch after losing issue-session lease while recording the Codex thread id");
|
|
307
|
+
throw error;
|
|
308
|
+
}
|
|
288
309
|
async setInitialImplementationGoal(threadId, issue) {
|
|
289
310
|
const goalSetter = this.codex.setThreadGoal;
|
|
290
311
|
if (typeof goalSetter !== "function") {
|
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import { buildRunFailureActivity } from "./linear-session-reporting.js";
|
|
2
2
|
import { extractTurnId, resolveRunCompletionStatus } from "./run-reporting.js";
|
|
3
|
-
import { resolveRecoverablePostRunState } from "./interrupted-run-recovery.js";
|
|
4
3
|
import { resolveFailureFactoryState } from "./reactive-pr-state.js";
|
|
5
4
|
const WRITER = "run-notification-handler";
|
|
6
5
|
const DEFAULT_PUBLISH_COMMAND_TIMEOUT_MS = 10 * 60 * 1000;
|
|
@@ -146,7 +145,6 @@ export class RunNotificationHandler {
|
|
|
146
145
|
thread,
|
|
147
146
|
threadId,
|
|
148
147
|
...(completedTurnId ? { completedTurnId } : {}),
|
|
149
|
-
resolveRecoverableRunState: resolveRecoverablePostRunState,
|
|
150
148
|
});
|
|
151
149
|
this.activeThreadId = undefined;
|
|
152
150
|
}
|
package/dist/run-orchestrator.js
CHANGED
|
@@ -9,16 +9,16 @@ import { IdleIssueReconciler } from "./idle-reconciliation.js";
|
|
|
9
9
|
import { LinearSessionSync } from "./linear-session-sync.js";
|
|
10
10
|
import { recoverLinearAgentActivityContext } from "./linear-agent-activity-recovery.js";
|
|
11
11
|
import { IssueSessionLeaseService } from "./issue-session-lease-service.js";
|
|
12
|
-
import { InterruptedRunRecovery } from "./interrupted-run-recovery.js";
|
|
13
12
|
import { RunCompletionPolicy } from "./run-completion-policy.js";
|
|
13
|
+
import { RunFailurePolicy } from "./run-failure-policy.js";
|
|
14
14
|
import { RunFinalizer } from "./run-finalizer.js";
|
|
15
15
|
import { RunLauncher } from "./run-launcher.js";
|
|
16
16
|
import { RunNotificationHandler } from "./run-notification-handler.js";
|
|
17
17
|
import { RunReconciler } from "./run-reconciler.js";
|
|
18
|
-
import { RunRecoveryService } from "./run-recovery-service.js";
|
|
19
18
|
import { RunWakePlanner } from "./run-wake-planner.js";
|
|
20
19
|
import { WakeDispatcher } from "./wake-dispatcher.js";
|
|
21
|
-
import {
|
|
20
|
+
import { settleRun } from "./run-settlement.js";
|
|
21
|
+
import { getRemainingZombieRecoveryDelayMs } from "./run-budgets.js";
|
|
22
22
|
import { classifyIssue } from "./issue-class.js";
|
|
23
23
|
import { buildIssueTriageHash, IssueTriageService } from "./issue-triage.js";
|
|
24
24
|
import { loadConfig } from "./config.js";
|
|
@@ -27,10 +27,6 @@ import { emitTelemetry, noopTelemetry } from "./telemetry.js";
|
|
|
27
27
|
import { LinearIssueProjectionService } from "./linear-issue-projection.js";
|
|
28
28
|
import { RunAdmissionController } from "./run-admission-controller.js";
|
|
29
29
|
const WRITER = "run-orchestrator";
|
|
30
|
-
// A terminal run must hold the active slot for at least this long before
|
|
31
|
-
// the orchestrator force-clears it, so we never race the normal
|
|
32
|
-
// notification-driven finalize that runs within seconds of completion.
|
|
33
|
-
const DANGLING_ACTIVE_RUN_MIN_AGE_MS = 2 * 60_000;
|
|
34
30
|
/**
 * Lower-case the first character of a string, leaving the rest untouched.
 *
 * Works on the first Unicode code point rather than the first UTF-16 unit,
 * so a leading astral-plane letter (stored as a surrogate pair) is
 * lower-cased as a whole instead of being split into a lone surrogate.
 *
 * @param {string | undefined | null} value - Text to adjust.
 * @returns {string | undefined | null} The adjusted text; falsy input
 *   (empty string, undefined, null) is returned unchanged.
 */
function lowerCaseFirst(value) {
    if (!value) {
        return value;
    }
    const first = String.fromCodePoint(value.codePointAt(0));
    return `${first.toLowerCase()}${value.slice(first.length)}`;
}
|
|
@@ -62,9 +58,8 @@ export class RunOrchestrator {
|
|
|
62
58
|
leaseService;
|
|
63
59
|
runFinalizer;
|
|
64
60
|
runLauncher;
|
|
65
|
-
|
|
61
|
+
runFailurePolicy;
|
|
66
62
|
runWakePlanner;
|
|
67
|
-
interruptedRunRecovery;
|
|
68
63
|
runCompletionPolicy;
|
|
69
64
|
completionCheck;
|
|
70
65
|
issueTriage;
|
|
@@ -86,7 +81,6 @@ export class RunOrchestrator {
|
|
|
86
81
|
recoveryPorts = {
|
|
87
82
|
failRunAndClear: (run, message, nextState) => this.failRunAndClear(run, message, nextState),
|
|
88
83
|
restoreIdleWorktree: (issue) => this.restoreIdleWorktree(issue),
|
|
89
|
-
recoverOrEscalate: (issue, runType, reason) => this.recoverOrEscalate(issue, runType, reason),
|
|
90
84
|
};
|
|
91
85
|
activeSessionLeases;
|
|
92
86
|
botIdentity;
|
|
@@ -138,9 +132,8 @@ export class RunOrchestrator {
|
|
|
138
132
|
this.runFinalizer = new RunFinalizer(db, logger, this.linearSync, this.wakeDispatcher, this.leasePorts.withHeldLease, this.leasePorts.releaseLease, (lease, issue, runType, context, dedupeScope) => this.appendWakeEventWithLease(lease, issue, runType, context, dedupeScope), this.recoveryPorts.failRunAndClear, this.runCompletionPolicy, this.completionCheck, feed);
|
|
139
133
|
this.runLauncher = new RunLauncher(config, db, codex, logger, this.worktreeManager);
|
|
140
134
|
this.runNotificationHandler = new RunNotificationHandler(config, db, logger, this.linearSync, this.runFinalizer, this.threadPorts.readThreadWithRetry, this.leasePorts.withHeldLease, this.leasePorts.heartbeatLease, this.leasePorts.releaseLease, feed, { interruptTurn: (options) => codex.interruptTurn(options) });
|
|
141
|
-
this.
|
|
142
|
-
this.
|
|
143
|
-
this.runReconciler = new RunReconciler(db, logger, linearProvider, this.linearSync, this.interruptedRunRecovery, this.runFinalizer, this.leasePorts.withHeldLease, this.leasePorts.releaseLease, this.threadPorts.readThreadWithRetry, this.recoveryPorts.recoverOrEscalate, (projectId) => this.config.projects.find((project) => project.id === projectId)?.github?.repoFullName, feed);
|
|
135
|
+
this.runFailurePolicy = new RunFailurePolicy(db, logger, this.linearSync, this.leasePorts.withHeldLease, this.leasePorts.releaseLease, (lease, issue, runType, context, dedupeScope) => this.appendWakeEventWithLease(lease, issue, runType, context, dedupeScope), this.wakeDispatcher, this.recoveryPorts.restoreIdleWorktree, this.runCompletionPolicy, (projectId) => this.config.projects.find((project) => project.id === projectId), feed);
|
|
136
|
+
this.runReconciler = new RunReconciler(db, logger, linearProvider, this.linearSync, this.runFailurePolicy, this.runFinalizer, this.leasePorts.withHeldLease, this.leasePorts.releaseLease, this.threadPorts.readThreadWithRetry, (projectId) => this.config.projects.find((project) => project.id === projectId)?.github?.repoFullName, feed, telemetry);
|
|
144
137
|
this.runWakePlanner = new RunWakePlanner(db);
|
|
145
138
|
this.linearIssueProjection = new LinearIssueProjectionService(db, linearProvider, logger);
|
|
146
139
|
this.runAdmission = new RunAdmissionController(db, this.linearIssueProjection);
|
|
@@ -589,10 +582,10 @@ export class RunOrchestrator {
|
|
|
589
582
|
for (const run of this.db.runs.listRunningRuns()) {
|
|
590
583
|
await this.reconcileRun(run);
|
|
591
584
|
}
|
|
592
|
-
//
|
|
585
|
+
// Settle any issue whose active slot is pinned to an already-terminal
|
|
593
586
|
// run (post-run finalize interrupted by restart). Must run before the
|
|
594
587
|
// idle reconciler so the freed issue is routed in this same pass.
|
|
595
|
-
this.
|
|
588
|
+
this.settleDanglingActiveRuns();
|
|
596
589
|
// Preemptively detect stuck merge-queue PRs (conflicts visible on
|
|
597
590
|
// GitHub) and dispatch queue_repair before the Steward evicts.
|
|
598
591
|
await this.queueHealthMonitor.reconcile();
|
|
@@ -605,70 +598,35 @@ export class RunOrchestrator {
|
|
|
605
598
|
advanceIdleIssue(issue, newState, options) {
|
|
606
599
|
this.idleReconciler.advanceIdleIssue(issue, newState, options);
|
|
607
600
|
}
|
|
608
|
-
|
|
609
|
-
* After a zombie/stale run is cleared, decide whether to re-enqueue
|
|
610
|
-
* or escalate. Checks: PR already merged → done; budget exhausted →
|
|
611
|
-
* escalate; backoff delay not elapsed → skip.
|
|
612
|
-
*/
|
|
613
|
-
recoverOrEscalate(issue, runType, reason) {
|
|
614
|
-
this.runRecovery.recoverOrEscalate({
|
|
615
|
-
issue,
|
|
616
|
-
runType,
|
|
617
|
-
reason,
|
|
618
|
-
isRequestedChangesRunType,
|
|
619
|
-
});
|
|
620
|
-
}
|
|
621
|
-
// Clear a dangling active slot: an issue still pointing at an
|
|
601
|
+
// Settle a dangling active slot: an issue still pointing at an
|
|
622
602
|
// already-terminal run via `activeRunId`. The post-run finalize was
|
|
623
603
|
// interrupted (almost always a restart between marking the run
|
|
624
604
|
// terminal and clearing the slot), so the run can never drive the
|
|
625
605
|
// session forward, yet every idle/recovery pass skips the issue
|
|
626
|
-
// because `activeRunId` is set.
|
|
627
|
-
//
|
|
628
|
-
//
|
|
629
|
-
|
|
606
|
+
// because `activeRunId` is set. settleRun is idempotent and its slot
|
|
607
|
+
// clear is a predicate-guarded versioned commit, so no age gate is
|
|
608
|
+
// needed — it cannot destructively race the notification finalizer.
|
|
609
|
+
// The idle reconciler then routes the issue from GitHub truth (e.g. a
|
|
610
|
+
// missed changes_requested → review_fix).
|
|
611
|
+
settleDanglingActiveRuns() {
|
|
630
612
|
for (const issue of this.db.issues.listIssuesWithTerminalActiveRun()) {
|
|
631
613
|
if (issue.activeRunId === undefined)
|
|
632
614
|
continue;
|
|
633
615
|
const run = this.db.runs.getRunById(issue.activeRunId);
|
|
634
|
-
|
|
635
|
-
// race where the run advanced back to active between query and read.
|
|
636
|
-
if (!run || run.status === "running" || run.status === "queued")
|
|
637
|
-
continue;
|
|
638
|
-
// Hold off until the run has been terminal long enough that the
|
|
639
|
-
// normal notification-driven finalize has demonstrably not run —
|
|
640
|
-
// avoids racing a live completion that is milliseconds from clearing
|
|
641
|
-
// the slot itself.
|
|
642
|
-
const endedAtMs = run.endedAt ? Date.parse(run.endedAt) : Number.NaN;
|
|
643
|
-
if (Number.isFinite(endedAtMs) && Date.now() - endedAtMs < DANGLING_ACTIVE_RUN_MIN_AGE_MS)
|
|
616
|
+
if (!run)
|
|
644
617
|
continue;
|
|
645
618
|
const lease = this.claimLeaseForReconciliation(run.projectId, run.linearIssueId);
|
|
646
|
-
// "skip" → a live lease owns the session (a
|
|
647
|
-
//
|
|
648
|
-
//
|
|
619
|
+
// "skip" → a live lease owns the session (a worker is mid-finalize or
|
|
620
|
+
// mid-launch); settleRun could not corrupt its writes, but deferring
|
|
621
|
+
// lets the owner land its richer post-run state first. "owned" → an
|
|
622
|
+
// outer local scope holds it, so we must not release it here.
|
|
649
623
|
if (lease === "skip")
|
|
650
624
|
continue;
|
|
651
625
|
try {
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
const danglingClear = {
|
|
657
|
-
projectId: run.projectId,
|
|
658
|
-
linearIssueId: run.linearIssueId,
|
|
659
|
-
activeRunId: null,
|
|
660
|
-
};
|
|
661
|
-
const commit = this.db.issueSessions.commitIssueState({
|
|
662
|
-
writer: WRITER,
|
|
663
|
-
lease: held,
|
|
664
|
-
expectedVersion: fresh.version,
|
|
665
|
-
update: danglingClear,
|
|
666
|
-
// Never clear a slot a concurrent writer re-pointed elsewhere.
|
|
667
|
-
onConflict: (current) => (current.activeRunId === run.id ? danglingClear : undefined),
|
|
668
|
-
});
|
|
669
|
-
return commit.outcome === "applied";
|
|
670
|
-
});
|
|
671
|
-
if (cleared) {
|
|
626
|
+
// No `finish` outcome: the run is already terminal, and settleRun
|
|
627
|
+
// leaves a run that raced back to non-terminal status untouched.
|
|
628
|
+
const settled = this.withHeldIssueSessionLease(run.projectId, run.linearIssueId, (held) => settleRun({ db: this.db, run, lease: held }));
|
|
629
|
+
if (settled?.slotCleared) {
|
|
672
630
|
this.logger.warn({ issueKey: issue.issueKey, runId: run.id, runType: run.runType, runStatus: run.status }, "Cleared dangling active-run slot left by a terminal run; idle reconcile will resume the issue");
|
|
673
631
|
this.feed?.publish({
|
|
674
632
|
level: "warn",
|
|
@@ -701,14 +659,14 @@ export class RunOrchestrator {
|
|
|
701
659
|
}
|
|
702
660
|
// ─── Internal helpers ─────────────────────────────────────────────
|
|
703
661
|
escalate(issue, runType, reason) {
|
|
704
|
-
this.
|
|
662
|
+
this.runFailurePolicy.escalate({
|
|
705
663
|
issue,
|
|
706
664
|
runType,
|
|
707
665
|
reason,
|
|
708
666
|
});
|
|
709
667
|
}
|
|
710
668
|
failRunAndClear(run, message, nextState = "failed") {
|
|
711
|
-
this.
|
|
669
|
+
this.runFailurePolicy.failRunAndClear({
|
|
712
670
|
run,
|
|
713
671
|
message,
|
|
714
672
|
nextState,
|
package/dist/run-reconciler.js
CHANGED
|
@@ -3,10 +3,10 @@ import { TERMINAL_STATES } from "./factory-state.js";
|
|
|
3
3
|
import { resolveAuthoritativeLinearStopState } from "./linear-workflow.js";
|
|
4
4
|
import { buildRunFailureActivity } from "./linear-session-reporting.js";
|
|
5
5
|
import { getThreadTurns } from "./codex-thread-utils.js";
|
|
6
|
-
import { resolveRecoverablePostRunState } from "./interrupted-run-recovery.js";
|
|
7
6
|
import { resolveEffectiveActiveRun } from "./effective-active-run.js";
|
|
8
7
|
import { isThreadMaterializingError } from "./codex-thread-errors.js";
|
|
9
8
|
import { fetchPullRequestSnapshot } from "./reconcile-pr-fetch.js";
|
|
9
|
+
import { emitTelemetry, noopTelemetry } from "./telemetry.js";
|
|
10
10
|
const THREAD_MATERIALIZATION_GRACE_MS = 10 * 60_000;
|
|
11
11
|
const WRITER = "run-reconciler";
|
|
12
12
|
function isWithinThreadMaterializationGrace(run, nowMs = Date.now()) {
|
|
@@ -20,27 +20,27 @@ export class RunReconciler {
|
|
|
20
20
|
logger;
|
|
21
21
|
linearProvider;
|
|
22
22
|
linearSync;
|
|
23
|
-
|
|
23
|
+
failurePolicy;
|
|
24
24
|
runFinalizer;
|
|
25
25
|
withHeldLease;
|
|
26
26
|
releaseLease;
|
|
27
27
|
readThreadWithRetry;
|
|
28
|
-
recoverOrEscalate;
|
|
29
28
|
resolveRepoFullName;
|
|
30
29
|
feed;
|
|
31
|
-
|
|
30
|
+
telemetry;
|
|
31
|
+
constructor(db, logger, linearProvider, linearSync, failurePolicy, runFinalizer, withHeldLease, releaseLease, readThreadWithRetry, resolveRepoFullName = () => undefined, feed, telemetry = noopTelemetry) {
|
|
32
32
|
this.db = db;
|
|
33
33
|
this.logger = logger;
|
|
34
34
|
this.linearProvider = linearProvider;
|
|
35
35
|
this.linearSync = linearSync;
|
|
36
|
-
this.
|
|
36
|
+
this.failurePolicy = failurePolicy;
|
|
37
37
|
this.runFinalizer = runFinalizer;
|
|
38
38
|
this.withHeldLease = withHeldLease;
|
|
39
39
|
this.releaseLease = releaseLease;
|
|
40
40
|
this.readThreadWithRetry = readThreadWithRetry;
|
|
41
|
-
this.recoverOrEscalate = recoverOrEscalate;
|
|
42
41
|
this.resolveRepoFullName = resolveRepoFullName;
|
|
43
42
|
this.feed = feed;
|
|
43
|
+
this.telemetry = telemetry;
|
|
44
44
|
}
|
|
45
45
|
async reconcile(params) {
|
|
46
46
|
const { run, issue, recoveryLease } = params;
|
|
@@ -67,6 +67,19 @@ export class RunReconciler {
|
|
|
67
67
|
if (commit?.outcome === "applied") {
|
|
68
68
|
effectiveIssue = commit.issue;
|
|
69
69
|
this.logger.info({ issueKey: effectiveIssue.issueKey, runId: run.id, runType: run.runType }, "Reattached detached active run during reconciliation");
|
|
70
|
+
// Plan §B5: with settleRun idempotent and the launcher persisting the
|
|
71
|
+
// thread id before startTurn, this reattachment should never fire.
|
|
72
|
+
// Telemetry observes it for one release before the block is deleted.
|
|
73
|
+
emitTelemetry(this.telemetry, {
|
|
74
|
+
type: "health.invariant",
|
|
75
|
+
invariant: "detached_active_run",
|
|
76
|
+
status: "repaired",
|
|
77
|
+
projectId: run.projectId,
|
|
78
|
+
linearIssueId: run.linearIssueId,
|
|
79
|
+
...(effectiveIssue.issueKey ? { issueKey: effectiveIssue.issueKey } : {}),
|
|
80
|
+
runId: run.id,
|
|
81
|
+
detail: `Reattached detached active ${run.runType} run during reconciliation`,
|
|
82
|
+
});
|
|
70
83
|
}
|
|
71
84
|
else if (commit?.outcome === "conflict_skipped" && commit.issue) {
|
|
72
85
|
effectiveIssue = commit.issue;
|
|
@@ -111,19 +124,14 @@ export class RunReconciler {
|
|
|
111
124
|
return;
|
|
112
125
|
}
|
|
113
126
|
this.logger.warn({ issueKey: effectiveIssue.issueKey, runId: run.id, runType: run.runType }, "Zombie run detected (no thread)");
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
});
|
|
122
|
-
if (commit.outcome !== "applied")
|
|
123
|
-
return;
|
|
124
|
-
this.db.runs.finishRun(run.id, { status: "failed", failureReason: "Zombie: never started (no thread after restart)" });
|
|
127
|
+
// Detection only — the failure policy settles the run and decides
|
|
128
|
+
// retry vs escalate (plan §B4).
|
|
129
|
+
this.failurePolicy.settleStrandedRunAndRecover({
|
|
130
|
+
run,
|
|
131
|
+
issue: effectiveIssue,
|
|
132
|
+
reason: "zombie",
|
|
133
|
+
failureReason: "Zombie: never started (no thread after restart)",
|
|
125
134
|
});
|
|
126
|
-
this.recoverOrEscalate(effectiveIssue, run.runType, "zombie");
|
|
127
135
|
const recoveredIssue = this.db.issues.getIssue(run.projectId, run.linearIssueId) ?? effectiveIssue;
|
|
128
136
|
void this.linearSync.emitActivity(recoveredIssue, buildRunFailureActivity(run.runType, "The Codex turn never started before PatchRelay restarted."));
|
|
129
137
|
void this.linearSync.syncSession(recoveredIssue, { activeRunType: run.runType });
|
|
@@ -144,19 +152,14 @@ export class RunReconciler {
|
|
|
144
152
|
return;
|
|
145
153
|
}
|
|
146
154
|
this.logger.warn({ issueKey: effectiveIssue.issueKey, runId: run.id, runType: run.runType, threadId: run.threadId }, "Stale thread during reconciliation");
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
});
|
|
155
|
-
if (commit.outcome !== "applied")
|
|
156
|
-
return;
|
|
157
|
-
this.db.runs.finishRun(run.id, { status: "failed", failureReason: "Stale thread after restart" });
|
|
155
|
+
// Detection only — the failure policy settles the run and decides
|
|
156
|
+
// retry vs escalate (plan §B4).
|
|
157
|
+
this.failurePolicy.settleStrandedRunAndRecover({
|
|
158
|
+
run,
|
|
159
|
+
issue: effectiveIssue,
|
|
160
|
+
reason: "stale_thread",
|
|
161
|
+
failureReason: "Stale thread after restart",
|
|
158
162
|
});
|
|
159
|
-
this.recoverOrEscalate(effectiveIssue, run.runType, "stale_thread");
|
|
160
163
|
const recoveredIssue = this.db.issues.getIssue(run.projectId, run.linearIssueId) ?? effectiveIssue;
|
|
161
164
|
void this.linearSync.emitActivity(recoveredIssue, buildRunFailureActivity(run.runType, "PatchRelay lost the active Codex thread after restart and needs to recover."));
|
|
162
165
|
void this.linearSync.syncSession(recoveredIssue, { activeRunType: run.runType });
|
|
@@ -207,7 +210,7 @@ export class RunReconciler {
|
|
|
207
210
|
}
|
|
208
211
|
const latestTurn = getThreadTurns(thread).at(-1);
|
|
209
212
|
if (latestTurn?.status === "interrupted") {
|
|
210
|
-
await this.
|
|
213
|
+
await this.failurePolicy.handleInterruptedRun(run, effectiveIssue);
|
|
211
214
|
return;
|
|
212
215
|
}
|
|
213
216
|
if (latestTurn?.status === "completed") {
|
|
@@ -218,7 +221,6 @@ export class RunReconciler {
|
|
|
218
221
|
thread,
|
|
219
222
|
threadId: run.threadId,
|
|
220
223
|
...(latestTurn.id ? { completedTurnId: latestTurn.id } : {}),
|
|
221
|
-
resolveRecoverableRunState: resolveRecoverablePostRunState,
|
|
222
224
|
});
|
|
223
225
|
return;
|
|
224
226
|
}
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
const WRITER = "run-settlement";
const TERMINAL_RUN_STATUSES = new Set(["completed", "failed", "released", "superseded"]);
/**
 * @param {string} status - A run status value.
 * @returns {boolean} True when the status is one of the terminal statuses
 *   (completed, failed, released, superseded).
 */
export function isTerminalRunStatus(status) {
    return TERMINAL_RUN_STATUSES.has(status);
}
// Phase B1 (core simplification plan): the fast, transactional, idempotent
// half of run finalization. One transaction marks the run terminal and
// clears the issue's active slot — the two writes whose separation caused
// the dangling-active-run freeze (PR #566): a restart landing between them
// left `activeRunId` pointing at a terminal run forever, hiding the issue
// from every idle/recovery pass. Safe to call from both the notification
// finalizer and reconciliation at any time:
//   - already-terminal run → finishRun skipped;
//   - slot already cleared or re-pointed at another run → issue untouched;
//   - non-terminal run with no `finish` outcome → full no-op.
/**
 * @param {object} params
 * @param {object} params.db - Store exposing `transaction`, `runs`,
 *   `issues` and `issueSessions` as used below.
 * @param {object} params.run - The run to settle; `id`, `projectId` and
 *   `linearIssueId` are read from it.
 * @param {object} [params.finish] - Outcome handed to `db.runs.finishRun`
 *   when the run is not terminal yet. Omit it to settle only the slot of an
 *   already-terminal run.
 * @param {object} [params.lease] - Lease forwarded to `commitIssueState`
 *   when present.
 * @param {(record: object) => object | undefined} [params.buildIssueUpdate]
 *   - Extra issue fields to commit together with the slot clear, computed
 *   from the issue row read inside the transaction.
 * @returns {{ runFinished: boolean, slotCleared: boolean, issue: object | undefined }}
 */
export function settleRun(params) {
    const { db, run } = params;
    return db.transaction(() => {
        const freshRun = db.runs.getRunById(run.id);
        if (!freshRun) {
            return { runFinished: false, slotCleared: false, issue: db.issues.getIssue(run.projectId, run.linearIssueId) };
        }
        let runFinished = false;
        if (!isTerminalRunStatus(freshRun.status)) {
            if (!params.finish) {
                // Non-terminal run and nothing to finish it with: leave everything alone.
                return { runFinished: false, slotCleared: false, issue: db.issues.getIssue(run.projectId, run.linearIssueId) };
            }
            db.runs.finishRun(run.id, params.finish);
            runFinished = true;
        }
        const current = db.issues.getIssue(run.projectId, run.linearIssueId);
        if (!current || current.activeRunId !== run.id) {
            // Slot is missing, already cleared, or owned by another run.
            return { runFinished, slotCleared: false, issue: current };
        }
        const buildUpdate = (record) => ({
            ...params.buildIssueUpdate?.(record),
            // After the caller-provided fields so nothing can re-target the
            // commit at a different issue or override the clear.
            projectId: run.projectId,
            linearIssueId: run.linearIssueId,
            activeRunId: null,
        });
        const commit = db.issueSessions.commitIssueState({
            writer: WRITER,
            ...(params.lease ? { lease: params.lease } : {}),
            expectedVersion: current.version,
            update: buildUpdate(current),
            // The read above happened inside this same transaction, so a version
            // conflict cannot normally occur; the predicate keeps the invariant
            // explicit: never clear a slot that was re-pointed at another run.
            onConflict: (fresh) => (fresh.activeRunId === run.id ? buildUpdate(fresh) : undefined),
        });
        if (commit.outcome !== "applied") {
            return { runFinished, slotCleared: false, issue: commit.outcome === "conflict_skipped" ? commit.issue : current };
        }
        return { runFinished, slotCleared: true, issue: commit.issue };
    });
}
|
package/dist/service.js
CHANGED
|
@@ -14,6 +14,7 @@ import { ServiceStartupRecovery } from "./service-startup-recovery.js";
|
|
|
14
14
|
import { WakeDispatcher } from "./wake-dispatcher.js";
|
|
15
15
|
import { WebhookHandler } from "./webhook-handler.js";
|
|
16
16
|
import { acceptIncomingWebhook } from "./service-webhooks.js";
|
|
17
|
+
import { ABANDONED_PENDING_WEBHOOK_AGE_MS } from "./db/webhook-event-store.js";
|
|
17
18
|
import { runWebhookEventRetention } from "./event-retention.js";
|
|
18
19
|
import { parseStringArray, TrackedIssueListQuery } from "./tracked-issue-list-query.js";
|
|
19
20
|
import { AgentInputService } from "./agent-input-service.js";
|
|
@@ -103,6 +104,7 @@ export class PatchRelayService {
|
|
|
103
104
|
}
|
|
104
105
|
async start() {
|
|
105
106
|
this.db.issueSessions.releaseExpiredIssueSessionLeases();
|
|
107
|
+
this.sweepAbandonedWebhookEvents();
|
|
106
108
|
const repairedInstallations = this.db.linearInstallations.repairProjectInstallations(this.config.projects.map((project) => project.id));
|
|
107
109
|
for (const repair of repairedInstallations) {
|
|
108
110
|
this.logger.info({ projectId: repair.projectId, installationId: repair.installationId, reason: repair.reason }, "Repaired Linear project installation link");
|
|
@@ -287,6 +289,26 @@ export class PatchRelayService {
|
|
|
287
289
|
getReadiness() {
|
|
288
290
|
return this.runtime.getReadiness();
|
|
289
291
|
}
|
|
292
|
+
// Core simplification plan §C2: webhook_events is a dedupe + forensics log,
|
|
293
|
+
// not a replay queue. A row stuck at 'pending' means a crash or restart
|
|
294
|
+
// interrupted processing; the event will never be replayed (recovery is
|
|
295
|
+
// re-derivation from GitHub/Linear via reconciliation), so mark it
|
|
296
|
+
// 'abandoned' — making it archiveable — and surface the count to the
|
|
297
|
+
// operator, because every abandoned row is a crash worth seeing.
|
|
298
|
+
sweepAbandonedWebhookEvents() {
|
|
299
|
+
const cutoffIso = new Date(Date.now() - ABANDONED_PENDING_WEBHOOK_AGE_MS).toISOString();
|
|
300
|
+
const abandoned = this.db.webhookEvents.markAbandonedPendingEventsBefore(cutoffIso);
|
|
301
|
+
if (abandoned === 0)
|
|
302
|
+
return;
|
|
303
|
+
this.logger.warn({ abandoned, cutoffIso }, "Marked stale pending webhook events as abandoned at startup");
|
|
304
|
+
this.feed.publish({
|
|
305
|
+
level: "warn",
|
|
306
|
+
kind: "webhook",
|
|
307
|
+
status: "abandoned_events",
|
|
308
|
+
summary: `Startup: marked ${abandoned} stale pending webhook event(s) as abandoned`,
|
|
309
|
+
detail: "Processing was interrupted (crash/restart). State recovers via reconciliation; the rows stay archiveable for forensics.",
|
|
310
|
+
});
|
|
311
|
+
}
|
|
290
312
|
scheduleEventRetention(delayMs = 24 * 60 * 60 * 1000) {
|
|
291
313
|
if (this.eventRetentionTimer !== undefined) {
|
|
292
314
|
clearTimeout(this.eventRetentionTimer);
|