patchrelay 0.76.0 → 0.78.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/build-info.json +3 -3
- package/dist/db/webhook-event-store.js +22 -0
- package/dist/failure-provenance.js +40 -0
- package/dist/github-webhook-late-publication-guard.js +49 -15
- package/dist/github-webhook-policy.js +36 -25
- package/dist/github-webhook-sequence-backstop.js +45 -2
- package/dist/github-webhook-state-projector.js +5 -12
- package/dist/idle-reconciliation.js +63 -38
- package/dist/pr-facts-derivation.js +81 -0
- package/dist/run-budgets.js +40 -6
- package/dist/run-completion-policy.js +50 -9
- package/dist/run-failure-policy.js +463 -0
- package/dist/run-finalizer.js +23 -22
- package/dist/run-launcher.js +21 -0
- package/dist/run-notification-handler.js +0 -2
- package/dist/run-orchestrator.js +26 -68
- package/dist/run-reconciler.js +34 -32
- package/dist/run-settlement.js +57 -0
- package/dist/service.js +22 -0
- package/package.json +1 -1
- package/dist/interrupted-run-recovery.js +0 -240
- package/dist/run-recovery-service.js +0 -239
- package/dist/zombie-recovery.js +0 -13
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
import { resolveFactoryStateFromGitHub } from "./factory-state.js";
|
|
2
|
+
import { isFailingCheckStatus, isReviewDecisionApproved, isReviewDecisionReviewRequired, } from "./idle-reconciliation-helpers.js";
|
|
3
|
+
/**
 * Derive the factory state an issue should move to from observed PR facts.
 *
 * This is the pure derivation shared by the webhook projector and the idle
 * reconciler. An observation that carries a `triggerEvent` is handled as a
 * delta (webhook event); one without is handled as a polled level snapshot.
 *
 * @param {object} observed - Observed PR facts; `triggerEvent` selects the path.
 * @param {object} current - The issue's current local state.
 * @returns {string | undefined} The target factory state, or `undefined` when
 *   the observation is a no-op for the current state.
 */
export function deriveFactoryStateFromPrFacts(observed, current) {
    const { triggerEvent } = observed;
    return triggerEvent === undefined
        ? deriveFromPolledLevel(observed, current)
        : deriveFromTriggerEvent(triggerEvent, observed, current);
}
|
|
14
|
+
// ── Delta observations (webhook trigger events) ─────────────────────
|
|
15
|
+
// The transition-rule table in factory-state.ts is the spec; this wrapper
|
|
16
|
+
// adds the awaiting_input/delegated lifting that the webhook path applies
|
|
17
|
+
// before consulting the table.
|
|
18
|
+
/**
 * Derive the next factory state from a webhook trigger event.
 *
 * An issue still in `awaiting_input` or `delegated` is lifted to `pr_open`
 * when the observation shows a PR (open state or a PR number) before the
 * transition-rule table is consulted. If the table has no rule but the lift
 * changed the state, the lifted state is returned.
 *
 * @param {string} triggerEvent - The webhook trigger event name.
 * @param {object} observed - Observed PR facts for this event.
 * @param {object} current - The issue's current local state.
 * @returns {string | undefined} The target state, or `undefined` for a no-op.
 */
function deriveFromTriggerEvent(triggerEvent, observed, current) {
    // The terminal-PR handler owns the closed-PR decision on the webhook path.
    if (triggerEvent === "pr_closed") {
        return undefined;
    }
    const { factoryState } = current;
    const isPrePrState = factoryState === "awaiting_input" || factoryState === "delegated";
    const prIsVisible = observed.prState === "open" || observed.prNumber !== undefined;
    const liftedState = isPrePrState && prIsVisible ? "pr_open" : factoryState;

    // Optional fields are only attached when truthy, so the rule table never
    // sees an explicit `undefined` for them.
    const ruleContext = {
        prReviewState: current.prReviewState,
        activeRunId: current.activeRunId,
        failureSource: observed.failureSource,
    };
    if (current.activeRunType) {
        ruleContext.activeRunType = current.activeRunType;
    }
    if (current.activeRunSourceHeadSha) {
        ruleContext.activeRunSourceHeadSha = current.activeRunSourceHeadSha;
    }
    if (observed.approvalHeadSha) {
        ruleContext.approvalHeadSha = observed.approvalHeadSha;
    }

    const fromRuleTable = resolveFactoryStateFromGitHub(triggerEvent, liftedState, ruleContext);
    if (fromRuleTable !== undefined) {
        return fromRuleTable;
    }
    // No table rule fired: the lift itself is still a transition worth reporting.
    return liftedState !== factoryState ? liftedState : undefined;
}
|
|
43
|
+
// ── Level observations (polled snapshot) ────────────────────────────
|
|
44
|
+
/**
 * Derive the next factory state from a polled PR snapshot.
 *
 * @param {object} observed - Polled PR facts (`prState`, `closedPrDisposition`,
 *   `reviewDecision`, `gateCheckStatus`, `headAdvanced`).
 * @param {object} current - The issue's current local state.
 * @returns {string | undefined} The target state, or `undefined` for a no-op.
 */
function deriveFromPolledLevel(observed, current) {
    const { prState } = observed;
    if (prState === "closed") {
        switch (observed.closedPrDisposition) {
            case "done":
                return "done";
            case "terminal":
                return undefined;
            default:
                return "delegated";
        }
    }
    if (prState === "merged") {
        // Mirrors the pr_merged transition rule: with an active run the
        // finalizer owns the completion; deploy tracking may map "done" to
        // "deploying" at the call site.
        return current.activeRunId === undefined ? "done" : undefined;
    }

    // Evaluated lazily so the helpers run in the same order as each rule.
    const isApproved = () => isReviewDecisionApproved(observed.reviewDecision);
    const gateIsNotFailing = () => !isFailingCheckStatus(observed.gateCheckStatus);

    const isStuckTerminal = current.factoryState === "escalated" || current.factoryState === "failed";
    if (!isStuckTerminal) {
        return isApproved() ? "awaiting_queue" : undefined;
    }

    // Terminal recovery: newer GitHub truth reopens a stuck terminal issue.
    // The generic approved rule is deliberately not applied here — an
    // escalated issue with a red gate stays escalated (the failure provenance
    // keeps the repair routable; auto-reopening would swallow it).
    if (isApproved() && gateIsNotFailing()) {
        return "awaiting_queue";
    }
    const shouldReopen = observed.gateCheckStatus === "pending"
        || (observed.headAdvanced && gateIsNotFailing())
        || (isReviewDecisionReviewRequired(observed.reviewDecision) && gateIsNotFailing());
    return shouldReopen ? "pr_open" : undefined;
}
|
package/dist/run-budgets.js
CHANGED
|
@@ -1,12 +1,46 @@
|
|
|
1
|
-
export const
|
|
2
|
-
|
|
3
|
-
|
|
1
|
+
/** Attempt budgets applied when a project supplies no override. */
export const DEFAULT_RUN_BUDGETS = {
    ciRepair: 10,
    queueRepair: 10,
    reviewFix: 10,
    zombieRecovery: 5,
};

/**
 * Resolve the effective run budgets for a project.
 *
 * @param {object | null | undefined} project - Project config; only
 *   `project.repairBudgets` is read.
 * @returns {{ciRepair: number, queueRepair: number, reviewFix: number, zombieRecovery: number}}
 *   Per-project overrides where present (non-nullish), defaults otherwise.
 */
export function resolveRunBudgets(project) {
    const overrides = project?.repairBudgets;
    const defaults = DEFAULT_RUN_BUDGETS;
    return {
        ciRepair: overrides?.ciRepair ?? defaults.ciRepair,
        queueRepair: overrides?.queueRepair ?? defaults.queueRepair,
        reviewFix: overrides?.reviewFix ?? defaults.reviewFix,
        // No per-project override exists for zombie recovery yet; add one to
        // ProjectConfig.repairBudgets if a project ever needs it.
        zombieRecovery: defaults.zombieRecovery,
    };
}
|
|
4
17
|
/**
 * @param {object | null | undefined} project - Project config, forwarded to `resolveRunBudgets`.
 * @returns {number} The CI-repair budget resolved for the project.
 */
export function getCiRepairBudget(project) {
    const { ciRepair } = resolveRunBudgets(project);
    return ciRepair;
}
|
|
7
20
|
/**
 * @param {object | null | undefined} project - Project config, forwarded to `resolveRunBudgets`.
 * @returns {number} The queue-repair budget resolved for the project.
 */
export function getQueueRepairBudget(project) {
    const { queueRepair } = resolveRunBudgets(project);
    return queueRepair;
}
|
|
10
23
|
/**
 * @param {object | null | undefined} project - Project config, forwarded to `resolveRunBudgets`.
 * @returns {number} The review-fix budget resolved for the project.
 */
export function getReviewFixBudget(project) {
    const { reviewFix } = resolveRunBudgets(project);
    return reviewFix;
}
|
|
26
|
+
/**
 * @param {object | null | undefined} project - Project config, forwarded to `resolveRunBudgets`.
 * @returns {number} The zombie-recovery budget resolved for the project.
 */
export function getZombieRecoveryBudget(project) {
    const { zombieRecovery } = resolveRunBudgets(project);
    return zombieRecovery;
}
|
|
29
|
+
// ─── Zombie-recovery backoff schedule (formerly zombie-recovery.ts) ──
|
|
30
|
+
//
|
|
31
|
+
// Exponential backoff between retries of a run that died without doing
|
|
32
|
+
// its work. Owned here with the budgets so the whole retry discipline
|
|
33
|
+
// (how many attempts, how far apart) reads in one place.
|
|
34
|
+
/** Delay before the first zombie-recovery retry; doubled for every prior attempt. */
const ZOMBIE_RECOVERY_BASE_DELAY_MS = 15_000;

/**
 * Backoff delay that must elapse after the given number of recovery attempts.
 *
 * @param {number} recoveryAttempts - Recovery attempts already made.
 * @returns {number} `15_000 * 2 ** recoveryAttempts` milliseconds.
 */
export function getZombieRecoveryDelayMs(recoveryAttempts) {
    return ZOMBIE_RECOVERY_BASE_DELAY_MS * 2 ** recoveryAttempts;
}

/**
 * Milliseconds still to wait before the next zombie-recovery retry is allowed.
 *
 * @param {string | null | undefined} lastRecoveryAt - Timestamp string of the
 *   last recovery, parsed with `Date.parse`.
 * @param {number} recoveryAttempts - Recovery attempts already made.
 * @param {number} [now=Date.now()] - Current time in epoch milliseconds.
 * @returns {number} Remaining delay, never negative and never `NaN`. Returns 0
 *   when there is no previous recovery, when the timestamp cannot be parsed,
 *   or when the inputs do not yield a number.
 */
export function getRemainingZombieRecoveryDelayMs(lastRecoveryAt, recoveryAttempts, now = Date.now()) {
    if (!lastRecoveryAt) {
        return 0;
    }
    const recoveredAtMs = Date.parse(lastRecoveryAt);
    if (!Number.isFinite(recoveredAtMs)) {
        return 0;
    }
    const remainingMs = recoveredAtMs + getZombieRecoveryDelayMs(recoveryAttempts) - now;
    // Math.max(0, NaN) is NaN, so a missing or non-numeric attempt count (or
    // `now`) would otherwise leak NaN to callers. Treat it as "no wait",
    // which is how a `> 0` check on NaN already behaves.
    if (Number.isNaN(remainingMs)) {
        return 0;
    }
    return Math.max(0, remainingMs);
}
|
|
@@ -1,18 +1,59 @@
|
|
|
1
1
|
import { ACTIVE_RUN_STATES } from "./factory-state.js";
|
|
2
2
|
import { ImplementationOutcomePolicy } from "./implementation-outcome-policy.js";
|
|
3
3
|
import { ReactiveRunPolicy } from "./reactive-run-policy.js";
|
|
4
|
-
|
|
5
|
-
|
|
4
|
+
import { deriveIssueSessionReactiveIntent } from "./issue-session.js";
|
|
5
|
+
// Plan §B3: the one post-run factory-state resolver. Unifies the former
|
|
6
|
+
// `resolveCompletedRunState` (run-completion-policy) and
|
|
7
|
+
// `resolveRecoverablePostRunState` (interrupted-run-recovery).
|
|
8
|
+
//
|
|
9
|
+
// Shared rule (both old functions agreed):
|
|
10
|
+
// - no PR on the issue → undefined (nothing to resolve from PR truth);
|
|
11
|
+
// - approved open/closed PR → awaiting_queue; otherwise pr_open;
|
|
12
|
+
// - merged PR (while the issue is in an active-run state) → done.
|
|
13
|
+
//
|
|
14
|
+
// The two old functions genuinely disagreed in two places, and the
|
|
15
|
+
// disagreement is semantic, so it survives as the `outcome` option rather
|
|
16
|
+
// than being averaged away:
|
|
17
|
+
// - outcome "completed" (the run did its work, default): gate every write
|
|
18
|
+
// on ACTIVE_RUN_STATES so a state advanced concurrently by webhooks
|
|
19
|
+
// (e.g. deploying, awaiting_queue) is never clobbered, and never
|
|
20
|
+
// re-derive a reactive repair state — the stale GitHub verdict
|
|
21
|
+
// (changes_requested / red CI) refers to the head the run just
|
|
22
|
+
// replaced, and routing it again would loop the fix forever.
|
|
23
|
+
// - outcome "recovered" (the run died without doing its work): GitHub
|
|
24
|
+
// truth is authoritative regardless of the local factory state —
|
|
25
|
+
// merged → done unconditionally, and an open PR re-derives the
|
|
26
|
+
// reactive intent (repairing_ci / repairing_queue / changes_requested)
|
|
27
|
+
// so the original problem is routed again.
|
|
28
|
+
/**
 * Resolve the factory state an issue should land in after a run ends.
 *
 * @param {object} issue - Issue row; reads `prNumber`, `prState`,
 *   `prReviewState`, `prCheckStatus`, `lastGitHubFailureSource`, `factoryState`.
 * @param {object} _run - Unused; kept for the call signature.
 * @param {{outcome?: string}} [options] - `outcome: "recovered"` makes PR
 *   truth authoritative for merged and open PRs; any other value applies the
 *   rule gated on `ACTIVE_RUN_STATES`.
 * @returns {string | undefined} The target state, or `undefined` when the
 *   issue has no PR or (on the gated path) is not in an active-run state.
 */
export function resolvePostRunFactoryState(issue, _run, options) {
    if (!issue.prNumber) {
        return undefined;
    }
    const { prState, prReviewState } = issue;
    const stateFromReview = () => (prReviewState === "approved" ? "awaiting_queue" : "pr_open");

    if (options?.outcome === "recovered") {
        if (prState === "merged") {
            return "done";
        }
        if (prState === "open") {
            const intent = deriveIssueSessionReactiveIntent({
                prNumber: issue.prNumber,
                prState,
                prReviewState,
                prCheckStatus: issue.prCheckStatus,
                latestFailureSource: issue.lastGitHubFailureSource,
            });
            return intent ? intent.compatibilityFactoryState : stateFromReview();
        }
        // Closed (or unknown) PR: fall through to the factory-state-gated rule.
    }

    if (!ACTIVE_RUN_STATES.has(issue.factoryState)) {
        return undefined;
    }
    return prState === "merged" ? "done" : stateFromReview();
}
|
|
17
58
|
export class RunCompletionPolicy {
|
|
18
59
|
reactive;
|
|
@@ -0,0 +1,463 @@
|
|
|
1
|
+
import { buildRunFailureActivity } from "./linear-session-reporting.js";
|
|
2
|
+
import { getRemainingZombieRecoveryDelayMs, getZombieRecoveryBudget } from "./run-budgets.js";
|
|
3
|
+
import { resolvePostRunFactoryState } from "./run-completion-policy.js";
|
|
4
|
+
import { isRequestedChangesRunType } from "./reactive-pr-state.js";
|
|
5
|
+
import { settleRun } from "./run-settlement.js";
|
|
6
|
+
const WRITER = "run-failure-policy";
|
|
7
|
+
// Roll back the attempt counter consumed by the interrupted run and clear the
|
|
8
|
+
// attempted-failure provenance for repair runs, as a single issue update so
|
|
9
|
+
// the whole repair commits (and conflict-recomputes) atomically.
|
|
10
|
+
/**
 * Build the issue update that undoes the bookkeeping of an interrupted run.
 *
 * Decrements the positive attempt counter matching the run type and, for
 * `ci_repair` / `queue_repair` runs, nulls the attempted-failure provenance
 * fields. Both parts are returned as one update object.
 *
 * @param {string} runType - Run type of the interrupted run.
 * @param {object} issue - Issue row supplying identity and current counters.
 * @returns {object | undefined} The update, or `undefined` when there is
 *   nothing to roll back.
 */
function buildInterruptedAttemptRepairUpdate(runType, issue) {
    let counterRollback;
    if (runType === "ci_repair" && issue.ciRepairAttempts > 0) {
        counterRollback = { ciRepairAttempts: issue.ciRepairAttempts - 1 };
    }
    else if (runType === "queue_repair" && issue.queueRepairAttempts > 0) {
        counterRollback = { queueRepairAttempts: issue.queueRepairAttempts - 1 };
    }
    else if (isRequestedChangesRunType(runType) && issue.reviewFixAttempts > 0) {
        counterRollback = { reviewFixAttempts: issue.reviewFixAttempts - 1 };
    }

    const isGateRepair = runType === "ci_repair" || runType === "queue_repair";
    if (!counterRollback && !isGateRepair) {
        return undefined;
    }

    const update = {
        projectId: issue.projectId,
        linearIssueId: issue.linearIssueId,
        ...counterRollback,
    };
    if (isGateRepair) {
        update.lastAttemptedFailureHeadSha = null;
        update.lastAttemptedFailureSignature = null;
        update.lastAttemptedFailureAt = null;
    }
    return update;
}
|
|
34
|
+
/**
 * Choose the run type used to retry an interrupted requested-changes run.
 *
 * @param {string} runType - Run type of the interrupted run.
 * @param {object | null | undefined} context - Wake context; reads
 *   `reviewFixMode` and `branchUpkeepRequired`.
 * @returns {"branch_upkeep" | "review_fix"} `"branch_upkeep"` when the run
 *   was a branch upkeep or the context asks for one, otherwise `"review_fix"`.
 */
function resolveRetryRunType(runType, context) {
    const needsBranchUpkeep = runType === "branch_upkeep"
        || context?.reviewFixMode === "branch_upkeep"
        || context?.branchUpkeepRequired === true;
    return needsBranchUpkeep ? "branch_upkeep" : "review_fix";
}
|
|
42
|
+
// Plan §B4: the one run-failure policy. Merges the former
|
|
43
|
+
// RunRecoveryService (zombie retry/escalate + backoff) and
|
|
44
|
+
// InterruptedRunRecovery (interrupted-turn handling, counter decrements,
|
|
45
|
+
// re-enqueue) into a single module that answers: given a stranded or
|
|
46
|
+
// failed run + its issue — retry (with which backoff/budget), re-enqueue
|
|
47
|
+
// (which runType/context), or escalate?
|
|
48
|
+
//
|
|
49
|
+
// Ownership: run-reconciler and service-startup-recovery only DETECT
|
|
50
|
+
// stranded states and hand them here; this policy DECIDES; execution of
|
|
51
|
+
// the run/slot writes goes through settleRun, and dispatch of follow-up
|
|
52
|
+
// work goes through the WakeDispatcher.
|
|
53
|
+
/**
 * Decides what happens to a run that was stranded, failed or interrupted:
 * retry, re-enqueue, resolve from PR state, or escalate to a human.
 *
 * Every collaborator is injected through the constructor and stored
 * unchanged; `feed` is optional (all uses are guarded with `?.`).
 */
export class RunFailurePolicy {
    db;
    logger;
    linearSync;
    withHeldLease;
    releaseLease;
    appendWakeEventWithLease;
    wakeDispatcher;
    restoreIdleWorktree;
    completionPolicy;
    resolveProject;
    feed;

    constructor(db, logger, linearSync, withHeldLease, releaseLease, appendWakeEventWithLease, wakeDispatcher, restoreIdleWorktree, completionPolicy, resolveProject, feed) {
        Object.assign(this, {
            db,
            logger,
            linearSync,
            withHeldLease,
            releaseLease,
            appendWakeEventWithLease,
            wakeDispatcher,
            restoreIdleWorktree,
            completionPolicy,
            resolveProject,
            feed,
        });
    }

    // ─── Stranded runs (zombie / stale thread) ───────────────────────

    /**
     * Detector entry point for a run that can never make progress: settle the
     * run as failed under the held lease, then decide retry vs escalate.
     *
     * @param {{run: object, issue: object, failureReason: string, reason: string}} params
     */
    settleStrandedRunAndRecover(params) {
        const { run, issue } = params;
        this.withHeldLease(run.projectId, run.linearIssueId, (lease) => settleRun({
            db: this.db,
            run,
            finish: { status: "failed", failureReason: params.failureReason },
            lease,
        }));
        this.recoverOrEscalate({ issue, runType: run.runType, reason: params.reason });
    }

    /**
     * Decide what happens after a run died without doing its work. Checked in
     * order: requested-changes run → escalate; PR already merged → done;
     * zombie budget exhausted → escalate; backoff not elapsed → append the
     * wake and defer; otherwise bump the attempt counter, append a recovery
     * wake, and dispatch.
     *
     * @param {{issue: object, runType: string, reason: string}} params
     */
    recoverOrEscalate(params) {
        const { issue, runType, reason } = params;
        const fresh = this.db.issues.getIssue(issue.projectId, issue.linearIssueId);
        if (!fresh) {
            return;
        }
        const { projectId, linearIssueId, issueKey } = fresh;
        const identity = { projectId, linearIssueId };

        if (isRequestedChangesRunType(runType)) {
            const committed = this.withHeldLease(projectId, linearIssueId, (lease) => {
                this.db.issueSessions.clearPendingIssueSessionEventsWithLease(lease);
                this.db.issueSessions.commitIssueState({
                    writer: WRITER,
                    lease,
                    update: {
                        ...identity,
                        pendingRunType: null,
                        pendingRunContextJson: null,
                        factoryState: "escalated",
                    },
                });
                return true;
            });
            if (committed) {
                this.logger.warn({ issueKey, reason }, "Requested-changes run failed before a new head was published - escalating");
                this.feed?.publish({
                    level: "error",
                    kind: "workflow",
                    issueKey,
                    projectId,
                    stage: runType,
                    status: "escalated",
                    summary: `Requested-changes run failed before publishing a new head (${reason})`,
                });
            }
            else {
                this.logger.warn({ issueKey, reason }, "Skipping review-fix recovery escalation after losing issue-session lease");
            }
            this.releaseLease(projectId, linearIssueId);
            return;
        }

        if (fresh.prState === "merged") {
            const committed = this.withHeldLease(projectId, linearIssueId, (lease) => {
                this.db.issueSessions.commitIssueState({
                    writer: WRITER,
                    lease,
                    update: {
                        ...identity,
                        factoryState: "done",
                        zombieRecoveryAttempts: 0,
                        lastZombieRecoveryAt: null,
                    },
                });
                return true;
            });
            if (committed) {
                this.logger.info({ issueKey, reason }, "Recovery: PR already merged - transitioning to done");
            }
            else {
                this.logger.warn({ issueKey, reason }, "Skipping merged recovery completion after losing issue-session lease");
            }
            this.releaseLease(projectId, linearIssueId);
            return;
        }

        const zombieRecoveryBudget = getZombieRecoveryBudget(this.resolveProject(projectId));
        const attempts = fresh.zombieRecoveryAttempts + 1;
        if (attempts > zombieRecoveryBudget) {
            const committed = this.withHeldLease(projectId, linearIssueId, (lease) => {
                this.db.issueSessions.commitIssueState({
                    writer: WRITER,
                    lease,
                    update: { ...identity, factoryState: "escalated" },
                });
                return true;
            });
            if (committed) {
                this.logger.warn({ issueKey, attempts, reason }, "Recovery: budget exhausted - escalating");
                this.feed?.publish({
                    level: "error",
                    kind: "workflow",
                    issueKey,
                    projectId,
                    stage: "escalated",
                    status: "budget_exhausted",
                    summary: `${reason} recovery failed after ${zombieRecoveryBudget} attempts`,
                });
            }
            else {
                this.logger.warn({ issueKey, attempts, reason }, "Skipping recovery escalation after losing issue-session lease");
            }
            this.releaseLease(projectId, linearIssueId);
            return;
        }

        const wakeDedupeKey = `recovery:${attempts}`;
        if (fresh.lastZombieRecoveryAt) {
            const remainingDelayMs = getRemainingZombieRecoveryDelayMs(fresh.lastZombieRecoveryAt, fresh.zombieRecoveryAttempts);
            if (remainingDelayMs > 0) {
                // Keep the wake on record but do not consume a budget unit yet.
                this.withHeldLease(projectId, linearIssueId, (lease) => {
                    this.appendWakeEventWithLease(lease, fresh, runType, undefined, wakeDedupeKey);
                });
                this.logger.debug({ issueKey, attempts: fresh.zombieRecoveryAttempts, remainingDelayMs }, "Recovery: backoff not elapsed, deferring retry");
                return;
            }
        }

        const requeued = this.withHeldLease(projectId, linearIssueId, (lease) => {
            // The counter is read-modify-write against the row read above; on
            // conflict it is recomputed from the row handed to `onConflict`.
            const toRequeueUpdate = (record) => ({
                ...identity,
                pendingRunType: null,
                pendingRunContextJson: null,
                zombieRecoveryAttempts: record.zombieRecoveryAttempts + 1,
                lastZombieRecoveryAt: new Date().toISOString(),
            });
            this.db.issueSessions.commitIssueState({
                writer: WRITER,
                lease,
                expectedVersion: fresh.version,
                update: toRequeueUpdate(fresh),
                onConflict: (current) => toRequeueUpdate(current),
            });
            return this.appendWakeEventWithLease(lease, fresh, runType, undefined, wakeDedupeKey);
        });
        if (!requeued) {
            this.logger.warn({ issueKey, attempts, reason }, "Skipping recovery re-enqueue after losing issue-session lease");
            this.releaseLease(projectId, linearIssueId);
            return;
        }
        this.wakeDispatcher.dispatchIfWakePending(projectId, linearIssueId);
        this.logger.info({ issueKey, attempts, reason }, "Recovery: re-enqueued with backoff");
    }

    // ─── Terminal decisions ──────────────────────────────────────────

    /**
     * Move the issue to `escalated`, clear its pending run and pending
     * session events, then notify the feed and Linear.
     *
     * @param {{issue: object, runType: string, reason: string}} params
     */
    escalate(params) {
        const { issue, runType, reason } = params;
        const { projectId, linearIssueId, issueKey } = issue;
        this.logger.warn({ issueKey, runType, reason }, "Escalating to human");

        const escalated = this.withHeldLease(projectId, linearIssueId, (lease) => {
            // The issue write and the run release happen inside the held-lease
            // callback. When a run still holds the slot, settleRun performs the
            // paired run release and issue update; escalation is abandoned if
            // it reports the slot was not cleared.
            const escalateFields = {
                pendingRunType: null,
                pendingRunContextJson: null,
                factoryState: "escalated",
            };
            if (issue.activeRunId === undefined) {
                const commit = this.db.issueSessions.commitIssueState({
                    writer: WRITER,
                    lease,
                    update: { projectId, linearIssueId, ...escalateFields },
                });
                if (commit.outcome !== "applied") {
                    return false;
                }
            }
            else {
                const settled = settleRun({
                    db: this.db,
                    run: { id: issue.activeRunId, projectId, linearIssueId },
                    finish: { status: "released" },
                    lease,
                    buildIssueUpdate: () => escalateFields,
                });
                if (!settled.slotCleared) {
                    return false;
                }
            }
            this.db.issueSessions.clearPendingIssueSessionEventsWithLease(lease);
            return true;
        });

        if (!escalated) {
            this.logger.warn({ issueKey, runType }, "Skipping escalation write after losing issue-session lease");
            this.releaseLease(projectId, linearIssueId);
            return;
        }

        this.feed?.publish({
            level: "error",
            kind: "workflow",
            issueKey,
            projectId,
            stage: runType,
            status: "escalated",
            summary: `Escalated: ${reason}`,
        });
        const escalatedIssue = this.db.issues.getIssue(projectId, linearIssueId) ?? issue;
        // Linear notifications are fire-and-forget by design.
        void this.linearSync.emitActivity(escalatedIssue, {
            type: "error",
            body: `PatchRelay needs human help to continue.\n\n${reason}`,
        });
        void this.linearSync.syncSession(escalatedIssue);
        this.releaseLease(projectId, linearIssueId);
    }

    /**
     * Settle the run as failed, move the issue to `nextState`, and always
     * release the lease afterwards. Pending session events are cleared when
     * `nextState` is failed, escalated, awaiting_input or done.
     *
     * @param {{run: object, message: string, nextState: string}} params
     */
    failRunAndClear(params) {
        const { run, message, nextState } = params;
        const clearsPendingEvents = ["failed", "escalated", "awaiting_input", "done"].includes(nextState);
        const updated = this.withHeldLease(run.projectId, run.linearIssueId, (lease) => {
            settleRun({
                db: this.db,
                run,
                finish: { status: "failed", failureReason: message },
                lease,
                buildIssueUpdate: () => ({ factoryState: nextState }),
            });
            if (clearsPendingEvents) {
                this.db.issueSessions.clearPendingIssueSessionEventsWithLease(lease);
            }
            return true;
        });
        if (!updated) {
            this.logger.warn({ runId: run.id, issueId: run.linearIssueId }, "Skipping failure cleanup after losing issue-session lease");
        }
        this.releaseLease(run.projectId, run.linearIssueId);
    }

    // ─── Interrupted turns (formerly InterruptedRunRecovery) ─────────

    /**
     * Handle a run whose turn was interrupted: roll back the attempt
     * bookkeeping, then route by run type (requested-changes, implementation
     * without a PR, or the generic PR-state recovery).
     *
     * @param {object} run - The interrupted run.
     * @param {object} issue - The issue row as read by the caller.
     * @returns {Promise<void>}
     */
    async handleInterruptedRun(run, issue) {
        const { projectId, linearIssueId, runType } = run;
        this.logger.warn({ issueKey: issue.issueKey, runType, threadId: run.threadId }, "Run has interrupted turn - marking as failed");

        const leaseStillHeld = this.withHeldLease(issue.projectId, issue.linearIssueId, (lease) => {
            // The decrement is read-modify-write against an issue row read
            // earlier by the caller; on conflict, recompute from the fresh row.
            const rollback = buildInterruptedAttemptRepairUpdate(runType, issue);
            if (rollback) {
                this.db.issueSessions.commitIssueState({
                    writer: WRITER,
                    lease,
                    expectedVersion: issue.version,
                    update: rollback,
                    onConflict: (current) => buildInterruptedAttemptRepairUpdate(runType, current),
                });
            }
            return true;
        });
        if (!leaseStillHeld) {
            this.logger.warn({ runId: run.id, issueId: linearIssueId }, "Skipping interrupted-run recovery after losing issue-session lease");
            this.releaseLease(projectId, linearIssueId);
            return;
        }

        if (isRequestedChangesRunType(runType)) {
            await this.handleInterruptedRequestedChangesRun(run, issue);
            return;
        }
        if (runType === "implementation" && !issue.prNumber) {
            await this.handleInterruptedImplementationRun(run, issue);
            return;
        }

        const latestIssue = this.db.issues.getIssue(projectId, linearIssueId) ?? issue;
        const recoveredState = resolvePostRunFactoryState(latestIssue, run, { outcome: "recovered" });
        this.failRunAndClear({ run, message: "Codex turn was interrupted", nextState: recoveredState ?? "failed" });
        await this.restoreIdleWorktree(issue);

        const failedIssue = this.db.issues.getIssue(projectId, linearIssueId) ?? issue;
        if (recoveredState) {
            this.feed?.publish({
                level: "info",
                kind: "stage",
                issueKey: issue.issueKey,
                projectId,
                stage: recoveredState,
                status: "reconciled",
                summary: `Interrupted ${runType} recovered -> ${recoveredState}`,
            });
        }
        else {
            void this.linearSync.emitActivity(failedIssue, buildRunFailureActivity(runType, "The Codex turn was interrupted."));
        }
        void this.linearSync.syncSession(failedIssue, { activeRunType: runType });
        this.releaseLease(projectId, linearIssueId);
    }

    /**
     * Handle an interrupted implementation run that never produced a PR: fail
     * the run back to `delegated`, append a `delegated` session event, and
     * either report a retry (a wake is pending) or an escalation (none is).
     *
     * @param {object} run - The interrupted implementation run.
     * @param {object} issue - The issue row as read by the caller.
     * @returns {Promise<void>}
     */
    async handleInterruptedImplementationRun(run, issue) {
        const { projectId, linearIssueId, runType } = run;
        const interruptedMessage = "Implementation run was interrupted before PatchRelay could publish a PR";
        this.failRunAndClear({ run, message: "Codex turn was interrupted", nextState: "delegated" });
        await this.restoreIdleWorktree(issue);

        const refreshedIssue = this.db.issues.getIssue(projectId, linearIssueId) ?? issue;
        this.db.issueSessions.appendIssueSessionEventRespectingActiveLease(projectId, linearIssueId, {
            projectId,
            linearIssueId,
            eventType: "delegated",
            dedupeKey: `interrupted_implementation:implementation:${linearIssueId}`,
        });

        const wakeIsPending = this.db.workflowWakes.peekIssueWake(projectId, linearIssueId);
        if (wakeIsPending) {
            this.feed?.publish({
                level: "warn",
                kind: "workflow",
                issueKey: issue.issueKey,
                projectId,
                stage: runType,
                status: "retry_queued",
                summary: "Implementation run was interrupted; PatchRelay will retry automatically",
            });
            const recoveredIssue = this.db.issues.getIssue(projectId, linearIssueId) ?? refreshedIssue;
            void this.linearSync.syncSession(recoveredIssue, { activeRunType: runType });
            this.wakeDispatcher.dispatchIfWakePending(projectId, linearIssueId);
        }
        else {
            const failedIssue = this.db.issues.getIssue(projectId, linearIssueId) ?? refreshedIssue;
            this.feed?.publish({
                level: "error",
                kind: "workflow",
                issueKey: issue.issueKey,
                projectId,
                stage: runType,
                status: "escalated",
                summary: interruptedMessage,
            });
            void this.linearSync.emitActivity(failedIssue, buildRunFailureActivity(runType, interruptedMessage));
            void this.linearSync.syncSession(failedIssue, { activeRunType: runType });
        }
        this.releaseLease(projectId, linearIssueId);
    }

    /**
     * Handle an interrupted requested-changes run: refresh the issue via the
     * completion policy, resolve the recovered state, fail the run into it,
     * and queue a retry when the state is still `changes_requested`.
     *
     * @param {object} run - The interrupted requested-changes run.
     * @param {object} issue - The issue row as read by the caller.
     * @returns {Promise<void>}
     */
    async handleInterruptedRequestedChangesRun(run, issue) {
        const { projectId, linearIssueId, runType } = run;
        const freshIssue = this.db.issues.getIssue(projectId, linearIssueId) ?? issue;
        const refreshedIssue = await this.completionPolicy.refreshIssueAfterReactivePublish(run, freshIssue);

        const seedContext = runType === "branch_upkeep"
            ? {
                branchUpkeepRequired: true,
                reviewFixMode: "branch_upkeep",
                wakeReason: "branch_upkeep",
            }
            : undefined;
        const retryContext = await this.completionPolicy.resolveRequestedChangesWakeContext(refreshedIssue, runType, seedContext);
        const retryRunType = resolveRetryRunType(runType, retryContext);
        const recoveredState = resolvePostRunFactoryState(refreshedIssue, run, { outcome: "recovered" }) ?? "failed";
        const interruptedMessage = "Requested-changes run was interrupted before PatchRelay could verify that a new PR head was published";

        this.failRunAndClear({ run, message: interruptedMessage, nextState: recoveredState });
        await this.restoreIdleWorktree(issue);
        const recoveredIssue = this.db.issues.getIssue(projectId, linearIssueId) ?? refreshedIssue;

        if (recoveredState === "changes_requested") {
            this.db.issueSessions.commitIssueState({
                writer: WRITER,
                update: {
                    projectId,
                    linearIssueId,
                    pendingRunType: retryRunType,
                    pendingRunContextJson: retryContext ? JSON.stringify(retryContext) : null,
                },
            });
            this.feed?.publish({
                level: "warn",
                kind: "workflow",
                issueKey: issue.issueKey,
                projectId,
                stage: runType,
                status: "retry_queued",
                summary: "Requested-changes run was interrupted; PatchRelay will retry from fresh GitHub truth",
            });
            this.wakeDispatcher.dispatchIfWakePending(projectId, linearIssueId);
        }
        else {
            this.feed?.publish({
                level: "error",
                kind: "workflow",
                issueKey: issue.issueKey,
                projectId,
                stage: runType,
                status: "escalated",
                summary: interruptedMessage,
            });
        }
        void this.linearSync.emitActivity(recoveredIssue, buildRunFailureActivity(runType, interruptedMessage));
        void this.linearSync.syncSession(recoveredIssue, { activeRunType: runType });
        this.releaseLease(projectId, linearIssueId);
    }
}
|