patchrelay 0.80.1 → 0.81.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/build-info.json +3 -3
- package/dist/codex-capacity.js +42 -0
- package/dist/db/issue-store.js +3 -0
- package/dist/db/issue-upsert-columns.js +1 -0
- package/dist/db/migrations.js +3 -0
- package/dist/idle-reconciliation.js +7 -0
- package/dist/run-budgets.js +24 -0
- package/dist/run-failure-policy.js +78 -8
- package/dist/run-notification-handler.js +21 -2
- package/dist/run-orchestrator.js +21 -5
- package/dist/run-reconciler.js +21 -0
- package/dist/run-reporting.js +20 -0
- package/dist/telemetry.js +13 -0
- package/package.json +1 -1
package/dist/build-info.json
CHANGED
|
package/dist/codex-capacity.js
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
// Classifies Codex turn failures that are capacity outages (account usage
// limit, rate limit, quota) rather than evidence about the work itself.
// A capacity failure must not consume repair budgets or escalate an issue —
// the RunFailurePolicy defers the same wake behind a backoff instead
// (see deferCapacityLimitedRun).

// Known capacity phrasings. The real production string is
// "You've hit your usage limit. Upgrade to Pro (...) or try again at 3:23 AM.";
// rate-limit and quota phrasings are matched defensively for the API-key path.
// "quota" must not be embedded in a longer alphabetic word: a bare /quota/
// also matches "quotation", which would misfile a genuine failure as a
// capacity outage. Only letters are excluded on either side, so identifiers
// such as "insufficient_quota" still match.
const CAPACITY_PATTERNS = [
    /usage limit/i,
    /rate limit/i,
    /(?<![a-z])quotas?(?![a-z])/i,
];

/**
 * Decides whether a failed turn's error text describes a capacity outage.
 *
 * @param {string | null | undefined} errorMessage Raw error text of the failed turn.
 * @param {Date} [now] Reference instant used to resolve a "try again at" clock time.
 * @returns {{ kind: "other" } | { kind: "capacity", detail: string, retryAtIso?: string }}
 *   `capacity` carries the trimmed message and, when a retry time could be
 *   parsed from it, that time as an ISO-8601 string.
 */
export function classifyCodexFailure(errorMessage, now = new Date()) {
    // A non-string payload has no phrasing to match; treat it like a missing
    // message instead of throwing on `.trim()`.
    const detail = typeof errorMessage === "string" ? errorMessage.trim() : "";
    if (!detail || !CAPACITY_PATTERNS.some((pattern) => pattern.test(detail))) {
        return { kind: "other" };
    }
    const retryAtIso = parseRetryAt(detail, now);
    return { kind: "capacity", detail, ...(retryAtIso ? { retryAtIso } : {}) };
}

/**
 * Parses "try again at 3:23 AM" into the NEXT such wall-clock time in the
 * host's local timezone (today if still ahead, otherwise tomorrow).
 *
 * @param {string} message Error text to search.
 * @param {Date} now Reference instant.
 * @returns {string | undefined} ISO-8601 timestamp, or undefined when the
 *   message has no parsable 12-hour clock time or `now` is not a valid date.
 */
function parseRetryAt(message, now) {
    const match = /try again at (\d{1,2})(?::(\d{2}))?\s*([ap])\.?m\.?/i.exec(message);
    if (!match) {
        return undefined;
    }
    const rawHour = Number(match[1]);
    const minute = match[2] === undefined ? 0 : Number(match[2]);
    if (rawHour < 1 || rawHour > 12 || minute > 59) {
        return undefined;
    }
    const isPm = match[3]?.toLowerCase() === "p";
    // 12 AM is hour 0 and 12 PM is hour 12, hence the modulo before the offset.
    const hour = (rawHour % 12) + (isPm ? 12 : 0);
    const candidate = new Date(now);
    candidate.setHours(hour, minute, 0, 0);
    // An invalid reference date stays invalid; toISOString() would throw on it.
    if (Number.isNaN(candidate.getTime())) {
        return undefined;
    }
    if (candidate.getTime() <= now.getTime()) {
        candidate.setDate(candidate.getDate() + 1);
    }
    return candidate.toISOString();
}
|
package/dist/db/issue-store.js
CHANGED
|
@@ -548,6 +548,9 @@ export function mapIssueRow(row) {
|
|
|
548
548
|
reviewFixAttempts: Number(row.review_fix_attempts ?? 0),
|
|
549
549
|
zombieRecoveryAttempts: Number(row.zombie_recovery_attempts ?? 0),
|
|
550
550
|
...(row.last_zombie_recovery_at !== null && row.last_zombie_recovery_at !== undefined ? { lastZombieRecoveryAt: String(row.last_zombie_recovery_at) } : {}),
|
|
551
|
+
...(row.capacity_backoff_until !== null && row.capacity_backoff_until !== undefined
|
|
552
|
+
? { capacityBackoffUntil: String(row.capacity_backoff_until) }
|
|
553
|
+
: {}),
|
|
551
554
|
...(row.orchestration_settle_until !== null && row.orchestration_settle_until !== undefined
|
|
552
555
|
? { orchestrationSettleUntil: String(row.orchestration_settle_until) }
|
|
553
556
|
: {}),
|
|
package/dist/db/issue-upsert-columns.js
CHANGED
|
@@ -70,6 +70,7 @@ export const ISSUE_COLUMN_DEFS = {
|
|
|
70
70
|
reviewFixAttempts: { column: "review_fix_attempts", insertDefault: 0 },
|
|
71
71
|
zombieRecoveryAttempts: { column: "zombie_recovery_attempts", insertDefault: 0 },
|
|
72
72
|
lastZombieRecoveryAt: { column: "last_zombie_recovery_at" },
|
|
73
|
+
capacityBackoffUntil: { column: "capacity_backoff_until" },
|
|
73
74
|
orchestrationSettleUntil: { column: "orchestration_settle_until" },
|
|
74
75
|
deployStartedAt: { column: "deploy_started_at" },
|
|
75
76
|
};
|
package/dist/db/migrations.js
CHANGED
|
@@ -369,6 +369,9 @@ export function runPatchRelayMigrations(connection) {
|
|
|
369
369
|
// Optimistic-concurrency counter for issue-state writes (core
|
|
370
370
|
// simplification plan, phase A). Bumped on every UPDATE by upsertIssue.
|
|
371
371
|
addColumnIfMissing(connection, "issues", "version", "INTEGER NOT NULL DEFAULT 0");
|
|
372
|
+
// Codex capacity backoff: launches are deferred until this timestamp
|
|
373
|
+
// after a usage-limit / rate-limit / quota failure.
|
|
374
|
+
addColumnIfMissing(connection, "issues", "capacity_backoff_until", "TEXT");
|
|
372
375
|
}
|
|
373
376
|
function addColumnIfMissing(connection, table, column, definition) {
|
|
374
377
|
const cols = connection.prepare(`PRAGMA table_info(${table})`).all();
|
|
package/dist/idle-reconciliation.js
CHANGED
|
@@ -579,6 +579,13 @@ export class IdleIssueReconciler {
|
|
|
579
579
|
projectId: issue.projectId,
|
|
580
580
|
linearIssueId: issue.linearIssueId,
|
|
581
581
|
...buildPrStateUpdates(pr, gateCheckStatus, gateCheckNames[0] ?? "verify"),
|
|
582
|
+
// A newly observed head is the poll-side equivalent of a lost
|
|
583
|
+
// pr_synchronize: the webhook path resets the repair budgets for
|
|
584
|
+
// the fresh head, so re-derivation must too — otherwise the new
|
|
585
|
+
// head inherits the old head's consumed budget and escalates
|
|
586
|
+
// earlier. Provenance clearing stays governed by
|
|
587
|
+
// mayClearFailureProvenance at the advance sites below.
|
|
588
|
+
...(headAdvanced ? { ciRepairAttempts: 0, queueRepairAttempts: 0 } : {}),
|
|
582
589
|
},
|
|
583
590
|
});
|
|
584
591
|
// Continue the pass with the refreshed row so later version-checked
|
package/dist/run-budgets.js
CHANGED
|
@@ -44,3 +44,27 @@ export function getRemainingZombieRecoveryDelayMs(lastRecoveryAt, recoveryAttemp
|
|
|
44
44
|
const delay = getZombieRecoveryDelayMs(recoveryAttempts);
|
|
45
45
|
return Math.max(0, recoveredAtMs + delay - now);
|
|
46
46
|
}
|
|
47
|
+
// ─── Codex capacity backoff ──────────────────────────────────────────
//
// Fallback wait applied when no usable provider-announced retry time exists.
export const CAPACITY_RETRY_BACKOFF_MS = 600_000;
// Exclusive upper bound of the random jitter added to an announced retry time.
const CAPACITY_RETRY_JITTER_MS = 60_000;

/**
 * Computes the timestamp until which launches are held back after a Codex
 * capacity failure.
 *
 * @param {string | undefined} retryAtIso Retry time parsed from the error, if any.
 * @param {number} [now] Current time in epoch milliseconds.
 * @param {number} [jitterMs] Jitter added to an announced retry time; random
 *   in [0, CAPACITY_RETRY_JITTER_MS) by default.
 * @returns {string} ISO-8601 timestamp: the announced time plus jitter when it
 *   parses and lies strictly after `now`, otherwise `now` plus the fixed backoff.
 */
export function resolveCapacityBackoffUntil(retryAtIso, now = Date.now(), jitterMs = Math.floor(Math.random() * CAPACITY_RETRY_JITTER_MS)) {
    const announcedMs = retryAtIso === undefined ? Number.NaN : Date.parse(retryAtIso);
    const announcedIsUsable = Number.isFinite(announcedMs) && announcedMs > now;
    let deadlineMs;
    if (announcedIsUsable) {
        deadlineMs = announcedMs + jitterMs;
    }
    else {
        deadlineMs = now + CAPACITY_RETRY_BACKOFF_MS;
    }
    return new Date(deadlineMs).toISOString();
}
|
|
63
|
+
/**
 * Milliseconds left until a stored capacity backoff elapses.
 *
 * @param {string | null | undefined} capacityBackoffUntil Stored backoff timestamp.
 * @param {number} [now] Current time in epoch milliseconds.
 * @returns {number} Remaining wait, never negative; 0 when the value is
 *   missing, empty, unparsable, or already in the past.
 */
export function getRemainingCapacityBackoffMs(capacityBackoffUntil, now = Date.now()) {
    const deadlineMs = capacityBackoffUntil ? Date.parse(capacityBackoffUntil) : Number.NaN;
    return Number.isFinite(deadlineMs) ? Math.max(0, deadlineMs - now) : 0;
}
|
|
package/dist/run-failure-policy.js
CHANGED
|
@@ -1,14 +1,16 @@
|
|
|
1
1
|
import { buildRunFailureActivity } from "./linear-session-reporting.js";
|
|
2
|
-
import { getRemainingZombieRecoveryDelayMs, getZombieRecoveryBudget } from "./run-budgets.js";
|
|
2
|
+
import { getRemainingZombieRecoveryDelayMs, getZombieRecoveryBudget, resolveCapacityBackoffUntil } from "./run-budgets.js";
|
|
3
|
+
import { emitTelemetry, noopTelemetry } from "./telemetry.js";
|
|
3
4
|
import { resolvePostRunFactoryState } from "./run-completion-policy.js";
|
|
4
5
|
import { isRequestedChangesRunType } from "./reactive-pr-state.js";
|
|
5
6
|
import { serializeRunContext } from "./run-context.js";
|
|
6
7
|
import { settleRun } from "./run-settlement.js";
|
|
7
8
|
const WRITER = "run-failure-policy";
|
|
8
|
-
// Roll back the attempt counter consumed
|
|
9
|
-
// attempted-failure provenance for repair runs,
|
|
10
|
-
//
|
|
11
|
-
|
|
9
|
+
// Roll back the attempt counter consumed at launch and clear the
|
|
10
|
+
// attempted-failure provenance for repair runs, so a run that died without
|
|
11
|
+
// evidence about the work (interrupted turn, capacity outage) neither burns
|
|
12
|
+
// a budget unit nor blocks the same failure from re-deriving a wake.
|
|
13
|
+
function buildAttemptRefundFields(runType, issue) {
|
|
12
14
|
const counter = runType === "ci_repair" && issue.ciRepairAttempts > 0
|
|
13
15
|
? { ciRepairAttempts: issue.ciRepairAttempts - 1 }
|
|
14
16
|
: runType === "queue_repair" && issue.queueRepairAttempts > 0
|
|
@@ -25,11 +27,18 @@ function buildInterruptedAttemptRepairUpdate(runType, issue) {
|
|
|
25
27
|
: undefined;
|
|
26
28
|
if (!counter && !provenance)
|
|
27
29
|
return undefined;
|
|
30
|
+
return { ...counter, ...provenance };
|
|
31
|
+
}
|
|
32
|
+
// The interrupted-run variant: same refund, committed as a single issue
|
|
33
|
+
// update so the whole repair commits (and conflict-recomputes) atomically.
|
|
34
|
+
function buildInterruptedAttemptRepairUpdate(runType, issue) {
|
|
35
|
+
const fields = buildAttemptRefundFields(runType, issue);
|
|
36
|
+
if (!fields)
|
|
37
|
+
return undefined;
|
|
28
38
|
return {
|
|
29
39
|
projectId: issue.projectId,
|
|
30
40
|
linearIssueId: issue.linearIssueId,
|
|
31
|
-
...
|
|
32
|
-
...provenance,
|
|
41
|
+
...fields,
|
|
33
42
|
};
|
|
34
43
|
}
|
|
35
44
|
function resolveRetryRunType(runType, context) {
|
|
@@ -63,7 +72,8 @@ export class RunFailurePolicy {
|
|
|
63
72
|
completionPolicy;
|
|
64
73
|
resolveProject;
|
|
65
74
|
feed;
|
|
66
|
-
|
|
75
|
+
telemetry;
|
|
76
|
+
constructor(db, logger, linearSync, withHeldLease, releaseLease, appendWakeEventWithLease, wakeDispatcher, restoreIdleWorktree, completionPolicy, resolveProject, feed, telemetry = noopTelemetry) {
|
|
67
77
|
this.db = db;
|
|
68
78
|
this.logger = logger;
|
|
69
79
|
this.linearSync = linearSync;
|
|
@@ -75,6 +85,7 @@ export class RunFailurePolicy {
|
|
|
75
85
|
this.completionPolicy = completionPolicy;
|
|
76
86
|
this.resolveProject = resolveProject;
|
|
77
87
|
this.feed = feed;
|
|
88
|
+
this.telemetry = telemetry;
|
|
78
89
|
}
|
|
79
90
|
// ─── Stranded runs (zombie / stale thread) ───────────────────────
|
|
80
91
|
/**
|
|
@@ -233,6 +244,65 @@ export class RunFailurePolicy {
|
|
|
233
244
|
this.wakeDispatcher.dispatchIfWakePending(fresh.projectId, fresh.linearIssueId);
|
|
234
245
|
this.logger.info({ issueKey: fresh.issueKey, attempts, reason }, "Recovery: re-enqueued with backoff");
|
|
235
246
|
}
|
|
247
|
+
// ─── Capacity outages ────────────────────────────────────────────
|
|
248
|
+
/**
|
|
249
|
+
* A Codex capacity failure (usage limit / rate limit / quota) is not
|
|
250
|
+
* evidence that the work is impossible: settle the run as failed with the
|
|
251
|
+
* real error text, refund the attempt counter consumed at launch, return
|
|
252
|
+
* the issue to the state that routes the same work, and re-enqueue the
|
|
253
|
+
* same wake behind a capacity backoff — never a budget burn, never an
|
|
254
|
+
* escalation.
|
|
255
|
+
*/
|
|
256
|
+
deferCapacityLimitedRun(params) {
|
|
257
|
+
const { run, capacity } = params;
|
|
258
|
+
const capacityBackoffUntil = resolveCapacityBackoffUntil(capacity.retryAtIso);
|
|
259
|
+
const deferred = this.withHeldLease(run.projectId, run.linearIssueId, (lease) => {
|
|
260
|
+
const settled = settleRun({
|
|
261
|
+
db: this.db,
|
|
262
|
+
run,
|
|
263
|
+
finish: {
|
|
264
|
+
status: "failed",
|
|
265
|
+
...(params.threadId ? { threadId: params.threadId } : {}),
|
|
266
|
+
...(params.turnId ? { turnId: params.turnId } : {}),
|
|
267
|
+
failureReason: params.failureReason,
|
|
268
|
+
},
|
|
269
|
+
lease,
|
|
270
|
+
buildIssueUpdate: (record) => ({
|
|
271
|
+
...buildAttemptRefundFields(run.runType, record),
|
|
272
|
+
pendingRunType: null,
|
|
273
|
+
pendingRunContextJson: null,
|
|
274
|
+
// The hold state that routes this work again, resolved from fresh
|
|
275
|
+
// GitHub truth like the interrupted-run recovery path. Never a
|
|
276
|
+
// terminal state: an unresolvable hold keeps the current one.
|
|
277
|
+
factoryState: resolvePostRunFactoryState(record, run, { outcome: "recovered" })
|
|
278
|
+
?? (run.runType === "implementation" ? "delegated" : record.factoryState),
|
|
279
|
+
capacityBackoffUntil,
|
|
280
|
+
}),
|
|
281
|
+
});
|
|
282
|
+
const wakeIssue = settled.issue ?? params.issue;
|
|
283
|
+
return this.appendWakeEventWithLease(lease, wakeIssue, run.runType, undefined, `capacity:${run.id}`);
|
|
284
|
+
});
|
|
285
|
+
this.linearSync.clearProgress(run.id);
|
|
286
|
+
if (!deferred) {
|
|
287
|
+
this.logger.warn({ runId: run.id, issueId: run.linearIssueId }, "Skipping capacity deferral after losing issue-session lease");
|
|
288
|
+
this.releaseLease(run.projectId, run.linearIssueId);
|
|
289
|
+
return;
|
|
290
|
+
}
|
|
291
|
+
const issue = this.db.issues.getIssue(run.projectId, run.linearIssueId) ?? params.issue;
|
|
292
|
+
this.logger.warn({ issueKey: issue.issueKey, runType: run.runType, detail: capacity.detail, capacityBackoffUntil }, "Codex capacity limit - deferring retry without consuming budget");
|
|
293
|
+
emitTelemetry(this.telemetry, {
|
|
294
|
+
type: "run.capacity_deferred",
|
|
295
|
+
projectId: run.projectId,
|
|
296
|
+
linearIssueId: run.linearIssueId,
|
|
297
|
+
...(issue.issueKey ? { issueKey: issue.issueKey } : {}),
|
|
298
|
+
runId: run.id,
|
|
299
|
+
runType: run.runType,
|
|
300
|
+
detail: capacity.detail,
|
|
301
|
+
...(capacity.retryAtIso ? { retryAtIso: capacity.retryAtIso } : {}),
|
|
302
|
+
});
|
|
303
|
+
void this.linearSync.syncSession(issue, { activeRunType: run.runType });
|
|
304
|
+
this.releaseLease(run.projectId, run.linearIssueId);
|
|
305
|
+
}
|
|
236
306
|
// ─── Terminal decisions ──────────────────────────────────────────
|
|
237
307
|
escalate(params) {
|
|
238
308
|
const { issue, runType, reason } = params;
|
|
package/dist/run-notification-handler.js
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import { buildRunFailureActivity } from "./linear-session-reporting.js";
|
|
2
|
-
import { extractTurnId, resolveRunCompletionStatus } from "./run-reporting.js";
|
|
2
|
+
import { buildFailedTurnFailureReason, extractTurnErrorMessage, extractTurnId, resolveRunCompletionStatus } from "./run-reporting.js";
|
|
3
|
+
import { classifyCodexFailure } from "./codex-capacity.js";
|
|
3
4
|
import { resolveFailureFactoryState } from "./reactive-pr-state.js";
|
|
4
5
|
const WRITER = "run-notification-handler";
|
|
5
6
|
const DEFAULT_PUBLISH_COMMAND_TIMEOUT_MS = 10 * 60 * 1000;
|
|
@@ -76,7 +77,25 @@ export class RunNotificationHandler {
|
|
|
76
77
|
const completedTurnId = extractTurnId(notification.params);
|
|
77
78
|
const status = resolveRunCompletionStatus(notification.params);
|
|
78
79
|
if (status === "failed") {
|
|
79
|
-
const
|
|
80
|
+
const turnErrorMessage = extractTurnErrorMessage(notification.params);
|
|
81
|
+
const failureReason = buildFailedTurnFailureReason(turnErrorMessage);
|
|
82
|
+
// A capacity outage (usage limit / rate limit / quota) is not evidence
|
|
83
|
+
// about the work: defer the same wake behind a backoff instead of
|
|
84
|
+
// consuming an attempt budget or escalating. Only an actual error
|
|
85
|
+
// message classifies — interrupted turns stay on their own path.
|
|
86
|
+
const capacity = classifyCodexFailure(turnErrorMessage);
|
|
87
|
+
if (capacity.kind === "capacity" && this.options.deferCapacityLimitedRun) {
|
|
88
|
+
this.options.deferCapacityLimitedRun({
|
|
89
|
+
run,
|
|
90
|
+
issue,
|
|
91
|
+
failureReason,
|
|
92
|
+
capacity,
|
|
93
|
+
threadId,
|
|
94
|
+
...(completedTurnId ? { turnId: completedTurnId } : {}),
|
|
95
|
+
});
|
|
96
|
+
this.activeThreadId = undefined;
|
|
97
|
+
return;
|
|
98
|
+
}
|
|
80
99
|
const recovered = await this.runFinalizer.recoverFailedImplementationRun({
|
|
81
100
|
run,
|
|
82
101
|
issue,
|
package/dist/run-orchestrator.js
CHANGED
|
@@ -18,7 +18,7 @@ import { RunReconciler } from "./run-reconciler.js";
|
|
|
18
18
|
import { RunWakePlanner } from "./run-wake-planner.js";
|
|
19
19
|
import { WakeDispatcher } from "./wake-dispatcher.js";
|
|
20
20
|
import { settleRun } from "./run-settlement.js";
|
|
21
|
-
import { getRemainingZombieRecoveryDelayMs } from "./run-budgets.js";
|
|
21
|
+
import { getRemainingCapacityBackoffMs, getRemainingZombieRecoveryDelayMs } from "./run-budgets.js";
|
|
22
22
|
import { classifyIssue } from "./issue-class.js";
|
|
23
23
|
import { buildIssueTriageHash, IssueTriageService } from "./issue-triage.js";
|
|
24
24
|
import { loadConfig } from "./config.js";
|
|
@@ -129,8 +129,12 @@ export class RunOrchestrator {
|
|
|
129
129
|
this.issueTriage = new IssueTriageService(codex, logger);
|
|
130
130
|
this.runFinalizer = new RunFinalizer(db, logger, this.linearSync, this.wakeDispatcher, this.leasePorts.withHeldLease, this.leasePorts.releaseLease, (lease, issue, runType, context, dedupeScope) => this.appendWakeEventWithLease(lease, issue, runType, context, dedupeScope), this.recoveryPorts.failRunAndClear, this.runCompletionPolicy, this.completionCheck, feed);
|
|
131
131
|
this.runLauncher = new RunLauncher(config, db, codex, logger, this.worktreeManager);
|
|
132
|
-
this.runNotificationHandler = new RunNotificationHandler(config, db, logger, this.linearSync, this.runFinalizer, this.threadPorts.readThreadWithRetry, this.leasePorts.withHeldLease, this.leasePorts.heartbeatLease, this.leasePorts.releaseLease, feed, {
|
|
133
|
-
|
|
132
|
+
this.runNotificationHandler = new RunNotificationHandler(config, db, logger, this.linearSync, this.runFinalizer, this.threadPorts.readThreadWithRetry, this.leasePorts.withHeldLease, this.leasePorts.heartbeatLease, this.leasePorts.releaseLease, feed, {
|
|
133
|
+
interruptTurn: (options) => codex.interruptTurn(options),
|
|
134
|
+
// Lazy: the failure policy is constructed just below.
|
|
135
|
+
deferCapacityLimitedRun: (params) => this.runFailurePolicy.deferCapacityLimitedRun(params),
|
|
136
|
+
});
|
|
137
|
+
this.runFailurePolicy = new RunFailurePolicy(db, logger, this.linearSync, this.leasePorts.withHeldLease, this.leasePorts.releaseLease, (lease, issue, runType, context, dedupeScope) => this.appendWakeEventWithLease(lease, issue, runType, context, dedupeScope), this.wakeDispatcher, this.recoveryPorts.restoreIdleWorktree, this.runCompletionPolicy, (projectId) => this.config.projects.find((project) => project.id === projectId), feed, telemetry);
|
|
134
138
|
this.runReconciler = new RunReconciler(db, logger, linearProvider, this.linearSync, this.runFailurePolicy, this.runFinalizer, this.leasePorts.withHeldLease, this.leasePorts.releaseLease, this.threadPorts.readThreadWithRetry, (projectId) => this.config.projects.find((project) => project.id === projectId)?.github?.repoFullName, feed, telemetry);
|
|
135
139
|
this.runWakePlanner = new RunWakePlanner(db, logger);
|
|
136
140
|
this.linearIssueProjection = new LinearIssueProjectionService(db, linearProvider, logger);
|
|
@@ -359,6 +363,16 @@ export class RunOrchestrator {
|
|
|
359
363
|
this.releaseIssueSessionLease(item.projectId, item.issueId);
|
|
360
364
|
return;
|
|
361
365
|
}
|
|
366
|
+
// Codex capacity outage backoff: a usage-limit/rate-limit failure left a
|
|
367
|
+
// pending wake behind; the wake stays queued (the idle reconciler keeps
|
|
368
|
+
// re-poking it) and the launch waits until the backoff elapses.
|
|
369
|
+
const remainingCapacityDelayMs = getRemainingCapacityBackoffMs(issue.capacityBackoffUntil);
|
|
370
|
+
if (remainingCapacityDelayMs > 0) {
|
|
371
|
+
this.emitRunSkipped(item, "capacity_backoff", issue, { runType, remainingDelayMs: remainingCapacityDelayMs });
|
|
372
|
+
this.logger.debug({ issueKey: issue.issueKey, runType, remainingCapacityDelayMs }, "Deferring run launch until Codex capacity backoff elapses");
|
|
373
|
+
this.releaseIssueSessionLease(item.projectId, item.issueId);
|
|
374
|
+
return;
|
|
375
|
+
}
|
|
362
376
|
const baseContext = isRequestedChangesRunType(runType)
|
|
363
377
|
? await this.runCompletionPolicy.resolveRequestedChangesWakeContext(issue, runType, context)
|
|
364
378
|
: context;
|
|
@@ -488,8 +502,9 @@ export class RunOrchestrator {
|
|
|
488
502
|
this.releaseIssueSessionLease(run.projectId, run.linearIssueId);
|
|
489
503
|
return;
|
|
490
504
|
}
|
|
491
|
-
// Reset zombie recovery counter — this run
|
|
492
|
-
|
|
505
|
+
// Reset zombie recovery counter and capacity backoff — this run
|
|
506
|
+
// started successfully
|
|
507
|
+
if (issue.zombieRecoveryAttempts > 0 || issue.capacityBackoffUntil !== undefined) {
|
|
493
508
|
this.db.issueSessions.commitIssueState({
|
|
494
509
|
writer: WRITER,
|
|
495
510
|
lease: { projectId: item.projectId, linearIssueId: item.issueId, leaseId },
|
|
@@ -498,6 +513,7 @@ export class RunOrchestrator {
|
|
|
498
513
|
linearIssueId: item.issueId,
|
|
499
514
|
zombieRecoveryAttempts: 0,
|
|
500
515
|
lastZombieRecoveryAt: null,
|
|
516
|
+
capacityBackoffUntil: null,
|
|
501
517
|
},
|
|
502
518
|
});
|
|
503
519
|
}
|
package/dist/run-reconciler.js
CHANGED
|
@@ -3,6 +3,8 @@ import { TERMINAL_STATES } from "./factory-state.js";
|
|
|
3
3
|
import { resolveAuthoritativeLinearStopState } from "./linear-workflow.js";
|
|
4
4
|
import { buildRunFailureActivity } from "./linear-session-reporting.js";
|
|
5
5
|
import { getThreadTurns } from "./codex-thread-utils.js";
|
|
6
|
+
import { classifyCodexFailure } from "./codex-capacity.js";
|
|
7
|
+
import { buildFailedTurnFailureReason } from "./run-reporting.js";
|
|
6
8
|
import { resolveEffectiveActiveRun } from "./effective-active-run.js";
|
|
7
9
|
import { isThreadMaterializingError } from "./codex-thread-errors.js";
|
|
8
10
|
import { fetchPullRequestSnapshot } from "./reconcile-pr-fetch.js";
|
|
@@ -213,6 +215,25 @@ export class RunReconciler {
|
|
|
213
215
|
await this.failurePolicy.handleInterruptedRun(run, effectiveIssue);
|
|
214
216
|
return;
|
|
215
217
|
}
|
|
218
|
+
// A failed turn found during reconciliation (the live notification was
|
|
219
|
+
// lost, e.g. across a restart) whose error is a Codex capacity outage:
|
|
220
|
+
// defer the same wake instead of leaving the run dangling or burning a
|
|
221
|
+
// budget. Non-capacity failed turns keep the existing behavior — the
|
|
222
|
+
// notification path owns live settlement.
|
|
223
|
+
if (latestTurn?.status === "failed") {
|
|
224
|
+
const capacity = classifyCodexFailure(latestTurn.error?.message ?? undefined);
|
|
225
|
+
if (capacity.kind === "capacity") {
|
|
226
|
+
this.failurePolicy.deferCapacityLimitedRun({
|
|
227
|
+
run,
|
|
228
|
+
issue: effectiveIssue,
|
|
229
|
+
failureReason: buildFailedTurnFailureReason(latestTurn.error?.message ?? undefined),
|
|
230
|
+
capacity,
|
|
231
|
+
threadId: run.threadId,
|
|
232
|
+
...(latestTurn.id ? { turnId: latestTurn.id } : {}),
|
|
233
|
+
});
|
|
234
|
+
return;
|
|
235
|
+
}
|
|
236
|
+
}
|
|
216
237
|
if (latestTurn?.status === "completed") {
|
|
217
238
|
await this.runFinalizer.finalizeCompletedRun({
|
|
218
239
|
source: "reconciliation",
|
package/dist/run-reporting.js
CHANGED
|
@@ -164,6 +164,26 @@ export function extractTurnId(params) {
|
|
|
164
164
|
const id = turn.id;
|
|
165
165
|
return typeof id === "string" ? id : undefined;
|
|
166
166
|
}
|
|
167
|
+
// Generic failure reason recorded for a turn that Codex reported as failed.
export const FAILED_TURN_FAILURE_REASON = "Codex reported the turn completed in a failed state";

/**
 * Builds the persisted failure reason for a failed turn. The generic prefix
 * is always kept (existing queries and tests key on it); a non-blank Codex
 * error is appended after ": " so a capacity outage can be told apart from
 * a genuine failure in the run record.
 *
 * @param {string | null | undefined} errorMessage Raw Codex error text.
 * @returns {string} The generic reason, optionally followed by the trimmed error.
 */
export function buildFailedTurnFailureReason(errorMessage) {
    const detail = errorMessage?.trim();
    if (!detail) {
        return FAILED_TURN_FAILURE_REASON;
    }
    return `${FAILED_TURN_FAILURE_REASON}: ${detail}`;
}
|
|
175
|
+
/**
 * Reads `params.turn.error.message` from a turn notification payload.
 *
 * @param {object} params Notification params; `turn` and `turn.error` are
 *   each used only when they are non-null objects.
 * @returns {string | undefined} The message exactly as received (not
 *   trimmed) when it is a string with non-whitespace content, else undefined.
 */
export function extractTurnErrorMessage(params) {
    const isRecord = (value) => Boolean(value) && typeof value === "object";
    const { turn } = params;
    if (!isRecord(turn)) {
        return undefined;
    }
    const { error } = turn;
    if (!isRecord(error)) {
        return undefined;
    }
    const { message } = error;
    if (typeof message !== "string") {
        return undefined;
    }
    // Blank text is treated as absent.
    return message.trim() ? message : undefined;
}
|
|
167
187
|
export function buildPendingMaterializationThread(stageRun, error) {
|
|
168
188
|
return {
|
|
169
189
|
id: stageRun.threadId ?? "pending-thread",
|
package/dist/telemetry.js
CHANGED
|
@@ -78,6 +78,19 @@ export class OperatorFeedTelemetrySink {
|
|
|
78
78
|
};
|
|
79
79
|
}
|
|
80
80
|
return undefined;
|
|
81
|
+
case "run.capacity_deferred":
|
|
82
|
+
return {
|
|
83
|
+
level: "warn",
|
|
84
|
+
kind: "workflow",
|
|
85
|
+
...(event.issueKey ? { issueKey: event.issueKey } : {}),
|
|
86
|
+
...(event.projectId ? { projectId: event.projectId } : {}),
|
|
87
|
+
...(event.runType ? { stage: event.runType } : {}),
|
|
88
|
+
status: "capacity_deferred",
|
|
89
|
+
summary: event.retryAtIso
|
|
90
|
+
? `Codex capacity limit; retrying ${event.runType ?? "run"} at ${event.retryAtIso}`
|
|
91
|
+
: `Codex capacity limit; retrying ${event.runType ?? "run"} after backoff`,
|
|
92
|
+
detail: event.detail,
|
|
93
|
+
};
|
|
81
94
|
case "state.write_conflict":
|
|
82
95
|
return {
|
|
83
96
|
level: "warn",
|