patchrelay 0.80.2 → 0.82.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "service": "patchrelay",
3
- "version": "0.80.2",
4
- "commit": "439e484fd0e3",
5
- "builtAt": "2026-06-10T22:17:32.259Z"
3
+ "version": "0.82.0",
4
+ "commit": "91dee8b62165",
5
+ "builtAt": "2026-06-11T01:02:50.881Z"
6
6
  }
@@ -52,9 +52,16 @@ export async function collectClusterHealth(config, db, runCommand) {
52
52
  },
53
53
  summarize: (payload) => {
54
54
  const parsed = payload;
55
- return parsed.health?.reachable === true && parsed.health?.ok === true
56
- ? "Healthy"
57
- : `Unhealthy (${parsed.health?.reachable === false ? "service not reachable" : "service health unavailable"})`;
55
+ if (parsed.health?.reachable === true && parsed.health?.ok === true) {
56
+ return "Healthy";
57
+ }
58
+ // The review-quill health body reports an active Codex capacity
59
+ // pause as runtime.codexLimitedUntil, which `review-quill service
60
+ // status --json` passes through as health.codexLimitedUntil.
61
+ if (parsed.health?.reachable === true && typeof parsed.health?.codexLimitedUntil === "string") {
62
+ return `review-quill degraded: Codex usage limit until ${parsed.health.codexLimitedUntil}`;
63
+ }
64
+ return `Unhealthy (${parsed.health?.reachable === false ? "service not reachable" : "service health unavailable"})`;
58
65
  },
59
66
  })
60
67
  : undefined;
@@ -0,0 +1,42 @@
1
// Classifies Codex turn failures that are capacity outages (account usage
// limit, rate limit, quota) rather than evidence about the work itself.
// A capacity failure must not consume repair budgets or escalate an issue —
// the RunFailurePolicy defers the same wake behind a backoff instead
// (see deferCapacityLimitedRun).
// Known capacity phrasings. The real production string is
// "You've hit your usage limit. Upgrade to Pro (...) or try again at 3:23 AM.";
// rate-limit and quota phrasings are matched defensively for the API-key path.
// The two "limit" patterns tolerate one optional separator (whitespace, "-"
// or "_") so "rate-limited", "rate_limit_exceeded", "ratelimit" and
// "usage_limit_reached" classify the same way as the spaced spellings.
const CAPACITY_PATTERNS = [
    /usage[\s_-]?limit/i,
    /rate[\s_-]?limit/i,
    /quota/i,
];
/**
 * Classifies a Codex turn error message as a capacity outage or anything else.
 *
 * @param {unknown} errorMessage Raw error text from the failed turn. Anything
 *   that is not a non-blank string classifies as "other" instead of throwing.
 * @param {Date} [now] Reference instant used to resolve a "try again at"
 *   wall-clock time; defaults to the current time.
 * @returns {{ kind: "other" } | { kind: "capacity", detail: string, retryAtIso?: string }}
 *   `detail` is the trimmed message; `retryAtIso` is present only when a
 *   retry time could be parsed from the message.
 */
export function classifyCodexFailure(errorMessage, now = new Date()) {
    // Guard the type: `errorMessage?.trim()` throws on a truthy non-string.
    const detail = typeof errorMessage === "string" ? errorMessage.trim() : "";
    if (!detail) {
        return { kind: "other" };
    }
    if (!CAPACITY_PATTERNS.some((pattern) => pattern.test(detail))) {
        return { kind: "other" };
    }
    const retryAtIso = parseRetryAt(detail, now);
    return { kind: "capacity", detail, ...(retryAtIso ? { retryAtIso } : {}) };
}
// Parses "try again at 3:23 AM" into the NEXT such wall-clock time in the
// host's local timezone (today if still ahead, otherwise tomorrow).
// Returns undefined when no 12-hour time is present, the time is out of
// range, or `now` is not a valid instant.
function parseRetryAt(message, now) {
    const match = /try again at (\d{1,2})(?::(\d{2}))?\s*([ap])\.?m\.?/i.exec(message);
    if (!match) {
        return undefined;
    }
    const rawHour = Number(match[1]);
    const minute = match[2] === undefined ? 0 : Number(match[2]);
    if (rawHour < 1 || rawHour > 12 || minute > 59) {
        return undefined;
    }
    // An invalid reference instant would make toISOString() throw a
    // RangeError; treat it as "no retry time known" instead.
    const reference = new Date(now);
    const referenceMs = reference.getTime();
    if (Number.isNaN(referenceMs)) {
        return undefined;
    }
    const isPm = match[3]?.toLowerCase() === "p";
    // 12 AM maps to hour 0 and 12 PM to hour 12.
    const hour = (rawHour % 12) + (isPm ? 12 : 0);
    const candidate = new Date(referenceMs);
    candidate.setHours(hour, minute, 0, 0);
    if (candidate.getTime() <= referenceMs) {
        candidate.setDate(candidate.getDate() + 1);
    }
    return candidate.toISOString();
}
@@ -548,6 +548,9 @@ export function mapIssueRow(row) {
548
548
  reviewFixAttempts: Number(row.review_fix_attempts ?? 0),
549
549
  zombieRecoveryAttempts: Number(row.zombie_recovery_attempts ?? 0),
550
550
  ...(row.last_zombie_recovery_at !== null && row.last_zombie_recovery_at !== undefined ? { lastZombieRecoveryAt: String(row.last_zombie_recovery_at) } : {}),
551
+ ...(row.capacity_backoff_until !== null && row.capacity_backoff_until !== undefined
552
+ ? { capacityBackoffUntil: String(row.capacity_backoff_until) }
553
+ : {}),
551
554
  ...(row.orchestration_settle_until !== null && row.orchestration_settle_until !== undefined
552
555
  ? { orchestrationSettleUntil: String(row.orchestration_settle_until) }
553
556
  : {}),
@@ -70,6 +70,7 @@ export const ISSUE_COLUMN_DEFS = {
70
70
  reviewFixAttempts: { column: "review_fix_attempts", insertDefault: 0 },
71
71
  zombieRecoveryAttempts: { column: "zombie_recovery_attempts", insertDefault: 0 },
72
72
  lastZombieRecoveryAt: { column: "last_zombie_recovery_at" },
73
+ capacityBackoffUntil: { column: "capacity_backoff_until" },
73
74
  orchestrationSettleUntil: { column: "orchestration_settle_until" },
74
75
  deployStartedAt: { column: "deploy_started_at" },
75
76
  };
@@ -369,6 +369,9 @@ export function runPatchRelayMigrations(connection) {
369
369
  // Optimistic-concurrency counter for issue-state writes (core
370
370
  // simplification plan, phase A). Bumped on every UPDATE by upsertIssue.
371
371
  addColumnIfMissing(connection, "issues", "version", "INTEGER NOT NULL DEFAULT 0");
372
+ // Codex capacity backoff: launches are deferred until this timestamp
373
+ // after a usage-limit / rate-limit / quota failure.
374
+ addColumnIfMissing(connection, "issues", "capacity_backoff_until", "TEXT");
372
375
  }
373
376
  function addColumnIfMissing(connection, table, column, definition) {
374
377
  const cols = connection.prepare(`PRAGMA table_info(${table})`).all();
@@ -44,3 +44,27 @@ export function getRemainingZombieRecoveryDelayMs(lastRecoveryAt, recoveryAttemp
44
44
  const delay = getZombieRecoveryDelayMs(recoveryAttempts);
45
45
  return Math.max(0, recoveredAtMs + delay - now);
46
46
  }
47
// ─── Codex capacity backoff ──────────────────────────────────────────
//
// A run that failed on a Codex capacity outage (usage limit / rate limit /
// quota) is re-enqueued, not escalated, and never consumes a repair budget.
// The retry waits until the provider-announced retry time when one was
// parsed from the error (plus a small jitter so a fleet of issues does not
// stampede the moment the limit resets), else this fixed backoff.
export const CAPACITY_RETRY_BACKOFF_MS = 10 * 60_000;
const CAPACITY_RETRY_JITTER_MS = 60_000;
/**
 * Resolves the ISO timestamp until which launches should be deferred.
 *
 * @param {string | undefined} retryAtIso Provider-announced retry time, if
 *   one was parsed. Ignored when unparsable or not strictly after `now`.
 * @param {number} [now] Reference time in epoch milliseconds. A non-finite
 *   value falls back to the current time.
 * @param {number} [jitterMs] Delay added on top of the announced retry time.
 *   Non-finite or negative values are treated as 0 so the result is never
 *   an Invalid Date and never earlier than the announced time.
 * @returns {string} ISO-8601 timestamp: `retryAtIso + jitter` when usable,
 *   otherwise `now + CAPACITY_RETRY_BACKOFF_MS`.
 */
export function resolveCapacityBackoffUntil(retryAtIso, now = Date.now(), jitterMs = Math.floor(Math.random() * CAPACITY_RETRY_JITTER_MS)) {
    // Number() also accepts a Date; anything non-finite would otherwise
    // reach new Date(NaN).toISOString(), which throws a RangeError.
    const coercedNowMs = Number(now);
    const nowMs = Number.isFinite(coercedNowMs) ? coercedNowMs : Date.now();
    const safeJitterMs = Number.isFinite(jitterMs) && jitterMs > 0 ? jitterMs : 0;
    const retryAtMs = retryAtIso !== undefined ? Date.parse(retryAtIso) : Number.NaN;
    const untilMs = Number.isFinite(retryAtMs) && retryAtMs > nowMs
        ? retryAtMs + safeJitterMs
        : nowMs + CAPACITY_RETRY_BACKOFF_MS;
    return new Date(untilMs).toISOString();
}
63
/**
 * Milliseconds still to wait before a capacity-deferred launch may proceed.
 *
 * @param {string | null | undefined} capacityBackoffUntil Stored backoff
 *   deadline; an empty or unparsable value means no backoff is active.
 * @param {number} [now] Reference time in epoch milliseconds.
 * @returns {number} Remaining delay, never negative; 0 when no backoff applies.
 */
export function getRemainingCapacityBackoffMs(capacityBackoffUntil, now = Date.now()) {
    const deadlineMs = capacityBackoffUntil ? Date.parse(capacityBackoffUntil) : Number.NaN;
    return Number.isFinite(deadlineMs) ? Math.max(0, deadlineMs - now) : 0;
}
@@ -1,14 +1,16 @@
1
1
  import { buildRunFailureActivity } from "./linear-session-reporting.js";
2
- import { getRemainingZombieRecoveryDelayMs, getZombieRecoveryBudget } from "./run-budgets.js";
2
+ import { getRemainingZombieRecoveryDelayMs, getZombieRecoveryBudget, resolveCapacityBackoffUntil } from "./run-budgets.js";
3
+ import { emitTelemetry, noopTelemetry } from "./telemetry.js";
3
4
  import { resolvePostRunFactoryState } from "./run-completion-policy.js";
4
5
  import { isRequestedChangesRunType } from "./reactive-pr-state.js";
5
6
  import { serializeRunContext } from "./run-context.js";
6
7
  import { settleRun } from "./run-settlement.js";
7
8
  const WRITER = "run-failure-policy";
8
- // Roll back the attempt counter consumed by the interrupted run and clear the
9
- // attempted-failure provenance for repair runs, as a single issue update so
10
- // the whole repair commits (and conflict-recomputes) atomically.
11
- function buildInterruptedAttemptRepairUpdate(runType, issue) {
9
+ // Roll back the attempt counter consumed at launch and clear the
10
+ // attempted-failure provenance for repair runs, so a run that died without
11
+ // evidence about the work (interrupted turn, capacity outage) neither burns
12
+ // a budget unit nor blocks the same failure from re-deriving a wake.
13
+ function buildAttemptRefundFields(runType, issue) {
12
14
  const counter = runType === "ci_repair" && issue.ciRepairAttempts > 0
13
15
  ? { ciRepairAttempts: issue.ciRepairAttempts - 1 }
14
16
  : runType === "queue_repair" && issue.queueRepairAttempts > 0
@@ -25,11 +27,18 @@ function buildInterruptedAttemptRepairUpdate(runType, issue) {
25
27
  : undefined;
26
28
  if (!counter && !provenance)
27
29
  return undefined;
30
+ return { ...counter, ...provenance };
31
+ }
32
+ // The interrupted-run variant: same refund, committed as a single issue
33
+ // update so the whole repair commits (and conflict-recomputes) atomically.
34
+ function buildInterruptedAttemptRepairUpdate(runType, issue) {
35
+ const fields = buildAttemptRefundFields(runType, issue);
36
+ if (!fields)
37
+ return undefined;
28
38
  return {
29
39
  projectId: issue.projectId,
30
40
  linearIssueId: issue.linearIssueId,
31
- ...counter,
32
- ...provenance,
41
+ ...fields,
33
42
  };
34
43
  }
35
44
  function resolveRetryRunType(runType, context) {
@@ -63,7 +72,8 @@ export class RunFailurePolicy {
63
72
  completionPolicy;
64
73
  resolveProject;
65
74
  feed;
66
- constructor(db, logger, linearSync, withHeldLease, releaseLease, appendWakeEventWithLease, wakeDispatcher, restoreIdleWorktree, completionPolicy, resolveProject, feed) {
75
+ telemetry;
76
+ constructor(db, logger, linearSync, withHeldLease, releaseLease, appendWakeEventWithLease, wakeDispatcher, restoreIdleWorktree, completionPolicy, resolveProject, feed, telemetry = noopTelemetry) {
67
77
  this.db = db;
68
78
  this.logger = logger;
69
79
  this.linearSync = linearSync;
@@ -75,6 +85,7 @@ export class RunFailurePolicy {
75
85
  this.completionPolicy = completionPolicy;
76
86
  this.resolveProject = resolveProject;
77
87
  this.feed = feed;
88
+ this.telemetry = telemetry;
78
89
  }
79
90
  // ─── Stranded runs (zombie / stale thread) ───────────────────────
80
91
  /**
@@ -233,6 +244,65 @@ export class RunFailurePolicy {
233
244
  this.wakeDispatcher.dispatchIfWakePending(fresh.projectId, fresh.linearIssueId);
234
245
  this.logger.info({ issueKey: fresh.issueKey, attempts, reason }, "Recovery: re-enqueued with backoff");
235
246
  }
247
// ─── Capacity outages ────────────────────────────────────────────
/**
 * A Codex capacity failure (usage limit / rate limit / quota) is not
 * evidence that the work is impossible: settle the run as failed with the
 * real error text, refund the attempt counter consumed at launch, return
 * the issue to the state that routes the same work, and re-enqueue the
 * same wake behind a capacity backoff — never a budget burn, never an
 * escalation.
 *
 * Fields read from `params`:
 * - `run`: the run being settled (its projectId, linearIssueId, id and
 *   runType are used).
 * - `issue`: fallback issue record, used when settlement or the later
 *   re-read yields no issue.
 * - `capacity`: classification result; `detail` is logged and emitted,
 *   `retryAtIso` (optional) seeds the backoff deadline.
 * - `failureReason`: stored on the finished run.
 * - `threadId` / `turnId`: optional, recorded on the finished run when set.
 *
 * Returns nothing. The issue-session lease is released on both exit paths.
 */
deferCapacityLimitedRun(params) {
    const { run, capacity } = params;
    // Computed once, outside the lease callback, so the persisted column and
    // the log line below report the same deadline.
    const capacityBackoffUntil = resolveCapacityBackoffUntil(capacity.retryAtIso);
    // Settlement and the replacement wake are both performed inside the
    // held-lease callback; a falsy result is treated below as a lost lease.
    const deferred = this.withHeldLease(run.projectId, run.linearIssueId, (lease) => {
        const settled = settleRun({
            db: this.db,
            run,
            finish: {
                status: "failed",
                ...(params.threadId ? { threadId: params.threadId } : {}),
                ...(params.turnId ? { turnId: params.turnId } : {}),
                failureReason: params.failureReason,
            },
            lease,
            buildIssueUpdate: (record) => ({
                // Spreading undefined is a no-op, so "nothing to refund" is safe.
                ...buildAttemptRefundFields(run.runType, record),
                pendingRunType: null,
                pendingRunContextJson: null,
                // The hold state that routes this work again, resolved from fresh
                // GitHub truth like the interrupted-run recovery path. Never a
                // terminal state: an unresolvable hold keeps the current one.
                factoryState: resolvePostRunFactoryState(record, run, { outcome: "recovered" })
                    ?? (run.runType === "implementation" ? "delegated" : record.factoryState),
                capacityBackoffUntil,
            }),
        });
        const wakeIssue = settled.issue ?? params.issue;
        // Re-enqueue the same run type; the dedupe scope is keyed on this run's id.
        return this.appendWakeEventWithLease(lease, wakeIssue, run.runType, undefined, `capacity:${run.id}`);
    });
    // Progress is cleared whether or not the lease was still held.
    this.linearSync.clearProgress(run.id);
    if (!deferred) {
        this.logger.warn({ runId: run.id, issueId: run.linearIssueId }, "Skipping capacity deferral after losing issue-session lease");
        this.releaseLease(run.projectId, run.linearIssueId);
        return;
    }
    // Re-read the issue so logging, telemetry and the session sync see the
    // post-settlement record; fall back to the caller's copy if it is gone.
    const issue = this.db.issues.getIssue(run.projectId, run.linearIssueId) ?? params.issue;
    this.logger.warn({ issueKey: issue.issueKey, runType: run.runType, detail: capacity.detail, capacityBackoffUntil }, "Codex capacity limit - deferring retry without consuming budget");
    emitTelemetry(this.telemetry, {
        type: "run.capacity_deferred",
        projectId: run.projectId,
        linearIssueId: run.linearIssueId,
        ...(issue.issueKey ? { issueKey: issue.issueKey } : {}),
        runId: run.id,
        runType: run.runType,
        detail: capacity.detail,
        ...(capacity.retryAtIso ? { retryAtIso: capacity.retryAtIso } : {}),
    });
    // Deliberately not awaited: the lease is released without waiting for the sync.
    void this.linearSync.syncSession(issue, { activeRunType: run.runType });
    this.releaseLease(run.projectId, run.linearIssueId);
}
236
306
  // ─── Terminal decisions ──────────────────────────────────────────
237
307
  escalate(params) {
238
308
  const { issue, runType, reason } = params;
@@ -1,5 +1,6 @@
1
1
  import { buildRunFailureActivity } from "./linear-session-reporting.js";
2
- import { extractTurnId, resolveRunCompletionStatus } from "./run-reporting.js";
2
+ import { buildFailedTurnFailureReason, extractTurnErrorMessage, extractTurnId, resolveRunCompletionStatus } from "./run-reporting.js";
3
+ import { classifyCodexFailure } from "./codex-capacity.js";
3
4
  import { resolveFailureFactoryState } from "./reactive-pr-state.js";
4
5
  const WRITER = "run-notification-handler";
5
6
  const DEFAULT_PUBLISH_COMMAND_TIMEOUT_MS = 10 * 60 * 1000;
@@ -76,7 +77,25 @@ export class RunNotificationHandler {
76
77
  const completedTurnId = extractTurnId(notification.params);
77
78
  const status = resolveRunCompletionStatus(notification.params);
78
79
  if (status === "failed") {
79
- const failureReason = "Codex reported the turn completed in a failed state";
80
+ const turnErrorMessage = extractTurnErrorMessage(notification.params);
81
+ const failureReason = buildFailedTurnFailureReason(turnErrorMessage);
82
+ // A capacity outage (usage limit / rate limit / quota) is not evidence
83
+ // about the work: defer the same wake behind a backoff instead of
84
+ // consuming an attempt budget or escalating. Only an actual error
85
+ // message classifies — interrupted turns stay on their own path.
86
+ const capacity = classifyCodexFailure(turnErrorMessage);
87
+ if (capacity.kind === "capacity" && this.options.deferCapacityLimitedRun) {
88
+ this.options.deferCapacityLimitedRun({
89
+ run,
90
+ issue,
91
+ failureReason,
92
+ capacity,
93
+ threadId,
94
+ ...(completedTurnId ? { turnId: completedTurnId } : {}),
95
+ });
96
+ this.activeThreadId = undefined;
97
+ return;
98
+ }
80
99
  const recovered = await this.runFinalizer.recoverFailedImplementationRun({
81
100
  run,
82
101
  issue,
@@ -18,7 +18,7 @@ import { RunReconciler } from "./run-reconciler.js";
18
18
  import { RunWakePlanner } from "./run-wake-planner.js";
19
19
  import { WakeDispatcher } from "./wake-dispatcher.js";
20
20
  import { settleRun } from "./run-settlement.js";
21
- import { getRemainingZombieRecoveryDelayMs } from "./run-budgets.js";
21
+ import { getRemainingCapacityBackoffMs, getRemainingZombieRecoveryDelayMs } from "./run-budgets.js";
22
22
  import { classifyIssue } from "./issue-class.js";
23
23
  import { buildIssueTriageHash, IssueTriageService } from "./issue-triage.js";
24
24
  import { loadConfig } from "./config.js";
@@ -129,8 +129,12 @@ export class RunOrchestrator {
129
129
  this.issueTriage = new IssueTriageService(codex, logger);
130
130
  this.runFinalizer = new RunFinalizer(db, logger, this.linearSync, this.wakeDispatcher, this.leasePorts.withHeldLease, this.leasePorts.releaseLease, (lease, issue, runType, context, dedupeScope) => this.appendWakeEventWithLease(lease, issue, runType, context, dedupeScope), this.recoveryPorts.failRunAndClear, this.runCompletionPolicy, this.completionCheck, feed);
131
131
  this.runLauncher = new RunLauncher(config, db, codex, logger, this.worktreeManager);
132
- this.runNotificationHandler = new RunNotificationHandler(config, db, logger, this.linearSync, this.runFinalizer, this.threadPorts.readThreadWithRetry, this.leasePorts.withHeldLease, this.leasePorts.heartbeatLease, this.leasePorts.releaseLease, feed, { interruptTurn: (options) => codex.interruptTurn(options) });
133
- this.runFailurePolicy = new RunFailurePolicy(db, logger, this.linearSync, this.leasePorts.withHeldLease, this.leasePorts.releaseLease, (lease, issue, runType, context, dedupeScope) => this.appendWakeEventWithLease(lease, issue, runType, context, dedupeScope), this.wakeDispatcher, this.recoveryPorts.restoreIdleWorktree, this.runCompletionPolicy, (projectId) => this.config.projects.find((project) => project.id === projectId), feed);
132
+ this.runNotificationHandler = new RunNotificationHandler(config, db, logger, this.linearSync, this.runFinalizer, this.threadPorts.readThreadWithRetry, this.leasePorts.withHeldLease, this.leasePorts.heartbeatLease, this.leasePorts.releaseLease, feed, {
133
+ interruptTurn: (options) => codex.interruptTurn(options),
134
+ // Lazy: the failure policy is constructed just below.
135
+ deferCapacityLimitedRun: (params) => this.runFailurePolicy.deferCapacityLimitedRun(params),
136
+ });
137
+ this.runFailurePolicy = new RunFailurePolicy(db, logger, this.linearSync, this.leasePorts.withHeldLease, this.leasePorts.releaseLease, (lease, issue, runType, context, dedupeScope) => this.appendWakeEventWithLease(lease, issue, runType, context, dedupeScope), this.wakeDispatcher, this.recoveryPorts.restoreIdleWorktree, this.runCompletionPolicy, (projectId) => this.config.projects.find((project) => project.id === projectId), feed, telemetry);
134
138
  this.runReconciler = new RunReconciler(db, logger, linearProvider, this.linearSync, this.runFailurePolicy, this.runFinalizer, this.leasePorts.withHeldLease, this.leasePorts.releaseLease, this.threadPorts.readThreadWithRetry, (projectId) => this.config.projects.find((project) => project.id === projectId)?.github?.repoFullName, feed, telemetry);
135
139
  this.runWakePlanner = new RunWakePlanner(db, logger);
136
140
  this.linearIssueProjection = new LinearIssueProjectionService(db, linearProvider, logger);
@@ -359,6 +363,16 @@ export class RunOrchestrator {
359
363
  this.releaseIssueSessionLease(item.projectId, item.issueId);
360
364
  return;
361
365
  }
366
+ // Codex capacity outage backoff: a usage-limit/rate-limit failure left a
367
+ // pending wake behind; the wake stays queued (the idle reconciler keeps
368
+ // re-poking it) and the launch waits until the backoff elapses.
369
+ const remainingCapacityDelayMs = getRemainingCapacityBackoffMs(issue.capacityBackoffUntil);
370
+ if (remainingCapacityDelayMs > 0) {
371
+ this.emitRunSkipped(item, "capacity_backoff", issue, { runType, remainingDelayMs: remainingCapacityDelayMs });
372
+ this.logger.debug({ issueKey: issue.issueKey, runType, remainingCapacityDelayMs }, "Deferring run launch until Codex capacity backoff elapses");
373
+ this.releaseIssueSessionLease(item.projectId, item.issueId);
374
+ return;
375
+ }
362
376
  const baseContext = isRequestedChangesRunType(runType)
363
377
  ? await this.runCompletionPolicy.resolveRequestedChangesWakeContext(issue, runType, context)
364
378
  : context;
@@ -488,8 +502,9 @@ export class RunOrchestrator {
488
502
  this.releaseIssueSessionLease(run.projectId, run.linearIssueId);
489
503
  return;
490
504
  }
491
- // Reset zombie recovery counter — this run started successfully
492
- if (issue.zombieRecoveryAttempts > 0) {
505
+ // Reset zombie recovery counter and capacity backoff — this run
506
+ // started successfully
507
+ if (issue.zombieRecoveryAttempts > 0 || issue.capacityBackoffUntil !== undefined) {
493
508
  this.db.issueSessions.commitIssueState({
494
509
  writer: WRITER,
495
510
  lease: { projectId: item.projectId, linearIssueId: item.issueId, leaseId },
@@ -498,6 +513,7 @@ export class RunOrchestrator {
498
513
  linearIssueId: item.issueId,
499
514
  zombieRecoveryAttempts: 0,
500
515
  lastZombieRecoveryAt: null,
516
+ capacityBackoffUntil: null,
501
517
  },
502
518
  });
503
519
  }
@@ -3,6 +3,8 @@ import { TERMINAL_STATES } from "./factory-state.js";
3
3
  import { resolveAuthoritativeLinearStopState } from "./linear-workflow.js";
4
4
  import { buildRunFailureActivity } from "./linear-session-reporting.js";
5
5
  import { getThreadTurns } from "./codex-thread-utils.js";
6
+ import { classifyCodexFailure } from "./codex-capacity.js";
7
+ import { buildFailedTurnFailureReason } from "./run-reporting.js";
6
8
  import { resolveEffectiveActiveRun } from "./effective-active-run.js";
7
9
  import { isThreadMaterializingError } from "./codex-thread-errors.js";
8
10
  import { fetchPullRequestSnapshot } from "./reconcile-pr-fetch.js";
@@ -213,6 +215,25 @@ export class RunReconciler {
213
215
  await this.failurePolicy.handleInterruptedRun(run, effectiveIssue);
214
216
  return;
215
217
  }
218
+ // A failed turn found during reconciliation (the live notification was
219
+ // lost, e.g. across a restart) whose error is a Codex capacity outage:
220
+ // defer the same wake instead of leaving the run dangling or burning a
221
+ // budget. Non-capacity failed turns keep the existing behavior — the
222
+ // notification path owns live settlement.
223
+ if (latestTurn?.status === "failed") {
224
+ const capacity = classifyCodexFailure(latestTurn.error?.message ?? undefined);
225
+ if (capacity.kind === "capacity") {
226
+ this.failurePolicy.deferCapacityLimitedRun({
227
+ run,
228
+ issue: effectiveIssue,
229
+ failureReason: buildFailedTurnFailureReason(latestTurn.error?.message ?? undefined),
230
+ capacity,
231
+ threadId: run.threadId,
232
+ ...(latestTurn.id ? { turnId: latestTurn.id } : {}),
233
+ });
234
+ return;
235
+ }
236
+ }
216
237
  if (latestTurn?.status === "completed") {
217
238
  await this.runFinalizer.finalizeCompletedRun({
218
239
  source: "reconciliation",
@@ -164,6 +164,26 @@ export function extractTurnId(params) {
164
164
  const id = turn.id;
165
165
  return typeof id === "string" ? id : undefined;
166
166
  }
167
export const FAILED_TURN_FAILURE_REASON = "Codex reported the turn completed in a failed state";
// Keeps the generic prefix (existing queries and tests key on it) and appends
// the real Codex error so a capacity outage is distinguishable from a genuine
// failure in the persisted run record.
/**
 * Builds the failure reason persisted for a turn that completed as failed.
 *
 * @param {unknown} errorMessage Error text reported by Codex. Anything that
 *   is not a non-blank string yields the generic reason instead of throwing.
 * @returns {string} The generic reason, followed by ": <trimmed message>"
 *   when a message is available.
 */
export function buildFailedTurnFailureReason(errorMessage) {
    // Type-check first: `errorMessage?.trim()` throws on a truthy non-string.
    const trimmed = typeof errorMessage === "string" ? errorMessage.trim() : "";
    return trimmed ? `${FAILED_TURN_FAILURE_REASON}: ${trimmed}` : FAILED_TURN_FAILURE_REASON;
}
175
/**
 * Reads `params.turn.error.message` from a turn notification payload.
 *
 * @param {unknown} params Notification params; a missing or null payload is
 *   tolerated and yields undefined.
 * @returns {string | undefined} The message exactly as reported (not
 *   trimmed) when it is a non-blank string; otherwise undefined.
 */
export function extractTurnErrorMessage(params) {
    // Optional chaining: a null/undefined payload must not throw here.
    const turn = params?.turn;
    if (!turn || typeof turn !== "object") {
        return undefined;
    }
    const error = turn.error;
    if (!error || typeof error !== "object") {
        return undefined;
    }
    const message = error.message;
    return typeof message === "string" && message.trim() ? message : undefined;
}
167
187
  export function buildPendingMaterializationThread(stageRun, error) {
168
188
  return {
169
189
  id: stageRun.threadId ?? "pending-thread",
package/dist/telemetry.js CHANGED
@@ -78,6 +78,19 @@ export class OperatorFeedTelemetrySink {
78
78
  };
79
79
  }
80
80
  return undefined;
81
+ case "run.capacity_deferred":
82
+ return {
83
+ level: "warn",
84
+ kind: "workflow",
85
+ ...(event.issueKey ? { issueKey: event.issueKey } : {}),
86
+ ...(event.projectId ? { projectId: event.projectId } : {}),
87
+ ...(event.runType ? { stage: event.runType } : {}),
88
+ status: "capacity_deferred",
89
+ summary: event.retryAtIso
90
+ ? `Codex capacity limit; retrying ${event.runType ?? "run"} at ${event.retryAtIso}`
91
+ : `Codex capacity limit; retrying ${event.runType ?? "run"} after backoff`,
92
+ detail: event.detail,
93
+ };
81
94
  case "state.write_conflict":
82
95
  return {
83
96
  level: "warn",
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "patchrelay",
3
- "version": "0.80.2",
3
+ "version": "0.82.0",
4
4
  "license": "MIT",
5
5
  "type": "module",
6
6
  "repository": {