@bridge_gpt/mcp-server 0.2.9 → 0.2.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  /**
2
2
  * Deterministic observed→desired reconciliation for the Epic Supervisor
3
- * (BAPI-408).
3
+ * (BAPI-408, BAPI-436).
4
4
  *
5
5
  * Executes five steps in order:
6
6
  * 1. Fold terminal signals into Postgres via CAS
@@ -9,10 +9,15 @@
9
9
  * 4. Action approved merges via C6 delegation
10
10
  * 5. Schedule post-action wait hooks
11
11
  *
12
+ * Dispatch is purely merge-gated (BAPI-436): dependents are dispatched only
13
+ * after their predecessor reaches "done", which requires a merge.succeeded
14
+ * signal. gate.met and run.stopped fold to the intermediate "ready_for_review"
15
+ * state — dependents do NOT dispatch on these signals.
16
+ *
12
17
  * All durable mutations go through injected seams so the logic is testable
13
18
  * without real network, ledger, or terminal access.
14
19
  */
15
- import { computeReadySet } from "./epic-state.js";
20
+ import { computeReadySet, decideRemediation } from "./epic-state.js";
16
21
  import { extractMergeActionIdentityFromGateEvent } from "./merge-ledger.js";
17
22
  // ---------------------------------------------------------------------------
18
23
  // reconcileEpic
@@ -21,7 +26,7 @@ import { extractMergeActionIdentityFromGateEvent } from "./merge-ledger.js";
21
26
  * Execute the deterministic observed→desired reconciliation pass. All I/O is
22
27
  * behind injected seams; the ready-set is computed by pure code (no LLM).
23
28
  */
24
- export async function reconcileEpic(access, observed, plan, deps) {
29
+ export async function reconcileEpic(access, observed, plan, deps, supervisorConfig) {
25
30
  const result = {
26
31
  signals_folded: 0,
27
32
  dispatched: 0,
@@ -43,6 +48,22 @@ export async function reconcileEpic(access, observed, plan, deps) {
43
48
  if (casResult.ok) {
44
49
  result.signals_folded += 1;
45
50
  deps.log(`[epic-reconcile] folded ${signal.signal_type} for ${signal.ticket_key} → ${signal.next_status}`);
51
+ // BAPI-442: fire teardown + Jira transition strictly after merge.succeeded
52
+ // CAS → done. Both are fail-open: errors are logged and never abort the pass.
53
+ if (signal.signal_type === "merge.succeeded") {
54
+ if (supervisorConfig?.teardown_enabled && deps.teardownSeam) {
55
+ await deps.teardownSeam(observed.epic_key, signal.ticket_key).catch((e) => {
56
+ const safeMsg = e instanceof Error ? e.constructor.name : "teardown error";
57
+ deps.log(`[epic-reconcile] teardown error for ${signal.ticket_key}: ${safeMsg}`);
58
+ });
59
+ }
60
+ if (deps.jiraTransitionSeam) {
61
+ await deps.jiraTransitionSeam(observed.epic_key, signal.ticket_key).catch((e) => {
62
+ const safeMsg = e instanceof Error ? e.constructor.name : "jira error";
63
+ deps.log(`[epic-reconcile] jira-transition error for ${signal.ticket_key}: ${safeMsg}`);
64
+ });
65
+ }
66
+ }
46
67
  }
47
68
  else {
48
69
  // CAS conflict: another tick already advanced this ticket — non-fatal
@@ -61,8 +82,24 @@ export async function reconcileEpic(access, observed, plan, deps) {
61
82
  }
62
83
  // Step 2: Compute the ready-set (pure — never calls LLM)
63
84
  const readySet = computeReadySet(plan, observed.ticket_statuses);
64
- // Step 3: Dispatch each ready ticket idempotently
85
+ // Step 3: Dispatch each ready ticket idempotently.
86
+ //
87
+ // NOTE (BAPI-442): an earlier draft performed a synchronous two-phase
88
+ // planned → ready_for_review → ready spec re-review here — it spawned
89
+ // /review-ticket and then dispatched implementation in the SAME tick. That did
90
+ // not actually gate implementation on the review (the verdict was never
91
+ // consulted and the review run_id was discarded), and it overloaded the
92
+ // BAPI-436 "ready_for_review" (awaiting-merge) state, leaving a liveness gap
93
+ // if the conductor crashed between the CAS and the dispatch. That path has
94
+ // been removed. `auto_rereview_enabled` is reserved until real review-gating —
95
+ // a distinct `reviewing` status, review-run correlation, multi-tick re-entry,
96
+ // and a spec-review verdict signal — is built (BAPI-445). Until then a ready
97
+ // ticket dispatches implementation directly regardless of the flag.
65
98
  for (const ticketKey of readySet) {
99
+ if (supervisorConfig?.auto_rereview_enabled) {
100
+ deps.log(`[epic-reconcile] auto_rereview_enabled is set but review-gating is not yet ` +
101
+ `implemented (BAPI-445); dispatching ${ticketKey} directly`);
102
+ }
66
103
  let claimResult;
67
104
  try {
68
105
  claimResult = await deps.claimDispatchKey(observed.epic_key, ticketKey, observed.plan_version);
@@ -118,6 +155,86 @@ export async function reconcileEpic(access, observed, plan, deps) {
118
155
  result.warnings.push(`correlate-failed for ${ticketKey}: ${safeMsg}`);
119
156
  }
120
157
  }
158
+ // Step 3.5: Remediation pass (BAPI-441) — re-act on blocked tickets under
159
+ // budget. Keyed off the folded "blocked" status + per-ticket counters, NOT a
160
+ // computeReadySet change (the ready-set still returns only planned tickets).
161
+ // Skipped entirely unless the remediation seams + supervisorConfig are wired.
162
+ const remediationWired = supervisorConfig !== undefined &&
163
+ deps.readWorkerLiveness !== undefined &&
164
+ deps.remediateCas !== undefined &&
165
+ deps.sendNudge !== undefined &&
166
+ deps.resumeDispatch !== undefined;
167
+ if (remediationWired) {
168
+ const cfg = supervisorConfig;
169
+ const readWorkerLiveness = deps.readWorkerLiveness;
170
+ const sendNudge = deps.sendNudge;
171
+ const resumeDispatch = deps.resumeDispatch;
172
+ const remediateCas = deps.remediateCas;
173
+ for (const [ticketKey, status] of observed.ticket_statuses) {
174
+ if (status !== "blocked")
175
+ continue;
176
+ // Per-ticket try/catch so a single ticket's failure never aborts the pass.
177
+ try {
178
+ const counters = observed.ticket_remediation_counters?.get(ticketKey) ?? {
179
+ attempts: 0,
180
+ no_progress: 0,
181
+ };
182
+ const liveness = await readWorkerLiveness(observed.epic_key, ticketKey);
183
+ const decision = decideRemediation(counters.attempts, counters.no_progress, liveness.alive, cfg);
184
+ if (decision === "escalate") {
185
+ await escalate(observed.epic_key, `remediation-budget-exhausted:${ticketKey}`);
186
+ deps.log(`[epic-reconcile] remediation escalate ${ticketKey} ` +
187
+ `(attempts=${counters.attempts} no_progress=${counters.no_progress})`);
188
+ continue;
189
+ }
190
+ // The attempt being recorded is the next one (1-based).
191
+ const attempt = counters.attempts + 1;
192
+ const attemptKind = decision;
193
+ // The folding reason frames the nudge (message type + digest). Default to
194
+ // the review path when the ledger no longer carries the blocking event.
195
+ const reason = observed.ticket_blocked_reasons?.get(ticketKey) ?? "review.changes_requested";
196
+ // A nudge needs a worker to address it to. The liveness scan already
197
+ // resolved the worker id from the same heartbeat that proved the worker
198
+ // alive; if it is missing we cannot relay, so skip BEFORE recording an
199
+ // attempt — otherwise the CAS would burn a budget unit with nothing sent.
200
+ if (decision === "nudge" && !liveness.workerId) {
201
+ result.warnings.push(`remediation nudge skipped for ${ticketKey}: alive worker has no worker_id`);
202
+ continue;
203
+ }
204
+ // Record the attempt durably FIRST. The remediate endpoint builds the
205
+ // (backend-redacted) review digest for a nudge and returns it, and is
206
+ // idempotent (a 409 replay returns conflict, not throw). An unexpected
207
+ // remediate failure is absorbed as a per-ticket warning so the rest of
208
+ // the pass still runs (crash-replay safe).
209
+ let casOutcome;
210
+ try {
211
+ casOutcome = await remediateCas(observed.epic_key, ticketKey, attemptKind, reason);
212
+ }
213
+ catch (err) {
214
+ const safeMsg = err instanceof Error ? err.constructor.name : "remediate error";
215
+ result.warnings.push(`remediate-cas-failed for ${ticketKey} (${attemptKind}): ${safeMsg}`);
216
+ continue;
217
+ }
218
+ if (casOutcome.conflict) {
219
+ // Idempotency replay: the attempt was already recorded (and acted on)
220
+ // on a prior tick. Do not re-act — the crash-replay self-heals here.
221
+ result.warnings.push(`remediation replay swallowed for ${ticketKey} (${attemptKind})`);
222
+ continue;
223
+ }
224
+ if (decision === "nudge") {
225
+ await sendNudge(observed.epic_key, ticketKey, attempt, casOutcome.reviewDigest, casOutcome.truncated, reason, liveness.workerId);
226
+ }
227
+ else {
228
+ await resumeDispatch(observed.epic_key, ticketKey, attempt);
229
+ }
230
+ deps.log(`[epic-reconcile] remediation ${decision} ${ticketKey} attempt=${attempt}`);
231
+ }
232
+ catch (err) {
233
+ const safeMsg = err instanceof Error ? err.constructor.name : "remediation error";
234
+ result.warnings.push(`remediation-error for ${ticketKey}: ${safeMsg}`);
235
+ }
236
+ }
237
+ }
121
238
  // Step 4: Action approved merges via C6 delegation
122
239
  for (const event of observed.pending_merge_events) {
123
240
  const identity = extractMergeActionIdentityFromGateEvent(event);
@@ -17,16 +17,20 @@
17
17
  * All durable mutations go through the injectable seams that call the sibling
18
18
  * Epic Run TS client (already available in bridge-api-client.ts as of BAPI-407).
19
19
  */
20
- import { resolveConductorBridgeApiAccess, claimEpicSupervisionLease, fetchEpicRunState, advanceEpicTicketStatus, createEpicTicketStatus, recordEpicDispatch, transitionEpicDispatch, fetchParseStatus, triggerRepositoryParse, getEpicPlan, buildEpicDispatchKey, } from "./bridge-api-client.js";
20
+ import { spawnSync } from "child_process";
21
+ import { resolveConductorBridgeApiAccess, claimEpicSupervisionLease, fetchEpicRunState, advanceEpicTicketStatus, createEpicTicketStatus, recordEpicDispatch, transitionEpicDispatch, fetchParseStatus, triggerRepositoryParse, getEpicPlan, buildEpicDispatchKey, fetchEffectiveSupervisorConfig, fetchEffectiveSupervisorSetup, fetchPrReviewStatus, remediateEpicTicket, deletePullRequestBranch, transitionJiraStatus, } from "./bridge-api-client.js";
21
22
  import { processGateMetMerge } from "./supervisor-merge.js";
22
- import { rebuildObservedState, } from "./epic-state.js";
23
+ import { rebuildObservedState, extractWorkerLiveness, } from "./epic-state.js";
23
24
  import { reconcileEpic } from "./epic-reconcile.js";
25
+ import { buildSupervisorRemediationWorkerMessage } from "./supervisor-message-relay.js";
26
+ import { sendWorkerMessage } from "./store.js";
24
27
  import { hashPlan } from "./plan.js";
25
- import { pollConductorEvents } from "./store.js";
28
+ import { pollConductorEvents, POLL_LIMIT_MAX } from "./store.js";
26
29
  import { dispatchSupervisorNotification } from "./supervisor-notification.js";
27
30
  import { makeSupervisorIdempotencyKey } from "./supervisor-ledger.js";
28
31
  import { createDefaultStartTicketsDeps, orchestrateStartTickets } from "../start-tickets.js";
29
32
  import { orchestrateReviewTickets } from "../review-tickets.js";
33
+ import { createStartTicketsConductorContext, provisionConductorHooksForRows, emitStartTicketsRunStarted, } from "../start-tickets-conductor.js";
30
34
  // ---------------------------------------------------------------------------
31
35
  // Constants
32
36
  // ---------------------------------------------------------------------------
@@ -46,7 +50,7 @@ function defaultLeaseOwner() {
46
50
  async function defaultEscalateOnce(epicKey, reason) {
47
51
  process.stderr.write(`[epic-tick] ESCALATION epic=${epicKey} reason=${reason}\n`);
48
52
  }
49
- async function defaultDispatchSeam(_epicKey, ticketKey) {
53
+ async function defaultDispatchSeam(_epicKey, ticketKey, _attempt = 0) {
50
54
  throw new Error(`dispatch seam not wired for ticket ${ticketKey}`);
51
55
  }
52
56
  async function defaultPostActionWaitSeam(_epicKey, _ticketKey) {
@@ -73,7 +77,8 @@ export async function runEpicTick(options, deps = {}) {
73
77
  const dispatchSeam = deps.dispatchSeam ?? defaultDispatchSeam;
74
78
  const processMergeFn = deps.processMerge ?? processGateMetMerge;
75
79
  const postActionWaitSeam = deps.postActionWaitSeam ?? defaultPostActionWaitSeam;
76
- const fetchLocalEvents = deps.fetchLocalEvents ?? ((_key) => []);
80
+ const fetchLocalEvents = deps.fetchLocalEvents ??
81
+ ((_key, _runIds) => []);
77
82
  const resolveBridgeAccess = deps.resolveBridgeAccess ?? resolveConductorBridgeApiAccess;
78
83
  const claimLeaseFn = deps.claimLease ?? claimEpicSupervisionLease;
79
84
  const fetchEpicStateFn = deps.fetchEpicState ?? fetchEpicRunState;
@@ -190,7 +195,15 @@ export async function runEpicTick(options, deps = {}) {
190
195
  worker_count: 0,
191
196
  };
192
197
  }
193
- const localEvents = fetchLocalEvents(epic_key);
198
+ // Scope the local-ledger read to this epic's dispatched run_ids. The shared
199
+ // ~/.config/bridge/events.db ledger accumulates events for every epic/worker
200
+ // on the machine; rebuildObservedState only folds signals whose run_id maps
201
+ // to one of these dispatches, so scoping the read here avoids loading the
202
+ // entire (up to 50K-row) ledger on every tick.
203
+ const dispatchedRunIds = epicRunState.dispatches
204
+ .map((d) => d.run_id)
205
+ .filter((rid) => typeof rid === "string" && rid.length > 0);
206
+ const localEvents = fetchLocalEvents(epic_key, dispatchedRunIds);
194
207
  const observed = rebuildObservedState(epicRunState, localEvents, nowFn());
195
208
  workerCount = [...observed.ticket_statuses.values()].filter((s) => ACTIVE_WORKER_STATUSES.has(s)).length;
196
209
  // Step 3.5: Run post-action waits (parse-after-merge)
@@ -330,6 +343,82 @@ export async function runEpicTick(options, deps = {}) {
330
343
  }
331
344
  // Step 5: Reconcile observed→desired
332
345
  if (plan !== null) {
346
+ // BAPI-441: fetch the effective supervisor config (budget ceilings +
347
+ // liveness window) and setup (pr_bindings) once. Fail-open: if the config
348
+ // read fails, remediationConfig stays undefined and reconcile skips the
349
+ // remediation pass entirely (dispatch/merge steps unaffected).
350
+ let remediationConfig;
351
+ let livenessWindowSeconds = 120;
352
+ let prBindings = {};
353
+ try {
354
+ const cfg = await fetchEffectiveSupervisorConfig(access, epic_key);
355
+ remediationConfig = {
356
+ max_remediation_attempts: cfg.max_remediation_attempts,
357
+ max_remediation_no_progress_attempts: cfg.max_remediation_no_progress_attempts,
358
+ auto_rereview_enabled: cfg.auto_rereview_enabled ?? false,
359
+ teardown_enabled: cfg.teardown_enabled ?? false,
360
+ };
361
+ livenessWindowSeconds = cfg.worker_liveness_window_seconds;
362
+ }
363
+ catch (err) {
364
+ const safeMsg = err instanceof Error ? err.constructor.name : "config error";
365
+ errorLog(`[epic-tick] supervisor-config fetch failed (${safeMsg}); skipping remediation for epic=${epic_key}`);
366
+ }
367
+ if (remediationConfig) {
368
+ try {
369
+ const setup = await fetchEffectiveSupervisorSetup(access, epic_key);
370
+ if (setup.pr_bindings && typeof setup.pr_bindings === "object") {
371
+ prBindings = setup.pr_bindings;
372
+ }
373
+ }
374
+ catch (err) {
375
+ const safeMsg = err instanceof Error ? err.constructor.name : "setup error";
376
+ errorLog(`[epic-tick] supervisor-setup fetch failed (${safeMsg}); remediation PR resolution degraded for epic=${epic_key}`);
377
+ }
378
+ }
379
+ // ticket_key → dispatched run_id (the run whose heartbeat liveness reads).
380
+ // Seed from ticket_status.dispatch_run_id, then prefer the most-recent
381
+ // dispatch-ledger run_id per ticket so that after a remediation re-dispatch
382
+ // (a new attempt-scoped epic_dispatch row correlated with the fresh run_id)
383
+ // liveness tracks the NEW worker rather than the stale original.
384
+ const ticketRunIdMap = new Map();
385
+ for (const ts of epicRunState.ticket_statuses) {
386
+ if (ts.dispatch_run_id)
387
+ ticketRunIdMap.set(ts.ticket_key, ts.dispatch_run_id);
388
+ }
389
+ const latestDispatchByTicket = new Map();
390
+ for (const d of epicRunState.dispatches) {
391
+ if (!d.run_id)
392
+ continue;
393
+ const updatedAt = new Date(d.updated_at).getTime();
394
+ const prev = latestDispatchByTicket.get(d.ticket_key);
395
+ if (!prev || updatedAt >= prev.updatedAt) {
396
+ latestDispatchByTicket.set(d.ticket_key, { runId: d.run_id, updatedAt });
397
+ }
398
+ }
399
+ for (const [tk, info] of latestDispatchByTicket) {
400
+ ticketRunIdMap.set(tk, info.runId);
401
+ }
402
+ const resolvePrNumber = (ticketKey) => {
403
+ const raw = prBindings[ticketKey];
404
+ if (typeof raw === "number" && Number.isInteger(raw) && raw >= 1)
405
+ return raw;
406
+ if (raw && typeof raw === "object") {
407
+ const obj = raw;
408
+ const pr = obj.pr_number ?? obj.pr;
409
+ if (typeof pr === "number" && Number.isInteger(pr) && pr >= 1)
410
+ return pr;
411
+ }
412
+ return null;
413
+ };
414
+ const maxSeqForRun = (runId) => {
415
+ let maxSeq = 0;
416
+ for (const ev of localEvents) {
417
+ if (ev.run_id === runId && ev.seq > maxSeq)
418
+ maxSeq = ev.seq;
419
+ }
420
+ return maxSeq;
421
+ };
333
422
  const reconcileDeps = {
334
423
  casTicketStatus: async (ek, tk, rowVersion, nextStatus, planVersion) => advanceEpicTicketStatus(access, {
335
424
  epicKey: ek,
@@ -360,13 +449,146 @@ export async function runEpicTick(options, deps = {}) {
360
449
  runId,
361
450
  });
362
451
  },
363
- dispatchSeam: async (ek, tk) => dispatchSeam(ek, tk),
452
+ dispatchSeam: async (ek, tk, attempt = 0) => dispatchSeam(ek, tk, attempt),
364
453
  processMerge: async (acc, event) => processMergeFn(acc, event),
365
454
  postActionWaitSeam: async (ek, tk) => postActionWaitSeam(ek, tk),
366
455
  escalateOnce: async (ek, reason) => escalateOnce(ek, reason),
367
456
  log,
457
+ // BAPI-442: teardown and Jira-transition seams (fail-open, optional).
458
+ teardownSeam: async (_ek, tk) => {
459
+ // Resolve the PR number for the ticket.
460
+ const prNumber = resolvePrNumber(tk);
461
+ if (prNumber === null) {
462
+ errorLog(`[epic-tick] teardown: no PR binding for ${tk}; skipping`);
463
+ return;
464
+ }
465
+ // Fetch setup to get the expected head SHA if available; fall back to empty.
466
+ let expectedSha = "";
467
+ try {
468
+ const setup = await fetchEffectiveSupervisorSetup(access, epic_key);
469
+ const binding = (setup.pr_bindings ?? {})[tk];
470
+ if (binding && typeof binding === "object") {
471
+ const b = binding;
472
+ if (typeof b.head_sha === "string")
473
+ expectedSha = b.head_sha;
474
+ }
475
+ }
476
+ catch {
477
+ // Best-effort; proceed with empty SHA (endpoint still deletes by PR number)
478
+ }
479
+ try {
480
+ await deletePullRequestBranch(access, prNumber, expectedSha || "");
481
+ log(`[epic-tick] teardown: branch deleted for PR #${prNumber} (ticket=${tk})`);
482
+ }
483
+ catch (err) {
484
+ const safeMsg = err instanceof Error ? err.constructor.name : "error";
485
+ errorLog(`[epic-tick] teardown: branch-delete failed (${safeMsg}) for ${tk}`);
486
+ }
487
+ // Remove local worktree idempotently; errors are benign.
488
+ try {
489
+ spawnSync("git", ["worktree", "remove", "--force", tk], { stdio: "ignore" });
490
+ log(`[epic-tick] teardown: worktree removed for ${tk}`);
491
+ }
492
+ catch {
493
+ // Already removed or never created — idempotent skip.
494
+ }
495
+ },
496
+ jiraTransitionSeam: async (_ek, tk) => {
497
+ try {
498
+ const result = await transitionJiraStatus(access, tk, "auto");
499
+ if (result.status === "skipped") {
500
+ log(`[epic-tick] jira-transition: no matching transition for ${tk} (skipped)`);
501
+ }
502
+ else {
503
+ log(`[epic-tick] jira-transition: transitioned ${tk}`);
504
+ }
505
+ }
506
+ catch (err) {
507
+ const safeMsg = err instanceof Error ? err.constructor.name : "error";
508
+ errorLog(`[epic-tick] jira-transition failed (${safeMsg}) for ${tk}`);
509
+ }
510
+ },
511
+ // BAPI-441 remediation seams.
512
+ readWorkerLiveness: async (_ek, tk) => {
513
+ const runId = ticketRunIdMap.get(tk);
514
+ if (!runId)
515
+ return { alive: false, workerId: null };
516
+ return extractWorkerLiveness(localEvents, runId, nowFn(), livenessWindowSeconds);
517
+ },
518
+ remediateCas: async (ek, tk, attemptKind, reason) => {
519
+ const prNumber = resolvePrNumber(tk);
520
+ if (prNumber === null) {
521
+ throw new Error(`remediate: no PR binding for ${tk}`);
522
+ }
523
+ const reviewStatus = (await fetchPrReviewStatus(access, prNumber));
524
+ const headSha = reviewStatus?.detail?.head_sha ?? null;
525
+ if (!headSha) {
526
+ throw new Error(`remediate: no head_sha for PR ${prNumber}`);
527
+ }
528
+ const rowVersion = observed.ticket_row_versions.get(tk) ?? 0;
529
+ // Deterministic block-state idempotency key: stable for a given durable
530
+ // row_version so a same-tick retry replays (409, swallowed); advances
531
+ // with the next attempt.
532
+ const idempotencyKey = `remediate:${ek}:${tk}:${rowVersion}`;
533
+ const result = await remediateEpicTicket(access, {
534
+ pr_number: prNumber,
535
+ epic_run_id: ek,
536
+ ticket_key: tk,
537
+ expected_row_version: rowVersion,
538
+ head_sha: headSha,
539
+ idempotency_key: idempotencyKey,
540
+ attempt_kind: attemptKind,
541
+ reason,
542
+ });
543
+ if (result.conflict) {
544
+ return { conflict: true, reviewDigest: null, truncated: false };
545
+ }
546
+ return {
547
+ conflict: false,
548
+ reviewDigest: result.response.review_digest,
549
+ truncated: result.response.truncated,
550
+ };
551
+ },
552
+ sendNudge: async (_ek, tk, attempt, reviewDigest, truncated, reason, workerId) => {
553
+ const runId = ticketRunIdMap.get(tk);
554
+ if (!runId)
555
+ throw new Error(`nudge: no run_id for ${tk}`);
556
+ // workerId is resolved by readWorkerLiveness from the same heartbeat
557
+ // scan and null-checked by the reconcile pass before remediateCas, so
558
+ // the two seams stay consistent and no budget is burned on a missing id.
559
+ const input = buildSupervisorRemediationWorkerMessage({
560
+ runId,
561
+ workerId,
562
+ ticketKey: tk,
563
+ reason,
564
+ attempt,
565
+ reviewDigest: reviewDigest ?? "",
566
+ truncated,
567
+ causeSeq: maxSeqForRun(runId),
568
+ });
569
+ sendWorkerMessage(input);
570
+ },
571
+ resumeDispatch: async (ek, tk, attempt) => {
572
+ // Claim an attempt-scoped pending dispatch row FIRST so the spawn's
573
+ // run_spawned correlation (inside orchestrateStartTickets) has a row to
574
+ // transition and the re-dispatched run_id is durably recorded against
575
+ // the ticket. The claim is idempotent (lease-held/already-spawned are
576
+ // returned, not thrown).
577
+ await recordEpicDispatch(access, {
578
+ epicKey: ek,
579
+ ticketKey: tk,
580
+ planVersion: plan.plan_version,
581
+ leaseOwner: lease_owner,
582
+ ttlSeconds: DEFAULT_DISPATCH_KEY_TTL_SECONDS,
583
+ attempt,
584
+ });
585
+ // dispatchSeam returns the new run_id; orchestrate correlates it into
586
+ // the attempt-scoped epic_dispatch row, so the next tick's liveness map
587
+ // (built from the dispatch ledger) tracks the fresh worker.
588
+ await dispatchSeam(ek, tk, attempt);
589
+ },
368
590
  };
369
- const reconcileResult = await reconcileEpic(access, observed, plan, reconcileDeps);
591
+ const reconcileResult = await reconcileEpic(access, observed, plan, reconcileDeps, remediationConfig);
370
592
  log(`[epic-tick] reconcile done: epic=${epic_key} ` +
371
593
  `signals=${reconcileResult.signals_folded} ` +
372
594
  `dispatched=${reconcileResult.dispatched} ` +
@@ -494,7 +716,7 @@ export async function buildProductionEpicRuntimeDeps(epicKey) {
494
716
  const planHash = hashPlan(dag);
495
717
  return { plan_hash: planHash, plan_version: response.plan_version, tickets };
496
718
  };
497
- const dispatchSeam = async (ek, tk) => {
719
+ const dispatchSeam = async (ek, tk, attempt = 0) => {
498
720
  // Guard: fetchPlan must run before dispatchSeam so cachedPlanVersion and
499
721
  // automationMap are populated. A zero version means the factory seam was
500
722
  // wired but fetchPlan was never called — fail explicitly rather than silently
@@ -502,6 +724,12 @@ export async function buildProductionEpicRuntimeDeps(epicKey) {
502
724
  if (cachedPlanVersion === 0) {
503
725
  throw new Error(`dispatchSeam called before fetchPlan for epic ${ek} ticket ${tk}; cachedPlanVersion is 0`);
504
726
  }
727
+ // BAPI-441: a remediation re-dispatch (attempt > 0) reuses the existing
728
+ // branch/worktree (resume mode) and claims an attempt-scoped dispatch key so
729
+ // it is not deduped against the original epic dispatch.
730
+ const isResume = attempt > 0;
731
+ // The dispatch kind comes from the plan node's automation (start-tickets or
732
+ // review-tickets); default to start-tickets when unspecified.
505
733
  const kind = automationMap.get(tk) ?? "start-tickets";
506
734
  // Operator dry-run: when BAPI_CONDUCTOR_DISPATCH_DRY_RUN=1, dispatch resolves
507
735
  // the spawn command + model routing but opens NO terminal, creates NO worktree,
@@ -513,7 +741,7 @@ export async function buildProductionEpicRuntimeDeps(epicKey) {
513
741
  epic_key: ek,
514
742
  epic_run_id: ek,
515
743
  plan_version: cachedPlanVersion,
516
- dispatch_key: buildEpicDispatchKey(ek, tk, cachedPlanVersion),
744
+ dispatch_key: buildEpicDispatchKey(ek, tk, cachedPlanVersion, attempt),
517
745
  };
518
746
  const deps = createDefaultStartTicketsDeps();
519
747
  let runId;
@@ -533,6 +761,14 @@ export async function buildProductionEpicRuntimeDeps(epicKey) {
533
761
  runId = result.rows[0]?.runId;
534
762
  }
535
763
  else {
764
+ // BAPI-409 / IH-1: epic dispatch (dispatch_key set) requires the conductor
765
+ // stage to mint a run_id and provision per-worker env/supervisor context.
766
+ // `conductorEnabled: true` alone is necessary but not sufficient — the
767
+ // BAPI-409 guard in orchestrateStartTickets fails closed unless the
768
+ // createConductorContext seam (and its siblings) is injected via the third
769
+ // `overrides` argument, exactly as the packaged start-tickets CLI does. The
770
+ // orchestrator short-circuits on dryRun before using them, so passing them
771
+ // unconditionally is safe; dispatchDryRun preserves the operator dry-run seam.
536
772
  const result = await orchestrateStartTickets(deps, {
537
773
  keys: [tk],
538
774
  epic: identity,
@@ -543,11 +779,13 @@ export async function buildProductionEpicRuntimeDeps(epicKey) {
543
779
  refreshMain: false,
544
780
  branchOverrides: {},
545
781
  baseBranch: "main",
546
- // Epic dispatch always uses the Conductor system (epic-tick coordinates
547
- // workers through it), so the message-relay instruction must be present
548
- // on epic-dispatched worker prompts regardless of the user-facing
549
- // `--conductor` default.
550
782
  conductorEnabled: true,
783
+ // BAPI-441: re-dispatch reuses the existing branch/worktree.
784
+ resumeMode: isResume,
785
+ }, {
786
+ createConductorContext: createStartTicketsConductorContext,
787
+ provisionConductorHooksForRows,
788
+ emitStartTicketsRunStarted,
551
789
  });
552
790
  if (!result.ok) {
553
791
  throw new Error(`start-tickets dispatch failed: ${result.error}`);
@@ -564,11 +802,50 @@ export async function buildProductionEpicRuntimeDeps(epicKey) {
564
802
  }
565
803
  return runId;
566
804
  };
567
- const fetchLocalEvents = (_ek) => {
805
+ const fetchLocalEvents = (_ek, runIds) => {
568
806
  // Workers and the epic-tick process share the same local SQLite ledger
569
807
  // (~/.config/bridge/events.db). pollConductorEvents opens it read-only.
570
- const result = pollConductorEvents({ data_mode: "full" });
571
- return result.events;
808
+ //
809
+ // Scope the read to this epic's dispatched run_ids. The shared ledger holds
810
+ // events for every epic/worker on the machine (up to RETENTION_MAX_ROWS),
811
+ // but rebuildObservedState only folds signals whose run_id maps to one of
812
+ // these dispatches — so the run_ids filter pushes that scoping into SQL and
813
+ // avoids loading sibling-epic events on every tick. With no known run_ids
814
+ // (first tick before any dispatch) there is nothing to fold, so skip the
815
+ // read entirely.
816
+ //
817
+ // pollConductorEvents returns at most POLL_LIMIT_MAX events per call
818
+ // (default 100, capped at 1000) starting at `since_seq`. rebuildObservedState
819
+ // folds terminal signals (gate.met/run.stopped/merge.succeeded/ci.failed)
820
+ // ONLY from the events it is handed, so a single capped page silently hides
821
+ // recent terminal signals once the (scoped) result grows past one page —
822
+ // done-detection then breaks. Drain the COMPLETE history by paginating on the
823
+ // `next_seq` cursor until a short (or empty) page signals the tail.
824
+ if (runIds !== undefined && runIds.length === 0) {
825
+ return [];
826
+ }
827
+ const runIdsFilter = runIds && runIds.length > 0 ? { run_ids: [...runIds] } : undefined;
828
+ const events = [];
829
+ let sinceSeq = 1;
830
+ // Retention caps (retention_days/retention_max_rows) bound the ledger, but
831
+ // cap total iterations defensively against a non-advancing cursor.
832
+ const MAX_PAGES = 10_000;
833
+ for (let page = 0; page < MAX_PAGES; page += 1) {
834
+ const result = pollConductorEvents({
835
+ data_mode: "full",
836
+ since_seq: sinceSeq,
837
+ limit: POLL_LIMIT_MAX,
838
+ filter: runIdsFilter,
839
+ });
840
+ events.push(...result.events);
841
+ // Stop on a short/empty page (no more rows) or a cursor that fails to
842
+ // advance (guards against an infinite loop).
843
+ if (result.count < POLL_LIMIT_MAX || result.next_seq <= sinceSeq) {
844
+ break;
845
+ }
846
+ sinceSeq = result.next_seq;
847
+ }
848
+ return events;
572
849
  };
573
850
  const escalateOnce = async (ek, reason) => {
574
851
  const candidate = {
@@ -623,5 +900,9 @@ export async function buildProductionEpicRuntimeDeps(epicKey) {
623
900
  fetchLocalEvents,
624
901
  escalateOnce,
625
902
  postActionWaitSeam,
903
+ // BAPI-442 seams are wired at the reconcileDeps level inside runEpicTick
904
+ // (they need the per-tick `access` and `prBindings` closure). The factory
905
+ // returns the dispatchSeam with attempt-scoped resume support; the other two seams are
906
+ // defined inline in the reconcileDeps object in runEpicTick.
626
907
  };
627
908
  }