@bridge_gpt/mcp-server 0.2.6 → 0.2.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -17,16 +17,20 @@
17
17
  * All durable mutations go through the injectable seams that call the sibling
18
18
  * Epic Run TS client (already available in bridge-api-client.ts as of BAPI-407).
19
19
  */
20
- import { resolveConductorBridgeApiAccess, claimEpicSupervisionLease, fetchEpicRunState, advanceEpicTicketStatus, createEpicTicketStatus, recordEpicDispatch, transitionEpicDispatch, fetchParseStatus, triggerRepositoryParse, getEpicPlan, buildEpicDispatchKey, } from "./bridge-api-client.js";
20
+ import { spawnSync } from "child_process";
21
+ import { resolveConductorBridgeApiAccess, claimEpicSupervisionLease, fetchEpicRunState, advanceEpicTicketStatus, createEpicTicketStatus, recordEpicDispatch, transitionEpicDispatch, fetchParseStatus, triggerRepositoryParse, getEpicPlan, buildEpicDispatchKey, fetchEffectiveSupervisorConfig, fetchEffectiveSupervisorSetup, fetchPrReviewStatus, remediateEpicTicket, deletePullRequestBranch, transitionJiraStatus, } from "./bridge-api-client.js";
21
22
  import { processGateMetMerge } from "./supervisor-merge.js";
22
- import { rebuildObservedState, } from "./epic-state.js";
23
+ import { rebuildObservedState, extractWorkerLiveness, } from "./epic-state.js";
23
24
  import { reconcileEpic } from "./epic-reconcile.js";
25
+ import { buildSupervisorRemediationWorkerMessage } from "./supervisor-message-relay.js";
26
+ import { sendWorkerMessage } from "./store.js";
24
27
  import { hashPlan } from "./plan.js";
25
- import { pollConductorEvents } from "./store.js";
28
+ import { pollConductorEvents, POLL_LIMIT_MAX } from "./store.js";
26
29
  import { dispatchSupervisorNotification } from "./supervisor-notification.js";
27
30
  import { makeSupervisorIdempotencyKey } from "./supervisor-ledger.js";
28
31
  import { createDefaultStartTicketsDeps, orchestrateStartTickets } from "../start-tickets.js";
29
32
  import { orchestrateReviewTickets } from "../review-tickets.js";
33
+ import { createStartTicketsConductorContext, provisionConductorHooksForRows, emitStartTicketsRunStarted, } from "../start-tickets-conductor.js";
30
34
  // ---------------------------------------------------------------------------
31
35
  // Constants
32
36
  // ---------------------------------------------------------------------------
@@ -46,7 +50,7 @@ function defaultLeaseOwner() {
46
50
  async function defaultEscalateOnce(epicKey, reason) {
47
51
  process.stderr.write(`[epic-tick] ESCALATION epic=${epicKey} reason=${reason}\n`);
48
52
  }
49
- async function defaultDispatchSeam(_epicKey, ticketKey) {
53
+ async function defaultDispatchSeam(_epicKey, ticketKey, _attempt = 0) {
50
54
  throw new Error(`dispatch seam not wired for ticket ${ticketKey}`);
51
55
  }
52
56
  async function defaultPostActionWaitSeam(_epicKey, _ticketKey) {
@@ -73,7 +77,8 @@ export async function runEpicTick(options, deps = {}) {
73
77
  const dispatchSeam = deps.dispatchSeam ?? defaultDispatchSeam;
74
78
  const processMergeFn = deps.processMerge ?? processGateMetMerge;
75
79
  const postActionWaitSeam = deps.postActionWaitSeam ?? defaultPostActionWaitSeam;
76
- const fetchLocalEvents = deps.fetchLocalEvents ?? ((_key) => []);
80
+ const fetchLocalEvents = deps.fetchLocalEvents ??
81
+ ((_key, _runIds) => []);
77
82
  const resolveBridgeAccess = deps.resolveBridgeAccess ?? resolveConductorBridgeApiAccess;
78
83
  const claimLeaseFn = deps.claimLease ?? claimEpicSupervisionLease;
79
84
  const fetchEpicStateFn = deps.fetchEpicState ?? fetchEpicRunState;
@@ -190,7 +195,15 @@ export async function runEpicTick(options, deps = {}) {
190
195
  worker_count: 0,
191
196
  };
192
197
  }
193
- const localEvents = fetchLocalEvents(epic_key);
198
+ // Scope the local-ledger read to this epic's dispatched run_ids. The shared
199
+ // ~/.config/bridge/events.db ledger accumulates events for every epic/worker
200
+ // on the machine; rebuildObservedState only folds signals whose run_id maps
201
+ // to one of these dispatches, so scoping the read here avoids loading the
202
+ // entire (up to 50K-row) ledger on every tick.
203
+ const dispatchedRunIds = epicRunState.dispatches
204
+ .map((d) => d.run_id)
205
+ .filter((rid) => typeof rid === "string" && rid.length > 0);
206
+ const localEvents = fetchLocalEvents(epic_key, dispatchedRunIds);
194
207
  const observed = rebuildObservedState(epicRunState, localEvents, nowFn());
195
208
  workerCount = [...observed.ticket_statuses.values()].filter((s) => ACTIVE_WORKER_STATUSES.has(s)).length;
196
209
  // Step 3.5: Run post-action waits (parse-after-merge)
@@ -330,6 +343,82 @@ export async function runEpicTick(options, deps = {}) {
330
343
  }
331
344
  // Step 5: Reconcile observed→desired
332
345
  if (plan !== null) {
346
+ // BAPI-441: fetch the effective supervisor config (budget ceilings +
347
+ // liveness window) and setup (pr_bindings) once. Fail-open: if the config
348
+ // read fails, remediationConfig stays undefined and reconcile skips the
349
+ // remediation pass entirely (dispatch/merge steps unaffected).
350
+ let remediationConfig;
351
+ let livenessWindowSeconds = 120;
352
+ let prBindings = {};
353
+ try {
354
+ const cfg = await fetchEffectiveSupervisorConfig(access, epic_key);
355
+ remediationConfig = {
356
+ max_remediation_attempts: cfg.max_remediation_attempts,
357
+ max_remediation_no_progress_attempts: cfg.max_remediation_no_progress_attempts,
358
+ auto_rereview_enabled: cfg.auto_rereview_enabled ?? false,
359
+ teardown_enabled: cfg.teardown_enabled ?? false,
360
+ };
361
+ livenessWindowSeconds = cfg.worker_liveness_window_seconds;
362
+ }
363
+ catch (err) {
364
+ const safeMsg = err instanceof Error ? err.constructor.name : "config error";
365
+ errorLog(`[epic-tick] supervisor-config fetch failed (${safeMsg}); skipping remediation for epic=${epic_key}`);
366
+ }
367
+ if (remediationConfig) {
368
+ try {
369
+ const setup = await fetchEffectiveSupervisorSetup(access, epic_key);
370
+ if (setup.pr_bindings && typeof setup.pr_bindings === "object") {
371
+ prBindings = setup.pr_bindings;
372
+ }
373
+ }
374
+ catch (err) {
375
+ const safeMsg = err instanceof Error ? err.constructor.name : "setup error";
376
+ errorLog(`[epic-tick] supervisor-setup fetch failed (${safeMsg}); remediation PR resolution degraded for epic=${epic_key}`);
377
+ }
378
+ }
379
+ // ticket_key → dispatched run_id (the run whose heartbeat liveness reads).
380
+ // Seed from ticket_status.dispatch_run_id, then prefer the most-recent
381
+ // dispatch-ledger run_id per ticket so that after a remediation re-dispatch
382
+ // (a new attempt-scoped epic_dispatch row correlated with the fresh run_id)
383
+ // liveness tracks the NEW worker rather than the stale original.
384
+ const ticketRunIdMap = new Map();
385
+ for (const ts of epicRunState.ticket_statuses) {
386
+ if (ts.dispatch_run_id)
387
+ ticketRunIdMap.set(ts.ticket_key, ts.dispatch_run_id);
388
+ }
389
+ const latestDispatchByTicket = new Map();
390
+ for (const d of epicRunState.dispatches) {
391
+ if (!d.run_id)
392
+ continue;
393
+ const updatedAt = new Date(d.updated_at).getTime();
394
+ const prev = latestDispatchByTicket.get(d.ticket_key);
395
+ if (!prev || updatedAt >= prev.updatedAt) {
396
+ latestDispatchByTicket.set(d.ticket_key, { runId: d.run_id, updatedAt });
397
+ }
398
+ }
399
+ for (const [tk, info] of latestDispatchByTicket) {
400
+ ticketRunIdMap.set(tk, info.runId);
401
+ }
402
+ const resolvePrNumber = (ticketKey) => {
403
+ const raw = prBindings[ticketKey];
404
+ if (typeof raw === "number" && Number.isInteger(raw) && raw >= 1)
405
+ return raw;
406
+ if (raw && typeof raw === "object") {
407
+ const obj = raw;
408
+ const pr = obj.pr_number ?? obj.pr;
409
+ if (typeof pr === "number" && Number.isInteger(pr) && pr >= 1)
410
+ return pr;
411
+ }
412
+ return null;
413
+ };
414
+ const maxSeqForRun = (runId) => {
415
+ let maxSeq = 0;
416
+ for (const ev of localEvents) {
417
+ if (ev.run_id === runId && ev.seq > maxSeq)
418
+ maxSeq = ev.seq;
419
+ }
420
+ return maxSeq;
421
+ };
333
422
  const reconcileDeps = {
334
423
  casTicketStatus: async (ek, tk, rowVersion, nextStatus, planVersion) => advanceEpicTicketStatus(access, {
335
424
  epicKey: ek,
@@ -360,13 +449,146 @@ export async function runEpicTick(options, deps = {}) {
360
449
  runId,
361
450
  });
362
451
  },
363
- dispatchSeam: async (ek, tk) => dispatchSeam(ek, tk),
452
+ dispatchSeam: async (ek, tk, attempt = 0) => dispatchSeam(ek, tk, attempt),
364
453
  processMerge: async (acc, event) => processMergeFn(acc, event),
365
454
  postActionWaitSeam: async (ek, tk) => postActionWaitSeam(ek, tk),
366
455
  escalateOnce: async (ek, reason) => escalateOnce(ek, reason),
367
456
  log,
457
+ // BAPI-442: teardown and Jira-transition seams (fail-open, optional).
458
+ teardownSeam: async (_ek, tk) => {
459
+ // Resolve the PR number for the ticket.
460
+ const prNumber = resolvePrNumber(tk);
461
+ if (prNumber === null) {
462
+ errorLog(`[epic-tick] teardown: no PR binding for ${tk}; skipping`);
463
+ return;
464
+ }
465
+ // Fetch setup to get the expected head SHA if available; fall back to empty.
466
+ let expectedSha = "";
467
+ try {
468
+ const setup = await fetchEffectiveSupervisorSetup(access, epic_key);
469
+ const binding = (setup.pr_bindings ?? {})[tk];
470
+ if (binding && typeof binding === "object") {
471
+ const b = binding;
472
+ if (typeof b.head_sha === "string")
473
+ expectedSha = b.head_sha;
474
+ }
475
+ }
476
+ catch {
477
+ // Best-effort; proceed with empty SHA (endpoint still deletes by PR number)
478
+ }
479
+ try {
480
+ await deletePullRequestBranch(access, prNumber, expectedSha || "");
481
+ log(`[epic-tick] teardown: branch deleted for PR #${prNumber} (ticket=${tk})`);
482
+ }
483
+ catch (err) {
484
+ const safeMsg = err instanceof Error ? err.constructor.name : "error";
485
+ errorLog(`[epic-tick] teardown: branch-delete failed (${safeMsg}) for ${tk}`);
486
+ }
487
+ // Remove local worktree idempotently; errors are benign.
488
+ try {
489
+ spawnSync("git", ["worktree", "remove", "--force", tk], { stdio: "ignore" });
490
+ log(`[epic-tick] teardown: worktree removed for ${tk}`);
491
+ }
492
+ catch {
493
+ // Reached only if spawnSync itself throws; git failures (worktree already removed or never created) surface in the ignored return value.
494
+ }
495
+ },
496
+ jiraTransitionSeam: async (_ek, tk) => {
497
+ try {
498
+ const result = await transitionJiraStatus(access, tk, "auto");
499
+ if (result.status === "skipped") {
500
+ log(`[epic-tick] jira-transition: no matching transition for ${tk} (skipped)`);
501
+ }
502
+ else {
503
+ log(`[epic-tick] jira-transition: transitioned ${tk}`);
504
+ }
505
+ }
506
+ catch (err) {
507
+ const safeMsg = err instanceof Error ? err.constructor.name : "error";
508
+ errorLog(`[epic-tick] jira-transition failed (${safeMsg}) for ${tk}`);
509
+ }
510
+ },
511
+ // BAPI-441 remediation seams.
512
+ readWorkerLiveness: async (_ek, tk) => {
513
+ const runId = ticketRunIdMap.get(tk);
514
+ if (!runId)
515
+ return { alive: false, workerId: null };
516
+ return extractWorkerLiveness(localEvents, runId, nowFn(), livenessWindowSeconds);
517
+ },
518
+ remediateCas: async (ek, tk, attemptKind, reason) => {
519
+ const prNumber = resolvePrNumber(tk);
520
+ if (prNumber === null) {
521
+ throw new Error(`remediate: no PR binding for ${tk}`);
522
+ }
523
+ const reviewStatus = (await fetchPrReviewStatus(access, prNumber));
524
+ const headSha = reviewStatus?.detail?.head_sha ?? null;
525
+ if (!headSha) {
526
+ throw new Error(`remediate: no head_sha for PR ${prNumber}`);
527
+ }
528
+ const rowVersion = observed.ticket_row_versions.get(tk) ?? 0;
529
+ // Deterministic block-state idempotency key: stable for a given durable
530
+ // row_version so a same-tick retry replays (409, swallowed); advances
531
+ // with the next attempt.
532
+ const idempotencyKey = `remediate:${ek}:${tk}:${rowVersion}`;
533
+ const result = await remediateEpicTicket(access, {
534
+ pr_number: prNumber,
535
+ epic_run_id: ek,
536
+ ticket_key: tk,
537
+ expected_row_version: rowVersion,
538
+ head_sha: headSha,
539
+ idempotency_key: idempotencyKey,
540
+ attempt_kind: attemptKind,
541
+ reason,
542
+ });
543
+ if (result.conflict) {
544
+ return { conflict: true, reviewDigest: null, truncated: false };
545
+ }
546
+ return {
547
+ conflict: false,
548
+ reviewDigest: result.response.review_digest,
549
+ truncated: result.response.truncated,
550
+ };
551
+ },
552
+ sendNudge: async (_ek, tk, attempt, reviewDigest, truncated, reason, workerId) => {
553
+ const runId = ticketRunIdMap.get(tk);
554
+ if (!runId)
555
+ throw new Error(`nudge: no run_id for ${tk}`);
556
+ // workerId is resolved by readWorkerLiveness from the same heartbeat
557
+ // scan and null-checked by the reconcile pass before remediateCas, so
558
+ // the two seams stay consistent and no budget is burned on a missing id.
559
+ const input = buildSupervisorRemediationWorkerMessage({
560
+ runId,
561
+ workerId,
562
+ ticketKey: tk,
563
+ reason,
564
+ attempt,
565
+ reviewDigest: reviewDigest ?? "",
566
+ truncated,
567
+ causeSeq: maxSeqForRun(runId),
568
+ });
569
+ sendWorkerMessage(input);
570
+ },
571
+ resumeDispatch: async (ek, tk, attempt) => {
572
+ // Claim an attempt-scoped pending dispatch row FIRST so the spawn's
573
+ // run_spawned correlation (inside orchestrateStartTickets) has a row to
574
+ // transition and the re-dispatched run_id is durably recorded against
575
+ // the ticket. The claim is idempotent (lease-held/already-spawned are
576
+ // returned, not thrown).
577
+ await recordEpicDispatch(access, {
578
+ epicKey: ek,
579
+ ticketKey: tk,
580
+ planVersion: plan.plan_version,
581
+ leaseOwner: lease_owner,
582
+ ttlSeconds: DEFAULT_DISPATCH_KEY_TTL_SECONDS,
583
+ attempt,
584
+ });
585
+ // dispatchSeam returns the new run_id; orchestrate correlates it into
586
+ // the attempt-scoped epic_dispatch row, so the next tick's liveness map
587
+ // (built from the dispatch ledger) tracks the fresh worker.
588
+ await dispatchSeam(ek, tk, attempt);
589
+ },
368
590
  };
369
- const reconcileResult = await reconcileEpic(access, observed, plan, reconcileDeps);
591
+ const reconcileResult = await reconcileEpic(access, observed, plan, reconcileDeps, remediationConfig);
370
592
  log(`[epic-tick] reconcile done: epic=${epic_key} ` +
371
593
  `signals=${reconcileResult.signals_folded} ` +
372
594
  `dispatched=${reconcileResult.dispatched} ` +
@@ -494,7 +716,7 @@ export async function buildProductionEpicRuntimeDeps(epicKey) {
494
716
  const planHash = hashPlan(dag);
495
717
  return { plan_hash: planHash, plan_version: response.plan_version, tickets };
496
718
  };
497
- const dispatchSeam = async (ek, tk) => {
719
+ const dispatchSeam = async (ek, tk, attempt = 0) => {
498
720
  // Guard: fetchPlan must run before dispatchSeam so cachedPlanVersion and
499
721
  // automationMap are populated. A zero version means the factory seam was
500
722
  // wired but fetchPlan was never called — fail explicitly rather than silently
@@ -502,6 +724,12 @@ export async function buildProductionEpicRuntimeDeps(epicKey) {
502
724
  if (cachedPlanVersion === 0) {
503
725
  throw new Error(`dispatchSeam called before fetchPlan for epic ${ek} ticket ${tk}; cachedPlanVersion is 0`);
504
726
  }
727
+ // BAPI-441: a remediation re-dispatch (attempt > 0) reuses the existing
728
+ // branch/worktree (resume mode) and claims an attempt-scoped dispatch key so
729
+ // it is not deduped against the original epic dispatch.
730
+ const isResume = attempt > 0;
731
+ // The dispatch kind comes from the plan node's automation (start-tickets or
732
+ // review-tickets); default to start-tickets when unspecified.
505
733
  const kind = automationMap.get(tk) ?? "start-tickets";
506
734
  // Operator dry-run: when BAPI_CONDUCTOR_DISPATCH_DRY_RUN=1, dispatch resolves
507
735
  // the spawn command + model routing but opens NO terminal, creates NO worktree,
@@ -513,7 +741,7 @@ export async function buildProductionEpicRuntimeDeps(epicKey) {
513
741
  epic_key: ek,
514
742
  epic_run_id: ek,
515
743
  plan_version: cachedPlanVersion,
516
- dispatch_key: buildEpicDispatchKey(ek, tk, cachedPlanVersion),
744
+ dispatch_key: buildEpicDispatchKey(ek, tk, cachedPlanVersion, attempt),
517
745
  };
518
746
  const deps = createDefaultStartTicketsDeps();
519
747
  let runId;
@@ -533,6 +761,14 @@ export async function buildProductionEpicRuntimeDeps(epicKey) {
533
761
  runId = result.rows[0]?.runId;
534
762
  }
535
763
  else {
764
+ // BAPI-409 / IH-1: epic dispatch (dispatch_key set) requires the conductor
765
+ // stage to mint a run_id and provision per-worker env/supervisor context.
766
+ // `conductorEnabled: true` alone is necessary but not sufficient — the
767
+ // BAPI-409 guard in orchestrateStartTickets fails closed unless the
768
+ // createConductorContext seam (and its siblings) is injected via the third
769
+ // `overrides` argument, exactly as the packaged start-tickets CLI does. The
770
+ // orchestrator short-circuits on dryRun before using them, so passing them
771
+ // unconditionally is safe; dispatchDryRun preserves the operator dry-run seam.
536
772
  const result = await orchestrateStartTickets(deps, {
537
773
  keys: [tk],
538
774
  epic: identity,
@@ -543,6 +779,13 @@ export async function buildProductionEpicRuntimeDeps(epicKey) {
543
779
  refreshMain: false,
544
780
  branchOverrides: {},
545
781
  baseBranch: "main",
782
+ conductorEnabled: true,
783
+ // BAPI-441: re-dispatch reuses the existing branch/worktree.
784
+ resumeMode: isResume,
785
+ }, {
786
+ createConductorContext: createStartTicketsConductorContext,
787
+ provisionConductorHooksForRows,
788
+ emitStartTicketsRunStarted,
546
789
  });
547
790
  if (!result.ok) {
548
791
  throw new Error(`start-tickets dispatch failed: ${result.error}`);
@@ -559,11 +802,50 @@ export async function buildProductionEpicRuntimeDeps(epicKey) {
559
802
  }
560
803
  return runId;
561
804
  };
562
- const fetchLocalEvents = (_ek) => {
805
+ const fetchLocalEvents = (_ek, runIds) => {
563
806
  // Workers and the epic-tick process share the same local SQLite ledger
564
807
  // (~/.config/bridge/events.db). pollConductorEvents opens it read-only.
565
- const result = pollConductorEvents({ data_mode: "full" });
566
- return result.events;
808
+ //
809
+ // Scope the read to this epic's dispatched run_ids. The shared ledger holds
810
+ // events for every epic/worker on the machine (up to RETENTION_MAX_ROWS),
811
+ // but rebuildObservedState only folds signals whose run_id maps to one of
812
+ // these dispatches — so the run_ids filter pushes that scoping into SQL and
813
+ // avoids loading sibling-epic events on every tick. With no known run_ids
814
+ // (first tick before any dispatch) there is nothing to fold, so skip the
815
+ // read entirely.
816
+ //
817
+ // pollConductorEvents returns at most POLL_LIMIT_MAX events per call
818
+ // (default 100, capped at 1000) starting at `since_seq`. rebuildObservedState
819
+ // folds terminal signals (gate.met/run.stopped/merge.succeeded/ci.failed)
820
+ // ONLY from the events it is handed, so a single capped page silently hides
821
+ // recent terminal signals once the (scoped) result grows past one page —
822
+ // done-detection then breaks. Drain the COMPLETE history by paginating on the
823
+ // `next_seq` cursor until a short (or empty) page signals the tail.
824
+ if (runIds !== undefined && runIds.length === 0) {
825
+ return [];
826
+ }
827
+ const runIdsFilter = runIds && runIds.length > 0 ? { run_ids: [...runIds] } : undefined;
828
+ const events = [];
829
+ let sinceSeq = 1;
830
+ // Retention caps (retention_days/retention_max_rows) bound the ledger, but
831
+ // cap total iterations defensively against a non-advancing cursor.
832
+ const MAX_PAGES = 10_000;
833
+ for (let page = 0; page < MAX_PAGES; page += 1) {
834
+ const result = pollConductorEvents({
835
+ data_mode: "full",
836
+ since_seq: sinceSeq,
837
+ limit: POLL_LIMIT_MAX,
838
+ filter: runIdsFilter,
839
+ });
840
+ events.push(...result.events);
841
+ // Stop on a short/empty page (no more rows) or a cursor that fails to
842
+ // advance (guards against an infinite loop).
843
+ if (result.count < POLL_LIMIT_MAX || result.next_seq <= sinceSeq) {
844
+ break;
845
+ }
846
+ sinceSeq = result.next_seq;
847
+ }
848
+ return events;
567
849
  };
568
850
  const escalateOnce = async (ek, reason) => {
569
851
  const candidate = {
@@ -618,5 +900,9 @@ export async function buildProductionEpicRuntimeDeps(epicKey) {
618
900
  fetchLocalEvents,
619
901
  escalateOnce,
620
902
  postActionWaitSeam,
903
+ // BAPI-442 seams are wired at the reconcileDeps level inside runEpicTick
904
+ // (they need the per-tick `access` and `prBindings` closure). The factory
905
+ // returns the dispatchSeam with resume (attempt > 0) support; the other two seams are
906
+ // defined inline in the reconcileDeps object in runEpicTick.
621
907
  };
622
908
  }
@@ -1,44 +1,63 @@
1
1
  /**
2
2
  * Pure, deterministic observed-state rebuild + ready-set computation for the
3
- * Epic Supervisor (BAPI-408).
3
+ * Epic Supervisor (BAPI-408, BAPI-436).
4
4
  *
5
5
  * This module has NO I/O, NO timers, and NO LLM calls. Every time-dependent
6
6
  * function takes an explicit `now` (epoch ms) so tests are wall-clock
7
7
  * independent. Truth precedence: raw local ledger events override non-terminal
8
8
  * Postgres states.
9
+ *
10
+ * Status mapping (BAPI-436 — merge-gated dependent dispatch):
11
+ * gate.met → ready_for_review (implementation done; awaiting merge)
12
+ * run.stopped → ready_for_review (worker session ended; awaiting merge)
13
+ * merge.succeeded → done (merged; dependents may now dispatch)
14
+ * ci.failed → blocked (unchanged)
9
15
  */
10
16
  // ---------------------------------------------------------------------------
11
17
  // Status sets (module-private)
12
18
  // ---------------------------------------------------------------------------
13
19
  const NOT_STARTED_STATUS = "planned";
14
20
  const DONE_STATUSES = new Set(["done"]);
21
+ // "blocked" is intentionally non-terminal: a merge.succeeded arriving in a
22
+ // subsequent tick (cross-tick) can still advance a blocked ticket to done.
23
+ // This is the expected path when an operator merges a PR despite failed CI.
24
+ // Contrast with the same-tick case: ci.failed wins over merge.succeeded in
25
+ // the same ledger batch (first-signal-wins; see the foldedTicketKeys guard).
15
26
  const NON_TERMINAL_STATUSES = new Set([
16
27
  "planned",
17
28
  "ready",
18
29
  "dispatched",
19
30
  "running",
20
31
  "blocked",
32
+ "ready_for_review",
21
33
  ]);
22
34
  const TERMINAL_SIGNAL_TYPES = new Set([
23
35
  "gate.met",
24
36
  "merge.succeeded",
25
37
  "ci.failed",
26
38
  "run.stopped",
39
+ "review.changes_requested",
27
40
  ]);
28
41
  function isNonTerminal(status) {
29
42
  return NON_TERMINAL_STATUSES.has(status);
30
43
  }
31
- function signalToNextStatus(signalType) {
44
+ export function signalToNextStatus(signalType) {
32
45
  if (signalType === "ci.failed")
33
46
  return "blocked";
34
- return "done";
47
+ if (signalType === "review.changes_requested")
48
+ return "blocked";
49
+ if (signalType === "merge.succeeded")
50
+ return "done";
51
+ return "ready_for_review"; // gate.met and run.stopped: awaiting merge
35
52
  }
36
53
  // ---------------------------------------------------------------------------
37
54
  // computeReadySet
38
55
  // ---------------------------------------------------------------------------
39
56
  /**
40
57
  * Pure deterministic ready-set computation. Returns ticket keys that:
41
- * 1. Have status "planned" (not yet started), AND
58
+ * 1. Have status "planned" (not yet started) or "ready" (crash-recovery —
59
+ * a ticket already advanced to ready on a prior tick that crashed before
60
+ * dispatch must not be silently dropped), AND
42
61
  * 2. Whose full `depends_on` list is satisfied (all deps have "done" status).
43
62
  *
44
63
  * Never calls an LLM or performs I/O. Goal 8 invariant.
@@ -47,7 +66,7 @@ export function computeReadySet(plan, ticketStatuses) {
47
66
  const ready = [];
48
67
  for (const ticket of plan.tickets) {
49
68
  const currentStatus = ticketStatuses.get(ticket.ticket_key) ?? "planned";
50
- if (currentStatus !== NOT_STARTED_STATUS)
69
+ if (currentStatus !== NOT_STARTED_STATUS && currentStatus !== "ready")
51
70
  continue;
52
71
  const allDepsResolved = ticket.depends_on.every((dep) => DONE_STATUSES.has(ticketStatuses.get(dep) ?? "planned"));
53
72
  if (allDepsResolved)
@@ -55,6 +74,46 @@ export function computeReadySet(plan, ticketStatuses) {
55
74
  }
56
75
  return ready;
57
76
  }
77
+ /**
78
+ * Pure, deterministic remediation decision (no I/O, no clock). Given the current
79
+ * budget counters, the ledger-derived worker liveness, and the configured
80
+ * ceilings, decide whether to NUDGE (worker still alive), RE-DISPATCH (worker
81
+ * gone, budget remaining), or ESCALATE (budget exhausted).
82
+ *
83
+ * Budget exhaustion is checked FIRST so an at-ceiling ticket always escalates
84
+ * regardless of liveness. A counter at-or-above its ceiling exhausts the budget.
85
+ */
86
+ export function decideRemediation(attempts, noProgressAttempts, alive, config) {
87
+ if (attempts >= config.max_remediation_attempts ||
88
+ noProgressAttempts >= config.max_remediation_no_progress_attempts) {
89
+ return "escalate";
90
+ }
91
+ return alive ? "nudge" : "redispatch";
92
+ }
93
+ /**
94
+ * Pure liveness extraction (no I/O, explicit `nowMs`). Finds the most recent
95
+ * `message.delivered`/`message.acked` heartbeat event for `runId` and reports
96
+ * the worker alive when that heartbeat's age is within
97
+ * `windowSeconds`. An empty ledger (no heartbeat for the run) defaults to
98
+ * `{ alive: false, workerId: null }` (fail-closed: never misjudge a worker
99
+ * alive without evidence).
100
+ */
101
+ export function extractWorkerLiveness(events, runId, nowMs, windowSeconds) {
102
+ let latest = null;
103
+ for (const ev of events) {
104
+ if (ev.run_id !== runId)
105
+ continue;
106
+ if (ev.type !== "message.delivered" && ev.type !== "message.acked")
107
+ continue;
108
+ if (!latest || new Date(ev.time).getTime() > new Date(latest.time).getTime()) {
109
+ latest = ev;
110
+ }
111
+ }
112
+ if (!latest)
113
+ return { alive: false, workerId: null };
114
+ const age = nowMs - new Date(latest.time).getTime();
115
+ return { alive: age <= windowSeconds * 1000, workerId: latest.worker_id ?? null };
116
+ }
58
117
  // ---------------------------------------------------------------------------
59
118
  // rebuildObservedState
60
119
  // ---------------------------------------------------------------------------
@@ -75,14 +134,23 @@ export function rebuildObservedState(postgresState, events, _now) {
75
134
  // Populate base maps from Postgres
76
135
  const ticketStatusMap = new Map();
77
136
  const ticketRowVersionMap = new Map();
137
+ const ticketRemediationMap = new Map();
78
138
  for (const ts of ticket_statuses) {
79
139
  ticketStatusMap.set(ts.ticket_key, ts.status);
80
140
  ticketRowVersionMap.set(ts.ticket_key, ts.row_version);
141
+ ticketRemediationMap.set(ts.ticket_key, {
142
+ attempts: ts.remediation_attempts ?? 0,
143
+ no_progress: ts.remediation_no_progress_attempts ?? 0,
144
+ });
81
145
  }
82
146
  const unfoldedSignals = [];
83
147
  const pendingMergeEvents = [];
84
148
  // Track which tickets already have a folded signal (one override per ticket)
85
149
  const foldedTicketKeys = new Set();
150
+ // BAPI-441: per-ticket latest blocking reason (ci.failed / review.changes_requested),
151
+ // tracked across the full ledger so an already-blocked ticket still carries a
152
+ // reason for the remediation pass to frame the nudge.
153
+ const ticketBlockedReasons = new Map();
86
154
  for (const event of events) {
87
155
  if (!TERMINAL_SIGNAL_TYPES.has(event.type))
88
156
  continue;
@@ -91,16 +159,45 @@ export function rebuildObservedState(postgresState, events, _now) {
91
159
  const ticketKey = runId ? runIdToTicketKey.get(runId) : undefined;
92
160
  if (!ticketKey)
93
161
  continue;
94
- if (event.type === "gate.met") {
95
- pendingMergeEvents.push(event);
162
+ // Record the blocking reason (latest wins; events are seq-ordered) regardless
163
+ // of fold state, so a ticket blocked on a prior tick still resolves a reason.
164
+ if (event.type === "ci.failed" || event.type === "review.changes_requested") {
165
+ ticketBlockedReasons.set(ticketKey, event.type);
96
166
  }
97
167
  const postgresStatus = ticketStatusMap.get(ticketKey) ?? "planned";
98
168
  if (!isNonTerminal(postgresStatus))
99
169
  continue;
100
- if (foldedTicketKeys.has(ticketKey))
101
- continue;
170
+ // Only queue for merge actioning if this ticket hasn't already been folded
171
+ // this tick. Without this guard, two gate.met events for the same ticket
172
+ // would both enqueue, and a ci.failed → gate.met sequence would enqueue a
173
+ // merge action for a ticket whose effective status is "blocked".
174
+ if (event.type === "gate.met" && !foldedTicketKeys.has(ticketKey)) {
175
+ pendingMergeEvents.push(event);
176
+ }
102
177
  const signalType = event.type;
103
178
  const nextStatus = signalToNextStatus(signalType);
179
+ if (foldedTicketKeys.has(ticketKey)) {
180
+ // Allow a same-tick upgrade from ready_for_review → done when
181
+ // merge.succeeded arrives after gate.met in the same ledger batch.
182
+ // First-signal-wins for everything else: if ci.failed arrived first,
183
+ // currentLocalStatus is "blocked" (not "ready_for_review"), so the
184
+ // upgrade guard below is false and merge.succeeded is intentionally
185
+ // dropped — a failed-CI ticket must not be silently advanced to done.
186
+ const currentLocalStatus = ticketStatusMap.get(ticketKey);
187
+ if (currentLocalStatus === "ready_for_review" && nextStatus === "done") {
188
+ const existingIdx = unfoldedSignals.findIndex((s) => s.ticket_key === ticketKey);
189
+ if (existingIdx >= 0) {
190
+ unfoldedSignals[existingIdx] = {
191
+ ...unfoldedSignals[existingIdx],
192
+ next_status: nextStatus,
193
+ signal_type: signalType,
194
+ event,
195
+ };
196
+ ticketStatusMap.set(ticketKey, nextStatus);
197
+ }
198
+ }
199
+ continue;
200
+ }
104
201
  const rowVersion = ticketRowVersionMap.get(ticketKey) ?? 0;
105
202
  unfoldedSignals.push({
106
203
  ticket_key: ticketKey,
@@ -119,6 +216,8 @@ export function rebuildObservedState(postgresState, events, _now) {
119
216
  plan_version: epic_run.current_plan_version,
120
217
  ticket_statuses: ticketStatusMap,
121
218
  ticket_row_versions: ticketRowVersionMap,
219
+ ticket_remediation_counters: ticketRemediationMap,
220
+ ticket_blocked_reasons: ticketBlockedReasons,
122
221
  unfolded_terminal_signals: unfoldedSignals,
123
222
  pending_merge_events: pendingMergeEvents,
124
223
  };
@@ -25,8 +25,14 @@ export const GIT_CI_PRODUCER = "git-pr-ci-producer";
25
25
  export const GIT_HOOK_PRODUCER = "git-hook";
26
26
  /** The single v1 done-gate condition type. */
27
27
  export const REQUIRED_CI_CHECKS_GREEN = "required_ci_checks_green";
28
+ /** The v2 review-state done-gate condition type. */
29
+ export const REVIEW_STATE = "review_state";
28
30
  /** Default gate name surfaced in `gate.met` event data. */
29
31
  export const DEFAULT_GATE_NAME = "done";
32
+ /** Event type emitted when the configured review source is satisfied. */
33
+ export const REVIEW_PASSED = "review.passed";
34
+ /** Event type emitted when a reviewer requests changes. */
35
+ export const REVIEW_CHANGES_REQUESTED = "review.changes_requested";
30
36
  /** Matches ASCII control characters (C0 range plus DEL). */
31
37
  const CONTROL_CHAR_RE = /[\u0000-\u001F\u007F]/;
32
38
  /** Matches a 40- or 64-character hex string (git SHA-1 / SHA-256 object ids). */