@cat-factory/orchestration 0.18.0 → 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,4 @@
1
- import { parseBlueprintService, parseOnCallAssessment, parseSpecDoc, DEFAULT_COMPANION_MAX_ATTEMPTS, } from '@cat-factory/contracts';
1
+ import { parseBlueprintService, parseSpecDoc, DEFAULT_COMPANION_MAX_ATTEMPTS, } from '@cat-factory/contracts';
2
2
  import { blueprintPostOp, companionFor, companionTargets, isCompanionKind, registeredAgentStep, registeredPreOps, registeredPostOps, runRepoOps, specPostOp, TASK_ESTIMATOR_AGENT_KIND, } from '@cat-factory/agents';
3
3
  import { coerceTaskEstimate, summarizeEstimate } from '../estimation/estimate.logic.js';
4
4
  import { validatePipelineShape } from '../pipelines/pipelineShape.js';
@@ -7,8 +7,7 @@ import { reviewableArtifactOutput } from './artifact-review.logic.js';
7
7
  import { resolveIndividualVendors, } from './individualVendors.logic.js';
8
8
  import { assertFound, ConflictError, getErrorMessage, isModelUsable, NotFoundError, sameSubtasks, ValidationError, } from '@cat-factory/kernel';
9
9
  import { DEFAULT_MERGE_PRESET } from '@cat-factory/kernel';
10
- import { aggregateCi, CI_AGENT_KIND, CI_FIXER_AGENT_KIND, CONFLICTS_AGENT_KIND, CONFLICT_RESOLVER_AGENT_KIND, describeFailingChecks, listFailingChecks, isCiGreen, MERGER_AGENT_KIND, REQUIREMENTS_REVIEW_AGENT_KIND, CLARITY_REVIEW_AGENT_KIND, BUG_INVESTIGATOR_AGENT_KIND, TRACKER_AGENT_KIND, ANALYSIS_AGENT_KIND, TESTER_AGENT_KIND, HUMAN_TEST_AGENT_KIND, BLUEPRINTS_AGENT_KIND, SPEC_WRITER_AGENT_KIND, } from './ci.logic.js';
11
- import { POST_RELEASE_HEALTH_AGENT_KIND, ON_CALL_AGENT_KIND, classifyReleaseHealth, describeRegressedSignals, } from './release.logic.js';
10
+ import { CONFLICTS_AGENT_KIND, MERGER_AGENT_KIND, REQUIREMENTS_REVIEW_AGENT_KIND, CLARITY_REVIEW_AGENT_KIND, BUG_INVESTIGATOR_AGENT_KIND, TRACKER_AGENT_KIND, ANALYSIS_AGENT_KIND, TESTER_AGENT_KIND, HUMAN_TEST_AGENT_KIND, BLUEPRINTS_AGENT_KIND, SPEC_WRITER_AGENT_KIND, } from './ci.logic.js';
12
11
  import { AgentContextBuilder } from './AgentContextBuilder.js';
13
12
  import { CompanionController } from './CompanionController.js';
14
13
  import { inferTechnicalLabel } from './technical.logic.js';
@@ -24,14 +23,6 @@ import { requireWorkspace } from '@cat-factory/kernel';
24
23
  import { planResumedSteps, planRestartFromStep } from './retry.logic.js';
25
24
  import { isContainerEvictionError, isTransientEviction, MAX_EVICTION_RECOVERIES, MAX_TRANSIENT_EVICTION_RECOVERIES, } from './job.logic.js';
26
25
  import { decideTesterInfra, resolveTesterEnvironment, TESTER_INFRA_MESSAGES, } from './tester-infra.logic.js';
27
- /**
28
- * Max `conflict-resolver` escalations before the conflicts gate gives up. Deliberately
29
- * far below CI's budget (`ciMaxAttempts`, default 10): a conflict retry re-merges the
30
- * SAME base with no new signal, so extra attempts just burn containers re-attempting an
31
- * identical conflict. Three gives the (now conflict-aware) resolver a couple of shots at
32
- * model variance, then fails fast to a manual-resolution notification.
33
- */
34
- const CONFLICT_RESOLVER_MAX_ATTEMPTS = 3;
35
26
  /**
36
27
  * "What to do next" guidance per failure kind a pipeline run can produce, shown
37
28
  * under the failure banner on the board (mirrors bootstrap's FAILURE_HINTS). Only
@@ -47,10 +38,6 @@ const EXECUTION_FAILURE_HINTS = {
47
38
  cancelled: 'You stopped this run; its container was killed. Retry to start it again.',
48
39
  unknown: 'The run failed for an unclassified reason. Review the run, then retry.',
49
40
  };
50
- /** Format a 0..1 score as a rounded percentage for notification copy. */
51
- function pct(score) {
52
- return `${Math.round(score * 100)}%`;
53
- }
54
41
  /**
55
42
  * Parse `owner`/`repo` from a GitHub pull-request URL (`https://github.com/o/r/pull/42`).
56
43
  * Returns undefined for any URL that doesn't carry both segments. Host-agnostic on
@@ -62,35 +49,6 @@ function parseRepoFromPullUrl(url) {
62
49
  return undefined;
63
50
  return { owner: match[1], repo: match[2] };
64
51
  }
65
- /**
66
- * Render the Datadog evidence bundle into the prior-output text the on-call agent reads:
67
- * the regressed monitors/SLOs, recent error groups, and the investigation brief (correlate
68
- * the diff with the signals, return a JSON assessment, do NOT revert).
69
- */
70
- function renderReleaseEvidence(evidence) {
71
- const lines = ['## Post-release regression evidence', ''];
72
- if (evidence.regressedSignals.length > 0) {
73
- lines.push('Regressed signals:');
74
- for (const s of evidence.regressedSignals) {
75
- lines.push(`- ${s.kind} "${s.name}" (${s.id}): ${s.state}${s.detail ? ` — ${s.detail}` : ''}`);
76
- }
77
- lines.push('');
78
- }
79
- if (evidence.errors.length > 0) {
80
- lines.push('Recent errors:');
81
- for (const e of evidence.errors) {
82
- lines.push(`- ${e.title}${e.count != null ? ` ×${e.count}` : ''}${e.sampleMessage ? ` — ${e.sampleMessage}` : ''}`);
83
- }
84
- lines.push('');
85
- }
86
- if (evidence.notes)
87
- lines.push(evidence.notes, '');
88
- lines.push('Investigate whether THIS PR is the likely cause: correlate its diff with the regressed ' +
89
- 'signals and errors above (and the service logs). Beware correlation ≠ causation. Return a ' +
90
- 'JSON assessment: { "culpritConfidence": 0..1, "recommendation": "revert"|"hold"|"monitor", ' +
91
- '"rationale": "…", "evidence": ["…"] }. Do NOT make commits or revert anything — a human decides.');
92
- return lines.join('\n');
93
- }
94
52
  /**
95
53
  * The execution engine. It orchestrates a pipeline of agent-performed steps and
96
54
  * is fully deterministic: `advanceInstance` moves one run forward by exactly one
@@ -139,10 +97,6 @@ export class ExecutionService {
139
97
  notificationService;
140
98
  workspaceSettingsService;
141
99
  llmObservability;
142
- ciStatusProvider;
143
- mergeabilityProvider;
144
- releaseHealthProvider;
145
- incidentEnrichment;
146
100
  prMerger;
147
101
  mergePresetRepository;
148
102
  ticketTrackerProvider;
@@ -165,7 +119,7 @@ export class ExecutionService {
165
119
  * {@link stepResolverFor} and {@link StepCompletionResolver}.
166
120
  */
167
121
  stepResolverCache;
168
- constructor({ workspaceRepository, blockRepository, pipelineRepository, executionRepository, accountRepository, idGenerator, clock, agentExecutor, workRunner, executionEventPublisher, boardService, spendService, documentRepository, taskRepository, requirementReviewRepository, requirementReviewService, clarityReviewRepository, clarityReviewService, fragmentResolver, environmentProvisioning, environmentTeardown, branchUpdater, blueprintReconciler, notificationService, workspaceSettingsService, llmObservability, ciStatusProvider, mergeabilityProvider, releaseHealthProvider, incidentEnrichment, pullRequestMerger, mergePresetRepository, ticketTrackerProvider, issueWriteback, subscriptionActivationRepository, resolveWorkspaceModelDefault, resolveProviderCapabilities, localTestInfraSupported, resolveRunRepoContext, runInitiatorScope, }) {
122
+ constructor({ workspaceRepository, blockRepository, pipelineRepository, executionRepository, accountRepository, idGenerator, clock, agentExecutor, workRunner, executionEventPublisher, boardService, spendService, documentRepository, taskRepository, requirementReviewRepository, requirementReviewService, clarityReviewRepository, clarityReviewService, fragmentResolver, environmentProvisioning, environmentTeardown, branchUpdater, blueprintReconciler, notificationService, workspaceSettingsService, llmObservability, pullRequestMerger, mergePresetRepository, ticketTrackerProvider, issueWriteback, subscriptionActivationRepository, resolveWorkspaceModelDefault, resolveProviderCapabilities, localTestInfraSupported, resolveRunRepoContext, runInitiatorScope, }) {
169
123
  this.runInitiatorScope = runInitiatorScope ?? ((_initiatedBy, fn) => fn());
170
124
  this.workspaceRepository = workspaceRepository;
171
125
  this.blockRepository = blockRepository;
@@ -294,10 +248,6 @@ export class ExecutionService {
294
248
  this.notificationService = notificationService;
295
249
  this.workspaceSettingsService = workspaceSettingsService;
296
250
  this.llmObservability = llmObservability;
297
- this.ciStatusProvider = ciStatusProvider;
298
- this.mergeabilityProvider = mergeabilityProvider;
299
- this.releaseHealthProvider = releaseHealthProvider;
300
- this.incidentEnrichment = incidentEnrichment;
301
251
  this.prMerger = pullRequestMerger;
302
252
  this.mergePresetRepository = mergePresetRepository;
303
253
  this.ticketTrackerProvider = ticketTrackerProvider;
@@ -926,16 +876,15 @@ export class ExecutionService {
926
876
  }
927
877
  return { kind: 'awaiting_job', jobId: step.jobId, stepIndex: instance.currentStep };
928
878
  }
929
- // The post-release-health gate's helper is the `on-call` agent, which INVESTIGATES
930
- // (it makes no commits and doesn't change prod), so unlike ci-fixer/conflict-resolver
931
- // its completion must NOT re-probe to green — re-probing would just regress again and
932
- // burn the budget. When it finishes OR fails, resolve it the same way: raise the
933
- // `release_regression` notification (with the regressed signals stashed at escalation),
934
- // enrich any open incident, then finish the gate step so the run completes (a human
935
- // acts on the notification out-of-band). A FAILED investigation must NOT fall through
936
- // to the generic gate path: that would re-probe → still regress → exhaust the budget,
937
- // discarding the stashed signals and failing the run with a thinner notification.
938
- if (step.agentKind === POST_RELEASE_HEALTH_AGENT_KIND &&
879
+ // A gate whose helper INVESTIGATES instead of fixing (post-release-health on-call)
880
+ // declares a `resolveHelperCompletion` hook on its definition. When such a helper's job
881
+ // settles — done OR failed — we call the hook INSTEAD of re-probing the precheck
882
+ // (re-probing an investigate-don't-fix helper would just regress again and burn the
883
+ // budget) and finish the gate step with the output it returns. The gate raises its own
884
+ // `release_regression` notification + enriches any open incident inside the hook (from the
885
+ // signals stashed at escalation); the run then completes for a human to act out-of-band.
886
+ const completionGate = this.gateFor(step.agentKind);
887
+ if (completionGate?.resolveHelperCompletion &&
939
888
  step.gate?.phase === 'working' &&
940
889
  (update.state === 'done' || update.state === 'failed')) {
941
890
  const block = await this.blockRepository.get(workspaceId, instance.blockId);
@@ -944,10 +893,23 @@ export class ExecutionService {
944
893
  if (!block)
945
894
  return { kind: 'noop' };
946
895
  const isFinalStep = instance.currentStep === instance.steps.length - 1;
947
- const result = update.state === 'done'
948
- ? update.result
949
- : { output: `On-call investigation did not complete: ${update.error ?? 'unknown error'}` };
950
- return this.resolveOnCallStep(workspaceId, instance, step, block, result, isFinalStep, update.state === 'failed');
896
+ const jobResult = update.state === 'done'
897
+ ? { state: 'done', result: update.result }
898
+ : { state: 'failed', error: update.error ?? null };
899
+ const resolution = await completionGate.resolveHelperCompletion({
900
+ workspaceId,
901
+ instance,
902
+ block,
903
+ step,
904
+ result: jobResult,
905
+ });
906
+ // Preserve the done-result's fields (usage metering etc.) while recording the gate's
907
+ // resolved output; a failed investigation has no result to carry.
908
+ const base = update.state === 'done' ? update.result : { output: '' };
909
+ return this.recordStepResult(workspaceId, instance, step, isFinalStep, {
910
+ ...base,
911
+ output: resolution.output,
912
+ });
951
913
  }
952
914
  // A polling gate step's in-flight job is its helper agent (ci-fixer /
953
915
  // conflict-resolver), NOT the step's own work: when it finishes (or fails) we
@@ -1800,173 +1762,13 @@ export class ExecutionService {
1800
1762
  return { runInitiatorScope: this.runInitiatorScope };
1801
1763
  }
1802
1764
  buildGateRegistry() {
1803
- const gates = [
1804
- // CI gate: poll the PR head's check runs; escalate to a `ci-fixer` on red CI.
1805
- {
1806
- kind: CI_AGENT_KIND,
1807
- helperKind: CI_FIXER_AGENT_KIND,
1808
- wired: () => !!this.ciStatusProvider,
1809
- unwiredOutput: 'CI gate skipped (no CI status provider configured).',
1810
- probe: async (workspaceId, blockId) => {
1811
- const report = await this.ciStatusProvider.getStatus(workspaceId, blockId);
1812
- const verdict = aggregateCi(report.checks);
1813
- if (isCiGreen(verdict)) {
1814
- return {
1815
- status: 'pass',
1816
- headSha: report.headSha,
1817
- passOutput: verdict === 'none'
1818
- ? 'CI gate passed: no checks configured for the PR head.'
1819
- : `CI gate passed: ${report.checks.length} check(s) green.`,
1820
- };
1821
- }
1822
- if (verdict === 'pending')
1823
- return { status: 'pending', headSha: report.headSha };
1824
- return {
1825
- status: 'fail',
1826
- headSha: report.headSha,
1827
- failureSummary: describeFailingChecks(report.checks),
1828
- failingChecks: listFailingChecks(report.checks),
1829
- };
1830
- },
1831
- // Surface the failing-check summary to the fixer as resolved context.
1832
- helperPriorOutput: (summary) => ({ agentKind: CI_AGENT_KIND, output: summary }),
1833
- onExhausted: async ({ workspaceId, instance, block, step, summary }) => {
1834
- const attempts = step.gate?.attempts ?? 0;
1835
- await this.raiseCiFailed(workspaceId, instance, block, summary ?? '', attempts);
1836
- return {
1837
- error: `CI did not pass after ${attempts} CI-fixer attempt(s). ${summary ?? ''}`.trim(),
1838
- };
1839
- },
1840
- },
1841
- // Conflicts gate: check PR mergeability; escalate to a `conflict-resolver` on conflict.
1842
- {
1843
- kind: CONFLICTS_AGENT_KIND,
1844
- helperKind: CONFLICT_RESOLVER_AGENT_KIND,
1845
- wired: () => !!this.mergeabilityProvider,
1846
- unwiredOutput: 'Conflict gate skipped (no mergeability provider configured).',
1847
- // Unlike CI (where each fixer round gets fresh red-check output to act on), a
1848
- // conflict retry re-merges the SAME base and gets no new signal, so a large
1849
- // budget just burns containers re-attempting the same conflict (observed in
1850
- // prod: 10 attempts, head SHA never moved, run failed). Cap it low and fail
1851
- // fast to a manual-resolution notification instead of churning to CI's default
1852
- // of 10.
1853
- attemptBudget: () => CONFLICT_RESOLVER_MAX_ATTEMPTS,
1854
- probe: async (workspaceId, blockId) => {
1855
- const report = await this.mergeabilityProvider.getMergeability(workspaceId, blockId);
1856
- // No PR resolved, or it merges cleanly → nothing to do; advance.
1857
- if (report.headSha === null || report.verdict === 'mergeable') {
1858
- return {
1859
- status: 'pass',
1860
- headSha: report.headSha,
1861
- passOutput: report.headSha === null
1862
- ? 'Conflict gate passed: no open PR to gate.'
1863
- : 'Conflict gate passed: the PR merges cleanly with its base.',
1864
- };
1865
- }
1866
- // GitHub still computing mergeability → keep polling.
1867
- if (report.verdict === 'unknown')
1868
- return { status: 'pending', headSha: report.headSha };
1869
- return { status: 'fail', headSha: report.headSha };
1870
- },
1871
- onExhausted: async ({ step }) => ({
1872
- error: `The pull request still conflicts with its base after ` +
1873
- `${step.gate?.attempts ?? 0} conflict-resolver attempt(s). Resolve the conflict ` +
1874
- `manually, then retry the run.`,
1875
- }),
1876
- },
1877
- // Post-release-health gate: after deploy, watch the release's Datadog monitors/SLOs
1878
- // over a window; escalate to the `on-call` agent on a regression (it investigates,
1879
- // it does NOT fix prod, so its completion is resolved specially — see
1880
- // resolveOnCallStep — rather than re-probing to green).
1881
- {
1882
- kind: POST_RELEASE_HEALTH_AGENT_KIND,
1883
- helperKind: ON_CALL_AGENT_KIND,
1884
- wired: () => !!this.releaseHealthProvider,
1885
- unwiredOutput: 'Post-release health gate skipped (no release-health provider configured).',
1886
- attemptBudget: (preset) => preset.releaseMaxAttempts,
1887
- // Running out of poll budget while still watching means the window outlasted the
1888
- // driver's budget with NO regression observed — a healthy pass, not a timeout.
1889
- pollExhaustion: 'pass',
1890
- probe: async (workspaceId, blockId, gateState) => {
1891
- // Only watch a release that actually SHIPPED. The merger sets the block `done`
1892
- // when it merges for real, but leaves it `pr_ready` when it raises a review
1893
- // (assessment outside thresholds) without merging — and a no-merger pipeline
1894
- // also never auto-merges. There is nothing deployed to watch in those cases, so
1895
- // pass through immediately instead of polling Datadog (and possibly escalating
1896
- // an on-call investigation) for a change that was never released.
1897
- const block = await this.blockRepository.get(workspaceId, blockId);
1898
- if (!block || block.status !== 'done') {
1899
- return {
1900
- status: 'pass',
1901
- headSha: null,
1902
- passOutput: 'Post-release health gate skipped: the PR was not merged (nothing deployed to watch).',
1903
- };
1904
- }
1905
- const since = gateState.watchSince ?? this.clock.now();
1906
- const report = await this.releaseHealthProvider.probe(workspaceId, blockId, since);
1907
- // No signals configured for this block → nothing to watch; advance immediately
1908
- // (don't park for the whole window on an unmapped release).
1909
- if (report.signals.length === 0) {
1910
- return {
1911
- status: 'pass',
1912
- headSha: null,
1913
- passOutput: 'Post-release health gate passed: no monitors/SLOs configured for this release.',
1914
- };
1915
- }
1916
- // The watch window is resolved ONCE on first entry and stashed on the gate
1917
- // state (see evaluateGate), so the probe doesn't re-load the block + re-resolve
1918
- // the merge preset on every poll over the window.
1919
- const windowMinutes = gateState.watchWindowMinutes ?? DEFAULT_MERGE_PRESET.releaseWatchWindowMinutes;
1920
- const windowElapsed = this.clock.now() - since >= windowMinutes * 60_000;
1921
- const verdict = classifyReleaseHealth({ report, windowElapsed });
1922
- if (verdict === 'pass') {
1923
- return {
1924
- status: 'pass',
1925
- headSha: null,
1926
- passOutput: `Post-release health gate passed: ${report.signals.length} signal(s) healthy through the watch window.`,
1927
- };
1928
- }
1929
- if (verdict === 'pending')
1930
- return { status: 'pending', headSha: null };
1931
- return {
1932
- status: 'fail',
1933
- headSha: null,
1934
- failureSummary: describeRegressedSignals(report.signals),
1935
- };
1936
- },
1937
- // The on-call agent gets the full evidence bundle (regressed signals + recent
1938
- // error logs), gathered fresh at dispatch.
1939
- gatherHelperPriorOutputs: async (workspaceId, blockId, gateState) => {
1940
- const since = gateState.watchSince ?? this.clock.now();
1941
- const evidence = await this.releaseHealthProvider.gatherEvidence(workspaceId, blockId, since);
1942
- // Stash the regressed signals on the gate state so the on-call COMPLETION handler
1943
- // (resolveOnCallStep) builds the notification + incident enrichment from the SAME
1944
- // evidence the agent investigated — rather than re-reading Datadog a third time
1945
- // (which also risks disagreeing with what the agent saw if the window moved).
1946
- // The caller spreads `...step.gate` right after, so this mutation persists.
1947
- gateState.regressedSignals = evidence.regressedSignals;
1948
- return [
1949
- { agentKind: POST_RELEASE_HEALTH_AGENT_KIND, output: renderReleaseEvidence(evidence) },
1950
- ];
1951
- },
1952
- onExhausted: async ({ workspaceId, instance, block, step, summary }) => {
1953
- // Reached when releaseMaxAttempts is 0 (operator disabled the on-call
1954
- // investigation) or there is no async executor to escalate to — a FAILED
1955
- // investigation is handled in pollAgentJob, not here. Alert a human via the
1956
- // notification (with any signals already captured), then flag the run.
1957
- await this.raiseReleaseRegression(workspaceId, instance, block, null, step.gate?.regressedSignals ?? [], summary ?? '');
1958
- return {
1959
- error: `Post-release health regressed and no on-call investigation was configured. ${summary ?? ''}`.trim(),
1960
- };
1961
- },
1962
- },
1963
- ];
1964
- const map = new Map(gates.map((gate) => [gate.kind, gate]));
1965
- // Merge deployment-registered gates. The built-ins above stay inline (their closures
1966
- // capture `this` for the engine-held providers + the typed `raiseCiFailed` /
1967
- // `raiseReleaseRegression` notifications a generic context can't reproduce); a
1968
- // registered gate instead receives a minimal {@link GateContext}. A registered gate of
1969
- // the same kind replaces the built-in (last registration wins, like registerAgentKind).
1765
+ // The built-in gate suite (ci / conflicts / post-release-health) is no longer inline:
1766
+ // it ships as `@cat-factory/gates`, registered through the SAME public `registerGate`
1767
+ // seam any deployment uses (the dogfood — if the platform's own gates can be authored
1768
+ // as an external package, so can anyone's). The engine merely builds whatever gates were
1769
+ // registered at startup. A facade that forgot to `import '@cat-factory/gates'` then has
1770
+ // no gates and those steps fail — which the cross-runtime conformance suite catches.
1771
+ const map = new Map();
1970
1772
  const ctx = this.makeGateContext();
1971
1773
  for (const { kind, factory } of registeredGateFactories())
1972
1774
  map.set(kind, factory(ctx));
@@ -2168,23 +1970,6 @@ export class ExecutionService {
2168
1970
  return;
2169
1971
  await svc.clearWaitingDecision(workspaceId, instance.blockId);
2170
1972
  }
2171
- /** Raise a `ci_failed` notification when the CI gate exhausts its fixer budget. */
2172
- async raiseCiFailed(workspaceId, instance, block, summary, attempts) {
2173
- if (!this.notificationService)
2174
- return;
2175
- await this.notificationService.raise(workspaceId, {
2176
- type: 'ci_failed',
2177
- blockId: block.id,
2178
- executionId: instance.id,
2179
- title: `CI is still failing for "${block.title}"`,
2180
- body: `The CI-fixer agent tried ${attempts} time(s) but CI is still red. ${summary} ` +
2181
- `Take a look and retry the run once fixed.`,
2182
- payload: {
2183
- ...(block.pullRequest?.url ? { prUrl: block.pullRequest.url } : {}),
2184
- pipelineName: instance.pipelineName,
2185
- },
2186
- });
2187
- }
2188
1973
  /** Provision inputs (`{{input.*}}`) derived from the block under deployment. */
2189
1974
  deployInputs(block) {
2190
1975
  const inputs = {
@@ -2860,95 +2645,6 @@ export class ExecutionService {
2860
2645
  }
2861
2646
  return DEFAULT_MERGE_PRESET;
2862
2647
  }
2863
- /**
2864
- * Resolve a finished `on-call` investigation (the post-release-health gate's helper):
2865
- * parse its assessment, raise a `release_regression` notification for a human, enrich
2866
- * any incident PagerDuty/incident.io already opened, then finish the gate step so the
2867
- * run completes (the human acts on the notification out-of-band — the engine never
2868
- * auto-reverts). Best-effort on the side-effects; the step always finishes.
2869
- */
2870
- async resolveOnCallStep(workspaceId, instance, step, block, result, isFinalStep, investigationFailed = false) {
2871
- let assessment = null;
2872
- try {
2873
- assessment = parseOnCallAssessment(result.onCallAssessment);
2874
- }
2875
- catch {
2876
- assessment = null;
2877
- }
2878
- // Reuse the regressed signals captured when the gate escalated (see the gate's
2879
- // gatherHelperPriorOutputs) so the notification + incident enrichment reflect exactly
2880
- // what the on-call agent investigated and we don't re-read Datadog a third time. Only
2881
- // fall back to a fresh gather if they weren't persisted (e.g. an older parked run).
2882
- const since = step.gate?.watchSince ?? this.clock.now();
2883
- let regressedSignals = step.gate?.regressedSignals ?? [];
2884
- if (regressedSignals.length === 0 && this.releaseHealthProvider) {
2885
- try {
2886
- const evidence = await this.releaseHealthProvider.gatherEvidence(workspaceId, block.id, since);
2887
- regressedSignals = evidence.regressedSignals;
2888
- }
2889
- catch {
2890
- // best-effort: the assessment + summary still drive the notification
2891
- }
2892
- }
2893
- const baseSummary = step.gate?.lastFailureSummary ?? '';
2894
- const summary = investigationFailed
2895
- ? `${baseSummary} The automated on-call investigation could not complete, so no culprit assessment is available — investigate manually.`.trim()
2896
- : baseSummary;
2897
- await this.raiseReleaseRegression(workspaceId, instance, block, assessment, regressedSignals, summary);
2898
- await this.enrichIncident(workspaceId, block, assessment, regressedSignals, since);
2899
- const output = assessment
2900
- ? `On-call investigation: ${assessment.recommendation} (culprit confidence ${pct(assessment.culpritConfidence)}). ${assessment.rationale}`
2901
- : investigationFailed
2902
- ? 'On-call investigation did not complete; raised a release-regression notification for manual triage.'
2903
- : 'On-call investigation completed; see the release-regression notification.';
2904
- return this.recordStepResult(workspaceId, instance, step, isFinalStep, { ...result, output });
2905
- }
2906
- /** Raise a `release_regression` notification carrying the on-call assessment + signals. */
2907
- async raiseReleaseRegression(workspaceId, instance, block, assessment, signals, summary) {
2908
- if (!this.notificationService)
2909
- return;
2910
- const body = assessment
2911
- ? `Post-release monitoring flagged a regression after this PR shipped. On-call recommends ` +
2912
- `**${assessment.recommendation}** (culprit confidence ${pct(assessment.culpritConfidence)}). ` +
2913
- `${assessment.rationale}`
2914
- : `Post-release monitoring flagged a regression after this PR shipped. ${summary} ` +
2915
- `Investigate before deciding whether to revert.`;
2916
- await this.notificationService.raise(workspaceId, {
2917
- type: 'release_regression',
2918
- blockId: block.id,
2919
- executionId: instance.id,
2920
- title: `Release regression for "${block.title}"`,
2921
- body,
2922
- payload: {
2923
- ...(assessment ? { onCallAssessment: assessment } : {}),
2924
- ...(signals.length ? { releaseSignals: signals } : {}),
2925
- ...(block.pullRequest?.url ? { prUrl: block.pullRequest.url } : {}),
2926
- pipelineName: instance.pipelineName,
2927
- },
2928
- });
2929
- }
2930
- /**
2931
- * Best-effort: annotate an incident PagerDuty / incident.io already opened (from the
2932
- * same monitors/SLOs) with the on-call investigation. NOT alerting — those systems
2933
- * already paged. A no-op when no provider is wired or no matching incident exists.
2934
- */
2935
- async enrichIncident(workspaceId, block, assessment, signals, since) {
2936
- if (!this.incidentEnrichment)
2937
- return;
2938
- const update = {
2939
- title: `Regression suspected from "${block.title}"`,
2940
- body: assessment
2941
- ? `${assessment.rationale} (recommendation: ${assessment.recommendation}, culprit confidence ${pct(assessment.culpritConfidence)})`
2942
- : 'cat-factory on-call investigated a post-release regression suspected from this change.',
2943
- ...(block.pullRequest?.url ? { prUrl: block.pullRequest.url } : {}),
2944
- };
2945
- try {
2946
- await this.incidentEnrichment.enrich({ workspaceId, signalIds: signals.map((s) => s.id), since }, update);
2947
- }
2948
- catch {
2949
- // best-effort: a failing enrichment must not block the run or the notification
2950
- }
2951
- }
2952
2648
  /** Raise a `pipeline_complete` notification for a no-merger run awaiting confirmation. */
2953
2649
  async raisePipelineComplete(workspaceId, instance, block) {
2954
2650
  if (!this.notificationService)