@cat-factory/orchestration 0.18.1 → 0.19.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/container.d.ts +1 -9
- package/dist/container.d.ts.map +1 -1
- package/dist/container.js.map +1 -1
- package/dist/modules/board/BoardService.d.ts.map +1 -1
- package/dist/modules/board/BoardService.js +3 -1
- package/dist/modules/board/BoardService.js.map +1 -1
- package/dist/modules/execution/ExecutionService.d.ts +2 -49
- package/dist/modules/execution/ExecutionService.d.ts.map +1 -1
- package/dist/modules/execution/ExecutionService.js +36 -340
- package/dist/modules/execution/ExecutionService.js.map +1 -1
- package/dist/modules/execution/ci.logic.d.ts +1 -41
- package/dist/modules/execution/ci.logic.d.ts.map +1 -1
- package/dist/modules/execution/ci.logic.js +5 -56
- package/dist/modules/execution/ci.logic.js.map +1 -1
- package/dist/modules/execution/release.logic.d.ts +1 -42
- package/dist/modules/execution/release.logic.d.ts.map +1 -1
- package/dist/modules/execution/release.logic.js +5 -48
- package/dist/modules/execution/release.logic.js.map +1 -1
- package/dist/modules/requirements/requirements.logic.d.ts.map +1 -1
- package/dist/modules/requirements/requirements.logic.js.map +1 -1
- package/package.json +6 -6
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { parseBlueprintService,
|
|
1
|
+
import { parseBlueprintService, parseSpecDoc, DEFAULT_COMPANION_MAX_ATTEMPTS, } from '@cat-factory/contracts';
|
|
2
2
|
import { blueprintPostOp, companionFor, companionTargets, isCompanionKind, registeredAgentStep, registeredPreOps, registeredPostOps, runRepoOps, specPostOp, TASK_ESTIMATOR_AGENT_KIND, } from '@cat-factory/agents';
|
|
3
3
|
import { coerceTaskEstimate, summarizeEstimate } from '../estimation/estimate.logic.js';
|
|
4
4
|
import { validatePipelineShape } from '../pipelines/pipelineShape.js';
|
|
@@ -7,8 +7,7 @@ import { reviewableArtifactOutput } from './artifact-review.logic.js';
|
|
|
7
7
|
import { resolveIndividualVendors, } from './individualVendors.logic.js';
|
|
8
8
|
import { assertFound, ConflictError, getErrorMessage, isModelUsable, NotFoundError, sameSubtasks, ValidationError, } from '@cat-factory/kernel';
|
|
9
9
|
import { DEFAULT_MERGE_PRESET } from '@cat-factory/kernel';
|
|
10
|
-
import {
|
|
11
|
-
import { POST_RELEASE_HEALTH_AGENT_KIND, ON_CALL_AGENT_KIND, classifyReleaseHealth, describeRegressedSignals, } from './release.logic.js';
|
|
10
|
+
import { CONFLICTS_AGENT_KIND, MERGER_AGENT_KIND, REQUIREMENTS_REVIEW_AGENT_KIND, CLARITY_REVIEW_AGENT_KIND, BUG_INVESTIGATOR_AGENT_KIND, TRACKER_AGENT_KIND, ANALYSIS_AGENT_KIND, TESTER_AGENT_KIND, HUMAN_TEST_AGENT_KIND, BLUEPRINTS_AGENT_KIND, SPEC_WRITER_AGENT_KIND, } from './ci.logic.js';
|
|
12
11
|
import { AgentContextBuilder } from './AgentContextBuilder.js';
|
|
13
12
|
import { CompanionController } from './CompanionController.js';
|
|
14
13
|
import { inferTechnicalLabel } from './technical.logic.js';
|
|
@@ -24,14 +23,6 @@ import { requireWorkspace } from '@cat-factory/kernel';
|
|
|
24
23
|
import { planResumedSteps, planRestartFromStep } from './retry.logic.js';
|
|
25
24
|
import { isContainerEvictionError, isTransientEviction, MAX_EVICTION_RECOVERIES, MAX_TRANSIENT_EVICTION_RECOVERIES, } from './job.logic.js';
|
|
26
25
|
import { decideTesterInfra, resolveTesterEnvironment, TESTER_INFRA_MESSAGES, } from './tester-infra.logic.js';
|
|
27
|
-
/**
|
|
28
|
-
* Max `conflict-resolver` escalations before the conflicts gate gives up. Deliberately
|
|
29
|
-
* far below CI's budget (`ciMaxAttempts`, default 10): a conflict retry re-merges the
|
|
30
|
-
* SAME base with no new signal, so extra attempts just burn containers re-attempting an
|
|
31
|
-
* identical conflict. Three gives the (now conflict-aware) resolver a couple of shots at
|
|
32
|
-
* model variance, then fails fast to a manual-resolution notification.
|
|
33
|
-
*/
|
|
34
|
-
const CONFLICT_RESOLVER_MAX_ATTEMPTS = 3;
|
|
35
26
|
/**
|
|
36
27
|
* "What to do next" guidance per failure kind a pipeline run can produce, shown
|
|
37
28
|
* under the failure banner on the board (mirrors bootstrap's FAILURE_HINTS). Only
|
|
@@ -47,10 +38,6 @@ const EXECUTION_FAILURE_HINTS = {
|
|
|
47
38
|
cancelled: 'You stopped this run; its container was killed. Retry to start it again.',
|
|
48
39
|
unknown: 'The run failed for an unclassified reason. Review the run, then retry.',
|
|
49
40
|
};
|
|
50
|
-
/** Format a 0..1 score as a rounded percentage for notification copy. */
|
|
51
|
-
function pct(score) {
|
|
52
|
-
return `${Math.round(score * 100)}%`;
|
|
53
|
-
}
|
|
54
41
|
/**
|
|
55
42
|
* Parse `owner`/`repo` from a GitHub pull-request URL (`https://github.com/o/r/pull/42`).
|
|
56
43
|
* Returns undefined for any URL that doesn't carry both segments. Host-agnostic on
|
|
@@ -62,35 +49,6 @@ function parseRepoFromPullUrl(url) {
|
|
|
62
49
|
return undefined;
|
|
63
50
|
return { owner: match[1], repo: match[2] };
|
|
64
51
|
}
|
|
65
|
-
/**
|
|
66
|
-
* Render the Datadog evidence bundle into the prior-output text the on-call agent reads:
|
|
67
|
-
* the regressed monitors/SLOs, recent error groups, and the investigation brief (correlate
|
|
68
|
-
* the diff with the signals, return a JSON assessment, do NOT revert).
|
|
69
|
-
*/
|
|
70
|
-
function renderReleaseEvidence(evidence) {
|
|
71
|
-
const lines = ['## Post-release regression evidence', ''];
|
|
72
|
-
if (evidence.regressedSignals.length > 0) {
|
|
73
|
-
lines.push('Regressed signals:');
|
|
74
|
-
for (const s of evidence.regressedSignals) {
|
|
75
|
-
lines.push(`- ${s.kind} "${s.name}" (${s.id}): ${s.state}${s.detail ? ` — ${s.detail}` : ''}`);
|
|
76
|
-
}
|
|
77
|
-
lines.push('');
|
|
78
|
-
}
|
|
79
|
-
if (evidence.errors.length > 0) {
|
|
80
|
-
lines.push('Recent errors:');
|
|
81
|
-
for (const e of evidence.errors) {
|
|
82
|
-
lines.push(`- ${e.title}${e.count != null ? ` ×${e.count}` : ''}${e.sampleMessage ? ` — ${e.sampleMessage}` : ''}`);
|
|
83
|
-
}
|
|
84
|
-
lines.push('');
|
|
85
|
-
}
|
|
86
|
-
if (evidence.notes)
|
|
87
|
-
lines.push(evidence.notes, '');
|
|
88
|
-
lines.push('Investigate whether THIS PR is the likely cause: correlate its diff with the regressed ' +
|
|
89
|
-
'signals and errors above (and the service logs). Beware correlation ≠ causation. Return a ' +
|
|
90
|
-
'JSON assessment: { "culpritConfidence": 0..1, "recommendation": "revert"|"hold"|"monitor", ' +
|
|
91
|
-
'"rationale": "…", "evidence": ["…"] }. Do NOT make commits or revert anything — a human decides.');
|
|
92
|
-
return lines.join('\n');
|
|
93
|
-
}
|
|
94
52
|
/**
|
|
95
53
|
* The execution engine. It orchestrates a pipeline of agent-performed steps and
|
|
96
54
|
* is fully deterministic: `advanceInstance` moves one run forward by exactly one
|
|
@@ -139,10 +97,6 @@ export class ExecutionService {
|
|
|
139
97
|
notificationService;
|
|
140
98
|
workspaceSettingsService;
|
|
141
99
|
llmObservability;
|
|
142
|
-
ciStatusProvider;
|
|
143
|
-
mergeabilityProvider;
|
|
144
|
-
releaseHealthProvider;
|
|
145
|
-
incidentEnrichment;
|
|
146
100
|
prMerger;
|
|
147
101
|
mergePresetRepository;
|
|
148
102
|
ticketTrackerProvider;
|
|
@@ -165,7 +119,7 @@ export class ExecutionService {
|
|
|
165
119
|
* {@link stepResolverFor} and {@link StepCompletionResolver}.
|
|
166
120
|
*/
|
|
167
121
|
stepResolverCache;
|
|
168
|
-
constructor({ workspaceRepository, blockRepository, pipelineRepository, executionRepository, accountRepository, idGenerator, clock, agentExecutor, workRunner, executionEventPublisher, boardService, spendService, documentRepository, taskRepository, requirementReviewRepository, requirementReviewService, clarityReviewRepository, clarityReviewService, fragmentResolver, environmentProvisioning, environmentTeardown, branchUpdater, blueprintReconciler, notificationService, workspaceSettingsService, llmObservability,
|
|
122
|
+
constructor({ workspaceRepository, blockRepository, pipelineRepository, executionRepository, accountRepository, idGenerator, clock, agentExecutor, workRunner, executionEventPublisher, boardService, spendService, documentRepository, taskRepository, requirementReviewRepository, requirementReviewService, clarityReviewRepository, clarityReviewService, fragmentResolver, environmentProvisioning, environmentTeardown, branchUpdater, blueprintReconciler, notificationService, workspaceSettingsService, llmObservability, pullRequestMerger, mergePresetRepository, ticketTrackerProvider, issueWriteback, subscriptionActivationRepository, resolveWorkspaceModelDefault, resolveProviderCapabilities, localTestInfraSupported, resolveRunRepoContext, runInitiatorScope, }) {
|
|
169
123
|
this.runInitiatorScope = runInitiatorScope ?? ((_initiatedBy, fn) => fn());
|
|
170
124
|
this.workspaceRepository = workspaceRepository;
|
|
171
125
|
this.blockRepository = blockRepository;
|
|
@@ -294,10 +248,6 @@ export class ExecutionService {
|
|
|
294
248
|
this.notificationService = notificationService;
|
|
295
249
|
this.workspaceSettingsService = workspaceSettingsService;
|
|
296
250
|
this.llmObservability = llmObservability;
|
|
297
|
-
this.ciStatusProvider = ciStatusProvider;
|
|
298
|
-
this.mergeabilityProvider = mergeabilityProvider;
|
|
299
|
-
this.releaseHealthProvider = releaseHealthProvider;
|
|
300
|
-
this.incidentEnrichment = incidentEnrichment;
|
|
301
251
|
this.prMerger = pullRequestMerger;
|
|
302
252
|
this.mergePresetRepository = mergePresetRepository;
|
|
303
253
|
this.ticketTrackerProvider = ticketTrackerProvider;
|
|
@@ -926,16 +876,15 @@ export class ExecutionService {
|
|
|
926
876
|
}
|
|
927
877
|
return { kind: 'awaiting_job', jobId: step.jobId, stepIndex: instance.currentStep };
|
|
928
878
|
}
|
|
929
|
-
//
|
|
930
|
-
//
|
|
931
|
-
//
|
|
932
|
-
//
|
|
933
|
-
//
|
|
934
|
-
//
|
|
935
|
-
//
|
|
936
|
-
|
|
937
|
-
|
|
938
|
-
if (step.agentKind === POST_RELEASE_HEALTH_AGENT_KIND &&
|
|
879
|
+
// A gate whose helper INVESTIGATES instead of fixing (post-release-health → on-call)
|
|
880
|
+
// declares a `resolveHelperCompletion` hook on its definition. When such a helper's job
|
|
881
|
+
// settles — done OR failed — we call the hook INSTEAD of re-probing the precheck
|
|
882
|
+
// (re-probing an investigate-don't-fix helper would just regress again and burn the
|
|
883
|
+
// budget) and finish the gate step with the output it returns. The gate raises its own
|
|
884
|
+
// `release_regression` notification + enriches any open incident inside the hook (from the
|
|
885
|
+
// signals stashed at escalation); the run then completes for a human to act out-of-band.
|
|
886
|
+
const completionGate = this.gateFor(step.agentKind);
|
|
887
|
+
if (completionGate?.resolveHelperCompletion &&
|
|
939
888
|
step.gate?.phase === 'working' &&
|
|
940
889
|
(update.state === 'done' || update.state === 'failed')) {
|
|
941
890
|
const block = await this.blockRepository.get(workspaceId, instance.blockId);
|
|
@@ -944,10 +893,23 @@ export class ExecutionService {
|
|
|
944
893
|
if (!block)
|
|
945
894
|
return { kind: 'noop' };
|
|
946
895
|
const isFinalStep = instance.currentStep === instance.steps.length - 1;
|
|
947
|
-
const
|
|
948
|
-
? update.result
|
|
949
|
-
: {
|
|
950
|
-
|
|
896
|
+
const jobResult = update.state === 'done'
|
|
897
|
+
? { state: 'done', result: update.result }
|
|
898
|
+
: { state: 'failed', error: update.error ?? null };
|
|
899
|
+
const resolution = await completionGate.resolveHelperCompletion({
|
|
900
|
+
workspaceId,
|
|
901
|
+
instance,
|
|
902
|
+
block,
|
|
903
|
+
step,
|
|
904
|
+
result: jobResult,
|
|
905
|
+
});
|
|
906
|
+
// Preserve the done-result's fields (usage metering etc.) while recording the gate's
|
|
907
|
+
// resolved output; a failed investigation has no result to carry.
|
|
908
|
+
const base = update.state === 'done' ? update.result : { output: '' };
|
|
909
|
+
return this.recordStepResult(workspaceId, instance, step, isFinalStep, {
|
|
910
|
+
...base,
|
|
911
|
+
output: resolution.output,
|
|
912
|
+
});
|
|
951
913
|
}
|
|
952
914
|
// A polling gate step's in-flight job is its helper agent (ci-fixer /
|
|
953
915
|
// conflict-resolver), NOT the step's own work: when it finishes (or fails) we
|
|
@@ -1800,173 +1762,13 @@ export class ExecutionService {
|
|
|
1800
1762
|
return { runInitiatorScope: this.runInitiatorScope };
|
|
1801
1763
|
}
|
|
1802
1764
|
buildGateRegistry() {
|
|
1803
|
-
|
|
1804
|
-
|
|
1805
|
-
|
|
1806
|
-
|
|
1807
|
-
|
|
1808
|
-
|
|
1809
|
-
|
|
1810
|
-
probe: async (workspaceId, blockId) => {
|
|
1811
|
-
const report = await this.ciStatusProvider.getStatus(workspaceId, blockId);
|
|
1812
|
-
const verdict = aggregateCi(report.checks);
|
|
1813
|
-
if (isCiGreen(verdict)) {
|
|
1814
|
-
return {
|
|
1815
|
-
status: 'pass',
|
|
1816
|
-
headSha: report.headSha,
|
|
1817
|
-
passOutput: verdict === 'none'
|
|
1818
|
-
? 'CI gate passed: no checks configured for the PR head.'
|
|
1819
|
-
: `CI gate passed: ${report.checks.length} check(s) green.`,
|
|
1820
|
-
};
|
|
1821
|
-
}
|
|
1822
|
-
if (verdict === 'pending')
|
|
1823
|
-
return { status: 'pending', headSha: report.headSha };
|
|
1824
|
-
return {
|
|
1825
|
-
status: 'fail',
|
|
1826
|
-
headSha: report.headSha,
|
|
1827
|
-
failureSummary: describeFailingChecks(report.checks),
|
|
1828
|
-
failingChecks: listFailingChecks(report.checks),
|
|
1829
|
-
};
|
|
1830
|
-
},
|
|
1831
|
-
// Surface the failing-check summary to the fixer as resolved context.
|
|
1832
|
-
helperPriorOutput: (summary) => ({ agentKind: CI_AGENT_KIND, output: summary }),
|
|
1833
|
-
onExhausted: async ({ workspaceId, instance, block, step, summary }) => {
|
|
1834
|
-
const attempts = step.gate?.attempts ?? 0;
|
|
1835
|
-
await this.raiseCiFailed(workspaceId, instance, block, summary ?? '', attempts);
|
|
1836
|
-
return {
|
|
1837
|
-
error: `CI did not pass after ${attempts} CI-fixer attempt(s). ${summary ?? ''}`.trim(),
|
|
1838
|
-
};
|
|
1839
|
-
},
|
|
1840
|
-
},
|
|
1841
|
-
// Conflicts gate: check PR mergeability; escalate to a `conflict-resolver` on conflict.
|
|
1842
|
-
{
|
|
1843
|
-
kind: CONFLICTS_AGENT_KIND,
|
|
1844
|
-
helperKind: CONFLICT_RESOLVER_AGENT_KIND,
|
|
1845
|
-
wired: () => !!this.mergeabilityProvider,
|
|
1846
|
-
unwiredOutput: 'Conflict gate skipped (no mergeability provider configured).',
|
|
1847
|
-
// Unlike CI (where each fixer round gets fresh red-check output to act on), a
|
|
1848
|
-
// conflict retry re-merges the SAME base and gets no new signal, so a large
|
|
1849
|
-
// budget just burns containers re-attempting the same conflict (observed in
|
|
1850
|
-
// prod: 10 attempts, head SHA never moved, run failed). Cap it low and fail
|
|
1851
|
-
// fast to a manual-resolution notification instead of churning to CI's default
|
|
1852
|
-
// of 10.
|
|
1853
|
-
attemptBudget: () => CONFLICT_RESOLVER_MAX_ATTEMPTS,
|
|
1854
|
-
probe: async (workspaceId, blockId) => {
|
|
1855
|
-
const report = await this.mergeabilityProvider.getMergeability(workspaceId, blockId);
|
|
1856
|
-
// No PR resolved, or it merges cleanly → nothing to do; advance.
|
|
1857
|
-
if (report.headSha === null || report.verdict === 'mergeable') {
|
|
1858
|
-
return {
|
|
1859
|
-
status: 'pass',
|
|
1860
|
-
headSha: report.headSha,
|
|
1861
|
-
passOutput: report.headSha === null
|
|
1862
|
-
? 'Conflict gate passed: no open PR to gate.'
|
|
1863
|
-
: 'Conflict gate passed: the PR merges cleanly with its base.',
|
|
1864
|
-
};
|
|
1865
|
-
}
|
|
1866
|
-
// GitHub still computing mergeability → keep polling.
|
|
1867
|
-
if (report.verdict === 'unknown')
|
|
1868
|
-
return { status: 'pending', headSha: report.headSha };
|
|
1869
|
-
return { status: 'fail', headSha: report.headSha };
|
|
1870
|
-
},
|
|
1871
|
-
onExhausted: async ({ step }) => ({
|
|
1872
|
-
error: `The pull request still conflicts with its base after ` +
|
|
1873
|
-
`${step.gate?.attempts ?? 0} conflict-resolver attempt(s). Resolve the conflict ` +
|
|
1874
|
-
`manually, then retry the run.`,
|
|
1875
|
-
}),
|
|
1876
|
-
},
|
|
1877
|
-
// Post-release-health gate: after deploy, watch the release's Datadog monitors/SLOs
|
|
1878
|
-
// over a window; escalate to the `on-call` agent on a regression (it investigates,
|
|
1879
|
-
// it does NOT fix prod, so its completion is resolved specially — see
|
|
1880
|
-
// resolveOnCallStep — rather than re-probing to green).
|
|
1881
|
-
{
|
|
1882
|
-
kind: POST_RELEASE_HEALTH_AGENT_KIND,
|
|
1883
|
-
helperKind: ON_CALL_AGENT_KIND,
|
|
1884
|
-
wired: () => !!this.releaseHealthProvider,
|
|
1885
|
-
unwiredOutput: 'Post-release health gate skipped (no release-health provider configured).',
|
|
1886
|
-
attemptBudget: (preset) => preset.releaseMaxAttempts,
|
|
1887
|
-
// Running out of poll budget while still watching means the window outlasted the
|
|
1888
|
-
// driver's budget with NO regression observed — a healthy pass, not a timeout.
|
|
1889
|
-
pollExhaustion: 'pass',
|
|
1890
|
-
probe: async (workspaceId, blockId, gateState) => {
|
|
1891
|
-
// Only watch a release that actually SHIPPED. The merger sets the block `done`
|
|
1892
|
-
// when it merges for real, but leaves it `pr_ready` when it raises a review
|
|
1893
|
-
// (assessment outside thresholds) without merging — and a no-merger pipeline
|
|
1894
|
-
// also never auto-merges. There is nothing deployed to watch in those cases, so
|
|
1895
|
-
// pass through immediately instead of polling Datadog (and possibly escalating
|
|
1896
|
-
// an on-call investigation) for a change that was never released.
|
|
1897
|
-
const block = await this.blockRepository.get(workspaceId, blockId);
|
|
1898
|
-
if (!block || block.status !== 'done') {
|
|
1899
|
-
return {
|
|
1900
|
-
status: 'pass',
|
|
1901
|
-
headSha: null,
|
|
1902
|
-
passOutput: 'Post-release health gate skipped: the PR was not merged (nothing deployed to watch).',
|
|
1903
|
-
};
|
|
1904
|
-
}
|
|
1905
|
-
const since = gateState.watchSince ?? this.clock.now();
|
|
1906
|
-
const report = await this.releaseHealthProvider.probe(workspaceId, blockId, since);
|
|
1907
|
-
// No signals configured for this block → nothing to watch; advance immediately
|
|
1908
|
-
// (don't park for the whole window on an unmapped release).
|
|
1909
|
-
if (report.signals.length === 0) {
|
|
1910
|
-
return {
|
|
1911
|
-
status: 'pass',
|
|
1912
|
-
headSha: null,
|
|
1913
|
-
passOutput: 'Post-release health gate passed: no monitors/SLOs configured for this release.',
|
|
1914
|
-
};
|
|
1915
|
-
}
|
|
1916
|
-
// The watch window is resolved ONCE on first entry and stashed on the gate
|
|
1917
|
-
// state (see evaluateGate), so the probe doesn't re-load the block + re-resolve
|
|
1918
|
-
// the merge preset on every poll over the window.
|
|
1919
|
-
const windowMinutes = gateState.watchWindowMinutes ?? DEFAULT_MERGE_PRESET.releaseWatchWindowMinutes;
|
|
1920
|
-
const windowElapsed = this.clock.now() - since >= windowMinutes * 60_000;
|
|
1921
|
-
const verdict = classifyReleaseHealth({ report, windowElapsed });
|
|
1922
|
-
if (verdict === 'pass') {
|
|
1923
|
-
return {
|
|
1924
|
-
status: 'pass',
|
|
1925
|
-
headSha: null,
|
|
1926
|
-
passOutput: `Post-release health gate passed: ${report.signals.length} signal(s) healthy through the watch window.`,
|
|
1927
|
-
};
|
|
1928
|
-
}
|
|
1929
|
-
if (verdict === 'pending')
|
|
1930
|
-
return { status: 'pending', headSha: null };
|
|
1931
|
-
return {
|
|
1932
|
-
status: 'fail',
|
|
1933
|
-
headSha: null,
|
|
1934
|
-
failureSummary: describeRegressedSignals(report.signals),
|
|
1935
|
-
};
|
|
1936
|
-
},
|
|
1937
|
-
// The on-call agent gets the full evidence bundle (regressed signals + recent
|
|
1938
|
-
// error logs), gathered fresh at dispatch.
|
|
1939
|
-
gatherHelperPriorOutputs: async (workspaceId, blockId, gateState) => {
|
|
1940
|
-
const since = gateState.watchSince ?? this.clock.now();
|
|
1941
|
-
const evidence = await this.releaseHealthProvider.gatherEvidence(workspaceId, blockId, since);
|
|
1942
|
-
// Stash the regressed signals on the gate state so the on-call COMPLETION handler
|
|
1943
|
-
// (resolveOnCallStep) builds the notification + incident enrichment from the SAME
|
|
1944
|
-
// evidence the agent investigated — rather than re-reading Datadog a third time
|
|
1945
|
-
// (which also risks disagreeing with what the agent saw if the window moved).
|
|
1946
|
-
// The caller spreads `...step.gate` right after, so this mutation persists.
|
|
1947
|
-
gateState.regressedSignals = evidence.regressedSignals;
|
|
1948
|
-
return [
|
|
1949
|
-
{ agentKind: POST_RELEASE_HEALTH_AGENT_KIND, output: renderReleaseEvidence(evidence) },
|
|
1950
|
-
];
|
|
1951
|
-
},
|
|
1952
|
-
onExhausted: async ({ workspaceId, instance, block, step, summary }) => {
|
|
1953
|
-
// Reached when releaseMaxAttempts is 0 (operator disabled the on-call
|
|
1954
|
-
// investigation) or there is no async executor to escalate to — a FAILED
|
|
1955
|
-
// investigation is handled in pollAgentJob, not here. Alert a human via the
|
|
1956
|
-
// notification (with any signals already captured), then flag the run.
|
|
1957
|
-
await this.raiseReleaseRegression(workspaceId, instance, block, null, step.gate?.regressedSignals ?? [], summary ?? '');
|
|
1958
|
-
return {
|
|
1959
|
-
error: `Post-release health regressed and no on-call investigation was configured. ${summary ?? ''}`.trim(),
|
|
1960
|
-
};
|
|
1961
|
-
},
|
|
1962
|
-
},
|
|
1963
|
-
];
|
|
1964
|
-
const map = new Map(gates.map((gate) => [gate.kind, gate]));
|
|
1965
|
-
// Merge deployment-registered gates. The built-ins above stay inline (their closures
|
|
1966
|
-
// capture `this` for the engine-held providers + the typed `raiseCiFailed` /
|
|
1967
|
-
// `raiseReleaseRegression` notifications a generic context can't reproduce); a
|
|
1968
|
-
// registered gate instead receives a minimal {@link GateContext}. A registered gate of
|
|
1969
|
-
// the same kind replaces the built-in (last registration wins, like registerAgentKind).
|
|
1765
|
+
// The built-in gate suite (ci / conflicts / post-release-health) is no longer inline:
|
|
1766
|
+
// it ships as `@cat-factory/gates`, registered through the SAME public `registerGate`
|
|
1767
|
+
// seam any deployment uses (the dogfood — if the platform's own gates can be authored
|
|
1768
|
+
// as an external package, so can anyone's). The engine merely builds whatever gates were
|
|
1769
|
+
// registered at startup. A facade that forgot to `import '@cat-factory/gates'` then has
|
|
1770
|
+
// no gates and those steps fail — which the cross-runtime conformance suite catches.
|
|
1771
|
+
const map = new Map();
|
|
1970
1772
|
const ctx = this.makeGateContext();
|
|
1971
1773
|
for (const { kind, factory } of registeredGateFactories())
|
|
1972
1774
|
map.set(kind, factory(ctx));
|
|
@@ -2168,23 +1970,6 @@ export class ExecutionService {
|
|
|
2168
1970
|
return;
|
|
2169
1971
|
await svc.clearWaitingDecision(workspaceId, instance.blockId);
|
|
2170
1972
|
}
|
|
2171
|
-
/** Raise a `ci_failed` notification when the CI gate exhausts its fixer budget. */
|
|
2172
|
-
async raiseCiFailed(workspaceId, instance, block, summary, attempts) {
|
|
2173
|
-
if (!this.notificationService)
|
|
2174
|
-
return;
|
|
2175
|
-
await this.notificationService.raise(workspaceId, {
|
|
2176
|
-
type: 'ci_failed',
|
|
2177
|
-
blockId: block.id,
|
|
2178
|
-
executionId: instance.id,
|
|
2179
|
-
title: `CI is still failing for "${block.title}"`,
|
|
2180
|
-
body: `The CI-fixer agent tried ${attempts} time(s) but CI is still red. ${summary} ` +
|
|
2181
|
-
`Take a look and retry the run once fixed.`,
|
|
2182
|
-
payload: {
|
|
2183
|
-
...(block.pullRequest?.url ? { prUrl: block.pullRequest.url } : {}),
|
|
2184
|
-
pipelineName: instance.pipelineName,
|
|
2185
|
-
},
|
|
2186
|
-
});
|
|
2187
|
-
}
|
|
2188
1973
|
/** Provision inputs (`{{input.*}}`) derived from the block under deployment. */
|
|
2189
1974
|
deployInputs(block) {
|
|
2190
1975
|
const inputs = {
|
|
@@ -2860,95 +2645,6 @@ export class ExecutionService {
|
|
|
2860
2645
|
}
|
|
2861
2646
|
return DEFAULT_MERGE_PRESET;
|
|
2862
2647
|
}
|
|
2863
|
-
/**
|
|
2864
|
-
* Resolve a finished `on-call` investigation (the post-release-health gate's helper):
|
|
2865
|
-
* parse its assessment, raise a `release_regression` notification for a human, enrich
|
|
2866
|
-
* any incident PagerDuty/incident.io already opened, then finish the gate step so the
|
|
2867
|
-
* run completes (the human acts on the notification out-of-band — the engine never
|
|
2868
|
-
* auto-reverts). Best-effort on the side-effects; the step always finishes.
|
|
2869
|
-
*/
|
|
2870
|
-
async resolveOnCallStep(workspaceId, instance, step, block, result, isFinalStep, investigationFailed = false) {
|
|
2871
|
-
let assessment = null;
|
|
2872
|
-
try {
|
|
2873
|
-
assessment = parseOnCallAssessment(result.onCallAssessment);
|
|
2874
|
-
}
|
|
2875
|
-
catch {
|
|
2876
|
-
assessment = null;
|
|
2877
|
-
}
|
|
2878
|
-
// Reuse the regressed signals captured when the gate escalated (see the gate's
|
|
2879
|
-
// gatherHelperPriorOutputs) so the notification + incident enrichment reflect exactly
|
|
2880
|
-
// what the on-call agent investigated and we don't re-read Datadog a third time. Only
|
|
2881
|
-
// fall back to a fresh gather if they weren't persisted (e.g. an older parked run).
|
|
2882
|
-
const since = step.gate?.watchSince ?? this.clock.now();
|
|
2883
|
-
let regressedSignals = step.gate?.regressedSignals ?? [];
|
|
2884
|
-
if (regressedSignals.length === 0 && this.releaseHealthProvider) {
|
|
2885
|
-
try {
|
|
2886
|
-
const evidence = await this.releaseHealthProvider.gatherEvidence(workspaceId, block.id, since);
|
|
2887
|
-
regressedSignals = evidence.regressedSignals;
|
|
2888
|
-
}
|
|
2889
|
-
catch {
|
|
2890
|
-
// best-effort: the assessment + summary still drive the notification
|
|
2891
|
-
}
|
|
2892
|
-
}
|
|
2893
|
-
const baseSummary = step.gate?.lastFailureSummary ?? '';
|
|
2894
|
-
const summary = investigationFailed
|
|
2895
|
-
? `${baseSummary} The automated on-call investigation could not complete, so no culprit assessment is available — investigate manually.`.trim()
|
|
2896
|
-
: baseSummary;
|
|
2897
|
-
await this.raiseReleaseRegression(workspaceId, instance, block, assessment, regressedSignals, summary);
|
|
2898
|
-
await this.enrichIncident(workspaceId, block, assessment, regressedSignals, since);
|
|
2899
|
-
const output = assessment
|
|
2900
|
-
? `On-call investigation: ${assessment.recommendation} (culprit confidence ${pct(assessment.culpritConfidence)}). ${assessment.rationale}`
|
|
2901
|
-
: investigationFailed
|
|
2902
|
-
? 'On-call investigation did not complete; raised a release-regression notification for manual triage.'
|
|
2903
|
-
: 'On-call investigation completed; see the release-regression notification.';
|
|
2904
|
-
return this.recordStepResult(workspaceId, instance, step, isFinalStep, { ...result, output });
|
|
2905
|
-
}
|
|
2906
|
-
/** Raise a `release_regression` notification carrying the on-call assessment + signals. */
|
|
2907
|
-
async raiseReleaseRegression(workspaceId, instance, block, assessment, signals, summary) {
|
|
2908
|
-
if (!this.notificationService)
|
|
2909
|
-
return;
|
|
2910
|
-
const body = assessment
|
|
2911
|
-
? `Post-release monitoring flagged a regression after this PR shipped. On-call recommends ` +
|
|
2912
|
-
`**${assessment.recommendation}** (culprit confidence ${pct(assessment.culpritConfidence)}). ` +
|
|
2913
|
-
`${assessment.rationale}`
|
|
2914
|
-
: `Post-release monitoring flagged a regression after this PR shipped. ${summary} ` +
|
|
2915
|
-
`Investigate before deciding whether to revert.`;
|
|
2916
|
-
await this.notificationService.raise(workspaceId, {
|
|
2917
|
-
type: 'release_regression',
|
|
2918
|
-
blockId: block.id,
|
|
2919
|
-
executionId: instance.id,
|
|
2920
|
-
title: `Release regression for "${block.title}"`,
|
|
2921
|
-
body,
|
|
2922
|
-
payload: {
|
|
2923
|
-
...(assessment ? { onCallAssessment: assessment } : {}),
|
|
2924
|
-
...(signals.length ? { releaseSignals: signals } : {}),
|
|
2925
|
-
...(block.pullRequest?.url ? { prUrl: block.pullRequest.url } : {}),
|
|
2926
|
-
pipelineName: instance.pipelineName,
|
|
2927
|
-
},
|
|
2928
|
-
});
|
|
2929
|
-
}
|
|
2930
|
-
/**
|
|
2931
|
-
* Best-effort: annotate an incident PagerDuty / incident.io already opened (from the
|
|
2932
|
-
* same monitors/SLOs) with the on-call investigation. NOT alerting — those systems
|
|
2933
|
-
* already paged. A no-op when no provider is wired or no matching incident exists.
|
|
2934
|
-
*/
|
|
2935
|
-
async enrichIncident(workspaceId, block, assessment, signals, since) {
|
|
2936
|
-
if (!this.incidentEnrichment)
|
|
2937
|
-
return;
|
|
2938
|
-
const update = {
|
|
2939
|
-
title: `Regression suspected from "${block.title}"`,
|
|
2940
|
-
body: assessment
|
|
2941
|
-
? `${assessment.rationale} (recommendation: ${assessment.recommendation}, culprit confidence ${pct(assessment.culpritConfidence)})`
|
|
2942
|
-
: 'cat-factory on-call investigated a post-release regression suspected from this change.',
|
|
2943
|
-
...(block.pullRequest?.url ? { prUrl: block.pullRequest.url } : {}),
|
|
2944
|
-
};
|
|
2945
|
-
try {
|
|
2946
|
-
await this.incidentEnrichment.enrich({ workspaceId, signalIds: signals.map((s) => s.id), since }, update);
|
|
2947
|
-
}
|
|
2948
|
-
catch {
|
|
2949
|
-
// best-effort: a failing enrichment must not block the run or the notification
|
|
2950
|
-
}
|
|
2951
|
-
}
|
|
2952
2648
|
/** Raise a `pipeline_complete` notification for a no-merger run awaiting confirmation. */
|
|
2953
2649
|
async raisePipelineComplete(workspaceId, instance, block) {
|
|
2954
2650
|
if (!this.notificationService)
|