@besales/ops-framework 0.1.29 → 0.1.31

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,18 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.1.31
4
+
5
+ - Added bounded review budgets for Check and Verify: a default 3-minute stage SLA and one external provider run per stage.
6
+ - Added review budget gates that route repeated external Check/Verify loops to `human_arbitration_required` instead of launching another provider call; humans can explicitly override with `--force-review-budget`.
7
+ - Added `review_budget_summary` telemetry to Check/Verify timelines so elapsed time, provider runs, deterministic blocks and provider duration are visible without manual log math.
8
+ - Updated Planner/Checker/Verifier/Supervisor prompts to prefer consolidated remediation over one-item external review loops.
9
+
10
+ ## 0.1.30
11
+
12
+ - Added a generic `golden-set-regression` risk trigger for golden sets, eval fixtures, label cards, ground-truth datasets and regression checklists.
13
+ - Added shared `golden-set-regression` playbook and deterministic plan gate requiring label-card schema, coverage matrix, negative/edge cases and harness boundary.
14
+ - Added Verify evidence checks for golden-set/regression work so execution must prove label cards, coverage, edge cases, expected outputs/non-goals, source refs and manual-vs-automated boundary.
15
+
3
16
  ## 0.1.29
4
17
 
5
18
  - Added `precheck-remediation.md` for deterministic Check blocks so all missing plan gates are consolidated into one checklist before another external Check.
package/README.md CHANGED
@@ -188,6 +188,18 @@ Do not commit that `file:` dependency to production projects. It is only for pac
188
188
  - `initiative-requirements`
189
189
  - `test/self-test`
190
190
 
191
+ ## Review Budgets
192
+
193
+ External `run-check` and `run-verify` are bounded by default:
194
+
195
+ - stage SLA: `180000ms`;
196
+ - max external provider runs per stage: `1`.
197
+
198
+ When the budget is exceeded, the framework writes `human_arbitration_required`
199
+ instead of starting another provider loop. Consolidate the remaining findings in
200
+ task artifacts, or rerun with `--force-review-budget` only after explicit human
201
+ approval.
202
+
191
203
  ## Learning Loop
192
204
 
193
205
  Learning is controlled and human-approved:
@@ -46,7 +46,7 @@ export const RISK_CONFIG = {
46
46
  };
47
47
 
48
48
  export const ALLOWED_RISK_PROFILES = Object.keys(RISK_CONFIG);
49
- export const ALLOWED_RISK_TRIGGERS = ['auth-security', 'docs-only', 'dto-readmodel', 'ingestion-provider', 'materializer', 'panel-ui', 'prisma-schema', 'production-runtime', 'source-sync-provider', 'ui-visible-api', 'worker-queue'];
49
+ export const ALLOWED_RISK_TRIGGERS = ['auth-security', 'docs-only', 'dto-readmodel', 'golden-set-regression', 'ingestion-provider', 'materializer', 'panel-ui', 'prisma-schema', 'production-runtime', 'source-sync-provider', 'ui-visible-api', 'worker-queue'];
50
50
 
51
51
  export const CHECKER_CONTEXT_PACK_FILE = 'checker-context-pack.md';
52
52
  export const PLAYBOOK_TRIGGER_MAP = new Map([
@@ -60,6 +60,7 @@ export const PLAYBOOK_TRIGGER_MAP = new Map([
60
60
  ['worker-queue', ['complexity-performance']],
61
61
  ['materializer', ['complexity-performance']],
62
62
  ['dto-readmodel', ['complexity-performance']],
63
+ ['golden-set-regression', ['golden-set-regression']],
63
64
  ]);
64
65
 
65
66
  export const ALLOWED_VERDICTS = ['return_to_plan', 'ready_for_human_gate', 'human_arbitration_required', 'context_insufficient', 'checker_failed'];
@@ -663,6 +664,9 @@ export function classifyRisk({ structuralLines, referencedFiles, planSections, r
663
664
  if (hasText(/\b(dto|validation|read model|payload shape|contract)\b/)) {
664
665
  triggers.add('dto-readmodel');
665
666
  }
667
+ if (hasText(/\b(golden set|golden-set|golden dataset|eval|evals|evaluation|regression checklist|regression fixture|regression fixtures|fixture labels?|label cards?|ground truth|expected outputs?)\b/)) {
668
+ triggers.add('golden-set-regression');
669
+ }
666
670
 
667
671
  if (triggers.size === 0 && isDocsOnly(referencedFiles, planSections)) {
668
672
  triggers.add('docs-only');
@@ -768,6 +772,7 @@ export function analyzePlanQualityGates({ planContent, risk, referencedFiles = [
768
772
  const optimizationRequired = requiresOptimizationStrategy(optimizationTier);
769
773
  const productionRolloutRequired = requiresProductionRolloutGate(risk.riskTriggers);
770
774
  const sourceSyncProviderRequired = requiresSourceSyncProviderGate(risk.riskTriggers);
775
+ const goldenSetRequired = requiresGoldenSetRegressionGate(risk.riskTriggers);
771
776
  const executionMetadata = inspectExecutionMetadata(sections);
772
777
  const verificationLadder = inspectVerificationLadder(sections);
773
778
  const standardsAlignmentRequired = requiresStandardsAlignment({ referencedFiles, structuralLines });
@@ -780,6 +785,7 @@ export function analyzePlanQualityGates({ planContent, risk, referencedFiles = [
780
785
  const productionRollout = inspectProductionRolloutGate(sections);
781
786
  const sourceSyncProvider = inspectSourceSyncProviderGate(sections);
782
787
  const importIngestion = inspectImportIngestionGate(sections, planContent);
788
+ const goldenSetRegression = inspectGoldenSetRegressionGate(sections, planContent);
783
789
  const missingSignals = [];
784
790
 
785
791
  if (!executionMetadata.present) {
@@ -821,6 +827,9 @@ export function analyzePlanQualityGates({ planContent, risk, referencedFiles = [
821
827
  if (sourceSyncProviderRequired && importIngestion.required && !importIngestion.present) {
822
828
  missingSignals.push('Import/ingestion plan must include an Import / Ingestion Contract naming representative real fixtures or an explicit no-real-fixtures reason, raw metadata/speaker-label extraction, and duplicate-import policy.');
823
829
  }
830
+ if (goldenSetRequired && !goldenSetRegression.present) {
831
+ missingSignals.push('Golden set/eval fixture work must include `## Label Card Schema`, `## Coverage Matrix`, `## Negative / Edge Cases`, and `## Harness Boundary` so examples become a test contract, not just a list.');
832
+ }
824
833
 
825
834
  return {
826
835
  executionMetadata,
@@ -841,6 +850,8 @@ export function analyzePlanQualityGates({ planContent, risk, referencedFiles = [
841
850
  sourceSyncProviderRequired,
842
851
  sourceSyncProvider,
843
852
  importIngestion,
853
+ goldenSetRequired,
854
+ goldenSetRegression,
844
855
  missingSignals,
845
856
  };
846
857
  }
@@ -1070,6 +1081,10 @@ export function requiresSourceSyncProviderGate(riskTriggers = []) {
1070
1081
  return riskTriggers.includes('ingestion-provider') || riskTriggers.includes('source-sync-provider');
1071
1082
  }
1072
1083
 
1084
/**
 * Tells whether the golden-set / regression plan gate applies.
 *
 * @param {string[]} [riskTriggers=[]] - Risk trigger names detected for the task.
 * @returns {boolean} `true` when the `golden-set-regression` trigger is present.
 */
export function requiresGoldenSetRegressionGate(riskTriggers = []) {
  const goldenSetTrigger = 'golden-set-regression';
  return riskTriggers.includes(goldenSetTrigger);
}
1087
+
1073
1088
  export function inspectUiAcceptanceScenarios(sections) {
1074
1089
  const body = readCanonicalSection(sections, ['ui acceptance scenarios', 'ui acceptance', 'ui scenarios']);
1075
1090
  if (!body) {
@@ -1102,6 +1117,63 @@ export function inspectUiAcceptanceScenarios(sections) {
1102
1117
  return result;
1103
1118
  }
1104
1119
 
1120
/**
 * Inspects a parsed plan for the four golden-set / regression gate sections
 * (label card schema, coverage matrix, negative / edge cases, harness boundary)
 * and runs keyword heuristics against their lowercased text.
 *
 * @param {*} sections - Parsed plan sections, passed straight to `readCanonicalSection`.
 * @returns {object} One boolean flag per heuristic plus `complete` and `present`;
 *   both are `true` only when every flag is `true`.
 */
export function inspectGoldenSetRegressionGate(sections) {
  // Section bodies are lowercased once; every keyword regex below is lowercase.
  const schemaText = readCanonicalSection(sections, [
    'label card schema',
    'label schema',
    'golden set label schema',
    'fixture label schema',
  ]);
  const coverageText = readCanonicalSection(sections, [
    'coverage matrix',
    'golden set coverage matrix',
    'eval coverage matrix',
    'fixture coverage matrix',
  ]);
  const edgeCaseText = readCanonicalSection(sections, [
    'negative / edge cases',
    'negative and edge cases',
    'negative cases',
    'edge cases',
    'known edge cases',
  ]);
  const boundaryText = readCanonicalSection(sections, [
    'harness boundary',
    'automation boundary',
    'regression harness boundary',
    'manual vs automated boundary',
  ]);

  // The missing-coverage policy may live in any of the four sections.
  const allText = [schemaText, coverageText, edgeCaseText, boundaryText].join('\n').toLowerCase();
  const schemaLower = schemaText.toLowerCase();
  const coverageLower = coverageText.toLowerCase();
  const edgeCaseLower = edgeCaseText.toLowerCase();
  const boundaryLower = boundaryText.toLowerCase();

  const flags = {
    hasLabelCardSchema: Boolean(schemaText),
    hasExpectedOutputs: /expected output|expected result|expectation|ground truth|acceptance|ожидаем|результат/.test(schemaLower),
    hasNonGoals: /non-?goal|ignored|deferred|out of scope|not applied|не\s+дела|игнор|отлож/.test(schemaLower),
    hasSourceEvidence: /source|quote|snippet|evidence|reference|ref|path|цитат|источник|сниппет/.test(schemaLower),
    hasCoverageMatrix: Boolean(coverageText),
    hasCoverageDimensions: /track|profile|scenario|behavior|dimension|case|category|matrix|coverage|профил|сценари|покрыт|категор/.test(coverageLower),
    hasNegativeEdgeCases: Boolean(edgeCaseText),
    hasNegativeExpectation: /negative|edge|missing|unavailable|single|ambiguous|conflict|false positive|should not|must not|негатив|краев|отсутств|неоднознач/.test(edgeCaseLower),
    hasHarnessBoundary: Boolean(boundaryText),
    hasManualVsAutomatedBoundary: /manual|automated|runner|harness|ci|checklist|later|future|not yet|ручн|автомат|позже|сейчас/.test(boundaryLower),
    hasMissingCoveragePolicy: /missing|unavailable|gap|deferred|not found|known missing|отсутств|недоступ|пробел|отлож/.test(allText),
  };

  // The gate is complete only when every individual heuristic passed.
  const complete = Object.values(flags).every(Boolean);
  return {
    present: complete,
    ...flags,
    complete,
  };
}
1176
+
1105
1177
  export function inspectComplexityPerformanceBudget(sections) {
1106
1178
  const body = readCanonicalSection(sections, [
1107
1179
  'complexity / performance budget',
@@ -1407,6 +1479,17 @@ export function buildCheckerContextPack({
1407
1479
  ].join('\n')
1408
1480
  : '- Source sync/provider gate is not required by detected triggers.',
1409
1481
  '',
1482
+ '## Golden Set / Regression Expectations',
1483
+ '',
1484
+ qualityGates.goldenSetRequired
1485
+ ? [
1486
+ '- Golden set/eval/regression fixture risk detected.',
1487
+ `- Golden Set / Regression Gate complete: \`${qualityGates.goldenSetRegression.present ? 'yes' : 'no'}\`.`,
1488
+ '- Checker must return `return_to_plan` if the plan lacks label-card schema, coverage matrix, negative/edge cases, harness boundary, missing coverage policy or source evidence rules.',
1489
+ '- A golden set is a reusable test contract, not merely a list of examples to process.',
1490
+ ].join('\n')
1491
+ : '- Golden set/regression fixture gate is not required by detected triggers.',
1492
+ '',
1410
1493
  '## Relevant Playbooks',
1411
1494
  '',
1412
1495
  renderRelevantPlaybookIndex(relevantPlaybooks),
@@ -1634,6 +1717,27 @@ export function validateExecutionEvidenceForPlan({ planContent, executionContent
1634
1717
  }
1635
1718
  }
1636
1719
 
1720
+ if (hasAnySection(planSections, ['label card schema', 'label schema', 'coverage matrix', 'negative / edge cases', 'harness boundary'])) {
1721
+ const evidence = readAnySection(executionSections, [
1722
+ 'golden set / regression evidence',
1723
+ 'golden set evidence',
1724
+ 'regression fixture evidence',
1725
+ 'label evidence',
1726
+ 'fixture evidence',
1727
+ ]);
1728
+ if (!evidence) {
1729
+ errors.push({
1730
+ category: 'missing_evidence',
1731
+ message: 'Plan contains Golden Set / Regression Gate but execution.md is missing Golden Set / Regression Evidence.',
1732
+ });
1733
+ } else if (!/(label card|label schema|coverage matrix|negative|edge case|harness|runner|manual|automated|expected output|non-?goal|source|quote|snippet|missing coverage)/i.test(evidence)) {
1734
+ errors.push({
1735
+ category: 'insufficient_evidence',
1736
+ message: 'Golden Set / Regression Evidence must show label cards, coverage matrix, negative/edge cases, harness boundary, expected outputs/non-goals and source refs.',
1737
+ });
1738
+ }
1739
+ }
1740
+
1637
1741
  return errors;
1638
1742
  }
1639
1743
 
@@ -1685,6 +1789,9 @@ function buildCheckerQuestions({ risk, qualityGates }) {
1685
1789
  if (qualityGates.importIngestion?.required) {
1686
1790
  questions.push('Does the import plan use representative real fixtures when available, extract raw metadata needed downstream, and define exact duplicate-import behavior?');
1687
1791
  }
1792
+ if (qualityGates.goldenSetRequired) {
1793
+ questions.push('Does the golden set define a reusable test contract with label-card schema, coverage matrix, negative/edge cases, source refs and explicit manual-vs-automated harness boundary?');
1794
+ }
1688
1795
  return questions;
1689
1796
  }
1690
1797
 
@@ -14,6 +14,7 @@ import {
14
14
  selectRelevantPlaybookNames,
15
15
  inspectComplexityPerformanceBudget,
16
16
  inspectExecutionMetadata,
17
+ inspectGoldenSetRegressionGate,
17
18
  inspectAuditWriterModel,
18
19
  inspectMigrationApplyPlan,
19
20
  inspectOptimizationStrategy,
@@ -24,6 +25,7 @@ import {
24
25
  inspectVerificationLadder,
25
26
  parseMarkdownSections,
26
27
  requiresOptimizationStrategy,
28
+ requiresGoldenSetRegressionGate,
27
29
  requiresStandardsAlignment,
28
30
  validateExecutionEvidenceForPlan,
29
31
  } from './check-context-utils.mjs';
@@ -833,4 +835,120 @@ describe('agent pipeline quality gates', () => {
833
835
  expect(result.importIngestion.present).toBe(true);
834
836
  expect(result.missingSignals.some((signal) => signal.includes('Import/ingestion plan must include'))).toBe(false);
835
837
  });
838
+
839
+ it('detects golden set regression work and requires a test-contract gate', () => {
840
+ const risk = classifyRisk({
841
+ structuralLines: [
842
+ '- Create golden set label cards and regression checklist for process quality.',
843
+ ],
844
+ referencedFiles: [],
845
+ planSections: new Map(),
846
+ });
847
+
848
+ expect(risk.riskTriggers).toContain('golden-set-regression');
849
+ expect(requiresGoldenSetRegressionGate(risk.riskTriggers)).toBe(true);
850
+ expect(selectRelevantPlaybookNames(risk.riskTriggers)).toContain('golden-set-regression');
851
+
852
+ const result = analyzePlanQualityGates({
853
+ planContent: [
854
+ '# Plan',
855
+ '',
856
+ '## Implementation Steps',
857
+ '',
858
+ '- Select golden set examples with expected outputs and non-goals.',
859
+ ].join('\n'),
860
+ risk,
861
+ });
862
+
863
+ expect(result.goldenSetRequired).toBe(true);
864
+ expect(result.goldenSetRegression.present).toBe(false);
865
+ expect(result.missingSignals).toContain('Golden set/eval fixture work must include `## Label Card Schema`, `## Coverage Matrix`, `## Negative / Edge Cases`, and `## Harness Boundary` so examples become a test contract, not just a list.');
866
+ });
867
+
868
+ it('accepts golden set regression gate with schema, coverage, edge cases and harness boundary', () => {
869
+ const plan = [
870
+ '# Plan',
871
+ '',
872
+ '## Label Card Schema',
873
+ '',
874
+ '- Source path / source id and short quote snippet are required.',
875
+ '- Expected outputs and acceptance expectations are required.',
876
+ '- Non-goals / ignored or deferred outputs are required.',
877
+ '- Confidence and risk notes are recorded when uncertain.',
878
+ '',
879
+ '## Coverage Matrix',
880
+ '',
881
+ '- Coverage dimensions: track, profile, scenario category and source format.',
882
+ '- Missing coverage / unavailable fixtures are recorded as gaps.',
883
+ '',
884
+ '## Negative / Edge Cases',
885
+ '',
886
+ '- Negative examples and edge cases must catch false positives and ambiguous input.',
887
+ '- Missing edge cases are recorded instead of fabricated.',
888
+ '',
889
+ '## Harness Boundary',
890
+ '',
891
+ '- Manual labels are created now; automated runner/CI harness comes later.',
892
+ ].join('\n');
893
+ const sections = parseMarkdownSections(plan);
894
+ const inspected = inspectGoldenSetRegressionGate(sections);
895
+ const result = analyzePlanQualityGates({
896
+ planContent: plan,
897
+ risk: {
898
+ riskProfile: 'medium',
899
+ riskTriggers: ['golden-set-regression'],
900
+ },
901
+ });
902
+
903
+ expect(inspected.present).toBe(true);
904
+ expect(result.goldenSetRegression.present).toBe(true);
905
+ expect(result.missingSignals.some((signal) => signal.includes('Golden set/eval fixture work'))).toBe(false);
906
+ });
907
+
908
+ it('requires golden set regression execution evidence when the plan declares label gates', () => {
909
+ const plan = [
910
+ '# Plan',
911
+ '',
912
+ '## Label Card Schema',
913
+ '',
914
+ '- Source path and source quote snippet.',
915
+ '- Expected outputs and non-goals.',
916
+ '',
917
+ '## Coverage Matrix',
918
+ '',
919
+ '- Coverage dimensions and missing coverage gaps.',
920
+ '',
921
+ '## Negative / Edge Cases',
922
+ '',
923
+ '- Negative cases and edge case behavior.',
924
+ '',
925
+ '## Harness Boundary',
926
+ '',
927
+ '- Manual labels now, automated runner later.',
928
+ ].join('\n');
929
+
930
+ const missing = validateExecutionEvidenceForPlan({
931
+ planContent: plan,
932
+ executionContent: '# Execution\n\nNo golden set evidence yet.',
933
+ });
934
+ expect(missing).toContainEqual({
935
+ category: 'missing_evidence',
936
+ message: 'Plan contains Golden Set / Regression Gate but execution.md is missing Golden Set / Regression Evidence.',
937
+ });
938
+
939
+ const ok = validateExecutionEvidenceForPlan({
940
+ planContent: plan,
941
+ executionContent: [
942
+ '# Execution',
943
+ '',
944
+ '## Golden Set / Regression Evidence',
945
+ '',
946
+ '- Label cards follow the label schema and include expected output, non-goals, source refs and short quote snippets.',
947
+ '- Coverage matrix filled; missing coverage recorded.',
948
+ '- Negative / edge case examples documented.',
949
+ '- Harness boundary: manual checklist now, automated runner later.',
950
+ ].join('\n'),
951
+ });
952
+ expect(ok.some((issue) => issue.message.includes('Golden Set / Regression'))).toBe(false);
953
+ });
836
954
  });
@@ -0,0 +1,79 @@
1
/**
 * Resolves the review budget for one stage by layering the per-stage
 * overrides from `config.reviewBudgets[stage]` on top of the defaults
 * (180000 ms stage SLA, one external provider run per stage).
 *
 * A missing config, a missing stage entry, or an unusable limit value falls
 * back to the default. Without this, an `undefined`/`null`/`NaN` limit would
 * make the later `>=` budget comparisons always false and silently disable
 * the gate.
 *
 * @param {object} [config] - Agents config; may be null/undefined.
 * @param {string} stage - Stage key looked up under `config.reviewBudgets`.
 * @returns {{stageSlaMs: number, maxExternalRunsPerStage: number}} Effective
 *   budget. Any extra keys present on the stage override are passed through.
 */
export function resolveStageReviewBudget(config, stage) {
  const defaults = {
    stageSlaMs: 180000,
    maxExternalRunsPerStage: 1,
  };
  const configured = config?.reviewBudgets?.[stage];
  const overrides = configured !== null && typeof configured === 'object' ? configured : {};
  return {
    ...defaults,
    ...overrides,
    stageSlaMs: pickBudgetLimit(overrides.stageSlaMs, defaults.stageSlaMs),
    maxExternalRunsPerStage: pickBudgetLimit(
      overrides.maxExternalRunsPerStage,
      defaults.maxExternalRunsPerStage,
    ),
  };
}

// Returns `value` as a finite, non-negative number (numeric strings are
// accepted); anything else yields `fallback`.
function pickBudgetLimit(value, fallback) {
  const isNumericText = typeof value === 'string' && value.trim() !== '';
  if (typeof value !== 'number' && !isNumericText) {
    return fallback;
  }
  const limit = Number(value);
  return Number.isFinite(limit) && limit >= 0 ? limit : fallback;
}
12
+
13
/**
 * Summarizes the active review window of a stage timeline: the events that
 * follow the last completion event carrying a terminal verdict.
 *
 * Malformed timeline entries (null, primitives, entries without a string
 * `event`) are tolerated instead of throwing, because the timeline comes from
 * a JSON file whose entry shape is not validated by the reader.
 *
 * @param {object} options
 * @param {object[]} [options.timeline=[]] - Timeline events in chronological order.
 * @param {string} options.stage - Stage name; `'verify'` selects the verify
 *   terminal verdicts, any other value selects the check ones.
 * @param {Date} [options.now=new Date()] - Reference time for elapsed calculation.
 * @returns {{windowStartedAt: string, elapsedMs: number, providerStarted: number,
 *   providerCompleted: number, deterministicBlocks: number, providerMs: number,
 *   eventCount: number}} Counters for the active window.
 */
export function summarizeReviewBudgetWindow({
  timeline = [],
  stage,
  now = new Date(),
}) {
  const terminalVerdicts = stage === 'verify'
    ? new Set(['pass', 'pass_with_notes', 'human_arbitration_required', 'verifier_failed'])
    : new Set(['ready_for_human_gate', 'human_arbitration_required', 'checker_failed']);

  // Keep only object entries so property access below can never throw.
  const events = Array.isArray(timeline)
    ? timeline.filter((entry) => entry !== null && typeof entry === 'object')
    : [];

  // NOTE(review): 'check_completed' is also accepted for non-check stages, as in
  // the original implementation — confirm this is intended for verify timelines.
  const stageCompleted = `${stage}_completed`;
  let lastTerminalIndex = -1;
  events.forEach((entry, index) => {
    const isCompletion = entry.event === stageCompleted || entry.event === 'check_completed';
    if (isCompletion && terminalVerdicts.has(entry.verdict)) {
      lastTerminalIndex = index;
    }
  });

  const windowEvents = events.slice(lastTerminalIndex + 1);
  // Falls back to `now` (elapsed 0) when no event in the window has a usable timestamp.
  const firstEventAt = firstValidDate(windowEvents.map((entry) => entry.at)) || now;

  let providerStarted = 0;
  let providerCompleted = 0;
  let deterministicBlocks = 0;
  let providerMs = 0;
  for (const entry of windowEvents) {
    const name = typeof entry.event === 'string' ? entry.event : '';
    if (name === 'provider_started') {
      providerStarted += 1;
    }
    if (name === 'provider_completed') {
      providerCompleted += 1;
      // Ignore non-numeric durations so one bad record cannot turn the sum into NaN.
      const durationMs = Number(entry.providerTiming?.durationMs);
      if (Number.isFinite(durationMs)) {
        providerMs += durationMs;
      }
    }
    if (name.includes('deterministic')) {
      deterministicBlocks += 1;
    }
  }

  return {
    windowStartedAt: firstEventAt.toISOString(),
    // Clamp to zero in case the first event is stamped after `now`.
    elapsedMs: Math.max(0, now.getTime() - firstEventAt.getTime()),
    providerStarted,
    providerCompleted,
    deterministicBlocks,
    providerMs,
    eventCount: windowEvents.length,
  };
}
49
+
50
/**
 * Decides whether another external review run fits the stage budget.
 *
 * The provider-run limit is checked before the wall-clock SLA; both use `>=`.
 * With `force` set, the budget and summary are not consulted at all.
 *
 * @param {object} options
 * @param {{stageSlaMs: number, maxExternalRunsPerStage: number}} options.budget
 * @param {{providerStarted: number, elapsedMs: number}} options.summary
 * @param {boolean} [options.force=false] - Human override that bypasses the budget.
 * @returns {{ok: boolean, reason: (string|null), message?: string}} Verdict;
 *   `message` is present only when the budget blocks the run.
 */
export function evaluateReviewBudget({ budget, summary, force = false }) {
  if (!force) {
    const { providerStarted, elapsedMs } = summary;
    const { maxExternalRunsPerStage, stageSlaMs } = budget;

    const runLimitReached = providerStarted >= maxExternalRunsPerStage;
    if (runLimitReached) {
      return {
        ok: false,
        reason: 'max_external_runs_per_stage_exceeded',
        message: `External review run budget exceeded: providerStarted=${providerStarted}, maxExternalRunsPerStage=${maxExternalRunsPerStage}.`,
      };
    }

    const slaReached = elapsedMs >= stageSlaMs;
    if (slaReached) {
      return {
        ok: false,
        reason: 'stage_sla_exceeded',
        message: `Review stage SLA exceeded: elapsedMs=${elapsedMs}, stageSlaMs=${stageSlaMs}.`,
      };
    }
  }

  return { ok: true, reason: null };
}
70
+
71
/**
 * Returns the first value that parses into a valid `Date`, or `null` when
 * none does.
 *
 * `null`, `undefined`, booleans and empty strings are skipped explicitly:
 * `new Date(null)` and `new Date(false)` produce the Unix epoch rather than an
 * invalid date, which would otherwise be reported as a window starting in 1970.
 *
 * @param {Iterable<*>} [values=[]] - Candidate timestamps (ISO strings, epoch
 *   milliseconds or `Date` instances).
 * @returns {(Date|null)} The first parseable date, or `null`.
 */
function firstValidDate(values = []) {
  for (const value of values) {
    const isMissing = value === null
      || value === undefined
      || value === ''
      || typeof value === 'boolean';
    if (isMissing) {
      continue;
    }
    const date = new Date(value);
    if (!Number.isNaN(date.getTime())) {
      return date;
    }
  }
  return null;
}
@@ -0,0 +1,90 @@
1
+ import { describe, expect, it } from 'vitest';
2
+ import {
3
+ evaluateReviewBudget,
4
+ resolveStageReviewBudget,
5
+ summarizeReviewBudgetWindow,
6
+ } from './review-budget-utils.mjs';
7
+
8
+ describe('review budget utils', () => {
9
+ it('blocks a second external check in the same active stage window', () => {
10
+ const now = new Date('2026-06-04T12:02:00.000Z');
11
+ const summary = summarizeReviewBudgetWindow({
12
+ stage: 'check',
13
+ now,
14
+ timeline: [
15
+ { at: '2026-06-04T12:00:00.000Z', event: 'check_started' },
16
+ { at: '2026-06-04T12:00:02.000Z', event: 'provider_started' },
17
+ {
18
+ at: '2026-06-04T12:01:30.000Z',
19
+ event: 'provider_completed',
20
+ providerTiming: { durationMs: 88000 },
21
+ verdict: 'return_to_plan',
22
+ },
23
+ ],
24
+ });
25
+
26
+ expect(summary.providerStarted).toBe(1);
27
+ expect(evaluateReviewBudget({
28
+ budget: { stageSlaMs: 180000, maxExternalRunsPerStage: 1 },
29
+ summary,
30
+ })).toMatchObject({
31
+ ok: false,
32
+ reason: 'max_external_runs_per_stage_exceeded',
33
+ });
34
+ });
35
+
36
+ it('blocks when wall-clock stage SLA is exceeded even without provider time', () => {
37
+ const summary = summarizeReviewBudgetWindow({
38
+ stage: 'check',
39
+ now: new Date('2026-06-04T12:04:00.000Z'),
40
+ timeline: [
41
+ { at: '2026-06-04T12:00:00.000Z', event: 'check_started' },
42
+ { at: '2026-06-04T12:00:01.000Z', event: 'deterministic_precheck_blocked' },
43
+ ],
44
+ });
45
+
46
+ expect(summary.providerStarted).toBe(0);
47
+ expect(evaluateReviewBudget({
48
+ budget: { stageSlaMs: 180000, maxExternalRunsPerStage: 1 },
49
+ summary,
50
+ })).toMatchObject({
51
+ ok: false,
52
+ reason: 'stage_sla_exceeded',
53
+ });
54
+ });
55
+
56
+ it('resets the active check window after ready_for_human_gate', () => {
57
+ const summary = summarizeReviewBudgetWindow({
58
+ stage: 'check',
59
+ now: new Date('2026-06-04T12:10:30.000Z'),
60
+ timeline: [
61
+ { at: '2026-06-04T12:00:00.000Z', event: 'check_started' },
62
+ { at: '2026-06-04T12:00:05.000Z', event: 'provider_started' },
63
+ { at: '2026-06-04T12:01:00.000Z', event: 'provider_completed', verdict: 'ready_for_human_gate' },
64
+ { at: '2026-06-04T12:01:00.100Z', event: 'check_completed', verdict: 'ready_for_human_gate' },
65
+ { at: '2026-06-04T12:10:00.000Z', event: 'check_started' },
66
+ ],
67
+ });
68
+
69
+ expect(summary.providerStarted).toBe(0);
70
+ expect(summary.elapsedMs).toBe(30000);
71
+ expect(evaluateReviewBudget({
72
+ budget: { stageSlaMs: 180000, maxExternalRunsPerStage: 1 },
73
+ summary,
74
+ })).toMatchObject({ ok: true });
75
+ });
76
+
77
+ it('uses configured stage budgets over defaults', () => {
78
+ expect(resolveStageReviewBudget({
79
+ reviewBudgets: {
80
+ verify: {
81
+ stageSlaMs: 120000,
82
+ maxExternalRunsPerStage: 2,
83
+ },
84
+ },
85
+ }, 'verify')).toEqual({
86
+ stageSlaMs: 120000,
87
+ maxExternalRunsPerStage: 2,
88
+ });
89
+ });
90
+ });
@@ -84,6 +84,12 @@ export function buildTaskManifest({ taskDir, now = new Date().toISOString(), exi
84
84
  evidenceRequired: hasPlanGate(inputs.taskArtifacts.get('plan.md'), ['import / ingestion contract', 'import ingestion contract', 'ingestion contract', 'import contract']),
85
85
  evidenceComplete: !evidenceIssues.some((issue) => issue.message.includes('Import / Ingestion')),
86
86
  },
87
+ goldenSetRegression: {
88
+ required: Boolean(inputs.qualityGates.goldenSetRequired),
89
+ planComplete: Boolean(inputs.qualityGates.goldenSetRegression?.present),
90
+ evidenceRequired: hasPlanGate(inputs.taskArtifacts.get('plan.md'), ['label card schema', 'label schema', 'coverage matrix', 'negative / edge cases', 'harness boundary']),
91
+ evidenceComplete: !evidenceIssues.some((issue) => issue.message.includes('Golden Set / Regression Evidence')),
92
+ },
87
93
  },
88
94
  context: {
89
95
  planSha: inputs.planFingerprint.planSha,
@@ -38,6 +38,8 @@ function main() {
38
38
  productionRolloutComplete: inputs.qualityGates.productionRollout.present,
39
39
  sourceSyncProviderRequired: inputs.qualityGates.sourceSyncProviderRequired,
40
40
  sourceSyncProviderComplete: inputs.qualityGates.sourceSyncProvider.present,
41
+ goldenSetRequired: inputs.qualityGates.goldenSetRequired,
42
+ goldenSetComplete: inputs.qualityGates.goldenSetRegression.present,
41
43
  missingSignals: inputs.qualityGates.missingSignals,
42
44
  },
43
45
  executionEvidence: {
@@ -67,6 +69,7 @@ function printHuman(result) {
67
69
  console.log(`- Optimization strategy: ${result.planQuality.optimizationRequired ? `${result.planQuality.optimizationTier} required / ${result.planQuality.optimizationStrategyComplete ? 'complete' : 'missing'}` : `${result.planQuality.optimizationTier || 'O0'} / not required`}`);
68
70
  console.log(`- Production rollout: ${result.planQuality.productionRolloutRequired ? result.planQuality.productionRolloutComplete ? 'required / complete' : 'required / missing' : 'not required'}`);
69
71
  console.log(`- Source sync/provider: ${result.planQuality.sourceSyncProviderRequired ? result.planQuality.sourceSyncProviderComplete ? 'required / complete' : 'required / missing' : 'not required'}`);
72
+ console.log(`- Golden set/regression: ${result.planQuality.goldenSetRequired ? result.planQuality.goldenSetComplete ? 'required / complete' : 'required / missing' : 'not required'}`);
70
73
  if (result.planQuality.missingSignals.length > 0) {
71
74
  console.log('Plan quality issues:');
72
75
  for (const issue of result.planQuality.missingSignals) {
package/bin/run-check.mjs CHANGED
@@ -45,6 +45,11 @@ import {
45
45
  validateManifest,
46
46
  writeTaskManifest,
47
47
  } from './lib/task-manifest-utils.mjs';
48
+ import {
49
+ evaluateReviewBudget,
50
+ resolveStageReviewBudget,
51
+ summarizeReviewBudgetWindow,
52
+ } from './lib/review-budget-utils.mjs';
48
53
 
49
54
  function main() {
50
55
  runMain().catch((error) => {
@@ -64,6 +69,8 @@ async function runMain() {
64
69
  const taskId = path.basename(taskDir);
65
70
  const dryRun = getFlag(args, 'dry-run', false) === true;
66
71
  const noCache = getFlag(args, 'no-cache', false) === true;
72
+ const forceReviewBudget = getFlag(args, 'force-review-budget', false) === true
73
+ || getFlag(args, 'force-external-review', false) === true;
67
74
  const checkerConfig = resolveCheckerConfig(args);
68
75
  const runStartedAt = new Date();
69
76
  appendCheckTimeline(taskDir, {
@@ -72,6 +79,7 @@ async function runMain() {
72
79
  model: checkerConfig.model,
73
80
  noCache,
74
81
  dryRun,
82
+ forceReviewBudget,
75
83
  });
76
84
 
77
85
  let checkContext = ensureFreshCheckContext(taskDir, taskId);
@@ -231,6 +239,55 @@ async function runMain() {
231
239
  return;
232
240
  }
233
241
 
242
+ const reviewBudget = evaluateCurrentReviewBudget({
243
+ taskDir,
244
+ stage: 'check',
245
+ config: readAgentsConfig(),
246
+ force: forceReviewBudget,
247
+ });
248
+ if (!reviewBudget.ok) {
249
+ writeReviewBudgetReturn({
250
+ taskDir,
251
+ taskId,
252
+ checkContext,
253
+ checkerConfig,
254
+ checkerPromptSha,
255
+ cacheKey,
256
+ reason: reviewBudget.reason,
257
+ message: reviewBudget.message,
258
+ budget: reviewBudget.budget,
259
+ summary: reviewBudget.summary,
260
+ startedAt: runStartedAt,
261
+ });
262
+ appendCheckTimeline(taskDir, {
263
+ event: 'review_budget_blocked',
264
+ verdict: 'human_arbitration_required',
265
+ reason: reviewBudget.reason,
266
+ message: reviewBudget.message,
267
+ budget: reviewBudget.budget,
268
+ summary: reviewBudget.summary,
269
+ timing: buildTiming(runStartedAt),
270
+ });
271
+ recordLlmInputUsage({
272
+ taskDir,
273
+ stage: 'check',
274
+ packMeta: promptPayload.pack.meta,
275
+ attempts: [
276
+ ...llmInputAttempts,
277
+ buildAttemptRecord(promptPayload.pack.meta, `review_budget_blocked:${reviewBudget.reason}`),
278
+ ],
279
+ rerunCount,
280
+ timing: buildTiming(runStartedAt),
281
+ });
282
+ refreshTaskManifestAfterCheck(taskDir);
283
+ runValidator(taskArg);
284
+ console.log(`Checker review budget blocked ${taskId}: human_arbitration_required`);
285
+ console.log(`- reason: ${reviewBudget.reason}`);
286
+ console.log(`- elapsedMs: ${reviewBudget.summary.elapsedMs}`);
287
+ console.log(`- providerStarted: ${reviewBudget.summary.providerStarted}`);
288
+ return;
289
+ }
290
+
234
291
  try {
235
292
  const providerStartedAt = new Date();
236
293
  appendCheckTimeline(taskDir, {
@@ -319,6 +376,7 @@ async function runMain() {
319
376
  cacheKeySha,
320
377
  timing: buildTiming(runStartedAt),
321
378
  });
379
+ appendReviewBudgetSummary(taskDir);
322
380
  if (!isContextInsufficientResult(providerOutput.checkResultJson)) {
323
381
  storeInCache({ taskDir, cacheKeySha });
324
382
  }
@@ -337,6 +395,127 @@ async function runMain() {
337
395
  console.log(`- finalEstimatedInputTokens: ${promptPayload.pack.meta.estimatedTokens}`);
338
396
  }
339
397
 
398
+ function evaluateCurrentReviewBudget({ taskDir, stage, config, force }) {
399
+ const budget = resolveStageReviewBudget(config, stage);
400
+ const summary = summarizeReviewBudgetWindow({
401
+ timeline: readTimeline(taskDir, 'check-timeline.json'),
402
+ stage,
403
+ now: new Date(),
404
+ });
405
+ return {
406
+ ...evaluateReviewBudget({ budget, summary, force }),
407
+ budget,
408
+ summary,
409
+ };
410
+ }
411
+
412
+ function readTimeline(taskDir, fileName) {
413
+ const timelinePath = path.join(taskDir, fileName);
414
+ if (!fs.existsSync(timelinePath)) {
415
+ return [];
416
+ }
417
+ try {
418
+ const parsed = JSON.parse(fs.readFileSync(timelinePath, 'utf8'));
419
+ return Array.isArray(parsed) ? parsed : [];
420
+ } catch {
421
+ return [];
422
+ }
423
+ }
424
+
425
+ function writeReviewBudgetReturn({
426
+ taskDir,
427
+ taskId,
428
+ checkContext,
429
+ checkerConfig,
430
+ checkerPromptSha,
431
+ cacheKey,
432
+ reason,
433
+ message,
434
+ budget,
435
+ summary,
436
+ startedAt,
437
+ }) {
438
+ const result = {
439
+ taskId,
440
+ stage: 'Check',
441
+ checkerProvider: 'review-budget',
442
+ checkerModel: 'none',
443
+ planSha: checkContext.planSha,
444
+ memorySha: checkContext.memorySha,
445
+ riskProfile: checkContext.riskProfile,
446
+ verdict: 'human_arbitration_required',
447
+ failureReason: null,
448
+ blockingFindings: 1,
449
+ nonBlockingFindings: 0,
450
+ humanQuestions: 1,
451
+ findings: [
452
+ {
453
+ id: 'F-001',
454
+ severity: 'blocking',
455
+ claimCategory: 'human_decision_required',
456
+ claim: message,
457
+ evidenceRefs: [
458
+ {
459
+ type: 'file',
460
+ ref: 'check-timeline.json',
461
+ },
462
+ ],
463
+ affectedPlanSections: ['Plan/Check orchestration'],
464
+ expectedCorrection: 'Stop the repeated external review loop. Consolidate remaining findings into plan/status/check-resolution, or ask the human to approve one extra external review with --force-review-budget.',
465
+ },
466
+ ],
467
+ reviewBudget: {
468
+ reason,
469
+ budget,
470
+ summary,
471
+ forceFlag: '--force-review-budget',
472
+ },
473
+ readyForHumanGate: false,
474
+ createdAt: new Date().toISOString(),
475
+ };
476
+ const markdown = [
477
+ '# Check',
478
+ '',
479
+ '## итоговая оценка',
480
+ '',
481
+ '`human_arbitration_required`',
482
+ '',
483
+ '## Review Budget Gate',
484
+ '',
485
+ message,
486
+ '',
487
+ 'The external Checker was not invoked. The framework enforces a bounded Check stage so quality work happens through consolidated remediation instead of repeated provider loops.',
488
+ '',
489
+ '## Budget',
490
+ '',
491
+ '```json',
492
+ JSON.stringify({ reason, budget, summary }, null, 2),
493
+ '```',
494
+ '',
495
+ '## Required decision',
496
+ '',
497
+ '- Consolidate all remaining Check findings into `plan.md`, `status.md`, and `check-resolution.md`, then run one fresh Check after the window resets; or',
498
+ '- Ask the human to approve an extra external review and rerun with `--force-review-budget`.',
499
+ '',
500
+ '## Timing',
501
+ '',
502
+ `- Duration: ${buildTiming(startedAt).durationMs}ms`,
503
+ ].join('\n');
504
+
505
+ writeTaskFile(taskDir, 'check.md', appendRunnerMetadata(markdown, checkerPromptSha, cacheKey));
506
+ writeTaskFile(taskDir, 'check.result.json', JSON.stringify(result, null, 2));
507
+ updateStatus(taskDir, {
508
+ stage: 'Human Arbitration',
509
+ routingDecision: `review_budget_blocked:${reason}`,
510
+ checkVerdict: '`human_arbitration_required`',
511
+ checkResult: '- `check.result.json`: current; review budget blocked external Checker invocation',
512
+ supervisorAction: 'Check review budget blocked another external provider loop.',
513
+ nextStep: 'Human Arbitration: approve one extra external review with `--force-review-budget` or consolidate remaining findings before a fresh Check.',
514
+ humanApproval: 'yes',
515
+ });
516
+ appendOrchestrationLog(taskDir, `Check review budget blocked external checker; reason=${reason}; elapsedMs=${summary.elapsedMs}; providerStarted=${summary.providerStarted}; maxExternalRuns=${budget.maxExternalRunsPerStage}; stageSlaMs=${budget.stageSlaMs}`);
517
+ }
518
+
340
519
  function refreshTaskManifestAfterCheck(taskDir) {
341
520
  const manifest = buildTaskManifest({ taskDir });
342
521
  writeTaskManifest(taskDir, manifest);
@@ -366,6 +545,21 @@ function appendCheckTimeline(taskDir, event) {
366
545
  writeTaskFile(taskDir, 'check-timeline.json', JSON.stringify(existing, null, 2));
367
546
  }
368
547
 
548
+ function appendReviewBudgetSummary(taskDir) {
549
+ const config = readAgentsConfig();
550
+ const budget = resolveStageReviewBudget(config, 'check');
551
+ const summary = summarizeReviewBudgetWindow({
552
+ timeline: readTimeline(taskDir, 'check-timeline.json'),
553
+ stage: 'check',
554
+ now: new Date(),
555
+ });
556
+ appendCheckTimeline(taskDir, {
557
+ event: 'review_budget_summary',
558
+ budget,
559
+ summary,
560
+ });
561
+ }
562
+
369
563
  function buildAttemptRecord(packMeta, outcome) {
370
564
  return {
371
565
  mode: packMeta.mode,
@@ -28,6 +28,11 @@ import {
28
28
  summarizePackForConsole,
29
29
  } from './lib/llm-input-pack-utils.mjs';
30
30
  import { recordLlmInputUsage } from './lib/task-manifest-utils.mjs';
31
+ import {
32
+ evaluateReviewBudget,
33
+ resolveStageReviewBudget,
34
+ summarizeReviewBudgetWindow,
35
+ } from './lib/review-budget-utils.mjs';
31
36
 
32
37
  function main() {
33
38
  runMain().catch((error) => {
@@ -47,12 +52,15 @@ async function runMain() {
47
52
  const taskId = path.basename(taskDir);
48
53
  const verifierConfig = resolveVerifierConfig(args);
49
54
  const force = getFlag(args, 'force', false) === true;
55
+ const forceReviewBudget = getFlag(args, 'force-review-budget', false) === true
56
+ || getFlag(args, 'force-external-review', false) === true;
50
57
  const runStartedAt = new Date();
51
58
  appendVerifyTimeline(taskDir, {
52
59
  event: 'verify_started',
53
60
  mode: verifierConfig.mode,
54
61
  provider: verifierConfig.provider,
55
62
  model: verifierConfig.model,
63
+ forceReviewBudget,
56
64
  });
57
65
  const planSha = hashTaskMarkdown(taskDir, 'plan.md');
58
66
  const executionSha = hashTaskMarkdown(taskDir, 'execution.md');
@@ -202,6 +210,52 @@ async function runMain() {
202
210
  return;
203
211
  }
204
212
 
213
+ const reviewBudget = evaluateCurrentReviewBudget({
214
+ taskDir,
215
+ stage: 'verify',
216
+ config: readAgentsConfig(),
217
+ force: forceReviewBudget,
218
+ });
219
+ if (!reviewBudget.ok) {
220
+ writeVerifyReviewBudgetReturn({
221
+ taskDir,
222
+ taskId,
223
+ verifierConfig,
224
+ verifierRunId,
225
+ planSha,
226
+ executionSha,
227
+ reason: reviewBudget.reason,
228
+ message: reviewBudget.message,
229
+ budget: reviewBudget.budget,
230
+ summary: reviewBudget.summary,
231
+ });
232
+ appendVerifyTimeline(taskDir, {
233
+ event: 'review_budget_blocked',
234
+ verdict: 'human_arbitration_required',
235
+ reason: reviewBudget.reason,
236
+ message: reviewBudget.message,
237
+ budget: reviewBudget.budget,
238
+ summary: reviewBudget.summary,
239
+ timing: buildTiming(runStartedAt),
240
+ });
241
+ recordLlmInputUsage({
242
+ taskDir,
243
+ stage: 'verify',
244
+ packMeta: promptPayload.pack.meta,
245
+ attempts: [
246
+ ...llmInputAttempts,
247
+ buildAttemptRecord(promptPayload.pack.meta, `review_budget_blocked:${reviewBudget.reason}`),
248
+ ],
249
+ rerunCount,
250
+ timing: buildTiming(runStartedAt),
251
+ });
252
+ console.log(`Verifier review budget blocked ${taskId}: human_arbitration_required`);
253
+ console.log(`- reason: ${reviewBudget.reason}`);
254
+ console.log(`- elapsedMs: ${reviewBudget.summary.elapsedMs}`);
255
+ console.log(`- providerStarted: ${reviewBudget.summary.providerStarted}`);
256
+ return;
257
+ }
258
+
205
259
  try {
206
260
  const providerStartedAt = new Date();
207
261
  appendVerifyTimeline(taskDir, {
@@ -310,6 +364,7 @@ async function runMain() {
310
364
  finalEstimatedTokens: finalPack?.meta?.estimatedTokens || null,
311
365
  timing: buildTiming(runStartedAt),
312
366
  });
367
+ appendReviewBudgetSummary(taskDir);
313
368
  appendOrchestrationLog(taskDir, `external CLI verifier completed via ${verifierConfig.provider}; verdict=${verifyResultJson.verdict}; runId=${verifierRunId}`);
314
369
  console.log(`Verifier run completed for ${taskId}: ${verifyResultJson.verdict}`);
315
370
  console.log(`- verifierRunId: ${verifierRunId}`);
@@ -319,6 +374,119 @@ async function runMain() {
319
374
  }
320
375
  }
321
376
 
377
+ function evaluateCurrentReviewBudget({ taskDir, stage, config, force }) {
378
+ const budget = resolveStageReviewBudget(config, stage);
379
+ const summary = summarizeReviewBudgetWindow({
380
+ timeline: readTimeline(taskDir, 'verify-timeline.json'),
381
+ stage,
382
+ now: new Date(),
383
+ });
384
+ return {
385
+ ...evaluateReviewBudget({ budget, summary, force }),
386
+ budget,
387
+ summary,
388
+ };
389
+ }
390
+
391
+ function readTimeline(taskDir, fileName) {
392
+ const timelinePath = path.join(taskDir, fileName);
393
+ if (!fs.existsSync(timelinePath)) {
394
+ return [];
395
+ }
396
+ try {
397
+ const parsed = JSON.parse(fs.readFileSync(timelinePath, 'utf8'));
398
+ return Array.isArray(parsed) ? parsed : [];
399
+ } catch {
400
+ return [];
401
+ }
402
+ }
403
+
404
+ function writeVerifyReviewBudgetReturn({
405
+ taskDir,
406
+ taskId,
407
+ verifierConfig,
408
+ verifierRunId,
409
+ planSha,
410
+ executionSha,
411
+ reason,
412
+ message,
413
+ budget,
414
+ summary,
415
+ }) {
416
+ const verifyMarkdown = [
417
+ '# Verify',
418
+ '',
419
+ '## Verdict',
420
+ '',
421
+ '`human_arbitration_required`',
422
+ '',
423
+ '## Review Budget Gate',
424
+ '',
425
+ message,
426
+ '',
427
+ 'The external Verifier was not invoked. The framework enforces a bounded Verify stage so execution fixes are consolidated instead of repeatedly rechecked by an external provider.',
428
+ '',
429
+ '## Budget',
430
+ '',
431
+ '```json',
432
+ JSON.stringify({ reason, budget, summary }, null, 2),
433
+ '```',
434
+ '',
435
+ '## Required decision',
436
+ '',
437
+ '- Consolidate remaining Verify findings in `execution.md` / evidence artifacts, then run one fresh Verify after the window resets; or',
438
+ '- Ask the human to approve an extra external review and rerun with `--force-review-budget`.',
439
+ ].join('\n');
440
+ const result = {
441
+ schemaVersion: 1,
442
+ taskId,
443
+ planSha,
444
+ executionSha,
445
+ verificationMode: 'external_cli',
446
+ verifierProvider: 'review-budget',
447
+ verifierModel: 'none',
448
+ verifierRunId,
449
+ verdict: 'human_arbitration_required',
450
+ failureReason: null,
451
+ readyForRetrospective: false,
452
+ counts: {
453
+ blockingFindings: 1,
454
+ nonBlockingFindings: 0,
455
+ questions: 1,
456
+ },
457
+ findings: [
458
+ {
459
+ id: 'V-001',
460
+ severity: 'blocking',
461
+ claimCategory: 'runtime_risk',
462
+ affectedArtifacts: ['verify-timeline.json'],
463
+ claim: message,
464
+ evidenceRefs: [
465
+ {
466
+ type: 'artifact',
467
+ ref: 'verify-timeline.json',
468
+ },
469
+ ],
470
+ expectedCorrection: 'Stop the repeated external Verify loop. Consolidate remaining execution fixes or ask the human to approve one extra external review with --force-review-budget.',
471
+ },
472
+ ],
473
+ reviewBudget: {
474
+ reason,
475
+ budget,
476
+ summary,
477
+ forceFlag: '--force-review-budget',
478
+ },
479
+ };
480
+
481
+ writeTaskFile(taskDir, 'verify.md', verifyMarkdown);
482
+ writeTaskFile(taskDir, 'verify.result.json', JSON.stringify(result, null, 2));
483
+ updateStatusForVerifyResult(taskDir, result, {
484
+ reused: false,
485
+ verifierMode: 'external_cli',
486
+ });
487
+ appendOrchestrationLog(taskDir, `Verify review budget blocked external verifier; reason=${reason}; elapsedMs=${summary.elapsedMs}; providerStarted=${summary.providerStarted}; maxExternalRuns=${budget.maxExternalRunsPerStage}; stageSlaMs=${budget.stageSlaMs}`);
488
+ }
489
+
322
490
  function buildTiming(startedAt, completedAt = new Date()) {
323
491
  return {
324
492
  startedAt: startedAt.toISOString(),
@@ -426,6 +594,21 @@ function appendVerifyTimeline(taskDir, event) {
426
594
  writeTaskFile(taskDir, 'verify-timeline.json', JSON.stringify(existing, null, 2));
427
595
  }
428
596
 
597
+ function appendReviewBudgetSummary(taskDir) {
598
+ const config = readAgentsConfig();
599
+ const budget = resolveStageReviewBudget(config, 'verify');
600
+ const summary = summarizeReviewBudgetWindow({
601
+ timeline: readTimeline(taskDir, 'verify-timeline.json'),
602
+ stage: 'verify',
603
+ now: new Date(),
604
+ });
605
+ appendVerifyTimeline(taskDir, {
606
+ event: 'review_budget_summary',
607
+ budget,
608
+ summary,
609
+ });
610
+ }
611
+
429
612
  function buildAttemptRecord(packMeta, outcome) {
430
613
  return {
431
614
  mode: packMeta.mode,
@@ -18,6 +18,16 @@
18
18
  "isolatedContext": true,
19
19
  "readOnly": true
20
20
  },
21
+ "reviewBudgets": {
22
+ "check": {
23
+ "stageSlaMs": 180000,
24
+ "maxExternalRunsPerStage": 1
25
+ },
26
+ "verify": {
27
+ "stageSlaMs": 180000,
28
+ "maxExternalRunsPerStage": 1
29
+ }
30
+ },
21
31
  "checkerProviders": {
22
32
  "codex-cli": {
23
33
  "command": "${CODEX_CLI_COMMAND}",
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@besales/ops-framework",
3
- "version": "0.1.29",
3
+ "version": "0.1.31",
4
4
  "type": "module",
5
5
  "bin": {
6
6
  "ops-agent": "bin/ops-agent.mjs"
@@ -0,0 +1,58 @@
1
+ # Golden Set / Regression Fixture Playbook
2
+
3
+ Use this when a task creates or changes golden sets, eval fixtures, label cards,
4
+ ground-truth examples, regression checklists or expected-output datasets.
5
+
6
+ Golden sets are test contracts, not just lists of examples.
7
+
8
+ ## Required Plan Sections
9
+
10
+ - `## Label Card Schema`
11
+ - `## Coverage Matrix`
12
+ - `## Negative / Edge Cases`
13
+ - `## Harness Boundary`
14
+
15
+ ## Label Card Schema
16
+
17
+ Each label card must define:
18
+
19
+ - stable fixture/source id and source reference;
20
+ - expected outputs or expected behavior;
21
+ - non-goals, ignored outputs or deferred outputs;
22
+ - source evidence rules, such as short quotes/snippets/refs and privacy limits;
23
+ - confidence/risk notes when labels are uncertain.
24
+
25
+ ## Coverage Matrix
26
+
27
+ The plan must name coverage dimensions that matter for this product area.
28
+ Examples include profiles, tracks, event types, user journeys, data qualities,
29
+ known edge behaviors, or source formats.
30
+
31
+ For every important dimension, record one of:
32
+
33
+ - selected fixture/example;
34
+ - explicitly missing/unavailable;
35
+ - intentionally deferred with reason.
36
+
37
+ ## Negative / Edge Cases
38
+
39
+ Include negative or edge cases that protect against false positives and
40
+ over-application. If no negative case exists in the current fixture pool, record
41
+ the missing case as a coverage gap instead of fabricating it.
42
+
43
+ ## Harness Boundary
44
+
45
+ State whether the task is creating manual labels/fixtures only or an automated
46
+ runner/CI harness. Do not claim automated regression coverage when only manual
47
+ label artifacts exist.
48
+
49
+ ## Verify Expectations
50
+
51
+ Execution evidence must show:
52
+
53
+ - label cards were created according to schema;
54
+ - coverage matrix was filled;
55
+ - negative/edge cases were selected or documented as missing;
56
+ - expected outputs and non-goals are inspectable;
57
+ - source refs/snippets respect privacy boundaries;
58
+ - manual-vs-automated harness boundary is explicit.
@@ -72,6 +72,9 @@ Project-specific context приходит только через task artifacts
72
72
  24. Если `llmInputPolicy.mode` не `strict` и отсутствующий full artifact реально нужен для честной оценки, verdict должен быть `context_insufficient`. Не используй `context_insufficient`, если deterministic gate уже явно показывает `return_to_plan`.
73
73
  25. Если в task artifacts есть `precheck-remediation.md`, Checker должен проверить, что весь checklist был закрыт одним consolidated plan update. Не создавай новый мелкий blocker по одному пункту из старого checklist, если оставшиеся пункты тоже очевидно не закрыты: верни consolidated finding, ссылающийся на `precheck-remediation.md`.
74
74
  26. Minor process/evidence polish не должен блокировать Human Gate, если deterministic gates закрыты, acceptance criteria покрыты, scope/risk/security/data correctness не нарушены, а остаток можно безопасно записать как `non_blocking` или human question.
75
+ 27. Если plan/task/checker-context показывает golden set/eval/regression fixtures/label cards/ground truth, Checker должен требовать `## Label Card Schema`, `## Coverage Matrix`, `## Negative / Edge Cases` и `## Harness Boundary`. Golden set без schema/coverage/negative cases/source evidence/non-goals/manual-vs-automated boundary является `return_to_plan`, даже если есть общий текст про expected outputs.
76
+ 28. Если remaining issue является процессной ясностью, wording polish или удобством статуса, а план уже содержит executable scope, acceptance, risk gates and verification evidence path, не возвращай `return_to_plan`; запиши как non-blocking note или human question. Цель Check - предотвратить дорогие ошибки до Execute, а не создавать повторные внешние циклы ради косметики.
77
+ 29. Если видишь несколько related blockers, объедини их в один consolidated finding с полным checklist. Не выдавай только первый найденный blocker, если следующий внешний Check очевидно найдет соседний.
75
78
 
76
79
  ## Контракт выхода
77
80
 
@@ -97,6 +100,7 @@ Project-specific context приходит только через task artifacts
97
100
  - проверка Optimization Strategy;
98
101
  - проверка Production Rollout Gate;
99
102
  - проверка Source Sync / Provider Gate;
103
+ - проверка Golden Set / Regression Gate;
100
104
  - проверка Loop Detector / consolidated remediation;
101
105
  - достаточность compact context или `context_insufficient`;
102
106
  - рекомендация supervisor: `return_to_plan` или `ready_for_human_gate`.
@@ -53,8 +53,11 @@
53
53
  17. Если Planner знает факт только из conversation context, этот факт нужно перенести в artifact: `brief.md`, `research.md`, `status.md` или `human_decision` evidence. Невидимый контекст не является evidence.
54
54
  18. Plan должен назвать risk tier (`R0`-`R5`), execution target and execution budget. Для `R1/R2` можно разрешить fast loop inside approved scope, но обязательно назвать stop rules.
55
55
  19. План проверки должен быть ladder-based: micro-verify during Execute, slice-verify before completion and external Verify requirement for closeout/high-risk claims.
56
+ 20. После `return_to_plan` Planner обязан выполнить один consolidated remediation pass: закрыть все blocking findings, precheck checklist и obvious adjacent gaps в `plan.md`/`check-resolution.md` до следующего Check. Не запускай внешний Check после единичной мелкой правки, если другие known blockers остаются открыты.
57
+ 21. Если Check остановлен review budget gate (`human_arbitration_required` с `reviewBudget.reason`), Planner не должен пытаться обойти это повторным запуском. Нужно либо запросить human approval на `--force-review-budget`, либо укрупнить remediation и вернуться к Check после явного решения.
56
58
  22. План должен описывать meaningful slice. Не дроби локальную работу на отдельный Plan/Check/Verify для каждого микрофикса, если риски и target остаются внутри одного approved tier.
57
59
  23. Если risk triggers или `checker-context-pack.md` показывают O2/O3 hot-path work, Planner обязан добавить `## Optimization Strategy`: tier, hot paths, expected data size, chosen efficient approach, anti-patterns avoided and bounded optimizer budget/stop rule. Цель gate — предотвратить очевидно неэффективное решение до Execute, а не запускать бесконечную оптимизацию.
60
+ 24. Если задача создает golden set/eval/regression fixtures/label cards/ground truth, Planner обязан добавить `## Label Card Schema`, `## Coverage Matrix`, `## Negative / Edge Cases` и `## Harness Boundary`. Golden set должен быть test contract with expected outputs, non-goals, source refs, missing coverage policy and manual-vs-automated boundary.
58
61
 
59
62
  ## Check Resolution Contract
60
63
 
@@ -122,6 +125,7 @@ Allowed `evidenceRefs[].type` and `artifactChangeRefs[].type`:
122
125
  - verification ladder;
123
126
  - UI verification path или явное `not applicable`;
124
127
  - Optimization Strategy для O2/O3 или явное `not applicable`;
128
+ - Label Card Schema / Coverage Matrix / Negative Edge Cases / Harness Boundary для golden set/eval fixture задач;
125
129
  - global standards alignment;
126
130
  - что требует human approval.
127
131
 
@@ -61,6 +61,8 @@ Supervisor является code-level orchestrator по контракту: rou
61
61
  27. Если external verifier/checker/browser tooling начинает тратить непропорционально много времени или блокируется окружением, Supervisor обязан остановить loop и вынести human decision: принять internal verify/evidence, запустить external escalation вручную или изменить scope.
62
62
  28. Если deterministic Check preflight создал `precheck-remediation.md`, Supervisor не должен запускать повторный Check после точечной правки одного пункта. Сначала Planner/Executor должен закрыть весь checklist или явно отметить not-applicable с evidence/human decision в `plan.md`/`status.md`, затем допускается один fresh Check.
63
63
  29. Перед повторным Check после deterministic precheck Supervisor обязан сверить, что `precheck-remediation.md` был использован как consolidated checklist: все listed gates отражены в plan/research/status, а не закрывались по одному через серию precheck loops.
64
+ 30. External Check и external Verify имеют stage SLA по умолчанию 3 минуты и максимум один external provider run на фазу. Если `check.result.json` или `verify.result.json` вернул `human_arbitration_required` с `reviewBudget.reason`, Supervisor не запускает еще один внешний review без явного human approval и `--force-review-budget`.
65
+ 31. После `return_to_plan` / `return_to_execute` Supervisor должен требовать один consolidated remediation pass. Запрещено запускать серию внешних Check/Verify для мелких последовательных правок, если их можно закрыть в одном artifact update.
64
66
 
65
67
  ## Hard Gate: Material Scope Expansion -> Brief Reset
66
68
 
@@ -46,6 +46,9 @@
46
46
  22. Если `llmInputPolicy.mode` не `strict` и отсутствующий full artifact реально нужен для честной оценки, verdict должен быть `context_insufficient`. Не используй `context_insufficient`, если execution evidence уже явно отсутствует или противоречит plan.
47
47
  23. Если task содержит `precheck-remediation.md`, verifier должен проверить только применимые пункты, которые дошли до Execute. Не возвращай `return_to_execute` из-за старого precheck checklist, если план закрыл его до Human Gate и фактическая реализация покрывает acceptance.
48
48
  24. Environment/tooling failures внешнего verifier/browser smoke не должны превращаться в бесконечный `return_to_execute` loop. Если implementation evidence достаточно, но внешний инструмент заблокирован окружением, используй `pass_with_notes` или `human_arbitration_required` согласно риску.
49
+ 25. Если `plan.md` содержит golden set/eval/regression fixture sections, verifier должен проверить `Golden Set / Regression Evidence`: label cards follow schema, coverage matrix is filled, negative/edge cases are selected or documented missing, expected outputs/non-goals are inspectable, source refs/snippets exist and manual-vs-automated harness boundary is explicit.
50
+ 26. External Verify должен укладываться в bounded review model: один внешний provider run по умолчанию. Если остаются несколько blockers, верни один consolidated `return_to_execute` finding с полным checklist. Minor documentation/status polish не должен запускать новый внешний цикл, если acceptance/evidence покрыты.
51
+ 27. Если review budget gate уже вернул `human_arbitration_required`, не предлагай повторный external Verify как обычный следующий шаг. Следующий шаг: consolidated execution fix, internal evidence decision или явный human approval на `--force-review-budget`.
49
52
 
50
53
  ## Контракт выхода
51
54
 
@@ -61,6 +64,7 @@
61
64
  - Optimization Review Evidence, если `plan.md` содержит O2/O3 `Optimization Strategy`;
62
65
  - Production Rollout Gate evidence, если он есть в `plan.md`;
63
66
  - Source Sync / Provider Gate evidence, если он есть в `plan.md`;
67
+ - Golden Set / Regression Evidence, если plan создает golden set/eval fixtures/label cards;
64
68
  - Loop Detector / consolidated remediation status;
65
69
  - достаточность compact context или `context_insufficient`;
66
70
  - findings;