@besales/ops-framework 0.1.28 → 0.1.30
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +12 -0
- package/bin/lib/check-context-utils.mjs +108 -1
- package/bin/lib/check-context-utils.test.mjs +118 -0
- package/bin/lib/llm-input-pack-utils.mjs +4 -1
- package/bin/lib/llm-input-pack-utils.test.mjs +58 -0
- package/bin/lib/task-manifest-utils.mjs +6 -0
- package/bin/quality-gates.mjs +3 -0
- package/bin/run-check.mjs +80 -2
- package/package.json +1 -1
- package/playbooks/golden-set-regression.md +58 -0
- package/prompts/checker.md +4 -0
- package/prompts/planner.md +2 -0
- package/prompts/supervisor.md +2 -0
- package/prompts/verifier.md +4 -0
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,17 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## 0.1.30
|
|
4
|
+
|
|
5
|
+
- Added a generic `golden-set-regression` risk trigger for golden sets, eval fixtures, label cards, ground-truth datasets and regression checklists.
|
|
6
|
+
- Added shared `golden-set-regression` playbook and deterministic plan gate requiring label-card schema, coverage matrix, negative/edge cases and harness boundary.
|
|
7
|
+
- Added Verify evidence checks for golden-set/regression work so execution must prove label cards, coverage, edge cases, expected outputs/non-goals, source refs and manual-vs-automated boundary.
|
|
8
|
+
|
|
9
|
+
## 0.1.29
|
|
10
|
+
|
|
11
|
+
- Added `precheck-remediation.md` for deterministic Check blocks so all missing plan gates are consolidated into one checklist before another external Check.
|
|
12
|
+
- Included precheck remediation artifacts in Checker and Verifier LLM input packs.
|
|
13
|
+
- Updated Checker/Verifier/Supervisor rules to avoid one-item precheck loops and route minor/tooling issues to notes or human decision when acceptance is covered.
|
|
14
|
+
|
|
3
15
|
## 0.1.28
|
|
4
16
|
|
|
5
17
|
- Added Verify reuse guard: a passing `verify.result.json` is reused when `plan.md` and `execution.md` hashes are unchanged, unless `--force` is passed.
|
|
@@ -46,7 +46,7 @@ export const RISK_CONFIG = {
|
|
|
46
46
|
};
|
|
47
47
|
|
|
48
48
|
export const ALLOWED_RISK_PROFILES = Object.keys(RISK_CONFIG);
|
|
49
|
-
export const ALLOWED_RISK_TRIGGERS = ['auth-security', 'docs-only', 'dto-readmodel', 'ingestion-provider', 'materializer', 'panel-ui', 'prisma-schema', 'production-runtime', 'source-sync-provider', 'ui-visible-api', 'worker-queue'];
|
|
49
|
+
export const ALLOWED_RISK_TRIGGERS = ['auth-security', 'docs-only', 'dto-readmodel', 'golden-set-regression', 'ingestion-provider', 'materializer', 'panel-ui', 'prisma-schema', 'production-runtime', 'source-sync-provider', 'ui-visible-api', 'worker-queue'];
|
|
50
50
|
|
|
51
51
|
export const CHECKER_CONTEXT_PACK_FILE = 'checker-context-pack.md';
|
|
52
52
|
export const PLAYBOOK_TRIGGER_MAP = new Map([
|
|
@@ -60,6 +60,7 @@ export const PLAYBOOK_TRIGGER_MAP = new Map([
|
|
|
60
60
|
['worker-queue', ['complexity-performance']],
|
|
61
61
|
['materializer', ['complexity-performance']],
|
|
62
62
|
['dto-readmodel', ['complexity-performance']],
|
|
63
|
+
['golden-set-regression', ['golden-set-regression']],
|
|
63
64
|
]);
|
|
64
65
|
|
|
65
66
|
export const ALLOWED_VERDICTS = ['return_to_plan', 'ready_for_human_gate', 'human_arbitration_required', 'context_insufficient', 'checker_failed'];
|
|
@@ -663,6 +664,9 @@ export function classifyRisk({ structuralLines, referencedFiles, planSections, r
|
|
|
663
664
|
if (hasText(/\b(dto|validation|read model|payload shape|contract)\b/)) {
|
|
664
665
|
triggers.add('dto-readmodel');
|
|
665
666
|
}
|
|
667
|
+
if (hasText(/\b(golden set|golden-set|golden dataset|eval|evals|evaluation|regression checklist|regression fixture|regression fixtures|fixture labels?|label cards?|ground truth|expected outputs?)\b/)) {
|
|
668
|
+
triggers.add('golden-set-regression');
|
|
669
|
+
}
|
|
666
670
|
|
|
667
671
|
if (triggers.size === 0 && isDocsOnly(referencedFiles, planSections)) {
|
|
668
672
|
triggers.add('docs-only');
|
|
@@ -768,6 +772,7 @@ export function analyzePlanQualityGates({ planContent, risk, referencedFiles = [
|
|
|
768
772
|
const optimizationRequired = requiresOptimizationStrategy(optimizationTier);
|
|
769
773
|
const productionRolloutRequired = requiresProductionRolloutGate(risk.riskTriggers);
|
|
770
774
|
const sourceSyncProviderRequired = requiresSourceSyncProviderGate(risk.riskTriggers);
|
|
775
|
+
const goldenSetRequired = requiresGoldenSetRegressionGate(risk.riskTriggers);
|
|
771
776
|
const executionMetadata = inspectExecutionMetadata(sections);
|
|
772
777
|
const verificationLadder = inspectVerificationLadder(sections);
|
|
773
778
|
const standardsAlignmentRequired = requiresStandardsAlignment({ referencedFiles, structuralLines });
|
|
@@ -780,6 +785,7 @@ export function analyzePlanQualityGates({ planContent, risk, referencedFiles = [
|
|
|
780
785
|
const productionRollout = inspectProductionRolloutGate(sections);
|
|
781
786
|
const sourceSyncProvider = inspectSourceSyncProviderGate(sections);
|
|
782
787
|
const importIngestion = inspectImportIngestionGate(sections, planContent);
|
|
788
|
+
const goldenSetRegression = inspectGoldenSetRegressionGate(sections, planContent);
|
|
783
789
|
const missingSignals = [];
|
|
784
790
|
|
|
785
791
|
if (!executionMetadata.present) {
|
|
@@ -821,6 +827,9 @@ export function analyzePlanQualityGates({ planContent, risk, referencedFiles = [
|
|
|
821
827
|
if (sourceSyncProviderRequired && importIngestion.required && !importIngestion.present) {
|
|
822
828
|
missingSignals.push('Import/ingestion plan must include an Import / Ingestion Contract naming representative real fixtures or an explicit no-real-fixtures reason, raw metadata/speaker-label extraction, and duplicate-import policy.');
|
|
823
829
|
}
|
|
830
|
+
if (goldenSetRequired && !goldenSetRegression.present) {
|
|
831
|
+
missingSignals.push('Golden set/eval fixture work must include `## Label Card Schema`, `## Coverage Matrix`, `## Negative / Edge Cases`, and `## Harness Boundary` so examples become a test contract, not just a list.');
|
|
832
|
+
}
|
|
824
833
|
|
|
825
834
|
return {
|
|
826
835
|
executionMetadata,
|
|
@@ -841,6 +850,8 @@ export function analyzePlanQualityGates({ planContent, risk, referencedFiles = [
|
|
|
841
850
|
sourceSyncProviderRequired,
|
|
842
851
|
sourceSyncProvider,
|
|
843
852
|
importIngestion,
|
|
853
|
+
goldenSetRequired,
|
|
854
|
+
goldenSetRegression,
|
|
844
855
|
missingSignals,
|
|
845
856
|
};
|
|
846
857
|
}
|
|
@@ -1070,6 +1081,10 @@ export function requiresSourceSyncProviderGate(riskTriggers = []) {
|
|
|
1070
1081
|
return riskTriggers.includes('ingestion-provider') || riskTriggers.includes('source-sync-provider');
|
|
1071
1082
|
}
|
|
1072
1083
|
|
|
1084
|
+
/**
 * Tells whether the detected risk triggers require the Golden Set /
 * Regression plan gate.
 *
 * @param {Iterable<string> | null | undefined} [riskTriggers=[]] - Detected
 *   risk trigger ids. Arrays (and strings) are checked with `includes`, Sets
 *   with `has`. `null`, `undefined` and any other value yield `false`.
 * @returns {boolean} `true` when `'golden-set-regression'` is among the triggers.
 */
export function requiresGoldenSetRegressionGate(riskTriggers = []) {
  const trigger = 'golden-set-regression';

  // The default parameter only covers `undefined`; an explicit `null` would
  // otherwise throw a TypeError on the property access below.
  if (riskTriggers == null) {
    return false;
  }
  if (typeof riskTriggers.includes === 'function') {
    return riskTriggers.includes(trigger);
  }
  // Accept Set-like collections too, so callers holding a Set of triggers do
  // not have to convert it first.
  if (typeof riskTriggers.has === 'function') {
    return riskTriggers.has(trigger);
  }
  return false;
}
|
|
1087
|
+
|
|
1073
1088
|
export function inspectUiAcceptanceScenarios(sections) {
|
|
1074
1089
|
const body = readCanonicalSection(sections, ['ui acceptance scenarios', 'ui acceptance', 'ui scenarios']);
|
|
1075
1090
|
if (!body) {
|
|
@@ -1102,6 +1117,63 @@ export function inspectUiAcceptanceScenarios(sections) {
|
|
|
1102
1117
|
return result;
|
|
1103
1118
|
}
|
|
1104
1119
|
|
|
1120
|
+
/**
 * Inspects parsed plan sections for the Golden Set / Regression gate.
 *
 * Four sections are looked up through `readCanonicalSection` (label card
 * schema, coverage matrix, negative / edge cases, harness boundary) and each
 * body is scanned, case-insensitively, for English or Russian keywords.
 *
 * @param {*} sections - Parsed plan sections, passed through unchanged to
 *   `readCanonicalSection`.
 *   NOTE(review): presumably the value returned by `parseMarkdownSections`;
 *   confirm against callers.
 * @returns {object} Boolean flags, one per check, plus `complete` (all flags
 *   true) and `present` (same value as `complete`).
 */
export function inspectGoldenSetRegressionGate(sections) {
  // Lowercase each section body once. `?? ''` keeps the keyword checks from
  // throwing if the section helper ever yields null/undefined for a missing
  // section (assumed to normally yield an empty string - TODO confirm).
  const toText = (body) => String(body ?? '').toLowerCase();

  const labelSchema = readCanonicalSection(sections, [
    'label card schema',
    'label schema',
    'golden set label schema',
    'fixture label schema',
  ]);
  const coverageMatrix = readCanonicalSection(sections, [
    'coverage matrix',
    'golden set coverage matrix',
    'eval coverage matrix',
    'fixture coverage matrix',
  ]);
  const negativeEdgeCases = readCanonicalSection(sections, [
    'negative / edge cases',
    'negative and edge cases',
    'negative cases',
    'edge cases',
    'known edge cases',
  ]);
  const harnessBoundary = readCanonicalSection(sections, [
    'harness boundary',
    'automation boundary',
    'regression harness boundary',
    'manual vs automated boundary',
  ]);

  const labelText = toText(labelSchema);
  const coverageText = toText(coverageMatrix);
  const negativeText = toText(negativeEdgeCases);
  const harnessText = toText(harnessBoundary);
  // The missing-coverage policy may be stated in any of the four sections.
  const combined = [labelText, coverageText, negativeText, harnessText].join('\n');

  const result = {
    present: false,
    hasLabelCardSchema: Boolean(labelSchema),
    hasExpectedOutputs: /expected output|expected result|expectation|ground truth|acceptance|ожидаем|результат/.test(labelText),
    hasNonGoals: /non-?goal|ignored|deferred|out of scope|not applied|не\s+дела|игнор|отлож/.test(labelText),
    // `ref`/`refs` is matched as a whole word: as a bare substring it also
    // matched unrelated words such as "prefer" or "refactor".
    hasSourceEvidence: /source|quote|snippet|evidence|reference|\brefs?\b|path|цитат|источник|сниппет/.test(labelText),
    hasCoverageMatrix: Boolean(coverageMatrix),
    hasCoverageDimensions: /track|profile|scenario|behavior|dimension|case|category|matrix|coverage|профил|сценари|покрыт|категор/.test(coverageText),
    hasNegativeEdgeCases: Boolean(negativeEdgeCases),
    hasNegativeExpectation: /negative|edge|missing|unavailable|single|ambiguous|conflict|false positive|should not|must not|негатив|краев|отсутств|неоднознач/.test(negativeText),
    hasHarnessBoundary: Boolean(harnessBoundary),
    // `ci` is matched as a whole word: as a bare substring it matched
    // "explicit", "specific", "decision", ... so nearly any text passed.
    hasManualVsAutomatedBoundary: /manual|automated|runner|harness|\bci\b|checklist|later|future|not yet|ручн|автомат|позже|сейчас/.test(harnessText),
    hasMissingCoveragePolicy: /missing|unavailable|gap|deferred|not found|known missing|отсутств|недоступ|пробел|отлож/.test(combined),
  };

  // The gate is complete only when every individual check passes.
  result.complete = result.hasLabelCardSchema
    && result.hasExpectedOutputs
    && result.hasNonGoals
    && result.hasSourceEvidence
    && result.hasCoverageMatrix
    && result.hasCoverageDimensions
    && result.hasNegativeEdgeCases
    && result.hasNegativeExpectation
    && result.hasHarnessBoundary
    && result.hasManualVsAutomatedBoundary
    && result.hasMissingCoveragePolicy;
  result.present = result.complete;
  return result;
}
|
|
1176
|
+
|
|
1105
1177
|
export function inspectComplexityPerformanceBudget(sections) {
|
|
1106
1178
|
const body = readCanonicalSection(sections, [
|
|
1107
1179
|
'complexity / performance budget',
|
|
@@ -1407,6 +1479,17 @@ export function buildCheckerContextPack({
|
|
|
1407
1479
|
].join('\n')
|
|
1408
1480
|
: '- Source sync/provider gate is not required by detected triggers.',
|
|
1409
1481
|
'',
|
|
1482
|
+
'## Golden Set / Regression Expectations',
|
|
1483
|
+
'',
|
|
1484
|
+
qualityGates.goldenSetRequired
|
|
1485
|
+
? [
|
|
1486
|
+
'- Golden set/eval/regression fixture risk detected.',
|
|
1487
|
+
`- Golden Set / Regression Gate complete: \`${qualityGates.goldenSetRegression.present ? 'yes' : 'no'}\`.`,
|
|
1488
|
+
'- Checker must return `return_to_plan` if the plan lacks label-card schema, coverage matrix, negative/edge cases, harness boundary, missing coverage policy or source evidence rules.',
|
|
1489
|
+
'- A golden set is a reusable test contract, not merely a list of examples to process.',
|
|
1490
|
+
].join('\n')
|
|
1491
|
+
: '- Golden set/regression fixture gate is not required by detected triggers.',
|
|
1492
|
+
'',
|
|
1410
1493
|
'## Relevant Playbooks',
|
|
1411
1494
|
'',
|
|
1412
1495
|
renderRelevantPlaybookIndex(relevantPlaybooks),
|
|
@@ -1634,6 +1717,27 @@ export function validateExecutionEvidenceForPlan({ planContent, executionContent
|
|
|
1634
1717
|
}
|
|
1635
1718
|
}
|
|
1636
1719
|
|
|
1720
|
+
if (hasAnySection(planSections, ['label card schema', 'label schema', 'coverage matrix', 'negative / edge cases', 'harness boundary'])) {
|
|
1721
|
+
const evidence = readAnySection(executionSections, [
|
|
1722
|
+
'golden set / regression evidence',
|
|
1723
|
+
'golden set evidence',
|
|
1724
|
+
'regression fixture evidence',
|
|
1725
|
+
'label evidence',
|
|
1726
|
+
'fixture evidence',
|
|
1727
|
+
]);
|
|
1728
|
+
if (!evidence) {
|
|
1729
|
+
errors.push({
|
|
1730
|
+
category: 'missing_evidence',
|
|
1731
|
+
message: 'Plan contains Golden Set / Regression Gate but execution.md is missing Golden Set / Regression Evidence.',
|
|
1732
|
+
});
|
|
1733
|
+
} else if (!/(label card|label schema|coverage matrix|negative|edge case|harness|runner|manual|automated|expected output|non-?goal|source|quote|snippet|missing coverage)/i.test(evidence)) {
|
|
1734
|
+
errors.push({
|
|
1735
|
+
category: 'insufficient_evidence',
|
|
1736
|
+
message: 'Golden Set / Regression Evidence must show label cards, coverage matrix, negative/edge cases, harness boundary, expected outputs/non-goals and source refs.',
|
|
1737
|
+
});
|
|
1738
|
+
}
|
|
1739
|
+
}
|
|
1740
|
+
|
|
1637
1741
|
return errors;
|
|
1638
1742
|
}
|
|
1639
1743
|
|
|
@@ -1685,6 +1789,9 @@ function buildCheckerQuestions({ risk, qualityGates }) {
|
|
|
1685
1789
|
if (qualityGates.importIngestion?.required) {
|
|
1686
1790
|
questions.push('Does the import plan use representative real fixtures when available, extract raw metadata needed downstream, and define exact duplicate-import behavior?');
|
|
1687
1791
|
}
|
|
1792
|
+
if (qualityGates.goldenSetRequired) {
|
|
1793
|
+
questions.push('Does the golden set define a reusable test contract with label-card schema, coverage matrix, negative/edge cases, source refs and explicit manual-vs-automated harness boundary?');
|
|
1794
|
+
}
|
|
1688
1795
|
return questions;
|
|
1689
1796
|
}
|
|
1690
1797
|
|
|
@@ -14,6 +14,7 @@ import {
|
|
|
14
14
|
selectRelevantPlaybookNames,
|
|
15
15
|
inspectComplexityPerformanceBudget,
|
|
16
16
|
inspectExecutionMetadata,
|
|
17
|
+
inspectGoldenSetRegressionGate,
|
|
17
18
|
inspectAuditWriterModel,
|
|
18
19
|
inspectMigrationApplyPlan,
|
|
19
20
|
inspectOptimizationStrategy,
|
|
@@ -24,6 +25,7 @@ import {
|
|
|
24
25
|
inspectVerificationLadder,
|
|
25
26
|
parseMarkdownSections,
|
|
26
27
|
requiresOptimizationStrategy,
|
|
28
|
+
requiresGoldenSetRegressionGate,
|
|
27
29
|
requiresStandardsAlignment,
|
|
28
30
|
validateExecutionEvidenceForPlan,
|
|
29
31
|
} from './check-context-utils.mjs';
|
|
@@ -833,4 +835,120 @@ describe('agent pipeline quality gates', () => {
|
|
|
833
835
|
expect(result.importIngestion.present).toBe(true);
|
|
834
836
|
expect(result.missingSignals.some((signal) => signal.includes('Import/ingestion plan must include'))).toBe(false);
|
|
835
837
|
});
|
|
838
|
+
|
|
839
|
+
it('detects golden set regression work and requires a test-contract gate', () => {
|
|
840
|
+
const risk = classifyRisk({
|
|
841
|
+
structuralLines: [
|
|
842
|
+
'- Create golden set label cards and regression checklist for process quality.',
|
|
843
|
+
],
|
|
844
|
+
referencedFiles: [],
|
|
845
|
+
planSections: new Map(),
|
|
846
|
+
});
|
|
847
|
+
|
|
848
|
+
expect(risk.riskTriggers).toContain('golden-set-regression');
|
|
849
|
+
expect(requiresGoldenSetRegressionGate(risk.riskTriggers)).toBe(true);
|
|
850
|
+
expect(selectRelevantPlaybookNames(risk.riskTriggers)).toContain('golden-set-regression');
|
|
851
|
+
|
|
852
|
+
const result = analyzePlanQualityGates({
|
|
853
|
+
planContent: [
|
|
854
|
+
'# Plan',
|
|
855
|
+
'',
|
|
856
|
+
'## Implementation Steps',
|
|
857
|
+
'',
|
|
858
|
+
'- Select golden set examples with expected outputs and non-goals.',
|
|
859
|
+
].join('\n'),
|
|
860
|
+
risk,
|
|
861
|
+
});
|
|
862
|
+
|
|
863
|
+
expect(result.goldenSetRequired).toBe(true);
|
|
864
|
+
expect(result.goldenSetRegression.present).toBe(false);
|
|
865
|
+
expect(result.missingSignals).toContain('Golden set/eval fixture work must include `## Label Card Schema`, `## Coverage Matrix`, `## Negative / Edge Cases`, and `## Harness Boundary` so examples become a test contract, not just a list.');
|
|
866
|
+
});
|
|
867
|
+
|
|
868
|
+
it('accepts golden set regression gate with schema, coverage, edge cases and harness boundary', () => {
|
|
869
|
+
const plan = [
|
|
870
|
+
'# Plan',
|
|
871
|
+
'',
|
|
872
|
+
'## Label Card Schema',
|
|
873
|
+
'',
|
|
874
|
+
'- Source path / source id and short quote snippet are required.',
|
|
875
|
+
'- Expected outputs and acceptance expectations are required.',
|
|
876
|
+
'- Non-goals / ignored or deferred outputs are required.',
|
|
877
|
+
'- Confidence and risk notes are recorded when uncertain.',
|
|
878
|
+
'',
|
|
879
|
+
'## Coverage Matrix',
|
|
880
|
+
'',
|
|
881
|
+
'- Coverage dimensions: track, profile, scenario category and source format.',
|
|
882
|
+
'- Missing coverage / unavailable fixtures are recorded as gaps.',
|
|
883
|
+
'',
|
|
884
|
+
'## Negative / Edge Cases',
|
|
885
|
+
'',
|
|
886
|
+
'- Negative examples and edge cases must catch false positives and ambiguous input.',
|
|
887
|
+
'- Missing edge cases are recorded instead of fabricated.',
|
|
888
|
+
'',
|
|
889
|
+
'## Harness Boundary',
|
|
890
|
+
'',
|
|
891
|
+
'- Manual labels are created now; automated runner/CI harness comes later.',
|
|
892
|
+
].join('\n');
|
|
893
|
+
const sections = parseMarkdownSections(plan);
|
|
894
|
+
const inspected = inspectGoldenSetRegressionGate(sections);
|
|
895
|
+
const result = analyzePlanQualityGates({
|
|
896
|
+
planContent: plan,
|
|
897
|
+
risk: {
|
|
898
|
+
riskProfile: 'medium',
|
|
899
|
+
riskTriggers: ['golden-set-regression'],
|
|
900
|
+
},
|
|
901
|
+
});
|
|
902
|
+
|
|
903
|
+
expect(inspected.present).toBe(true);
|
|
904
|
+
expect(result.goldenSetRegression.present).toBe(true);
|
|
905
|
+
expect(result.missingSignals.some((signal) => signal.includes('Golden set/eval fixture work'))).toBe(false);
|
|
906
|
+
});
|
|
907
|
+
|
|
908
|
+
it('requires golden set regression execution evidence when the plan declares label gates', () => {
|
|
909
|
+
const plan = [
|
|
910
|
+
'# Plan',
|
|
911
|
+
'',
|
|
912
|
+
'## Label Card Schema',
|
|
913
|
+
'',
|
|
914
|
+
'- Source path and source quote snippet.',
|
|
915
|
+
'- Expected outputs and non-goals.',
|
|
916
|
+
'',
|
|
917
|
+
'## Coverage Matrix',
|
|
918
|
+
'',
|
|
919
|
+
'- Coverage dimensions and missing coverage gaps.',
|
|
920
|
+
'',
|
|
921
|
+
'## Negative / Edge Cases',
|
|
922
|
+
'',
|
|
923
|
+
'- Negative cases and edge case behavior.',
|
|
924
|
+
'',
|
|
925
|
+
'## Harness Boundary',
|
|
926
|
+
'',
|
|
927
|
+
'- Manual labels now, automated runner later.',
|
|
928
|
+
].join('\n');
|
|
929
|
+
|
|
930
|
+
const missing = validateExecutionEvidenceForPlan({
|
|
931
|
+
planContent: plan,
|
|
932
|
+
executionContent: '# Execution\n\nNo golden set evidence yet.',
|
|
933
|
+
});
|
|
934
|
+
expect(missing).toContainEqual({
|
|
935
|
+
category: 'missing_evidence',
|
|
936
|
+
message: 'Plan contains Golden Set / Regression Gate but execution.md is missing Golden Set / Regression Evidence.',
|
|
937
|
+
});
|
|
938
|
+
|
|
939
|
+
const ok = validateExecutionEvidenceForPlan({
|
|
940
|
+
planContent: plan,
|
|
941
|
+
executionContent: [
|
|
942
|
+
'# Execution',
|
|
943
|
+
'',
|
|
944
|
+
'## Golden Set / Regression Evidence',
|
|
945
|
+
'',
|
|
946
|
+
'- Label cards follow the label schema and include expected output, non-goals, source refs and short quote snippets.',
|
|
947
|
+
'- Coverage matrix filled; missing coverage recorded.',
|
|
948
|
+
'- Negative / edge case examples documented.',
|
|
949
|
+
'- Harness boundary: manual checklist now, automated runner later.',
|
|
950
|
+
].join('\n'),
|
|
951
|
+
});
|
|
952
|
+
expect(ok.some((issue) => issue.message.includes('Golden Set / Regression'))).toBe(false);
|
|
953
|
+
});
|
|
836
954
|
});
|
|
@@ -199,12 +199,13 @@ export function buildCheckerLlmInputPack({
|
|
|
199
199
|
}) {
|
|
200
200
|
const selectedMode = normalizeLlmContextMode(mode) || 'standard';
|
|
201
201
|
const artifacts = selectedMode === 'strict'
|
|
202
|
-
? readArtifacts(taskDir, ['brief.md', 'research.md', 'plan.md', 'status.md', 'feedback.md', 'execution-feedback.md'], 'full')
|
|
202
|
+
? readArtifacts(taskDir, ['brief.md', 'research.md', 'plan.md', 'status.md', 'precheck-remediation.md', 'feedback.md', 'execution-feedback.md'], 'full')
|
|
203
203
|
: {
|
|
204
204
|
'brief.md': compactArtifact(taskDir, 'brief.md', selectedMode, ['goal', 'scope', 'success criteria']),
|
|
205
205
|
'research.md': compactArtifact(taskDir, 'research.md', selectedMode, ['findings', 'evidence', 'repo', 'architecture', 'standards']),
|
|
206
206
|
'plan.md': compactArtifact(taskDir, 'plan.md', selectedMode, CHECK_RELEVANT_SECTIONS),
|
|
207
207
|
'status.md': compactStatus(readTaskFile(taskDir, 'status.md')),
|
|
208
|
+
'precheck-remediation.md': compactArtifact(taskDir, 'precheck-remediation.md', selectedMode, ['checklist', 'rerun rule', 'purpose']),
|
|
208
209
|
'feedback.md': compactArtifact(taskDir, 'feedback.md', selectedMode, ['feedback event', 'classification', 'supervisor decision']),
|
|
209
210
|
};
|
|
210
211
|
|
|
@@ -325,6 +326,7 @@ export function buildVerifierLlmInputPack({
|
|
|
325
326
|
checkResult: readOptionalJson(taskDir, 'check.result.json'),
|
|
326
327
|
mode: 'standard',
|
|
327
328
|
}),
|
|
329
|
+
'precheck-remediation.md': compactArtifact(taskDir, 'precheck-remediation.md', 'standard', ['checklist', 'rerun rule', 'purpose']),
|
|
328
330
|
'check-resolution.md': compactArtifact(taskDir, 'check-resolution.md', 'standard', ['structured resolution', 'root cause', 'resolution']),
|
|
329
331
|
'human-gate-summary.md': truncateMiddle(readTaskFile(taskDir, 'human-gate-summary.md'), 3500),
|
|
330
332
|
'execution.md': readTaskFile(taskDir, 'execution.md'),
|
|
@@ -346,6 +348,7 @@ export function buildVerifierLlmInputPack({
|
|
|
346
348
|
checkResult: readOptionalJson(taskDir, 'check.result.json'),
|
|
347
349
|
mode: selectedMode,
|
|
348
350
|
}),
|
|
351
|
+
'precheck-remediation.md': compactArtifact(taskDir, 'precheck-remediation.md', selectedMode, ['checklist', 'rerun rule', 'purpose']),
|
|
349
352
|
'check-resolution.md': truncateMiddle(readTaskFile(taskDir, 'check-resolution.md'), charLimitForMode(selectedMode, 1500, 3500)),
|
|
350
353
|
'human-gate-summary.md': truncateMiddle(readTaskFile(taskDir, 'human-gate-summary.md'), charLimitForMode(selectedMode, 1200, 2500)),
|
|
351
354
|
'execution.md': compactArtifact(taskDir, 'execution.md', selectedMode, VERIFY_EXECUTION_SECTIONS),
|
|
@@ -118,6 +118,42 @@ describe('llm input pack utilities', () => {
|
|
|
118
118
|
expect(pack.meta.compactedArtifacts).toContain('plan.md');
|
|
119
119
|
});
|
|
120
120
|
|
|
121
|
+
it('includes precheck remediation in checker packs when present', () => {
|
|
122
|
+
const taskDir = createTask();
|
|
123
|
+
write(taskDir, 'precheck-remediation.md', [
|
|
124
|
+
'# Precheck Remediation',
|
|
125
|
+
'',
|
|
126
|
+
'## Checklist',
|
|
127
|
+
'',
|
|
128
|
+
'- [ ] P-001: Optimization Strategy is missing.',
|
|
129
|
+
'',
|
|
130
|
+
'## Rerun Rule',
|
|
131
|
+
'',
|
|
132
|
+
'- Rerun Check only after every checklist item is addressed.',
|
|
133
|
+
].join('\n'));
|
|
134
|
+
|
|
135
|
+
const pack = buildCheckerLlmInputPack({
|
|
136
|
+
taskDir,
|
|
137
|
+
taskId: 'TASK-999-token-pack',
|
|
138
|
+
checkerPromptSha: 'sha256:test',
|
|
139
|
+
cacheKey: { test: true },
|
|
140
|
+
checkContext: {
|
|
141
|
+
planSha: 'sha256:plan',
|
|
142
|
+
memorySha: 'sha256:memory',
|
|
143
|
+
riskProfile: 'high',
|
|
144
|
+
riskTriggers: ['source-sync-provider'],
|
|
145
|
+
},
|
|
146
|
+
checkEvidence: '# Evidence\n\nok',
|
|
147
|
+
checkerContextPack: '# Checker Context Pack\n\nok',
|
|
148
|
+
taskManifest: '{}',
|
|
149
|
+
projectMemory: [],
|
|
150
|
+
mode: 'standard',
|
|
151
|
+
});
|
|
152
|
+
|
|
153
|
+
expect(pack.input.taskArtifacts['precheck-remediation.md']).toContain('Optimization Strategy is missing.');
|
|
154
|
+
expect(pack.input.taskArtifacts['precheck-remediation.md']).toContain('Rerun Check only after every checklist item is addressed.');
|
|
155
|
+
});
|
|
156
|
+
|
|
121
157
|
it('stabilizes checker task manifest by excluding volatile check telemetry', () => {
|
|
122
158
|
const taskDir = createTask();
|
|
123
159
|
const pack = buildCheckerLlmInputPack({
|
|
@@ -274,6 +310,28 @@ describe('llm input pack utilities', () => {
|
|
|
274
310
|
expect(pack.input.taskArtifacts['check.md']).toContain('Human question should remain visible.');
|
|
275
311
|
});
|
|
276
312
|
|
|
313
|
+
it('includes precheck remediation in verifier packs when present', () => {
|
|
314
|
+
const taskDir = createTask();
|
|
315
|
+
write(taskDir, 'precheck-remediation.md', [
|
|
316
|
+
'# Precheck Remediation',
|
|
317
|
+
'',
|
|
318
|
+
'## Checklist',
|
|
319
|
+
'',
|
|
320
|
+
'- [ ] P-001: Source Sync / Provider Gate is missing.',
|
|
321
|
+
].join('\n'));
|
|
322
|
+
|
|
323
|
+
const pack = buildVerifierLlmInputPack({
|
|
324
|
+
taskDir,
|
|
325
|
+
taskId: 'TASK-999-token-pack',
|
|
326
|
+
planSha: 'sha256:plan',
|
|
327
|
+
executionSha: 'sha256:execution',
|
|
328
|
+
verifier: { provider: 'test', model: 'test', reasoningEffort: 'none', runId: 'run' },
|
|
329
|
+
mode: 'standard',
|
|
330
|
+
});
|
|
331
|
+
|
|
332
|
+
expect(pack.input.taskArtifacts['precheck-remediation.md']).toContain('Source Sync / Provider Gate is missing.');
|
|
333
|
+
});
|
|
334
|
+
|
|
277
335
|
it('uses a conservative estimate for Cyrillic and JSON-heavy payloads', () => {
|
|
278
336
|
const value = JSON.stringify({
|
|
279
337
|
text: 'Проверка русскоязычного JSON payload с большим количеством кавычек и структурных символов.',
|
|
@@ -84,6 +84,12 @@ export function buildTaskManifest({ taskDir, now = new Date().toISOString(), exi
|
|
|
84
84
|
evidenceRequired: hasPlanGate(inputs.taskArtifacts.get('plan.md'), ['import / ingestion contract', 'import ingestion contract', 'ingestion contract', 'import contract']),
|
|
85
85
|
evidenceComplete: !evidenceIssues.some((issue) => issue.message.includes('Import / Ingestion')),
|
|
86
86
|
},
|
|
87
|
+
goldenSetRegression: {
|
|
88
|
+
required: Boolean(inputs.qualityGates.goldenSetRequired),
|
|
89
|
+
planComplete: Boolean(inputs.qualityGates.goldenSetRegression?.present),
|
|
90
|
+
evidenceRequired: hasPlanGate(inputs.taskArtifacts.get('plan.md'), ['label card schema', 'label schema', 'coverage matrix', 'negative / edge cases', 'harness boundary']),
|
|
91
|
+
evidenceComplete: !evidenceIssues.some((issue) => issue.message.includes('Golden Set / Regression Evidence')),
|
|
92
|
+
},
|
|
87
93
|
},
|
|
88
94
|
context: {
|
|
89
95
|
planSha: inputs.planFingerprint.planSha,
|
package/bin/quality-gates.mjs
CHANGED
|
@@ -38,6 +38,8 @@ function main() {
|
|
|
38
38
|
productionRolloutComplete: inputs.qualityGates.productionRollout.present,
|
|
39
39
|
sourceSyncProviderRequired: inputs.qualityGates.sourceSyncProviderRequired,
|
|
40
40
|
sourceSyncProviderComplete: inputs.qualityGates.sourceSyncProvider.present,
|
|
41
|
+
goldenSetRequired: inputs.qualityGates.goldenSetRequired,
|
|
42
|
+
goldenSetComplete: inputs.qualityGates.goldenSetRegression.present,
|
|
41
43
|
missingSignals: inputs.qualityGates.missingSignals,
|
|
42
44
|
},
|
|
43
45
|
executionEvidence: {
|
|
@@ -67,6 +69,7 @@ function printHuman(result) {
|
|
|
67
69
|
console.log(`- Optimization strategy: ${result.planQuality.optimizationRequired ? `${result.planQuality.optimizationTier} required / ${result.planQuality.optimizationStrategyComplete ? 'complete' : 'missing'}` : `${result.planQuality.optimizationTier || 'O0'} / not required`}`);
|
|
68
70
|
console.log(`- Production rollout: ${result.planQuality.productionRolloutRequired ? result.planQuality.productionRolloutComplete ? 'required / complete' : 'required / missing' : 'not required'}`);
|
|
69
71
|
console.log(`- Source sync/provider: ${result.planQuality.sourceSyncProviderRequired ? result.planQuality.sourceSyncProviderComplete ? 'required / complete' : 'required / missing' : 'not required'}`);
|
|
72
|
+
console.log(`- Golden set/regression: ${result.planQuality.goldenSetRequired ? result.planQuality.goldenSetComplete ? 'required / complete' : 'required / missing' : 'not required'}`);
|
|
70
73
|
if (result.planQuality.missingSignals.length > 0) {
|
|
71
74
|
console.log('Plan quality issues:');
|
|
72
75
|
for (const issue of result.planQuality.missingSignals) {
|
package/bin/run-check.mjs
CHANGED
|
@@ -462,6 +462,12 @@ function writeDeterministicPrecheckReturn({
|
|
|
462
462
|
readyForHumanGate: false,
|
|
463
463
|
createdAt: new Date().toISOString(),
|
|
464
464
|
};
|
|
465
|
+
const remediation = buildPrecheckRemediation({
|
|
466
|
+
taskId,
|
|
467
|
+
checkContext,
|
|
468
|
+
issues,
|
|
469
|
+
startedAt,
|
|
470
|
+
});
|
|
465
471
|
const markdown = [
|
|
466
472
|
'# Check',
|
|
467
473
|
'',
|
|
@@ -471,6 +477,10 @@ function writeDeterministicPrecheckReturn({
|
|
|
471
477
|
'',
|
|
472
478
|
'External checker was not invoked because machine-readable plan/context gates already found blocking issues.',
|
|
473
479
|
'',
|
|
480
|
+
'## Consolidated remediation',
|
|
481
|
+
'',
|
|
482
|
+
'Before rerunning Check, close the full checklist in `precheck-remediation.md`. Do not fix one item and immediately rerun Check while other listed items remain open.',
|
|
483
|
+
'',
|
|
474
484
|
'## structured findings',
|
|
475
485
|
'',
|
|
476
486
|
'| ID | Severity | Category | Claim | Expected correction |',
|
|
@@ -488,15 +498,16 @@ function writeDeterministicPrecheckReturn({
|
|
|
488
498
|
|
|
489
499
|
writeTaskFile(taskDir, 'check.md', markdown);
|
|
490
500
|
writeTaskFile(taskDir, 'check.result.json', JSON.stringify(result, null, 2));
|
|
501
|
+
writeTaskFile(taskDir, 'precheck-remediation.md', remediation.markdown);
|
|
491
502
|
updateStatus(taskDir, {
|
|
492
503
|
checkVerdict: '`return_to_plan`',
|
|
493
504
|
checkResult: '- `check.result.json`: current',
|
|
494
505
|
supervisorAction: 'Deterministic Check preflight blocked external checker invocation.',
|
|
495
|
-
nextStep: '
|
|
506
|
+
nextStep: 'Close every item in `precheck-remediation.md`, update plan/research/status once, then rerun Check.',
|
|
496
507
|
humanApproval: 'no',
|
|
497
508
|
});
|
|
498
509
|
ensureFreshCheckContext(taskDir, taskId);
|
|
499
|
-
appendOrchestrationLog(taskDir, `deterministic Check preflight returned return_to_plan; findings=${findings.length}; external checker skipped`);
|
|
510
|
+
appendOrchestrationLog(taskDir, `deterministic Check preflight returned return_to_plan; findings=${findings.length}; remediation=${remediation.issueSetSha}; external checker skipped`);
|
|
500
511
|
}
|
|
501
512
|
|
|
502
513
|
function expectedCorrectionForPrecheckIssue(issue) {
|
|
@@ -509,6 +520,73 @@ function expectedCorrectionForPrecheckIssue(issue) {
|
|
|
509
520
|
return 'Fix task-manifest/check-context consistency before external Check.';
|
|
510
521
|
}
|
|
511
522
|
|
|
523
|
+
function buildPrecheckRemediation({
|
|
524
|
+
taskId,
|
|
525
|
+
checkContext,
|
|
526
|
+
issues,
|
|
527
|
+
startedAt,
|
|
528
|
+
}) {
|
|
529
|
+
const issueSetSha = sha256Json(issues.map((issue) => ({
|
|
530
|
+
category: issue.category,
|
|
531
|
+
message: issue.message,
|
|
532
|
+
})));
|
|
533
|
+
const grouped = groupIssuesByCategory(issues);
|
|
534
|
+
const lines = [
|
|
535
|
+
'# Precheck Remediation',
|
|
536
|
+
'',
|
|
537
|
+
`Task: \`${taskId}\``,
|
|
538
|
+
`Issue set: \`${issueSetSha}\``,
|
|
539
|
+
`Plan SHA: \`${checkContext.planSha}\``,
|
|
540
|
+
`Memory SHA: \`${checkContext.memorySha}\``,
|
|
541
|
+
`Created at: \`${new Date().toISOString()}\``,
|
|
542
|
+
`Precheck duration: \`${buildTiming(startedAt).durationMs}ms\``,
|
|
543
|
+
'',
|
|
544
|
+
'## Purpose',
|
|
545
|
+
'',
|
|
546
|
+
'This artifact consolidates deterministic Check blockers so the plan can be fixed in one focused pass before another external Check.',
|
|
547
|
+
'',
|
|
548
|
+
'Do not rerun external Check while any checklist item below is still open. Update `plan.md`, `research.md`, or `status.md` once, then rerun Check after the whole set is addressed.',
|
|
549
|
+
'',
|
|
550
|
+
'## Checklist',
|
|
551
|
+
'',
|
|
552
|
+
];
|
|
553
|
+
|
|
554
|
+
let index = 1;
|
|
555
|
+
for (const [category, categoryIssues] of grouped.entries()) {
|
|
556
|
+
lines.push(`### ${category}`);
|
|
557
|
+
lines.push('');
|
|
558
|
+
for (const issue of categoryIssues) {
|
|
559
|
+
lines.push(`- [ ] P-${String(index).padStart(3, '0')}: ${issue.message}`);
|
|
560
|
+
lines.push(` - Expected correction: ${expectedCorrectionForPrecheckIssue(issue)}`);
|
|
561
|
+
index += 1;
|
|
562
|
+
}
|
|
563
|
+
lines.push('');
|
|
564
|
+
}
|
|
565
|
+
|
|
566
|
+
lines.push('## Rerun Rule');
|
|
567
|
+
lines.push('');
|
|
568
|
+
lines.push('- Rerun Check only after every checklist item is addressed or explicitly rejected with evidence/human decision.');
|
|
569
|
+
lines.push('- If the same issue set appears again, consolidate the remaining fixes instead of starting another one-item loop.');
|
|
570
|
+
lines.push('- If a listed item is not applicable, record the reason in `plan.md` or `status.md` before rerunning Check.');
|
|
571
|
+
lines.push('');
|
|
572
|
+
|
|
573
|
+
return {
|
|
574
|
+
issueSetSha,
|
|
575
|
+
markdown: lines.join('\n'),
|
|
576
|
+
};
|
|
577
|
+
}
|
|
578
|
+
|
|
579
|
+
function groupIssuesByCategory(issues) {
|
|
580
|
+
const grouped = new Map();
|
|
581
|
+
for (const issue of issues) {
|
|
582
|
+
const category = issue.category || 'unknown';
|
|
583
|
+
const existing = grouped.get(category) || [];
|
|
584
|
+
existing.push(issue);
|
|
585
|
+
grouped.set(category, existing);
|
|
586
|
+
}
|
|
587
|
+
return grouped;
|
|
588
|
+
}
|
|
589
|
+
|
|
512
590
|
function escapeTableCell(value) {
|
|
513
591
|
return String(value || '').replace(/\|/g, '\\|').replace(/\n/g, ' ').trim();
|
|
514
592
|
}
|
package/package.json
CHANGED
package/playbooks/golden-set-regression.md
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# Golden Set / Regression Fixture Playbook
|
|
2
|
+
|
|
3
|
+
Use this when a task creates or changes golden sets, eval fixtures, label cards,
|
|
4
|
+
ground-truth examples, regression checklists or expected-output datasets.
|
|
5
|
+
|
|
6
|
+
Golden sets are test contracts, not just lists of examples.
|
|
7
|
+
|
|
8
|
+
## Required Plan Sections
|
|
9
|
+
|
|
10
|
+
- `## Label Card Schema`
|
|
11
|
+
- `## Coverage Matrix`
|
|
12
|
+
- `## Negative / Edge Cases`
|
|
13
|
+
- `## Harness Boundary`
|
|
14
|
+
|
|
15
|
+
## Label Card Schema
|
|
16
|
+
|
|
17
|
+
Each label card must define:
|
|
18
|
+
|
|
19
|
+
- stable fixture/source id and source reference;
|
|
20
|
+
- expected outputs or expected behavior;
|
|
21
|
+
- non-goals, ignored outputs or deferred outputs;
|
|
22
|
+
- source evidence rules, such as short quotes/snippets/refs and privacy limits;
|
|
23
|
+
- confidence/risk notes when labels are uncertain.
|
|
24
|
+
|
|
25
|
+
## Coverage Matrix
|
|
26
|
+
|
|
27
|
+
The plan must name coverage dimensions that matter for this product area.
|
|
28
|
+
Examples include profiles, tracks, event types, user journeys, data qualities,
|
|
29
|
+
known edge behaviors, or source formats.
|
|
30
|
+
|
|
31
|
+
For every important dimension, record one of:
|
|
32
|
+
|
|
33
|
+
- selected fixture/example;
|
|
34
|
+
- explicitly missing/unavailable;
|
|
35
|
+
- intentionally deferred with reason.
|
|
36
|
+
|
|
37
|
+
## Negative / Edge Cases
|
|
38
|
+
|
|
39
|
+
Include negative or edge cases that protect against false positives and
|
|
40
|
+
over-application. If no negative case exists in the current fixture pool, record
|
|
41
|
+
the missing case as a coverage gap instead of fabricating it.
|
|
42
|
+
|
|
43
|
+
## Harness Boundary
|
|
44
|
+
|
|
45
|
+
State whether the task is creating manual labels/fixtures only or an automated
|
|
46
|
+
runner/CI harness. Do not claim automated regression coverage when only manual
|
|
47
|
+
label artifacts exist.
|
|
48
|
+
|
|
49
|
+
## Verify Expectations
|
|
50
|
+
|
|
51
|
+
Execution evidence must show:
|
|
52
|
+
|
|
53
|
+
- label cards were created according to schema;
|
|
54
|
+
- coverage matrix was filled;
|
|
55
|
+
- negative/edge cases were selected or documented as missing;
|
|
56
|
+
- expected outputs and non-goals are inspectable;
|
|
57
|
+
- source refs/snippets respect privacy boundaries;
|
|
58
|
+
- manual-vs-automated harness boundary is explicit.
|
package/prompts/checker.md
CHANGED
|
@@ -70,6 +70,9 @@ Project-specific context приходит только через task artifacts
|
|
|
70
70
|
22. Если `checker-context-pack.md`, `task-manifest.json` или risk triggers показывают sync/import/provider/raw records/retries/pagination/rate limits/idempotency/replay/backfill/partial failure, план обязан содержать `## Source Sync / Provider Gate`: scope/provider window, idempotency with dedupe key and duplicate action, failure handling/retry boundaries and coverage/parity evidence. Для import/manual-upload/transcript/evidence-capture задач план также обязан содержать `## Import / Ingestion Contract`: real representative fixtures when available or explicit no-real-fixtures reason, raw metadata/speaker-label extraction needed downstream, and repeat-import policy. Размытое "duplicates detected or reported" без skip/link/update/reject/report-only semantics недостаточно.
|
|
71
71
|
23. Если `task-manifest.json.loopDetector.requiresConsolidatedRemediation=true`, Checker должен блокировать повторный мелкий loop, пока plan/check-resolution не содержит consolidated remediation секцию, которая объединяет repeated reasons.
|
|
72
72
|
24. Если `llmInputPolicy.mode` не `strict` и отсутствующий full artifact реально нужен для честной оценки, verdict должен быть `context_insufficient`. Не используй `context_insufficient`, если deterministic gate уже явно показывает `return_to_plan`.
|
|
73
|
+
25. Если в task artifacts есть `precheck-remediation.md`, Checker должен проверить, что весь checklist был закрыт одним consolidated plan update. Не создавай новый мелкий blocker по одному пункту из старого checklist, если оставшиеся пункты тоже очевидно не закрыты: верни consolidated finding, ссылающийся на `precheck-remediation.md`.
|
|
74
|
+
26. Minor process/evidence polish не должен блокировать Human Gate, если deterministic gates закрыты, acceptance criteria покрыты, scope/risk/security/data correctness не нарушены, а остаток можно безопасно записать как `non_blocking` или human question.
|
|
75
|
+
27. Если plan/task/checker-context показывает golden set/eval/regression fixtures/label cards/ground truth, Checker должен требовать `## Label Card Schema`, `## Coverage Matrix`, `## Negative / Edge Cases` и `## Harness Boundary`. Golden set без schema/coverage/negative cases/source evidence/non-goals/manual-vs-automated boundary является `return_to_plan`, даже если есть общий текст про expected outputs.
|
|
73
76
|
|
|
74
77
|
## Контракт выхода
|
|
75
78
|
|
|
@@ -95,6 +98,7 @@ Project-specific context приходит только через task artifacts
|
|
|
95
98
|
- проверка Optimization Strategy;
|
|
96
99
|
- проверка Production Rollout Gate;
|
|
97
100
|
- проверка Source Sync / Provider Gate;
|
|
101
|
+
- проверка Golden Set / Regression Gate;
|
|
98
102
|
- проверка Loop Detector / consolidated remediation;
|
|
99
103
|
- достаточность compact context или `context_insufficient`;
|
|
100
104
|
- рекомендация supervisor: `return_to_plan` или `ready_for_human_gate`.
|
package/prompts/planner.md
CHANGED
|
@@ -55,6 +55,7 @@
|
|
|
55
55
|
19. План проверки должен быть ladder-based: micro-verify during Execute, slice-verify before completion and external Verify requirement for closeout/high-risk claims.
|
|
56
56
|
20. План должен описывать meaningful slice. Не дроби локальную работу на отдельный Plan/Check/Verify для каждого микрофикса, если риски и target остаются внутри одного approved tier.
|
|
57
57
|
21. Если risk triggers или `checker-context-pack.md` показывают O2/O3 hot-path work, Planner обязан добавить `## Optimization Strategy`: tier, hot paths, expected data size, chosen efficient approach, anti-patterns avoided and bounded optimizer budget/stop rule. Цель gate — предотвратить очевидно неэффективное решение до Execute, а не запускать бесконечную оптимизацию.
|
|
58
|
+
22. Если задача создает golden set/eval/regression fixtures/label cards/ground truth, Planner обязан добавить `## Label Card Schema`, `## Coverage Matrix`, `## Negative / Edge Cases` и `## Harness Boundary`. Golden set должен быть test contract with expected outputs, non-goals, source refs, missing coverage policy and manual-vs-automated boundary.
|
|
58
59
|
|
|
59
60
|
## Check Resolution Contract
|
|
60
61
|
|
|
@@ -122,6 +123,7 @@ Allowed `evidenceRefs[].type` and `artifactChangeRefs[].type`:
|
|
|
122
123
|
- verification ladder;
|
|
123
124
|
- UI verification path или явное `not applicable`;
|
|
124
125
|
- Optimization Strategy для O2/O3 или явное `not applicable`;
|
|
126
|
+
- `Label Card Schema`, `Coverage Matrix`, `Negative / Edge Cases`, `Harness Boundary` для golden set/eval fixture задач;
|
|
125
127
|
- global standards alignment;
|
|
126
128
|
- что требует human approval.
|
|
127
129
|
|
package/prompts/supervisor.md
CHANGED
|
@@ -59,6 +59,8 @@ Supervisor является code-level orchestrator по контракту: rou
|
|
|
59
59
|
25. `verify.result.json` должен сверять `plan.md` с фактическим `execution.md`, diff/files/tests и явным execution evidence. Self-reported executor checks без verifier verdict не являются достаточным Verify.
|
|
60
60
|
26. `verify.result.json.verdict = pass | pass_with_notes` допустим при `verificationMode = internal_supervisor` для обычных `R0-R3` local engineering slices. Это cost-saving режим без независимого CLI/model verifier и он является default, если shared defaults или project agents override задают `verifier.mode = internal_supervisor`. `external_cli` обязателен только для R4/R5, production-readiness, destructive/security/financial/broad operational actions, production or real-user-data Prisma/data migrations/backfills, broad ambiguous refactors или explicit human request. Local scratch DB migrations, fixture imports and bounded local-only backfills can close with internal Verify when execution evidence covers the plan gates.
|
|
61
61
|
27. Если external verifier/checker/browser tooling начинает тратить непропорционально много времени или блокируется окружением, Supervisor обязан остановить loop и вынести human decision: принять internal verify/evidence, запустить external escalation вручную или изменить scope.
|
|
62
|
+
28. Если deterministic Check preflight создал `precheck-remediation.md`, Supervisor не должен запускать повторный Check после точечной правки одного пункта. Сначала Planner/Executor должен закрыть весь checklist или явно отметить not-applicable с evidence/human decision в `plan.md`/`status.md`, затем допускается один fresh Check.
|
|
63
|
+
29. Перед повторным Check после deterministic precheck Supervisor обязан сверить, что `precheck-remediation.md` был использован как consolidated checklist: все listed gates отражены в plan/research/status, а не закрывались по одному через серию precheck loops.
|
|
62
64
|
|
|
63
65
|
## Hard Gate: Material Scope Expansion -> Brief Reset
|
|
64
66
|
|
package/prompts/verifier.md
CHANGED
|
@@ -44,6 +44,9 @@
|
|
|
44
44
|
20. Если `plan.md` содержит `## Source Sync / Provider Gate`, verifier должен проверить `Source Sync / Provider Evidence`: scope/window, idempotency, retries/pagination/rate limits, raw-record handling, partial failure recovery and coverage/parity evidence.
|
|
45
45
|
21. Если `task-manifest.json.loopDetector.requiresConsolidatedRemediation=true`, verifier не должен закрывать задачу, пока repeated return reasons не объединены в consolidated remediation и не покрыты execution evidence.
|
|
46
46
|
22. Если `llmInputPolicy.mode` не `strict` и отсутствующий full artifact реально нужен для честной оценки, verdict должен быть `context_insufficient`. Не используй `context_insufficient`, если execution evidence уже явно отсутствует или противоречит plan.
|
|
47
|
+
23. Если task содержит `precheck-remediation.md`, verifier должен проверить только применимые пункты, которые дошли до Execute. Не возвращай `return_to_execute` из-за старого precheck checklist, если план закрыл его до Human Gate и фактическая реализация покрывает acceptance.
|
|
48
|
+
24. Environment/tooling failures внешнего verifier/browser smoke не должны превращаться в бесконечный `return_to_execute` loop. Если implementation evidence достаточно, но внешний инструмент заблокирован окружением, используй `pass_with_notes` или `human_arbitration_required` согласно риску.
|
|
49
|
+
25. Если `plan.md` содержит golden set/eval/regression fixture sections, verifier должен проверить `Golden Set / Regression Evidence`: label cards follow schema, coverage matrix is filled, negative/edge cases are selected or documented missing, expected outputs/non-goals are inspectable, source refs/snippets exist and manual-vs-automated harness boundary is explicit.
|
|
47
50
|
|
|
48
51
|
## Контракт выхода
|
|
49
52
|
|
|
@@ -59,6 +62,7 @@
|
|
|
59
62
|
- Optimization Review Evidence, если `plan.md` содержит O2/O3 `Optimization Strategy`;
|
|
60
63
|
- Production Rollout Gate evidence, если он есть в `plan.md`;
|
|
61
64
|
- Source Sync / Provider Gate evidence, если он есть в `plan.md`;
|
|
65
|
+
- Golden Set / Regression Evidence, если plan создает golden set/eval fixtures/label cards;
|
|
62
66
|
- Loop Detector / consolidated remediation status;
|
|
63
67
|
- достаточность compact context или `context_insufficient`;
|
|
64
68
|
- findings;
|