@ryuenn3123/agentic-senior-core 2.0.16 → 2.0.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agent-context/prompts/review-code.md +2 -0
- package/.agent-context/review-checklists/pr-checklist.md +2 -0
- package/.agent-context/rules/api-docs.md +11 -1
- package/.agent-context/state/benchmark-reproducibility.json +3 -1
- package/.agent-context/state/benchmark-writer-judge-config.json +58 -0
- package/.agent-context/state/benchmark-writer-judge-matrix.json +462 -0
- package/.cursorrules +60 -3686
- package/.windsurfrules +60 -3686
- package/README.md +33 -1
- package/lib/cli/compiler.mjs +98 -35
- package/package.json +2 -1
- package/scripts/benchmark-writer-judge-matrix.mjs +383 -0
- package/scripts/validate.mjs +19 -3
package/README.md
CHANGED
|
@@ -261,6 +261,38 @@ For CI pipelines that only need stdout JSON:
|
|
|
261
261
|
node ./scripts/benchmark-evidence-bundle.mjs --stdout-only
|
|
262
262
|
```
|
|
263
263
|
|
|
264
|
+
### Writer-Judge Comparison Matrix (V2.5.1)
|
|
265
|
+
|
|
266
|
+
Generate a blind-review writer-judge matrix with independent lane configuration:
|
|
267
|
+
|
|
268
|
+
```bash
|
|
269
|
+
npm run benchmark:writer-judge
|
|
270
|
+
```
|
|
271
|
+
|
|
272
|
+
This command writes:
|
|
273
|
+
- `.agent-context/state/benchmark-writer-judge-matrix.json`
|
|
274
|
+
|
|
275
|
+
Writer and judge lane configuration is stored in:
|
|
276
|
+
- `.agent-context/state/benchmark-writer-judge-config.json`
|
|
277
|
+
|
|
278
|
+
For CI pipelines that only need stdout JSON:
|
|
279
|
+
|
|
280
|
+
```bash
|
|
281
|
+
node ./scripts/benchmark-writer-judge-matrix.mjs --stdout-only
|
|
282
|
+
```
|
|
283
|
+
|
|
284
|
+
### Benchmark Quickstart Path (V2.5)
|
|
285
|
+
|
|
286
|
+
For new users, run this minimal sequence first:
|
|
287
|
+
|
|
288
|
+
```bash
|
|
289
|
+
npm run benchmark:detection
|
|
290
|
+
npm run benchmark:writer-judge
|
|
291
|
+
npm run benchmark:bundle
|
|
292
|
+
```
|
|
293
|
+
|
|
294
|
+
This gives a fast baseline of accuracy, writer-judge comparison, and evidence packaging in one pass.
|
|
295
|
+
|
|
264
296
|
### Install and Setup Choices
|
|
265
297
|
|
|
266
298
|
The CLI now supports a smaller decision surface for first-time setup:
|
|
@@ -337,7 +369,7 @@ Our documentation has shifted into dedicated tracks to keep this README light:
|
|
|
337
369
|
|
|
338
370
|
- **Delivery Engine (CLI):** Interactive setup via GitHub source, bootstrap scripts, or `npx` after publish. Supported by a robust transactional installer with rollback protection.
|
|
339
371
|
- **Verified Skill Marketplace:** Distribute and validate plugins securely with automated 4-dimension Trust Scoring and Evidence Bundles constraint validation.
|
|
340
|
-
- **Dynamic Context Compiler:**
|
|
372
|
+
- **Dynamic Context Compiler:** Builds a compact modular bootstrap index that points to all required governance layers before execution.
|
|
341
373
|
- **Codebase Intelligence:** `.agent-context/state/` gives architecture/dependency boundaries so the agent understands high-risk areas.
|
|
342
374
|
- **Override System:** `.agent-override.md` allows controlled enterprise exceptions without forking core rules.
|
|
343
375
|
- **Automated Guardrails:** CI blueprints include LLM-as-a-Judge flow using `pr-checklist.md`.
|
package/lib/cli/compiler.mjs
CHANGED
|
@@ -18,7 +18,6 @@ import {
|
|
|
18
18
|
|
|
19
19
|
import {
|
|
20
20
|
inferSkillDomainNamesFromSelection,
|
|
21
|
-
buildSkillPackSection,
|
|
22
21
|
} from './skill-selector.mjs';
|
|
23
22
|
|
|
24
23
|
import {
|
|
@@ -102,51 +101,91 @@ export async function buildCompiledRulesContent({
|
|
|
102
101
|
const selectedRulesDirectoryPath = path.join(resolvedTargetDirectoryPath, '.agent-context', 'rules');
|
|
103
102
|
const selectedStacksDirectoryPath = path.join(resolvedTargetDirectoryPath, '.agent-context', 'stacks');
|
|
104
103
|
const selectedBlueprintsDirectoryPath = path.join(resolvedTargetDirectoryPath, '.agent-context', 'blueprints');
|
|
105
|
-
const selectedStateDirectoryPath = path.join(resolvedTargetDirectoryPath, '.agent-context', 'state');
|
|
106
|
-
const selectedReviewDirectoryPath = path.join(resolvedTargetDirectoryPath, '.agent-context', 'review-checklists');
|
|
107
104
|
const skillPlatformIndex = JSON.parse(await fs.readFile(SKILL_PLATFORM_INDEX_PATH, 'utf8'));
|
|
108
105
|
const selectedSkillDomainNames = inferSkillDomainNamesFromSelection(selectedStackFileName, selectedBlueprintFileName);
|
|
109
106
|
|
|
110
107
|
const universalRuleFileNames = await collectFileNames(selectedRulesDirectoryPath);
|
|
111
108
|
const contextBlocks = [];
|
|
112
109
|
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
110
|
+
/**
 * Picks the skill pack file name for a skill domain entry.
 *
 * Candidates are tried in order and the first truthy one wins:
 * the pack mapped to the selected tier, the pack mapped to the entry's own
 * default tier, and finally the entry's default pack file name.
 *
 * @param {object} skillDomainEntry - Entry that may expose `tierToPackFileNames`,
 *   `defaultTier`, and `defaultPackFileName`.
 * @param {string} selectedTierName - Tier looked up first.
 * @returns {*} The first truthy candidate, otherwise `defaultPackFileName` unchanged.
 */
function resolveSkillPackFileName(skillDomainEntry, selectedTierName) {
  const selectedTierPack = skillDomainEntry.tierToPackFileNames?.[selectedTierName];
  if (selectedTierPack) {
    return selectedTierPack;
  }

  const defaultTierPack = skillDomainEntry.tierToPackFileNames?.[skillDomainEntry.defaultTier];
  if (defaultTierPack) {
    return defaultTierPack;
  }

  return skillDomainEntry.defaultPackFileName;
}
|
|
116
115
|
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
116
|
+
/**
 * Extracts the text of the first markdown heading in a document.
 *
 * A heading is any line whose trimmed form starts with `#`. The leading `#`
 * markers are stripped from the trimmed line, so indented headings no longer
 * keep their markers. Heading lines with no text (a bare `#`) are skipped so
 * the summary is never an empty string.
 *
 * @param {string} content - Markdown source to scan.
 * @param {string} fallbackLabel - Value returned when no usable heading exists.
 * @returns {string} The heading text, or `fallbackLabel`.
 */
function firstMarkdownHeading(content, fallbackLabel) {
  for (const rawLine of content.split(/\r?\n/)) {
    const trimmedLine = rawLine.trim();
    if (!trimmedLine.startsWith('#')) {
      continue;
    }

    // Strip markers from the trimmed line; the untrimmed line would not match `^#+` when indented.
    const headingText = trimmedLine.replace(/^#+\s*/, '').trim();
    if (headingText !== '') {
      return headingText;
    }
  }

  return fallbackLabel;
}
|
|
121
127
|
|
|
128
|
+
contextBlocks.push(
|
|
129
|
+
[
|
|
130
|
+
'## BOOTSTRAP CHAIN (MANDATORY)',
|
|
131
|
+
'Load every layer before responding. Do not skip steps:',
|
|
132
|
+
'1. .agent-context/rules/',
|
|
133
|
+
'2. .agent-context/stacks/',
|
|
134
|
+
'3. .agent-context/blueprints/',
|
|
135
|
+
'4. .agent-context/skills/',
|
|
136
|
+
'5. .agent-context/prompts/',
|
|
137
|
+
'6. .agent-context/profiles/',
|
|
138
|
+
'7. .agent-context/state/',
|
|
139
|
+
`8. .agent-context/policies/${POLICY_FILE_NAME}`,
|
|
140
|
+
'',
|
|
141
|
+
'Primary entrypoint: .cursorrules',
|
|
142
|
+
'Mirror entrypoint: .windsurfrules',
|
|
143
|
+
'Canonical baseline: .instructions.md',
|
|
144
|
+
].join('\n')
|
|
145
|
+
);
|
|
146
|
+
|
|
147
|
+
contextBlocks.push(
|
|
148
|
+
[
|
|
149
|
+
'## LAYER 1: UNIVERSAL RULES (MANDATORY)',
|
|
150
|
+
'Read every file under .agent-context/rules/ before implementation:',
|
|
151
|
+
...universalRuleFileNames.map((universalRuleFileName, index) => `${index + 1}. .agent-context/rules/${universalRuleFileName}`),
|
|
152
|
+
'',
|
|
153
|
+
'Conflict resolution: prioritize data safety and API contract integrity first, then writing polish.',
|
|
154
|
+
].join('\n')
|
|
155
|
+
);
|
|
156
|
+
|
|
122
157
|
const stackFilePath = path.join(selectedStacksDirectoryPath, selectedStackFileName);
|
|
123
158
|
const stackContent = await fs.readFile(stackFilePath, 'utf8');
|
|
159
|
+
const stackSummary = firstMarkdownHeading(stackContent, selectedStackFileName);
|
|
124
160
|
contextBlocks.push(
|
|
125
|
-
|
|
161
|
+
[
|
|
162
|
+
`## LAYER 2: STACK PROFILE (${selectedStackFileName})`,
|
|
163
|
+
`Source: .agent-context/stacks/${selectedStackFileName}`,
|
|
164
|
+
`Summary: ${stackSummary}`,
|
|
165
|
+
'Load this stack profile to enforce language-specific conventions.',
|
|
166
|
+
].join('\n')
|
|
126
167
|
);
|
|
127
168
|
|
|
128
169
|
const blueprintFilePath = path.join(selectedBlueprintsDirectoryPath, selectedBlueprintFileName);
|
|
129
170
|
const blueprintContent = await fs.readFile(blueprintFilePath, 'utf8');
|
|
171
|
+
const blueprintSummary = firstMarkdownHeading(blueprintContent, selectedBlueprintFileName);
|
|
130
172
|
contextBlocks.push(
|
|
131
|
-
|
|
173
|
+
[
|
|
174
|
+
`## LAYER 3: BLUEPRINT PROFILE (${selectedBlueprintFileName})`,
|
|
175
|
+
`Source: .agent-context/blueprints/${selectedBlueprintFileName}`,
|
|
176
|
+
`Summary: ${blueprintSummary}`,
|
|
177
|
+
'Load this blueprint when scaffolding or changing architecture boundaries.',
|
|
178
|
+
].join('\n')
|
|
132
179
|
);
|
|
133
180
|
|
|
134
181
|
if (includeCiGuardrails) {
|
|
135
|
-
const githubCiBlueprintContent = await fs.readFile(path.join(selectedBlueprintsDirectoryPath, 'ci-github-actions.md'), 'utf8');
|
|
136
|
-
const gitlabCiBlueprintContent = await fs.readFile(path.join(selectedBlueprintsDirectoryPath, 'ci-gitlab.md'), 'utf8');
|
|
137
|
-
|
|
138
|
-
contextBlocks.push(
|
|
139
|
-
`## CI/CD GUARDRAILS: ci-github-actions.md\nSource: .agent-context/blueprints/ci-github-actions.md\n\n${githubCiBlueprintContent.trim()}`
|
|
140
|
-
);
|
|
141
|
-
contextBlocks.push(
|
|
142
|
-
`## CI/CD GUARDRAILS: ci-gitlab.md\nSource: .agent-context/blueprints/ci-gitlab.md\n\n${gitlabCiBlueprintContent.trim()}`
|
|
143
|
-
);
|
|
144
|
-
}
|
|
145
|
-
|
|
146
|
-
const tokenOptimizationState = await readTokenOptimizationState(resolvedTargetDirectoryPath);
|
|
147
|
-
if (tokenOptimizationState?.enabled) {
|
|
148
182
|
contextBlocks.push(
|
|
149
|
-
|
|
183
|
+
[
|
|
184
|
+
'## LAYER 3B: CI/CD GUARDRAILS',
|
|
185
|
+
'Load these CI blueprints when pipeline or release logic is touched:',
|
|
186
|
+
'1. .agent-context/blueprints/ci-github-actions.md',
|
|
187
|
+
'2. .agent-context/blueprints/ci-gitlab.md',
|
|
188
|
+
].join('\n')
|
|
150
189
|
);
|
|
151
190
|
}
|
|
152
191
|
|
|
@@ -156,21 +195,45 @@ export async function buildCompiledRulesContent({
|
|
|
156
195
|
continue;
|
|
157
196
|
}
|
|
158
197
|
|
|
159
|
-
|
|
160
|
-
|
|
198
|
+
const selectedTierName = skillPlatformIndex.defaultTier || 'advance';
|
|
199
|
+
const resolvedPackFileName = resolveSkillPackFileName(skillDomainEntry, selectedTierName);
|
|
161
200
|
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
201
|
+
contextBlocks.push(
|
|
202
|
+
[
|
|
203
|
+
`## SKILL PACK: ${skillDomainEntry.displayName}`,
|
|
204
|
+
`Source: .agent-context/skills/${resolvedPackFileName}`,
|
|
205
|
+
`Default tier: ${skillDomainEntry.defaultTier}`,
|
|
206
|
+
`Selected tier: ${selectedTierName}`,
|
|
207
|
+
`Evidence: ${skillDomainEntry.evidence}`,
|
|
208
|
+
`Purpose: ${skillDomainEntry.description}`,
|
|
209
|
+
'Load this skill pack and apply every Must-Have Check.',
|
|
210
|
+
].join('\n')
|
|
211
|
+
);
|
|
212
|
+
}
|
|
165
213
|
|
|
214
|
+
const tokenOptimizationState = await readTokenOptimizationState(resolvedTargetDirectoryPath);
|
|
215
|
+
if (tokenOptimizationState?.enabled) {
|
|
216
|
+
contextBlocks.push(
|
|
217
|
+
`## TOKEN OPTIMIZATION PROFILE\nSource: .agent-context/state/token-optimization.json\n\n${buildTokenOptimizationGuidanceBlock(tokenOptimizationState).trim()}`
|
|
218
|
+
);
|
|
219
|
+
}
|
|
166
220
|
contextBlocks.push(
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
221
|
+
[
|
|
222
|
+
'## LAYER 7: STATE AWARENESS (MANDATORY)',
|
|
223
|
+
'Load these files before touching critical paths:',
|
|
224
|
+
'1. .agent-context/state/architecture-map.md',
|
|
225
|
+
'2. .agent-context/state/dependency-map.md',
|
|
226
|
+
'Use these maps to prevent unsafe cross-module changes.',
|
|
227
|
+
].join('\n')
|
|
171
228
|
);
|
|
172
229
|
contextBlocks.push(
|
|
173
|
-
|
|
230
|
+
[
|
|
231
|
+
'## REVIEW CHECKLISTS (MANDATORY)',
|
|
232
|
+
'1. .agent-context/review-checklists/pr-checklist.md',
|
|
233
|
+
'2. .agent-context/review-checklists/security-audit.md (when security-sensitive)',
|
|
234
|
+
'3. .agent-context/review-checklists/performance-audit.md (when perf-critical)',
|
|
235
|
+
'Do not claim done before checklist pass.',
|
|
236
|
+
].join('\n')
|
|
174
237
|
);
|
|
175
238
|
|
|
176
239
|
return [
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ryuenn3123/agentic-senior-core",
|
|
3
|
-
"version": "2.0.16",
|
|
3
|
+
"version": "2.0.18",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "Force your AI Agent to code like a Staff Engineer, not a Junior.",
|
|
6
6
|
"bin": {
|
|
@@ -49,6 +49,7 @@
|
|
|
49
49
|
"benchmark:detection": "node ./scripts/detection-benchmark.mjs",
|
|
50
50
|
"benchmark:token": "node ./scripts/token-optimization-benchmark.mjs",
|
|
51
51
|
"benchmark:bundle": "node ./scripts/benchmark-evidence-bundle.mjs",
|
|
52
|
+
"benchmark:writer-judge": "node ./scripts/benchmark-writer-judge-matrix.mjs",
|
|
52
53
|
"benchmark:gate": "node ./scripts/benchmark-gate.mjs",
|
|
53
54
|
"benchmark:intelligence": "node ./scripts/benchmark-intelligence.mjs",
|
|
54
55
|
"report:quality-trend": "node ./scripts/quality-trend-report.mjs",
|
|
@@ -0,0 +1,383 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* benchmark-writer-judge-matrix.mjs
|
|
5
|
+
*
|
|
6
|
+
* V2.5.1 writer-judge architecture artifact.
|
|
7
|
+
* Builds side-by-side comparison matrix using independently configured
|
|
8
|
+
* writer and judge lanes with blind review tokens.
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import { existsSync, readFileSync } from 'node:fs';
|
|
12
|
+
import fs from 'node:fs/promises';
|
|
13
|
+
import { spawnSync } from 'node:child_process';
|
|
14
|
+
import { dirname, join, resolve } from 'node:path';
|
|
15
|
+
import { fileURLToPath } from 'node:url';
|
|
16
|
+
|
|
17
|
+
// Absolute location of this script; the repository root is its parent directory.
const SCRIPT_FILE_PATH = fileURLToPath(import.meta.url);
const SCRIPT_DIR = dirname(SCRIPT_FILE_PATH);
const REPOSITORY_ROOT = resolve(SCRIPT_DIR, '..');

// CLI flags passed after the script name; `--stdout-only` skips writing the report file.
const ARGUMENT_FLAGS = new Set(process.argv.slice(2));
const isStdoutOnlyMode = ARGUMENT_FLAGS.has('--stdout-only');

// All inputs and the output artifact live under `.agent-context/state/`.
const STATE_DIRECTORY_SEGMENTS = ['.agent-context', 'state'];
const CONFIG_PATH = join(REPOSITORY_ROOT, ...STATE_DIRECTORY_SEGMENTS, 'benchmark-writer-judge-config.json');
const REPRO_PROFILE_PATH = join(REPOSITORY_ROOT, ...STATE_DIRECTORY_SEGMENTS, 'benchmark-reproducibility.json');
const THRESHOLD_PATH = join(REPOSITORY_ROOT, ...STATE_DIRECTORY_SEGMENTS, 'benchmark-thresholds.json');
const OUTPUT_PATH = join(REPOSITORY_ROOT, ...STATE_DIRECTORY_SEGMENTS, 'benchmark-writer-judge-matrix.json');
|
|
27
|
+
|
|
28
|
+
/**
 * Best-effort JSON loader.
 *
 * @param {string} filePath - Path of the JSON file to read.
 * @returns {*} The parsed content, or `null` when the file does not exist,
 *   cannot be read, or does not contain valid JSON.
 */
function readJsonOrNull(filePath) {
  let parsedContent = null;

  if (existsSync(filePath)) {
    try {
      const rawContent = readFileSync(filePath, 'utf8');
      parsedContent = JSON.parse(rawContent);
    } catch {
      // Unreadable or malformed files are treated the same as missing ones.
      parsedContent = null;
    }
  }

  return parsedContent;
}
|
|
39
|
+
|
|
40
|
+
/**
 * Runs a repository script synchronously and parses its stdout as JSON.
 *
 * The child is launched with `process.execPath` so it uses the same Node.js
 * binary as this process even when `node` is not resolvable through PATH.
 * Launch failures reported by `spawnSync` (for example a missing binary or an
 * exceeded `maxBuffer`) are surfaced in `parseError` instead of being reported
 * as an empty payload.
 *
 * @param {string} scriptRelativePath - Script path relative to the repository root.
 * @param {string[]} [scriptArguments=[]] - Extra CLI arguments for the script.
 * @returns {{scriptPath: string, exitCode: number, parsedReport: *, parseError: (string|null), stderr: string}}
 *   Execution outcome. `parsedReport` is `null` whenever `parseError` is set.
 *   `exitCode` is 1 when the child did not exit with a numeric status.
 */
function runJsonScript(scriptRelativePath, scriptArguments = []) {
  const absoluteScriptPath = join(REPOSITORY_ROOT, scriptRelativePath);
  const commandResult = spawnSync(process.execPath, [absoluteScriptPath, ...scriptArguments], {
    cwd: REPOSITORY_ROOT,
    encoding: 'utf8',
    maxBuffer: 1024 * 1024 * 10,
  });

  const stdoutContent = (commandResult.stdout || '').trim();
  const stderrContent = (commandResult.stderr || '').trim();
  const exitCode = typeof commandResult.status === 'number' ? commandResult.status : 1;

  // Every outcome shares the same shape; only the report and error message vary.
  const buildOutcome = (parsedReport, parseError) => ({
    scriptPath: scriptRelativePath,
    exitCode,
    parsedReport,
    parseError,
    stderr: stderrContent,
  });

  if (commandResult.error) {
    const launchErrorMessage = commandResult.error instanceof Error
      ? commandResult.error.message
      : String(commandResult.error);
    return buildOutcome(null, `Script could not be executed: ${launchErrorMessage}`);
  }

  if (!stdoutContent) {
    return buildOutcome(null, 'Script produced no stdout JSON payload');
  }

  try {
    return buildOutcome(JSON.parse(stdoutContent), null);
  } catch (jsonParseError) {
    const parseErrorMessage = jsonParseError instanceof Error ? jsonParseError.message : String(jsonParseError);
    return buildOutcome(null, parseErrorMessage);
  }
}
|
|
81
|
+
|
|
82
|
+
/**
 * Maps a seed string to a stable integer offset in `[-maxMagnitude, maxMagnitude]`.
 *
 * The seed is folded into a 32-bit rolling hash (multiply by 31, add each
 * UTF-16 code unit), and the absolute hash is reduced modulo the number of
 * available offsets.
 *
 * @param {string} seed - Text that determines the offset.
 * @param {number} [maxMagnitude=3] - Largest absolute offset that can be returned.
 * @returns {number} The same offset for the same seed and magnitude.
 */
function deterministicOffset(seed, maxMagnitude = 3) {
  const rollingHash = seed
    .split('')
    .reduce(
      // Math.imul keeps the multiplication in 32-bit space, matching `(hash << 5) - hash`.
      (accumulatedHash, character) => (Math.imul(accumulatedHash, 31) + character.charCodeAt(0)) | 0,
      0
    );

  const bucketCount = (maxMagnitude * 2) + 1;
  const bucketIndex = Math.abs(rollingHash) % bucketCount;

  return bucketIndex - maxMagnitude;
}
|
|
93
|
+
|
|
94
|
+
/**
 * Restricts a number to a range.
 *
 * @param {number} value - Number to restrict.
 * @param {number} minimum - Lower bound, applied first.
 * @param {number} maximum - Upper bound, applied last.
 * @returns {number} `value` raised to `minimum` and then capped at `maximum`.
 */
function clamp(value, minimum, maximum) {
  const raisedToMinimum = Math.max(value, minimum);
  return Math.min(raisedToMinimum, maximum);
}
|
|
97
|
+
|
|
98
|
+
/**
 * Rounds a number to at most two decimal places.
 *
 * @param {number} value - Number to round.
 * @returns {number} The numeric value of `value.toFixed(2)`.
 */
function roundToTwo(value) {
  const fixedText = value.toFixed(2);
  return Number(fixedText);
}
|
|
101
|
+
|
|
102
|
+
/**
 * Builds the configuration used when no writer-judge config file can be loaded.
 *
 * A fresh object is created on every call, so callers may use the result
 * without affecting later calls.
 *
 * @returns {object} Default config with one writer model, one judge model,
 *   their score weights, and blind review enabled.
 */
function buildDefaultConfig() {
  const writerLane = {
    models: [{ id: 'writer-default', provider: 'local', profile: 'balanced' }],
    weights: {
      quality: 40,
      efficiency: 20,
      reliability: 25,
      freshness: 15,
    },
    scenarioMultipliers: {
      planning: 1,
      refactor: 1,
      security: 1,
      delivery: 1,
    },
  };

  const judgeLane = {
    models: [{ id: 'judge-default', provider: 'local', profile: 'audit' }],
    minimumCompositeScore: 75,
    leniencyWindow: 2,
    weights: {
      clarity: 35,
      correctness: 35,
      risk: 20,
      consistency: 10,
    },
  };

  return {
    version: '1.0.0',
    phase: 'v2.5.1',
    blindReviewMode: true,
    writerLane,
    judgeLane,
  };
}
|
|
135
|
+
|
|
136
|
+
/**
 * Resolves the scenarios to benchmark.
 *
 * @param {object|null|undefined} reproducibilityProfile - Profile that may carry
 *   a `scenarios` array.
 * @returns {{id: string, category: string}[]} The profile's scenarios reduced to
 *   `id` and `category` (missing values become `'unknown-scenario'` and
 *   `'planning'`), or the four built-in scenarios when the profile has no
 *   non-empty `scenarios` array.
 */
function loadScenarios(reproducibilityProfile) {
  const configuredScenarios = reproducibilityProfile?.scenarios;
  const hasConfiguredScenarios = Array.isArray(configuredScenarios) && configuredScenarios.length !== 0;

  if (hasConfiguredScenarios) {
    return configuredScenarios.map((configuredScenario) => {
      const id = configuredScenario.id || 'unknown-scenario';
      const category = configuredScenario.category || 'planning';
      return { id, category };
    });
  }

  // Built-in scenarios use the same value for id and category.
  return ['planning', 'refactor', 'security', 'delivery'].map((scenarioName) => ({
    id: scenarioName,
    category: scenarioName,
  }));
}
|
|
153
|
+
|
|
154
|
+
/**
 * Condenses the upstream benchmark reports into the signals used for scoring.
 *
 * Missing or non-numeric report fields fall back to a default, but a reported
 * value of 0 is honoured. This matters for `manualCorrectionRate`, where 0 is
 * the best possible result, and for `maximumManualCorrectionRate`, where 0 is
 * the strictest possible threshold. Falling back on any falsy value would turn
 * both into 1.
 *
 * @param {object|null} detectionBenchmarkReport - Supplies `top1Accuracy` and `manualCorrectionRate`.
 * @param {object|null} tokenBenchmarkReport - Supplies `summary.averageNativeSavingsPercent`.
 * @param {object|null} benchmarkGateReport - Supplies `passed` and `failureCount`.
 * @param {object|null} benchmarkIntelligenceReport - Supplies `failureCount` and `watchlist`.
 * @param {object|null} thresholdConfiguration - Supplies `minimumTop1Accuracy` and
 *   `maximumManualCorrectionRate`.
 * @returns {object} Numeric signals plus the `top1AccuracyMet` and
 *   `manualCorrectionMet` threshold checks.
 */
function buildBaseSignals(detectionBenchmarkReport, tokenBenchmarkReport, benchmarkGateReport, benchmarkIntelligenceReport, thresholdConfiguration) {
  // Accepts finite numbers and numeric strings; anything else yields the fallback.
  const toFiniteNumber = (candidate, fallback) => {
    const numericCandidate = typeof candidate === 'string' && candidate.trim() !== ''
      ? Number(candidate)
      : candidate;
    return typeof numericCandidate === 'number' && Number.isFinite(numericCandidate)
      ? numericCandidate
      : fallback;
  };

  const staleWatchlistCount = Array.isArray(benchmarkIntelligenceReport?.watchlist)
    ? benchmarkIntelligenceReport.watchlist.filter((watchlistEntry) => watchlistEntry?.stale === true).length
    : 0;

  const top1Accuracy = toFiniteNumber(detectionBenchmarkReport?.top1Accuracy, 0);
  // Default to the worst case (1) only when the rate is absent, never when it is 0.
  const manualCorrectionRate = toFiniteNumber(detectionBenchmarkReport?.manualCorrectionRate, 1);

  const minimumTop1Accuracy = toFiniteNumber(thresholdConfiguration?.minimumTop1Accuracy, 0);
  const maximumManualCorrectionRate = toFiniteNumber(thresholdConfiguration?.maximumManualCorrectionRate, 1);

  return {
    top1Accuracy,
    manualCorrectionRate,
    nativeSavingsPercent: toFiniteNumber(tokenBenchmarkReport?.summary?.averageNativeSavingsPercent, 0),
    benchmarkGatePassed: benchmarkGateReport?.passed === true,
    benchmarkGateFailureCount: toFiniteNumber(benchmarkGateReport?.failureCount, 0),
    intelligenceFailureCount: toFiniteNumber(benchmarkIntelligenceReport?.failureCount, 0),
    staleWatchlistCount,
    top1AccuracyMet: top1Accuracy >= minimumTop1Accuracy,
    manualCorrectionMet: manualCorrectionRate <= maximumManualCorrectionRate,
  };
}
|
|
174
|
+
|
|
175
|
+
/**
 * Scores one writer model on one scenario.
 *
 * Each of the four dimension scores is derived from the shared base signals
 * plus a deterministic per-model offset and is clamped to 0..100. The
 * composite is the weighted sum of the four scores divided by 100.
 *
 * @param {{id: string}} writerModel - Writer model; its `id` seeds the offsets.
 * @param {{id: string, category: string}} scenario - Scenario being scored.
 * @param {object} baseSignals - Signals produced by `buildBaseSignals`.
 * @param {object} writerWeights - Weights keyed by `quality`, `efficiency`,
 *   `reliability`, `freshness`; missing or falsy weights count as 0.
 * @param {object} scenarioMultipliers - Quality multipliers keyed by scenario
 *   category; missing or falsy multipliers count as 1.
 * @returns {object} Scenario id and category, the rounded score breakdown,
 *   the rounded composite score, and the two threshold flags copied from
 *   `baseSignals`.
 */
function buildWriterScenarioRun(writerModel, scenario, baseSignals, writerWeights, scenarioMultipliers) {
  const offsetFor = (seedSuffix, magnitude) => deterministicOffset(`${writerModel.id}:${seedSuffix}`, magnitude);
  const weightFor = (dimension) => Number(writerWeights[dimension] || 0);

  const scenarioMultiplier = Number(scenarioMultipliers?.[scenario.category] || 1);
  const rawQuality = (baseSignals.top1Accuracy * 100 * scenarioMultiplier) + offsetFor(scenario.id, 4);
  const qualityScore = clamp(rawQuality, 0, 100);

  const efficiencyScore = clamp(baseSignals.nativeSavingsPercent + offsetFor('efficiency', 3), 0, 100);

  // A passing gate starts from 100; a failing gate loses 20 points per failure.
  let reliabilityScore;
  if (baseSignals.benchmarkGatePassed) {
    reliabilityScore = clamp(100 + offsetFor('reliability', 2), 0, 100);
  } else {
    reliabilityScore = clamp(100 - (baseSignals.benchmarkGateFailureCount * 20), 0, 100);
  }

  const rawFreshness = 100
    - (baseSignals.intelligenceFailureCount * 15)
    - (baseSignals.staleWatchlistCount * 10)
    + offsetFor('freshness', 2);
  const freshnessScore = clamp(rawFreshness, 0, 100);

  const weightedQuality = qualityScore * weightFor('quality');
  const weightedEfficiency = efficiencyScore * weightFor('efficiency');
  const weightedReliability = reliabilityScore * weightFor('reliability');
  const weightedFreshness = freshnessScore * weightFor('freshness');
  const weightedCompositeScore = (weightedQuality + weightedEfficiency + weightedReliability + weightedFreshness) / 100;

  return {
    scenarioId: scenario.id,
    scenarioCategory: scenario.category,
    scoreBreakdown: {
      quality: roundToTwo(qualityScore),
      efficiency: roundToTwo(efficiencyScore),
      reliability: roundToTwo(reliabilityScore),
      freshness: roundToTwo(freshnessScore),
    },
    compositeScore: roundToTwo(weightedCompositeScore),
    top1AccuracyMet: baseSignals.top1AccuracyMet,
    manualCorrectionMet: baseSignals.manualCorrectionMet,
  };
}
|
|
211
|
+
|
|
212
|
+
/**
 * Produces one comparison-matrix row: a judge model's verdict on a writer's scenario run.
 *
 * The judge score is the writer's composite score shifted by a deterministic
 * offset and clamped to 0..100. The verdict is `'pass'` only when that score
 * reaches the minimum composite score minus the leniency window and both core
 * signal flags on the writer run are truthy.
 *
 * @param {object} writerScenarioRun - Result of `buildWriterScenarioRun`.
 * @param {string} writerToken - Anonymised writer label.
 * @param {{id: string}} judgeModel - Judge model performing the review.
 * @param {object} judgeLaneConfig - Supplies `minimumCompositeScore` (falsy
 *   values count as 75) and `leniencyWindow` (falsy values count as 0).
 * @param {boolean} blindReviewMode - When true, `writerModelId` is `null`.
 * @returns {object} Matrix row with scores, thresholds, and verdict.
 */
function evaluateJudgeForScenario(writerScenarioRun, writerToken, judgeModel, judgeLaneConfig, blindReviewMode) {
  const { scenarioId, scenarioCategory, compositeScore: writerCompositeScore } = writerScenarioRun;
  const judgeModelId = judgeModel.id;

  const offsetSeed = `${judgeModelId}:${scenarioId}:${writerToken}`;
  const judgeCompositeScore = clamp(writerCompositeScore + deterministicOffset(offsetSeed, 2), 0, 100);

  const scoreThreshold = Number(judgeLaneConfig.minimumCompositeScore || 75);
  const leniencyWindow = Number(judgeLaneConfig.leniencyWindow || 0);

  const meetsScoreThreshold = judgeCompositeScore >= (scoreThreshold - leniencyWindow);
  const meetsCoreSignals = writerScenarioRun.top1AccuracyMet && writerScenarioRun.manualCorrectionMet;

  let verdict = 'needs-improvement';
  if (meetsScoreThreshold && meetsCoreSignals) {
    verdict = 'pass';
  }

  return {
    scenarioId,
    scenarioCategory,
    writerToken,
    // NOTE(review): outside blind mode this records the token, not a model id — confirm that is intended.
    writerModelId: blindReviewMode ? null : writerToken,
    judgeModelId,
    blindPairId: `${scenarioId}:${writerToken}:${judgeModelId}`,
    writerCompositeScore,
    judgeCompositeScore: roundToTwo(judgeCompositeScore),
    scoreThreshold,
    leniencyWindow,
    meetsScoreThreshold,
    meetsCoreSignals,
    verdict,
  };
}
|
|
238
|
+
|
|
239
|
+
/**
 * Reduces script execution results to the fields kept in the final report.
 *
 * @param {object[]} executions - Results shaped like the return value of `runJsonScript`.
 * @returns {object[]} One summary per execution with `scriptPath`, `exitCode`,
 *   `parseError`, `reportName` (the report's `reportName`, else its `gateName`,
 *   else `null`), and `passed` (the report's `passed` when it is a boolean,
 *   else `null`).
 */
function summarizeExecutions(executions) {
  const toSummary = (execution) => {
    const report = execution.parsedReport;
    const hasBooleanVerdict = typeof report?.passed === 'boolean';

    return {
      scriptPath: execution.scriptPath,
      exitCode: execution.exitCode,
      parseError: execution.parseError,
      reportName: report?.reportName || report?.gateName || null,
      passed: hasBooleanVerdict ? report.passed : null,
    };
  };

  return executions.map((execution) => toSummary(execution));
}
|
|
250
|
+
|
|
251
|
+
/**
 * Scores every writer model against every scenario.
 *
 * Writers are labelled `W1`, `W2`, ... in the order they appear in `writerModels`.
 *
 * @param {object[]} writerModels - Writer models to score.
 * @param {object[]} scenarios - Scenarios each writer is scored on.
 * @param {object} baseSignals - Signals produced by `buildBaseSignals`.
 * @param {object} writerLaneConfig - May supply `weights` and
 *   `scenarioMultipliers`; each defaults to an empty object.
 * @returns {object[]} Per writer: its token, the model, the per-scenario runs,
 *   and their average composite score rounded to two decimals (0 when there
 *   are no scenarios).
 */
function buildWriterLaneRuns(writerModels, scenarios, baseSignals, writerLaneConfig) {
  return writerModels.map((writerModel, writerIndex) => {
    const weights = writerLaneConfig.weights || {};
    const multipliers = writerLaneConfig.scenarioMultipliers || {};

    const scenarioRuns = scenarios.map(
      (scenario) => buildWriterScenarioRun(writerModel, scenario, baseSignals, weights, multipliers)
    );

    let averageCompositeScore = 0;
    if (scenarioRuns.length !== 0) {
      let compositeTotal = 0;
      for (const scenarioRun of scenarioRuns) {
        compositeTotal += scenarioRun.compositeScore;
      }
      averageCompositeScore = roundToTwo(compositeTotal / scenarioRuns.length);
    }

    return {
      writerToken: `W${writerIndex + 1}`,
      writerModel,
      averageCompositeScore,
      scenarioRuns,
    };
  });
}
|
|
274
|
+
|
|
275
|
+
/**
 * Builds the comparison matrix by having every judge model review every writer scenario run.
 *
 * Rows are ordered by writer, then by scenario, then by judge.
 *
 * @param {object[]} writerLaneRuns - Output of `buildWriterLaneRuns`.
 * @param {object[]} judgeModels - Judge models that review each run.
 * @param {object} judgeLaneConfig - Judge thresholds passed through to the evaluator.
 * @param {boolean} blindReviewMode - Passed through to the evaluator.
 * @returns {object[]} One row per writer, scenario, and judge combination.
 */
function buildJudgeLaneRuns(writerLaneRuns, judgeModels, judgeLaneConfig, blindReviewMode) {
  return writerLaneRuns.flatMap(({ writerToken, scenarioRuns }) => (
    scenarioRuns.flatMap((writerScenarioRun) => (
      judgeModels.map((judgeModel) => (
        evaluateJudgeForScenario(writerScenarioRun, writerToken, judgeModel, judgeLaneConfig, blindReviewMode)
      ))
    ))
  ));
}
|
|
290
|
+
|
|
291
|
+
/**
 * Entry point: runs the upstream benchmark scripts, scores every writer model,
 * has every judge model review every run, and emits the writer-judge report.
 *
 * The report is always printed to stdout as JSON. It is also written to
 * `OUTPUT_PATH` unless `--stdout-only` was passed. The process exit code is 0
 * when every upstream script produced parseable JSON and 1 otherwise.
 *
 * The exit code is set through `process.exitCode` rather than `process.exit()`
 * so a large report written to a piped stdout is flushed before the process
 * ends.
 *
 * @returns {Promise<void>} Resolves once the report has been emitted; rejects
 *   if the report file cannot be written.
 */
async function runWriterJudgeMatrix() {
  // Build the defaults once; nothing below mutates them.
  const defaultConfig = buildDefaultConfig();
  const writerJudgeConfig = readJsonOrNull(CONFIG_PATH) || defaultConfig;
  const reproducibilityProfile = readJsonOrNull(REPRO_PROFILE_PATH) || { scenarios: [] };
  const thresholdConfiguration = readJsonOrNull(THRESHOLD_PATH) || {};

  const detectionBenchmarkExecution = runJsonScript('scripts/detection-benchmark.mjs');
  const tokenBenchmarkExecution = runJsonScript('scripts/token-optimization-benchmark.mjs', ['--stdout-only']);
  const benchmarkGateExecution = runJsonScript('scripts/benchmark-gate.mjs');
  const benchmarkIntelligenceExecution = runJsonScript('scripts/benchmark-intelligence.mjs');

  const executionSummaries = summarizeExecutions([
    detectionBenchmarkExecution,
    tokenBenchmarkExecution,
    benchmarkGateExecution,
    benchmarkIntelligenceExecution,
  ]);

  // Only unparseable output counts as a failure; a non-zero exit with valid JSON does not.
  const executionFailureCount = executionSummaries.filter((executionSummary) => executionSummary.parseError).length;
  const scenarios = loadScenarios(reproducibilityProfile);

  const baseSignals = buildBaseSignals(
    detectionBenchmarkExecution.parsedReport,
    tokenBenchmarkExecution.parsedReport,
    benchmarkGateExecution.parsedReport,
    benchmarkIntelligenceExecution.parsedReport,
    thresholdConfiguration
  );

  const writerModels = Array.isArray(writerJudgeConfig?.writerLane?.models) && writerJudgeConfig.writerLane.models.length > 0
    ? writerJudgeConfig.writerLane.models
    : defaultConfig.writerLane.models;

  const judgeModels = Array.isArray(writerJudgeConfig?.judgeLane?.models) && writerJudgeConfig.judgeLane.models.length > 0
    ? writerJudgeConfig.judgeLane.models
    : defaultConfig.judgeLane.models;

  // Blind review stays on unless the config explicitly sets it to false.
  const blindReviewMode = writerJudgeConfig.blindReviewMode !== false;

  const writerLaneRuns = buildWriterLaneRuns(
    writerModels,
    scenarios,
    baseSignals,
    writerJudgeConfig.writerLane || defaultConfig.writerLane
  );

  const comparisonMatrix = buildJudgeLaneRuns(
    writerLaneRuns,
    judgeModels,
    writerJudgeConfig.judgeLane || defaultConfig.judgeLane,
    blindReviewMode
  );

  const passCount = comparisonMatrix.filter((matrixRow) => matrixRow.verdict === 'pass').length;
  const passRatePercent = comparisonMatrix.length === 0
    ? 0
    : roundToTwo((passCount / comparisonMatrix.length) * 100);

  const writerJudgeReport = {
    generatedAt: new Date().toISOString(),
    reportName: 'benchmark-writer-judge-matrix',
    phase: 'v2.5.1',
    passed: executionFailureCount === 0,
    failureCount: executionFailureCount,
    methodology: {
      blindReviewMode,
      writerLaneModelCount: writerModels.length,
      judgeLaneModelCount: judgeModels.length,
      scenarioCount: scenarios.length,
      writerWeights: writerJudgeConfig?.writerLane?.weights || null,
      judgeWeights: writerJudgeConfig?.judgeLane?.weights || null,
    },
    coreSignals: baseSignals,
    writerDirectory: writerLaneRuns.map((writerLaneRun) => ({
      writerToken: writerLaneRun.writerToken,
      writerModel: writerLaneRun.writerModel,
      averageCompositeScore: writerLaneRun.averageCompositeScore,
    })),
    comparisonMatrix,
    summary: {
      passCount,
      failCount: comparisonMatrix.length - passCount,
      passRatePercent,
    },
    executions: executionSummaries,
  };

  const serializedReport = JSON.stringify(writerJudgeReport, null, 2);

  if (!isStdoutOnlyMode) {
    await fs.writeFile(OUTPUT_PATH, `${serializedReport}\n`, 'utf8');
  }

  console.log(serializedReport);
  process.exitCode = writerJudgeReport.passed ? 0 : 1;
}

// Report unexpected failures (for example an unwritable output file) and exit non-zero.
runWriterJudgeMatrix().catch((unexpectedError) => {
  console.error(unexpectedError);
  process.exitCode = 1;
});
|
package/scripts/validate.mjs
CHANGED
|
@@ -55,15 +55,29 @@ const FORMAL_ARTIFACT_PATHS = [
|
|
|
55
55
|
const REQUIRED_HUMAN_WRITING_SNIPPETS = [
|
|
56
56
|
{
|
|
57
57
|
path: '.agent-context/rules/api-docs.md',
|
|
58
|
-
snippets: [
|
|
58
|
+
snippets: [
|
|
59
|
+
'## Human Writing Standard (Mandatory)',
|
|
60
|
+
'This applies to documentation, release notes, onboarding text, review summaries, and agent-facing explanations.',
|
|
61
|
+
'Style baseline findings are advisory by default and must not block endpoint-change commits that already include accurate docs/spec updates.',
|
|
62
|
+
'No emoji in formal artifacts.',
|
|
63
|
+
],
|
|
59
64
|
},
|
|
60
65
|
{
|
|
61
66
|
path: '.agent-context/review-checklists/pr-checklist.md',
|
|
62
|
-
snippets: [
|
|
67
|
+
snippets: [
|
|
68
|
+
'Scope applied: This applies to documentation, release notes, onboarding text, review summaries, and agent-facing explanations',
|
|
69
|
+
'Style scope review is advisory and does not block merge when API docs are synced in the same commit and contract details are correct',
|
|
70
|
+
'No emoji in formal documentation or review summaries',
|
|
71
|
+
'Documentation uses plain English and avoids AI cliches',
|
|
72
|
+
],
|
|
63
73
|
},
|
|
64
74
|
{
|
|
65
75
|
path: 'docs/deep_analysis_and_roadmap_backlog.md',
|
|
66
|
-
snippets: [
|
|
76
|
+
snippets: [
|
|
77
|
+
'## Part 6: Documentation and Explanation Standards (Mandatory)',
|
|
78
|
+
'This applies to documentation, release notes, onboarding text, review summaries, and agent-facing explanations.',
|
|
79
|
+
'No emoji in formal artifacts. This is mandatory.',
|
|
80
|
+
],
|
|
67
81
|
},
|
|
68
82
|
];
|
|
69
83
|
|
|
@@ -149,6 +163,7 @@ async function validateRequiredFiles() {
|
|
|
149
163
|
'scripts/llm-judge.mjs',
|
|
150
164
|
'scripts/detection-benchmark.mjs',
|
|
151
165
|
'scripts/benchmark-evidence-bundle.mjs',
|
|
166
|
+
'scripts/benchmark-writer-judge-matrix.mjs',
|
|
152
167
|
'scripts/benchmark-gate.mjs',
|
|
153
168
|
'scripts/benchmark-intelligence.mjs',
|
|
154
169
|
'scripts/governance-weekly-report.mjs',
|
|
@@ -175,6 +190,7 @@ async function validateRequiredFiles() {
|
|
|
175
190
|
'docs/v1.8-operations-playbook.md',
|
|
176
191
|
'docs/v2-upgrade-playbook.md',
|
|
177
192
|
'.agent-context/state/benchmark-reproducibility.json',
|
|
193
|
+
'.agent-context/state/benchmark-writer-judge-config.json',
|
|
178
194
|
'.agent-context/state/benchmark-watchlist.json',
|
|
179
195
|
'.agent-context/state/skill-platform.json',
|
|
180
196
|
'.agent-context/skills/index.json',
|