@ryuenn3123/agentic-senior-core 2.0.16 → 2.0.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -261,6 +261,38 @@ For CI pipelines that only need stdout JSON:
261
261
  node ./scripts/benchmark-evidence-bundle.mjs --stdout-only
262
262
  ```
263
263
 
264
+ ### Writer-Judge Comparison Matrix (V2.5.1)
265
+
266
+ Generate a blind-review writer-judge matrix with independent lane configuration:
267
+
268
+ ```bash
269
+ npm run benchmark:writer-judge
270
+ ```
271
+
272
+ This command writes:
273
+ - `.agent-context/state/benchmark-writer-judge-matrix.json`
274
+
275
+ Writer and judge lane configuration is stored in:
276
+ - `.agent-context/state/benchmark-writer-judge-config.json`
277
+
278
+ For CI pipelines that only need stdout JSON:
279
+
280
+ ```bash
281
+ node ./scripts/benchmark-writer-judge-matrix.mjs --stdout-only
282
+ ```
283
+
284
+ ### Benchmark Quickstart Path (V2.5)
285
+
286
+ For new users, run this minimal sequence first:
287
+
288
+ ```bash
289
+ npm run benchmark:detection
290
+ npm run benchmark:writer-judge
291
+ npm run benchmark:bundle
292
+ ```
293
+
294
+ This gives a fast baseline of accuracy, writer-judge comparison, and evidence packaging in one pass.
295
+
264
296
  ### Install and Setup Choices
265
297
 
266
298
  The CLI now supports a smaller decision surface for first-time setup:
@@ -337,7 +369,7 @@ Our documentation has shifted into dedicated tracks to keep this README light:
337
369
 
338
370
  - **Delivery Engine (CLI):** Interactive setup via GitHub source, bootstrap scripts, or `npx` after publish. Supported by a robust transactional installer with rollback protection.
339
371
  - **Verified Skill Marketplace:** Distribute and validate plugins securely with automated 4-dimension Trust Scoring and Evidence Bundles constraint validation.
340
- - **Dynamic Context Compiler:** Merges universal rules + selected stack + selected blueprint + optional CI guardrails into one dense, indexed rule file.
372
+ - **Dynamic Context Compiler:** Builds a compact modular bootstrap index that points to all required governance layers before execution.
341
373
  - **Codebase Intelligence:** `.agent-context/state/` gives architecture/dependency boundaries so the agent understands high-risk areas.
342
374
  - **Override System:** `.agent-override.md` allows controlled enterprise exceptions without forking core rules.
343
375
  - **Automated Guardrails:** CI blueprints include LLM-as-a-Judge flow using `pr-checklist.md`.
@@ -18,7 +18,6 @@ import {
18
18
 
19
19
  import {
20
20
  inferSkillDomainNamesFromSelection,
21
- buildSkillPackSection,
22
21
  } from './skill-selector.mjs';
23
22
 
24
23
  import {
@@ -102,51 +101,91 @@ export async function buildCompiledRulesContent({
102
101
  const selectedRulesDirectoryPath = path.join(resolvedTargetDirectoryPath, '.agent-context', 'rules');
103
102
  const selectedStacksDirectoryPath = path.join(resolvedTargetDirectoryPath, '.agent-context', 'stacks');
104
103
  const selectedBlueprintsDirectoryPath = path.join(resolvedTargetDirectoryPath, '.agent-context', 'blueprints');
105
- const selectedStateDirectoryPath = path.join(resolvedTargetDirectoryPath, '.agent-context', 'state');
106
- const selectedReviewDirectoryPath = path.join(resolvedTargetDirectoryPath, '.agent-context', 'review-checklists');
107
104
  const skillPlatformIndex = JSON.parse(await fs.readFile(SKILL_PLATFORM_INDEX_PATH, 'utf8'));
108
105
  const selectedSkillDomainNames = inferSkillDomainNamesFromSelection(selectedStackFileName, selectedBlueprintFileName);
109
106
 
110
107
  const universalRuleFileNames = await collectFileNames(selectedRulesDirectoryPath);
111
108
  const contextBlocks = [];
112
109
 
113
- for (const universalRuleFileName of universalRuleFileNames) {
114
- const universalRuleFilePath = path.join(selectedRulesDirectoryPath, universalRuleFileName);
115
- const universalRuleContent = await fs.readFile(universalRuleFilePath, 'utf8');
110
/**
 * Picks the skill pack file name for a skill domain entry.
 *
 * Lookup order: the pack mapped to the requested tier, then the pack mapped
 * to the entry's own default tier, then the entry's default pack file name.
 *
 * @param {object} skillDomainEntry - Skill domain record from the skill platform index.
 * @param {string} selectedTierName - Tier to look up first.
 * @returns {string|undefined} The first truthy pack file name found in that order.
 */
function resolveSkillPackFileName(skillDomainEntry, selectedTierName) {
  const packFileNamesByTier = skillDomainEntry.tierToPackFileNames;

  const packForSelectedTier = packFileNamesByTier?.[selectedTierName];
  if (packForSelectedTier) {
    return packForSelectedTier;
  }

  const packForDefaultTier = packFileNamesByTier?.[skillDomainEntry.defaultTier];
  if (packForDefaultTier) {
    return packForDefaultTier;
  }

  return skillDomainEntry.defaultPackFileName;
}
116
115
 
117
- contextBlocks.push(
118
- `## UNIVERSAL RULE: ${universalRuleFileName}\nSource: .agent-context/rules/${universalRuleFileName}\n\n${universalRuleContent.trim()}`
119
- );
116
/**
 * Returns the text of the first markdown heading found in `content`.
 *
 * Lines are trimmed before matching AND before the `#` markers are stripped,
 * so an indented heading such as "  ## Title" yields "Title" instead of
 * leaking the leading "##" into the summary.
 *
 * @param {string} content - Markdown document text.
 * @param {string} fallbackLabel - Value returned when no usable heading exists.
 * @returns {string} Heading text without `#` markers, or `fallbackLabel` when
 *   there is no heading line or the heading has no text (for example a bare "#").
 */
function firstMarkdownHeading(content, fallbackLabel) {
  const headingLine = content
    .split(/\r?\n/)
    .map((line) => line.trim())
    .find((line) => line.startsWith('#'));

  if (!headingLine) {
    return fallbackLabel;
  }

  const headingText = headingLine.replace(/^#+\s*/, '').trim();

  // An empty heading carries no summary information; prefer the fallback label.
  return headingText || fallbackLabel;
}
121
127
 
128
+ contextBlocks.push(
129
+ [
130
+ '## BOOTSTRAP CHAIN (MANDATORY)',
131
+ 'Load every layer before responding. Do not skip steps:',
132
+ '1. .agent-context/rules/',
133
+ '2. .agent-context/stacks/',
134
+ '3. .agent-context/blueprints/',
135
+ '4. .agent-context/skills/',
136
+ '5. .agent-context/prompts/',
137
+ '6. .agent-context/profiles/',
138
+ '7. .agent-context/state/',
139
+ `8. .agent-context/policies/${POLICY_FILE_NAME}`,
140
+ '',
141
+ 'Primary entrypoint: .cursorrules',
142
+ 'Mirror entrypoint: .windsurfrules',
143
+ 'Canonical baseline: .instructions.md',
144
+ ].join('\n')
145
+ );
146
+
147
+ contextBlocks.push(
148
+ [
149
+ '## LAYER 1: UNIVERSAL RULES (MANDATORY)',
150
+ 'Read every file under .agent-context/rules/ before implementation:',
151
+ ...universalRuleFileNames.map((universalRuleFileName, index) => `${index + 1}. .agent-context/rules/${universalRuleFileName}`),
152
+ '',
153
+ 'Conflict resolution: prioritize data safety and API contract integrity first, then writing polish.',
154
+ ].join('\n')
155
+ );
156
+
122
157
  const stackFilePath = path.join(selectedStacksDirectoryPath, selectedStackFileName);
123
158
  const stackContent = await fs.readFile(stackFilePath, 'utf8');
159
+ const stackSummary = firstMarkdownHeading(stackContent, selectedStackFileName);
124
160
  contextBlocks.push(
125
- `## STACK PROFILE: ${selectedStackFileName}\nSource: .agent-context/stacks/${selectedStackFileName}\n\n${stackContent.trim()}`
161
+ [
162
+ `## LAYER 2: STACK PROFILE (${selectedStackFileName})`,
163
+ `Source: .agent-context/stacks/${selectedStackFileName}`,
164
+ `Summary: ${stackSummary}`,
165
+ 'Load this stack profile to enforce language-specific conventions.',
166
+ ].join('\n')
126
167
  );
127
168
 
128
169
  const blueprintFilePath = path.join(selectedBlueprintsDirectoryPath, selectedBlueprintFileName);
129
170
  const blueprintContent = await fs.readFile(blueprintFilePath, 'utf8');
171
+ const blueprintSummary = firstMarkdownHeading(blueprintContent, selectedBlueprintFileName);
130
172
  contextBlocks.push(
131
- `## BLUEPRINT PROFILE: ${selectedBlueprintFileName}\nSource: .agent-context/blueprints/${selectedBlueprintFileName}\n\n${blueprintContent.trim()}`
173
+ [
174
+ `## LAYER 3: BLUEPRINT PROFILE (${selectedBlueprintFileName})`,
175
+ `Source: .agent-context/blueprints/${selectedBlueprintFileName}`,
176
+ `Summary: ${blueprintSummary}`,
177
+ 'Load this blueprint when scaffolding or changing architecture boundaries.',
178
+ ].join('\n')
132
179
  );
133
180
 
134
181
  if (includeCiGuardrails) {
135
- const githubCiBlueprintContent = await fs.readFile(path.join(selectedBlueprintsDirectoryPath, 'ci-github-actions.md'), 'utf8');
136
- const gitlabCiBlueprintContent = await fs.readFile(path.join(selectedBlueprintsDirectoryPath, 'ci-gitlab.md'), 'utf8');
137
-
138
- contextBlocks.push(
139
- `## CI/CD GUARDRAILS: ci-github-actions.md\nSource: .agent-context/blueprints/ci-github-actions.md\n\n${githubCiBlueprintContent.trim()}`
140
- );
141
- contextBlocks.push(
142
- `## CI/CD GUARDRAILS: ci-gitlab.md\nSource: .agent-context/blueprints/ci-gitlab.md\n\n${gitlabCiBlueprintContent.trim()}`
143
- );
144
- }
145
-
146
- const tokenOptimizationState = await readTokenOptimizationState(resolvedTargetDirectoryPath);
147
- if (tokenOptimizationState?.enabled) {
148
182
  contextBlocks.push(
149
- `## TOKEN OPTIMIZATION PROFILE\nSource: .agent-context/state/token-optimization.json\n\n${buildTokenOptimizationGuidanceBlock(tokenOptimizationState).trim()}`
183
+ [
184
+ '## LAYER 3B: CI/CD GUARDRAILS',
185
+ 'Load these CI blueprints when pipeline or release logic is touched:',
186
+ '1. .agent-context/blueprints/ci-github-actions.md',
187
+ '2. .agent-context/blueprints/ci-gitlab.md',
188
+ ].join('\n')
150
189
  );
151
190
  }
152
191
 
@@ -156,21 +195,45 @@ export async function buildCompiledRulesContent({
156
195
  continue;
157
196
  }
158
197
 
159
- contextBlocks.push(await buildSkillPackSection(skillDomainEntry, skillPlatformIndex.defaultTier || 'advance'));
160
- }
198
+ const selectedTierName = skillPlatformIndex.defaultTier || 'advance';
199
+ const resolvedPackFileName = resolveSkillPackFileName(skillDomainEntry, selectedTierName);
161
200
 
162
- const architectureMapContent = await fs.readFile(path.join(selectedStateDirectoryPath, 'architecture-map.md'), 'utf8');
163
- const dependencyMapContent = await fs.readFile(path.join(selectedStateDirectoryPath, 'dependency-map.md'), 'utf8');
164
- const prChecklistContent = await fs.readFile(path.join(selectedReviewDirectoryPath, 'pr-checklist.md'), 'utf8');
201
+ contextBlocks.push(
202
+ [
203
+ `## SKILL PACK: ${skillDomainEntry.displayName}`,
204
+ `Source: .agent-context/skills/${resolvedPackFileName}`,
205
+ `Default tier: ${skillDomainEntry.defaultTier}`,
206
+ `Selected tier: ${selectedTierName}`,
207
+ `Evidence: ${skillDomainEntry.evidence}`,
208
+ `Purpose: ${skillDomainEntry.description}`,
209
+ 'Load this skill pack and apply every Must-Have Check.',
210
+ ].join('\n')
211
+ );
212
+ }
165
213
 
214
+ const tokenOptimizationState = await readTokenOptimizationState(resolvedTargetDirectoryPath);
215
+ if (tokenOptimizationState?.enabled) {
216
+ contextBlocks.push(
217
+ `## TOKEN OPTIMIZATION PROFILE\nSource: .agent-context/state/token-optimization.json\n\n${buildTokenOptimizationGuidanceBlock(tokenOptimizationState).trim()}`
218
+ );
219
+ }
166
220
  contextBlocks.push(
167
- `## STATE MAP: architecture-map.md\nSource: .agent-context/state/architecture-map.md\n\n${architectureMapContent.trim()}`
168
- );
169
- contextBlocks.push(
170
- `## STATE MAP: dependency-map.md\nSource: .agent-context/state/dependency-map.md\n\n${dependencyMapContent.trim()}`
221
+ [
222
+ '## LAYER 7: STATE AWARENESS (MANDATORY)',
223
+ 'Load these files before touching critical paths:',
224
+ '1. .agent-context/state/architecture-map.md',
225
+ '2. .agent-context/state/dependency-map.md',
226
+ 'Use these maps to prevent unsafe cross-module changes.',
227
+ ].join('\n')
171
228
  );
172
229
  contextBlocks.push(
173
- `## REVIEW CHECKLIST: pr-checklist.md\nSource: .agent-context/review-checklists/pr-checklist.md\n\n${prChecklistContent.trim()}`
230
+ [
231
+ '## REVIEW CHECKLISTS (MANDATORY)',
232
+ '1. .agent-context/review-checklists/pr-checklist.md',
233
+ '2. .agent-context/review-checklists/security-audit.md (when security-sensitive)',
234
+ '3. .agent-context/review-checklists/performance-audit.md (when perf-critical)',
235
+ 'Do not claim done before checklist pass.',
236
+ ].join('\n')
174
237
  );
175
238
 
176
239
  return [
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ryuenn3123/agentic-senior-core",
3
- "version": "2.0.16",
3
+ "version": "2.0.18",
4
4
  "type": "module",
5
5
  "description": "Force your AI Agent to code like a Staff Engineer, not a Junior.",
6
6
  "bin": {
@@ -49,6 +49,7 @@
49
49
  "benchmark:detection": "node ./scripts/detection-benchmark.mjs",
50
50
  "benchmark:token": "node ./scripts/token-optimization-benchmark.mjs",
51
51
  "benchmark:bundle": "node ./scripts/benchmark-evidence-bundle.mjs",
52
+ "benchmark:writer-judge": "node ./scripts/benchmark-writer-judge-matrix.mjs",
52
53
  "benchmark:gate": "node ./scripts/benchmark-gate.mjs",
53
54
  "benchmark:intelligence": "node ./scripts/benchmark-intelligence.mjs",
54
55
  "report:quality-trend": "node ./scripts/quality-trend-report.mjs",
@@ -0,0 +1,383 @@
1
+ #!/usr/bin/env node
2
+
3
+ /**
4
+ * benchmark-writer-judge-matrix.mjs
5
+ *
6
+ * V2.5.1 writer-judge architecture artifact.
7
+ * Builds side-by-side comparison matrix using independently configured
8
+ * writer and judge lanes with blind review tokens.
9
+ */
10
+
11
+ import { existsSync, readFileSync } from 'node:fs';
12
+ import fs from 'node:fs/promises';
13
+ import { spawnSync } from 'node:child_process';
14
+ import { dirname, join, resolve } from 'node:path';
15
+ import { fileURLToPath } from 'node:url';
16
+
17
+ const SCRIPT_FILE_PATH = fileURLToPath(import.meta.url);
18
+ const SCRIPT_DIR = dirname(SCRIPT_FILE_PATH);
19
+ const REPOSITORY_ROOT = resolve(SCRIPT_DIR, '..');
20
+ const ARGUMENT_FLAGS = new Set(process.argv.slice(2));
21
+ const isStdoutOnlyMode = ARGUMENT_FLAGS.has('--stdout-only');
22
+
23
+ const CONFIG_PATH = join(REPOSITORY_ROOT, '.agent-context', 'state', 'benchmark-writer-judge-config.json');
24
+ const REPRO_PROFILE_PATH = join(REPOSITORY_ROOT, '.agent-context', 'state', 'benchmark-reproducibility.json');
25
+ const THRESHOLD_PATH = join(REPOSITORY_ROOT, '.agent-context', 'state', 'benchmark-thresholds.json');
26
+ const OUTPUT_PATH = join(REPOSITORY_ROOT, '.agent-context', 'state', 'benchmark-writer-judge-matrix.json');
27
+
28
/**
 * Reads and parses a JSON file, returning `null` when it is unavailable.
 *
 * A missing file is an expected condition and returns `null` quietly.
 * A file that exists but cannot be read or parsed also returns `null`, but a
 * diagnostic is written to stderr so a corrupted configuration is not
 * silently replaced by defaults. stdout is left untouched because callers
 * reserve it for the JSON report payload.
 *
 * @param {string} filePath - Absolute path of the JSON file.
 * @returns {*} The parsed JSON value, or `null` when missing or unreadable.
 */
function readJsonOrNull(filePath) {
  if (!existsSync(filePath)) {
    return null;
  }

  try {
    return JSON.parse(readFileSync(filePath, 'utf8'));
  } catch (readError) {
    const reason = readError instanceof Error ? readError.message : String(readError);
    console.error(`[benchmark-writer-judge-matrix] Ignoring unreadable JSON file ${filePath}: ${reason}`);
    return null;
  }
}
39
+
40
/**
 * Runs a repository script synchronously and parses its stdout as JSON.
 *
 * The child is started with `process.execPath` so it uses the same Node.js
 * binary as this script instead of whatever `node` resolves to on PATH.
 * Failures to launch or complete the child (reported by `spawnSync` through
 * its `error` property, e.g. a missing binary or an exceeded `maxBuffer`)
 * are surfaced in `parseError` rather than being misreported as
 * "no stdout" or as a confusing JSON syntax error.
 *
 * @param {string} scriptRelativePath - Script path relative to the repository root.
 * @param {string[]} [scriptArguments=[]] - Extra command-line arguments for the script.
 * @returns {{scriptPath: string, exitCode: number, parsedReport: *, parseError: (string|null), stderr: string}}
 *   Execution record; `parsedReport` is `null` whenever `parseError` is set.
 */
function runJsonScript(scriptRelativePath, scriptArguments = []) {
  const absoluteScriptPath = join(REPOSITORY_ROOT, scriptRelativePath);
  const commandResult = spawnSync(process.execPath, [absoluteScriptPath, ...scriptArguments], {
    cwd: REPOSITORY_ROOT,
    encoding: 'utf8',
    maxBuffer: 1024 * 1024 * 10,
  });

  const stdoutContent = (commandResult.stdout || '').trim();
  const stderrContent = (commandResult.stderr || '').trim();
  // A null status means the child was killed by a signal or never started.
  const exitCode = typeof commandResult.status === 'number' ? commandResult.status : 1;

  const buildExecutionRecord = (parsedReport, parseError) => ({
    scriptPath: scriptRelativePath,
    exitCode,
    parsedReport,
    parseError,
    stderr: stderrContent,
  });

  if (commandResult.error) {
    return buildExecutionRecord(null, `Script could not be executed: ${commandResult.error.message}`);
  }

  if (!stdoutContent) {
    return buildExecutionRecord(null, 'Script produced no stdout JSON payload');
  }

  try {
    return buildExecutionRecord(JSON.parse(stdoutContent), null);
  } catch (jsonParseError) {
    const parseErrorMessage = jsonParseError instanceof Error ? jsonParseError.message : String(jsonParseError);
    return buildExecutionRecord(null, parseErrorMessage);
  }
}
81
+
82
/**
 * Derives a repeatable integer offset in [-maxMagnitude, +maxMagnitude] from a seed string.
 *
 * The seed is folded into a 32-bit integer hash (hash * 31 + char code,
 * truncated with `| 0` after every character) and the absolute hash is then
 * mapped onto the offset range.
 *
 * @param {string} seed - Text that determines the offset.
 * @param {number} [maxMagnitude=3] - Largest absolute offset to return.
 * @returns {number} The same offset for the same seed and magnitude.
 */
function deterministicOffset(seed, maxMagnitude = 3) {
  const characterCodes = Array.from({ length: seed.length }, (_, position) => seed.charCodeAt(position));
  const seedHash = characterCodes.reduce(
    (runningHash, characterCode) => (((runningHash << 5) - runningHash) + characterCode) | 0,
    0
  );

  const bucketCount = (maxMagnitude * 2) + 1;
  return (Math.abs(seedHash) % bucketCount) - maxMagnitude;
}
93
+
94
/**
 * Restricts `value` to the inclusive range [minimum, maximum].
 *
 * @param {number} value - Number to restrict.
 * @param {number} minimum - Lower bound.
 * @param {number} maximum - Upper bound.
 * @returns {number} `value` raised to `minimum` and then lowered to `maximum`.
 */
function clamp(value, minimum, maximum) {
  const raisedToMinimum = Math.max(value, minimum);
  const loweredToMaximum = Math.min(raisedToMinimum, maximum);
  return loweredToMaximum;
}
97
+
98
/**
 * Rounds a number to two decimal places.
 *
 * @param {number} value - Number to round.
 * @returns {number} `value` formatted with `toFixed(2)` and converted back to a number.
 */
function roundToTwo(value) {
  const twoDecimalText = value.toFixed(2);
  return Number(twoDecimalText);
}
101
+
102
/**
 * Builds the built-in writer/judge configuration used when no config file is available.
 *
 * A fresh object graph is created on every call, so callers may mutate the
 * result without affecting later calls.
 *
 * @returns {object} Default configuration with one writer model, one judge
 *   model, scoring weights, scenario multipliers and judge thresholds.
 */
function buildDefaultConfig() {
  const writerLane = {
    models: [{ id: 'writer-default', provider: 'local', profile: 'balanced' }],
    weights: {
      quality: 40,
      efficiency: 20,
      reliability: 25,
      freshness: 15,
    },
    scenarioMultipliers: {
      planning: 1,
      refactor: 1,
      security: 1,
      delivery: 1,
    },
  };

  const judgeLane = {
    models: [{ id: 'judge-default', provider: 'local', profile: 'audit' }],
    minimumCompositeScore: 75,
    leniencyWindow: 2,
    weights: {
      clarity: 35,
      correctness: 35,
      risk: 20,
      consistency: 10,
    },
  };

  return {
    version: '1.0.0',
    phase: 'v2.5.1',
    blindReviewMode: true,
    writerLane,
    judgeLane,
  };
}
135
+
136
/**
 * Resolves the benchmark scenarios to evaluate.
 *
 * Uses the `scenarios` array of the reproducibility profile when it is a
 * non-empty array; otherwise falls back to the four built-in scenarios.
 * Each configured entry is normalized to `{ id, category }`. Entries that are
 * `null`, not objects, or missing a field receive the placeholder values
 * instead of throwing, matching the defensive access used elsewhere in this
 * script for hand-edited JSON state files.
 *
 * @param {object|null|undefined} reproducibilityProfile - Parsed reproducibility profile.
 * @returns {Array<{id: string, category: string}>} Scenario descriptors.
 */
function loadScenarios(reproducibilityProfile) {
  const defaultScenarios = [
    { id: 'planning', category: 'planning' },
    { id: 'refactor', category: 'refactor' },
    { id: 'security', category: 'security' },
    { id: 'delivery', category: 'delivery' },
  ];

  const configuredScenarios = reproducibilityProfile?.scenarios;
  if (!Array.isArray(configuredScenarios) || configuredScenarios.length === 0) {
    return defaultScenarios;
  }

  return configuredScenarios.map((scenarioEntry) => ({
    id: scenarioEntry?.id || 'unknown-scenario',
    category: scenarioEntry?.category || 'planning',
  }));
}
153
+
154
/**
 * Collapses the upstream benchmark reports into the flat signal set used for scoring.
 *
 * Missing reports or fields fall back to defaults: accuracy, savings and
 * failure counts default to 0, while the manual correction rate and its
 * threshold default to 1. The fallbacks apply only to absent or non-numeric
 * values. A legitimate 0 is preserved, so a perfect manual correction rate of
 * 0 is not turned into the worst value 1, and a strict threshold of 0 is not
 * loosened to 1.
 *
 * @param {object|null} detectionBenchmarkReport - Report providing `top1Accuracy` and `manualCorrectionRate`.
 * @param {object|null} tokenBenchmarkReport - Report providing `summary.averageNativeSavingsPercent`.
 * @param {object|null} benchmarkGateReport - Report providing `passed` and `failureCount`.
 * @param {object|null} benchmarkIntelligenceReport - Report providing `failureCount` and `watchlist`.
 * @param {object|null} thresholdConfiguration - Provides `minimumTop1Accuracy` and `maximumManualCorrectionRate`.
 * @returns {object} Numeric signals plus the `top1AccuracyMet` / `manualCorrectionMet` flags.
 */
function buildBaseSignals(detectionBenchmarkReport, tokenBenchmarkReport, benchmarkGateReport, benchmarkIntelligenceReport, thresholdConfiguration) {
  // Converts to a finite number; absent, empty or non-numeric input yields the fallback.
  const toFiniteNumber = (rawValue, fallbackValue) => {
    if (rawValue === null || rawValue === undefined || rawValue === '') {
      return fallbackValue;
    }
    const numericValue = Number(rawValue);
    return Number.isFinite(numericValue) ? numericValue : fallbackValue;
  };

  const staleWatchlistCount = Array.isArray(benchmarkIntelligenceReport?.watchlist)
    ? benchmarkIntelligenceReport.watchlist.filter((watchlistEntry) => watchlistEntry?.stale === true).length
    : 0;

  const top1Accuracy = toFiniteNumber(detectionBenchmarkReport?.top1Accuracy, 0);
  const manualCorrectionRate = toFiniteNumber(detectionBenchmarkReport?.manualCorrectionRate, 1);
  const minimumTop1Accuracy = toFiniteNumber(thresholdConfiguration?.minimumTop1Accuracy, 0);
  const maximumManualCorrectionRate = toFiniteNumber(thresholdConfiguration?.maximumManualCorrectionRate, 1);

  return {
    top1Accuracy,
    manualCorrectionRate,
    nativeSavingsPercent: toFiniteNumber(tokenBenchmarkReport?.summary?.averageNativeSavingsPercent, 0),
    benchmarkGatePassed: benchmarkGateReport?.passed === true,
    benchmarkGateFailureCount: toFiniteNumber(benchmarkGateReport?.failureCount, 0),
    intelligenceFailureCount: toFiniteNumber(benchmarkIntelligenceReport?.failureCount, 0),
    staleWatchlistCount,
    top1AccuracyMet: top1Accuracy >= minimumTop1Accuracy,
    manualCorrectionMet: manualCorrectionRate <= maximumManualCorrectionRate,
  };
}
174
+
175
/**
 * Scores one writer model on one scenario.
 *
 * Four sub-scores (quality, efficiency, reliability, freshness) are derived
 * from the shared base signals, each nudged by a deterministic per-model
 * offset and clamped to 0..100. The composite is the weighted sum of the
 * sub-scores divided by 100.
 *
 * @param {{id: string}} writerModel - Writer model descriptor; `id` seeds the offsets.
 * @param {{id: string, category: string}} scenario - Scenario being scored.
 * @param {object} baseSignals - Signals produced by `buildBaseSignals`.
 * @param {object} writerWeights - Weights keyed by `quality`, `efficiency`, `reliability`, `freshness`.
 * @param {object} scenarioMultipliers - Optional quality multipliers keyed by scenario category.
 * @returns {object} Scenario identifiers, rounded score breakdown, rounded composite and the two core-signal flags.
 */
function buildWriterScenarioRun(writerModel, scenario, baseSignals, writerWeights, scenarioMultipliers) {
  const scenarioMultiplier = Number(scenarioMultipliers?.[scenario.category] || 1);
  const offsetFor = (seedSuffix, maxMagnitude) => deterministicOffset(`${writerModel.id}:${seedSuffix}`, maxMagnitude);
  const toPercentScore = (rawScore) => clamp(rawScore, 0, 100);

  const modelScenarioOffset = offsetFor(scenario.id, 4);
  const qualityScore = toPercentScore((baseSignals.top1Accuracy * 100 * scenarioMultiplier) + modelScenarioOffset);
  const efficiencyScore = toPercentScore(baseSignals.nativeSavingsPercent + offsetFor('efficiency', 3));

  let reliabilityScore;
  if (baseSignals.benchmarkGatePassed) {
    reliabilityScore = toPercentScore(100 + offsetFor('reliability', 2));
  } else {
    // Each gate failure costs 20 points when the gate did not pass.
    reliabilityScore = toPercentScore(100 - (baseSignals.benchmarkGateFailureCount * 20));
  }

  const freshnessScore = toPercentScore(
    100 - (baseSignals.intelligenceFailureCount * 15) - (baseSignals.staleWatchlistCount * 10) + offsetFor('freshness', 2)
  );

  const qualityContribution = qualityScore * Number(writerWeights.quality || 0);
  const efficiencyContribution = efficiencyScore * Number(writerWeights.efficiency || 0);
  const reliabilityContribution = reliabilityScore * Number(writerWeights.reliability || 0);
  const freshnessContribution = freshnessScore * Number(writerWeights.freshness || 0);
  const weightedCompositeScore = (
    qualityContribution + efficiencyContribution + reliabilityContribution + freshnessContribution
  ) / 100;

  const scoreBreakdown = {
    quality: roundToTwo(qualityScore),
    efficiency: roundToTwo(efficiencyScore),
    reliability: roundToTwo(reliabilityScore),
    freshness: roundToTwo(freshnessScore),
  };

  return {
    scenarioId: scenario.id,
    scenarioCategory: scenario.category,
    scoreBreakdown,
    compositeScore: roundToTwo(weightedCompositeScore),
    top1AccuracyMet: baseSignals.top1AccuracyMet,
    manualCorrectionMet: baseSignals.manualCorrectionMet,
  };
}
211
+
212
/**
 * Produces one comparison-matrix row: a judge model's verdict on a writer's scenario run.
 *
 * The judge score is the writer composite plus a deterministic offset seeded
 * by judge id, scenario id and writer token, clamped to 0..100. The verdict is
 * `pass` only when the judge score reaches `minimumCompositeScore - leniencyWindow`
 * AND both core signals on the writer run are met.
 *
 * `minimumCompositeScore` defaults to 75 only when it is absent or not a
 * finite number; an explicitly configured 0 is honoured.
 *
 * @param {object} writerScenarioRun - Scenario run produced for the writer.
 * @param {string} writerToken - Anonymized writer token (e.g. "W1").
 * @param {{id: string}} judgeModel - Judge model descriptor.
 * @param {object} judgeLaneConfig - Provides `minimumCompositeScore` and `leniencyWindow`.
 * @param {boolean} blindReviewMode - When true, `writerModelId` is withheld (null).
 * @returns {object} Matrix row with scores, thresholds, flags and the verdict.
 */
function evaluateJudgeForScenario(writerScenarioRun, writerToken, judgeModel, judgeLaneConfig, blindReviewMode) {
  const judgeOffset = deterministicOffset(`${judgeModel.id}:${writerScenarioRun.scenarioId}:${writerToken}`, 2);
  const judgeCompositeScore = clamp(writerScenarioRun.compositeScore + judgeOffset, 0, 100);

  const configuredMinimumScore = Number(judgeLaneConfig.minimumCompositeScore ?? 75);
  const minimumCompositeScore = Number.isFinite(configuredMinimumScore) ? configuredMinimumScore : 75;
  const leniencyWindow = Number(judgeLaneConfig.leniencyWindow || 0);

  const meetsScoreThreshold = judgeCompositeScore >= (minimumCompositeScore - leniencyWindow);
  const meetsCoreSignals = writerScenarioRun.top1AccuracyMet && writerScenarioRun.manualCorrectionMet;
  const verdict = (meetsScoreThreshold && meetsCoreSignals) ? 'pass' : 'needs-improvement';

  return {
    scenarioId: writerScenarioRun.scenarioId,
    scenarioCategory: writerScenarioRun.scenarioCategory,
    writerToken,
    // NOTE(review): outside blind mode this exposes the writer token, not a real
    // model id, because the writer model is not passed to this function. Confirm
    // whether callers expect the actual model id here.
    writerModelId: blindReviewMode ? null : writerToken,
    judgeModelId: judgeModel.id,
    blindPairId: `${writerScenarioRun.scenarioId}:${writerToken}:${judgeModel.id}`,
    writerCompositeScore: writerScenarioRun.compositeScore,
    judgeCompositeScore: roundToTwo(judgeCompositeScore),
    scoreThreshold: minimumCompositeScore,
    leniencyWindow,
    meetsScoreThreshold,
    meetsCoreSignals,
    verdict,
  };
}
238
+
239
/**
 * Reduces script execution records to the compact form stored in the report.
 *
 * @param {Array<object>} executions - Execution records with `scriptPath`, `exitCode`, `parseError` and `parsedReport`.
 * @returns {Array<object>} One summary per execution. `reportName` is the
 *   report's `reportName`, else its `gateName`, else null; `passed` is the
 *   report's `passed` value only when it is a boolean, else null.
 */
function summarizeExecutions(executions) {
  return executions.map(({ scriptPath, exitCode, parseError, parsedReport }) => {
    const reportedPassed = parsedReport?.passed;
    const reportName = parsedReport?.reportName || parsedReport?.gateName || null;

    return {
      scriptPath,
      exitCode,
      parseError,
      reportName,
      passed: typeof reportedPassed === 'boolean' ? reportedPassed : null,
    };
  });
}
250
+
251
/**
 * Scores every writer model against every scenario.
 *
 * Writers are labelled with positional tokens ("W1", "W2", ...) in the order
 * they appear in `writerModels`.
 *
 * @param {Array<object>} writerModels - Writer model descriptors.
 * @param {Array<object>} scenarios - Scenario descriptors.
 * @param {object} baseSignals - Signals shared by all runs.
 * @param {object} writerLaneConfig - Provides optional `weights` and `scenarioMultipliers`.
 * @returns {Array<object>} Per-writer records with token, model, the average
 *   composite score (0 when there are no scenarios) and the individual scenario runs.
 */
function buildWriterLaneRuns(writerModels, scenarios, baseSignals, writerLaneConfig) {
  const scoreScenario = (writerModel, scenario) => buildWriterScenarioRun(
    writerModel,
    scenario,
    baseSignals,
    writerLaneConfig.weights || {},
    writerLaneConfig.scenarioMultipliers || {}
  );

  return writerModels.map((writerModel, writerIndex) => {
    const scenarioRuns = scenarios.map((scenario) => scoreScenario(writerModel, scenario));

    let averageCompositeScore = 0;
    if (scenarioRuns.length !== 0) {
      const compositeTotal = scenarioRuns.reduce((runningTotal, { compositeScore }) => runningTotal + compositeScore, 0);
      averageCompositeScore = roundToTwo(compositeTotal / scenarioRuns.length);
    }

    return {
      writerToken: `W${writerIndex + 1}`,
      writerModel,
      averageCompositeScore,
      scenarioRuns,
    };
  });
}
274
+
275
/**
 * Builds the comparison matrix: one row per (writer, scenario run, judge model) combination.
 *
 * Rows are ordered by writer, then by that writer's scenario runs, then by judge model.
 *
 * @param {Array<object>} writerLaneRuns - Writer records carrying `writerToken` and `scenarioRuns`.
 * @param {Array<object>} judgeModels - Judge model descriptors.
 * @param {object} judgeLaneConfig - Judge thresholds passed through to each evaluation.
 * @param {boolean} blindReviewMode - Passed through to each evaluation.
 * @returns {Array<object>} Flat list of matrix rows.
 */
function buildJudgeLaneRuns(writerLaneRuns, judgeModels, judgeLaneConfig, blindReviewMode) {
  return writerLaneRuns.flatMap((writerLaneRun) => writerLaneRun.scenarioRuns.flatMap(
    (writerScenarioRun) => judgeModels.map((judgeModel) => evaluateJudgeForScenario(
      writerScenarioRun,
      writerLaneRun.writerToken,
      judgeModel,
      judgeLaneConfig,
      blindReviewMode
    ))
  ));
}
290
+
291
/**
 * Script entry point: builds the writer-judge comparison report.
 *
 * Steps:
 *   1. Load the lane config, reproducibility profile and thresholds, falling
 *      back to built-in defaults when a file is unavailable.
 *   2. Run the four upstream benchmark scripts and collect their JSON reports.
 *   3. Score every writer model per scenario, then let every judge model
 *      evaluate every writer scenario run.
 *   4. Write the report to OUTPUT_PATH (skipped with `--stdout-only`) and
 *      print it to stdout.
 *
 * `passed` reflects only whether every upstream script produced parseable
 * JSON; judge verdicts are reported in `summary` and do not affect it.
 *
 * The exit status is set through `process.exitCode` rather than
 * `process.exit()`, so a large report written to a piped stdout is fully
 * flushed before the process terminates.
 *
 * @returns {Promise<void>}
 */
async function runWriterJudgeMatrix() {
  const defaultConfig = buildDefaultConfig();
  const writerJudgeConfig = readJsonOrNull(CONFIG_PATH) || defaultConfig;
  const reproducibilityProfile = readJsonOrNull(REPRO_PROFILE_PATH) || { scenarios: [] };
  const thresholdConfiguration = readJsonOrNull(THRESHOLD_PATH) || {};

  const detectionBenchmarkExecution = runJsonScript('scripts/detection-benchmark.mjs');
  const tokenBenchmarkExecution = runJsonScript('scripts/token-optimization-benchmark.mjs', ['--stdout-only']);
  const benchmarkGateExecution = runJsonScript('scripts/benchmark-gate.mjs');
  const benchmarkIntelligenceExecution = runJsonScript('scripts/benchmark-intelligence.mjs');

  const executionSummaries = summarizeExecutions([
    detectionBenchmarkExecution,
    tokenBenchmarkExecution,
    benchmarkGateExecution,
    benchmarkIntelligenceExecution,
  ]);

  const executionFailureCount = executionSummaries.filter((executionSummary) => executionSummary.parseError).length;
  const scenarios = loadScenarios(reproducibilityProfile);

  const baseSignals = buildBaseSignals(
    detectionBenchmarkExecution.parsedReport,
    tokenBenchmarkExecution.parsedReport,
    benchmarkGateExecution.parsedReport,
    benchmarkIntelligenceExecution.parsedReport,
    thresholdConfiguration
  );

  // A lane without a non-empty `models` array falls back to the default model list.
  const resolveLaneModels = (laneConfig, fallbackModels) => (
    Array.isArray(laneConfig?.models) && laneConfig.models.length > 0
      ? laneConfig.models
      : fallbackModels
  );

  const writerModels = resolveLaneModels(writerJudgeConfig?.writerLane, defaultConfig.writerLane.models);
  const judgeModels = resolveLaneModels(writerJudgeConfig?.judgeLane, defaultConfig.judgeLane.models);

  // Blind review stays on unless the config explicitly sets it to false.
  const blindReviewMode = writerJudgeConfig.blindReviewMode !== false;

  const writerLaneRuns = buildWriterLaneRuns(
    writerModels,
    scenarios,
    baseSignals,
    writerJudgeConfig.writerLane || defaultConfig.writerLane
  );

  const comparisonMatrix = buildJudgeLaneRuns(
    writerLaneRuns,
    judgeModels,
    writerJudgeConfig.judgeLane || defaultConfig.judgeLane,
    blindReviewMode
  );

  const passCount = comparisonMatrix.filter((matrixRow) => matrixRow.verdict === 'pass').length;
  const passRatePercent = comparisonMatrix.length === 0
    ? 0
    : roundToTwo((passCount / comparisonMatrix.length) * 100);

  const writerJudgeReport = {
    generatedAt: new Date().toISOString(),
    reportName: 'benchmark-writer-judge-matrix',
    phase: 'v2.5.1',
    passed: executionFailureCount === 0,
    failureCount: executionFailureCount,
    methodology: {
      blindReviewMode,
      writerLaneModelCount: writerModels.length,
      judgeLaneModelCount: judgeModels.length,
      scenarioCount: scenarios.length,
      writerWeights: writerJudgeConfig?.writerLane?.weights || null,
      judgeWeights: writerJudgeConfig?.judgeLane?.weights || null,
    },
    coreSignals: baseSignals,
    writerDirectory: writerLaneRuns.map((writerLaneRun) => ({
      writerToken: writerLaneRun.writerToken,
      writerModel: writerLaneRun.writerModel,
      averageCompositeScore: writerLaneRun.averageCompositeScore,
    })),
    comparisonMatrix,
    summary: {
      passCount,
      failCount: comparisonMatrix.length - passCount,
      passRatePercent,
    },
    executions: executionSummaries,
  };

  const serializedReport = JSON.stringify(writerJudgeReport, null, 2);

  if (!isStdoutOnlyMode) {
    // The state directory may not exist yet in a fresh checkout.
    await fs.mkdir(dirname(OUTPUT_PATH), { recursive: true });
    await fs.writeFile(OUTPUT_PATH, `${serializedReport}\n`, 'utf8');
  }

  console.log(serializedReport);
  process.exitCode = writerJudgeReport.passed ? 0 : 1;
}

runWriterJudgeMatrix().catch((unexpectedError) => {
  // Report unexpected failures on stderr so stdout never carries a partial JSON payload.
  console.error(unexpectedError);
  process.exitCode = 1;
});
@@ -55,15 +55,29 @@ const FORMAL_ARTIFACT_PATHS = [
55
55
  const REQUIRED_HUMAN_WRITING_SNIPPETS = [
56
56
  {
57
57
  path: '.agent-context/rules/api-docs.md',
58
- snippets: ['## Human Writing Standard (Mandatory)', 'No emoji in formal artifacts.'],
58
+ snippets: [
59
+ '## Human Writing Standard (Mandatory)',
60
+ 'This applies to documentation, release notes, onboarding text, review summaries, and agent-facing explanations.',
61
+ 'Style baseline findings are advisory by default and must not block endpoint-change commits that already include accurate docs/spec updates.',
62
+ 'No emoji in formal artifacts.',
63
+ ],
59
64
  },
60
65
  {
61
66
  path: '.agent-context/review-checklists/pr-checklist.md',
62
- snippets: ['No emoji in formal documentation or review summaries', 'Documentation uses plain English and avoids AI cliches'],
67
+ snippets: [
68
+ 'Scope applied: This applies to documentation, release notes, onboarding text, review summaries, and agent-facing explanations',
69
+ 'Style scope review is advisory and does not block merge when API docs are synced in the same commit and contract details are correct',
70
+ 'No emoji in formal documentation or review summaries',
71
+ 'Documentation uses plain English and avoids AI cliches',
72
+ ],
63
73
  },
64
74
  {
65
75
  path: 'docs/deep_analysis_and_roadmap_backlog.md',
66
- snippets: ['## Part 6: Documentation and Explanation Standards (Mandatory)', 'No emoji in formal artifacts. This is mandatory.'],
76
+ snippets: [
77
+ '## Part 6: Documentation and Explanation Standards (Mandatory)',
78
+ 'This applies to documentation, release notes, onboarding text, review summaries, and agent-facing explanations.',
79
+ 'No emoji in formal artifacts. This is mandatory.',
80
+ ],
67
81
  },
68
82
  ];
69
83
 
@@ -149,6 +163,7 @@ async function validateRequiredFiles() {
149
163
  'scripts/llm-judge.mjs',
150
164
  'scripts/detection-benchmark.mjs',
151
165
  'scripts/benchmark-evidence-bundle.mjs',
166
+ 'scripts/benchmark-writer-judge-matrix.mjs',
152
167
  'scripts/benchmark-gate.mjs',
153
168
  'scripts/benchmark-intelligence.mjs',
154
169
  'scripts/governance-weekly-report.mjs',
@@ -175,6 +190,7 @@ async function validateRequiredFiles() {
175
190
  'docs/v1.8-operations-playbook.md',
176
191
  'docs/v2-upgrade-playbook.md',
177
192
  '.agent-context/state/benchmark-reproducibility.json',
193
+ '.agent-context/state/benchmark-writer-judge-config.json',
178
194
  '.agent-context/state/benchmark-watchlist.json',
179
195
  '.agent-context/state/skill-platform.json',
180
196
  '.agent-context/skills/index.json',