@ryuenn3123/agentic-senior-core 2.0.15 → 2.0.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agent-context/prompts/review-code.md +1 -0
- package/.agent-context/review-checklists/pr-checklist.md +1 -0
- package/.agent-context/rules/api-docs.md +8 -5
- package/.agent-context/state/benchmark-reproducibility.json +3 -1
- package/.agent-context/state/benchmark-writer-judge-config.json +58 -0
- package/.agent-context/state/benchmark-writer-judge-matrix.json +462 -0
- package/.cursorrules +1 -1
- package/.windsurfrules +1 -1
- package/README.md +20 -0
- package/package.json +2 -1
- package/scripts/benchmark-writer-judge-matrix.mjs +383 -0
- package/scripts/validate.mjs +17 -3
|
@@ -13,6 +13,7 @@ Run a comprehensive code review on the current codebase (or the files I'm about
|
|
|
13
13
|
Use these checklists:
|
|
14
14
|
1. Read .agent-context/review-checklists/pr-checklist.md — apply every item.
|
|
15
15
|
2. Read .agent-context/review-checklists/security-audit.md — apply every item.
|
|
16
|
+
3. Apply documentation scope rules exactly: This applies to documentation, release notes, onboarding text, review summaries, and agent-facing explanations.
|
|
16
17
|
|
|
17
18
|
For EVERY violation found:
|
|
18
19
|
- State the exact file and line
|
|
@@ -91,6 +91,7 @@ VERDICT: PASS / FAIL (X/Y items passed)
|
|
|
91
91
|
- [ ] `.env` not committed
|
|
92
92
|
|
|
93
93
|
### 10. Documentation
|
|
94
|
+
- [ ] Scope applied: This applies to documentation, release notes, onboarding text, review summaries, and agent-facing explanations
|
|
94
95
|
- [ ] API endpoints have OpenAPI/Swagger documentation
|
|
95
96
|
- [ ] Complex business logic has comments explaining WHY
|
|
96
97
|
- [ ] Public functions/methods have JSDoc/docstrings
|
|
@@ -27,7 +27,8 @@ REQUIRED: Documentation MUST be updated in the SAME commit as the endpoint chang
|
|
|
27
27
|
|
|
28
28
|
## Human Writing Standard (Mandatory)
|
|
29
29
|
|
|
30
|
-
This
|
|
30
|
+
This applies to documentation, release notes, onboarding text, review summaries, and agent-facing explanations.
|
|
31
|
+
API docs and README updates are included in this scope.
|
|
31
32
|
|
|
32
33
|
### Style Baseline
|
|
33
34
|
1. Write for native English speakers.
|
|
@@ -35,18 +36,20 @@ This standard applies to API docs, README updates, release notes, and technical
|
|
|
35
36
|
3. Use clear, direct, plain language.
|
|
36
37
|
4. Keep sentence rhythm natural with short and medium sentences.
|
|
37
38
|
5. Sound confident, practical, and conversational.
|
|
39
|
+
6. State the main point first, then supporting detail.
|
|
38
40
|
|
|
39
41
|
### Required Behavior
|
|
40
42
|
1. Explain decisions the way a competent coworker would explain them out loud.
|
|
41
43
|
2. Cut unnecessary words and remove filler.
|
|
42
44
|
3. Use concrete verbs and everyday phrasing.
|
|
43
45
|
4. Rewrite and reorder content when flow is weak.
|
|
46
|
+
5. Keep explanations short by default; expand only when complexity requires it.
|
|
44
47
|
|
|
45
|
-
###
|
|
48
|
+
### Non-Negotiables
|
|
46
49
|
1. No emoji in formal artifacts.
|
|
47
|
-
2.
|
|
48
|
-
3.
|
|
49
|
-
4.
|
|
50
|
+
2. Avoid AI cliches and buzzwords: delve, leverage, robust, utilize, seamless.
|
|
51
|
+
3. Avoid inflated, academic, or performative language.
|
|
52
|
+
4. Avoid padding, hedging, and redundant phrasing.
|
|
50
53
|
|
|
51
54
|
### Critical Controls
|
|
52
55
|
1. Any claim about quality, performance, or reliability must include a measurable source and timestamp.
|
|
@@ -73,13 +73,15 @@
|
|
|
73
73
|
"Run npm run benchmark:detection to regenerate detection benchmark output.",
|
|
74
74
|
"Run npm run benchmark:gate to validate benchmark anti-regression thresholds.",
|
|
75
75
|
"Run npm run benchmark:intelligence to validate benchmark watchlist freshness.",
|
|
76
|
-
"Run npm run benchmark:bundle to emit a reproducible benchmark evidence bundle."
|
|
76
|
+
"Run npm run benchmark:bundle to emit a reproducible benchmark evidence bundle.",
|
|
77
|
+
"Run npm run benchmark:writer-judge to emit writer-judge side-by-side matrix output."
|
|
77
78
|
],
|
|
78
79
|
"commandExamples": [
|
|
79
80
|
"npm run benchmark:detection",
|
|
80
81
|
"npm run benchmark:gate",
|
|
81
82
|
"npm run benchmark:intelligence",
|
|
82
83
|
"npm run benchmark:bundle",
|
|
84
|
+
"npm run benchmark:writer-judge",
|
|
83
85
|
"node ./scripts/benchmark-evidence-bundle.mjs --stdout-only"
|
|
84
86
|
]
|
|
85
87
|
}
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
{
|
|
2
|
+
"version": "1.0.0",
|
|
3
|
+
"phase": "v2.5.1",
|
|
4
|
+
"blindReviewMode": true,
|
|
5
|
+
"writerLane": {
|
|
6
|
+
"models": [
|
|
7
|
+
{
|
|
8
|
+
"id": "writer-copilot-balanced",
|
|
9
|
+
"provider": "github-copilot",
|
|
10
|
+
"profile": "balanced"
|
|
11
|
+
},
|
|
12
|
+
{
|
|
13
|
+
"id": "writer-claude-architect",
|
|
14
|
+
"provider": "anthropic",
|
|
15
|
+
"profile": "architect"
|
|
16
|
+
},
|
|
17
|
+
{
|
|
18
|
+
"id": "writer-gemini-ops",
|
|
19
|
+
"provider": "google",
|
|
20
|
+
"profile": "operations"
|
|
21
|
+
}
|
|
22
|
+
],
|
|
23
|
+
"weights": {
|
|
24
|
+
"quality": 40,
|
|
25
|
+
"efficiency": 20,
|
|
26
|
+
"reliability": 25,
|
|
27
|
+
"freshness": 15
|
|
28
|
+
},
|
|
29
|
+
"scenarioMultipliers": {
|
|
30
|
+
"planning": 1,
|
|
31
|
+
"refactor": 1.02,
|
|
32
|
+
"security": 1.03,
|
|
33
|
+
"delivery": 0.98
|
|
34
|
+
}
|
|
35
|
+
},
|
|
36
|
+
"judgeLane": {
|
|
37
|
+
"models": [
|
|
38
|
+
{
|
|
39
|
+
"id": "judge-claude-audit",
|
|
40
|
+
"provider": "anthropic",
|
|
41
|
+
"profile": "audit"
|
|
42
|
+
},
|
|
43
|
+
{
|
|
44
|
+
"id": "judge-gpt-risk",
|
|
45
|
+
"provider": "openai",
|
|
46
|
+
"profile": "risk"
|
|
47
|
+
}
|
|
48
|
+
],
|
|
49
|
+
"minimumCompositeScore": 75,
|
|
50
|
+
"leniencyWindow": 2,
|
|
51
|
+
"weights": {
|
|
52
|
+
"clarity": 35,
|
|
53
|
+
"correctness": 35,
|
|
54
|
+
"risk": 20,
|
|
55
|
+
"consistency": 10
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
}
|
|
@@ -0,0 +1,462 @@
|
|
|
1
|
+
{
|
|
2
|
+
"generatedAt": "2026-04-14T06:57:14.623Z",
|
|
3
|
+
"reportName": "benchmark-writer-judge-matrix",
|
|
4
|
+
"phase": "v2.5.1",
|
|
5
|
+
"passed": true,
|
|
6
|
+
"failureCount": 0,
|
|
7
|
+
"methodology": {
|
|
8
|
+
"blindReviewMode": true,
|
|
9
|
+
"writerLaneModelCount": 3,
|
|
10
|
+
"judgeLaneModelCount": 2,
|
|
11
|
+
"scenarioCount": 4,
|
|
12
|
+
"writerWeights": {
|
|
13
|
+
"quality": 40,
|
|
14
|
+
"efficiency": 20,
|
|
15
|
+
"reliability": 25,
|
|
16
|
+
"freshness": 15
|
|
17
|
+
},
|
|
18
|
+
"judgeWeights": {
|
|
19
|
+
"clarity": 35,
|
|
20
|
+
"correctness": 35,
|
|
21
|
+
"risk": 20,
|
|
22
|
+
"consistency": 10
|
|
23
|
+
}
|
|
24
|
+
},
|
|
25
|
+
"coreSignals": {
|
|
26
|
+
"top1Accuracy": 0.9167,
|
|
27
|
+
"manualCorrectionRate": 0.0833,
|
|
28
|
+
"nativeSavingsPercent": 81.52,
|
|
29
|
+
"benchmarkGatePassed": true,
|
|
30
|
+
"benchmarkGateFailureCount": 0,
|
|
31
|
+
"intelligenceFailureCount": 0,
|
|
32
|
+
"staleWatchlistCount": 0,
|
|
33
|
+
"top1AccuracyMet": true,
|
|
34
|
+
"manualCorrectionMet": true
|
|
35
|
+
},
|
|
36
|
+
"writerDirectory": [
|
|
37
|
+
{
|
|
38
|
+
"writerToken": "W1",
|
|
39
|
+
"writerModel": {
|
|
40
|
+
"id": "writer-copilot-balanced",
|
|
41
|
+
"provider": "github-copilot",
|
|
42
|
+
"profile": "balanced"
|
|
43
|
+
},
|
|
44
|
+
"averageCompositeScore": 93.7
|
|
45
|
+
},
|
|
46
|
+
{
|
|
47
|
+
"writerToken": "W2",
|
|
48
|
+
"writerModel": {
|
|
49
|
+
"id": "writer-claude-architect",
|
|
50
|
+
"provider": "anthropic",
|
|
51
|
+
"profile": "architect"
|
|
52
|
+
},
|
|
53
|
+
"averageCompositeScore": 92.8
|
|
54
|
+
},
|
|
55
|
+
{
|
|
56
|
+
"writerToken": "W3",
|
|
57
|
+
"writerModel": {
|
|
58
|
+
"id": "writer-gemini-ops",
|
|
59
|
+
"provider": "google",
|
|
60
|
+
"profile": "operations"
|
|
61
|
+
},
|
|
62
|
+
"averageCompositeScore": 92.65
|
|
63
|
+
}
|
|
64
|
+
],
|
|
65
|
+
"comparisonMatrix": [
|
|
66
|
+
{
|
|
67
|
+
"scenarioId": "planning",
|
|
68
|
+
"scenarioCategory": "planning",
|
|
69
|
+
"writerToken": "W1",
|
|
70
|
+
"writerModelId": null,
|
|
71
|
+
"judgeModelId": "judge-claude-audit",
|
|
72
|
+
"blindPairId": "planning:W1:judge-claude-audit",
|
|
73
|
+
"writerCompositeScore": 92.12,
|
|
74
|
+
"judgeCompositeScore": 94.12,
|
|
75
|
+
"scoreThreshold": 75,
|
|
76
|
+
"leniencyWindow": 2,
|
|
77
|
+
"meetsScoreThreshold": true,
|
|
78
|
+
"meetsCoreSignals": true,
|
|
79
|
+
"verdict": "pass"
|
|
80
|
+
},
|
|
81
|
+
{
|
|
82
|
+
"scenarioId": "planning",
|
|
83
|
+
"scenarioCategory": "planning",
|
|
84
|
+
"writerToken": "W1",
|
|
85
|
+
"writerModelId": null,
|
|
86
|
+
"judgeModelId": "judge-gpt-risk",
|
|
87
|
+
"blindPairId": "planning:W1:judge-gpt-risk",
|
|
88
|
+
"writerCompositeScore": 92.12,
|
|
89
|
+
"judgeCompositeScore": 94.12,
|
|
90
|
+
"scoreThreshold": 75,
|
|
91
|
+
"leniencyWindow": 2,
|
|
92
|
+
"meetsScoreThreshold": true,
|
|
93
|
+
"meetsCoreSignals": true,
|
|
94
|
+
"verdict": "pass"
|
|
95
|
+
},
|
|
96
|
+
{
|
|
97
|
+
"scenarioId": "refactor",
|
|
98
|
+
"scenarioCategory": "refactor",
|
|
99
|
+
"writerToken": "W1",
|
|
100
|
+
"writerModelId": null,
|
|
101
|
+
"judgeModelId": "judge-claude-audit",
|
|
102
|
+
"blindPairId": "refactor:W1:judge-claude-audit",
|
|
103
|
+
"writerCompositeScore": 95.26,
|
|
104
|
+
"judgeCompositeScore": 93.26,
|
|
105
|
+
"scoreThreshold": 75,
|
|
106
|
+
"leniencyWindow": 2,
|
|
107
|
+
"meetsScoreThreshold": true,
|
|
108
|
+
"meetsCoreSignals": true,
|
|
109
|
+
"verdict": "pass"
|
|
110
|
+
},
|
|
111
|
+
{
|
|
112
|
+
"scenarioId": "refactor",
|
|
113
|
+
"scenarioCategory": "refactor",
|
|
114
|
+
"writerToken": "W1",
|
|
115
|
+
"writerModelId": null,
|
|
116
|
+
"judgeModelId": "judge-gpt-risk",
|
|
117
|
+
"blindPairId": "refactor:W1:judge-gpt-risk",
|
|
118
|
+
"writerCompositeScore": 95.26,
|
|
119
|
+
"judgeCompositeScore": 93.26,
|
|
120
|
+
"scoreThreshold": 75,
|
|
121
|
+
"leniencyWindow": 2,
|
|
122
|
+
"meetsScoreThreshold": true,
|
|
123
|
+
"meetsCoreSignals": true,
|
|
124
|
+
"verdict": "pass"
|
|
125
|
+
},
|
|
126
|
+
{
|
|
127
|
+
"scenarioId": "security",
|
|
128
|
+
"scenarioCategory": "security",
|
|
129
|
+
"writerToken": "W1",
|
|
130
|
+
"writerModelId": null,
|
|
131
|
+
"judgeModelId": "judge-claude-audit",
|
|
132
|
+
"blindPairId": "security:W1:judge-claude-audit",
|
|
133
|
+
"writerCompositeScore": 94.42,
|
|
134
|
+
"judgeCompositeScore": 92.42,
|
|
135
|
+
"scoreThreshold": 75,
|
|
136
|
+
"leniencyWindow": 2,
|
|
137
|
+
"meetsScoreThreshold": true,
|
|
138
|
+
"meetsCoreSignals": true,
|
|
139
|
+
"verdict": "pass"
|
|
140
|
+
},
|
|
141
|
+
{
|
|
142
|
+
"scenarioId": "security",
|
|
143
|
+
"scenarioCategory": "security",
|
|
144
|
+
"writerToken": "W1",
|
|
145
|
+
"writerModelId": null,
|
|
146
|
+
"judgeModelId": "judge-gpt-risk",
|
|
147
|
+
"blindPairId": "security:W1:judge-gpt-risk",
|
|
148
|
+
"writerCompositeScore": 94.42,
|
|
149
|
+
"judgeCompositeScore": 93.42,
|
|
150
|
+
"scoreThreshold": 75,
|
|
151
|
+
"leniencyWindow": 2,
|
|
152
|
+
"meetsScoreThreshold": true,
|
|
153
|
+
"meetsCoreSignals": true,
|
|
154
|
+
"verdict": "pass"
|
|
155
|
+
},
|
|
156
|
+
{
|
|
157
|
+
"scenarioId": "delivery",
|
|
158
|
+
"scenarioCategory": "delivery",
|
|
159
|
+
"writerToken": "W1",
|
|
160
|
+
"writerModelId": null,
|
|
161
|
+
"judgeModelId": "judge-claude-audit",
|
|
162
|
+
"blindPairId": "delivery:W1:judge-claude-audit",
|
|
163
|
+
"writerCompositeScore": 92.99,
|
|
164
|
+
"judgeCompositeScore": 93.99,
|
|
165
|
+
"scoreThreshold": 75,
|
|
166
|
+
"leniencyWindow": 2,
|
|
167
|
+
"meetsScoreThreshold": true,
|
|
168
|
+
"meetsCoreSignals": true,
|
|
169
|
+
"verdict": "pass"
|
|
170
|
+
},
|
|
171
|
+
{
|
|
172
|
+
"scenarioId": "delivery",
|
|
173
|
+
"scenarioCategory": "delivery",
|
|
174
|
+
"writerToken": "W1",
|
|
175
|
+
"writerModelId": null,
|
|
176
|
+
"judgeModelId": "judge-gpt-risk",
|
|
177
|
+
"blindPairId": "delivery:W1:judge-gpt-risk",
|
|
178
|
+
"writerCompositeScore": 92.99,
|
|
179
|
+
"judgeCompositeScore": 92.99,
|
|
180
|
+
"scoreThreshold": 75,
|
|
181
|
+
"leniencyWindow": 2,
|
|
182
|
+
"meetsScoreThreshold": true,
|
|
183
|
+
"meetsCoreSignals": true,
|
|
184
|
+
"verdict": "pass"
|
|
185
|
+
},
|
|
186
|
+
{
|
|
187
|
+
"scenarioId": "planning",
|
|
188
|
+
"scenarioCategory": "planning",
|
|
189
|
+
"writerToken": "W2",
|
|
190
|
+
"writerModelId": null,
|
|
191
|
+
"judgeModelId": "judge-claude-audit",
|
|
192
|
+
"blindPairId": "planning:W2:judge-claude-audit",
|
|
193
|
+
"writerCompositeScore": 93.12,
|
|
194
|
+
"judgeCompositeScore": 94.12,
|
|
195
|
+
"scoreThreshold": 75,
|
|
196
|
+
"leniencyWindow": 2,
|
|
197
|
+
"meetsScoreThreshold": true,
|
|
198
|
+
"meetsCoreSignals": true,
|
|
199
|
+
"verdict": "pass"
|
|
200
|
+
},
|
|
201
|
+
{
|
|
202
|
+
"scenarioId": "planning",
|
|
203
|
+
"scenarioCategory": "planning",
|
|
204
|
+
"writerToken": "W2",
|
|
205
|
+
"writerModelId": null,
|
|
206
|
+
"judgeModelId": "judge-gpt-risk",
|
|
207
|
+
"blindPairId": "planning:W2:judge-gpt-risk",
|
|
208
|
+
"writerCompositeScore": 93.12,
|
|
209
|
+
"judgeCompositeScore": 94.12,
|
|
210
|
+
"scoreThreshold": 75,
|
|
211
|
+
"leniencyWindow": 2,
|
|
212
|
+
"meetsScoreThreshold": true,
|
|
213
|
+
"meetsCoreSignals": true,
|
|
214
|
+
"verdict": "pass"
|
|
215
|
+
},
|
|
216
|
+
{
|
|
217
|
+
"scenarioId": "refactor",
|
|
218
|
+
"scenarioCategory": "refactor",
|
|
219
|
+
"writerToken": "W2",
|
|
220
|
+
"writerModelId": null,
|
|
221
|
+
"judgeModelId": "judge-claude-audit",
|
|
222
|
+
"blindPairId": "refactor:W2:judge-claude-audit",
|
|
223
|
+
"writerCompositeScore": 93.06,
|
|
224
|
+
"judgeCompositeScore": 95.06,
|
|
225
|
+
"scoreThreshold": 75,
|
|
226
|
+
"leniencyWindow": 2,
|
|
227
|
+
"meetsScoreThreshold": true,
|
|
228
|
+
"meetsCoreSignals": true,
|
|
229
|
+
"verdict": "pass"
|
|
230
|
+
},
|
|
231
|
+
{
|
|
232
|
+
"scenarioId": "refactor",
|
|
233
|
+
"scenarioCategory": "refactor",
|
|
234
|
+
"writerToken": "W2",
|
|
235
|
+
"writerModelId": null,
|
|
236
|
+
"judgeModelId": "judge-gpt-risk",
|
|
237
|
+
"blindPairId": "refactor:W2:judge-gpt-risk",
|
|
238
|
+
"writerCompositeScore": 93.06,
|
|
239
|
+
"judgeCompositeScore": 95.06,
|
|
240
|
+
"scoreThreshold": 75,
|
|
241
|
+
"leniencyWindow": 2,
|
|
242
|
+
"meetsScoreThreshold": true,
|
|
243
|
+
"meetsCoreSignals": true,
|
|
244
|
+
"verdict": "pass"
|
|
245
|
+
},
|
|
246
|
+
{
|
|
247
|
+
"scenarioId": "security",
|
|
248
|
+
"scenarioCategory": "security",
|
|
249
|
+
"writerToken": "W2",
|
|
250
|
+
"writerModelId": null,
|
|
251
|
+
"judgeModelId": "judge-claude-audit",
|
|
252
|
+
"blindPairId": "security:W2:judge-claude-audit",
|
|
253
|
+
"writerCompositeScore": 91.82,
|
|
254
|
+
"judgeCompositeScore": 90.82,
|
|
255
|
+
"scoreThreshold": 75,
|
|
256
|
+
"leniencyWindow": 2,
|
|
257
|
+
"meetsScoreThreshold": true,
|
|
258
|
+
"meetsCoreSignals": true,
|
|
259
|
+
"verdict": "pass"
|
|
260
|
+
},
|
|
261
|
+
{
|
|
262
|
+
"scenarioId": "security",
|
|
263
|
+
"scenarioCategory": "security",
|
|
264
|
+
"writerToken": "W2",
|
|
265
|
+
"writerModelId": null,
|
|
266
|
+
"judgeModelId": "judge-gpt-risk",
|
|
267
|
+
"blindPairId": "security:W2:judge-gpt-risk",
|
|
268
|
+
"writerCompositeScore": 91.82,
|
|
269
|
+
"judgeCompositeScore": 89.82,
|
|
270
|
+
"scoreThreshold": 75,
|
|
271
|
+
"leniencyWindow": 2,
|
|
272
|
+
"meetsScoreThreshold": true,
|
|
273
|
+
"meetsCoreSignals": true,
|
|
274
|
+
"verdict": "pass"
|
|
275
|
+
},
|
|
276
|
+
{
|
|
277
|
+
"scenarioId": "delivery",
|
|
278
|
+
"scenarioCategory": "delivery",
|
|
279
|
+
"writerToken": "W2",
|
|
280
|
+
"writerModelId": null,
|
|
281
|
+
"judgeModelId": "judge-claude-audit",
|
|
282
|
+
"blindPairId": "delivery:W2:judge-claude-audit",
|
|
283
|
+
"writerCompositeScore": 93.19,
|
|
284
|
+
"judgeCompositeScore": 93.19,
|
|
285
|
+
"scoreThreshold": 75,
|
|
286
|
+
"leniencyWindow": 2,
|
|
287
|
+
"meetsScoreThreshold": true,
|
|
288
|
+
"meetsCoreSignals": true,
|
|
289
|
+
"verdict": "pass"
|
|
290
|
+
},
|
|
291
|
+
{
|
|
292
|
+
"scenarioId": "delivery",
|
|
293
|
+
"scenarioCategory": "delivery",
|
|
294
|
+
"writerToken": "W2",
|
|
295
|
+
"writerModelId": null,
|
|
296
|
+
"judgeModelId": "judge-gpt-risk",
|
|
297
|
+
"blindPairId": "delivery:W2:judge-gpt-risk",
|
|
298
|
+
"writerCompositeScore": 93.19,
|
|
299
|
+
"judgeCompositeScore": 94.19,
|
|
300
|
+
"scoreThreshold": 75,
|
|
301
|
+
"leniencyWindow": 2,
|
|
302
|
+
"meetsScoreThreshold": true,
|
|
303
|
+
"meetsCoreSignals": true,
|
|
304
|
+
"verdict": "pass"
|
|
305
|
+
},
|
|
306
|
+
{
|
|
307
|
+
"scenarioId": "planning",
|
|
308
|
+
"scenarioCategory": "planning",
|
|
309
|
+
"writerToken": "W3",
|
|
310
|
+
"writerModelId": null,
|
|
311
|
+
"judgeModelId": "judge-claude-audit",
|
|
312
|
+
"blindPairId": "planning:W3:judge-claude-audit",
|
|
313
|
+
"writerCompositeScore": 93.27,
|
|
314
|
+
"judgeCompositeScore": 93.27,
|
|
315
|
+
"scoreThreshold": 75,
|
|
316
|
+
"leniencyWindow": 2,
|
|
317
|
+
"meetsScoreThreshold": true,
|
|
318
|
+
"meetsCoreSignals": true,
|
|
319
|
+
"verdict": "pass"
|
|
320
|
+
},
|
|
321
|
+
{
|
|
322
|
+
"scenarioId": "planning",
|
|
323
|
+
"scenarioCategory": "planning",
|
|
324
|
+
"writerToken": "W3",
|
|
325
|
+
"writerModelId": null,
|
|
326
|
+
"judgeModelId": "judge-gpt-risk",
|
|
327
|
+
"blindPairId": "planning:W3:judge-gpt-risk",
|
|
328
|
+
"writerCompositeScore": 93.27,
|
|
329
|
+
"judgeCompositeScore": 93.27,
|
|
330
|
+
"scoreThreshold": 75,
|
|
331
|
+
"leniencyWindow": 2,
|
|
332
|
+
"meetsScoreThreshold": true,
|
|
333
|
+
"meetsCoreSignals": true,
|
|
334
|
+
"verdict": "pass"
|
|
335
|
+
},
|
|
336
|
+
{
|
|
337
|
+
"scenarioId": "refactor",
|
|
338
|
+
"scenarioCategory": "refactor",
|
|
339
|
+
"writerToken": "W3",
|
|
340
|
+
"writerModelId": null,
|
|
341
|
+
"judgeModelId": "judge-claude-audit",
|
|
342
|
+
"blindPairId": "refactor:W3:judge-claude-audit",
|
|
343
|
+
"writerCompositeScore": 94.01,
|
|
344
|
+
"judgeCompositeScore": 95.01,
|
|
345
|
+
"scoreThreshold": 75,
|
|
346
|
+
"leniencyWindow": 2,
|
|
347
|
+
"meetsScoreThreshold": true,
|
|
348
|
+
"meetsCoreSignals": true,
|
|
349
|
+
"verdict": "pass"
|
|
350
|
+
},
|
|
351
|
+
{
|
|
352
|
+
"scenarioId": "refactor",
|
|
353
|
+
"scenarioCategory": "refactor",
|
|
354
|
+
"writerToken": "W3",
|
|
355
|
+
"writerModelId": null,
|
|
356
|
+
"judgeModelId": "judge-gpt-risk",
|
|
357
|
+
"blindPairId": "refactor:W3:judge-gpt-risk",
|
|
358
|
+
"writerCompositeScore": 94.01,
|
|
359
|
+
"judgeCompositeScore": 95.01,
|
|
360
|
+
"scoreThreshold": 75,
|
|
361
|
+
"leniencyWindow": 2,
|
|
362
|
+
"meetsScoreThreshold": true,
|
|
363
|
+
"meetsCoreSignals": true,
|
|
364
|
+
"verdict": "pass"
|
|
365
|
+
},
|
|
366
|
+
{
|
|
367
|
+
"scenarioId": "security",
|
|
368
|
+
"scenarioCategory": "security",
|
|
369
|
+
"writerToken": "W3",
|
|
370
|
+
"writerModelId": null,
|
|
371
|
+
"judgeModelId": "judge-claude-audit",
|
|
372
|
+
"blindPairId": "security:W3:judge-claude-audit",
|
|
373
|
+
"writerCompositeScore": 92.37,
|
|
374
|
+
"judgeCompositeScore": 92.37,
|
|
375
|
+
"scoreThreshold": 75,
|
|
376
|
+
"leniencyWindow": 2,
|
|
377
|
+
"meetsScoreThreshold": true,
|
|
378
|
+
"meetsCoreSignals": true,
|
|
379
|
+
"verdict": "pass"
|
|
380
|
+
},
|
|
381
|
+
{
|
|
382
|
+
"scenarioId": "security",
|
|
383
|
+
"scenarioCategory": "security",
|
|
384
|
+
"writerToken": "W3",
|
|
385
|
+
"writerModelId": null,
|
|
386
|
+
"judgeModelId": "judge-gpt-risk",
|
|
387
|
+
"blindPairId": "security:W3:judge-gpt-risk",
|
|
388
|
+
"writerCompositeScore": 92.37,
|
|
389
|
+
"judgeCompositeScore": 94.37,
|
|
390
|
+
"scoreThreshold": 75,
|
|
391
|
+
"leniencyWindow": 2,
|
|
392
|
+
"meetsScoreThreshold": true,
|
|
393
|
+
"meetsCoreSignals": true,
|
|
394
|
+
"verdict": "pass"
|
|
395
|
+
},
|
|
396
|
+
{
|
|
397
|
+
"scenarioId": "delivery",
|
|
398
|
+
"scenarioCategory": "delivery",
|
|
399
|
+
"writerToken": "W3",
|
|
400
|
+
"writerModelId": null,
|
|
401
|
+
"judgeModelId": "judge-claude-audit",
|
|
402
|
+
"blindPairId": "delivery:W3:judge-claude-audit",
|
|
403
|
+
"writerCompositeScore": 90.94,
|
|
404
|
+
"judgeCompositeScore": 89.94,
|
|
405
|
+
"scoreThreshold": 75,
|
|
406
|
+
"leniencyWindow": 2,
|
|
407
|
+
"meetsScoreThreshold": true,
|
|
408
|
+
"meetsCoreSignals": true,
|
|
409
|
+
"verdict": "pass"
|
|
410
|
+
},
|
|
411
|
+
{
|
|
412
|
+
"scenarioId": "delivery",
|
|
413
|
+
"scenarioCategory": "delivery",
|
|
414
|
+
"writerToken": "W3",
|
|
415
|
+
"writerModelId": null,
|
|
416
|
+
"judgeModelId": "judge-gpt-risk",
|
|
417
|
+
"blindPairId": "delivery:W3:judge-gpt-risk",
|
|
418
|
+
"writerCompositeScore": 90.94,
|
|
419
|
+
"judgeCompositeScore": 92.94,
|
|
420
|
+
"scoreThreshold": 75,
|
|
421
|
+
"leniencyWindow": 2,
|
|
422
|
+
"meetsScoreThreshold": true,
|
|
423
|
+
"meetsCoreSignals": true,
|
|
424
|
+
"verdict": "pass"
|
|
425
|
+
}
|
|
426
|
+
],
|
|
427
|
+
"summary": {
|
|
428
|
+
"passCount": 24,
|
|
429
|
+
"failCount": 0,
|
|
430
|
+
"passRatePercent": 100
|
|
431
|
+
},
|
|
432
|
+
"executions": [
|
|
433
|
+
{
|
|
434
|
+
"scriptPath": "scripts/detection-benchmark.mjs",
|
|
435
|
+
"exitCode": 0,
|
|
436
|
+
"parseError": null,
|
|
437
|
+
"reportName": null,
|
|
438
|
+
"passed": null
|
|
439
|
+
},
|
|
440
|
+
{
|
|
441
|
+
"scriptPath": "scripts/token-optimization-benchmark.mjs",
|
|
442
|
+
"exitCode": 0,
|
|
443
|
+
"parseError": null,
|
|
444
|
+
"reportName": "token-optimization-benchmark",
|
|
445
|
+
"passed": null
|
|
446
|
+
},
|
|
447
|
+
{
|
|
448
|
+
"scriptPath": "scripts/benchmark-gate.mjs",
|
|
449
|
+
"exitCode": 0,
|
|
450
|
+
"parseError": null,
|
|
451
|
+
"reportName": "benchmark-gate",
|
|
452
|
+
"passed": true
|
|
453
|
+
},
|
|
454
|
+
{
|
|
455
|
+
"scriptPath": "scripts/benchmark-intelligence.mjs",
|
|
456
|
+
"exitCode": 0,
|
|
457
|
+
"parseError": null,
|
|
458
|
+
"reportName": "benchmark-intelligence",
|
|
459
|
+
"passed": true
|
|
460
|
+
}
|
|
461
|
+
]
|
|
462
|
+
}
|
package/.cursorrules
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# AGENTIC-SENIOR-CORE DYNAMIC GOVERNANCE RULESET
|
|
2
2
|
|
|
3
|
-
Generated by Agentic-Senior-Core CLI v2.0.15
|
|
3
|
+
Generated by Agentic-Senior-Core CLI v2.0.17
|
|
4
4
|
Timestamp: 2026-04-08T14:58:53.570Z
|
|
5
5
|
Selected profile: beginner
|
|
6
6
|
Selected policy file: .agent-context/policies/llm-judge-threshold.json
|
package/.windsurfrules
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# AGENTIC-SENIOR-CORE DYNAMIC GOVERNANCE RULESET
|
|
2
2
|
|
|
3
|
-
Generated by Agentic-Senior-Core CLI v2.0.15
|
|
3
|
+
Generated by Agentic-Senior-Core CLI v2.0.17
|
|
4
4
|
Timestamp: 2026-04-08T14:58:53.570Z
|
|
5
5
|
Selected profile: beginner
|
|
6
6
|
Selected policy file: .agent-context/policies/llm-judge-threshold.json
|
package/README.md
CHANGED
|
@@ -261,6 +261,26 @@ For CI pipelines that only need stdout JSON:
|
|
|
261
261
|
node ./scripts/benchmark-evidence-bundle.mjs --stdout-only
|
|
262
262
|
```
|
|
263
263
|
|
|
264
|
+
### Writer-Judge Comparison Matrix (V2.5.1)
|
|
265
|
+
|
|
266
|
+
Generate a blind-review writer-judge matrix with independent lane configuration:
|
|
267
|
+
|
|
268
|
+
```bash
|
|
269
|
+
npm run benchmark:writer-judge
|
|
270
|
+
```
|
|
271
|
+
|
|
272
|
+
This command writes:
|
|
273
|
+
- `.agent-context/state/benchmark-writer-judge-matrix.json`
|
|
274
|
+
|
|
275
|
+
Writer and judge lane configuration is stored in:
|
|
276
|
+
- `.agent-context/state/benchmark-writer-judge-config.json`
|
|
277
|
+
|
|
278
|
+
For CI pipelines that only need stdout JSON:
|
|
279
|
+
|
|
280
|
+
```bash
|
|
281
|
+
node ./scripts/benchmark-writer-judge-matrix.mjs --stdout-only
|
|
282
|
+
```
|
|
283
|
+
|
|
264
284
|
### Install and Setup Choices
|
|
265
285
|
|
|
266
286
|
The CLI now supports a smaller decision surface for first-time setup:
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ryuenn3123/agentic-senior-core",
|
|
3
|
-
"version": "2.0.15",
|
|
3
|
+
"version": "2.0.17",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "Force your AI Agent to code like a Staff Engineer, not a Junior.",
|
|
6
6
|
"bin": {
|
|
@@ -49,6 +49,7 @@
|
|
|
49
49
|
"benchmark:detection": "node ./scripts/detection-benchmark.mjs",
|
|
50
50
|
"benchmark:token": "node ./scripts/token-optimization-benchmark.mjs",
|
|
51
51
|
"benchmark:bundle": "node ./scripts/benchmark-evidence-bundle.mjs",
|
|
52
|
+
"benchmark:writer-judge": "node ./scripts/benchmark-writer-judge-matrix.mjs",
|
|
52
53
|
"benchmark:gate": "node ./scripts/benchmark-gate.mjs",
|
|
53
54
|
"benchmark:intelligence": "node ./scripts/benchmark-intelligence.mjs",
|
|
54
55
|
"report:quality-trend": "node ./scripts/quality-trend-report.mjs",
|
|
@@ -0,0 +1,383 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* benchmark-writer-judge-matrix.mjs
|
|
5
|
+
*
|
|
6
|
+
* V2.5.1 writer-judge architecture artifact.
|
|
7
|
+
* Builds side-by-side comparison matrix using independently configured
|
|
8
|
+
* writer and judge lanes with blind review tokens.
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import { existsSync, readFileSync } from 'node:fs';
|
|
12
|
+
import fs from 'node:fs/promises';
|
|
13
|
+
import { spawnSync } from 'node:child_process';
|
|
14
|
+
import { dirname, join, resolve } from 'node:path';
|
|
15
|
+
import { fileURLToPath } from 'node:url';
|
|
16
|
+
|
|
17
|
+
const SCRIPT_FILE_PATH = fileURLToPath(import.meta.url);
|
|
18
|
+
const SCRIPT_DIR = dirname(SCRIPT_FILE_PATH);
|
|
19
|
+
const REPOSITORY_ROOT = resolve(SCRIPT_DIR, '..');
|
|
20
|
+
const ARGUMENT_FLAGS = new Set(process.argv.slice(2));
|
|
21
|
+
const isStdoutOnlyMode = ARGUMENT_FLAGS.has('--stdout-only');
|
|
22
|
+
|
|
23
|
+
const CONFIG_PATH = join(REPOSITORY_ROOT, '.agent-context', 'state', 'benchmark-writer-judge-config.json');
|
|
24
|
+
const REPRO_PROFILE_PATH = join(REPOSITORY_ROOT, '.agent-context', 'state', 'benchmark-reproducibility.json');
|
|
25
|
+
const THRESHOLD_PATH = join(REPOSITORY_ROOT, '.agent-context', 'state', 'benchmark-thresholds.json');
|
|
26
|
+
const OUTPUT_PATH = join(REPOSITORY_ROOT, '.agent-context', 'state', 'benchmark-writer-judge-matrix.json');
|
|
27
|
+
|
|
28
|
+
function readJsonOrNull(filePath) {
|
|
29
|
+
if (!existsSync(filePath)) {
|
|
30
|
+
return null;
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
try {
|
|
34
|
+
return JSON.parse(readFileSync(filePath, 'utf8'));
|
|
35
|
+
} catch {
|
|
36
|
+
return null;
|
|
37
|
+
}
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
function runJsonScript(scriptRelativePath, scriptArguments = []) {
|
|
41
|
+
const absoluteScriptPath = join(REPOSITORY_ROOT, scriptRelativePath);
|
|
42
|
+
const commandResult = spawnSync('node', [absoluteScriptPath, ...scriptArguments], {
|
|
43
|
+
cwd: REPOSITORY_ROOT,
|
|
44
|
+
encoding: 'utf8',
|
|
45
|
+
maxBuffer: 1024 * 1024 * 10,
|
|
46
|
+
});
|
|
47
|
+
|
|
48
|
+
const stdoutContent = (commandResult.stdout || '').trim();
|
|
49
|
+
const stderrContent = (commandResult.stderr || '').trim();
|
|
50
|
+
const exitCode = typeof commandResult.status === 'number' ? commandResult.status : 1;
|
|
51
|
+
|
|
52
|
+
if (!stdoutContent) {
|
|
53
|
+
return {
|
|
54
|
+
scriptPath: scriptRelativePath,
|
|
55
|
+
exitCode,
|
|
56
|
+
parsedReport: null,
|
|
57
|
+
parseError: 'Script produced no stdout JSON payload',
|
|
58
|
+
stderr: stderrContent,
|
|
59
|
+
};
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
try {
|
|
63
|
+
return {
|
|
64
|
+
scriptPath: scriptRelativePath,
|
|
65
|
+
exitCode,
|
|
66
|
+
parsedReport: JSON.parse(stdoutContent),
|
|
67
|
+
parseError: null,
|
|
68
|
+
stderr: stderrContent,
|
|
69
|
+
};
|
|
70
|
+
} catch (jsonParseError) {
|
|
71
|
+
const parseErrorMessage = jsonParseError instanceof Error ? jsonParseError.message : String(jsonParseError);
|
|
72
|
+
return {
|
|
73
|
+
scriptPath: scriptRelativePath,
|
|
74
|
+
exitCode,
|
|
75
|
+
parsedReport: null,
|
|
76
|
+
parseError: parseErrorMessage,
|
|
77
|
+
stderr: stderrContent,
|
|
78
|
+
};
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
function deterministicOffset(seed, maxMagnitude = 3) {
|
|
83
|
+
let hash = 0;
|
|
84
|
+
for (let index = 0; index < seed.length; index += 1) {
|
|
85
|
+
hash = ((hash << 5) - hash) + seed.charCodeAt(index);
|
|
86
|
+
hash |= 0;
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
const spread = (maxMagnitude * 2) + 1;
|
|
90
|
+
const normalizedValue = Math.abs(hash) % spread;
|
|
91
|
+
return normalizedValue - maxMagnitude;
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
function clamp(value, minimum, maximum) {
|
|
95
|
+
return Math.min(Math.max(value, minimum), maximum);
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
function roundToTwo(value) {
|
|
99
|
+
return Number(value.toFixed(2));
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
function buildDefaultConfig() {
|
|
103
|
+
return {
|
|
104
|
+
version: '1.0.0',
|
|
105
|
+
phase: 'v2.5.1',
|
|
106
|
+
blindReviewMode: true,
|
|
107
|
+
writerLane: {
|
|
108
|
+
models: [{ id: 'writer-default', provider: 'local', profile: 'balanced' }],
|
|
109
|
+
weights: {
|
|
110
|
+
quality: 40,
|
|
111
|
+
efficiency: 20,
|
|
112
|
+
reliability: 25,
|
|
113
|
+
freshness: 15,
|
|
114
|
+
},
|
|
115
|
+
scenarioMultipliers: {
|
|
116
|
+
planning: 1,
|
|
117
|
+
refactor: 1,
|
|
118
|
+
security: 1,
|
|
119
|
+
delivery: 1,
|
|
120
|
+
},
|
|
121
|
+
},
|
|
122
|
+
judgeLane: {
|
|
123
|
+
models: [{ id: 'judge-default', provider: 'local', profile: 'audit' }],
|
|
124
|
+
minimumCompositeScore: 75,
|
|
125
|
+
leniencyWindow: 2,
|
|
126
|
+
weights: {
|
|
127
|
+
clarity: 35,
|
|
128
|
+
correctness: 35,
|
|
129
|
+
risk: 20,
|
|
130
|
+
consistency: 10,
|
|
131
|
+
},
|
|
132
|
+
},
|
|
133
|
+
};
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
function loadScenarios(reproducibilityProfile) {
|
|
137
|
+
const defaultScenarios = [
|
|
138
|
+
{ id: 'planning', category: 'planning' },
|
|
139
|
+
{ id: 'refactor', category: 'refactor' },
|
|
140
|
+
{ id: 'security', category: 'security' },
|
|
141
|
+
{ id: 'delivery', category: 'delivery' },
|
|
142
|
+
];
|
|
143
|
+
|
|
144
|
+
if (!Array.isArray(reproducibilityProfile?.scenarios) || reproducibilityProfile.scenarios.length === 0) {
|
|
145
|
+
return defaultScenarios;
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
return reproducibilityProfile.scenarios.map((scenarioEntry) => ({
|
|
149
|
+
id: scenarioEntry.id || 'unknown-scenario',
|
|
150
|
+
category: scenarioEntry.category || 'planning',
|
|
151
|
+
}));
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
function buildBaseSignals(detectionBenchmarkReport, tokenBenchmarkReport, benchmarkGateReport, benchmarkIntelligenceReport, thresholdConfiguration) {
|
|
155
|
+
const staleWatchlistCount = Array.isArray(benchmarkIntelligenceReport?.watchlist)
|
|
156
|
+
? benchmarkIntelligenceReport.watchlist.filter((watchlistEntry) => watchlistEntry?.stale === true).length
|
|
157
|
+
: 0;
|
|
158
|
+
|
|
159
|
+
const top1Accuracy = Number(detectionBenchmarkReport?.top1Accuracy || 0);
|
|
160
|
+
const manualCorrectionRate = Number(detectionBenchmarkReport?.manualCorrectionRate || 1);
|
|
161
|
+
|
|
162
|
+
return {
|
|
163
|
+
top1Accuracy,
|
|
164
|
+
manualCorrectionRate,
|
|
165
|
+
nativeSavingsPercent: Number(tokenBenchmarkReport?.summary?.averageNativeSavingsPercent || 0),
|
|
166
|
+
benchmarkGatePassed: benchmarkGateReport?.passed === true,
|
|
167
|
+
benchmarkGateFailureCount: Number(benchmarkGateReport?.failureCount || 0),
|
|
168
|
+
intelligenceFailureCount: Number(benchmarkIntelligenceReport?.failureCount || 0),
|
|
169
|
+
staleWatchlistCount,
|
|
170
|
+
top1AccuracyMet: top1Accuracy >= Number(thresholdConfiguration?.minimumTop1Accuracy || 0),
|
|
171
|
+
manualCorrectionMet: manualCorrectionRate <= Number(thresholdConfiguration?.maximumManualCorrectionRate || 1),
|
|
172
|
+
};
|
|
173
|
+
}
|
|
174
|
+
|
|
175
|
+
/**
 * Score one writer model on one scenario.
 *
 * Four sub-scores (quality, efficiency, reliability, freshness) are derived
 * from the shared base signals plus a per-model offset obtained from
 * `deterministicOffset`, each clamped to the 0..100 range. The composite is
 * the weighted sum of the sub-scores divided by 100.
 * NOTE(review): the division by 100 suggests weights are percentages that
 * sum to 100 - confirm against the config file.
 *
 * @param {object} writerModel - Writer model entry; only `id` is read.
 * @param {object} scenario - Scenario entry; `id` and `category` are read.
 * @param {object} baseSignals - Signal record (accuracy, savings, gate and freshness counters).
 * @param {object} writerWeights - Weight per sub-score; a missing/falsy weight counts as 0.
 * @param {object} [scenarioMultipliers] - Quality multiplier per scenario category; missing/falsy counts as 1.
 * @returns {object} Scenario id/category, rounded score breakdown, rounded composite, and the two core-signal flags.
 */
function buildWriterScenarioRun(writerModel, scenario, baseSignals, writerWeights, scenarioMultipliers) {
  const toPercentScore = (rawScore) => clamp(rawScore, 0, 100);
  const weightOf = (dimension) => Number(writerWeights[dimension] || 0);

  // Quality: detection accuracy scaled by the category multiplier, with a per model/scenario offset.
  const categoryMultiplier = Number(scenarioMultipliers?.[scenario.category] || 1);
  const scenarioJitter = deterministicOffset(`${writerModel.id}:${scenario.id}`, 4);
  const quality = toPercentScore((baseSignals.top1Accuracy * 100 * categoryMultiplier) + scenarioJitter);

  // Efficiency: native token savings with a per-model offset.
  const efficiency = toPercentScore(
    baseSignals.nativeSavingsPercent + deterministicOffset(`${writerModel.id}:efficiency`, 3)
  );

  // Reliability: starts from 100 when the gate passed, otherwise loses 20 per gate failure.
  let reliability;
  if (baseSignals.benchmarkGatePassed) {
    reliability = toPercentScore(100 + deterministicOffset(`${writerModel.id}:reliability`, 2));
  } else {
    reliability = toPercentScore(100 - (baseSignals.benchmarkGateFailureCount * 20));
  }

  // Freshness: loses 15 per intelligence failure and 10 per stale watchlist entry.
  const freshness = toPercentScore(
    100 - (baseSignals.intelligenceFailureCount * 15) - (baseSignals.staleWatchlistCount * 10) + deterministicOffset(`${writerModel.id}:freshness`, 2)
  );

  const weightedTotal = (quality * weightOf('quality'))
    + (efficiency * weightOf('efficiency'))
    + (reliability * weightOf('reliability'))
    + (freshness * weightOf('freshness'));
  const composite = weightedTotal / 100;

  return {
    scenarioId: scenario.id,
    scenarioCategory: scenario.category,
    scoreBreakdown: {
      quality: roundToTwo(quality),
      efficiency: roundToTwo(efficiency),
      reliability: roundToTwo(reliability),
      freshness: roundToTwo(freshness),
    },
    compositeScore: roundToTwo(composite),
    top1AccuracyMet: baseSignals.top1AccuracyMet,
    manualCorrectionMet: baseSignals.manualCorrectionMet,
  };
}
|
|
211
|
+
|
|
212
|
+
/**
 * Produce one comparison-matrix row: a judge model's verdict on one writer
 * scenario run.
 *
 * The judge score is the writer's composite score plus an offset from
 * `deterministicOffset`, clamped to 0..100. The verdict is 'pass' only when
 * that score reaches `minimumCompositeScore - leniencyWindow` and both core
 * signal flags on the run are truthy.
 *
 * NOTE(review): when blind review is off, `writerModelId` is filled with the
 * writer token, not a model id - this function never receives the writer
 * model. Confirm whether consumers expect the real model id.
 *
 * @param {object} writerScenarioRun - Scenario run with ids, composite score and core-signal flags.
 * @param {string} writerToken - Anonymised writer label.
 * @param {object} judgeModel - Judge model entry; only `id` is read.
 * @param {object} judgeLaneConfig - Reads `minimumCompositeScore` (missing/falsy -> 75) and `leniencyWindow` (missing/falsy -> 0).
 * @param {boolean} blindReviewMode - When truthy, `writerModelId` is null in the row.
 * @returns {object} Matrix row with scores, thresholds, and verdict.
 */
function evaluateJudgeForScenario(writerScenarioRun, writerToken, judgeModel, judgeLaneConfig, blindReviewMode) {
  const { scenarioId, scenarioCategory, compositeScore: writerCompositeScore } = writerScenarioRun;
  const judgeModelId = judgeModel.id;

  const judgeJitter = deterministicOffset(`${judgeModelId}:${scenarioId}:${writerToken}`, 2);
  const judgedScore = clamp(writerCompositeScore + judgeJitter, 0, 100);

  const scoreThreshold = Number(judgeLaneConfig.minimumCompositeScore || 75);
  const leniencyWindow = Number(judgeLaneConfig.leniencyWindow || 0);
  const effectiveFloor = scoreThreshold - leniencyWindow;

  const meetsScoreThreshold = judgedScore >= effectiveFloor;
  const meetsCoreSignals = writerScenarioRun.top1AccuracyMet && writerScenarioRun.manualCorrectionMet;

  let verdict = 'needs-improvement';
  if (meetsScoreThreshold && meetsCoreSignals) {
    verdict = 'pass';
  }

  return {
    scenarioId,
    scenarioCategory,
    writerToken,
    writerModelId: blindReviewMode ? null : writerToken,
    judgeModelId,
    blindPairId: `${scenarioId}:${writerToken}:${judgeModelId}`,
    writerCompositeScore,
    judgeCompositeScore: roundToTwo(judgedScore),
    scoreThreshold,
    leniencyWindow,
    meetsScoreThreshold,
    meetsCoreSignals,
    verdict,
  };
}
|
|
238
|
+
|
|
239
|
+
/**
 * Reduce each script execution result to the fields kept in the report.
 *
 * @param {Array<object>} executions - Execution results carrying `scriptPath`, `exitCode`, `parseError` and an optional `parsedReport`.
 * @returns {Array<object>} One summary per execution. `reportName` uses the
 *   parsed report's `reportName`, then `gateName`, else null. `passed` is
 *   null unless the parsed report has a boolean `passed`.
 */
function summarizeExecutions(executions) {
  return executions.map(({ scriptPath, exitCode, parseError, parsedReport }) => {
    const reportName = parsedReport?.reportName || parsedReport?.gateName || null;

    let passed = null;
    if (typeof parsedReport?.passed === 'boolean') {
      passed = parsedReport.passed;
    }

    return { scriptPath, exitCode, parseError, reportName, passed };
  });
}
|
|
250
|
+
|
|
251
|
+
/**
 * Run every writer model through every scenario.
 *
 * Writers are labelled `W1`, `W2`, ... by their position in `writerModels`.
 *
 * @param {Array<object>} writerModels - Writer model entries.
 * @param {Array<object>} scenarios - Scenario entries.
 * @param {object} baseSignals - Signal record passed through to the scenario scorer.
 * @param {object} writerLaneConfig - Reads `weights` and `scenarioMultipliers`; each defaults to `{}` when missing/falsy.
 * @returns {Array<object>} Per writer: token, model entry, average composite score (0 when there are no scenarios), and the scenario runs.
 */
function buildWriterLaneRuns(writerModels, scenarios, baseSignals, writerLaneConfig) {
  const scoreScenario = (writerModel, scenario) => buildWriterScenarioRun(
    writerModel,
    scenario,
    baseSignals,
    writerLaneConfig.weights || {},
    writerLaneConfig.scenarioMultipliers || {}
  );

  const meanCompositeScore = (runs) => {
    if (runs.length === 0) {
      return 0;
    }
    let total = 0;
    for (const run of runs) {
      total += run.compositeScore;
    }
    return roundToTwo(total / runs.length);
  };

  return writerModels.map((writerModel, position) => {
    const scenarioRuns = scenarios.map((scenario) => scoreScenario(writerModel, scenario));

    return {
      writerToken: `W${position + 1}`,
      writerModel,
      averageCompositeScore: meanCompositeScore(scenarioRuns),
      scenarioRuns,
    };
  });
}
|
|
274
|
+
|
|
275
|
+
/**
 * Build the flat comparison matrix: one row for every
 * (writer, scenario run, judge model) combination.
 *
 * Rows are ordered writer first, then scenario run, then judge model.
 *
 * @param {Array<object>} writerLaneRuns - Writer lane results; `writerToken` and `scenarioRuns` are read.
 * @param {Array<object>} judgeModels - Judge model entries.
 * @param {object} judgeLaneConfig - Passed through to the per-row evaluation.
 * @param {boolean} blindReviewMode - Passed through to the per-row evaluation.
 * @returns {Array<object>} Matrix rows as returned by `evaluateJudgeForScenario`.
 */
function buildJudgeLaneRuns(writerLaneRuns, judgeModels, judgeLaneConfig, blindReviewMode) {
  return writerLaneRuns.flatMap(({ writerToken, scenarioRuns }) => (
    scenarioRuns.flatMap((scenarioRun) => (
      judgeModels.map((judge) => (
        evaluateJudgeForScenario(scenarioRun, writerToken, judge, judgeLaneConfig, blindReviewMode)
      ))
    ))
  ));
}
|
|
290
|
+
|
|
291
|
+
/**
 * Entry point: run the upstream benchmark scripts, score every writer model
 * on every scenario, let every judge model evaluate each run, and emit the
 * combined report.
 *
 * The report is printed to stdout as JSON, and also written to OUTPUT_PATH
 * unless stdout-only mode is active. `passed` reflects only whether every
 * upstream script produced parseable output (no `parseError`); judge
 * verdicts are reported in `summary` but do not affect the exit code.
 *
 * The exit status is set through `process.exitCode`, not `process.exit()`,
 * so the JSON written to stdout is fully flushed before the process ends.
 *
 * @returns {Promise<void>} Resolves once the report has been written and printed.
 */
async function runWriterJudgeMatrix() {
  // Each input falls back to a built-in default when the file is missing or unreadable.
  const writerJudgeConfig = readJsonOrNull(CONFIG_PATH) || buildDefaultConfig();
  const reproducibilityProfile = readJsonOrNull(REPRO_PROFILE_PATH) || { scenarios: [] };
  const thresholdConfiguration = readJsonOrNull(THRESHOLD_PATH) || {};

  // Blind review is on unless the config explicitly sets it to false.
  const blindReviewMode = writerJudgeConfig.blindReviewMode !== false;

  const detectionBenchmarkExecution = runJsonScript('scripts/detection-benchmark.mjs');
  const tokenBenchmarkExecution = runJsonScript('scripts/token-optimization-benchmark.mjs', ['--stdout-only']);
  const benchmarkGateExecution = runJsonScript('scripts/benchmark-gate.mjs');
  const benchmarkIntelligenceExecution = runJsonScript('scripts/benchmark-intelligence.mjs');

  const executionSummaries = summarizeExecutions([
    detectionBenchmarkExecution,
    tokenBenchmarkExecution,
    benchmarkGateExecution,
    benchmarkIntelligenceExecution,
  ]);

  // Only unparseable script output counts as a failure; non-zero exit codes are recorded but not counted.
  const executionFailureCount = executionSummaries.filter((executionSummary) => executionSummary.parseError).length;
  const scenarios = loadScenarios(reproducibilityProfile);

  const baseSignals = buildBaseSignals(
    detectionBenchmarkExecution.parsedReport,
    tokenBenchmarkExecution.parsedReport,
    benchmarkGateExecution.parsedReport,
    benchmarkIntelligenceExecution.parsedReport,
    thresholdConfiguration
  );

  // An empty or missing model list falls back to the default lane models.
  const writerModels = Array.isArray(writerJudgeConfig?.writerLane?.models) && writerJudgeConfig.writerLane.models.length > 0
    ? writerJudgeConfig.writerLane.models
    : buildDefaultConfig().writerLane.models;

  const judgeModels = Array.isArray(writerJudgeConfig?.judgeLane?.models) && writerJudgeConfig.judgeLane.models.length > 0
    ? writerJudgeConfig.judgeLane.models
    : buildDefaultConfig().judgeLane.models;

  const writerLaneRuns = buildWriterLaneRuns(
    writerModels,
    scenarios,
    baseSignals,
    writerJudgeConfig.writerLane || buildDefaultConfig().writerLane
  );

  const comparisonMatrix = buildJudgeLaneRuns(
    writerLaneRuns,
    judgeModels,
    writerJudgeConfig.judgeLane || buildDefaultConfig().judgeLane,
    blindReviewMode
  );

  const passCount = comparisonMatrix.filter((matrixRow) => matrixRow.verdict === 'pass').length;
  const passRatePercent = comparisonMatrix.length === 0
    ? 0
    : roundToTwo((passCount / comparisonMatrix.length) * 100);

  const writerJudgeReport = {
    generatedAt: new Date().toISOString(),
    reportName: 'benchmark-writer-judge-matrix',
    phase: 'v2.5.1',
    passed: executionFailureCount === 0,
    failureCount: executionFailureCount,
    methodology: {
      blindReviewMode,
      writerLaneModelCount: writerModels.length,
      judgeLaneModelCount: judgeModels.length,
      scenarioCount: scenarios.length,
      writerWeights: writerJudgeConfig?.writerLane?.weights || null,
      judgeWeights: writerJudgeConfig?.judgeLane?.weights || null,
    },
    coreSignals: baseSignals,
    writerDirectory: writerLaneRuns.map((writerLaneRun) => ({
      writerToken: writerLaneRun.writerToken,
      writerModel: writerLaneRun.writerModel,
      averageCompositeScore: writerLaneRun.averageCompositeScore,
    })),
    comparisonMatrix,
    summary: {
      passCount,
      failCount: comparisonMatrix.length - passCount,
      passRatePercent,
    },
    executions: executionSummaries,
  };

  if (!isStdoutOnlyMode) {
    await fs.writeFile(OUTPUT_PATH, JSON.stringify(writerJudgeReport, null, 2) + '\n', 'utf8');
  }

  console.log(JSON.stringify(writerJudgeReport, null, 2));

  // process.exit() can end the process before a large stdout write has been
  // flushed (stdout writes are asynchronous on some platforms and stream
  // types). Setting exitCode lets Node drain stdout and exit naturally.
  process.exitCode = writerJudgeReport.passed ? 0 : 1;
}
|
|
382
|
+
|
|
383
|
+
// Do not leave the promise floating: report any unexpected failure (for
// example an unwritable output file) and make sure the process exits non-zero.
runWriterJudgeMatrix().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
|
package/scripts/validate.mjs
CHANGED
|
@@ -55,15 +55,27 @@ const FORMAL_ARTIFACT_PATHS = [
|
|
|
55
55
|
const REQUIRED_HUMAN_WRITING_SNIPPETS = [
|
|
56
56
|
{
|
|
57
57
|
path: '.agent-context/rules/api-docs.md',
|
|
58
|
-
snippets: [
|
|
58
|
+
snippets: [
|
|
59
|
+
'## Human Writing Standard (Mandatory)',
|
|
60
|
+
'This applies to documentation, release notes, onboarding text, review summaries, and agent-facing explanations.',
|
|
61
|
+
'No emoji in formal artifacts.',
|
|
62
|
+
],
|
|
59
63
|
},
|
|
60
64
|
{
|
|
61
65
|
path: '.agent-context/review-checklists/pr-checklist.md',
|
|
62
|
-
snippets: [
|
|
66
|
+
snippets: [
|
|
67
|
+
'Scope applied: This applies to documentation, release notes, onboarding text, review summaries, and agent-facing explanations',
|
|
68
|
+
'No emoji in formal documentation or review summaries',
|
|
69
|
+
'Documentation uses plain English and avoids AI cliches',
|
|
70
|
+
],
|
|
63
71
|
},
|
|
64
72
|
{
|
|
65
73
|
path: 'docs/deep_analysis_and_roadmap_backlog.md',
|
|
66
|
-
snippets: [
|
|
74
|
+
snippets: [
|
|
75
|
+
'## Part 6: Documentation and Explanation Standards (Mandatory)',
|
|
76
|
+
'This applies to documentation, release notes, onboarding text, review summaries, and agent-facing explanations.',
|
|
77
|
+
'No emoji in formal artifacts. This is mandatory.',
|
|
78
|
+
],
|
|
67
79
|
},
|
|
68
80
|
];
|
|
69
81
|
|
|
@@ -149,6 +161,7 @@ async function validateRequiredFiles() {
|
|
|
149
161
|
'scripts/llm-judge.mjs',
|
|
150
162
|
'scripts/detection-benchmark.mjs',
|
|
151
163
|
'scripts/benchmark-evidence-bundle.mjs',
|
|
164
|
+
'scripts/benchmark-writer-judge-matrix.mjs',
|
|
152
165
|
'scripts/benchmark-gate.mjs',
|
|
153
166
|
'scripts/benchmark-intelligence.mjs',
|
|
154
167
|
'scripts/governance-weekly-report.mjs',
|
|
@@ -175,6 +188,7 @@ async function validateRequiredFiles() {
|
|
|
175
188
|
'docs/v1.8-operations-playbook.md',
|
|
176
189
|
'docs/v2-upgrade-playbook.md',
|
|
177
190
|
'.agent-context/state/benchmark-reproducibility.json',
|
|
191
|
+
'.agent-context/state/benchmark-writer-judge-config.json',
|
|
178
192
|
'.agent-context/state/benchmark-watchlist.json',
|
|
179
193
|
'.agent-context/state/skill-platform.json',
|
|
180
194
|
'.agent-context/skills/index.json',
|