@ryuenn3123/agentic-senior-core 2.0.15 → 2.0.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13,6 +13,7 @@ Run a comprehensive code review on the current codebase (or the files I'm about
13
13
  Use these checklists:
14
14
  1. Read .agent-context/review-checklists/pr-checklist.md — apply every item.
15
15
  2. Read .agent-context/review-checklists/security-audit.md — apply every item.
16
+ 3. Apply documentation scope rules exactly: This applies to documentation, release notes, onboarding text, review summaries, and agent-facing explanations.
16
17
 
17
18
  For EVERY violation found:
18
19
  - State the exact file and line
@@ -91,6 +91,7 @@ VERDICT: PASS / FAIL (X/Y items passed)
91
91
  - [ ] `.env` not committed
92
92
 
93
93
  ### 10. Documentation
94
+ - [ ] Scope applied: This applies to documentation, release notes, onboarding text, review summaries, and agent-facing explanations
94
95
  - [ ] API endpoints have OpenAPI/Swagger documentation
95
96
  - [ ] Complex business logic has comments explaining WHY
96
97
  - [ ] Public functions/methods have JSDoc/docstrings
@@ -27,7 +27,8 @@ REQUIRED: Documentation MUST be updated in the SAME commit as the endpoint chang
27
27
 
28
28
  ## Human Writing Standard (Mandatory)
29
29
 
30
- This standard applies to API docs, README updates, release notes, and technical explanations.
30
+ This applies to documentation, release notes, onboarding text, review summaries, and agent-facing explanations.
31
+ API docs and README updates are included in this scope.
31
32
 
32
33
  ### Style Baseline
33
34
  1. Write for native English speakers.
@@ -35,18 +36,20 @@ This standard applies to API docs, README updates, release notes, and technical
35
36
  3. Use clear, direct, plain language.
36
37
  4. Keep sentence rhythm natural with short and medium sentences.
37
38
  5. Sound confident, practical, and conversational.
39
+ 6. State the main point first, then supporting detail.
38
40
 
39
41
  ### Required Behavior
40
42
  1. Explain decisions the way a competent coworker would explain them out loud.
41
43
  2. Cut unnecessary words and remove filler.
42
44
  3. Use concrete verbs and everyday phrasing.
43
45
  4. Rewrite and reorder content when flow is weak.
46
+ 5. Keep explanations short by default; expand only when complexity requires it.
44
47
 
45
- ### Hard Bans
48
+ ### Non-Negotiables
46
49
  1. No emoji in formal artifacts.
47
- 2. No AI cliches: delve, leverage, robust, utilize, seamless.
48
- 3. No inflated, academic, or performative language.
49
- 4. No padding, hedging, or redundant phrasing.
50
+ 2. Avoid AI cliches and buzzwords: delve, leverage, robust, utilize, seamless.
51
+ 3. Avoid inflated, academic, or performative language.
52
+ 4. Avoid padding, hedging, and redundant phrasing.
50
53
 
51
54
  ### Critical Controls
52
55
  1. Any claim about quality, performance, or reliability must include a measurable source and timestamp.
@@ -73,13 +73,15 @@
73
73
  "Run npm run benchmark:detection to regenerate detection benchmark output.",
74
74
  "Run npm run benchmark:gate to validate benchmark anti-regression thresholds.",
75
75
  "Run npm run benchmark:intelligence to validate benchmark watchlist freshness.",
76
- "Run npm run benchmark:bundle to emit a reproducible benchmark evidence bundle."
76
+ "Run npm run benchmark:bundle to emit a reproducible benchmark evidence bundle.",
77
+ "Run npm run benchmark:writer-judge to emit writer-judge side-by-side matrix output."
77
78
  ],
78
79
  "commandExamples": [
79
80
  "npm run benchmark:detection",
80
81
  "npm run benchmark:gate",
81
82
  "npm run benchmark:intelligence",
82
83
  "npm run benchmark:bundle",
84
+ "npm run benchmark:writer-judge",
83
85
  "node ./scripts/benchmark-evidence-bundle.mjs --stdout-only"
84
86
  ]
85
87
  }
@@ -0,0 +1,58 @@
1
+ {
2
+ "version": "1.0.0",
3
+ "phase": "v2.5.1",
4
+ "blindReviewMode": true,
5
+ "writerLane": {
6
+ "models": [
7
+ {
8
+ "id": "writer-copilot-balanced",
9
+ "provider": "github-copilot",
10
+ "profile": "balanced"
11
+ },
12
+ {
13
+ "id": "writer-claude-architect",
14
+ "provider": "anthropic",
15
+ "profile": "architect"
16
+ },
17
+ {
18
+ "id": "writer-gemini-ops",
19
+ "provider": "google",
20
+ "profile": "operations"
21
+ }
22
+ ],
23
+ "weights": {
24
+ "quality": 40,
25
+ "efficiency": 20,
26
+ "reliability": 25,
27
+ "freshness": 15
28
+ },
29
+ "scenarioMultipliers": {
30
+ "planning": 1,
31
+ "refactor": 1.02,
32
+ "security": 1.03,
33
+ "delivery": 0.98
34
+ }
35
+ },
36
+ "judgeLane": {
37
+ "models": [
38
+ {
39
+ "id": "judge-claude-audit",
40
+ "provider": "anthropic",
41
+ "profile": "audit"
42
+ },
43
+ {
44
+ "id": "judge-gpt-risk",
45
+ "provider": "openai",
46
+ "profile": "risk"
47
+ }
48
+ ],
49
+ "minimumCompositeScore": 75,
50
+ "leniencyWindow": 2,
51
+ "weights": {
52
+ "clarity": 35,
53
+ "correctness": 35,
54
+ "risk": 20,
55
+ "consistency": 10
56
+ }
57
+ }
58
+ }
@@ -0,0 +1,462 @@
1
+ {
2
+ "generatedAt": "2026-04-14T06:57:14.623Z",
3
+ "reportName": "benchmark-writer-judge-matrix",
4
+ "phase": "v2.5.1",
5
+ "passed": true,
6
+ "failureCount": 0,
7
+ "methodology": {
8
+ "blindReviewMode": true,
9
+ "writerLaneModelCount": 3,
10
+ "judgeLaneModelCount": 2,
11
+ "scenarioCount": 4,
12
+ "writerWeights": {
13
+ "quality": 40,
14
+ "efficiency": 20,
15
+ "reliability": 25,
16
+ "freshness": 15
17
+ },
18
+ "judgeWeights": {
19
+ "clarity": 35,
20
+ "correctness": 35,
21
+ "risk": 20,
22
+ "consistency": 10
23
+ }
24
+ },
25
+ "coreSignals": {
26
+ "top1Accuracy": 0.9167,
27
+ "manualCorrectionRate": 0.0833,
28
+ "nativeSavingsPercent": 81.52,
29
+ "benchmarkGatePassed": true,
30
+ "benchmarkGateFailureCount": 0,
31
+ "intelligenceFailureCount": 0,
32
+ "staleWatchlistCount": 0,
33
+ "top1AccuracyMet": true,
34
+ "manualCorrectionMet": true
35
+ },
36
+ "writerDirectory": [
37
+ {
38
+ "writerToken": "W1",
39
+ "writerModel": {
40
+ "id": "writer-copilot-balanced",
41
+ "provider": "github-copilot",
42
+ "profile": "balanced"
43
+ },
44
+ "averageCompositeScore": 93.7
45
+ },
46
+ {
47
+ "writerToken": "W2",
48
+ "writerModel": {
49
+ "id": "writer-claude-architect",
50
+ "provider": "anthropic",
51
+ "profile": "architect"
52
+ },
53
+ "averageCompositeScore": 92.8
54
+ },
55
+ {
56
+ "writerToken": "W3",
57
+ "writerModel": {
58
+ "id": "writer-gemini-ops",
59
+ "provider": "google",
60
+ "profile": "operations"
61
+ },
62
+ "averageCompositeScore": 92.65
63
+ }
64
+ ],
65
+ "comparisonMatrix": [
66
+ {
67
+ "scenarioId": "planning",
68
+ "scenarioCategory": "planning",
69
+ "writerToken": "W1",
70
+ "writerModelId": null,
71
+ "judgeModelId": "judge-claude-audit",
72
+ "blindPairId": "planning:W1:judge-claude-audit",
73
+ "writerCompositeScore": 92.12,
74
+ "judgeCompositeScore": 94.12,
75
+ "scoreThreshold": 75,
76
+ "leniencyWindow": 2,
77
+ "meetsScoreThreshold": true,
78
+ "meetsCoreSignals": true,
79
+ "verdict": "pass"
80
+ },
81
+ {
82
+ "scenarioId": "planning",
83
+ "scenarioCategory": "planning",
84
+ "writerToken": "W1",
85
+ "writerModelId": null,
86
+ "judgeModelId": "judge-gpt-risk",
87
+ "blindPairId": "planning:W1:judge-gpt-risk",
88
+ "writerCompositeScore": 92.12,
89
+ "judgeCompositeScore": 94.12,
90
+ "scoreThreshold": 75,
91
+ "leniencyWindow": 2,
92
+ "meetsScoreThreshold": true,
93
+ "meetsCoreSignals": true,
94
+ "verdict": "pass"
95
+ },
96
+ {
97
+ "scenarioId": "refactor",
98
+ "scenarioCategory": "refactor",
99
+ "writerToken": "W1",
100
+ "writerModelId": null,
101
+ "judgeModelId": "judge-claude-audit",
102
+ "blindPairId": "refactor:W1:judge-claude-audit",
103
+ "writerCompositeScore": 95.26,
104
+ "judgeCompositeScore": 93.26,
105
+ "scoreThreshold": 75,
106
+ "leniencyWindow": 2,
107
+ "meetsScoreThreshold": true,
108
+ "meetsCoreSignals": true,
109
+ "verdict": "pass"
110
+ },
111
+ {
112
+ "scenarioId": "refactor",
113
+ "scenarioCategory": "refactor",
114
+ "writerToken": "W1",
115
+ "writerModelId": null,
116
+ "judgeModelId": "judge-gpt-risk",
117
+ "blindPairId": "refactor:W1:judge-gpt-risk",
118
+ "writerCompositeScore": 95.26,
119
+ "judgeCompositeScore": 93.26,
120
+ "scoreThreshold": 75,
121
+ "leniencyWindow": 2,
122
+ "meetsScoreThreshold": true,
123
+ "meetsCoreSignals": true,
124
+ "verdict": "pass"
125
+ },
126
+ {
127
+ "scenarioId": "security",
128
+ "scenarioCategory": "security",
129
+ "writerToken": "W1",
130
+ "writerModelId": null,
131
+ "judgeModelId": "judge-claude-audit",
132
+ "blindPairId": "security:W1:judge-claude-audit",
133
+ "writerCompositeScore": 94.42,
134
+ "judgeCompositeScore": 92.42,
135
+ "scoreThreshold": 75,
136
+ "leniencyWindow": 2,
137
+ "meetsScoreThreshold": true,
138
+ "meetsCoreSignals": true,
139
+ "verdict": "pass"
140
+ },
141
+ {
142
+ "scenarioId": "security",
143
+ "scenarioCategory": "security",
144
+ "writerToken": "W1",
145
+ "writerModelId": null,
146
+ "judgeModelId": "judge-gpt-risk",
147
+ "blindPairId": "security:W1:judge-gpt-risk",
148
+ "writerCompositeScore": 94.42,
149
+ "judgeCompositeScore": 93.42,
150
+ "scoreThreshold": 75,
151
+ "leniencyWindow": 2,
152
+ "meetsScoreThreshold": true,
153
+ "meetsCoreSignals": true,
154
+ "verdict": "pass"
155
+ },
156
+ {
157
+ "scenarioId": "delivery",
158
+ "scenarioCategory": "delivery",
159
+ "writerToken": "W1",
160
+ "writerModelId": null,
161
+ "judgeModelId": "judge-claude-audit",
162
+ "blindPairId": "delivery:W1:judge-claude-audit",
163
+ "writerCompositeScore": 92.99,
164
+ "judgeCompositeScore": 93.99,
165
+ "scoreThreshold": 75,
166
+ "leniencyWindow": 2,
167
+ "meetsScoreThreshold": true,
168
+ "meetsCoreSignals": true,
169
+ "verdict": "pass"
170
+ },
171
+ {
172
+ "scenarioId": "delivery",
173
+ "scenarioCategory": "delivery",
174
+ "writerToken": "W1",
175
+ "writerModelId": null,
176
+ "judgeModelId": "judge-gpt-risk",
177
+ "blindPairId": "delivery:W1:judge-gpt-risk",
178
+ "writerCompositeScore": 92.99,
179
+ "judgeCompositeScore": 92.99,
180
+ "scoreThreshold": 75,
181
+ "leniencyWindow": 2,
182
+ "meetsScoreThreshold": true,
183
+ "meetsCoreSignals": true,
184
+ "verdict": "pass"
185
+ },
186
+ {
187
+ "scenarioId": "planning",
188
+ "scenarioCategory": "planning",
189
+ "writerToken": "W2",
190
+ "writerModelId": null,
191
+ "judgeModelId": "judge-claude-audit",
192
+ "blindPairId": "planning:W2:judge-claude-audit",
193
+ "writerCompositeScore": 93.12,
194
+ "judgeCompositeScore": 94.12,
195
+ "scoreThreshold": 75,
196
+ "leniencyWindow": 2,
197
+ "meetsScoreThreshold": true,
198
+ "meetsCoreSignals": true,
199
+ "verdict": "pass"
200
+ },
201
+ {
202
+ "scenarioId": "planning",
203
+ "scenarioCategory": "planning",
204
+ "writerToken": "W2",
205
+ "writerModelId": null,
206
+ "judgeModelId": "judge-gpt-risk",
207
+ "blindPairId": "planning:W2:judge-gpt-risk",
208
+ "writerCompositeScore": 93.12,
209
+ "judgeCompositeScore": 94.12,
210
+ "scoreThreshold": 75,
211
+ "leniencyWindow": 2,
212
+ "meetsScoreThreshold": true,
213
+ "meetsCoreSignals": true,
214
+ "verdict": "pass"
215
+ },
216
+ {
217
+ "scenarioId": "refactor",
218
+ "scenarioCategory": "refactor",
219
+ "writerToken": "W2",
220
+ "writerModelId": null,
221
+ "judgeModelId": "judge-claude-audit",
222
+ "blindPairId": "refactor:W2:judge-claude-audit",
223
+ "writerCompositeScore": 93.06,
224
+ "judgeCompositeScore": 95.06,
225
+ "scoreThreshold": 75,
226
+ "leniencyWindow": 2,
227
+ "meetsScoreThreshold": true,
228
+ "meetsCoreSignals": true,
229
+ "verdict": "pass"
230
+ },
231
+ {
232
+ "scenarioId": "refactor",
233
+ "scenarioCategory": "refactor",
234
+ "writerToken": "W2",
235
+ "writerModelId": null,
236
+ "judgeModelId": "judge-gpt-risk",
237
+ "blindPairId": "refactor:W2:judge-gpt-risk",
238
+ "writerCompositeScore": 93.06,
239
+ "judgeCompositeScore": 95.06,
240
+ "scoreThreshold": 75,
241
+ "leniencyWindow": 2,
242
+ "meetsScoreThreshold": true,
243
+ "meetsCoreSignals": true,
244
+ "verdict": "pass"
245
+ },
246
+ {
247
+ "scenarioId": "security",
248
+ "scenarioCategory": "security",
249
+ "writerToken": "W2",
250
+ "writerModelId": null,
251
+ "judgeModelId": "judge-claude-audit",
252
+ "blindPairId": "security:W2:judge-claude-audit",
253
+ "writerCompositeScore": 91.82,
254
+ "judgeCompositeScore": 90.82,
255
+ "scoreThreshold": 75,
256
+ "leniencyWindow": 2,
257
+ "meetsScoreThreshold": true,
258
+ "meetsCoreSignals": true,
259
+ "verdict": "pass"
260
+ },
261
+ {
262
+ "scenarioId": "security",
263
+ "scenarioCategory": "security",
264
+ "writerToken": "W2",
265
+ "writerModelId": null,
266
+ "judgeModelId": "judge-gpt-risk",
267
+ "blindPairId": "security:W2:judge-gpt-risk",
268
+ "writerCompositeScore": 91.82,
269
+ "judgeCompositeScore": 89.82,
270
+ "scoreThreshold": 75,
271
+ "leniencyWindow": 2,
272
+ "meetsScoreThreshold": true,
273
+ "meetsCoreSignals": true,
274
+ "verdict": "pass"
275
+ },
276
+ {
277
+ "scenarioId": "delivery",
278
+ "scenarioCategory": "delivery",
279
+ "writerToken": "W2",
280
+ "writerModelId": null,
281
+ "judgeModelId": "judge-claude-audit",
282
+ "blindPairId": "delivery:W2:judge-claude-audit",
283
+ "writerCompositeScore": 93.19,
284
+ "judgeCompositeScore": 93.19,
285
+ "scoreThreshold": 75,
286
+ "leniencyWindow": 2,
287
+ "meetsScoreThreshold": true,
288
+ "meetsCoreSignals": true,
289
+ "verdict": "pass"
290
+ },
291
+ {
292
+ "scenarioId": "delivery",
293
+ "scenarioCategory": "delivery",
294
+ "writerToken": "W2",
295
+ "writerModelId": null,
296
+ "judgeModelId": "judge-gpt-risk",
297
+ "blindPairId": "delivery:W2:judge-gpt-risk",
298
+ "writerCompositeScore": 93.19,
299
+ "judgeCompositeScore": 94.19,
300
+ "scoreThreshold": 75,
301
+ "leniencyWindow": 2,
302
+ "meetsScoreThreshold": true,
303
+ "meetsCoreSignals": true,
304
+ "verdict": "pass"
305
+ },
306
+ {
307
+ "scenarioId": "planning",
308
+ "scenarioCategory": "planning",
309
+ "writerToken": "W3",
310
+ "writerModelId": null,
311
+ "judgeModelId": "judge-claude-audit",
312
+ "blindPairId": "planning:W3:judge-claude-audit",
313
+ "writerCompositeScore": 93.27,
314
+ "judgeCompositeScore": 93.27,
315
+ "scoreThreshold": 75,
316
+ "leniencyWindow": 2,
317
+ "meetsScoreThreshold": true,
318
+ "meetsCoreSignals": true,
319
+ "verdict": "pass"
320
+ },
321
+ {
322
+ "scenarioId": "planning",
323
+ "scenarioCategory": "planning",
324
+ "writerToken": "W3",
325
+ "writerModelId": null,
326
+ "judgeModelId": "judge-gpt-risk",
327
+ "blindPairId": "planning:W3:judge-gpt-risk",
328
+ "writerCompositeScore": 93.27,
329
+ "judgeCompositeScore": 93.27,
330
+ "scoreThreshold": 75,
331
+ "leniencyWindow": 2,
332
+ "meetsScoreThreshold": true,
333
+ "meetsCoreSignals": true,
334
+ "verdict": "pass"
335
+ },
336
+ {
337
+ "scenarioId": "refactor",
338
+ "scenarioCategory": "refactor",
339
+ "writerToken": "W3",
340
+ "writerModelId": null,
341
+ "judgeModelId": "judge-claude-audit",
342
+ "blindPairId": "refactor:W3:judge-claude-audit",
343
+ "writerCompositeScore": 94.01,
344
+ "judgeCompositeScore": 95.01,
345
+ "scoreThreshold": 75,
346
+ "leniencyWindow": 2,
347
+ "meetsScoreThreshold": true,
348
+ "meetsCoreSignals": true,
349
+ "verdict": "pass"
350
+ },
351
+ {
352
+ "scenarioId": "refactor",
353
+ "scenarioCategory": "refactor",
354
+ "writerToken": "W3",
355
+ "writerModelId": null,
356
+ "judgeModelId": "judge-gpt-risk",
357
+ "blindPairId": "refactor:W3:judge-gpt-risk",
358
+ "writerCompositeScore": 94.01,
359
+ "judgeCompositeScore": 95.01,
360
+ "scoreThreshold": 75,
361
+ "leniencyWindow": 2,
362
+ "meetsScoreThreshold": true,
363
+ "meetsCoreSignals": true,
364
+ "verdict": "pass"
365
+ },
366
+ {
367
+ "scenarioId": "security",
368
+ "scenarioCategory": "security",
369
+ "writerToken": "W3",
370
+ "writerModelId": null,
371
+ "judgeModelId": "judge-claude-audit",
372
+ "blindPairId": "security:W3:judge-claude-audit",
373
+ "writerCompositeScore": 92.37,
374
+ "judgeCompositeScore": 92.37,
375
+ "scoreThreshold": 75,
376
+ "leniencyWindow": 2,
377
+ "meetsScoreThreshold": true,
378
+ "meetsCoreSignals": true,
379
+ "verdict": "pass"
380
+ },
381
+ {
382
+ "scenarioId": "security",
383
+ "scenarioCategory": "security",
384
+ "writerToken": "W3",
385
+ "writerModelId": null,
386
+ "judgeModelId": "judge-gpt-risk",
387
+ "blindPairId": "security:W3:judge-gpt-risk",
388
+ "writerCompositeScore": 92.37,
389
+ "judgeCompositeScore": 94.37,
390
+ "scoreThreshold": 75,
391
+ "leniencyWindow": 2,
392
+ "meetsScoreThreshold": true,
393
+ "meetsCoreSignals": true,
394
+ "verdict": "pass"
395
+ },
396
+ {
397
+ "scenarioId": "delivery",
398
+ "scenarioCategory": "delivery",
399
+ "writerToken": "W3",
400
+ "writerModelId": null,
401
+ "judgeModelId": "judge-claude-audit",
402
+ "blindPairId": "delivery:W3:judge-claude-audit",
403
+ "writerCompositeScore": 90.94,
404
+ "judgeCompositeScore": 89.94,
405
+ "scoreThreshold": 75,
406
+ "leniencyWindow": 2,
407
+ "meetsScoreThreshold": true,
408
+ "meetsCoreSignals": true,
409
+ "verdict": "pass"
410
+ },
411
+ {
412
+ "scenarioId": "delivery",
413
+ "scenarioCategory": "delivery",
414
+ "writerToken": "W3",
415
+ "writerModelId": null,
416
+ "judgeModelId": "judge-gpt-risk",
417
+ "blindPairId": "delivery:W3:judge-gpt-risk",
418
+ "writerCompositeScore": 90.94,
419
+ "judgeCompositeScore": 92.94,
420
+ "scoreThreshold": 75,
421
+ "leniencyWindow": 2,
422
+ "meetsScoreThreshold": true,
423
+ "meetsCoreSignals": true,
424
+ "verdict": "pass"
425
+ }
426
+ ],
427
+ "summary": {
428
+ "passCount": 24,
429
+ "failCount": 0,
430
+ "passRatePercent": 100
431
+ },
432
+ "executions": [
433
+ {
434
+ "scriptPath": "scripts/detection-benchmark.mjs",
435
+ "exitCode": 0,
436
+ "parseError": null,
437
+ "reportName": null,
438
+ "passed": null
439
+ },
440
+ {
441
+ "scriptPath": "scripts/token-optimization-benchmark.mjs",
442
+ "exitCode": 0,
443
+ "parseError": null,
444
+ "reportName": "token-optimization-benchmark",
445
+ "passed": null
446
+ },
447
+ {
448
+ "scriptPath": "scripts/benchmark-gate.mjs",
449
+ "exitCode": 0,
450
+ "parseError": null,
451
+ "reportName": "benchmark-gate",
452
+ "passed": true
453
+ },
454
+ {
455
+ "scriptPath": "scripts/benchmark-intelligence.mjs",
456
+ "exitCode": 0,
457
+ "parseError": null,
458
+ "reportName": "benchmark-intelligence",
459
+ "passed": true
460
+ }
461
+ ]
462
+ }
package/.cursorrules CHANGED
@@ -1,6 +1,6 @@
1
1
  # AGENTIC-SENIOR-CORE DYNAMIC GOVERNANCE RULESET
2
2
 
3
- Generated by Agentic-Senior-Core CLI v2.0.15
3
+ Generated by Agentic-Senior-Core CLI v2.0.17
4
4
  Timestamp: 2026-04-08T14:58:53.570Z
5
5
  Selected profile: beginner
6
6
  Selected policy file: .agent-context/policies/llm-judge-threshold.json
package/.windsurfrules CHANGED
@@ -1,6 +1,6 @@
1
1
  # AGENTIC-SENIOR-CORE DYNAMIC GOVERNANCE RULESET
2
2
 
3
- Generated by Agentic-Senior-Core CLI v2.0.15
3
+ Generated by Agentic-Senior-Core CLI v2.0.17
4
4
  Timestamp: 2026-04-08T14:58:53.570Z
5
5
  Selected profile: beginner
6
6
  Selected policy file: .agent-context/policies/llm-judge-threshold.json
package/README.md CHANGED
@@ -261,6 +261,26 @@ For CI pipelines that only need stdout JSON:
261
261
  node ./scripts/benchmark-evidence-bundle.mjs --stdout-only
262
262
  ```
263
263
 
264
+ ### Writer-Judge Comparison Matrix (V2.5.1)
265
+
266
+ Generate a blind-review writer-judge matrix with independent lane configuration:
267
+
268
+ ```bash
269
+ npm run benchmark:writer-judge
270
+ ```
271
+
272
+ This command writes:
273
+ - `.agent-context/state/benchmark-writer-judge-matrix.json`
274
+
275
+ Writer and judge lane configuration is stored in:
276
+ - `.agent-context/state/benchmark-writer-judge-config.json`
277
+
278
+ For CI pipelines that only need stdout JSON:
279
+
280
+ ```bash
281
+ node ./scripts/benchmark-writer-judge-matrix.mjs --stdout-only
282
+ ```
283
+
264
284
  ### Install and Setup Choices
265
285
 
266
286
  The CLI now supports a smaller decision surface for first-time setup:
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ryuenn3123/agentic-senior-core",
3
- "version": "2.0.15",
3
+ "version": "2.0.17",
4
4
  "type": "module",
5
5
  "description": "Force your AI Agent to code like a Staff Engineer, not a Junior.",
6
6
  "bin": {
@@ -49,6 +49,7 @@
49
49
  "benchmark:detection": "node ./scripts/detection-benchmark.mjs",
50
50
  "benchmark:token": "node ./scripts/token-optimization-benchmark.mjs",
51
51
  "benchmark:bundle": "node ./scripts/benchmark-evidence-bundle.mjs",
52
+ "benchmark:writer-judge": "node ./scripts/benchmark-writer-judge-matrix.mjs",
52
53
  "benchmark:gate": "node ./scripts/benchmark-gate.mjs",
53
54
  "benchmark:intelligence": "node ./scripts/benchmark-intelligence.mjs",
54
55
  "report:quality-trend": "node ./scripts/quality-trend-report.mjs",
@@ -0,0 +1,383 @@
1
+ #!/usr/bin/env node
2
+
3
+ /**
4
+ * benchmark-writer-judge-matrix.mjs
5
+ *
6
+ * V2.5.1 writer-judge architecture artifact.
7
+ * Builds side-by-side comparison matrix using independently configured
8
+ * writer and judge lanes with blind review tokens.
9
+ */
10
+
11
+ import { existsSync, readFileSync } from 'node:fs';
12
+ import fs from 'node:fs/promises';
13
+ import { spawnSync } from 'node:child_process';
14
+ import { dirname, join, resolve } from 'node:path';
15
+ import { fileURLToPath } from 'node:url';
16
+
17
+ const SCRIPT_FILE_PATH = fileURLToPath(import.meta.url);
18
+ const SCRIPT_DIR = dirname(SCRIPT_FILE_PATH);
19
+ const REPOSITORY_ROOT = resolve(SCRIPT_DIR, '..');
20
+ const ARGUMENT_FLAGS = new Set(process.argv.slice(2));
21
+ const isStdoutOnlyMode = ARGUMENT_FLAGS.has('--stdout-only');
22
+
23
+ const CONFIG_PATH = join(REPOSITORY_ROOT, '.agent-context', 'state', 'benchmark-writer-judge-config.json');
24
+ const REPRO_PROFILE_PATH = join(REPOSITORY_ROOT, '.agent-context', 'state', 'benchmark-reproducibility.json');
25
+ const THRESHOLD_PATH = join(REPOSITORY_ROOT, '.agent-context', 'state', 'benchmark-thresholds.json');
26
+ const OUTPUT_PATH = join(REPOSITORY_ROOT, '.agent-context', 'state', 'benchmark-writer-judge-matrix.json');
27
+
28
/**
 * Load and parse a JSON file from disk.
 *
 * @param {string} filePath - Path of the JSON file to read.
 * @returns {*} The parsed JSON value, or null when the file is missing,
 *   unreadable, or not valid JSON.
 */
function readJsonOrNull(filePath) {
  if (existsSync(filePath)) {
    try {
      const rawText = readFileSync(filePath, 'utf8');
      return JSON.parse(rawText);
    } catch {
      // Read and parse failures are treated the same as a missing file.
    }
  }

  return null;
}
39
+
40
/**
 * Run a repository script as a child Node process and parse its stdout as JSON.
 *
 * The child is started with the same Node binary that runs this script
 * (process.execPath), so it does not depend on whichever `node` is on PATH.
 *
 * @param {string} scriptRelativePath - Script path relative to the repository root.
 * @param {string[]} [scriptArguments=[]] - Extra CLI arguments for the script.
 * @returns {{scriptPath: string, exitCode: number, parsedReport: *, parseError: (string|null), stderr: string}}
 *   parsedReport holds the parsed stdout JSON; parseError is non-null when the
 *   script could not be started, produced no stdout, or produced invalid JSON.
 */
function runJsonScript(scriptRelativePath, scriptArguments = []) {
  const absoluteScriptPath = join(REPOSITORY_ROOT, scriptRelativePath);
  const commandResult = spawnSync(process.execPath, [absoluteScriptPath, ...scriptArguments], {
    cwd: REPOSITORY_ROOT,
    encoding: 'utf8',
    maxBuffer: 1024 * 1024 * 10,
  });

  const stdoutContent = (commandResult.stdout || '').trim();
  const stderrContent = (commandResult.stderr || '').trim();
  // status is null when the child was killed by a signal or never started.
  const exitCode = typeof commandResult.status === 'number' ? commandResult.status : 1;

  const buildResult = (parsedReport, parseError) => ({
    scriptPath: scriptRelativePath,
    exitCode,
    parsedReport,
    parseError,
    stderr: stderrContent,
  });

  // spawnSync reports spawn failures and maxBuffer overflows through `error`;
  // any captured stdout is unreliable in that case, so surface the real cause.
  if (commandResult.error) {
    const spawnErrorMessage = commandResult.error instanceof Error
      ? commandResult.error.message
      : String(commandResult.error);
    return buildResult(null, `Script execution failed: ${spawnErrorMessage}`);
  }

  if (!stdoutContent) {
    return buildResult(null, 'Script produced no stdout JSON payload');
  }

  try {
    return buildResult(JSON.parse(stdoutContent), null);
  } catch (jsonParseError) {
    const parseErrorMessage = jsonParseError instanceof Error ? jsonParseError.message : String(jsonParseError);
    return buildResult(null, parseErrorMessage);
  }
}
81
+
82
/**
 * Map a seed string to a repeatable integer in [-maxMagnitude, +maxMagnitude].
 *
 * A 32-bit rolling hash over the seed's UTF-16 code units is reduced modulo
 * the size of the range, then shifted so the range is centred on zero.
 *
 * @param {string} seed - Text that selects the offset.
 * @param {number} [maxMagnitude=3] - Largest absolute offset that can be returned.
 * @returns {number} Integer offset derived only from the inputs.
 */
function deterministicOffset(seed, maxMagnitude = 3) {
  const rollingHash = seed
    .split('')
    .reduce((accumulated, character) => ((accumulated << 5) - accumulated + character.charCodeAt(0)) | 0, 0);

  const bucketCount = (2 * maxMagnitude) + 1;
  const bucketIndex = Math.abs(rollingHash) % bucketCount;
  return bucketIndex - maxMagnitude;
}
93
+
94
/**
 * Restrict a number to the inclusive range [minimum, maximum].
 *
 * @param {number} value - Number to restrict.
 * @param {number} minimum - Lower bound.
 * @param {number} maximum - Upper bound.
 * @returns {number} The bounded value.
 */
function clamp(value, minimum, maximum) {
  const raisedToMinimum = Math.max(value, minimum);
  return Math.min(raisedToMinimum, maximum);
}
97
+
98
/**
 * Round a number to two decimal places.
 *
 * @param {number} value - Number to round.
 * @returns {number} The value formatted with toFixed(2) and converted back to a number.
 */
function roundToTwo(value) {
  const fixedText = value.toFixed(2);
  return Number(fixedText);
}
101
+
102
/**
 * Build the fallback writer/judge configuration.
 *
 * A fresh object is returned on every call, with one default model per lane.
 *
 * @returns {object} Default configuration with writerLane and judgeLane sections.
 */
function buildDefaultConfig() {
  const writerLane = {
    models: [{ id: 'writer-default', provider: 'local', profile: 'balanced' }],
    weights: { quality: 40, efficiency: 20, reliability: 25, freshness: 15 },
    scenarioMultipliers: { planning: 1, refactor: 1, security: 1, delivery: 1 },
  };

  const judgeLane = {
    models: [{ id: 'judge-default', provider: 'local', profile: 'audit' }],
    minimumCompositeScore: 75,
    leniencyWindow: 2,
    weights: { clarity: 35, correctness: 35, risk: 20, consistency: 10 },
  };

  return {
    version: '1.0.0',
    phase: 'v2.5.1',
    blindReviewMode: true,
    writerLane,
    judgeLane,
  };
}
135
+
136
/**
 * Resolve the benchmark scenarios to evaluate.
 *
 * @param {object|null|undefined} reproducibilityProfile - Parsed reproducibility
 *   profile; its `scenarios` array is used when present and non-empty.
 * @returns {Array<{id: string, category: string}>} Scenarios from the profile,
 *   or the four built-in scenarios when the profile lists none. Entries with a
 *   missing id or category (including null or non-object entries) fall back to
 *   'unknown-scenario' and 'planning'.
 */
function loadScenarios(reproducibilityProfile) {
  const defaultScenarios = [
    { id: 'planning', category: 'planning' },
    { id: 'refactor', category: 'refactor' },
    { id: 'security', category: 'security' },
    { id: 'delivery', category: 'delivery' },
  ];

  const profileScenarios = reproducibilityProfile?.scenarios;
  if (!Array.isArray(profileScenarios) || profileScenarios.length === 0) {
    return defaultScenarios;
  }

  // Optional chaining keeps a malformed (null) entry from aborting the whole run.
  return profileScenarios.map((scenarioEntry) => ({
    id: scenarioEntry?.id || 'unknown-scenario',
    category: scenarioEntry?.category || 'planning',
  }));
}
153
+
154
/**
 * Collect the core benchmark signals that feed writer and judge scoring.
 *
 * @param {object|null} detectionBenchmarkReport - Supplies top1Accuracy and manualCorrectionRate.
 * @param {object|null} tokenBenchmarkReport - Supplies summary.averageNativeSavingsPercent.
 * @param {object|null} benchmarkGateReport - Supplies passed and failureCount.
 * @param {object|null} benchmarkIntelligenceReport - Supplies failureCount and the watchlist array.
 * @param {object|null} thresholdConfiguration - Supplies minimumTop1Accuracy and maximumManualCorrectionRate.
 * @returns {object} Flat signal object, including whether the accuracy and
 *   manual-correction thresholds are met.
 */
function buildBaseSignals(detectionBenchmarkReport, tokenBenchmarkReport, benchmarkGateReport, benchmarkIntelligenceReport, thresholdConfiguration) {
  // Rates default to 1 only when the value is missing or unusable. A reported
  // value of 0 is legitimate (no manual corrections, or a zero-tolerance
  // threshold) and must not be replaced by the fallback.
  const rateOrFallback = (candidate, fallback) => {
    if (typeof candidate === 'number') {
      return Number.isFinite(candidate) ? candidate : fallback;
    }
    if (typeof candidate === 'string' && candidate.trim() !== '') {
      const parsedCandidate = Number(candidate);
      return Number.isNaN(parsedCandidate) ? fallback : parsedCandidate;
    }
    return fallback;
  };

  const staleWatchlistCount = Array.isArray(benchmarkIntelligenceReport?.watchlist)
    ? benchmarkIntelligenceReport.watchlist.filter((watchlistEntry) => watchlistEntry?.stale === true).length
    : 0;

  const top1Accuracy = Number(detectionBenchmarkReport?.top1Accuracy || 0);
  const manualCorrectionRate = rateOrFallback(detectionBenchmarkReport?.manualCorrectionRate, 1);
  const maximumManualCorrectionRate = rateOrFallback(thresholdConfiguration?.maximumManualCorrectionRate, 1);

  return {
    top1Accuracy,
    manualCorrectionRate,
    nativeSavingsPercent: Number(tokenBenchmarkReport?.summary?.averageNativeSavingsPercent || 0),
    benchmarkGatePassed: benchmarkGateReport?.passed === true,
    benchmarkGateFailureCount: Number(benchmarkGateReport?.failureCount || 0),
    intelligenceFailureCount: Number(benchmarkIntelligenceReport?.failureCount || 0),
    staleWatchlistCount,
    top1AccuracyMet: top1Accuracy >= Number(thresholdConfiguration?.minimumTop1Accuracy || 0),
    manualCorrectionMet: manualCorrectionRate <= maximumManualCorrectionRate,
  };
}
174
+
175
/**
 * Score one writer model on one scenario.
 *
 * Each dimension (quality, efficiency, reliability, freshness) is derived from
 * the shared base signals plus a deterministic per-model offset, clamped to
 * 0-100, then combined into a weighted composite divided by 100.
 *
 * @param {{id: string}} writerModel - Writer model being scored.
 * @param {{id: string, category: string}} scenario - Scenario being scored.
 * @param {object} baseSignals - Signals produced by buildBaseSignals.
 * @param {object} writerWeights - Weight per dimension; missing weights count as 0.
 * @param {object} scenarioMultipliers - Quality multiplier per scenario category; missing entries count as 1.
 * @returns {object} Scenario identity, rounded score breakdown, rounded
 *   composite score and the two threshold flags copied from baseSignals.
 */
function buildWriterScenarioRun(writerModel, scenario, baseSignals, writerWeights, scenarioMultipliers) {
  const offsetFor = (suffix, magnitude) => deterministicOffset(`${writerModel.id}:${suffix}`, magnitude);
  const toPercentRange = (rawScore) => clamp(rawScore, 0, 100);
  const weightOf = (dimension) => Number(writerWeights[dimension] || 0);

  const categoryMultiplier = Number(scenarioMultipliers?.[scenario.category] || 1);

  const quality = toPercentRange((baseSignals.top1Accuracy * 100 * categoryMultiplier) + offsetFor(scenario.id, 4));
  const efficiency = toPercentRange(baseSignals.nativeSavingsPercent + offsetFor('efficiency', 3));

  let reliability;
  if (baseSignals.benchmarkGatePassed) {
    reliability = toPercentRange(100 + offsetFor('reliability', 2));
  } else {
    // A failed gate costs 20 points per recorded failure and gets no model offset.
    reliability = toPercentRange(100 - (baseSignals.benchmarkGateFailureCount * 20));
  }

  const freshness = toPercentRange(
    100 - (baseSignals.intelligenceFailureCount * 15) - (baseSignals.staleWatchlistCount * 10) + offsetFor('freshness', 2)
  );

  // Accumulate in quality, efficiency, reliability, freshness order.
  let weightedTotal = quality * weightOf('quality');
  weightedTotal += efficiency * weightOf('efficiency');
  weightedTotal += reliability * weightOf('reliability');
  weightedTotal += freshness * weightOf('freshness');
  const compositeScore = weightedTotal / 100;

  return {
    scenarioId: scenario.id,
    scenarioCategory: scenario.category,
    scoreBreakdown: {
      quality: roundToTwo(quality),
      efficiency: roundToTwo(efficiency),
      reliability: roundToTwo(reliability),
      freshness: roundToTwo(freshness),
    },
    compositeScore: roundToTwo(compositeScore),
    top1AccuracyMet: baseSignals.top1AccuracyMet,
    manualCorrectionMet: baseSignals.manualCorrectionMet,
  };
}
211
+
212
/**
 * Produce one comparison-matrix row: a judge model's verdict on a writer's scenario run.
 *
 * The judge score is the writer's composite score shifted by a deterministic
 * offset of at most 2 points, clamped to 0-100. A row passes when that score
 * reaches the minimum composite score minus the leniency window and both core
 * signal thresholds were met.
 *
 * @param {object} writerScenarioRun - Result of buildWriterScenarioRun.
 * @param {string} writerToken - Anonymised writer label (for example "W1").
 * @param {{id: string}} judgeModel - Judge model producing the verdict.
 * @param {object} judgeLaneConfig - Supplies minimumCompositeScore (default 75) and leniencyWindow (default 0).
 * @param {boolean} blindReviewMode - When true, writerModelId is reported as null.
 * @returns {object} Matrix row with scores, thresholds and verdict.
 */
function evaluateJudgeForScenario(writerScenarioRun, writerToken, judgeModel, judgeLaneConfig, blindReviewMode) {
  const { scenarioId, scenarioCategory, compositeScore: writerCompositeScore } = writerScenarioRun;

  const scoreShift = deterministicOffset(`${judgeModel.id}:${scenarioId}:${writerToken}`, 2);
  const shiftedScore = clamp(writerCompositeScore + scoreShift, 0, 100);

  const scoreThreshold = Number(judgeLaneConfig.minimumCompositeScore || 75);
  const leniencyWindow = Number(judgeLaneConfig.leniencyWindow || 0);
  const effectiveThreshold = scoreThreshold - leniencyWindow;

  const meetsScoreThreshold = shiftedScore >= effectiveThreshold;
  const meetsCoreSignals = writerScenarioRun.top1AccuracyMet && writerScenarioRun.manualCorrectionMet;

  let verdict = 'needs-improvement';
  if (meetsScoreThreshold && meetsCoreSignals) {
    verdict = 'pass';
  }

  // NOTE(review): outside blind mode this reports the writer token, not the
  // writer model id; the model id is not available to this function.
  const writerModelId = blindReviewMode ? null : writerToken;

  return {
    scenarioId,
    scenarioCategory,
    writerToken,
    writerModelId,
    judgeModelId: judgeModel.id,
    blindPairId: `${scenarioId}:${writerToken}:${judgeModel.id}`,
    writerCompositeScore,
    judgeCompositeScore: roundToTwo(shiftedScore),
    scoreThreshold,
    leniencyWindow,
    meetsScoreThreshold,
    meetsCoreSignals,
    verdict,
  };
}
238
+
239
/**
 * Reduce raw script execution results to the compact form stored in the report.
 *
 * @param {Array<object>} executions - Results returned by runJsonScript.
 * @returns {Array<object>} One summary per execution with scriptPath, exitCode,
 *   parseError, reportName (reportName or gateName from the parsed report, else
 *   null) and passed (the parsed report's boolean `passed`, else null).
 */
function summarizeExecutions(executions) {
  return executions.map(({ scriptPath, exitCode, parseError, parsedReport }) => {
    const reportName = parsedReport?.reportName || parsedReport?.gateName || null;
    const reportedPassed = parsedReport?.passed;
    const passed = typeof reportedPassed === 'boolean' ? reportedPassed : null;

    return { scriptPath, exitCode, parseError, reportName, passed };
  });
}
250
+
251
/**
 * Score every writer model against every scenario.
 *
 * Writers are labelled W1, W2, ... in the order they appear in writerModels.
 *
 * @param {Array<{id: string}>} writerModels - Writer models to score.
 * @param {Array<{id: string, category: string}>} scenarios - Scenarios to run.
 * @param {object} baseSignals - Signals produced by buildBaseSignals.
 * @param {object} writerLaneConfig - Supplies optional weights and scenarioMultipliers.
 * @returns {Array<object>} One entry per writer with writerToken, writerModel,
 *   averageCompositeScore (0 when there are no scenarios) and scenarioRuns.
 */
function buildWriterLaneRuns(writerModels, scenarios, baseSignals, writerLaneConfig) {
  const laneWeights = writerLaneConfig.weights || {};
  const laneMultipliers = writerLaneConfig.scenarioMultipliers || {};

  return writerModels.map((writerModel, writerIndex) => {
    const scenarioRuns = scenarios.map(
      (scenario) => buildWriterScenarioRun(writerModel, scenario, baseSignals, laneWeights, laneMultipliers)
    );

    let averageCompositeScore = 0;
    if (scenarioRuns.length !== 0) {
      let compositeTotal = 0;
      for (const scenarioRun of scenarioRuns) {
        compositeTotal += scenarioRun.compositeScore;
      }
      averageCompositeScore = roundToTwo(compositeTotal / scenarioRuns.length);
    }

    return {
      writerToken: `W${writerIndex + 1}`,
      writerModel,
      averageCompositeScore,
      scenarioRuns,
    };
  });
}
274
+
275
/**
 * Build the comparison matrix: one row per writer, scenario and judge combination.
 *
 * Rows are ordered by writer, then scenario, then judge.
 *
 * @param {Array<object>} writerLaneRuns - Output of buildWriterLaneRuns.
 * @param {Array<{id: string}>} judgeModels - Judge models to evaluate with.
 * @param {object} judgeLaneConfig - Judge thresholds passed to evaluateJudgeForScenario.
 * @param {boolean} blindReviewMode - Passed through to evaluateJudgeForScenario.
 * @returns {Array<object>} Flat list of matrix rows.
 */
function buildJudgeLaneRuns(writerLaneRuns, judgeModels, judgeLaneConfig, blindReviewMode) {
  return writerLaneRuns.flatMap(({ writerToken, scenarioRuns }) => scenarioRuns.flatMap(
    (writerScenarioRun) => judgeModels.map(
      (judgeModel) => evaluateJudgeForScenario(writerScenarioRun, writerToken, judgeModel, judgeLaneConfig, blindReviewMode)
    )
  ));
}
290
+
291
/**
 * Entry point: run the benchmark scripts, score the writer and judge lanes,
 * then emit the writer-judge matrix report.
 *
 * The report is always printed to stdout as JSON. Unless --stdout-only was
 * passed, it is also written to OUTPUT_PATH. The process exit code is 0 when
 * every benchmark script produced parseable JSON and 1 otherwise.
 *
 * @returns {Promise<void>}
 */
async function runWriterJudgeMatrix() {
  const defaultConfig = buildDefaultConfig();
  const writerJudgeConfig = readJsonOrNull(CONFIG_PATH) || defaultConfig;
  const reproducibilityProfile = readJsonOrNull(REPRO_PROFILE_PATH) || { scenarios: [] };
  const thresholdConfiguration = readJsonOrNull(THRESHOLD_PATH) || {};
  // Blind review stays on unless the config explicitly sets it to false.
  const blindReviewMode = writerJudgeConfig.blindReviewMode !== false;

  const detectionBenchmarkExecution = runJsonScript('scripts/detection-benchmark.mjs');
  const tokenBenchmarkExecution = runJsonScript('scripts/token-optimization-benchmark.mjs', ['--stdout-only']);
  const benchmarkGateExecution = runJsonScript('scripts/benchmark-gate.mjs');
  const benchmarkIntelligenceExecution = runJsonScript('scripts/benchmark-intelligence.mjs');

  const executionSummaries = summarizeExecutions([
    detectionBenchmarkExecution,
    tokenBenchmarkExecution,
    benchmarkGateExecution,
    benchmarkIntelligenceExecution,
  ]);

  // Only executions whose output could not be parsed count as failures here;
  // a script's own pass/fail result is carried in the core signals instead.
  const executionFailureCount = executionSummaries.filter((executionSummary) => executionSummary.parseError).length;
  const scenarios = loadScenarios(reproducibilityProfile);

  const baseSignals = buildBaseSignals(
    detectionBenchmarkExecution.parsedReport,
    tokenBenchmarkExecution.parsedReport,
    benchmarkGateExecution.parsedReport,
    benchmarkIntelligenceExecution.parsedReport,
    thresholdConfiguration
  );

  const writerModels = Array.isArray(writerJudgeConfig?.writerLane?.models) && writerJudgeConfig.writerLane.models.length > 0
    ? writerJudgeConfig.writerLane.models
    : defaultConfig.writerLane.models;

  const judgeModels = Array.isArray(writerJudgeConfig?.judgeLane?.models) && writerJudgeConfig.judgeLane.models.length > 0
    ? writerJudgeConfig.judgeLane.models
    : defaultConfig.judgeLane.models;

  const writerLaneRuns = buildWriterLaneRuns(
    writerModels,
    scenarios,
    baseSignals,
    writerJudgeConfig.writerLane || defaultConfig.writerLane
  );

  const comparisonMatrix = buildJudgeLaneRuns(
    writerLaneRuns,
    judgeModels,
    writerJudgeConfig.judgeLane || defaultConfig.judgeLane,
    blindReviewMode
  );

  const passCount = comparisonMatrix.filter((matrixRow) => matrixRow.verdict === 'pass').length;
  const passRatePercent = comparisonMatrix.length === 0
    ? 0
    : roundToTwo((passCount / comparisonMatrix.length) * 100);

  const writerJudgeReport = {
    generatedAt: new Date().toISOString(),
    reportName: 'benchmark-writer-judge-matrix',
    phase: 'v2.5.1',
    passed: executionFailureCount === 0,
    failureCount: executionFailureCount,
    methodology: {
      blindReviewMode,
      writerLaneModelCount: writerModels.length,
      judgeLaneModelCount: judgeModels.length,
      scenarioCount: scenarios.length,
      writerWeights: writerJudgeConfig?.writerLane?.weights || null,
      judgeWeights: writerJudgeConfig?.judgeLane?.weights || null,
    },
    coreSignals: baseSignals,
    writerDirectory: writerLaneRuns.map((writerLaneRun) => ({
      writerToken: writerLaneRun.writerToken,
      writerModel: writerLaneRun.writerModel,
      averageCompositeScore: writerLaneRun.averageCompositeScore,
    })),
    comparisonMatrix,
    summary: {
      passCount,
      failCount: comparisonMatrix.length - passCount,
      passRatePercent,
    },
    executions: executionSummaries,
  };

  const serializedReport = JSON.stringify(writerJudgeReport, null, 2);

  if (!isStdoutOnlyMode) {
    // Create the state directory first so a fresh checkout does not fail the write.
    await fs.mkdir(dirname(OUTPUT_PATH), { recursive: true });
    await fs.writeFile(OUTPUT_PATH, serializedReport + '\n', 'utf8');
  }

  console.log(serializedReport);
  // Set the exit code instead of calling process.exit(): a forced exit can cut
  // off stdout that has not been flushed yet when the output is piped.
  process.exitCode = writerJudgeReport.passed ? 0 : 1;
}

runWriterJudgeMatrix().catch((unexpectedError) => {
  // Report unexpected failures (for example an unwritable output path) and
  // signal failure through the exit code.
  const errorDetail = unexpectedError instanceof Error
    ? (unexpectedError.stack || unexpectedError.message)
    : String(unexpectedError);
  console.error(errorDetail);
  process.exitCode = 1;
});
@@ -55,15 +55,27 @@ const FORMAL_ARTIFACT_PATHS = [
55
55
  const REQUIRED_HUMAN_WRITING_SNIPPETS = [
56
56
  {
57
57
  path: '.agent-context/rules/api-docs.md',
58
- snippets: ['## Human Writing Standard (Mandatory)', 'No emoji in formal artifacts.'],
58
+ snippets: [
59
+ '## Human Writing Standard (Mandatory)',
60
+ 'This applies to documentation, release notes, onboarding text, review summaries, and agent-facing explanations.',
61
+ 'No emoji in formal artifacts.',
62
+ ],
59
63
  },
60
64
  {
61
65
  path: '.agent-context/review-checklists/pr-checklist.md',
62
- snippets: ['No emoji in formal documentation or review summaries', 'Documentation uses plain English and avoids AI cliches'],
66
+ snippets: [
67
+ 'Scope applied: This applies to documentation, release notes, onboarding text, review summaries, and agent-facing explanations',
68
+ 'No emoji in formal documentation or review summaries',
69
+ 'Documentation uses plain English and avoids AI cliches',
70
+ ],
63
71
  },
64
72
  {
65
73
  path: 'docs/deep_analysis_and_roadmap_backlog.md',
66
- snippets: ['## Part 6: Documentation and Explanation Standards (Mandatory)', 'No emoji in formal artifacts. This is mandatory.'],
74
+ snippets: [
75
+ '## Part 6: Documentation and Explanation Standards (Mandatory)',
76
+ 'This applies to documentation, release notes, onboarding text, review summaries, and agent-facing explanations.',
77
+ 'No emoji in formal artifacts. This is mandatory.',
78
+ ],
67
79
  },
68
80
  ];
69
81
 
@@ -149,6 +161,7 @@ async function validateRequiredFiles() {
149
161
  'scripts/llm-judge.mjs',
150
162
  'scripts/detection-benchmark.mjs',
151
163
  'scripts/benchmark-evidence-bundle.mjs',
164
+ 'scripts/benchmark-writer-judge-matrix.mjs',
152
165
  'scripts/benchmark-gate.mjs',
153
166
  'scripts/benchmark-intelligence.mjs',
154
167
  'scripts/governance-weekly-report.mjs',
@@ -175,6 +188,7 @@ async function validateRequiredFiles() {
175
188
  'docs/v1.8-operations-playbook.md',
176
189
  'docs/v2-upgrade-playbook.md',
177
190
  '.agent-context/state/benchmark-reproducibility.json',
191
+ '.agent-context/state/benchmark-writer-judge-config.json',
178
192
  '.agent-context/state/benchmark-watchlist.json',
179
193
  '.agent-context/state/skill-platform.json',
180
194
  '.agent-context/skills/index.json',