@ryuenn3123/agentic-senior-core 2.0.16 → 2.0.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13,6 +13,8 @@ Run a comprehensive code review on the current codebase (or the files I'm about
13
13
  Use these checklists:
14
14
  1. Read .agent-context/review-checklists/pr-checklist.md — apply every item.
15
15
  2. Read .agent-context/review-checklists/security-audit.md — apply every item.
16
+ 3. Apply documentation scope rules exactly: This applies to documentation, release notes, onboarding text, review summaries, and agent-facing explanations.
17
+ 4. Treat scope-style findings as advisory unless they hide factual errors, contract mismatches, or non-negotiable violations.
16
18
 
17
19
  For EVERY violation found:
18
20
  - State the exact file and line
@@ -91,6 +91,8 @@ VERDICT: PASS / FAIL (X/Y items passed)
91
91
  - [ ] `.env` not committed
92
92
 
93
93
  ### 10. Documentation
94
+ - [ ] Scope applied: This applies to documentation, release notes, onboarding text, review summaries, and agent-facing explanations
95
+ - [ ] Style scope review is advisory and does not block merge when API docs are synced in the same commit and contract details are correct
94
96
  - [ ] API endpoints have OpenAPI/Swagger documentation
95
97
  - [ ] Complex business logic has comments explaining WHY
96
98
  - [ ] Public functions/methods have JSDoc/docstrings
@@ -27,7 +27,8 @@ REQUIRED: Documentation MUST be updated in the SAME commit as the endpoint chang
27
27
 
28
28
  ## Human Writing Standard (Mandatory)
29
29
 
30
- This standard applies to API docs, README updates, release notes, and technical explanations.
30
+ This applies to documentation, release notes, onboarding text, review summaries, and agent-facing explanations.
31
+ API docs and README updates are included in this scope.
31
32
 
32
33
  ### Style Baseline
33
34
  1. Write for native English speakers.
@@ -44,6 +45,12 @@ This standard applies to API docs, README updates, release notes, and technical
44
45
  4. Rewrite and reorder content when flow is weak.
45
46
  5. Keep explanations short by default; expand only when complexity requires it.
46
47
 
48
+ ### Scope Severity and Merge Behavior
49
+ 1. Scope style guidance controls readability and consistency.
50
+ 2. Style baseline findings are advisory by default and must not block endpoint-change commits that already include accurate docs/spec updates.
51
+ 3. Hard blockers remain contract failures: missing same-commit docs sync, incorrect schema, missing required responses, or factual inaccuracies.
52
+ 4. If style polish is still needed, open a follow-up task instead of delaying the contract update.
53
+
47
54
  ### Non-Negotiables
48
55
  1. No emoji in formal artifacts.
49
56
  2. Avoid AI cliches and buzzwords: delve, leverage, robust, utilize, seamless.
@@ -197,6 +204,9 @@ This schema MUST be documented in OpenAPI as a reusable component (`#/components
197
204
  ```
198
205
  Endpoint changed + docs NOT updated = PR REJECTED.
199
206
 
207
+ This sync rule takes priority over the timing of style polish.
208
+ Do not delay the same-commit documentation sync just to iterate on writing tone.
209
+
200
210
  The spec is a contract. If the contract is wrong, consumers will break.
201
211
  "I'll update the docs later" means "the docs will never be updated."
202
212
  ```
@@ -73,13 +73,15 @@
73
73
  "Run npm run benchmark:detection to regenerate detection benchmark output.",
74
74
  "Run npm run benchmark:gate to validate benchmark anti-regression thresholds.",
75
75
  "Run npm run benchmark:intelligence to validate benchmark watchlist freshness.",
76
- "Run npm run benchmark:bundle to emit a reproducible benchmark evidence bundle."
76
+ "Run npm run benchmark:bundle to emit a reproducible benchmark evidence bundle.",
77
+ "Run npm run benchmark:writer-judge to emit writer-judge side-by-side matrix output."
77
78
  ],
78
79
  "commandExamples": [
79
80
  "npm run benchmark:detection",
80
81
  "npm run benchmark:gate",
81
82
  "npm run benchmark:intelligence",
82
83
  "npm run benchmark:bundle",
84
+ "npm run benchmark:writer-judge",
83
85
  "node ./scripts/benchmark-evidence-bundle.mjs --stdout-only"
84
86
  ]
85
87
  }
@@ -0,0 +1,58 @@
1
+ {
2
+ "version": "1.0.0",
3
+ "phase": "v2.5.1",
4
+ "blindReviewMode": true,
5
+ "writerLane": {
6
+ "models": [
7
+ {
8
+ "id": "writer-copilot-balanced",
9
+ "provider": "github-copilot",
10
+ "profile": "balanced"
11
+ },
12
+ {
13
+ "id": "writer-claude-architect",
14
+ "provider": "anthropic",
15
+ "profile": "architect"
16
+ },
17
+ {
18
+ "id": "writer-gemini-ops",
19
+ "provider": "google",
20
+ "profile": "operations"
21
+ }
22
+ ],
23
+ "weights": {
24
+ "quality": 40,
25
+ "efficiency": 20,
26
+ "reliability": 25,
27
+ "freshness": 15
28
+ },
29
+ "scenarioMultipliers": {
30
+ "planning": 1,
31
+ "refactor": 1.02,
32
+ "security": 1.03,
33
+ "delivery": 0.98
34
+ }
35
+ },
36
+ "judgeLane": {
37
+ "models": [
38
+ {
39
+ "id": "judge-claude-audit",
40
+ "provider": "anthropic",
41
+ "profile": "audit"
42
+ },
43
+ {
44
+ "id": "judge-gpt-risk",
45
+ "provider": "openai",
46
+ "profile": "risk"
47
+ }
48
+ ],
49
+ "minimumCompositeScore": 75,
50
+ "leniencyWindow": 2,
51
+ "weights": {
52
+ "clarity": 35,
53
+ "correctness": 35,
54
+ "risk": 20,
55
+ "consistency": 10
56
+ }
57
+ }
58
+ }
@@ -0,0 +1,462 @@
1
+ {
2
+ "generatedAt": "2026-04-14T06:57:14.623Z",
3
+ "reportName": "benchmark-writer-judge-matrix",
4
+ "phase": "v2.5.1",
5
+ "passed": true,
6
+ "failureCount": 0,
7
+ "methodology": {
8
+ "blindReviewMode": true,
9
+ "writerLaneModelCount": 3,
10
+ "judgeLaneModelCount": 2,
11
+ "scenarioCount": 4,
12
+ "writerWeights": {
13
+ "quality": 40,
14
+ "efficiency": 20,
15
+ "reliability": 25,
16
+ "freshness": 15
17
+ },
18
+ "judgeWeights": {
19
+ "clarity": 35,
20
+ "correctness": 35,
21
+ "risk": 20,
22
+ "consistency": 10
23
+ }
24
+ },
25
+ "coreSignals": {
26
+ "top1Accuracy": 0.9167,
27
+ "manualCorrectionRate": 0.0833,
28
+ "nativeSavingsPercent": 81.52,
29
+ "benchmarkGatePassed": true,
30
+ "benchmarkGateFailureCount": 0,
31
+ "intelligenceFailureCount": 0,
32
+ "staleWatchlistCount": 0,
33
+ "top1AccuracyMet": true,
34
+ "manualCorrectionMet": true
35
+ },
36
+ "writerDirectory": [
37
+ {
38
+ "writerToken": "W1",
39
+ "writerModel": {
40
+ "id": "writer-copilot-balanced",
41
+ "provider": "github-copilot",
42
+ "profile": "balanced"
43
+ },
44
+ "averageCompositeScore": 93.7
45
+ },
46
+ {
47
+ "writerToken": "W2",
48
+ "writerModel": {
49
+ "id": "writer-claude-architect",
50
+ "provider": "anthropic",
51
+ "profile": "architect"
52
+ },
53
+ "averageCompositeScore": 92.8
54
+ },
55
+ {
56
+ "writerToken": "W3",
57
+ "writerModel": {
58
+ "id": "writer-gemini-ops",
59
+ "provider": "google",
60
+ "profile": "operations"
61
+ },
62
+ "averageCompositeScore": 92.65
63
+ }
64
+ ],
65
+ "comparisonMatrix": [
66
+ {
67
+ "scenarioId": "planning",
68
+ "scenarioCategory": "planning",
69
+ "writerToken": "W1",
70
+ "writerModelId": null,
71
+ "judgeModelId": "judge-claude-audit",
72
+ "blindPairId": "planning:W1:judge-claude-audit",
73
+ "writerCompositeScore": 92.12,
74
+ "judgeCompositeScore": 94.12,
75
+ "scoreThreshold": 75,
76
+ "leniencyWindow": 2,
77
+ "meetsScoreThreshold": true,
78
+ "meetsCoreSignals": true,
79
+ "verdict": "pass"
80
+ },
81
+ {
82
+ "scenarioId": "planning",
83
+ "scenarioCategory": "planning",
84
+ "writerToken": "W1",
85
+ "writerModelId": null,
86
+ "judgeModelId": "judge-gpt-risk",
87
+ "blindPairId": "planning:W1:judge-gpt-risk",
88
+ "writerCompositeScore": 92.12,
89
+ "judgeCompositeScore": 94.12,
90
+ "scoreThreshold": 75,
91
+ "leniencyWindow": 2,
92
+ "meetsScoreThreshold": true,
93
+ "meetsCoreSignals": true,
94
+ "verdict": "pass"
95
+ },
96
+ {
97
+ "scenarioId": "refactor",
98
+ "scenarioCategory": "refactor",
99
+ "writerToken": "W1",
100
+ "writerModelId": null,
101
+ "judgeModelId": "judge-claude-audit",
102
+ "blindPairId": "refactor:W1:judge-claude-audit",
103
+ "writerCompositeScore": 95.26,
104
+ "judgeCompositeScore": 93.26,
105
+ "scoreThreshold": 75,
106
+ "leniencyWindow": 2,
107
+ "meetsScoreThreshold": true,
108
+ "meetsCoreSignals": true,
109
+ "verdict": "pass"
110
+ },
111
+ {
112
+ "scenarioId": "refactor",
113
+ "scenarioCategory": "refactor",
114
+ "writerToken": "W1",
115
+ "writerModelId": null,
116
+ "judgeModelId": "judge-gpt-risk",
117
+ "blindPairId": "refactor:W1:judge-gpt-risk",
118
+ "writerCompositeScore": 95.26,
119
+ "judgeCompositeScore": 93.26,
120
+ "scoreThreshold": 75,
121
+ "leniencyWindow": 2,
122
+ "meetsScoreThreshold": true,
123
+ "meetsCoreSignals": true,
124
+ "verdict": "pass"
125
+ },
126
+ {
127
+ "scenarioId": "security",
128
+ "scenarioCategory": "security",
129
+ "writerToken": "W1",
130
+ "writerModelId": null,
131
+ "judgeModelId": "judge-claude-audit",
132
+ "blindPairId": "security:W1:judge-claude-audit",
133
+ "writerCompositeScore": 94.42,
134
+ "judgeCompositeScore": 92.42,
135
+ "scoreThreshold": 75,
136
+ "leniencyWindow": 2,
137
+ "meetsScoreThreshold": true,
138
+ "meetsCoreSignals": true,
139
+ "verdict": "pass"
140
+ },
141
+ {
142
+ "scenarioId": "security",
143
+ "scenarioCategory": "security",
144
+ "writerToken": "W1",
145
+ "writerModelId": null,
146
+ "judgeModelId": "judge-gpt-risk",
147
+ "blindPairId": "security:W1:judge-gpt-risk",
148
+ "writerCompositeScore": 94.42,
149
+ "judgeCompositeScore": 93.42,
150
+ "scoreThreshold": 75,
151
+ "leniencyWindow": 2,
152
+ "meetsScoreThreshold": true,
153
+ "meetsCoreSignals": true,
154
+ "verdict": "pass"
155
+ },
156
+ {
157
+ "scenarioId": "delivery",
158
+ "scenarioCategory": "delivery",
159
+ "writerToken": "W1",
160
+ "writerModelId": null,
161
+ "judgeModelId": "judge-claude-audit",
162
+ "blindPairId": "delivery:W1:judge-claude-audit",
163
+ "writerCompositeScore": 92.99,
164
+ "judgeCompositeScore": 93.99,
165
+ "scoreThreshold": 75,
166
+ "leniencyWindow": 2,
167
+ "meetsScoreThreshold": true,
168
+ "meetsCoreSignals": true,
169
+ "verdict": "pass"
170
+ },
171
+ {
172
+ "scenarioId": "delivery",
173
+ "scenarioCategory": "delivery",
174
+ "writerToken": "W1",
175
+ "writerModelId": null,
176
+ "judgeModelId": "judge-gpt-risk",
177
+ "blindPairId": "delivery:W1:judge-gpt-risk",
178
+ "writerCompositeScore": 92.99,
179
+ "judgeCompositeScore": 92.99,
180
+ "scoreThreshold": 75,
181
+ "leniencyWindow": 2,
182
+ "meetsScoreThreshold": true,
183
+ "meetsCoreSignals": true,
184
+ "verdict": "pass"
185
+ },
186
+ {
187
+ "scenarioId": "planning",
188
+ "scenarioCategory": "planning",
189
+ "writerToken": "W2",
190
+ "writerModelId": null,
191
+ "judgeModelId": "judge-claude-audit",
192
+ "blindPairId": "planning:W2:judge-claude-audit",
193
+ "writerCompositeScore": 93.12,
194
+ "judgeCompositeScore": 94.12,
195
+ "scoreThreshold": 75,
196
+ "leniencyWindow": 2,
197
+ "meetsScoreThreshold": true,
198
+ "meetsCoreSignals": true,
199
+ "verdict": "pass"
200
+ },
201
+ {
202
+ "scenarioId": "planning",
203
+ "scenarioCategory": "planning",
204
+ "writerToken": "W2",
205
+ "writerModelId": null,
206
+ "judgeModelId": "judge-gpt-risk",
207
+ "blindPairId": "planning:W2:judge-gpt-risk",
208
+ "writerCompositeScore": 93.12,
209
+ "judgeCompositeScore": 94.12,
210
+ "scoreThreshold": 75,
211
+ "leniencyWindow": 2,
212
+ "meetsScoreThreshold": true,
213
+ "meetsCoreSignals": true,
214
+ "verdict": "pass"
215
+ },
216
+ {
217
+ "scenarioId": "refactor",
218
+ "scenarioCategory": "refactor",
219
+ "writerToken": "W2",
220
+ "writerModelId": null,
221
+ "judgeModelId": "judge-claude-audit",
222
+ "blindPairId": "refactor:W2:judge-claude-audit",
223
+ "writerCompositeScore": 93.06,
224
+ "judgeCompositeScore": 95.06,
225
+ "scoreThreshold": 75,
226
+ "leniencyWindow": 2,
227
+ "meetsScoreThreshold": true,
228
+ "meetsCoreSignals": true,
229
+ "verdict": "pass"
230
+ },
231
+ {
232
+ "scenarioId": "refactor",
233
+ "scenarioCategory": "refactor",
234
+ "writerToken": "W2",
235
+ "writerModelId": null,
236
+ "judgeModelId": "judge-gpt-risk",
237
+ "blindPairId": "refactor:W2:judge-gpt-risk",
238
+ "writerCompositeScore": 93.06,
239
+ "judgeCompositeScore": 95.06,
240
+ "scoreThreshold": 75,
241
+ "leniencyWindow": 2,
242
+ "meetsScoreThreshold": true,
243
+ "meetsCoreSignals": true,
244
+ "verdict": "pass"
245
+ },
246
+ {
247
+ "scenarioId": "security",
248
+ "scenarioCategory": "security",
249
+ "writerToken": "W2",
250
+ "writerModelId": null,
251
+ "judgeModelId": "judge-claude-audit",
252
+ "blindPairId": "security:W2:judge-claude-audit",
253
+ "writerCompositeScore": 91.82,
254
+ "judgeCompositeScore": 90.82,
255
+ "scoreThreshold": 75,
256
+ "leniencyWindow": 2,
257
+ "meetsScoreThreshold": true,
258
+ "meetsCoreSignals": true,
259
+ "verdict": "pass"
260
+ },
261
+ {
262
+ "scenarioId": "security",
263
+ "scenarioCategory": "security",
264
+ "writerToken": "W2",
265
+ "writerModelId": null,
266
+ "judgeModelId": "judge-gpt-risk",
267
+ "blindPairId": "security:W2:judge-gpt-risk",
268
+ "writerCompositeScore": 91.82,
269
+ "judgeCompositeScore": 89.82,
270
+ "scoreThreshold": 75,
271
+ "leniencyWindow": 2,
272
+ "meetsScoreThreshold": true,
273
+ "meetsCoreSignals": true,
274
+ "verdict": "pass"
275
+ },
276
+ {
277
+ "scenarioId": "delivery",
278
+ "scenarioCategory": "delivery",
279
+ "writerToken": "W2",
280
+ "writerModelId": null,
281
+ "judgeModelId": "judge-claude-audit",
282
+ "blindPairId": "delivery:W2:judge-claude-audit",
283
+ "writerCompositeScore": 93.19,
284
+ "judgeCompositeScore": 93.19,
285
+ "scoreThreshold": 75,
286
+ "leniencyWindow": 2,
287
+ "meetsScoreThreshold": true,
288
+ "meetsCoreSignals": true,
289
+ "verdict": "pass"
290
+ },
291
+ {
292
+ "scenarioId": "delivery",
293
+ "scenarioCategory": "delivery",
294
+ "writerToken": "W2",
295
+ "writerModelId": null,
296
+ "judgeModelId": "judge-gpt-risk",
297
+ "blindPairId": "delivery:W2:judge-gpt-risk",
298
+ "writerCompositeScore": 93.19,
299
+ "judgeCompositeScore": 94.19,
300
+ "scoreThreshold": 75,
301
+ "leniencyWindow": 2,
302
+ "meetsScoreThreshold": true,
303
+ "meetsCoreSignals": true,
304
+ "verdict": "pass"
305
+ },
306
+ {
307
+ "scenarioId": "planning",
308
+ "scenarioCategory": "planning",
309
+ "writerToken": "W3",
310
+ "writerModelId": null,
311
+ "judgeModelId": "judge-claude-audit",
312
+ "blindPairId": "planning:W3:judge-claude-audit",
313
+ "writerCompositeScore": 93.27,
314
+ "judgeCompositeScore": 93.27,
315
+ "scoreThreshold": 75,
316
+ "leniencyWindow": 2,
317
+ "meetsScoreThreshold": true,
318
+ "meetsCoreSignals": true,
319
+ "verdict": "pass"
320
+ },
321
+ {
322
+ "scenarioId": "planning",
323
+ "scenarioCategory": "planning",
324
+ "writerToken": "W3",
325
+ "writerModelId": null,
326
+ "judgeModelId": "judge-gpt-risk",
327
+ "blindPairId": "planning:W3:judge-gpt-risk",
328
+ "writerCompositeScore": 93.27,
329
+ "judgeCompositeScore": 93.27,
330
+ "scoreThreshold": 75,
331
+ "leniencyWindow": 2,
332
+ "meetsScoreThreshold": true,
333
+ "meetsCoreSignals": true,
334
+ "verdict": "pass"
335
+ },
336
+ {
337
+ "scenarioId": "refactor",
338
+ "scenarioCategory": "refactor",
339
+ "writerToken": "W3",
340
+ "writerModelId": null,
341
+ "judgeModelId": "judge-claude-audit",
342
+ "blindPairId": "refactor:W3:judge-claude-audit",
343
+ "writerCompositeScore": 94.01,
344
+ "judgeCompositeScore": 95.01,
345
+ "scoreThreshold": 75,
346
+ "leniencyWindow": 2,
347
+ "meetsScoreThreshold": true,
348
+ "meetsCoreSignals": true,
349
+ "verdict": "pass"
350
+ },
351
+ {
352
+ "scenarioId": "refactor",
353
+ "scenarioCategory": "refactor",
354
+ "writerToken": "W3",
355
+ "writerModelId": null,
356
+ "judgeModelId": "judge-gpt-risk",
357
+ "blindPairId": "refactor:W3:judge-gpt-risk",
358
+ "writerCompositeScore": 94.01,
359
+ "judgeCompositeScore": 95.01,
360
+ "scoreThreshold": 75,
361
+ "leniencyWindow": 2,
362
+ "meetsScoreThreshold": true,
363
+ "meetsCoreSignals": true,
364
+ "verdict": "pass"
365
+ },
366
+ {
367
+ "scenarioId": "security",
368
+ "scenarioCategory": "security",
369
+ "writerToken": "W3",
370
+ "writerModelId": null,
371
+ "judgeModelId": "judge-claude-audit",
372
+ "blindPairId": "security:W3:judge-claude-audit",
373
+ "writerCompositeScore": 92.37,
374
+ "judgeCompositeScore": 92.37,
375
+ "scoreThreshold": 75,
376
+ "leniencyWindow": 2,
377
+ "meetsScoreThreshold": true,
378
+ "meetsCoreSignals": true,
379
+ "verdict": "pass"
380
+ },
381
+ {
382
+ "scenarioId": "security",
383
+ "scenarioCategory": "security",
384
+ "writerToken": "W3",
385
+ "writerModelId": null,
386
+ "judgeModelId": "judge-gpt-risk",
387
+ "blindPairId": "security:W3:judge-gpt-risk",
388
+ "writerCompositeScore": 92.37,
389
+ "judgeCompositeScore": 94.37,
390
+ "scoreThreshold": 75,
391
+ "leniencyWindow": 2,
392
+ "meetsScoreThreshold": true,
393
+ "meetsCoreSignals": true,
394
+ "verdict": "pass"
395
+ },
396
+ {
397
+ "scenarioId": "delivery",
398
+ "scenarioCategory": "delivery",
399
+ "writerToken": "W3",
400
+ "writerModelId": null,
401
+ "judgeModelId": "judge-claude-audit",
402
+ "blindPairId": "delivery:W3:judge-claude-audit",
403
+ "writerCompositeScore": 90.94,
404
+ "judgeCompositeScore": 89.94,
405
+ "scoreThreshold": 75,
406
+ "leniencyWindow": 2,
407
+ "meetsScoreThreshold": true,
408
+ "meetsCoreSignals": true,
409
+ "verdict": "pass"
410
+ },
411
+ {
412
+ "scenarioId": "delivery",
413
+ "scenarioCategory": "delivery",
414
+ "writerToken": "W3",
415
+ "writerModelId": null,
416
+ "judgeModelId": "judge-gpt-risk",
417
+ "blindPairId": "delivery:W3:judge-gpt-risk",
418
+ "writerCompositeScore": 90.94,
419
+ "judgeCompositeScore": 92.94,
420
+ "scoreThreshold": 75,
421
+ "leniencyWindow": 2,
422
+ "meetsScoreThreshold": true,
423
+ "meetsCoreSignals": true,
424
+ "verdict": "pass"
425
+ }
426
+ ],
427
+ "summary": {
428
+ "passCount": 24,
429
+ "failCount": 0,
430
+ "passRatePercent": 100
431
+ },
432
+ "executions": [
433
+ {
434
+ "scriptPath": "scripts/detection-benchmark.mjs",
435
+ "exitCode": 0,
436
+ "parseError": null,
437
+ "reportName": null,
438
+ "passed": null
439
+ },
440
+ {
441
+ "scriptPath": "scripts/token-optimization-benchmark.mjs",
442
+ "exitCode": 0,
443
+ "parseError": null,
444
+ "reportName": "token-optimization-benchmark",
445
+ "passed": null
446
+ },
447
+ {
448
+ "scriptPath": "scripts/benchmark-gate.mjs",
449
+ "exitCode": 0,
450
+ "parseError": null,
451
+ "reportName": "benchmark-gate",
452
+ "passed": true
453
+ },
454
+ {
455
+ "scriptPath": "scripts/benchmark-intelligence.mjs",
456
+ "exitCode": 0,
457
+ "parseError": null,
458
+ "reportName": "benchmark-intelligence",
459
+ "passed": true
460
+ }
461
+ ]
462
+ }