@ryuenn3123/agentic-senior-core 2.0.16 → 2.0.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agent-context/prompts/review-code.md +2 -0
- package/.agent-context/review-checklists/pr-checklist.md +2 -0
- package/.agent-context/rules/api-docs.md +11 -1
- package/.agent-context/state/benchmark-reproducibility.json +3 -1
- package/.agent-context/state/benchmark-writer-judge-config.json +58 -0
- package/.agent-context/state/benchmark-writer-judge-matrix.json +462 -0
- package/.cursorrules +60 -3686
- package/.windsurfrules +60 -3686
- package/README.md +33 -1
- package/lib/cli/compiler.mjs +98 -35
- package/package.json +2 -1
- package/scripts/benchmark-writer-judge-matrix.mjs +383 -0
- package/scripts/validate.mjs +19 -3
|
@@ -13,6 +13,8 @@ Run a comprehensive code review on the current codebase (or the files I'm about
|
|
|
13
13
|
Use these checklists:
|
|
14
14
|
1. Read .agent-context/review-checklists/pr-checklist.md — apply every item.
|
|
15
15
|
2. Read .agent-context/review-checklists/security-audit.md — apply every item.
|
|
16
|
+
3. Apply documentation scope rules exactly: This applies to documentation, release notes, onboarding text, review summaries, and agent-facing explanations.
|
|
17
|
+
4. Treat scope-style findings as advisory unless they hide factual errors, contract mismatches, or non-negotiable violations.
|
|
16
18
|
|
|
17
19
|
For EVERY violation found:
|
|
18
20
|
- State the exact file and line
|
|
@@ -91,6 +91,8 @@ VERDICT: PASS / FAIL (X/Y items passed)
|
|
|
91
91
|
- [ ] `.env` not committed
|
|
92
92
|
|
|
93
93
|
### 10. Documentation
|
|
94
|
+
- [ ] Scope applied: This applies to documentation, release notes, onboarding text, review summaries, and agent-facing explanations
|
|
95
|
+
- [ ] Style scope review is advisory and does not block merge when API docs are synced in the same commit and contract details are correct
|
|
94
96
|
- [ ] API endpoints have OpenAPI/Swagger documentation
|
|
95
97
|
- [ ] Complex business logic has comments explaining WHY
|
|
96
98
|
- [ ] Public functions/methods have JSDoc/docstrings
|
|
@@ -27,7 +27,8 @@ REQUIRED: Documentation MUST be updated in the SAME commit as the endpoint chang
|
|
|
27
27
|
|
|
28
28
|
## Human Writing Standard (Mandatory)
|
|
29
29
|
|
|
30
|
-
This
|
|
30
|
+
This applies to documentation, release notes, onboarding text, review summaries, and agent-facing explanations.
|
|
31
|
+
API docs and README updates are included in this scope.
|
|
31
32
|
|
|
32
33
|
### Style Baseline
|
|
33
34
|
1. Write for native English speakers.
|
|
@@ -44,6 +45,12 @@ This standard applies to API docs, README updates, release notes, and technical
|
|
|
44
45
|
4. Rewrite and reorder content when flow is weak.
|
|
45
46
|
5. Keep explanations short by default; expand only when complexity requires it.
|
|
46
47
|
|
|
48
|
+
### Scope Severity and Merge Behavior
|
|
49
|
+
1. Scope style guidance controls readability and consistency.
|
|
50
|
+
2. Style baseline findings are advisory by default and must not block endpoint-change commits that already include accurate docs/spec updates.
|
|
51
|
+
3. Hard blockers remain contract failures: missing same-commit docs sync, incorrect schema, missing required responses, or factual inaccuracies.
|
|
52
|
+
4. If style polish is still needed, open a follow-up task instead of delaying the contract update.
|
|
53
|
+
|
|
47
54
|
### Non-Negotiables
|
|
48
55
|
1. No emoji in formal artifacts.
|
|
49
56
|
2. Avoid AI cliches and buzzwords: delve, leverage, robust, utilize, seamless.
|
|
@@ -197,6 +204,9 @@ This schema MUST be documented in OpenAPI as a reusable component (`#/components
|
|
|
197
204
|
```
|
|
198
205
|
Endpoint changed + docs NOT updated = PR REJECTED.
|
|
199
206
|
|
|
207
|
+
This sync rule has higher priority than style polish timing.
|
|
208
|
+
Do not delay same-commit documentation sync only to iterate writing tone.
|
|
209
|
+
|
|
200
210
|
The spec is a contract. If the contract is wrong, consumers will break.
|
|
201
211
|
"I'll update the docs later" means "the docs will never be updated."
|
|
202
212
|
```
|
|
@@ -73,13 +73,15 @@
|
|
|
73
73
|
"Run npm run benchmark:detection to regenerate detection benchmark output.",
|
|
74
74
|
"Run npm run benchmark:gate to validate benchmark anti-regression thresholds.",
|
|
75
75
|
"Run npm run benchmark:intelligence to validate benchmark watchlist freshness.",
|
|
76
|
-
"Run npm run benchmark:bundle to emit a reproducible benchmark evidence bundle."
|
|
76
|
+
"Run npm run benchmark:bundle to emit a reproducible benchmark evidence bundle.",
|
|
77
|
+
"Run npm run benchmark:writer-judge to emit writer-judge side-by-side matrix output."
|
|
77
78
|
],
|
|
78
79
|
"commandExamples": [
|
|
79
80
|
"npm run benchmark:detection",
|
|
80
81
|
"npm run benchmark:gate",
|
|
81
82
|
"npm run benchmark:intelligence",
|
|
82
83
|
"npm run benchmark:bundle",
|
|
84
|
+
"npm run benchmark:writer-judge",
|
|
83
85
|
"node ./scripts/benchmark-evidence-bundle.mjs --stdout-only"
|
|
84
86
|
]
|
|
85
87
|
}
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
{
|
|
2
|
+
"version": "1.0.0",
|
|
3
|
+
"phase": "v2.5.1",
|
|
4
|
+
"blindReviewMode": true,
|
|
5
|
+
"writerLane": {
|
|
6
|
+
"models": [
|
|
7
|
+
{
|
|
8
|
+
"id": "writer-copilot-balanced",
|
|
9
|
+
"provider": "github-copilot",
|
|
10
|
+
"profile": "balanced"
|
|
11
|
+
},
|
|
12
|
+
{
|
|
13
|
+
"id": "writer-claude-architect",
|
|
14
|
+
"provider": "anthropic",
|
|
15
|
+
"profile": "architect"
|
|
16
|
+
},
|
|
17
|
+
{
|
|
18
|
+
"id": "writer-gemini-ops",
|
|
19
|
+
"provider": "google",
|
|
20
|
+
"profile": "operations"
|
|
21
|
+
}
|
|
22
|
+
],
|
|
23
|
+
"weights": {
|
|
24
|
+
"quality": 40,
|
|
25
|
+
"efficiency": 20,
|
|
26
|
+
"reliability": 25,
|
|
27
|
+
"freshness": 15
|
|
28
|
+
},
|
|
29
|
+
"scenarioMultipliers": {
|
|
30
|
+
"planning": 1,
|
|
31
|
+
"refactor": 1.02,
|
|
32
|
+
"security": 1.03,
|
|
33
|
+
"delivery": 0.98
|
|
34
|
+
}
|
|
35
|
+
},
|
|
36
|
+
"judgeLane": {
|
|
37
|
+
"models": [
|
|
38
|
+
{
|
|
39
|
+
"id": "judge-claude-audit",
|
|
40
|
+
"provider": "anthropic",
|
|
41
|
+
"profile": "audit"
|
|
42
|
+
},
|
|
43
|
+
{
|
|
44
|
+
"id": "judge-gpt-risk",
|
|
45
|
+
"provider": "openai",
|
|
46
|
+
"profile": "risk"
|
|
47
|
+
}
|
|
48
|
+
],
|
|
49
|
+
"minimumCompositeScore": 75,
|
|
50
|
+
"leniencyWindow": 2,
|
|
51
|
+
"weights": {
|
|
52
|
+
"clarity": 35,
|
|
53
|
+
"correctness": 35,
|
|
54
|
+
"risk": 20,
|
|
55
|
+
"consistency": 10
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
}
|
|
@@ -0,0 +1,462 @@
|
|
|
1
|
+
{
|
|
2
|
+
"generatedAt": "2026-04-14T06:57:14.623Z",
|
|
3
|
+
"reportName": "benchmark-writer-judge-matrix",
|
|
4
|
+
"phase": "v2.5.1",
|
|
5
|
+
"passed": true,
|
|
6
|
+
"failureCount": 0,
|
|
7
|
+
"methodology": {
|
|
8
|
+
"blindReviewMode": true,
|
|
9
|
+
"writerLaneModelCount": 3,
|
|
10
|
+
"judgeLaneModelCount": 2,
|
|
11
|
+
"scenarioCount": 4,
|
|
12
|
+
"writerWeights": {
|
|
13
|
+
"quality": 40,
|
|
14
|
+
"efficiency": 20,
|
|
15
|
+
"reliability": 25,
|
|
16
|
+
"freshness": 15
|
|
17
|
+
},
|
|
18
|
+
"judgeWeights": {
|
|
19
|
+
"clarity": 35,
|
|
20
|
+
"correctness": 35,
|
|
21
|
+
"risk": 20,
|
|
22
|
+
"consistency": 10
|
|
23
|
+
}
|
|
24
|
+
},
|
|
25
|
+
"coreSignals": {
|
|
26
|
+
"top1Accuracy": 0.9167,
|
|
27
|
+
"manualCorrectionRate": 0.0833,
|
|
28
|
+
"nativeSavingsPercent": 81.52,
|
|
29
|
+
"benchmarkGatePassed": true,
|
|
30
|
+
"benchmarkGateFailureCount": 0,
|
|
31
|
+
"intelligenceFailureCount": 0,
|
|
32
|
+
"staleWatchlistCount": 0,
|
|
33
|
+
"top1AccuracyMet": true,
|
|
34
|
+
"manualCorrectionMet": true
|
|
35
|
+
},
|
|
36
|
+
"writerDirectory": [
|
|
37
|
+
{
|
|
38
|
+
"writerToken": "W1",
|
|
39
|
+
"writerModel": {
|
|
40
|
+
"id": "writer-copilot-balanced",
|
|
41
|
+
"provider": "github-copilot",
|
|
42
|
+
"profile": "balanced"
|
|
43
|
+
},
|
|
44
|
+
"averageCompositeScore": 93.7
|
|
45
|
+
},
|
|
46
|
+
{
|
|
47
|
+
"writerToken": "W2",
|
|
48
|
+
"writerModel": {
|
|
49
|
+
"id": "writer-claude-architect",
|
|
50
|
+
"provider": "anthropic",
|
|
51
|
+
"profile": "architect"
|
|
52
|
+
},
|
|
53
|
+
"averageCompositeScore": 92.8
|
|
54
|
+
},
|
|
55
|
+
{
|
|
56
|
+
"writerToken": "W3",
|
|
57
|
+
"writerModel": {
|
|
58
|
+
"id": "writer-gemini-ops",
|
|
59
|
+
"provider": "google",
|
|
60
|
+
"profile": "operations"
|
|
61
|
+
},
|
|
62
|
+
"averageCompositeScore": 92.65
|
|
63
|
+
}
|
|
64
|
+
],
|
|
65
|
+
"comparisonMatrix": [
|
|
66
|
+
{
|
|
67
|
+
"scenarioId": "planning",
|
|
68
|
+
"scenarioCategory": "planning",
|
|
69
|
+
"writerToken": "W1",
|
|
70
|
+
"writerModelId": null,
|
|
71
|
+
"judgeModelId": "judge-claude-audit",
|
|
72
|
+
"blindPairId": "planning:W1:judge-claude-audit",
|
|
73
|
+
"writerCompositeScore": 92.12,
|
|
74
|
+
"judgeCompositeScore": 94.12,
|
|
75
|
+
"scoreThreshold": 75,
|
|
76
|
+
"leniencyWindow": 2,
|
|
77
|
+
"meetsScoreThreshold": true,
|
|
78
|
+
"meetsCoreSignals": true,
|
|
79
|
+
"verdict": "pass"
|
|
80
|
+
},
|
|
81
|
+
{
|
|
82
|
+
"scenarioId": "planning",
|
|
83
|
+
"scenarioCategory": "planning",
|
|
84
|
+
"writerToken": "W1",
|
|
85
|
+
"writerModelId": null,
|
|
86
|
+
"judgeModelId": "judge-gpt-risk",
|
|
87
|
+
"blindPairId": "planning:W1:judge-gpt-risk",
|
|
88
|
+
"writerCompositeScore": 92.12,
|
|
89
|
+
"judgeCompositeScore": 94.12,
|
|
90
|
+
"scoreThreshold": 75,
|
|
91
|
+
"leniencyWindow": 2,
|
|
92
|
+
"meetsScoreThreshold": true,
|
|
93
|
+
"meetsCoreSignals": true,
|
|
94
|
+
"verdict": "pass"
|
|
95
|
+
},
|
|
96
|
+
{
|
|
97
|
+
"scenarioId": "refactor",
|
|
98
|
+
"scenarioCategory": "refactor",
|
|
99
|
+
"writerToken": "W1",
|
|
100
|
+
"writerModelId": null,
|
|
101
|
+
"judgeModelId": "judge-claude-audit",
|
|
102
|
+
"blindPairId": "refactor:W1:judge-claude-audit",
|
|
103
|
+
"writerCompositeScore": 95.26,
|
|
104
|
+
"judgeCompositeScore": 93.26,
|
|
105
|
+
"scoreThreshold": 75,
|
|
106
|
+
"leniencyWindow": 2,
|
|
107
|
+
"meetsScoreThreshold": true,
|
|
108
|
+
"meetsCoreSignals": true,
|
|
109
|
+
"verdict": "pass"
|
|
110
|
+
},
|
|
111
|
+
{
|
|
112
|
+
"scenarioId": "refactor",
|
|
113
|
+
"scenarioCategory": "refactor",
|
|
114
|
+
"writerToken": "W1",
|
|
115
|
+
"writerModelId": null,
|
|
116
|
+
"judgeModelId": "judge-gpt-risk",
|
|
117
|
+
"blindPairId": "refactor:W1:judge-gpt-risk",
|
|
118
|
+
"writerCompositeScore": 95.26,
|
|
119
|
+
"judgeCompositeScore": 93.26,
|
|
120
|
+
"scoreThreshold": 75,
|
|
121
|
+
"leniencyWindow": 2,
|
|
122
|
+
"meetsScoreThreshold": true,
|
|
123
|
+
"meetsCoreSignals": true,
|
|
124
|
+
"verdict": "pass"
|
|
125
|
+
},
|
|
126
|
+
{
|
|
127
|
+
"scenarioId": "security",
|
|
128
|
+
"scenarioCategory": "security",
|
|
129
|
+
"writerToken": "W1",
|
|
130
|
+
"writerModelId": null,
|
|
131
|
+
"judgeModelId": "judge-claude-audit",
|
|
132
|
+
"blindPairId": "security:W1:judge-claude-audit",
|
|
133
|
+
"writerCompositeScore": 94.42,
|
|
134
|
+
"judgeCompositeScore": 92.42,
|
|
135
|
+
"scoreThreshold": 75,
|
|
136
|
+
"leniencyWindow": 2,
|
|
137
|
+
"meetsScoreThreshold": true,
|
|
138
|
+
"meetsCoreSignals": true,
|
|
139
|
+
"verdict": "pass"
|
|
140
|
+
},
|
|
141
|
+
{
|
|
142
|
+
"scenarioId": "security",
|
|
143
|
+
"scenarioCategory": "security",
|
|
144
|
+
"writerToken": "W1",
|
|
145
|
+
"writerModelId": null,
|
|
146
|
+
"judgeModelId": "judge-gpt-risk",
|
|
147
|
+
"blindPairId": "security:W1:judge-gpt-risk",
|
|
148
|
+
"writerCompositeScore": 94.42,
|
|
149
|
+
"judgeCompositeScore": 93.42,
|
|
150
|
+
"scoreThreshold": 75,
|
|
151
|
+
"leniencyWindow": 2,
|
|
152
|
+
"meetsScoreThreshold": true,
|
|
153
|
+
"meetsCoreSignals": true,
|
|
154
|
+
"verdict": "pass"
|
|
155
|
+
},
|
|
156
|
+
{
|
|
157
|
+
"scenarioId": "delivery",
|
|
158
|
+
"scenarioCategory": "delivery",
|
|
159
|
+
"writerToken": "W1",
|
|
160
|
+
"writerModelId": null,
|
|
161
|
+
"judgeModelId": "judge-claude-audit",
|
|
162
|
+
"blindPairId": "delivery:W1:judge-claude-audit",
|
|
163
|
+
"writerCompositeScore": 92.99,
|
|
164
|
+
"judgeCompositeScore": 93.99,
|
|
165
|
+
"scoreThreshold": 75,
|
|
166
|
+
"leniencyWindow": 2,
|
|
167
|
+
"meetsScoreThreshold": true,
|
|
168
|
+
"meetsCoreSignals": true,
|
|
169
|
+
"verdict": "pass"
|
|
170
|
+
},
|
|
171
|
+
{
|
|
172
|
+
"scenarioId": "delivery",
|
|
173
|
+
"scenarioCategory": "delivery",
|
|
174
|
+
"writerToken": "W1",
|
|
175
|
+
"writerModelId": null,
|
|
176
|
+
"judgeModelId": "judge-gpt-risk",
|
|
177
|
+
"blindPairId": "delivery:W1:judge-gpt-risk",
|
|
178
|
+
"writerCompositeScore": 92.99,
|
|
179
|
+
"judgeCompositeScore": 92.99,
|
|
180
|
+
"scoreThreshold": 75,
|
|
181
|
+
"leniencyWindow": 2,
|
|
182
|
+
"meetsScoreThreshold": true,
|
|
183
|
+
"meetsCoreSignals": true,
|
|
184
|
+
"verdict": "pass"
|
|
185
|
+
},
|
|
186
|
+
{
|
|
187
|
+
"scenarioId": "planning",
|
|
188
|
+
"scenarioCategory": "planning",
|
|
189
|
+
"writerToken": "W2",
|
|
190
|
+
"writerModelId": null,
|
|
191
|
+
"judgeModelId": "judge-claude-audit",
|
|
192
|
+
"blindPairId": "planning:W2:judge-claude-audit",
|
|
193
|
+
"writerCompositeScore": 93.12,
|
|
194
|
+
"judgeCompositeScore": 94.12,
|
|
195
|
+
"scoreThreshold": 75,
|
|
196
|
+
"leniencyWindow": 2,
|
|
197
|
+
"meetsScoreThreshold": true,
|
|
198
|
+
"meetsCoreSignals": true,
|
|
199
|
+
"verdict": "pass"
|
|
200
|
+
},
|
|
201
|
+
{
|
|
202
|
+
"scenarioId": "planning",
|
|
203
|
+
"scenarioCategory": "planning",
|
|
204
|
+
"writerToken": "W2",
|
|
205
|
+
"writerModelId": null,
|
|
206
|
+
"judgeModelId": "judge-gpt-risk",
|
|
207
|
+
"blindPairId": "planning:W2:judge-gpt-risk",
|
|
208
|
+
"writerCompositeScore": 93.12,
|
|
209
|
+
"judgeCompositeScore": 94.12,
|
|
210
|
+
"scoreThreshold": 75,
|
|
211
|
+
"leniencyWindow": 2,
|
|
212
|
+
"meetsScoreThreshold": true,
|
|
213
|
+
"meetsCoreSignals": true,
|
|
214
|
+
"verdict": "pass"
|
|
215
|
+
},
|
|
216
|
+
{
|
|
217
|
+
"scenarioId": "refactor",
|
|
218
|
+
"scenarioCategory": "refactor",
|
|
219
|
+
"writerToken": "W2",
|
|
220
|
+
"writerModelId": null,
|
|
221
|
+
"judgeModelId": "judge-claude-audit",
|
|
222
|
+
"blindPairId": "refactor:W2:judge-claude-audit",
|
|
223
|
+
"writerCompositeScore": 93.06,
|
|
224
|
+
"judgeCompositeScore": 95.06,
|
|
225
|
+
"scoreThreshold": 75,
|
|
226
|
+
"leniencyWindow": 2,
|
|
227
|
+
"meetsScoreThreshold": true,
|
|
228
|
+
"meetsCoreSignals": true,
|
|
229
|
+
"verdict": "pass"
|
|
230
|
+
},
|
|
231
|
+
{
|
|
232
|
+
"scenarioId": "refactor",
|
|
233
|
+
"scenarioCategory": "refactor",
|
|
234
|
+
"writerToken": "W2",
|
|
235
|
+
"writerModelId": null,
|
|
236
|
+
"judgeModelId": "judge-gpt-risk",
|
|
237
|
+
"blindPairId": "refactor:W2:judge-gpt-risk",
|
|
238
|
+
"writerCompositeScore": 93.06,
|
|
239
|
+
"judgeCompositeScore": 95.06,
|
|
240
|
+
"scoreThreshold": 75,
|
|
241
|
+
"leniencyWindow": 2,
|
|
242
|
+
"meetsScoreThreshold": true,
|
|
243
|
+
"meetsCoreSignals": true,
|
|
244
|
+
"verdict": "pass"
|
|
245
|
+
},
|
|
246
|
+
{
|
|
247
|
+
"scenarioId": "security",
|
|
248
|
+
"scenarioCategory": "security",
|
|
249
|
+
"writerToken": "W2",
|
|
250
|
+
"writerModelId": null,
|
|
251
|
+
"judgeModelId": "judge-claude-audit",
|
|
252
|
+
"blindPairId": "security:W2:judge-claude-audit",
|
|
253
|
+
"writerCompositeScore": 91.82,
|
|
254
|
+
"judgeCompositeScore": 90.82,
|
|
255
|
+
"scoreThreshold": 75,
|
|
256
|
+
"leniencyWindow": 2,
|
|
257
|
+
"meetsScoreThreshold": true,
|
|
258
|
+
"meetsCoreSignals": true,
|
|
259
|
+
"verdict": "pass"
|
|
260
|
+
},
|
|
261
|
+
{
|
|
262
|
+
"scenarioId": "security",
|
|
263
|
+
"scenarioCategory": "security",
|
|
264
|
+
"writerToken": "W2",
|
|
265
|
+
"writerModelId": null,
|
|
266
|
+
"judgeModelId": "judge-gpt-risk",
|
|
267
|
+
"blindPairId": "security:W2:judge-gpt-risk",
|
|
268
|
+
"writerCompositeScore": 91.82,
|
|
269
|
+
"judgeCompositeScore": 89.82,
|
|
270
|
+
"scoreThreshold": 75,
|
|
271
|
+
"leniencyWindow": 2,
|
|
272
|
+
"meetsScoreThreshold": true,
|
|
273
|
+
"meetsCoreSignals": true,
|
|
274
|
+
"verdict": "pass"
|
|
275
|
+
},
|
|
276
|
+
{
|
|
277
|
+
"scenarioId": "delivery",
|
|
278
|
+
"scenarioCategory": "delivery",
|
|
279
|
+
"writerToken": "W2",
|
|
280
|
+
"writerModelId": null,
|
|
281
|
+
"judgeModelId": "judge-claude-audit",
|
|
282
|
+
"blindPairId": "delivery:W2:judge-claude-audit",
|
|
283
|
+
"writerCompositeScore": 93.19,
|
|
284
|
+
"judgeCompositeScore": 93.19,
|
|
285
|
+
"scoreThreshold": 75,
|
|
286
|
+
"leniencyWindow": 2,
|
|
287
|
+
"meetsScoreThreshold": true,
|
|
288
|
+
"meetsCoreSignals": true,
|
|
289
|
+
"verdict": "pass"
|
|
290
|
+
},
|
|
291
|
+
{
|
|
292
|
+
"scenarioId": "delivery",
|
|
293
|
+
"scenarioCategory": "delivery",
|
|
294
|
+
"writerToken": "W2",
|
|
295
|
+
"writerModelId": null,
|
|
296
|
+
"judgeModelId": "judge-gpt-risk",
|
|
297
|
+
"blindPairId": "delivery:W2:judge-gpt-risk",
|
|
298
|
+
"writerCompositeScore": 93.19,
|
|
299
|
+
"judgeCompositeScore": 94.19,
|
|
300
|
+
"scoreThreshold": 75,
|
|
301
|
+
"leniencyWindow": 2,
|
|
302
|
+
"meetsScoreThreshold": true,
|
|
303
|
+
"meetsCoreSignals": true,
|
|
304
|
+
"verdict": "pass"
|
|
305
|
+
},
|
|
306
|
+
{
|
|
307
|
+
"scenarioId": "planning",
|
|
308
|
+
"scenarioCategory": "planning",
|
|
309
|
+
"writerToken": "W3",
|
|
310
|
+
"writerModelId": null,
|
|
311
|
+
"judgeModelId": "judge-claude-audit",
|
|
312
|
+
"blindPairId": "planning:W3:judge-claude-audit",
|
|
313
|
+
"writerCompositeScore": 93.27,
|
|
314
|
+
"judgeCompositeScore": 93.27,
|
|
315
|
+
"scoreThreshold": 75,
|
|
316
|
+
"leniencyWindow": 2,
|
|
317
|
+
"meetsScoreThreshold": true,
|
|
318
|
+
"meetsCoreSignals": true,
|
|
319
|
+
"verdict": "pass"
|
|
320
|
+
},
|
|
321
|
+
{
|
|
322
|
+
"scenarioId": "planning",
|
|
323
|
+
"scenarioCategory": "planning",
|
|
324
|
+
"writerToken": "W3",
|
|
325
|
+
"writerModelId": null,
|
|
326
|
+
"judgeModelId": "judge-gpt-risk",
|
|
327
|
+
"blindPairId": "planning:W3:judge-gpt-risk",
|
|
328
|
+
"writerCompositeScore": 93.27,
|
|
329
|
+
"judgeCompositeScore": 93.27,
|
|
330
|
+
"scoreThreshold": 75,
|
|
331
|
+
"leniencyWindow": 2,
|
|
332
|
+
"meetsScoreThreshold": true,
|
|
333
|
+
"meetsCoreSignals": true,
|
|
334
|
+
"verdict": "pass"
|
|
335
|
+
},
|
|
336
|
+
{
|
|
337
|
+
"scenarioId": "refactor",
|
|
338
|
+
"scenarioCategory": "refactor",
|
|
339
|
+
"writerToken": "W3",
|
|
340
|
+
"writerModelId": null,
|
|
341
|
+
"judgeModelId": "judge-claude-audit",
|
|
342
|
+
"blindPairId": "refactor:W3:judge-claude-audit",
|
|
343
|
+
"writerCompositeScore": 94.01,
|
|
344
|
+
"judgeCompositeScore": 95.01,
|
|
345
|
+
"scoreThreshold": 75,
|
|
346
|
+
"leniencyWindow": 2,
|
|
347
|
+
"meetsScoreThreshold": true,
|
|
348
|
+
"meetsCoreSignals": true,
|
|
349
|
+
"verdict": "pass"
|
|
350
|
+
},
|
|
351
|
+
{
|
|
352
|
+
"scenarioId": "refactor",
|
|
353
|
+
"scenarioCategory": "refactor",
|
|
354
|
+
"writerToken": "W3",
|
|
355
|
+
"writerModelId": null,
|
|
356
|
+
"judgeModelId": "judge-gpt-risk",
|
|
357
|
+
"blindPairId": "refactor:W3:judge-gpt-risk",
|
|
358
|
+
"writerCompositeScore": 94.01,
|
|
359
|
+
"judgeCompositeScore": 95.01,
|
|
360
|
+
"scoreThreshold": 75,
|
|
361
|
+
"leniencyWindow": 2,
|
|
362
|
+
"meetsScoreThreshold": true,
|
|
363
|
+
"meetsCoreSignals": true,
|
|
364
|
+
"verdict": "pass"
|
|
365
|
+
},
|
|
366
|
+
{
|
|
367
|
+
"scenarioId": "security",
|
|
368
|
+
"scenarioCategory": "security",
|
|
369
|
+
"writerToken": "W3",
|
|
370
|
+
"writerModelId": null,
|
|
371
|
+
"judgeModelId": "judge-claude-audit",
|
|
372
|
+
"blindPairId": "security:W3:judge-claude-audit",
|
|
373
|
+
"writerCompositeScore": 92.37,
|
|
374
|
+
"judgeCompositeScore": 92.37,
|
|
375
|
+
"scoreThreshold": 75,
|
|
376
|
+
"leniencyWindow": 2,
|
|
377
|
+
"meetsScoreThreshold": true,
|
|
378
|
+
"meetsCoreSignals": true,
|
|
379
|
+
"verdict": "pass"
|
|
380
|
+
},
|
|
381
|
+
{
|
|
382
|
+
"scenarioId": "security",
|
|
383
|
+
"scenarioCategory": "security",
|
|
384
|
+
"writerToken": "W3",
|
|
385
|
+
"writerModelId": null,
|
|
386
|
+
"judgeModelId": "judge-gpt-risk",
|
|
387
|
+
"blindPairId": "security:W3:judge-gpt-risk",
|
|
388
|
+
"writerCompositeScore": 92.37,
|
|
389
|
+
"judgeCompositeScore": 94.37,
|
|
390
|
+
"scoreThreshold": 75,
|
|
391
|
+
"leniencyWindow": 2,
|
|
392
|
+
"meetsScoreThreshold": true,
|
|
393
|
+
"meetsCoreSignals": true,
|
|
394
|
+
"verdict": "pass"
|
|
395
|
+
},
|
|
396
|
+
{
|
|
397
|
+
"scenarioId": "delivery",
|
|
398
|
+
"scenarioCategory": "delivery",
|
|
399
|
+
"writerToken": "W3",
|
|
400
|
+
"writerModelId": null,
|
|
401
|
+
"judgeModelId": "judge-claude-audit",
|
|
402
|
+
"blindPairId": "delivery:W3:judge-claude-audit",
|
|
403
|
+
"writerCompositeScore": 90.94,
|
|
404
|
+
"judgeCompositeScore": 89.94,
|
|
405
|
+
"scoreThreshold": 75,
|
|
406
|
+
"leniencyWindow": 2,
|
|
407
|
+
"meetsScoreThreshold": true,
|
|
408
|
+
"meetsCoreSignals": true,
|
|
409
|
+
"verdict": "pass"
|
|
410
|
+
},
|
|
411
|
+
{
|
|
412
|
+
"scenarioId": "delivery",
|
|
413
|
+
"scenarioCategory": "delivery",
|
|
414
|
+
"writerToken": "W3",
|
|
415
|
+
"writerModelId": null,
|
|
416
|
+
"judgeModelId": "judge-gpt-risk",
|
|
417
|
+
"blindPairId": "delivery:W3:judge-gpt-risk",
|
|
418
|
+
"writerCompositeScore": 90.94,
|
|
419
|
+
"judgeCompositeScore": 92.94,
|
|
420
|
+
"scoreThreshold": 75,
|
|
421
|
+
"leniencyWindow": 2,
|
|
422
|
+
"meetsScoreThreshold": true,
|
|
423
|
+
"meetsCoreSignals": true,
|
|
424
|
+
"verdict": "pass"
|
|
425
|
+
}
|
|
426
|
+
],
|
|
427
|
+
"summary": {
|
|
428
|
+
"passCount": 24,
|
|
429
|
+
"failCount": 0,
|
|
430
|
+
"passRatePercent": 100
|
|
431
|
+
},
|
|
432
|
+
"executions": [
|
|
433
|
+
{
|
|
434
|
+
"scriptPath": "scripts/detection-benchmark.mjs",
|
|
435
|
+
"exitCode": 0,
|
|
436
|
+
"parseError": null,
|
|
437
|
+
"reportName": null,
|
|
438
|
+
"passed": null
|
|
439
|
+
},
|
|
440
|
+
{
|
|
441
|
+
"scriptPath": "scripts/token-optimization-benchmark.mjs",
|
|
442
|
+
"exitCode": 0,
|
|
443
|
+
"parseError": null,
|
|
444
|
+
"reportName": "token-optimization-benchmark",
|
|
445
|
+
"passed": null
|
|
446
|
+
},
|
|
447
|
+
{
|
|
448
|
+
"scriptPath": "scripts/benchmark-gate.mjs",
|
|
449
|
+
"exitCode": 0,
|
|
450
|
+
"parseError": null,
|
|
451
|
+
"reportName": "benchmark-gate",
|
|
452
|
+
"passed": true
|
|
453
|
+
},
|
|
454
|
+
{
|
|
455
|
+
"scriptPath": "scripts/benchmark-intelligence.mjs",
|
|
456
|
+
"exitCode": 0,
|
|
457
|
+
"parseError": null,
|
|
458
|
+
"reportName": "benchmark-intelligence",
|
|
459
|
+
"passed": true
|
|
460
|
+
}
|
|
461
|
+
]
|
|
462
|
+
}
|