@ryuenn3123/agentic-senior-core 3.0.36 → 3.0.38

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. package/.agent-context/prompts/bootstrap-design.md +109 -113
  2. package/.agent-context/rules/frontend-architecture.md +92 -104
  3. package/.agent-context/state/README.md +26 -0
  4. package/.cursor/mcp.json +10 -0
  5. package/.cursor/rules/agentic-senior-core.mdc +48 -0
  6. package/.cursorrules +22 -88
  7. package/.gemini/instructions.md +25 -16
  8. package/.github/copilot-instructions.md +25 -16
  9. package/.github/instructions/agentic-senior-core.instructions.md +47 -0
  10. package/.instructions.md +98 -207
  11. package/.windsurf/rules/agentic-senior-core.md +43 -0
  12. package/.windsurfrules +22 -88
  13. package/AGENTS.md +23 -26
  14. package/CLAUDE.md +43 -0
  15. package/CONTRIBUTING.md +5 -2
  16. package/GEMINI.md +43 -0
  17. package/README.md +24 -7
  18. package/lib/cli/backup.mjs +4 -4
  19. package/lib/cli/commands/init/project-context.mjs +101 -0
  20. package/lib/cli/commands/init/runtime-environment.mjs +59 -0
  21. package/lib/cli/commands/init/setup-decisions.mjs +83 -0
  22. package/lib/cli/commands/init.mjs +33 -250
  23. package/lib/cli/commands/optimize.mjs +1 -1
  24. package/lib/cli/commands/upgrade.mjs +32 -7
  25. package/lib/cli/compiler.mjs +59 -17
  26. package/lib/cli/constants.mjs +5 -0
  27. package/lib/cli/detector.mjs +4 -0
  28. package/lib/cli/preflight.mjs +3 -3
  29. package/lib/cli/project-scaffolder/design-contract/validation.mjs +789 -0
  30. package/lib/cli/project-scaffolder/design-contract.mjs +137 -861
  31. package/lib/cli/project-scaffolder/prompt-builders.mjs +73 -81
  32. package/lib/cli/utils/filesystem.mjs +79 -0
  33. package/lib/cli/utils/managed-surface.mjs +237 -0
  34. package/lib/cli/utils/prompting.mjs +44 -0
  35. package/lib/cli/utils.mjs +33 -335
  36. package/package.json +21 -2
  37. package/scripts/clean-local-artifacts.mjs +76 -0
  38. package/scripts/docs-quality-drift-report.mjs +5 -0
  39. package/scripts/frontend-usability-audit.mjs +23 -19
  40. package/scripts/governance-weekly-report.mjs +37 -15
  41. package/scripts/single-source-lazy-loading-audit.mjs +24 -0
  42. package/scripts/sync-thin-adapters.mjs +99 -129
  43. package/scripts/v3-purge-audit.mjs +5 -0
  44. package/scripts/validate/config.mjs +21 -0
  45. package/scripts/validate/coverage-checks.mjs +55 -0
  46. package/.agent-context/marketplace/trust-tiers.json +0 -114
  47. package/.agent-context/state/benchmark-analysis.json +0 -431
  48. package/.agent-context/state/benchmark-evidence-bundle.json +0 -1040
  49. package/.agent-context/state/benchmark-history.json +0 -75
  50. package/.agent-context/state/benchmark-trend-report.csv +0 -5
  51. package/.agent-context/state/benchmark-trend-report.json +0 -140
  52. package/.agent-context/state/benchmark-writer-judge-matrix.json +0 -462
  53. package/.agent-context/state/memory-continuity-benchmark.json +0 -132
  54. package/.agent-context/state/onboarding-report.json +0 -102
  55. package/.agent-context/state/quality-trend-report.json +0 -89
  56. package/.agent-context/state/token-optimization-benchmark.json +0 -130
  57. package/.agent-context/state/weekly-governance-report.json +0 -329
  58. package/lib/cli/compatibility.mjs +0 -124
@@ -1,75 +0,0 @@
1
- {
2
- "generatedAt": "2026-04-17T03:20:15.400Z",
3
- "reportName": "benchmark-history",
4
- "maxEntries": 90,
5
- "history": [
6
- {
7
- "generatedAt": "2026-04-17T02:54:01.239Z",
8
- "releaseVersion": "2.0.26",
9
- "fixtureCount": 12,
10
- "top1Accuracy": 0.9167,
11
- "manualCorrectionRate": 0.0833,
12
- "benchmarkGatePassed": true,
13
- "intelligencePassed": true,
14
- "staleWatchlistCount": 0,
15
- "reliabilityPassed": true,
16
- "reliabilityRiskLevel": "monitor",
17
- "incorrectDetectionRate": 0.0833,
18
- "lowConfidenceRate": 0.0833,
19
- "vulnerabilityTotal": null,
20
- "criticalVulnerabilityCount": null,
21
- "forbiddenContentPassed": true
22
- },
23
- {
24
- "generatedAt": "2026-04-17T02:54:57.419Z",
25
- "releaseVersion": "2.0.26",
26
- "fixtureCount": 12,
27
- "top1Accuracy": 0.9167,
28
- "manualCorrectionRate": 0.0833,
29
- "benchmarkGatePassed": true,
30
- "intelligencePassed": true,
31
- "staleWatchlistCount": 0,
32
- "reliabilityPassed": true,
33
- "reliabilityRiskLevel": "monitor",
34
- "incorrectDetectionRate": 0.0833,
35
- "lowConfidenceRate": 0.0833,
36
- "vulnerabilityTotal": null,
37
- "criticalVulnerabilityCount": null,
38
- "forbiddenContentPassed": true
39
- },
40
- {
41
- "generatedAt": "2026-04-17T03:19:31.047Z",
42
- "releaseVersion": "2.0.26",
43
- "fixtureCount": 12,
44
- "top1Accuracy": 0.9167,
45
- "manualCorrectionRate": 0.0833,
46
- "benchmarkGatePassed": true,
47
- "intelligencePassed": true,
48
- "staleWatchlistCount": 0,
49
- "reliabilityPassed": true,
50
- "reliabilityRiskLevel": "monitor",
51
- "incorrectDetectionRate": 0.0833,
52
- "lowConfidenceRate": 0.0833,
53
- "vulnerabilityTotal": null,
54
- "criticalVulnerabilityCount": null,
55
- "forbiddenContentPassed": true
56
- },
57
- {
58
- "generatedAt": "2026-04-17T03:20:15.400Z",
59
- "releaseVersion": "2.0.26",
60
- "fixtureCount": 12,
61
- "top1Accuracy": 0.9167,
62
- "manualCorrectionRate": 0.0833,
63
- "benchmarkGatePassed": true,
64
- "intelligencePassed": true,
65
- "staleWatchlistCount": 0,
66
- "reliabilityPassed": true,
67
- "reliabilityRiskLevel": "monitor",
68
- "incorrectDetectionRate": 0.0833,
69
- "lowConfidenceRate": 0.0833,
70
- "vulnerabilityTotal": null,
71
- "criticalVulnerabilityCount": null,
72
- "forbiddenContentPassed": true
73
- }
74
- ]
75
- }
@@ -1,5 +0,0 @@
1
- snapshotIndex,generatedAt,releaseVersion,top1Accuracy,manualCorrectionRate,incorrectDetectionRate,lowConfidenceRate,staleWatchlistCount,vulnerabilityTotal,criticalVulnerabilityCount,benchmarkGatePassed,intelligencePassed,reliabilityPassed,reliabilityRiskLevel
2
- "1","2026-04-17T02:54:01.239Z","2.0.26","0.9167","0.0833","0.0833","0.0833","0",,,"true","true","true","monitor"
3
- "2","2026-04-17T02:54:57.419Z","2.0.26","0.9167","0.0833","0.0833","0.0833","0",,,"true","true","true","monitor"
4
- "3","2026-04-17T03:19:31.047Z","2.0.26","0.9167","0.0833","0.0833","0.0833","0",,,"true","true","true","monitor"
5
- "4","2026-04-17T03:20:15.400Z","2.0.26","0.9167","0.0833","0.0833","0.0833","0",,,"true","true","true","monitor"
@@ -1,140 +0,0 @@
1
- {
2
- "generatedAt": "2026-04-17T03:20:15.400Z",
3
- "reportName": "benchmark-trend-report",
4
- "releaseVersion": "2.0.26",
5
- "historyCount": 4,
6
- "releaseDelta": {
7
- "currentReleaseVersion": "2.0.26",
8
- "previousReleaseVersion": "2.0.26",
9
- "comparedSnapshot": {
10
- "currentGeneratedAt": "2026-04-17T03:20:15.400Z",
11
- "previousGeneratedAt": "2026-04-17T03:19:31.047Z"
12
- },
13
- "top1AccuracyDelta": 0,
14
- "manualCorrectionRateDelta": 0,
15
- "staleWatchlistCountDelta": 0,
16
- "vulnerabilityTotalDelta": 0,
17
- "summary": [
18
- "top1Accuracy: +0",
19
- "manualCorrectionRate: +0",
20
- "staleWatchlistCount: +0",
21
- "vulnerabilityTotal: +0"
22
- ]
23
- },
24
- "trendTable": [
25
- {
26
- "snapshotIndex": 1,
27
- "generatedAt": "2026-04-17T02:54:01.239Z",
28
- "releaseVersion": "2.0.26",
29
- "top1Accuracy": 0.9167,
30
- "manualCorrectionRate": 0.0833,
31
- "incorrectDetectionRate": 0.0833,
32
- "lowConfidenceRate": 0.0833,
33
- "staleWatchlistCount": 0,
34
- "vulnerabilityTotal": null,
35
- "criticalVulnerabilityCount": null,
36
- "benchmarkGatePassed": true,
37
- "intelligencePassed": true,
38
- "reliabilityPassed": true,
39
- "reliabilityRiskLevel": "monitor"
40
- },
41
- {
42
- "snapshotIndex": 2,
43
- "generatedAt": "2026-04-17T02:54:57.419Z",
44
- "releaseVersion": "2.0.26",
45
- "top1Accuracy": 0.9167,
46
- "manualCorrectionRate": 0.0833,
47
- "incorrectDetectionRate": 0.0833,
48
- "lowConfidenceRate": 0.0833,
49
- "staleWatchlistCount": 0,
50
- "vulnerabilityTotal": null,
51
- "criticalVulnerabilityCount": null,
52
- "benchmarkGatePassed": true,
53
- "intelligencePassed": true,
54
- "reliabilityPassed": true,
55
- "reliabilityRiskLevel": "monitor"
56
- },
57
- {
58
- "snapshotIndex": 3,
59
- "generatedAt": "2026-04-17T03:19:31.047Z",
60
- "releaseVersion": "2.0.26",
61
- "top1Accuracy": 0.9167,
62
- "manualCorrectionRate": 0.0833,
63
- "incorrectDetectionRate": 0.0833,
64
- "lowConfidenceRate": 0.0833,
65
- "staleWatchlistCount": 0,
66
- "vulnerabilityTotal": null,
67
- "criticalVulnerabilityCount": null,
68
- "benchmarkGatePassed": true,
69
- "intelligencePassed": true,
70
- "reliabilityPassed": true,
71
- "reliabilityRiskLevel": "monitor"
72
- },
73
- {
74
- "snapshotIndex": 4,
75
- "generatedAt": "2026-04-17T03:20:15.400Z",
76
- "releaseVersion": "2.0.26",
77
- "top1Accuracy": 0.9167,
78
- "manualCorrectionRate": 0.0833,
79
- "incorrectDetectionRate": 0.0833,
80
- "lowConfidenceRate": 0.0833,
81
- "staleWatchlistCount": 0,
82
- "vulnerabilityTotal": null,
83
- "criticalVulnerabilityCount": null,
84
- "benchmarkGatePassed": true,
85
- "intelligencePassed": true,
86
- "reliabilityPassed": true,
87
- "reliabilityRiskLevel": "monitor"
88
- }
89
- ],
90
- "chartSeries": {
91
- "generatedAt": [
92
- "2026-04-17T02:54:01.239Z",
93
- "2026-04-17T02:54:57.419Z",
94
- "2026-04-17T03:19:31.047Z",
95
- "2026-04-17T03:20:15.400Z"
96
- ],
97
- "top1Accuracy": [
98
- 0.9167,
99
- 0.9167,
100
- 0.9167,
101
- 0.9167
102
- ],
103
- "manualCorrectionRate": [
104
- 0.0833,
105
- 0.0833,
106
- 0.0833,
107
- 0.0833
108
- ],
109
- "incorrectDetectionRate": [
110
- 0.0833,
111
- 0.0833,
112
- 0.0833,
113
- 0.0833
114
- ],
115
- "lowConfidenceRate": [
116
- 0.0833,
117
- 0.0833,
118
- 0.0833,
119
- 0.0833
120
- ],
121
- "staleWatchlistCount": [
122
- 0,
123
- 0,
124
- 0,
125
- 0
126
- ],
127
- "vulnerabilityTotal": [
128
- null,
129
- null,
130
- null,
131
- null
132
- ]
133
- },
134
- "artifacts": {
135
- "historyPath": ".agent-context/state/benchmark-history.json",
136
- "jsonPath": ".agent-context/state/benchmark-trend-report.json",
137
- "csvPath": ".agent-context/state/benchmark-trend-report.csv",
138
- "writeMode": "stdout-and-file"
139
- }
140
- }
@@ -1,462 +0,0 @@
1
- {
2
- "generatedAt": "2026-04-14T06:57:14.623Z",
3
- "reportName": "benchmark-writer-judge-matrix",
4
- "phase": "v2.5.1",
5
- "passed": true,
6
- "failureCount": 0,
7
- "methodology": {
8
- "blindReviewMode": true,
9
- "writerLaneModelCount": 3,
10
- "judgeLaneModelCount": 2,
11
- "scenarioCount": 4,
12
- "writerWeights": {
13
- "quality": 40,
14
- "efficiency": 20,
15
- "reliability": 25,
16
- "freshness": 15
17
- },
18
- "judgeWeights": {
19
- "clarity": 35,
20
- "correctness": 35,
21
- "risk": 20,
22
- "consistency": 10
23
- }
24
- },
25
- "coreSignals": {
26
- "top1Accuracy": 0.9167,
27
- "manualCorrectionRate": 0.0833,
28
- "nativeSavingsPercent": 81.52,
29
- "benchmarkGatePassed": true,
30
- "benchmarkGateFailureCount": 0,
31
- "intelligenceFailureCount": 0,
32
- "staleWatchlistCount": 0,
33
- "top1AccuracyMet": true,
34
- "manualCorrectionMet": true
35
- },
36
- "writerDirectory": [
37
- {
38
- "writerToken": "W1",
39
- "writerModel": {
40
- "id": "writer-copilot-balanced",
41
- "provider": "github-copilot",
42
- "profile": "balanced"
43
- },
44
- "averageCompositeScore": 93.7
45
- },
46
- {
47
- "writerToken": "W2",
48
- "writerModel": {
49
- "id": "writer-claude-architect",
50
- "provider": "anthropic",
51
- "profile": "architect"
52
- },
53
- "averageCompositeScore": 92.8
54
- },
55
- {
56
- "writerToken": "W3",
57
- "writerModel": {
58
- "id": "writer-gemini-ops",
59
- "provider": "google",
60
- "profile": "operations"
61
- },
62
- "averageCompositeScore": 92.65
63
- }
64
- ],
65
- "comparisonMatrix": [
66
- {
67
- "scenarioId": "planning",
68
- "scenarioCategory": "planning",
69
- "writerToken": "W1",
70
- "writerModelId": null,
71
- "judgeModelId": "judge-claude-audit",
72
- "blindPairId": "planning:W1:judge-claude-audit",
73
- "writerCompositeScore": 92.12,
74
- "judgeCompositeScore": 94.12,
75
- "scoreThreshold": 75,
76
- "leniencyWindow": 2,
77
- "meetsScoreThreshold": true,
78
- "meetsCoreSignals": true,
79
- "verdict": "pass"
80
- },
81
- {
82
- "scenarioId": "planning",
83
- "scenarioCategory": "planning",
84
- "writerToken": "W1",
85
- "writerModelId": null,
86
- "judgeModelId": "judge-gpt-risk",
87
- "blindPairId": "planning:W1:judge-gpt-risk",
88
- "writerCompositeScore": 92.12,
89
- "judgeCompositeScore": 94.12,
90
- "scoreThreshold": 75,
91
- "leniencyWindow": 2,
92
- "meetsScoreThreshold": true,
93
- "meetsCoreSignals": true,
94
- "verdict": "pass"
95
- },
96
- {
97
- "scenarioId": "refactor",
98
- "scenarioCategory": "refactor",
99
- "writerToken": "W1",
100
- "writerModelId": null,
101
- "judgeModelId": "judge-claude-audit",
102
- "blindPairId": "refactor:W1:judge-claude-audit",
103
- "writerCompositeScore": 95.26,
104
- "judgeCompositeScore": 93.26,
105
- "scoreThreshold": 75,
106
- "leniencyWindow": 2,
107
- "meetsScoreThreshold": true,
108
- "meetsCoreSignals": true,
109
- "verdict": "pass"
110
- },
111
- {
112
- "scenarioId": "refactor",
113
- "scenarioCategory": "refactor",
114
- "writerToken": "W1",
115
- "writerModelId": null,
116
- "judgeModelId": "judge-gpt-risk",
117
- "blindPairId": "refactor:W1:judge-gpt-risk",
118
- "writerCompositeScore": 95.26,
119
- "judgeCompositeScore": 93.26,
120
- "scoreThreshold": 75,
121
- "leniencyWindow": 2,
122
- "meetsScoreThreshold": true,
123
- "meetsCoreSignals": true,
124
- "verdict": "pass"
125
- },
126
- {
127
- "scenarioId": "security",
128
- "scenarioCategory": "security",
129
- "writerToken": "W1",
130
- "writerModelId": null,
131
- "judgeModelId": "judge-claude-audit",
132
- "blindPairId": "security:W1:judge-claude-audit",
133
- "writerCompositeScore": 94.42,
134
- "judgeCompositeScore": 92.42,
135
- "scoreThreshold": 75,
136
- "leniencyWindow": 2,
137
- "meetsScoreThreshold": true,
138
- "meetsCoreSignals": true,
139
- "verdict": "pass"
140
- },
141
- {
142
- "scenarioId": "security",
143
- "scenarioCategory": "security",
144
- "writerToken": "W1",
145
- "writerModelId": null,
146
- "judgeModelId": "judge-gpt-risk",
147
- "blindPairId": "security:W1:judge-gpt-risk",
148
- "writerCompositeScore": 94.42,
149
- "judgeCompositeScore": 93.42,
150
- "scoreThreshold": 75,
151
- "leniencyWindow": 2,
152
- "meetsScoreThreshold": true,
153
- "meetsCoreSignals": true,
154
- "verdict": "pass"
155
- },
156
- {
157
- "scenarioId": "delivery",
158
- "scenarioCategory": "delivery",
159
- "writerToken": "W1",
160
- "writerModelId": null,
161
- "judgeModelId": "judge-claude-audit",
162
- "blindPairId": "delivery:W1:judge-claude-audit",
163
- "writerCompositeScore": 92.99,
164
- "judgeCompositeScore": 93.99,
165
- "scoreThreshold": 75,
166
- "leniencyWindow": 2,
167
- "meetsScoreThreshold": true,
168
- "meetsCoreSignals": true,
169
- "verdict": "pass"
170
- },
171
- {
172
- "scenarioId": "delivery",
173
- "scenarioCategory": "delivery",
174
- "writerToken": "W1",
175
- "writerModelId": null,
176
- "judgeModelId": "judge-gpt-risk",
177
- "blindPairId": "delivery:W1:judge-gpt-risk",
178
- "writerCompositeScore": 92.99,
179
- "judgeCompositeScore": 92.99,
180
- "scoreThreshold": 75,
181
- "leniencyWindow": 2,
182
- "meetsScoreThreshold": true,
183
- "meetsCoreSignals": true,
184
- "verdict": "pass"
185
- },
186
- {
187
- "scenarioId": "planning",
188
- "scenarioCategory": "planning",
189
- "writerToken": "W2",
190
- "writerModelId": null,
191
- "judgeModelId": "judge-claude-audit",
192
- "blindPairId": "planning:W2:judge-claude-audit",
193
- "writerCompositeScore": 93.12,
194
- "judgeCompositeScore": 94.12,
195
- "scoreThreshold": 75,
196
- "leniencyWindow": 2,
197
- "meetsScoreThreshold": true,
198
- "meetsCoreSignals": true,
199
- "verdict": "pass"
200
- },
201
- {
202
- "scenarioId": "planning",
203
- "scenarioCategory": "planning",
204
- "writerToken": "W2",
205
- "writerModelId": null,
206
- "judgeModelId": "judge-gpt-risk",
207
- "blindPairId": "planning:W2:judge-gpt-risk",
208
- "writerCompositeScore": 93.12,
209
- "judgeCompositeScore": 94.12,
210
- "scoreThreshold": 75,
211
- "leniencyWindow": 2,
212
- "meetsScoreThreshold": true,
213
- "meetsCoreSignals": true,
214
- "verdict": "pass"
215
- },
216
- {
217
- "scenarioId": "refactor",
218
- "scenarioCategory": "refactor",
219
- "writerToken": "W2",
220
- "writerModelId": null,
221
- "judgeModelId": "judge-claude-audit",
222
- "blindPairId": "refactor:W2:judge-claude-audit",
223
- "writerCompositeScore": 93.06,
224
- "judgeCompositeScore": 95.06,
225
- "scoreThreshold": 75,
226
- "leniencyWindow": 2,
227
- "meetsScoreThreshold": true,
228
- "meetsCoreSignals": true,
229
- "verdict": "pass"
230
- },
231
- {
232
- "scenarioId": "refactor",
233
- "scenarioCategory": "refactor",
234
- "writerToken": "W2",
235
- "writerModelId": null,
236
- "judgeModelId": "judge-gpt-risk",
237
- "blindPairId": "refactor:W2:judge-gpt-risk",
238
- "writerCompositeScore": 93.06,
239
- "judgeCompositeScore": 95.06,
240
- "scoreThreshold": 75,
241
- "leniencyWindow": 2,
242
- "meetsScoreThreshold": true,
243
- "meetsCoreSignals": true,
244
- "verdict": "pass"
245
- },
246
- {
247
- "scenarioId": "security",
248
- "scenarioCategory": "security",
249
- "writerToken": "W2",
250
- "writerModelId": null,
251
- "judgeModelId": "judge-claude-audit",
252
- "blindPairId": "security:W2:judge-claude-audit",
253
- "writerCompositeScore": 91.82,
254
- "judgeCompositeScore": 90.82,
255
- "scoreThreshold": 75,
256
- "leniencyWindow": 2,
257
- "meetsScoreThreshold": true,
258
- "meetsCoreSignals": true,
259
- "verdict": "pass"
260
- },
261
- {
262
- "scenarioId": "security",
263
- "scenarioCategory": "security",
264
- "writerToken": "W2",
265
- "writerModelId": null,
266
- "judgeModelId": "judge-gpt-risk",
267
- "blindPairId": "security:W2:judge-gpt-risk",
268
- "writerCompositeScore": 91.82,
269
- "judgeCompositeScore": 89.82,
270
- "scoreThreshold": 75,
271
- "leniencyWindow": 2,
272
- "meetsScoreThreshold": true,
273
- "meetsCoreSignals": true,
274
- "verdict": "pass"
275
- },
276
- {
277
- "scenarioId": "delivery",
278
- "scenarioCategory": "delivery",
279
- "writerToken": "W2",
280
- "writerModelId": null,
281
- "judgeModelId": "judge-claude-audit",
282
- "blindPairId": "delivery:W2:judge-claude-audit",
283
- "writerCompositeScore": 93.19,
284
- "judgeCompositeScore": 93.19,
285
- "scoreThreshold": 75,
286
- "leniencyWindow": 2,
287
- "meetsScoreThreshold": true,
288
- "meetsCoreSignals": true,
289
- "verdict": "pass"
290
- },
291
- {
292
- "scenarioId": "delivery",
293
- "scenarioCategory": "delivery",
294
- "writerToken": "W2",
295
- "writerModelId": null,
296
- "judgeModelId": "judge-gpt-risk",
297
- "blindPairId": "delivery:W2:judge-gpt-risk",
298
- "writerCompositeScore": 93.19,
299
- "judgeCompositeScore": 94.19,
300
- "scoreThreshold": 75,
301
- "leniencyWindow": 2,
302
- "meetsScoreThreshold": true,
303
- "meetsCoreSignals": true,
304
- "verdict": "pass"
305
- },
306
- {
307
- "scenarioId": "planning",
308
- "scenarioCategory": "planning",
309
- "writerToken": "W3",
310
- "writerModelId": null,
311
- "judgeModelId": "judge-claude-audit",
312
- "blindPairId": "planning:W3:judge-claude-audit",
313
- "writerCompositeScore": 93.27,
314
- "judgeCompositeScore": 93.27,
315
- "scoreThreshold": 75,
316
- "leniencyWindow": 2,
317
- "meetsScoreThreshold": true,
318
- "meetsCoreSignals": true,
319
- "verdict": "pass"
320
- },
321
- {
322
- "scenarioId": "planning",
323
- "scenarioCategory": "planning",
324
- "writerToken": "W3",
325
- "writerModelId": null,
326
- "judgeModelId": "judge-gpt-risk",
327
- "blindPairId": "planning:W3:judge-gpt-risk",
328
- "writerCompositeScore": 93.27,
329
- "judgeCompositeScore": 93.27,
330
- "scoreThreshold": 75,
331
- "leniencyWindow": 2,
332
- "meetsScoreThreshold": true,
333
- "meetsCoreSignals": true,
334
- "verdict": "pass"
335
- },
336
- {
337
- "scenarioId": "refactor",
338
- "scenarioCategory": "refactor",
339
- "writerToken": "W3",
340
- "writerModelId": null,
341
- "judgeModelId": "judge-claude-audit",
342
- "blindPairId": "refactor:W3:judge-claude-audit",
343
- "writerCompositeScore": 94.01,
344
- "judgeCompositeScore": 95.01,
345
- "scoreThreshold": 75,
346
- "leniencyWindow": 2,
347
- "meetsScoreThreshold": true,
348
- "meetsCoreSignals": true,
349
- "verdict": "pass"
350
- },
351
- {
352
- "scenarioId": "refactor",
353
- "scenarioCategory": "refactor",
354
- "writerToken": "W3",
355
- "writerModelId": null,
356
- "judgeModelId": "judge-gpt-risk",
357
- "blindPairId": "refactor:W3:judge-gpt-risk",
358
- "writerCompositeScore": 94.01,
359
- "judgeCompositeScore": 95.01,
360
- "scoreThreshold": 75,
361
- "leniencyWindow": 2,
362
- "meetsScoreThreshold": true,
363
- "meetsCoreSignals": true,
364
- "verdict": "pass"
365
- },
366
- {
367
- "scenarioId": "security",
368
- "scenarioCategory": "security",
369
- "writerToken": "W3",
370
- "writerModelId": null,
371
- "judgeModelId": "judge-claude-audit",
372
- "blindPairId": "security:W3:judge-claude-audit",
373
- "writerCompositeScore": 92.37,
374
- "judgeCompositeScore": 92.37,
375
- "scoreThreshold": 75,
376
- "leniencyWindow": 2,
377
- "meetsScoreThreshold": true,
378
- "meetsCoreSignals": true,
379
- "verdict": "pass"
380
- },
381
- {
382
- "scenarioId": "security",
383
- "scenarioCategory": "security",
384
- "writerToken": "W3",
385
- "writerModelId": null,
386
- "judgeModelId": "judge-gpt-risk",
387
- "blindPairId": "security:W3:judge-gpt-risk",
388
- "writerCompositeScore": 92.37,
389
- "judgeCompositeScore": 94.37,
390
- "scoreThreshold": 75,
391
- "leniencyWindow": 2,
392
- "meetsScoreThreshold": true,
393
- "meetsCoreSignals": true,
394
- "verdict": "pass"
395
- },
396
- {
397
- "scenarioId": "delivery",
398
- "scenarioCategory": "delivery",
399
- "writerToken": "W3",
400
- "writerModelId": null,
401
- "judgeModelId": "judge-claude-audit",
402
- "blindPairId": "delivery:W3:judge-claude-audit",
403
- "writerCompositeScore": 90.94,
404
- "judgeCompositeScore": 89.94,
405
- "scoreThreshold": 75,
406
- "leniencyWindow": 2,
407
- "meetsScoreThreshold": true,
408
- "meetsCoreSignals": true,
409
- "verdict": "pass"
410
- },
411
- {
412
- "scenarioId": "delivery",
413
- "scenarioCategory": "delivery",
414
- "writerToken": "W3",
415
- "writerModelId": null,
416
- "judgeModelId": "judge-gpt-risk",
417
- "blindPairId": "delivery:W3:judge-gpt-risk",
418
- "writerCompositeScore": 90.94,
419
- "judgeCompositeScore": 92.94,
420
- "scoreThreshold": 75,
421
- "leniencyWindow": 2,
422
- "meetsScoreThreshold": true,
423
- "meetsCoreSignals": true,
424
- "verdict": "pass"
425
- }
426
- ],
427
- "summary": {
428
- "passCount": 24,
429
- "failCount": 0,
430
- "passRatePercent": 100
431
- },
432
- "executions": [
433
- {
434
- "scriptPath": "scripts/detection-benchmark.mjs",
435
- "exitCode": 0,
436
- "parseError": null,
437
- "reportName": null,
438
- "passed": null
439
- },
440
- {
441
- "scriptPath": "scripts/token-optimization-benchmark.mjs",
442
- "exitCode": 0,
443
- "parseError": null,
444
- "reportName": "token-optimization-benchmark",
445
- "passed": null
446
- },
447
- {
448
- "scriptPath": "scripts/benchmark-gate.mjs",
449
- "exitCode": 0,
450
- "parseError": null,
451
- "reportName": "benchmark-gate",
452
- "passed": true
453
- },
454
- {
455
- "scriptPath": "scripts/benchmark-intelligence.mjs",
456
- "exitCode": 0,
457
- "parseError": null,
458
- "reportName": "benchmark-intelligence",
459
- "passed": true
460
- }
461
- ]
462
- }