@ryuenn3123/agentic-senior-core 2.0.26 → 2.0.27
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agent-context/state/benchmark-evidence-bundle.json +672 -22
- package/.agent-context/state/benchmark-history.json +75 -0
- package/.agent-context/state/benchmark-trend-report.csv +5 -0
- package/.agent-context/state/benchmark-trend-report.json +140 -0
- package/.agent-context/state/benchmark-watchlist.json +3 -3
- package/.agent-context/state/memory-adapter-contract.json +52 -0
- package/.agent-context/state/memory-continuity-benchmark.json +132 -0
- package/.agent-context/state/memory-schema-v1.json +88 -0
- package/.cursorrules +1 -1
- package/.windsurfrules +1 -1
- package/README.md +22 -0
- package/lib/cli/memory-continuity.mjs +266 -0
- package/package.json +2 -1
- package/scripts/benchmark-evidence-bundle.mjs +493 -16
- package/scripts/memory-continuity-benchmark.mjs +322 -0
- package/scripts/validate.mjs +3 -0
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
{
|
|
2
|
-
"generatedAt": "2026-04-
|
|
2
|
+
"generatedAt": "2026-04-17T03:20:15.400Z",
|
|
3
3
|
"reportName": "benchmark-evidence-bundle",
|
|
4
|
-
"phase": "v2.5.
|
|
4
|
+
"phase": "v2.5.2",
|
|
5
|
+
"releaseVersion": "2.0.26",
|
|
5
6
|
"passed": true,
|
|
6
7
|
"failureCount": 0,
|
|
7
8
|
"methodology": {
|
|
@@ -13,20 +14,25 @@
|
|
|
13
14
|
"shellNotes": "PowerShell and POSIX shells are supported; prefer portable commands for benchmark reruns."
|
|
14
15
|
},
|
|
15
16
|
"scenarioCount": 4,
|
|
16
|
-
"commandCount":
|
|
17
|
+
"commandCount": 8
|
|
17
18
|
},
|
|
18
19
|
"rerunInstructions": [
|
|
19
20
|
"Run npm run benchmark:detection to regenerate detection benchmark output.",
|
|
20
21
|
"Run npm run benchmark:gate to validate benchmark anti-regression thresholds.",
|
|
21
22
|
"Run npm run benchmark:intelligence to validate benchmark watchlist freshness.",
|
|
22
|
-
"Run npm run benchmark:bundle to emit a reproducible benchmark evidence bundle."
|
|
23
|
+
"Run npm run benchmark:bundle to emit a reproducible benchmark evidence bundle.",
|
|
24
|
+
"Run npm run benchmark:writer-judge to emit writer-judge side-by-side matrix output.",
|
|
25
|
+
"Run npm run benchmark:continuity to validate cross-agent memory hydration, privacy redaction, and token-savings behavior."
|
|
23
26
|
],
|
|
24
27
|
"commandExamples": [
|
|
25
28
|
"npm run benchmark:detection",
|
|
26
29
|
"npm run benchmark:gate",
|
|
27
30
|
"npm run benchmark:intelligence",
|
|
28
31
|
"npm run benchmark:bundle",
|
|
29
|
-
"
|
|
32
|
+
"npm run benchmark:writer-judge",
|
|
33
|
+
"node ./scripts/benchmark-evidence-bundle.mjs --stdout-only",
|
|
34
|
+
"npm run benchmark:continuity",
|
|
35
|
+
"node ./scripts/memory-continuity-benchmark.mjs --stdout-only"
|
|
30
36
|
],
|
|
31
37
|
"rawInputs": {
|
|
32
38
|
"scenarios": [
|
|
@@ -103,19 +109,205 @@
|
|
|
103
109
|
{
|
|
104
110
|
"repository": "sickn33/antigravity-awesome-skills",
|
|
105
111
|
"owner": "core-architecture",
|
|
106
|
-
"lastReviewedAt": "2026-04-
|
|
112
|
+
"lastReviewedAt": "2026-04-17"
|
|
107
113
|
},
|
|
108
114
|
{
|
|
109
115
|
"repository": "github/awesome-copilot",
|
|
110
116
|
"owner": "core-architecture",
|
|
111
|
-
"lastReviewedAt": "2026-04-
|
|
117
|
+
"lastReviewedAt": "2026-04-17"
|
|
112
118
|
},
|
|
113
119
|
{
|
|
114
120
|
"repository": "MiniMax-AI/skills",
|
|
115
121
|
"owner": "frontend-governance",
|
|
116
|
-
"lastReviewedAt": "2026-04-
|
|
122
|
+
"lastReviewedAt": "2026-04-17"
|
|
117
123
|
}
|
|
118
|
-
]
|
|
124
|
+
],
|
|
125
|
+
"memorySchema": {
|
|
126
|
+
"schemaVersion": "1.0.0",
|
|
127
|
+
"schemaName": "cross-agent-memory-observation",
|
|
128
|
+
"description": "Provider-agnostic schema for persistent memory observations shared across coding agents and IDE hosts.",
|
|
129
|
+
"requiredFields": [
|
|
130
|
+
"id",
|
|
131
|
+
"projectId",
|
|
132
|
+
"sessionId",
|
|
133
|
+
"adapterId",
|
|
134
|
+
"eventType",
|
|
135
|
+
"timestamp",
|
|
136
|
+
"title",
|
|
137
|
+
"summary",
|
|
138
|
+
"detail",
|
|
139
|
+
"privacy"
|
|
140
|
+
],
|
|
141
|
+
"fieldDefinitions": {
|
|
142
|
+
"id": {
|
|
143
|
+
"type": "string",
|
|
144
|
+
"description": "Stable unique observation identifier."
|
|
145
|
+
},
|
|
146
|
+
"projectId": {
|
|
147
|
+
"type": "string",
|
|
148
|
+
"description": "Repository or workspace identifier."
|
|
149
|
+
},
|
|
150
|
+
"sessionId": {
|
|
151
|
+
"type": "string",
|
|
152
|
+
"description": "Source session identifier from host adapter."
|
|
153
|
+
},
|
|
154
|
+
"adapterId": {
|
|
155
|
+
"type": "string",
|
|
156
|
+
"allowedValues": [
|
|
157
|
+
"claude-code",
|
|
158
|
+
"gemini-cli",
|
|
159
|
+
"vscode-chat",
|
|
160
|
+
"custom"
|
|
161
|
+
],
|
|
162
|
+
"description": "Host adapter that captured this observation."
|
|
163
|
+
},
|
|
164
|
+
"eventType": {
|
|
165
|
+
"type": "string",
|
|
166
|
+
"allowedValues": [
|
|
167
|
+
"prompt",
|
|
168
|
+
"tool-use",
|
|
169
|
+
"decision",
|
|
170
|
+
"summary",
|
|
171
|
+
"issue",
|
|
172
|
+
"context"
|
|
173
|
+
],
|
|
174
|
+
"description": "Observation type for retrieval filtering."
|
|
175
|
+
},
|
|
176
|
+
"timestamp": {
|
|
177
|
+
"type": "string",
|
|
178
|
+
"format": "date-time",
|
|
179
|
+
"description": "ISO timestamp when observation was captured."
|
|
180
|
+
},
|
|
181
|
+
"title": {
|
|
182
|
+
"type": "string",
|
|
183
|
+
"description": "Compact human-readable headline."
|
|
184
|
+
},
|
|
185
|
+
"summary": {
|
|
186
|
+
"type": "string",
|
|
187
|
+
"description": "Compact session-start payload used for progressive disclosure."
|
|
188
|
+
},
|
|
189
|
+
"detail": {
|
|
190
|
+
"type": "string",
|
|
191
|
+
"description": "Expanded observation text fetched on demand."
|
|
192
|
+
},
|
|
193
|
+
"tags": {
|
|
194
|
+
"type": "array",
|
|
195
|
+
"items": "string",
|
|
196
|
+
"description": "Optional normalized tags for query refinement."
|
|
197
|
+
},
|
|
198
|
+
"privacy": {
|
|
199
|
+
"type": "object",
|
|
200
|
+
"requiredFields": [
|
|
201
|
+
"level",
|
|
202
|
+
"redactionApplied"
|
|
203
|
+
],
|
|
204
|
+
"fieldDefinitions": {
|
|
205
|
+
"level": {
|
|
206
|
+
"type": "string",
|
|
207
|
+
"allowedValues": [
|
|
208
|
+
"public",
|
|
209
|
+
"internal",
|
|
210
|
+
"restricted"
|
|
211
|
+
],
|
|
212
|
+
"description": "Privacy classification level."
|
|
213
|
+
},
|
|
214
|
+
"redactionApplied": {
|
|
215
|
+
"type": "boolean",
|
|
216
|
+
"description": "Indicates whether privacy sanitization modified payload content."
|
|
217
|
+
},
|
|
218
|
+
"redactionReasons": {
|
|
219
|
+
"type": "array",
|
|
220
|
+
"items": "string",
|
|
221
|
+
"description": "Redaction reason tags such as private-tag or token-like-value."
|
|
222
|
+
}
|
|
223
|
+
}
|
|
224
|
+
}
|
|
225
|
+
},
|
|
226
|
+
"retrievalContract": {
|
|
227
|
+
"sessionStartPayload": [
|
|
228
|
+
"id",
|
|
229
|
+
"adapterId",
|
|
230
|
+
"eventType",
|
|
231
|
+
"timestamp",
|
|
232
|
+
"title",
|
|
233
|
+
"summary"
|
|
234
|
+
],
|
|
235
|
+
"onDemandPayload": [
|
|
236
|
+
"detail",
|
|
237
|
+
"tags",
|
|
238
|
+
"privacy"
|
|
239
|
+
],
|
|
240
|
+
"progressiveDisclosure": true
|
|
241
|
+
}
|
|
242
|
+
},
|
|
243
|
+
"memoryAdapterContract": {
|
|
244
|
+
"schemaVersion": "1.0.0",
|
|
245
|
+
"contractName": "cross-agent-memory-adapter",
|
|
246
|
+
"description": "Adapter contract for ingesting and retrieving shared memory observations across IDE hosts.",
|
|
247
|
+
"requiredAdapters": [
|
|
248
|
+
"claude-code",
|
|
249
|
+
"gemini-cli",
|
|
250
|
+
"vscode-chat"
|
|
251
|
+
],
|
|
252
|
+
"requiredOperations": {
|
|
253
|
+
"ingestion": [
|
|
254
|
+
"captureObservation",
|
|
255
|
+
"captureSessionSummary"
|
|
256
|
+
],
|
|
257
|
+
"retrieval": [
|
|
258
|
+
"searchIndex",
|
|
259
|
+
"getTimeline",
|
|
260
|
+
"getObservations"
|
|
261
|
+
],
|
|
262
|
+
"privacy": [
|
|
263
|
+
"applyPrivateTagRedaction",
|
|
264
|
+
"applyInlineSecretRedaction"
|
|
265
|
+
]
|
|
266
|
+
},
|
|
267
|
+
"adapters": [
|
|
268
|
+
{
|
|
269
|
+
"adapterId": "claude-code",
|
|
270
|
+
"hostType": "plugin-hooks",
|
|
271
|
+
"status": "pilot-ready",
|
|
272
|
+
"ingestionEvents": [
|
|
273
|
+
"SessionStart",
|
|
274
|
+
"UserPromptSubmit",
|
|
275
|
+
"PostToolUse",
|
|
276
|
+
"Stop",
|
|
277
|
+
"SessionEnd"
|
|
278
|
+
],
|
|
279
|
+
"retrievalMode": "mcp-tools"
|
|
280
|
+
},
|
|
281
|
+
{
|
|
282
|
+
"adapterId": "gemini-cli",
|
|
283
|
+
"hostType": "plugin-hooks",
|
|
284
|
+
"status": "pilot-ready",
|
|
285
|
+
"ingestionEvents": [
|
|
286
|
+
"session_start",
|
|
287
|
+
"prompt_submit",
|
|
288
|
+
"post_tool",
|
|
289
|
+
"session_end"
|
|
290
|
+
],
|
|
291
|
+
"retrievalMode": "mcp-tools"
|
|
292
|
+
},
|
|
293
|
+
{
|
|
294
|
+
"adapterId": "vscode-chat",
|
|
295
|
+
"hostType": "chat-customization-plugin",
|
|
296
|
+
"status": "pilot-ready",
|
|
297
|
+
"ingestionEvents": [
|
|
298
|
+
"chatStart",
|
|
299
|
+
"promptSubmit",
|
|
300
|
+
"postToolUse",
|
|
301
|
+
"chatEnd"
|
|
302
|
+
],
|
|
303
|
+
"retrievalMode": "mcp-tools"
|
|
304
|
+
}
|
|
305
|
+
],
|
|
306
|
+
"notes": [
|
|
307
|
+
"Web chat hosts are explicitly out of scope for this pilot because local runtime hooks are unavailable.",
|
|
308
|
+
"Adapters should emit provider-agnostic payloads matching .agent-context/state/memory-schema-v1.json."
|
|
309
|
+
]
|
|
310
|
+
}
|
|
119
311
|
},
|
|
120
312
|
"rubric": {
|
|
121
313
|
"benchmarkThresholds": {
|
|
@@ -124,11 +316,329 @@
|
|
|
124
316
|
"maximumTop1AccuracyDrop": 0.02,
|
|
125
317
|
"maximumManualCorrectionIncrease": 0.03
|
|
126
318
|
},
|
|
127
|
-
"intelligenceSlaDays": 14
|
|
319
|
+
"intelligenceSlaDays": 14,
|
|
320
|
+
"reliabilityThresholds": {
|
|
321
|
+
"minimumConfidenceGap": 0.1,
|
|
322
|
+
"maximumLowConfidenceRate": 0.2,
|
|
323
|
+
"maximumIncorrectDetectionRate": 0.1
|
|
324
|
+
},
|
|
325
|
+
"continuityThresholds": {
|
|
326
|
+
"minimumRelevantRecall": 0.8,
|
|
327
|
+
"minimumSessionStartTokenSavingsPercent": 35,
|
|
328
|
+
"maximumUnsafeObservationCount": 0
|
|
329
|
+
}
|
|
330
|
+
},
|
|
331
|
+
"bugIndicators": {
|
|
332
|
+
"incorrectFixtureCount": 1,
|
|
333
|
+
"incorrectDetectionRate": 0.0833,
|
|
334
|
+
"manualCorrectionFixtureCount": 1,
|
|
335
|
+
"manualCorrectionRate": 0.0833,
|
|
336
|
+
"lowConfidenceFixtureCount": 1,
|
|
337
|
+
"lowConfidenceRate": 0.0833,
|
|
338
|
+
"flaggedFixtures": [
|
|
339
|
+
{
|
|
340
|
+
"fixtureName": "mixed-ts-python",
|
|
341
|
+
"confidenceGap": 0.02,
|
|
342
|
+
"detectedStack": "python.md",
|
|
343
|
+
"expectedStack": "typescript.md",
|
|
344
|
+
"isCorrect": false,
|
|
345
|
+
"needsManualCorrection": true
|
|
346
|
+
}
|
|
347
|
+
]
|
|
348
|
+
},
|
|
349
|
+
"reliabilitySignals": {
|
|
350
|
+
"passed": true,
|
|
351
|
+
"failureCount": 0,
|
|
352
|
+
"riskLevel": "monitor",
|
|
353
|
+
"thresholds": {
|
|
354
|
+
"minimumConfidenceGap": 0.1,
|
|
355
|
+
"maximumLowConfidenceRate": 0.2,
|
|
356
|
+
"maximumIncorrectDetectionRate": 0.1
|
|
357
|
+
},
|
|
358
|
+
"metrics": {
|
|
359
|
+
"fixtureCount": 12,
|
|
360
|
+
"incorrectFixtureCount": 1,
|
|
361
|
+
"lowConfidenceFixtureCount": 1,
|
|
362
|
+
"manualCorrectionFixtureCount": 1,
|
|
363
|
+
"incorrectDetectionRate": 0.0833,
|
|
364
|
+
"lowConfidenceRate": 0.0833,
|
|
365
|
+
"manualCorrectionRate": 0.0833
|
|
366
|
+
},
|
|
367
|
+
"checks": [
|
|
368
|
+
{
|
|
369
|
+
"checkName": "incorrect-detection-rate",
|
|
370
|
+
"passed": true,
|
|
371
|
+
"details": "incorrectRate=0.0833 max=0.1"
|
|
372
|
+
},
|
|
373
|
+
{
|
|
374
|
+
"checkName": "low-confidence-rate",
|
|
375
|
+
"passed": true,
|
|
376
|
+
"details": "lowConfidenceRate=0.0833 max=0.2"
|
|
377
|
+
},
|
|
378
|
+
{
|
|
379
|
+
"checkName": "manual-correction-early-warning",
|
|
380
|
+
"passed": true,
|
|
381
|
+
"details": "manualCorrectionRate=0.0833 warningThreshold=0.12"
|
|
382
|
+
}
|
|
383
|
+
],
|
|
384
|
+
"flaggedFixtures": [
|
|
385
|
+
{
|
|
386
|
+
"fixtureName": "mixed-ts-python",
|
|
387
|
+
"confidenceGap": 0.02,
|
|
388
|
+
"detectedStack": "python.md",
|
|
389
|
+
"expectedStack": "typescript.md",
|
|
390
|
+
"isCorrect": false,
|
|
391
|
+
"needsManualCorrection": true
|
|
392
|
+
}
|
|
393
|
+
]
|
|
394
|
+
},
|
|
395
|
+
"securityIndicators": {
|
|
396
|
+
"forbiddenContent": {
|
|
397
|
+
"checkName": "forbidden-content-scan",
|
|
398
|
+
"passed": true,
|
|
399
|
+
"exitCode": 0,
|
|
400
|
+
"details": "No forbidden content detected"
|
|
401
|
+
},
|
|
402
|
+
"vulnerabilityScan": {
|
|
403
|
+
"checkName": "npm-audit-indicator",
|
|
404
|
+
"isAvailable": false,
|
|
405
|
+
"hasKnownVulnerabilities": null,
|
|
406
|
+
"severityCounts": null,
|
|
407
|
+
"exitCode": 1,
|
|
408
|
+
"error": "Payload is empty"
|
|
409
|
+
}
|
|
410
|
+
},
|
|
411
|
+
"releaseDelta": {
|
|
412
|
+
"currentReleaseVersion": "2.0.26",
|
|
413
|
+
"previousReleaseVersion": "2.0.26",
|
|
414
|
+
"comparedSnapshot": {
|
|
415
|
+
"currentGeneratedAt": "2026-04-17T03:20:15.400Z",
|
|
416
|
+
"previousGeneratedAt": "2026-04-17T03:19:31.047Z"
|
|
417
|
+
},
|
|
418
|
+
"top1AccuracyDelta": 0,
|
|
419
|
+
"manualCorrectionRateDelta": 0,
|
|
420
|
+
"staleWatchlistCountDelta": 0,
|
|
421
|
+
"vulnerabilityTotalDelta": 0,
|
|
422
|
+
"summary": [
|
|
423
|
+
"top1Accuracy: +0",
|
|
424
|
+
"manualCorrectionRate: +0",
|
|
425
|
+
"staleWatchlistCount: +0",
|
|
426
|
+
"vulnerabilityTotal: +0"
|
|
427
|
+
]
|
|
428
|
+
},
|
|
429
|
+
"history": [
|
|
430
|
+
{
|
|
431
|
+
"generatedAt": "2026-04-17T02:54:01.239Z",
|
|
432
|
+
"releaseVersion": "2.0.26",
|
|
433
|
+
"fixtureCount": 12,
|
|
434
|
+
"top1Accuracy": 0.9167,
|
|
435
|
+
"manualCorrectionRate": 0.0833,
|
|
436
|
+
"benchmarkGatePassed": true,
|
|
437
|
+
"intelligencePassed": true,
|
|
438
|
+
"staleWatchlistCount": 0,
|
|
439
|
+
"reliabilityPassed": true,
|
|
440
|
+
"reliabilityRiskLevel": "monitor",
|
|
441
|
+
"incorrectDetectionRate": 0.0833,
|
|
442
|
+
"lowConfidenceRate": 0.0833,
|
|
443
|
+
"vulnerabilityTotal": null,
|
|
444
|
+
"criticalVulnerabilityCount": null,
|
|
445
|
+
"forbiddenContentPassed": true
|
|
446
|
+
},
|
|
447
|
+
{
|
|
448
|
+
"generatedAt": "2026-04-17T02:54:57.419Z",
|
|
449
|
+
"releaseVersion": "2.0.26",
|
|
450
|
+
"fixtureCount": 12,
|
|
451
|
+
"top1Accuracy": 0.9167,
|
|
452
|
+
"manualCorrectionRate": 0.0833,
|
|
453
|
+
"benchmarkGatePassed": true,
|
|
454
|
+
"intelligencePassed": true,
|
|
455
|
+
"staleWatchlistCount": 0,
|
|
456
|
+
"reliabilityPassed": true,
|
|
457
|
+
"reliabilityRiskLevel": "monitor",
|
|
458
|
+
"incorrectDetectionRate": 0.0833,
|
|
459
|
+
"lowConfidenceRate": 0.0833,
|
|
460
|
+
"vulnerabilityTotal": null,
|
|
461
|
+
"criticalVulnerabilityCount": null,
|
|
462
|
+
"forbiddenContentPassed": true
|
|
463
|
+
},
|
|
464
|
+
{
|
|
465
|
+
"generatedAt": "2026-04-17T03:19:31.047Z",
|
|
466
|
+
"releaseVersion": "2.0.26",
|
|
467
|
+
"fixtureCount": 12,
|
|
468
|
+
"top1Accuracy": 0.9167,
|
|
469
|
+
"manualCorrectionRate": 0.0833,
|
|
470
|
+
"benchmarkGatePassed": true,
|
|
471
|
+
"intelligencePassed": true,
|
|
472
|
+
"staleWatchlistCount": 0,
|
|
473
|
+
"reliabilityPassed": true,
|
|
474
|
+
"reliabilityRiskLevel": "monitor",
|
|
475
|
+
"incorrectDetectionRate": 0.0833,
|
|
476
|
+
"lowConfidenceRate": 0.0833,
|
|
477
|
+
"vulnerabilityTotal": null,
|
|
478
|
+
"criticalVulnerabilityCount": null,
|
|
479
|
+
"forbiddenContentPassed": true
|
|
480
|
+
},
|
|
481
|
+
{
|
|
482
|
+
"generatedAt": "2026-04-17T03:20:15.400Z",
|
|
483
|
+
"releaseVersion": "2.0.26",
|
|
484
|
+
"fixtureCount": 12,
|
|
485
|
+
"top1Accuracy": 0.9167,
|
|
486
|
+
"manualCorrectionRate": 0.0833,
|
|
487
|
+
"benchmarkGatePassed": true,
|
|
488
|
+
"intelligencePassed": true,
|
|
489
|
+
"staleWatchlistCount": 0,
|
|
490
|
+
"reliabilityPassed": true,
|
|
491
|
+
"reliabilityRiskLevel": "monitor",
|
|
492
|
+
"incorrectDetectionRate": 0.0833,
|
|
493
|
+
"lowConfidenceRate": 0.0833,
|
|
494
|
+
"vulnerabilityTotal": null,
|
|
495
|
+
"criticalVulnerabilityCount": null,
|
|
496
|
+
"forbiddenContentPassed": true
|
|
497
|
+
}
|
|
498
|
+
],
|
|
499
|
+
"trendReport": {
|
|
500
|
+
"generatedAt": "2026-04-17T03:20:15.400Z",
|
|
501
|
+
"reportName": "benchmark-trend-report",
|
|
502
|
+
"releaseVersion": "2.0.26",
|
|
503
|
+
"historyCount": 4,
|
|
504
|
+
"releaseDelta": {
|
|
505
|
+
"currentReleaseVersion": "2.0.26",
|
|
506
|
+
"previousReleaseVersion": "2.0.26",
|
|
507
|
+
"comparedSnapshot": {
|
|
508
|
+
"currentGeneratedAt": "2026-04-17T03:20:15.400Z",
|
|
509
|
+
"previousGeneratedAt": "2026-04-17T03:19:31.047Z"
|
|
510
|
+
},
|
|
511
|
+
"top1AccuracyDelta": 0,
|
|
512
|
+
"manualCorrectionRateDelta": 0,
|
|
513
|
+
"staleWatchlistCountDelta": 0,
|
|
514
|
+
"vulnerabilityTotalDelta": 0,
|
|
515
|
+
"summary": [
|
|
516
|
+
"top1Accuracy: +0",
|
|
517
|
+
"manualCorrectionRate: +0",
|
|
518
|
+
"staleWatchlistCount: +0",
|
|
519
|
+
"vulnerabilityTotal: +0"
|
|
520
|
+
]
|
|
521
|
+
},
|
|
522
|
+
"trendTable": [
|
|
523
|
+
{
|
|
524
|
+
"snapshotIndex": 1,
|
|
525
|
+
"generatedAt": "2026-04-17T02:54:01.239Z",
|
|
526
|
+
"releaseVersion": "2.0.26",
|
|
527
|
+
"top1Accuracy": 0.9167,
|
|
528
|
+
"manualCorrectionRate": 0.0833,
|
|
529
|
+
"incorrectDetectionRate": 0.0833,
|
|
530
|
+
"lowConfidenceRate": 0.0833,
|
|
531
|
+
"staleWatchlistCount": 0,
|
|
532
|
+
"vulnerabilityTotal": null,
|
|
533
|
+
"criticalVulnerabilityCount": null,
|
|
534
|
+
"benchmarkGatePassed": true,
|
|
535
|
+
"intelligencePassed": true,
|
|
536
|
+
"reliabilityPassed": true,
|
|
537
|
+
"reliabilityRiskLevel": "monitor"
|
|
538
|
+
},
|
|
539
|
+
{
|
|
540
|
+
"snapshotIndex": 2,
|
|
541
|
+
"generatedAt": "2026-04-17T02:54:57.419Z",
|
|
542
|
+
"releaseVersion": "2.0.26",
|
|
543
|
+
"top1Accuracy": 0.9167,
|
|
544
|
+
"manualCorrectionRate": 0.0833,
|
|
545
|
+
"incorrectDetectionRate": 0.0833,
|
|
546
|
+
"lowConfidenceRate": 0.0833,
|
|
547
|
+
"staleWatchlistCount": 0,
|
|
548
|
+
"vulnerabilityTotal": null,
|
|
549
|
+
"criticalVulnerabilityCount": null,
|
|
550
|
+
"benchmarkGatePassed": true,
|
|
551
|
+
"intelligencePassed": true,
|
|
552
|
+
"reliabilityPassed": true,
|
|
553
|
+
"reliabilityRiskLevel": "monitor"
|
|
554
|
+
},
|
|
555
|
+
{
|
|
556
|
+
"snapshotIndex": 3,
|
|
557
|
+
"generatedAt": "2026-04-17T03:19:31.047Z",
|
|
558
|
+
"releaseVersion": "2.0.26",
|
|
559
|
+
"top1Accuracy": 0.9167,
|
|
560
|
+
"manualCorrectionRate": 0.0833,
|
|
561
|
+
"incorrectDetectionRate": 0.0833,
|
|
562
|
+
"lowConfidenceRate": 0.0833,
|
|
563
|
+
"staleWatchlistCount": 0,
|
|
564
|
+
"vulnerabilityTotal": null,
|
|
565
|
+
"criticalVulnerabilityCount": null,
|
|
566
|
+
"benchmarkGatePassed": true,
|
|
567
|
+
"intelligencePassed": true,
|
|
568
|
+
"reliabilityPassed": true,
|
|
569
|
+
"reliabilityRiskLevel": "monitor"
|
|
570
|
+
},
|
|
571
|
+
{
|
|
572
|
+
"snapshotIndex": 4,
|
|
573
|
+
"generatedAt": "2026-04-17T03:20:15.400Z",
|
|
574
|
+
"releaseVersion": "2.0.26",
|
|
575
|
+
"top1Accuracy": 0.9167,
|
|
576
|
+
"manualCorrectionRate": 0.0833,
|
|
577
|
+
"incorrectDetectionRate": 0.0833,
|
|
578
|
+
"lowConfidenceRate": 0.0833,
|
|
579
|
+
"staleWatchlistCount": 0,
|
|
580
|
+
"vulnerabilityTotal": null,
|
|
581
|
+
"criticalVulnerabilityCount": null,
|
|
582
|
+
"benchmarkGatePassed": true,
|
|
583
|
+
"intelligencePassed": true,
|
|
584
|
+
"reliabilityPassed": true,
|
|
585
|
+
"reliabilityRiskLevel": "monitor"
|
|
586
|
+
}
|
|
587
|
+
],
|
|
588
|
+
"chartSeries": {
|
|
589
|
+
"generatedAt": [
|
|
590
|
+
"2026-04-17T02:54:01.239Z",
|
|
591
|
+
"2026-04-17T02:54:57.419Z",
|
|
592
|
+
"2026-04-17T03:19:31.047Z",
|
|
593
|
+
"2026-04-17T03:20:15.400Z"
|
|
594
|
+
],
|
|
595
|
+
"top1Accuracy": [
|
|
596
|
+
0.9167,
|
|
597
|
+
0.9167,
|
|
598
|
+
0.9167,
|
|
599
|
+
0.9167
|
|
600
|
+
],
|
|
601
|
+
"manualCorrectionRate": [
|
|
602
|
+
0.0833,
|
|
603
|
+
0.0833,
|
|
604
|
+
0.0833,
|
|
605
|
+
0.0833
|
|
606
|
+
],
|
|
607
|
+
"incorrectDetectionRate": [
|
|
608
|
+
0.0833,
|
|
609
|
+
0.0833,
|
|
610
|
+
0.0833,
|
|
611
|
+
0.0833
|
|
612
|
+
],
|
|
613
|
+
"lowConfidenceRate": [
|
|
614
|
+
0.0833,
|
|
615
|
+
0.0833,
|
|
616
|
+
0.0833,
|
|
617
|
+
0.0833
|
|
618
|
+
],
|
|
619
|
+
"staleWatchlistCount": [
|
|
620
|
+
0,
|
|
621
|
+
0,
|
|
622
|
+
0,
|
|
623
|
+
0
|
|
624
|
+
],
|
|
625
|
+
"vulnerabilityTotal": [
|
|
626
|
+
null,
|
|
627
|
+
null,
|
|
628
|
+
null,
|
|
629
|
+
null
|
|
630
|
+
]
|
|
631
|
+
},
|
|
632
|
+
"artifacts": {
|
|
633
|
+
"historyPath": ".agent-context/state/benchmark-history.json",
|
|
634
|
+
"jsonPath": ".agent-context/state/benchmark-trend-report.json",
|
|
635
|
+
"csvPath": ".agent-context/state/benchmark-trend-report.csv",
|
|
636
|
+
"writeMode": "stdout-and-file"
|
|
637
|
+
}
|
|
128
638
|
},
|
|
129
639
|
"outputs": {
|
|
130
640
|
"detectionBenchmark": {
|
|
131
|
-
"generatedAt": "2026-04-
|
|
641
|
+
"generatedAt": "2026-04-17T03:20:15.113Z",
|
|
132
642
|
"fixtureCount": 12,
|
|
133
643
|
"top1Accuracy": 0.9167,
|
|
134
644
|
"manualCorrectionRate": 0.0833,
|
|
@@ -232,7 +742,7 @@
|
|
|
232
742
|
]
|
|
233
743
|
},
|
|
234
744
|
"benchmarkGate": {
|
|
235
|
-
"generatedAt": "2026-04-
|
|
745
|
+
"generatedAt": "2026-04-17T03:20:15.211Z",
|
|
236
746
|
"gateName": "benchmark-gate",
|
|
237
747
|
"passed": true,
|
|
238
748
|
"failureCount": 0,
|
|
@@ -275,7 +785,7 @@
|
|
|
275
785
|
]
|
|
276
786
|
},
|
|
277
787
|
"benchmarkIntelligence": {
|
|
278
|
-
"generatedAt": "2026-04-
|
|
788
|
+
"generatedAt": "2026-04-17T03:20:15.258Z",
|
|
279
789
|
"reportName": "benchmark-intelligence",
|
|
280
790
|
"passed": true,
|
|
281
791
|
"failureCount": 0,
|
|
@@ -284,22 +794,22 @@
|
|
|
284
794
|
{
|
|
285
795
|
"repository": "sickn33/antigravity-awesome-skills",
|
|
286
796
|
"owner": "core-architecture",
|
|
287
|
-
"lastReviewedAt": "2026-04-
|
|
288
|
-
"ageInDays":
|
|
797
|
+
"lastReviewedAt": "2026-04-17",
|
|
798
|
+
"ageInDays": 0,
|
|
289
799
|
"stale": false
|
|
290
800
|
},
|
|
291
801
|
{
|
|
292
802
|
"repository": "github/awesome-copilot",
|
|
293
803
|
"owner": "core-architecture",
|
|
294
|
-
"lastReviewedAt": "2026-04-
|
|
295
|
-
"ageInDays":
|
|
804
|
+
"lastReviewedAt": "2026-04-17",
|
|
805
|
+
"ageInDays": 0,
|
|
296
806
|
"stale": false
|
|
297
807
|
},
|
|
298
808
|
{
|
|
299
809
|
"repository": "MiniMax-AI/skills",
|
|
300
810
|
"owner": "frontend-governance",
|
|
301
|
-
"lastReviewedAt": "2026-04-
|
|
302
|
-
"ageInDays":
|
|
811
|
+
"lastReviewedAt": "2026-04-17",
|
|
812
|
+
"ageInDays": 0,
|
|
303
813
|
"stale": false
|
|
304
814
|
}
|
|
305
815
|
],
|
|
@@ -332,7 +842,7 @@
|
|
|
332
842
|
"checkName": "review-sla-compliance",
|
|
333
843
|
"repository": "sickn33/antigravity-awesome-skills",
|
|
334
844
|
"passed": true,
|
|
335
|
-
"details": "ageInDays=
|
|
845
|
+
"details": "ageInDays=0 slaDays=14"
|
|
336
846
|
},
|
|
337
847
|
{
|
|
338
848
|
"checkName": "watchlist-owner-defined",
|
|
@@ -344,7 +854,7 @@
|
|
|
344
854
|
"checkName": "review-sla-compliance",
|
|
345
855
|
"repository": "github/awesome-copilot",
|
|
346
856
|
"passed": true,
|
|
347
|
-
"details": "ageInDays=
|
|
857
|
+
"details": "ageInDays=0 slaDays=14"
|
|
348
858
|
},
|
|
349
859
|
{
|
|
350
860
|
"checkName": "watchlist-owner-defined",
|
|
@@ -356,7 +866,139 @@
|
|
|
356
866
|
"checkName": "review-sla-compliance",
|
|
357
867
|
"repository": "MiniMax-AI/skills",
|
|
358
868
|
"passed": true,
|
|
359
|
-
"details": "ageInDays=
|
|
869
|
+
"details": "ageInDays=0 slaDays=14"
|
|
870
|
+
}
|
|
871
|
+
]
|
|
872
|
+
},
|
|
873
|
+
"memoryContinuityBenchmark": {
|
|
874
|
+
"generatedAt": "2026-04-17T03:20:15.324Z",
|
|
875
|
+
"reportName": "memory-continuity-benchmark",
|
|
876
|
+
"schemaVersion": "1.0.0",
|
|
877
|
+
"passed": true,
|
|
878
|
+
"failureCount": 0,
|
|
879
|
+
"thresholds": {
|
|
880
|
+
"minimumRelevantRecall": 0.8,
|
|
881
|
+
"minimumSessionStartTokenSavingsPercent": 35,
|
|
882
|
+
"maximumUnsafeObservationCount": 0
|
|
883
|
+
},
|
|
884
|
+
"adapterCoverage": {
|
|
885
|
+
"requiredAdapterIds": [
|
|
886
|
+
"claude-code",
|
|
887
|
+
"gemini-cli",
|
|
888
|
+
"vscode-chat"
|
|
889
|
+
],
|
|
890
|
+
"availableAdapterIds": [
|
|
891
|
+
"claude-code",
|
|
892
|
+
"gemini-cli",
|
|
893
|
+
"vscode-chat"
|
|
894
|
+
],
|
|
895
|
+
"missingAdapterIds": [],
|
|
896
|
+
"passed": true
|
|
897
|
+
},
|
|
898
|
+
"privacyControls": {
|
|
899
|
+
"redactedObservationCount": 2,
|
|
900
|
+
"privateTagRedactionCount": 1,
|
|
901
|
+
"inlineRedactionCount": 1,
|
|
902
|
+
"unsafeObservationCount": 0
|
|
903
|
+
},
|
|
904
|
+
"continuitySummary": {
|
|
905
|
+
"totalObservationCount": 5,
|
|
906
|
+
"scenarioCount": 3,
|
|
907
|
+
"averageRelevantRecall": 1,
|
|
908
|
+
"averageSessionStartTokenSavingsPercent": 63.17
|
|
909
|
+
},
|
|
910
|
+
"scenarios": [
|
|
911
|
+
{
|
|
912
|
+
"scenarioId": "docker-lane-hydration",
|
|
913
|
+
"query": "what is docker strategy for development and production",
|
|
914
|
+
"expectedObservationIds": [
|
|
915
|
+
"obs-001"
|
|
916
|
+
],
|
|
917
|
+
"indexObservationIds": [
|
|
918
|
+
"obs-001",
|
|
919
|
+
"obs-005",
|
|
920
|
+
"obs-003",
|
|
921
|
+
"obs-002",
|
|
922
|
+
"obs-004"
|
|
923
|
+
],
|
|
924
|
+
"hydratedObservationIds": [
|
|
925
|
+
"obs-001"
|
|
926
|
+
],
|
|
927
|
+
"relevantRecall": 1,
|
|
928
|
+
"fullContextTokenEstimate": 267,
|
|
929
|
+
"sessionStartTokenEstimate": 103,
|
|
930
|
+
"sessionStartTokenSavingsPercent": 61.42
|
|
931
|
+
},
|
|
932
|
+
{
|
|
933
|
+
"scenarioId": "runtime-hydration",
|
|
934
|
+
"query": "which runtime target should we prefer on windows with wsl",
|
|
935
|
+
"expectedObservationIds": [
|
|
936
|
+
"obs-002"
|
|
937
|
+
],
|
|
938
|
+
"indexObservationIds": [
|
|
939
|
+
"obs-002",
|
|
940
|
+
"obs-001",
|
|
941
|
+
"obs-005",
|
|
942
|
+
"obs-004",
|
|
943
|
+
"obs-003"
|
|
944
|
+
],
|
|
945
|
+
"hydratedObservationIds": [
|
|
946
|
+
"obs-002"
|
|
947
|
+
],
|
|
948
|
+
"relevantRecall": 1,
|
|
949
|
+
"fullContextTokenEstimate": 267,
|
|
950
|
+
"sessionStartTokenEstimate": 97,
|
|
951
|
+
"sessionStartTokenSavingsPercent": 63.67
|
|
952
|
+
},
|
|
953
|
+
{
|
|
954
|
+
"scenarioId": "frontend-quality-hydration",
|
|
955
|
+
"query": "show frontend rubric quality decisions",
|
|
956
|
+
"expectedObservationIds": [
|
|
957
|
+
"obs-003"
|
|
958
|
+
],
|
|
959
|
+
"indexObservationIds": [
|
|
960
|
+
"obs-003",
|
|
961
|
+
"obs-005",
|
|
962
|
+
"obs-004",
|
|
963
|
+
"obs-002",
|
|
964
|
+
"obs-001"
|
|
965
|
+
],
|
|
966
|
+
"hydratedObservationIds": [
|
|
967
|
+
"obs-003"
|
|
968
|
+
],
|
|
969
|
+
"relevantRecall": 1,
|
|
970
|
+
"fullContextTokenEstimate": 267,
|
|
971
|
+
"sessionStartTokenEstimate": 95,
|
|
972
|
+
"sessionStartTokenSavingsPercent": 64.42
|
|
973
|
+
}
|
|
974
|
+
],
|
|
975
|
+
"references": {
|
|
976
|
+
"memorySchemaPath": ".agent-context/state/memory-schema-v1.json",
|
|
977
|
+
"memoryAdapterContractPath": ".agent-context/state/memory-adapter-contract.json",
|
|
978
|
+
"benchmarkOutputPath": ".agent-context/state/memory-continuity-benchmark.json",
|
|
979
|
+
"schemaDeclaredVersion": "1.0.0",
|
|
980
|
+
"adapterContractVersion": "1.0.0"
|
|
981
|
+
},
|
|
982
|
+
"checks": [
|
|
983
|
+
{
|
|
984
|
+
"checkName": "adapter-coverage",
|
|
985
|
+
"passed": true,
|
|
986
|
+
"details": "required=3 missing=0"
|
|
987
|
+
},
|
|
988
|
+
{
|
|
989
|
+
"checkName": "continuity-recall-threshold",
|
|
990
|
+
"passed": true,
|
|
991
|
+
"details": "averageRelevantRecall=1 minimum=0.8"
|
|
992
|
+
},
|
|
993
|
+
{
|
|
994
|
+
"checkName": "session-start-token-savings-threshold",
|
|
995
|
+
"passed": true,
|
|
996
|
+
"details": "averageSessionStartTokenSavingsPercent=63.17 minimum=35"
|
|
997
|
+
},
|
|
998
|
+
{
|
|
999
|
+
"checkName": "privacy-redaction-safety",
|
|
1000
|
+
"passed": true,
|
|
1001
|
+
"details": "unsafeObservationCount=0 max=0"
|
|
360
1002
|
}
|
|
361
1003
|
]
|
|
362
1004
|
}
|
|
@@ -385,6 +1027,14 @@
|
|
|
385
1027
|
"stderr": null,
|
|
386
1028
|
"reportName": "benchmark-intelligence",
|
|
387
1029
|
"passed": true
|
|
1030
|
+
},
|
|
1031
|
+
{
|
|
1032
|
+
"scriptPath": "scripts/memory-continuity-benchmark.mjs",
|
|
1033
|
+
"exitCode": 0,
|
|
1034
|
+
"parseError": null,
|
|
1035
|
+
"stderr": null,
|
|
1036
|
+
"reportName": "memory-continuity-benchmark",
|
|
1037
|
+
"passed": true
|
|
388
1038
|
}
|
|
389
1039
|
]
|
|
390
1040
|
}
|