@ryuenn3123/agentic-senior-core 2.0.26 → 2.0.27

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,15 +3,14 @@
3
3
  /**
4
4
  * benchmark-evidence-bundle.mjs
5
5
  *
6
- * V2.5.1 reproducibility baseline artifact.
7
- * Aggregates benchmark inputs, rubric, command examples, and outputs
8
- * into a single machine-readable evidence bundle.
6
+ * Benchmark evidence bundle with reproducibility, trend history,
7
+ * security signals, and reliability early warnings.
9
8
  */
10
9
 
11
10
  import { existsSync, readFileSync } from 'node:fs';
12
11
  import fs from 'node:fs/promises';
13
12
  import { spawnSync } from 'node:child_process';
14
- import { dirname, join, resolve } from 'node:path';
13
+ import { dirname, join, relative, resolve } from 'node:path';
15
14
  import { fileURLToPath } from 'node:url';
16
15
 
17
16
  const SCRIPT_FILE_PATH = fileURLToPath(import.meta.url);
@@ -20,10 +19,23 @@ const REPOSITORY_ROOT = resolve(SCRIPT_DIR, '..');
20
19
  const ARGUMENT_FLAGS = new Set(process.argv.slice(2));
21
20
  const isStdoutOnlyMode = ARGUMENT_FLAGS.has('--stdout-only');
22
21
 
22
+ const PACKAGE_JSON_PATH = join(REPOSITORY_ROOT, 'package.json');
23
23
  const REPRO_PROFILE_PATH = join(REPOSITORY_ROOT, '.agent-context', 'state', 'benchmark-reproducibility.json');
24
24
  const BENCHMARK_THRESHOLD_PATH = join(REPOSITORY_ROOT, '.agent-context', 'state', 'benchmark-thresholds.json');
25
25
  const BENCHMARK_WATCHLIST_PATH = join(REPOSITORY_ROOT, '.agent-context', 'state', 'benchmark-watchlist.json');
26
+ const MEMORY_SCHEMA_PATH = join(REPOSITORY_ROOT, '.agent-context', 'state', 'memory-schema-v1.json');
27
+ const MEMORY_ADAPTER_CONTRACT_PATH = join(REPOSITORY_ROOT, '.agent-context', 'state', 'memory-adapter-contract.json');
26
28
  const OUTPUT_PATH = join(REPOSITORY_ROOT, '.agent-context', 'state', 'benchmark-evidence-bundle.json');
29
+ const HISTORY_OUTPUT_PATH = join(REPOSITORY_ROOT, '.agent-context', 'state', 'benchmark-history.json');
30
+ const TREND_JSON_OUTPUT_PATH = join(REPOSITORY_ROOT, '.agent-context', 'state', 'benchmark-trend-report.json');
31
+ const TREND_CSV_OUTPUT_PATH = join(REPOSITORY_ROOT, '.agent-context', 'state', 'benchmark-trend-report.csv');
32
+
33
/** Maximum number of snapshots kept when the benchmark history is merged. */
const MAX_HISTORY_ENTRIES = 90;

/**
 * Limits used when deriving reliability signals from detection fixtures.
 *
 * Frozen because this exact object is embedded by reference in the generated
 * reports (rubric and reliability signals); an accidental write through one of
 * those reports must not be able to change the limits for the rest of the run.
 */
const RELIABILITY_THRESHOLDS = Object.freeze({
  minimumConfidenceGap: 0.1,
  maximumLowConfidenceRate: 0.2,
  maximumIncorrectDetectionRate: 0.1,
});
27
39
 
28
40
  function readJsonOrNull(filePath) {
29
41
  if (!existsSync(filePath)) {
@@ -37,6 +49,47 @@ function readJsonOrNull(filePath) {
37
49
  }
38
50
  }
39
51
 
52
/**
 * Express a path relative to the repository root, always with forward slashes.
 *
 * @param {string} filePath - Path to convert.
 * @returns {string} Repository-relative path with every backslash replaced by `/`.
 */
function toRelativePath(filePath) {
  const repositoryRelativePath = relative(REPOSITORY_ROOT, filePath);
  // Normalise Windows separators so the value reads the same on every platform.
  return repositoryRelativePath.split('\\').join('/');
}
55
+
56
/**
 * Convert a raw value to a finite number, or return the fallback.
 *
 * Only numbers, bigints and non-blank strings are treated as numeric input.
 * `Number()` would silently coerce `null`, `''`, whitespace, `[]` and booleans
 * to 0 or 1, which would make absent data look like a real measurement and
 * bypass the caller's fallback.
 *
 * @param {unknown} rawValue - Value to convert.
 * @param {*} [fallbackValue=null] - Returned when the value is not a finite number.
 * @returns {number|*} The finite number, otherwise `fallbackValue`.
 */
function toFiniteNumber(rawValue, fallbackValue = null) {
  const valueType = typeof rawValue;
  const isNumericInput = valueType === 'number'
    || valueType === 'bigint'
    || (valueType === 'string' && rawValue.trim() !== '');

  if (!isNumericInput) {
    return fallbackValue;
  }

  const parsedValue = Number(rawValue);
  return Number.isFinite(parsedValue) ? parsedValue : fallbackValue;
}
64
+
65
/**
 * Parse text that should contain JSON, tolerating surrounding noise.
 *
 * The trimmed text is parsed as-is first. If that fails, the span from the
 * first `{` to the last `}` is parsed instead, which recovers an object that
 * is wrapped in extra output.
 *
 * @param {unknown} rawPayload - Raw text; falsy values are treated as empty.
 * @returns {{parsed: *, error: (string|null)}} Parsed value with `error: null`,
 *   or `parsed: null` with a message describing the failure.
 */
function parseJsonPayload(rawPayload) {
  const trimmedText = String(rawPayload || '').trim();
  if (trimmedText.length === 0) {
    return { parsed: null, error: 'Payload is empty' };
  }

  try {
    return { parsed: JSON.parse(trimmedText), error: null };
  } catch {
    // Not clean JSON; fall through and try to extract an embedded object.
  }

  const objectStart = trimmedText.indexOf('{');
  const objectEnd = trimmedText.lastIndexOf('}');
  const hasObjectSpan = objectStart !== -1 && objectEnd > objectStart;
  if (!hasObjectSpan) {
    return { parsed: null, error: 'No JSON object found in payload' };
  }

  try {
    const embeddedObjectText = trimmedText.slice(objectStart, objectEnd + 1);
    return { parsed: JSON.parse(embeddedObjectText), error: null };
  } catch (extractionError) {
    const error = extractionError instanceof Error
      ? extractionError.message
      : String(extractionError);
    return { parsed: null, error };
  }
}
92
+
40
93
  function runJsonScript(scriptRelativePath) {
41
94
  const absoluteScriptPath = join(REPOSITORY_ROOT, scriptRelativePath);
42
95
  const executionResult = spawnSync('node', [absoluteScriptPath], {
@@ -79,6 +132,74 @@ function runJsonScript(scriptRelativePath) {
79
132
  }
80
133
  }
81
134
 
135
/**
 * Run a repository script with `node` synchronously and capture its output.
 *
 * @param {string} scriptRelativePath - Script path relative to the repository root.
 * @param {string[]} [argumentsList=[]] - Extra command-line arguments for the script.
 * @returns {{scriptPath: string, exitCode: number, stdout: string, stderr: string}}
 *   Trimmed output plus the exit code; the exit code is 1 when the child did
 *   not report a numeric status.
 */
function runNodeScript(scriptRelativePath, argumentsList = []) {
  const { status, stdout, stderr } = spawnSync(
    'node',
    [join(REPOSITORY_ROOT, scriptRelativePath), ...argumentsList],
    {
      cwd: REPOSITORY_ROOT,
      encoding: 'utf8',
      maxBuffer: 1024 * 1024 * 10,
    }
  );

  const exitCode = typeof status === 'number' ? status : 1;

  return {
    scriptPath: scriptRelativePath,
    exitCode,
    stdout: (stdout || '').trim(),
    stderr: (stderr || '').trim(),
  };
}
150
+
151
/**
 * Run `npm audit --json --omit=dev` in the repository root and condense the
 * report into a vulnerability indicator.
 *
 * Failures to launch npm or to parse its report do not throw from this
 * function; they are reported as `isAvailable: false` with an `error` message.
 *
 * @returns {{
 *   isAvailable: boolean,
 *   exitCode: number,
 *   severityCounts: (object|null),
 *   hasKnownVulnerabilities: (boolean|null),
 *   error: (string|null)
 * }} Indicator describing the audit outcome.
 */
function runNpmAuditIndicator() {
  const spawnOptions = {
    cwd: REPOSITORY_ROOT,
    encoding: 'utf8',
    maxBuffer: 1024 * 1024 * 10,
  };

  // On Windows npm is a `.cmd` shim that can only be started through a shell.
  // The command line is a fixed literal, so shell interpretation is safe, and
  // passing it as one string avoids handing an args array to a shell.
  const executionResult = process.platform === 'win32'
    ? spawnSync('npm audit --json --omit=dev', { ...spawnOptions, shell: true })
    : spawnSync('npm', ['audit', '--json', '--omit=dev'], spawnOptions);

  const exitCode = typeof executionResult.status === 'number' ? executionResult.status : 1;
  const buildUnavailableIndicator = (errorMessage) => ({
    isAvailable: false,
    exitCode,
    severityCounts: null,
    hasKnownVulnerabilities: null,
    error: errorMessage,
  });

  // Report a launch failure (npm missing, output over maxBuffer, ...) as such
  // instead of letting it surface later as a misleading "Payload is empty".
  if (executionResult.error) {
    const spawnErrorMessage = executionResult.error instanceof Error
      ? executionResult.error.message
      : String(executionResult.error);
    return buildUnavailableIndicator(`Unable to run npm audit: ${spawnErrorMessage}`);
  }

  // Prefer stdout on its own: warnings on stderr can contain braces that break
  // extraction from the combined text. Fall back to the combined output for
  // cases where the JSON is not on stdout.
  const stdoutParseResult = parseJsonPayload(executionResult.stdout);
  const combinedOutput = [executionResult.stdout, executionResult.stderr].filter(Boolean).join('\n').trim();
  const { parsed: parsedAuditReport, error: parseError } = stdoutParseResult.parsed
    ? stdoutParseResult
    : parseJsonPayload(combinedOutput);

  if (!parsedAuditReport || parseError) {
    return buildUnavailableIndicator(parseError || 'Unable to parse npm audit output');
  }

  const vulnerabilityMetadata = parsedAuditReport.metadata?.vulnerabilities || null;
  if (!vulnerabilityMetadata) {
    return buildUnavailableIndicator(
      parsedAuditReport.error?.summary || 'npm audit report does not include vulnerability metadata'
    );
  }

  const severityCounts = {
    info: toFiniteNumber(vulnerabilityMetadata.info, 0),
    low: toFiniteNumber(vulnerabilityMetadata.low, 0),
    moderate: toFiniteNumber(vulnerabilityMetadata.moderate, 0),
    high: toFiniteNumber(vulnerabilityMetadata.high, 0),
    critical: toFiniteNumber(vulnerabilityMetadata.critical, 0),
    total: toFiniteNumber(vulnerabilityMetadata.total, 0),
  };

  return {
    isAvailable: true,
    exitCode,
    severityCounts,
    hasKnownVulnerabilities: severityCounts.total > 0,
    error: null,
  };
}
202
+
82
203
  function summarizeExecution(scriptExecutionResult) {
83
204
  return {
84
205
  scriptPath: scriptExecutionResult.scriptPath,
@@ -92,7 +213,18 @@ function summarizeExecution(scriptExecutionResult) {
92
213
  };
93
214
  }
94
215
 
95
- function buildRubricSummary(thresholdConfiguration, intelligenceReport) {
216
/**
 * Return a new list holding every base value followed by the additional
 * values that are not already present.
 *
 * Duplicates already inside `baseValues` are kept; only the additions are
 * de-duplicated. Neither input is modified.
 *
 * @param {Iterable<*>} baseValues - Values copied as-is.
 * @param {Iterable<*>} additionalValues - Candidates appended when absent.
 * @returns {Array<*>} The merged list.
 */
function appendUniqueTextValues(baseValues, additionalValues) {
  const mergedValues = [...baseValues];
  const knownValues = new Set(mergedValues);

  for (const candidateValue of additionalValues) {
    if (knownValues.has(candidateValue)) {
      continue;
    }

    knownValues.add(candidateValue);
    mergedValues.push(candidateValue);
  }

  return mergedValues;
}
226
+
227
+ function buildRubricSummary(thresholdConfiguration, intelligenceReport, memoryContinuityReport) {
96
228
  return {
97
229
  benchmarkThresholds: {
98
230
  minimumTop1Accuracy: thresholdConfiguration?.minimumTop1Accuracy ?? null,
@@ -101,25 +233,296 @@ function buildRubricSummary(thresholdConfiguration, intelligenceReport) {
101
233
  maximumManualCorrectionIncrease: thresholdConfiguration?.maximumManualCorrectionIncrease ?? null,
102
234
  },
103
235
  intelligenceSlaDays: intelligenceReport?.reviewSlaDays ?? null,
236
+ reliabilityThresholds: RELIABILITY_THRESHOLDS,
237
+ continuityThresholds: memoryContinuityReport?.thresholds || null,
238
+ };
239
+ }
240
+
241
/**
 * Derive reliability checks, metrics and a risk level from the fixture list
 * of a detection benchmark report.
 *
 * A missing report or a non-array `fixtures` value is treated as an empty
 * list, in which case every rate is 0. A fixture whose `confidenceGap` is not
 * a finite number is evaluated as having a gap of 0.
 *
 * @param {object|null|undefined} detectionBenchmarkReport - Parsed detection report.
 * @returns {object} `passed`, `failureCount`, `riskLevel`, `thresholds`,
 *   `metrics`, `checks` and `flaggedFixtures`.
 */
function buildReliabilitySignals(detectionBenchmarkReport) {
  const reportedFixtures = detectionBenchmarkReport?.fixtures;
  const fixtures = Array.isArray(reportedFixtures) ? reportedFixtures : [];
  const fixtureCount = fixtures.length;
  const manualCorrectionWarningThreshold = 0.12;
  const {
    minimumConfidenceGap,
    maximumLowConfidenceRate,
    maximumIncorrectDetectionRate,
  } = RELIABILITY_THRESHOLDS;

  const isIncorrect = (fixture) => fixture?.isCorrect === false;
  const needsCorrection = (fixture) => fixture?.needsManualCorrection === true;
  const hasLowConfidence = (fixture) => toFiniteNumber(fixture?.confidenceGap, 0) < minimumConfidenceGap;
  const countMatching = (predicate) => fixtures.filter((fixture) => predicate(fixture)).length;
  // Share of fixtures, rounded to four decimals; 0 when there are no fixtures.
  const toRate = (matchCount) => (
    fixtureCount === 0 ? 0 : Number((matchCount / fixtureCount).toFixed(4))
  );

  const incorrectFixtureCount = countMatching(isIncorrect);
  const lowConfidenceFixtureCount = countMatching(hasLowConfidence);
  const manualCorrectionFixtureCount = countMatching(needsCorrection);

  const incorrectDetectionRate = toRate(incorrectFixtureCount);
  const lowConfidenceRate = toRate(lowConfidenceFixtureCount);
  const manualCorrectionRate = toRate(manualCorrectionFixtureCount);

  const checks = [
    {
      checkName: 'incorrect-detection-rate',
      passed: incorrectDetectionRate <= maximumIncorrectDetectionRate,
      details: `incorrectRate=${incorrectDetectionRate} max=${maximumIncorrectDetectionRate}`,
    },
    {
      checkName: 'low-confidence-rate',
      passed: lowConfidenceRate <= maximumLowConfidenceRate,
      details: `lowConfidenceRate=${lowConfidenceRate} max=${maximumLowConfidenceRate}`,
    },
    {
      checkName: 'manual-correction-early-warning',
      passed: manualCorrectionRate <= manualCorrectionWarningThreshold,
      details: `manualCorrectionRate=${manualCorrectionRate} warningThreshold=${manualCorrectionWarningThreshold}`,
    },
  ];

  const failureCount = checks.filter((check) => !check.passed).length;

  let riskLevel;
  if (failureCount >= 2) {
    riskLevel = 'high';
  } else if (failureCount === 1) {
    riskLevel = 'elevated';
  } else if (incorrectFixtureCount > 0 || lowConfidenceFixtureCount > 0) {
    // All checks pass, but individual fixtures still deserve attention.
    riskLevel = 'monitor';
  } else {
    riskLevel = 'stable';
  }

  const flaggedFixtures = fixtures
    .filter((fixture) => isIncorrect(fixture) || needsCorrection(fixture))
    .map((fixture) => ({
      fixtureName: fixture.fixtureName,
      confidenceGap: fixture.confidenceGap,
      detectedStack: fixture.detectedStack,
      expectedStack: fixture.expectedStack,
      isCorrect: fixture.isCorrect,
      needsManualCorrection: fixture.needsManualCorrection,
    }));

  return {
    passed: failureCount === 0,
    failureCount,
    riskLevel,
    thresholds: RELIABILITY_THRESHOLDS,
    metrics: {
      fixtureCount,
      incorrectFixtureCount,
      lowConfidenceFixtureCount,
      manualCorrectionFixtureCount,
      incorrectDetectionRate,
      lowConfidenceRate,
      manualCorrectionRate,
    },
    checks,
    flaggedFixtures,
  };
}
312
+
313
/**
 * Project the count and rate metrics of the reliability signals, together
 * with the flagged fixtures, into a flat bug-indicator summary.
 *
 * @param {{metrics: object, flaggedFixtures: Array}} reliabilitySignals -
 *   Result of `buildReliabilitySignals`.
 * @returns {object} Counts, rates and the same `flaggedFixtures` array.
 */
function buildBugIndicators(reliabilitySignals) {
  const { metrics, flaggedFixtures } = reliabilitySignals;
  const {
    incorrectFixtureCount,
    incorrectDetectionRate,
    manualCorrectionFixtureCount,
    manualCorrectionRate,
    lowConfidenceFixtureCount,
    lowConfidenceRate,
  } = metrics;

  return {
    incorrectFixtureCount,
    incorrectDetectionRate,
    manualCorrectionFixtureCount,
    manualCorrectionRate,
    lowConfidenceFixtureCount,
    lowConfidenceRate,
    flaggedFixtures,
  };
}
324
+
325
/**
 * Combine the forbidden-content scan result and the npm audit indicator into
 * one security section.
 *
 * @param {{exitCode: number}} forbiddenContentExecution - Result of the
 *   forbidden-content script; an exit code of 0 counts as a pass.
 * @param {object} npmAuditIndicator - Result of `runNpmAuditIndicator`.
 * @returns {{forbiddenContent: object, vulnerabilityScan: object}} The two checks.
 */
function buildSecurityIndicators(forbiddenContentExecution, npmAuditIndicator) {
  const { exitCode: forbiddenContentExitCode } = forbiddenContentExecution;
  const scanIsClean = forbiddenContentExitCode === 0;

  const forbiddenContent = {
    checkName: 'forbidden-content-scan',
    passed: scanIsClean,
    exitCode: forbiddenContentExitCode,
    details: scanIsClean
      ? 'No forbidden content detected'
      : 'Forbidden content scan found one or more violations',
  };

  const {
    isAvailable,
    hasKnownVulnerabilities,
    severityCounts,
    exitCode,
    error,
  } = npmAuditIndicator;

  const vulnerabilityScan = {
    checkName: 'npm-audit-indicator',
    isAvailable,
    hasKnownVulnerabilities,
    severityCounts,
    exitCode,
    error,
  };

  return { forbiddenContent, vulnerabilityScan };
}
347
+
348
/**
 * Read the package version from the repository's `package.json`.
 *
 * @returns {string} The trimmed `version` string, or `'unknown'` when the
 *   value is not a string or is blank.
 */
function readReleaseVersion() {
  const declaredVersion = readJsonOrNull(PACKAGE_JSON_PATH)?.version;
  if (typeof declaredVersion !== 'string') {
    return 'unknown';
  }

  const trimmedVersion = declaredVersion.trim();
  return trimmedVersion.length > 0 ? trimmedVersion : 'unknown';
}
354
+
355
/**
 * Load previously stored benchmark snapshots from the history file.
 *
 * Two layouts are accepted: an object with a `history` array (the layout this
 * script writes) and a bare top-level array. Anything else yields an empty
 * history.
 *
 * Entries that are not plain objects (`null`, primitives, nested arrays) are
 * dropped. The file lives on disk and may have been edited or truncated, and
 * the trend and delta builders read properties from every entry, so one stray
 * `null` would otherwise abort the whole run.
 *
 * @returns {object[]} Stored snapshots in file order; possibly empty.
 */
function loadBenchmarkHistory() {
  const historyPayload = readJsonOrNull(HISTORY_OUTPUT_PATH);

  let storedEntries = [];
  if (Array.isArray(historyPayload?.history)) {
    storedEntries = historyPayload.history;
  } else if (Array.isArray(historyPayload)) {
    storedEntries = historyPayload;
  }

  return storedEntries.filter(
    (historyEntry) => historyEntry !== null
      && typeof historyEntry === 'object'
      && !Array.isArray(historyEntry)
  );
}
368
+
369
/**
 * Append the current snapshot to the stored history and keep only the most
 * recent `MAX_HISTORY_ENTRIES` entries.
 *
 * @param {Iterable<object>} previousHistoryEntries - Stored snapshots, oldest first.
 * @param {object} currentSnapshot - Snapshot of the current run.
 * @returns {object[]} New array ending with `currentSnapshot`; the input is not modified.
 */
function mergeBenchmarkHistory(previousHistoryEntries, currentSnapshot) {
  const combinedEntries = [...previousHistoryEntries];
  combinedEntries.push(currentSnapshot);

  // Number of oldest entries that no longer fit into the retention window.
  const overflowCount = Math.max(0, combinedEntries.length - MAX_HISTORY_ENTRIES);
  return overflowCount === 0 ? combinedEntries : combinedEntries.slice(overflowCount);
}
377
+
378
/**
 * Condense the reports of one run into a flat snapshot for the history file.
 *
 * Numeric detection metrics that are missing or not finite are stored as 0.
 * `staleWatchlistCount` is `null` when the intelligence report has no
 * `watchlist` array, and the vulnerability counts are `null` when the audit
 * produced no severity counts.
 *
 * @param {object} snapshotInputs
 * @param {string} snapshotInputs.generatedAt - Timestamp of the run.
 * @param {string} snapshotInputs.releaseVersion - Package version of the run.
 * @param {object|null} snapshotInputs.detectionBenchmarkReport - Parsed detection report.
 * @param {object|null} snapshotInputs.benchmarkGateReport - Parsed gate report.
 * @param {object|null} snapshotInputs.benchmarkIntelligenceReport - Parsed intelligence report.
 * @param {object} snapshotInputs.reliabilitySignals - Result of `buildReliabilitySignals`.
 * @param {object} snapshotInputs.securityIndicators - Result of `buildSecurityIndicators`.
 * @returns {object} Flat snapshot record.
 */
function buildHistorySnapshot({
  generatedAt,
  releaseVersion,
  detectionBenchmarkReport,
  benchmarkGateReport,
  benchmarkIntelligenceReport,
  reliabilitySignals,
  securityIndicators,
}) {
  const watchlistEntries = benchmarkIntelligenceReport?.watchlist;
  let staleWatchlistCount = null;
  if (Array.isArray(watchlistEntries)) {
    staleWatchlistCount = watchlistEntries.reduce(
      (staleTotal, watchlistEntry) => staleTotal + (watchlistEntry?.stale === true ? 1 : 0),
      0
    );
  }

  const { metrics: reliabilityMetrics } = reliabilitySignals;
  const { forbiddenContent, vulnerabilityScan } = securityIndicators;
  const auditSeverityCounts = vulnerabilityScan.severityCounts;

  return {
    generatedAt,
    releaseVersion,
    fixtureCount: toFiniteNumber(detectionBenchmarkReport?.fixtureCount, 0),
    top1Accuracy: toFiniteNumber(detectionBenchmarkReport?.top1Accuracy, 0),
    manualCorrectionRate: toFiniteNumber(detectionBenchmarkReport?.manualCorrectionRate, 0),
    benchmarkGatePassed: benchmarkGateReport?.passed === true,
    intelligencePassed: benchmarkIntelligenceReport?.passed === true,
    staleWatchlistCount,
    reliabilityPassed: reliabilitySignals.passed,
    reliabilityRiskLevel: reliabilitySignals.riskLevel,
    incorrectDetectionRate: reliabilityMetrics.incorrectDetectionRate,
    lowConfidenceRate: reliabilityMetrics.lowConfidenceRate,
    vulnerabilityTotal: auditSeverityCounts?.total ?? null,
    criticalVulnerabilityCount: auditSeverityCounts?.critical ?? null,
    forbiddenContentPassed: forbiddenContent.passed,
  };
}
106
409
 
410
/**
 * Compare the current snapshot with an earlier one and report metric deltas.
 *
 * The last element of `historyEntries` is taken to be the current snapshot,
 * so only the entries before it are candidates. The most recent candidate
 * with a different `releaseVersion` is preferred; when every candidate has
 * the same version, the most recent candidate is used.
 *
 * All four deltas coerce both operands through `toFiniteNumber(..., 0)`.
 * Stored history can lack a metric (older layout, hand-edited file); raw
 * subtraction would then yield `NaN`, which serialises as `null` in JSON and
 * as the text "NaN" in the summary.
 *
 * @param {object[]} historyEntries - History including the current snapshot as last entry.
 * @param {object} currentSnapshot - Snapshot of the current run.
 * @returns {object|null} Delta report, or `null` when there is no earlier usable snapshot.
 */
function buildReleaseDelta(historyEntries, currentSnapshot) {
  // Skip entries that are not objects so a damaged history cannot throw here.
  const previousSnapshots = historyEntries
    .slice(0, -1)
    .filter((historyEntry) => historyEntry !== null && typeof historyEntry === 'object');
  if (previousSnapshots.length === 0) {
    return null;
  }

  const previousReleaseSnapshot = [...previousSnapshots].reverse().find(
    (historyEntry) => historyEntry.releaseVersion !== currentSnapshot.releaseVersion
  ) || previousSnapshots[previousSnapshots.length - 1];

  const numericDelta = (metricName) => (
    toFiniteNumber(currentSnapshot[metricName], 0) - toFiniteNumber(previousReleaseSnapshot[metricName], 0)
  );
  const formatSigned = (deltaValue) => `${deltaValue >= 0 ? '+' : ''}${deltaValue}`;

  // Rates are rounded to four decimals to hide floating-point noise.
  const top1AccuracyDelta = Number(numericDelta('top1Accuracy').toFixed(4));
  const manualCorrectionDelta = Number(numericDelta('manualCorrectionRate').toFixed(4));
  const staleWatchlistDelta = numericDelta('staleWatchlistCount');
  const vulnerabilityDelta = numericDelta('vulnerabilityTotal');

  return {
    currentReleaseVersion: currentSnapshot.releaseVersion,
    previousReleaseVersion: previousReleaseSnapshot.releaseVersion,
    comparedSnapshot: {
      currentGeneratedAt: currentSnapshot.generatedAt,
      previousGeneratedAt: previousReleaseSnapshot.generatedAt,
    },
    top1AccuracyDelta,
    manualCorrectionRateDelta: manualCorrectionDelta,
    staleWatchlistCountDelta: staleWatchlistDelta,
    vulnerabilityTotalDelta: vulnerabilityDelta,
    summary: [
      `top1Accuracy: ${formatSigned(top1AccuracyDelta)}`,
      `manualCorrectionRate: ${formatSigned(manualCorrectionDelta)}`,
      `staleWatchlistCount: ${formatSigned(staleWatchlistDelta)}`,
      `vulnerabilityTotal: ${formatSigned(vulnerabilityDelta)}`,
    ],
  };
}
446
+
447
/**
 * Turn history snapshots into table rows with a 1-based `snapshotIndex`.
 *
 * Only the listed trend columns are copied, and the key order is fixed; the
 * CSV export takes its header row from the keys of the first row.
 *
 * @param {object[]} historyEntries - Snapshots, oldest first.
 * @returns {object[]} One row per snapshot.
 */
function buildTrendTable(historyEntries) {
  return historyEntries.map((historyEntry, position) => {
    const {
      generatedAt,
      releaseVersion,
      top1Accuracy,
      manualCorrectionRate,
      incorrectDetectionRate,
      lowConfidenceRate,
      staleWatchlistCount,
      vulnerabilityTotal,
      criticalVulnerabilityCount,
      benchmarkGatePassed,
      intelligencePassed,
      reliabilityPassed,
      reliabilityRiskLevel,
    } = historyEntry;

    return {
      snapshotIndex: position + 1,
      generatedAt,
      releaseVersion,
      top1Accuracy,
      manualCorrectionRate,
      incorrectDetectionRate,
      lowConfidenceRate,
      staleWatchlistCount,
      vulnerabilityTotal,
      criticalVulnerabilityCount,
      benchmarkGatePassed,
      intelligencePassed,
      reliabilityPassed,
      reliabilityRiskLevel,
    };
  });
}
465
+
466
/**
 * Pivot history snapshots into parallel arrays, one per charted metric.
 *
 * @param {object[]} historyEntries - Snapshots, oldest first.
 * @returns {Object<string, Array>} Metric name mapped to its values in history order.
 */
function buildChartSeries(historyEntries) {
  const chartedMetricNames = [
    'generatedAt',
    'top1Accuracy',
    'manualCorrectionRate',
    'incorrectDetectionRate',
    'lowConfidenceRate',
    'staleWatchlistCount',
    'vulnerabilityTotal',
  ];

  return Object.fromEntries(
    chartedMetricNames.map((metricName) => [
      metricName,
      historyEntries.map((historyEntry) => historyEntry[metricName]),
    ])
  );
}
477
+
478
/**
 * Serialise trend rows as CSV text.
 *
 * The header row comes from the keys of the first row. `null` and `undefined`
 * cells become empty fields; every other value is stringified, wrapped in
 * double quotes, and has embedded quotes doubled. Lines are joined with `\n`
 * and the text ends with a trailing newline.
 *
 * @param {object[]} trendTable - Rows produced by `buildTrendTable`.
 * @returns {string} CSV text, or an empty string when there are no rows.
 */
function convertTrendTableToCsv(trendTable) {
  if (trendTable.length === 0) {
    return '';
  }

  const columnNames = Object.keys(trendTable[0]);
  const encodeCell = (cellValue) => {
    if (cellValue === null || cellValue === undefined) {
      return '';
    }

    return `"${String(cellValue).split('"').join('""')}"`;
  };

  const dataLines = trendTable.map(
    (trendRow) => columnNames.map((columnName) => encodeCell(trendRow[columnName])).join(',')
  );

  return `${[columnNames.join(','), ...dataLines].join('\n')}\n`;
}
502
+
107
503
  async function runBenchmarkEvidenceBundle() {
108
504
  const reproducibilityProfile = readJsonOrNull(REPRO_PROFILE_PATH);
109
505
  const thresholdConfiguration = readJsonOrNull(BENCHMARK_THRESHOLD_PATH);
110
506
  const watchlistConfiguration = readJsonOrNull(BENCHMARK_WATCHLIST_PATH);
507
+ const memorySchemaConfiguration = readJsonOrNull(MEMORY_SCHEMA_PATH);
508
+ const memoryAdapterContractConfiguration = readJsonOrNull(MEMORY_ADAPTER_CONTRACT_PATH);
509
+ const releaseVersion = readReleaseVersion();
111
510
 
112
511
  const detectionBenchmarkExecution = runJsonScript('scripts/detection-benchmark.mjs');
113
512
  const benchmarkGateExecution = runJsonScript('scripts/benchmark-gate.mjs');
114
513
  const benchmarkIntelligenceExecution = runJsonScript('scripts/benchmark-intelligence.mjs');
514
+ const memoryContinuityExecution = runJsonScript('scripts/memory-continuity-benchmark.mjs');
515
+ const forbiddenContentExecution = runNodeScript('scripts/forbidden-content-check.mjs');
516
+ const npmAuditIndicator = runNpmAuditIndicator();
115
517
 
116
518
  const executionSummaries = [
117
519
  summarizeExecution(detectionBenchmarkExecution),
118
520
  summarizeExecution(benchmarkGateExecution),
119
521
  summarizeExecution(benchmarkIntelligenceExecution),
522
+ summarizeExecution(memoryContinuityExecution),
120
523
  ];
121
524
 
122
- const failureCount = executionSummaries.filter((executionSummary) => {
525
+ const executionFailureCount = executionSummaries.filter((executionSummary) => {
123
526
  if (executionSummary.parseError) {
124
527
  return true;
125
528
  }
@@ -131,41 +534,115 @@ async function runBenchmarkEvidenceBundle() {
131
534
  return executionSummary.exitCode !== 0;
132
535
  }).length;
133
536
 
537
+ const reliabilitySignals = buildReliabilitySignals(detectionBenchmarkExecution.parsedReport);
538
+ const reliabilityFailureCount = reliabilitySignals.failureCount > 0 ? 1 : 0;
539
+ const failureCount = executionFailureCount + reliabilityFailureCount;
540
+ const securityIndicators = buildSecurityIndicators(forbiddenContentExecution, npmAuditIndicator);
541
+ const bugIndicators = buildBugIndicators(reliabilitySignals);
542
+
543
+ const generatedAt = new Date().toISOString();
544
+ const currentSnapshot = buildHistorySnapshot({
545
+ generatedAt,
546
+ releaseVersion,
547
+ detectionBenchmarkReport: detectionBenchmarkExecution.parsedReport,
548
+ benchmarkGateReport: benchmarkGateExecution.parsedReport,
549
+ benchmarkIntelligenceReport: benchmarkIntelligenceExecution.parsedReport,
550
+ reliabilitySignals,
551
+ securityIndicators,
552
+ });
553
+ const previousHistoryEntries = loadBenchmarkHistory();
554
+ const historyEntries = mergeBenchmarkHistory(previousHistoryEntries, currentSnapshot);
555
+ const releaseDelta = buildReleaseDelta(historyEntries, currentSnapshot);
556
+ const trendTable = buildTrendTable(historyEntries);
557
+ const chartSeries = buildChartSeries(historyEntries);
558
+
559
+ const trendReport = {
560
+ generatedAt,
561
+ reportName: 'benchmark-trend-report',
562
+ releaseVersion,
563
+ historyCount: historyEntries.length,
564
+ releaseDelta,
565
+ trendTable,
566
+ chartSeries,
567
+ artifacts: {
568
+ historyPath: toRelativePath(HISTORY_OUTPUT_PATH),
569
+ jsonPath: toRelativePath(TREND_JSON_OUTPUT_PATH),
570
+ csvPath: toRelativePath(TREND_CSV_OUTPUT_PATH),
571
+ writeMode: isStdoutOnlyMode ? 'stdout-only' : 'stdout-and-file',
572
+ },
573
+ };
574
+
575
+ const historyPayload = {
576
+ generatedAt,
577
+ reportName: 'benchmark-history',
578
+ maxEntries: MAX_HISTORY_ENTRIES,
579
+ history: historyEntries,
580
+ };
581
+
582
+ const trendCsvPayload = convertTrendTableToCsv(trendTable);
583
+ const baseRerunInstructions = Array.isArray(reproducibilityProfile?.rerunInstructions)
584
+ ? reproducibilityProfile.rerunInstructions
585
+ : [];
586
+ const baseCommandExamples = Array.isArray(reproducibilityProfile?.commandExamples)
587
+ ? reproducibilityProfile.commandExamples
588
+ : [];
589
+ const rerunInstructions = appendUniqueTextValues(baseRerunInstructions, [
590
+ 'Run npm run benchmark:continuity to validate cross-agent memory hydration, privacy redaction, and token-savings behavior.',
591
+ ]);
592
+ const commandExamples = appendUniqueTextValues(baseCommandExamples, [
593
+ 'npm run benchmark:continuity',
594
+ 'node ./scripts/memory-continuity-benchmark.mjs --stdout-only',
595
+ ]);
596
+
134
597
  const evidenceBundleReport = {
135
- generatedAt: new Date().toISOString(),
598
+ generatedAt,
136
599
  reportName: 'benchmark-evidence-bundle',
137
- phase: 'v2.5.1',
600
+ phase: 'v2.5.2',
601
+ releaseVersion,
138
602
  passed: failureCount === 0,
139
603
  failureCount,
140
604
  methodology: {
141
605
  deterministicRuntime: reproducibilityProfile?.deterministicRuntime || null,
142
606
  scenarioCount: Array.isArray(reproducibilityProfile?.scenarios) ? reproducibilityProfile.scenarios.length : 0,
143
- commandCount: Array.isArray(reproducibilityProfile?.commandExamples) ? reproducibilityProfile.commandExamples.length : 0,
607
+ commandCount: commandExamples.length,
144
608
  },
145
- rerunInstructions: Array.isArray(reproducibilityProfile?.rerunInstructions)
146
- ? reproducibilityProfile.rerunInstructions
147
- : [],
148
- commandExamples: Array.isArray(reproducibilityProfile?.commandExamples)
149
- ? reproducibilityProfile.commandExamples
150
- : [],
609
+ rerunInstructions,
610
+ commandExamples,
151
611
  rawInputs: {
152
612
  scenarios: Array.isArray(reproducibilityProfile?.scenarios) ? reproducibilityProfile.scenarios : [],
153
613
  benchmarkThresholds: thresholdConfiguration,
154
614
  benchmarkWatchlist: Array.isArray(watchlistConfiguration?.repositories)
155
615
  ? watchlistConfiguration.repositories
156
616
  : [],
617
+ memorySchema: memorySchemaConfiguration,
618
+ memoryAdapterContract: memoryAdapterContractConfiguration,
157
619
  },
158
- rubric: buildRubricSummary(thresholdConfiguration, benchmarkIntelligenceExecution.parsedReport),
620
+ rubric: buildRubricSummary(
621
+ thresholdConfiguration,
622
+ benchmarkIntelligenceExecution.parsedReport,
623
+ memoryContinuityExecution.parsedReport
624
+ ),
625
+ bugIndicators,
626
+ reliabilitySignals,
627
+ securityIndicators,
628
+ releaseDelta,
629
+ history: historyEntries,
630
+ trendReport,
159
631
  outputs: {
160
632
  detectionBenchmark: detectionBenchmarkExecution.parsedReport,
161
633
  benchmarkGate: benchmarkGateExecution.parsedReport,
162
634
  benchmarkIntelligence: benchmarkIntelligenceExecution.parsedReport,
635
+ memoryContinuityBenchmark: memoryContinuityExecution.parsedReport,
163
636
  },
164
637
  executions: executionSummaries,
165
638
  };
166
639
 
167
640
  if (!isStdoutOnlyMode) {
641
+ await fs.mkdir(join(REPOSITORY_ROOT, '.agent-context', 'state'), { recursive: true });
168
642
  await fs.writeFile(OUTPUT_PATH, JSON.stringify(evidenceBundleReport, null, 2) + '\n', 'utf8');
643
+ await fs.writeFile(HISTORY_OUTPUT_PATH, JSON.stringify(historyPayload, null, 2) + '\n', 'utf8');
644
+ await fs.writeFile(TREND_JSON_OUTPUT_PATH, JSON.stringify(trendReport, null, 2) + '\n', 'utf8');
645
+ await fs.writeFile(TREND_CSV_OUTPUT_PATH, trendCsvPayload, 'utf8');
169
646
  }
170
647
 
171
648
  console.log(JSON.stringify(evidenceBundleReport, null, 2));