@ryuenn3123/agentic-senior-core 2.0.25 → 2.0.27
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agent-context/review-checklists/frontend-excellence-rubric.md +54 -0
- package/.agent-context/review-checklists/frontend-skill-parity.md +1 -0
- package/.agent-context/review-checklists/frontend-usability.md +1 -0
- package/.agent-context/rules/docker-runtime.md +29 -0
- package/.agent-context/skills/frontend/README.md +1 -0
- package/.agent-context/skills/frontend.md +4 -0
- package/.agent-context/state/benchmark-evidence-bundle.json +672 -22
- package/.agent-context/state/benchmark-history.json +75 -0
- package/.agent-context/state/benchmark-trend-report.csv +5 -0
- package/.agent-context/state/benchmark-trend-report.json +140 -0
- package/.agent-context/state/benchmark-watchlist.json +3 -3
- package/.agent-context/state/memory-adapter-contract.json +52 -0
- package/.agent-context/state/memory-continuity-benchmark.json +132 -0
- package/.agent-context/state/memory-schema-v1.json +88 -0
- package/.cursorrules +1 -1
- package/.windsurfrules +1 -1
- package/README.md +29 -0
- package/lib/cli/commands/init.mjs +358 -16
- package/lib/cli/commands/optimize.mjs +12 -0
- package/lib/cli/commands/upgrade.mjs +30 -1
- package/lib/cli/compiler.mjs +55 -1
- package/lib/cli/constants.mjs +83 -0
- package/lib/cli/detector.mjs +11 -1
- package/lib/cli/memory-continuity.mjs +266 -0
- package/lib/cli/project-scaffolder.mjs +174 -1
- package/lib/cli/skill-selector.mjs +60 -38
- package/lib/cli/templates/architecture-decision-record.md.tmpl +39 -0
- package/lib/cli/templates/flow-overview.md.tmpl +12 -0
- package/lib/cli/templates/project-brief.md.id.tmpl +2 -0
- package/lib/cli/templates/project-brief.md.tmpl +26 -0
- package/lib/cli/utils.mjs +2 -1
- package/package.json +2 -1
- package/scripts/benchmark-evidence-bundle.mjs +493 -16
- package/scripts/frontend-usability-audit.mjs +21 -0
- package/scripts/memory-continuity-benchmark.mjs +322 -0
- package/scripts/release-gate.mjs +30 -0
- package/scripts/validate.mjs +5 -0
|
@@ -3,15 +3,14 @@
|
|
|
3
3
|
/**
|
|
4
4
|
* benchmark-evidence-bundle.mjs
|
|
5
5
|
*
|
|
6
|
-
*
|
|
7
|
-
*
|
|
8
|
-
* into a single machine-readable evidence bundle.
|
|
6
|
+
* Benchmark evidence bundle with reproducibility, trend history,
|
|
7
|
+
* security signals, and reliability early warnings.
|
|
9
8
|
*/
|
|
10
9
|
|
|
11
10
|
import { existsSync, readFileSync } from 'node:fs';
|
|
12
11
|
import fs from 'node:fs/promises';
|
|
13
12
|
import { spawnSync } from 'node:child_process';
|
|
14
|
-
import { dirname, join, resolve } from 'node:path';
|
|
13
|
+
import { dirname, join, relative, resolve } from 'node:path';
|
|
15
14
|
import { fileURLToPath } from 'node:url';
|
|
16
15
|
|
|
17
16
|
const SCRIPT_FILE_PATH = fileURLToPath(import.meta.url);
|
|
@@ -20,10 +19,23 @@ const REPOSITORY_ROOT = resolve(SCRIPT_DIR, '..');
|
|
|
20
19
|
const ARGUMENT_FLAGS = new Set(process.argv.slice(2));
|
|
21
20
|
const isStdoutOnlyMode = ARGUMENT_FLAGS.has('--stdout-only');
|
|
22
21
|
|
|
22
|
+
const PACKAGE_JSON_PATH = join(REPOSITORY_ROOT, 'package.json');
|
|
23
23
|
const REPRO_PROFILE_PATH = join(REPOSITORY_ROOT, '.agent-context', 'state', 'benchmark-reproducibility.json');
|
|
24
24
|
const BENCHMARK_THRESHOLD_PATH = join(REPOSITORY_ROOT, '.agent-context', 'state', 'benchmark-thresholds.json');
|
|
25
25
|
const BENCHMARK_WATCHLIST_PATH = join(REPOSITORY_ROOT, '.agent-context', 'state', 'benchmark-watchlist.json');
|
|
26
|
+
const MEMORY_SCHEMA_PATH = join(REPOSITORY_ROOT, '.agent-context', 'state', 'memory-schema-v1.json');
|
|
27
|
+
const MEMORY_ADAPTER_CONTRACT_PATH = join(REPOSITORY_ROOT, '.agent-context', 'state', 'memory-adapter-contract.json');
|
|
26
28
|
const OUTPUT_PATH = join(REPOSITORY_ROOT, '.agent-context', 'state', 'benchmark-evidence-bundle.json');
|
|
29
|
+
const HISTORY_OUTPUT_PATH = join(REPOSITORY_ROOT, '.agent-context', 'state', 'benchmark-history.json');
|
|
30
|
+
const TREND_JSON_OUTPUT_PATH = join(REPOSITORY_ROOT, '.agent-context', 'state', 'benchmark-trend-report.json');
|
|
31
|
+
const TREND_CSV_OUTPUT_PATH = join(REPOSITORY_ROOT, '.agent-context', 'state', 'benchmark-trend-report.csv');
|
|
32
|
+
|
|
33
|
+
const MAX_HISTORY_ENTRIES = 90;
|
|
34
|
+
const RELIABILITY_THRESHOLDS = {
|
|
35
|
+
minimumConfidenceGap: 0.1,
|
|
36
|
+
maximumLowConfidenceRate: 0.2,
|
|
37
|
+
maximumIncorrectDetectionRate: 0.1,
|
|
38
|
+
};
|
|
27
39
|
|
|
28
40
|
function readJsonOrNull(filePath) {
|
|
29
41
|
if (!existsSync(filePath)) {
|
|
@@ -37,6 +49,47 @@ function readJsonOrNull(filePath) {
|
|
|
37
49
|
}
|
|
38
50
|
}
|
|
39
51
|
|
|
52
|
+
/**
 * Express a path relative to the repository root using forward slashes only,
 * so the recorded value does not depend on the host path separator.
 *
 * @param {string} filePath - Path to convert.
 * @returns {string} Path relative to REPOSITORY_ROOT with every backslash
 *   replaced by `/`.
 */
function toRelativePath(filePath) {
  const repositoryRelativePath = relative(REPOSITORY_ROOT, filePath);
  return repositoryRelativePath.split('\\').join('/');
}
|
|
55
|
+
|
|
56
|
+
/**
 * Coerce a value to a finite number, or return the fallback when the value
 * does not carry a usable number.
 *
 * `Number(null)` and `Number('')` both evaluate to 0, so absent or blank
 * inputs are rejected before coercion; otherwise a missing metric would be
 * silently reported as a real zero instead of the caller's fallback.
 *
 * @param {unknown} rawValue - Candidate value (number, numeric string, ...).
 * @param {*} [fallbackValue=null] - Returned when rawValue is null, undefined,
 *   a blank string, or coerces to NaN / +-Infinity.
 * @returns {*} The finite number, or fallbackValue.
 */
function toFiniteNumber(rawValue, fallbackValue = null) {
  const isMissing = rawValue === null || rawValue === undefined;
  const isBlankText = typeof rawValue === 'string' && rawValue.trim() === '';
  if (isMissing || isBlankText) {
    return fallbackValue;
  }

  const parsedValue = Number(rawValue);
  return Number.isFinite(parsedValue) ? parsedValue : fallbackValue;
}
|
|
64
|
+
|
|
65
|
+
/**
 * Parse JSON out of raw process output.
 *
 * The trimmed payload is parsed as-is first. If that fails, the text between
 * the first `{` and the last `}` is parsed instead, which lets a JSON object
 * surrounded by other output still be recovered.
 *
 * @param {unknown} rawPayload - Raw text; falsy values are treated as empty.
 * @returns {{ parsed: any, error: string | null }} The parsed value with a
 *   null error, or a null value with the reason parsing failed.
 */
function parseJsonPayload(rawPayload) {
  const text = String(rawPayload || '').trim();
  if (text.length === 0) {
    return { parsed: null, error: 'Payload is empty' };
  }

  try {
    return { parsed: JSON.parse(text), error: null };
  } catch {
    // Not pure JSON; try to isolate an embedded object below.
  }

  const openingIndex = text.indexOf('{');
  const closingIndex = text.lastIndexOf('}');
  const hasObjectSpan = openingIndex !== -1 && closingIndex > openingIndex;
  if (!hasObjectSpan) {
    return { parsed: null, error: 'No JSON object found in payload' };
  }

  try {
    const embeddedObjectText = text.slice(openingIndex, closingIndex + 1);
    return { parsed: JSON.parse(embeddedObjectText), error: null };
  } catch (extractionError) {
    const reason = extractionError instanceof Error
      ? extractionError.message
      : String(extractionError);
    return { parsed: null, error: reason };
  }
}
|
|
92
|
+
|
|
40
93
|
function runJsonScript(scriptRelativePath) {
|
|
41
94
|
const absoluteScriptPath = join(REPOSITORY_ROOT, scriptRelativePath);
|
|
42
95
|
const executionResult = spawnSync('node', [absoluteScriptPath], {
|
|
@@ -79,6 +132,74 @@ function runJsonScript(scriptRelativePath) {
|
|
|
79
132
|
}
|
|
80
133
|
}
|
|
81
134
|
|
|
135
|
+
/**
 * Run a repository script synchronously with `node` and capture its output.
 *
 * @param {string} scriptRelativePath - Script path relative to REPOSITORY_ROOT.
 * @param {string[]} [argumentsList=[]] - Extra arguments passed to the script.
 * @returns {{ scriptPath: string, exitCode: number, stdout: string, stderr: string }}
 *   Trimmed output streams plus the exit code; a non-numeric status is
 *   reported as exit code 1.
 */
function runNodeScript(scriptRelativePath, argumentsList = []) {
  const spawnArguments = [join(REPOSITORY_ROOT, scriptRelativePath), ...argumentsList];
  const spawnOptions = {
    cwd: REPOSITORY_ROOT,
    encoding: 'utf8',
    maxBuffer: 1024 * 1024 * 10,
  };
  const { status, stdout, stderr } = spawnSync('node', spawnArguments, spawnOptions);

  const exitCode = typeof status === 'number' ? status : 1;
  return {
    scriptPath: scriptRelativePath,
    exitCode,
    stdout: (stdout || '').trim(),
    stderr: (stderr || '').trim(),
  };
}
|
|
150
|
+
|
|
151
|
+
/**
 * Run `npm audit --json --omit=dev` in the repository root and reduce the
 * report to a small vulnerability indicator.
 *
 * @returns {{
 *   isAvailable: boolean,
 *   exitCode: number,
 *   severityCounts: { info: number, low: number, moderate: number, high: number, critical: number, total: number } | null,
 *   hasKnownVulnerabilities: boolean | null,
 *   error: string | null
 * }} `isAvailable` is false (with `error` set) when npm could not be started,
 *   its output could not be parsed, or the report carries no
 *   `metadata.vulnerabilities` section.
 */
function runNpmAuditIndicator() {
  const spawnOptions = {
    cwd: REPOSITORY_ROOT,
    encoding: 'utf8',
    maxBuffer: 1024 * 1024 * 10,
  };

  // On Windows npm is a .cmd shim, which Node can only launch through a shell.
  // The command is a fixed literal, so routing it through the shell adds no
  // injection surface. Passing it as one string avoids mixing an argument
  // array with the shell option.
  const executionResult = process.platform === 'win32'
    ? spawnSync('npm audit --json --omit=dev', { ...spawnOptions, shell: true })
    : spawnSync('npm', ['audit', '--json', '--omit=dev'], spawnOptions);

  const exitCode = typeof executionResult.status === 'number' ? executionResult.status : 1;
  const buildUnavailableIndicator = (errorMessage) => ({
    isAvailable: false,
    exitCode,
    severityCounts: null,
    hasKnownVulnerabilities: null,
    error: errorMessage,
  });

  // A spawn-level failure (npm missing, output buffer exceeded, ...) leaves no
  // trustworthy output; report the real cause rather than a parse error.
  if (executionResult.error) {
    const spawnFailureReason = executionResult.error instanceof Error
      ? executionResult.error.message
      : String(executionResult.error);
    return buildUnavailableIndicator(`Unable to run npm audit: ${spawnFailureReason}`);
  }

  // Both streams are searched because the JSON report is recovered from
  // whatever text surrounds it.
  const combinedOutput = [executionResult.stdout, executionResult.stderr].filter(Boolean).join('\n').trim();
  const { parsed: parsedAuditReport, error: parseError } = parseJsonPayload(combinedOutput);

  if (!parsedAuditReport || parseError) {
    return buildUnavailableIndicator(parseError || 'Unable to parse npm audit output');
  }

  const vulnerabilityMetadata = parsedAuditReport.metadata?.vulnerabilities || null;
  if (!vulnerabilityMetadata) {
    return buildUnavailableIndicator(
      parsedAuditReport.error?.summary || 'npm audit report does not include vulnerability metadata'
    );
  }

  const severityCounts = {
    info: toFiniteNumber(vulnerabilityMetadata.info, 0),
    low: toFiniteNumber(vulnerabilityMetadata.low, 0),
    moderate: toFiniteNumber(vulnerabilityMetadata.moderate, 0),
    high: toFiniteNumber(vulnerabilityMetadata.high, 0),
    critical: toFiniteNumber(vulnerabilityMetadata.critical, 0),
    total: toFiniteNumber(vulnerabilityMetadata.total, 0),
  };

  return {
    isAvailable: true,
    exitCode,
    severityCounts,
    hasKnownVulnerabilities: severityCounts.total > 0,
    error: null,
  };
}
|
|
202
|
+
|
|
82
203
|
function summarizeExecution(scriptExecutionResult) {
|
|
83
204
|
return {
|
|
84
205
|
scriptPath: scriptExecutionResult.scriptPath,
|
|
@@ -92,7 +213,18 @@ function summarizeExecution(scriptExecutionResult) {
|
|
|
92
213
|
};
|
|
93
214
|
}
|
|
94
215
|
|
|
95
|
-
function
|
|
216
|
+
/**
 * Return a new array holding every base value followed by each additional
 * value that is not already present. The inputs are not modified, and
 * duplicates already inside `baseValues` are kept as they are.
 *
 * @param {Iterable<string>} baseValues - Values that always come first.
 * @param {Iterable<string>} additionalValues - Values appended when missing.
 * @returns {string[]} The merged list.
 */
function appendUniqueTextValues(baseValues, additionalValues) {
  const mergedValues = [...baseValues];
  const knownValues = new Set(mergedValues);

  for (const candidateValue of additionalValues) {
    if (knownValues.has(candidateValue)) {
      continue;
    }

    knownValues.add(candidateValue);
    mergedValues.push(candidateValue);
  }

  return mergedValues;
}
|
|
226
|
+
|
|
227
|
+
function buildRubricSummary(thresholdConfiguration, intelligenceReport, memoryContinuityReport) {
|
|
96
228
|
return {
|
|
97
229
|
benchmarkThresholds: {
|
|
98
230
|
minimumTop1Accuracy: thresholdConfiguration?.minimumTop1Accuracy ?? null,
|
|
@@ -101,25 +233,296 @@ function buildRubricSummary(thresholdConfiguration, intelligenceReport) {
|
|
|
101
233
|
maximumManualCorrectionIncrease: thresholdConfiguration?.maximumManualCorrectionIncrease ?? null,
|
|
102
234
|
},
|
|
103
235
|
intelligenceSlaDays: intelligenceReport?.reviewSlaDays ?? null,
|
|
236
|
+
reliabilityThresholds: RELIABILITY_THRESHOLDS,
|
|
237
|
+
continuityThresholds: memoryContinuityReport?.thresholds || null,
|
|
238
|
+
};
|
|
239
|
+
}
|
|
240
|
+
|
|
241
|
+
/**
 * Derive reliability early-warning signals from the detection benchmark's
 * per-fixture results.
 *
 * A fixture counts as incorrect when `isCorrect === false`, as needing manual
 * correction when `needsManualCorrection === true`, and as low confidence when
 * its `confidenceGap` (0 when absent or non-numeric) is below
 * RELIABILITY_THRESHOLDS.minimumConfidenceGap. All rates are 0 when the report
 * has no fixtures.
 *
 * @param {object | null | undefined} detectionBenchmarkReport - Parsed report;
 *   only its `fixtures` array is read.
 * @returns {object} Pass flag, failed-check count, risk level
 *   ('stable' | 'monitor' | 'elevated' | 'high'), thresholds, metrics, the
 *   individual checks, and the fixtures that were incorrect or needed manual
 *   correction.
 */
function buildReliabilitySignals(detectionBenchmarkReport) {
  const reportedFixtures = detectionBenchmarkReport?.fixtures;
  const fixtureResults = Array.isArray(reportedFixtures) ? reportedFixtures : [];
  const fixtureCount = fixtureResults.length;

  const isIncorrect = (fixtureResult) => fixtureResult?.isCorrect === false;
  const needsCorrection = (fixtureResult) => fixtureResult?.needsManualCorrection === true;
  const isLowConfidence = (fixtureResult) =>
    toFiniteNumber(fixtureResult?.confidenceGap, 0) < RELIABILITY_THRESHOLDS.minimumConfidenceGap;

  const incorrectFixtures = fixtureResults.filter(isIncorrect);
  const lowConfidenceFixtures = fixtureResults.filter(isLowConfidence);
  const manualCorrectionFixtures = fixtureResults.filter(needsCorrection);

  // Share of all fixtures, rounded to four decimals; 0 when there are none.
  const shareOfFixtures = (subsetSize) => {
    if (fixtureCount === 0) {
      return 0;
    }

    return Number((subsetSize / fixtureCount).toFixed(4));
  };

  const incorrectDetectionRate = shareOfFixtures(incorrectFixtures.length);
  const lowConfidenceRate = shareOfFixtures(lowConfidenceFixtures.length);
  const manualCorrectionRate = shareOfFixtures(manualCorrectionFixtures.length);

  const reliabilityChecks = [
    {
      checkName: 'incorrect-detection-rate',
      passed: incorrectDetectionRate <= RELIABILITY_THRESHOLDS.maximumIncorrectDetectionRate,
      details: `incorrectRate=${incorrectDetectionRate} max=${RELIABILITY_THRESHOLDS.maximumIncorrectDetectionRate}`,
    },
    {
      checkName: 'low-confidence-rate',
      passed: lowConfidenceRate <= RELIABILITY_THRESHOLDS.maximumLowConfidenceRate,
      details: `lowConfidenceRate=${lowConfidenceRate} max=${RELIABILITY_THRESHOLDS.maximumLowConfidenceRate}`,
    },
    {
      checkName: 'manual-correction-early-warning',
      passed: manualCorrectionRate <= 0.12,
      details: `manualCorrectionRate=${manualCorrectionRate} warningThreshold=0.12`,
    },
  ];

  const failureCount = reliabilityChecks.filter((reliabilityCheck) => !reliabilityCheck.passed).length;

  // No failed check: 'stable', or 'monitor' if any fixture was still flagged.
  // One failed check: 'elevated'. Two or more: 'high'.
  let riskLevel;
  if (failureCount >= 2) {
    riskLevel = 'high';
  } else if (failureCount !== 0) {
    riskLevel = 'elevated';
  } else if (incorrectFixtures.length > 0 || lowConfidenceFixtures.length > 0) {
    riskLevel = 'monitor';
  } else {
    riskLevel = 'stable';
  }

  const flaggedFixtures = fixtureResults
    .filter((fixtureResult) => isIncorrect(fixtureResult) || needsCorrection(fixtureResult))
    .map((fixtureResult) => ({
      fixtureName: fixtureResult.fixtureName,
      confidenceGap: fixtureResult.confidenceGap,
      detectedStack: fixtureResult.detectedStack,
      expectedStack: fixtureResult.expectedStack,
      isCorrect: fixtureResult.isCorrect,
      needsManualCorrection: fixtureResult.needsManualCorrection,
    }));

  return {
    passed: failureCount === 0,
    failureCount,
    riskLevel,
    thresholds: RELIABILITY_THRESHOLDS,
    metrics: {
      fixtureCount,
      incorrectFixtureCount: incorrectFixtures.length,
      lowConfidenceFixtureCount: lowConfidenceFixtures.length,
      manualCorrectionFixtureCount: manualCorrectionFixtures.length,
      incorrectDetectionRate,
      lowConfidenceRate,
      manualCorrectionRate,
    },
    checks: reliabilityChecks,
    flaggedFixtures,
  };
}
|
|
312
|
+
|
|
313
|
+
/**
 * Flatten the reliability signals into the bug-indicator section of the
 * bundle: the fixture counts and rates plus the flagged fixtures.
 *
 * @param {{ metrics: object, flaggedFixtures: Array<object> }} reliabilitySignals
 *   Signals object providing `metrics` and `flaggedFixtures`.
 * @returns {object} Counts and rates copied from `metrics`, followed by
 *   `flaggedFixtures`.
 */
function buildBugIndicators(reliabilitySignals) {
  const { metrics, flaggedFixtures } = reliabilitySignals;
  const {
    incorrectFixtureCount,
    incorrectDetectionRate,
    manualCorrectionFixtureCount,
    manualCorrectionRate,
    lowConfidenceFixtureCount,
    lowConfidenceRate,
  } = metrics;

  return {
    incorrectFixtureCount,
    incorrectDetectionRate,
    manualCorrectionFixtureCount,
    manualCorrectionRate,
    lowConfidenceFixtureCount,
    lowConfidenceRate,
    flaggedFixtures,
  };
}
|
|
324
|
+
|
|
325
|
+
/**
 * Assemble the security section of the bundle from the forbidden-content
 * scan result and the npm audit indicator.
 *
 * @param {{ exitCode: number }} forbiddenContentExecution - Scan execution
 *   result; exit code 0 is treated as a pass.
 * @param {{ isAvailable: boolean, hasKnownVulnerabilities: boolean | null, severityCounts: object | null, exitCode: number, error: string | null }} npmAuditIndicator
 *   Audit indicator whose fields are copied through unchanged.
 * @returns {{ forbiddenContent: object, vulnerabilityScan: object }}
 */
function buildSecurityIndicators(forbiddenContentExecution, npmAuditIndicator) {
  const { exitCode: forbiddenContentExitCode } = forbiddenContentExecution;
  const forbiddenContentPassed = forbiddenContentExitCode === 0;

  let forbiddenContentDetails = 'Forbidden content scan found one or more violations';
  if (forbiddenContentPassed) {
    forbiddenContentDetails = 'No forbidden content detected';
  }

  const forbiddenContent = {
    checkName: 'forbidden-content-scan',
    passed: forbiddenContentPassed,
    exitCode: forbiddenContentExitCode,
    details: forbiddenContentDetails,
  };

  const { isAvailable, hasKnownVulnerabilities, severityCounts, exitCode, error } = npmAuditIndicator;
  const vulnerabilityScan = {
    checkName: 'npm-audit-indicator',
    isAvailable,
    hasKnownVulnerabilities,
    severityCounts,
    exitCode,
    error,
  };

  return { forbiddenContent, vulnerabilityScan };
}
|
|
347
|
+
|
|
348
|
+
/**
 * Read the package version from package.json.
 *
 * @returns {string} The trimmed `version` field, or 'unknown' when the file
 *   cannot be read or the field is not a non-blank string.
 */
function readReleaseVersion() {
  const declaredVersion = readJsonOrNull(PACKAGE_JSON_PATH)?.version;
  if (typeof declaredVersion !== 'string') {
    return 'unknown';
  }

  const trimmedVersion = declaredVersion.trim();
  return trimmedVersion.length > 0 ? trimmedVersion : 'unknown';
}
|
|
354
|
+
|
|
355
|
+
/**
 * Load previously recorded benchmark snapshots from the history file.
 *
 * Two layouts are accepted: an object with a `history` array, or a bare
 * array of snapshots. Entries that are not plain objects are dropped, because
 * the consumers of this list read properties directly off every entry and
 * would throw on a `null` or primitive left behind by a damaged or
 * hand-edited file.
 *
 * @returns {Array<object>} Stored snapshots in file order; empty when the
 *   file is missing, unreadable, or has neither layout.
 */
function loadBenchmarkHistory() {
  const historyPayload = readJsonOrNull(HISTORY_OUTPUT_PATH);
  const storedEntries = Array.isArray(historyPayload)
    ? historyPayload
    : historyPayload?.history;

  if (!Array.isArray(storedEntries)) {
    return [];
  }

  return storedEntries.filter(
    (historyEntry) => historyEntry !== null
      && typeof historyEntry === 'object'
      && !Array.isArray(historyEntry)
  );
}
|
|
368
|
+
|
|
369
|
+
/**
 * Append the current snapshot to the stored history and keep only the most
 * recent MAX_HISTORY_ENTRIES entries.
 *
 * @param {Array<object>} previousHistoryEntries - Stored snapshots, oldest first.
 * @param {object} currentSnapshot - Snapshot produced by this run.
 * @returns {Array<object>} New array ending with `currentSnapshot`.
 */
function mergeBenchmarkHistory(previousHistoryEntries, currentSnapshot) {
  const combinedEntries = [...previousHistoryEntries, currentSnapshot];
  const overflowCount = combinedEntries.length - MAX_HISTORY_ENTRIES;

  // Drop the oldest entries only when the cap is exceeded.
  return overflowCount > 0 ? combinedEntries.slice(overflowCount) : combinedEntries;
}
|
|
377
|
+
|
|
378
|
+
/**
 * Condense one run into the flat record stored in the benchmark history.
 *
 * @param {object} input
 * @param {string} input.generatedAt - Timestamp of the run.
 * @param {string} input.releaseVersion - Package version of the run.
 * @param {object | null} input.detectionBenchmarkReport - Supplies
 *   `fixtureCount`, `top1Accuracy` and `manualCorrectionRate` (0 when absent
 *   or non-numeric).
 * @param {object | null} input.benchmarkGateReport - Passed only when
 *   `passed === true`.
 * @param {object | null} input.benchmarkIntelligenceReport - Passed only when
 *   `passed === true`; its `watchlist` array yields the stale-entry count.
 * @param {object} input.reliabilitySignals - Supplies pass flag, risk level
 *   and rates.
 * @param {object} input.securityIndicators - Supplies vulnerability counts and
 *   the forbidden-content result.
 * @returns {object} The history snapshot.
 */
function buildHistorySnapshot({
  generatedAt,
  releaseVersion,
  detectionBenchmarkReport,
  benchmarkGateReport,
  benchmarkIntelligenceReport,
  reliabilitySignals,
  securityIndicators,
}) {
  // null (rather than 0) marks "no watchlist available".
  const watchlistEntries = benchmarkIntelligenceReport?.watchlist;
  let staleWatchlistCount = null;
  if (Array.isArray(watchlistEntries)) {
    staleWatchlistCount = watchlistEntries.filter((watchlistEntry) => watchlistEntry?.stale === true).length;
  }

  const fixtureCount = toFiniteNumber(detectionBenchmarkReport?.fixtureCount, 0);
  const top1Accuracy = toFiniteNumber(detectionBenchmarkReport?.top1Accuracy, 0);
  const manualCorrectionRate = toFiniteNumber(detectionBenchmarkReport?.manualCorrectionRate, 0);

  const reportedSeverityCounts = securityIndicators.vulnerabilityScan.severityCounts;

  return {
    generatedAt,
    releaseVersion,
    fixtureCount,
    top1Accuracy,
    manualCorrectionRate,
    benchmarkGatePassed: benchmarkGateReport?.passed === true,
    intelligencePassed: benchmarkIntelligenceReport?.passed === true,
    staleWatchlistCount,
    reliabilityPassed: reliabilitySignals.passed,
    reliabilityRiskLevel: reliabilitySignals.riskLevel,
    incorrectDetectionRate: reliabilitySignals.metrics.incorrectDetectionRate,
    lowConfidenceRate: reliabilitySignals.metrics.lowConfidenceRate,
    vulnerabilityTotal: reportedSeverityCounts?.total ?? null,
    criticalVulnerabilityCount: reportedSeverityCounts?.critical ?? null,
    forbiddenContentPassed: securityIndicators.forbiddenContent.passed,
  };
}
|
|
106
409
|
|
|
410
|
+
/**
 * Compare the current snapshot with the most recent snapshot of a different
 * release (or, when every earlier snapshot shares the current release, with
 * the immediately preceding one).
 *
 * Every metric is normalised with `toFiniteNumber(..., 0)` before subtraction,
 * so a previous snapshot that lacks a field yields a numeric delta instead of
 * `NaN`, which would serialise as `null` and print as "NaN" in the summary.
 *
 * @param {Array<object>} historyEntries - History whose last element is the
 *   current snapshot.
 * @param {object} currentSnapshot - Snapshot produced by this run.
 * @returns {object | null} Delta report, or null when there is no earlier
 *   snapshot to compare against.
 */
function buildReleaseDelta(historyEntries, currentSnapshot) {
  const previousSnapshots = historyEntries.slice(0, -1);
  if (previousSnapshots.length === 0) {
    return null;
  }

  const previousReleaseSnapshot = [...previousSnapshots].reverse().find(
    (historyEntry) => historyEntry.releaseVersion !== currentSnapshot.releaseVersion
  ) || previousSnapshots[previousSnapshots.length - 1];

  // Difference of one metric between the two snapshots, missing values as 0.
  const metricDelta = (metricName) =>
    toFiniteNumber(currentSnapshot[metricName], 0) - toFiniteNumber(previousReleaseSnapshot[metricName], 0);
  // Rates are rounded to four decimals to hide floating-point noise.
  const roundedRateDelta = (metricName) => Number(metricDelta(metricName).toFixed(4));
  const formatSigned = (deltaValue) => `${deltaValue >= 0 ? '+' : ''}${deltaValue}`;

  const top1AccuracyDelta = roundedRateDelta('top1Accuracy');
  const manualCorrectionDelta = roundedRateDelta('manualCorrectionRate');
  const staleWatchlistDelta = metricDelta('staleWatchlistCount');
  const vulnerabilityDelta = metricDelta('vulnerabilityTotal');

  return {
    currentReleaseVersion: currentSnapshot.releaseVersion,
    previousReleaseVersion: previousReleaseSnapshot.releaseVersion,
    comparedSnapshot: {
      currentGeneratedAt: currentSnapshot.generatedAt,
      previousGeneratedAt: previousReleaseSnapshot.generatedAt,
    },
    top1AccuracyDelta,
    manualCorrectionRateDelta: manualCorrectionDelta,
    staleWatchlistCountDelta: staleWatchlistDelta,
    vulnerabilityTotalDelta: vulnerabilityDelta,
    summary: [
      `top1Accuracy: ${formatSigned(top1AccuracyDelta)}`,
      `manualCorrectionRate: ${formatSigned(manualCorrectionDelta)}`,
      `staleWatchlistCount: ${formatSigned(staleWatchlistDelta)}`,
      `vulnerabilityTotal: ${formatSigned(vulnerabilityDelta)}`,
    ],
  };
}
|
|
446
|
+
|
|
447
|
+
/**
 * Turn the history into trend-table rows: a 1-based `snapshotIndex` followed
 * by a fixed set of columns copied from each snapshot. The column order is
 * significant because the CSV export takes its headers from the first row.
 *
 * @param {Array<object>} historyEntries - Snapshots, oldest first.
 * @returns {Array<object>} One row per snapshot.
 */
function buildTrendTable(historyEntries) {
  const copiedColumns = [
    'generatedAt',
    'releaseVersion',
    'top1Accuracy',
    'manualCorrectionRate',
    'incorrectDetectionRate',
    'lowConfidenceRate',
    'staleWatchlistCount',
    'vulnerabilityTotal',
    'criticalVulnerabilityCount',
    'benchmarkGatePassed',
    'intelligencePassed',
    'reliabilityPassed',
    'reliabilityRiskLevel',
  ];

  return historyEntries.map((historyEntry, position) => {
    const trendRow = { snapshotIndex: position + 1 };
    for (const columnName of copiedColumns) {
      trendRow[columnName] = historyEntry[columnName];
    }

    return trendRow;
  });
}
|
|
465
|
+
|
|
466
|
+
/**
 * Pivot the history into column-oriented series: for each tracked metric, an
 * array with that metric's value from every snapshot, in history order.
 *
 * @param {Array<object>} historyEntries - Snapshots, oldest first.
 * @returns {Record<string, Array<*>>} One array per series name.
 */
function buildChartSeries(historyEntries) {
  const seriesNames = [
    'generatedAt',
    'top1Accuracy',
    'manualCorrectionRate',
    'incorrectDetectionRate',
    'lowConfidenceRate',
    'staleWatchlistCount',
    'vulnerabilityTotal',
  ];

  const chartSeries = {};
  for (const seriesName of seriesNames) {
    chartSeries[seriesName] = historyEntries.map((historyEntry) => historyEntry[seriesName]);
  }

  return chartSeries;
}
|
|
477
|
+
|
|
478
|
+
/**
 * Serialise the trend table as CSV text.
 *
 * Headers are the keys of the first row and are written unquoted. Every other
 * cell is wrapped in double quotes with embedded quotes doubled, except null
 * and undefined, which become empty unquoted cells. Lines are joined with
 * `\n` and the output ends with a trailing newline.
 *
 * @param {Array<object>} trendTable - Rows to serialise.
 * @returns {string} CSV text, or '' for an empty table.
 */
function convertTrendTableToCsv(trendTable) {
  if (trendTable.length === 0) {
    return '';
  }

  const columnNames = Object.keys(trendTable[0]);

  const encodeCell = (cellValue) => {
    if (cellValue === null || cellValue === undefined) {
      return '';
    }

    const escapedText = String(cellValue).split('"').join('""');
    return `"${escapedText}"`;
  };

  const dataLines = Array.from(trendTable, (trendRow) =>
    columnNames.map((columnName) => encodeCell(trendRow[columnName])).join(',')
  );

  const csvLines = [columnNames.join(','), ...dataLines];
  return `${csvLines.join('\n')}\n`;
}
|
|
502
|
+
|
|
107
503
|
async function runBenchmarkEvidenceBundle() {
|
|
108
504
|
const reproducibilityProfile = readJsonOrNull(REPRO_PROFILE_PATH);
|
|
109
505
|
const thresholdConfiguration = readJsonOrNull(BENCHMARK_THRESHOLD_PATH);
|
|
110
506
|
const watchlistConfiguration = readJsonOrNull(BENCHMARK_WATCHLIST_PATH);
|
|
507
|
+
const memorySchemaConfiguration = readJsonOrNull(MEMORY_SCHEMA_PATH);
|
|
508
|
+
const memoryAdapterContractConfiguration = readJsonOrNull(MEMORY_ADAPTER_CONTRACT_PATH);
|
|
509
|
+
const releaseVersion = readReleaseVersion();
|
|
111
510
|
|
|
112
511
|
const detectionBenchmarkExecution = runJsonScript('scripts/detection-benchmark.mjs');
|
|
113
512
|
const benchmarkGateExecution = runJsonScript('scripts/benchmark-gate.mjs');
|
|
114
513
|
const benchmarkIntelligenceExecution = runJsonScript('scripts/benchmark-intelligence.mjs');
|
|
514
|
+
const memoryContinuityExecution = runJsonScript('scripts/memory-continuity-benchmark.mjs');
|
|
515
|
+
const forbiddenContentExecution = runNodeScript('scripts/forbidden-content-check.mjs');
|
|
516
|
+
const npmAuditIndicator = runNpmAuditIndicator();
|
|
115
517
|
|
|
116
518
|
const executionSummaries = [
|
|
117
519
|
summarizeExecution(detectionBenchmarkExecution),
|
|
118
520
|
summarizeExecution(benchmarkGateExecution),
|
|
119
521
|
summarizeExecution(benchmarkIntelligenceExecution),
|
|
522
|
+
summarizeExecution(memoryContinuityExecution),
|
|
120
523
|
];
|
|
121
524
|
|
|
122
|
-
const
|
|
525
|
+
const executionFailureCount = executionSummaries.filter((executionSummary) => {
|
|
123
526
|
if (executionSummary.parseError) {
|
|
124
527
|
return true;
|
|
125
528
|
}
|
|
@@ -131,41 +534,115 @@ async function runBenchmarkEvidenceBundle() {
|
|
|
131
534
|
return executionSummary.exitCode !== 0;
|
|
132
535
|
}).length;
|
|
133
536
|
|
|
537
|
+
const reliabilitySignals = buildReliabilitySignals(detectionBenchmarkExecution.parsedReport);
|
|
538
|
+
const reliabilityFailureCount = reliabilitySignals.failureCount > 0 ? 1 : 0;
|
|
539
|
+
const failureCount = executionFailureCount + reliabilityFailureCount;
|
|
540
|
+
const securityIndicators = buildSecurityIndicators(forbiddenContentExecution, npmAuditIndicator);
|
|
541
|
+
const bugIndicators = buildBugIndicators(reliabilitySignals);
|
|
542
|
+
|
|
543
|
+
const generatedAt = new Date().toISOString();
|
|
544
|
+
const currentSnapshot = buildHistorySnapshot({
|
|
545
|
+
generatedAt,
|
|
546
|
+
releaseVersion,
|
|
547
|
+
detectionBenchmarkReport: detectionBenchmarkExecution.parsedReport,
|
|
548
|
+
benchmarkGateReport: benchmarkGateExecution.parsedReport,
|
|
549
|
+
benchmarkIntelligenceReport: benchmarkIntelligenceExecution.parsedReport,
|
|
550
|
+
reliabilitySignals,
|
|
551
|
+
securityIndicators,
|
|
552
|
+
});
|
|
553
|
+
const previousHistoryEntries = loadBenchmarkHistory();
|
|
554
|
+
const historyEntries = mergeBenchmarkHistory(previousHistoryEntries, currentSnapshot);
|
|
555
|
+
const releaseDelta = buildReleaseDelta(historyEntries, currentSnapshot);
|
|
556
|
+
const trendTable = buildTrendTable(historyEntries);
|
|
557
|
+
const chartSeries = buildChartSeries(historyEntries);
|
|
558
|
+
|
|
559
|
+
const trendReport = {
|
|
560
|
+
generatedAt,
|
|
561
|
+
reportName: 'benchmark-trend-report',
|
|
562
|
+
releaseVersion,
|
|
563
|
+
historyCount: historyEntries.length,
|
|
564
|
+
releaseDelta,
|
|
565
|
+
trendTable,
|
|
566
|
+
chartSeries,
|
|
567
|
+
artifacts: {
|
|
568
|
+
historyPath: toRelativePath(HISTORY_OUTPUT_PATH),
|
|
569
|
+
jsonPath: toRelativePath(TREND_JSON_OUTPUT_PATH),
|
|
570
|
+
csvPath: toRelativePath(TREND_CSV_OUTPUT_PATH),
|
|
571
|
+
writeMode: isStdoutOnlyMode ? 'stdout-only' : 'stdout-and-file',
|
|
572
|
+
},
|
|
573
|
+
};
|
|
574
|
+
|
|
575
|
+
const historyPayload = {
|
|
576
|
+
generatedAt,
|
|
577
|
+
reportName: 'benchmark-history',
|
|
578
|
+
maxEntries: MAX_HISTORY_ENTRIES,
|
|
579
|
+
history: historyEntries,
|
|
580
|
+
};
|
|
581
|
+
|
|
582
|
+
const trendCsvPayload = convertTrendTableToCsv(trendTable);
|
|
583
|
+
const baseRerunInstructions = Array.isArray(reproducibilityProfile?.rerunInstructions)
|
|
584
|
+
? reproducibilityProfile.rerunInstructions
|
|
585
|
+
: [];
|
|
586
|
+
const baseCommandExamples = Array.isArray(reproducibilityProfile?.commandExamples)
|
|
587
|
+
? reproducibilityProfile.commandExamples
|
|
588
|
+
: [];
|
|
589
|
+
const rerunInstructions = appendUniqueTextValues(baseRerunInstructions, [
|
|
590
|
+
'Run npm run benchmark:continuity to validate cross-agent memory hydration, privacy redaction, and token-savings behavior.',
|
|
591
|
+
]);
|
|
592
|
+
const commandExamples = appendUniqueTextValues(baseCommandExamples, [
|
|
593
|
+
'npm run benchmark:continuity',
|
|
594
|
+
'node ./scripts/memory-continuity-benchmark.mjs --stdout-only',
|
|
595
|
+
]);
|
|
596
|
+
|
|
134
597
|
const evidenceBundleReport = {
|
|
135
|
-
generatedAt
|
|
598
|
+
generatedAt,
|
|
136
599
|
reportName: 'benchmark-evidence-bundle',
|
|
137
|
-
phase: 'v2.5.
|
|
600
|
+
phase: 'v2.5.2',
|
|
601
|
+
releaseVersion,
|
|
138
602
|
passed: failureCount === 0,
|
|
139
603
|
failureCount,
|
|
140
604
|
methodology: {
|
|
141
605
|
deterministicRuntime: reproducibilityProfile?.deterministicRuntime || null,
|
|
142
606
|
scenarioCount: Array.isArray(reproducibilityProfile?.scenarios) ? reproducibilityProfile.scenarios.length : 0,
|
|
143
|
-
commandCount:
|
|
607
|
+
commandCount: commandExamples.length,
|
|
144
608
|
},
|
|
145
|
-
rerunInstructions
|
|
146
|
-
|
|
147
|
-
: [],
|
|
148
|
-
commandExamples: Array.isArray(reproducibilityProfile?.commandExamples)
|
|
149
|
-
? reproducibilityProfile.commandExamples
|
|
150
|
-
: [],
|
|
609
|
+
rerunInstructions,
|
|
610
|
+
commandExamples,
|
|
151
611
|
rawInputs: {
|
|
152
612
|
scenarios: Array.isArray(reproducibilityProfile?.scenarios) ? reproducibilityProfile.scenarios : [],
|
|
153
613
|
benchmarkThresholds: thresholdConfiguration,
|
|
154
614
|
benchmarkWatchlist: Array.isArray(watchlistConfiguration?.repositories)
|
|
155
615
|
? watchlistConfiguration.repositories
|
|
156
616
|
: [],
|
|
617
|
+
memorySchema: memorySchemaConfiguration,
|
|
618
|
+
memoryAdapterContract: memoryAdapterContractConfiguration,
|
|
157
619
|
},
|
|
158
|
-
rubric: buildRubricSummary(
|
|
620
|
+
rubric: buildRubricSummary(
|
|
621
|
+
thresholdConfiguration,
|
|
622
|
+
benchmarkIntelligenceExecution.parsedReport,
|
|
623
|
+
memoryContinuityExecution.parsedReport
|
|
624
|
+
),
|
|
625
|
+
bugIndicators,
|
|
626
|
+
reliabilitySignals,
|
|
627
|
+
securityIndicators,
|
|
628
|
+
releaseDelta,
|
|
629
|
+
history: historyEntries,
|
|
630
|
+
trendReport,
|
|
159
631
|
outputs: {
|
|
160
632
|
detectionBenchmark: detectionBenchmarkExecution.parsedReport,
|
|
161
633
|
benchmarkGate: benchmarkGateExecution.parsedReport,
|
|
162
634
|
benchmarkIntelligence: benchmarkIntelligenceExecution.parsedReport,
|
|
635
|
+
memoryContinuityBenchmark: memoryContinuityExecution.parsedReport,
|
|
163
636
|
},
|
|
164
637
|
executions: executionSummaries,
|
|
165
638
|
};
|
|
166
639
|
|
|
167
640
|
if (!isStdoutOnlyMode) {
|
|
641
|
+
await fs.mkdir(join(REPOSITORY_ROOT, '.agent-context', 'state'), { recursive: true });
|
|
168
642
|
await fs.writeFile(OUTPUT_PATH, JSON.stringify(evidenceBundleReport, null, 2) + '\n', 'utf8');
|
|
643
|
+
await fs.writeFile(HISTORY_OUTPUT_PATH, JSON.stringify(historyPayload, null, 2) + '\n', 'utf8');
|
|
644
|
+
await fs.writeFile(TREND_JSON_OUTPUT_PATH, JSON.stringify(trendReport, null, 2) + '\n', 'utf8');
|
|
645
|
+
await fs.writeFile(TREND_CSV_OUTPUT_PATH, trendCsvPayload, 'utf8');
|
|
169
646
|
}
|
|
170
647
|
|
|
171
648
|
console.log(JSON.stringify(evidenceBundleReport, null, 2));
|
|
@@ -21,6 +21,7 @@ const REQUIRED_FILES = [
|
|
|
21
21
|
'docs/v1.7-issue-breakdown.md',
|
|
22
22
|
'docs/v1.7-execution-playbook.md',
|
|
23
23
|
'.agent-context/review-checklists/frontend-usability.md',
|
|
24
|
+
'.agent-context/review-checklists/frontend-excellence-rubric.md',
|
|
24
25
|
];
|
|
25
26
|
|
|
26
27
|
const REQUIRED_ROADMAP_SNIPPETS = [
|
|
@@ -37,6 +38,14 @@ const REQUIRED_CHECKLIST_SNIPPETS = [
|
|
|
37
38
|
'Documentation and Release Evidence',
|
|
38
39
|
];
|
|
39
40
|
|
|
41
|
+
const REQUIRED_EXCELLENCE_RUBRIC_SNIPPETS = [
|
|
42
|
+
'Visual Direction and Identity',
|
|
43
|
+
'Typography Quality',
|
|
44
|
+
'Color System Diversity and Contrast',
|
|
45
|
+
'Interaction Choreography',
|
|
46
|
+
'Awwwards-level reference quality',
|
|
47
|
+
];
|
|
48
|
+
|
|
40
49
|
function assertFileExists(relativeFilePath, failures) {
|
|
41
50
|
const absoluteFilePath = resolve(REPOSITORY_ROOT, relativeFilePath);
|
|
42
51
|
if (!existsSync(absoluteFilePath)) {
|
|
@@ -61,6 +70,7 @@ function runAudit() {
|
|
|
61
70
|
|
|
62
71
|
const roadmapPath = 'docs/roadmap.md';
|
|
63
72
|
const checklistPath = '.agent-context/review-checklists/frontend-usability.md';
|
|
73
|
+
const excellenceRubricPath = '.agent-context/review-checklists/frontend-excellence-rubric.md';
|
|
64
74
|
|
|
65
75
|
if (existsSync(resolve(REPOSITORY_ROOT, roadmapPath))) {
|
|
66
76
|
const roadmapContent = readFileSync(resolve(REPOSITORY_ROOT, roadmapPath), 'utf8');
|
|
@@ -72,6 +82,17 @@ function runAudit() {
|
|
|
72
82
|
assertContains('Checklist', checklistPath, checklistContent, REQUIRED_CHECKLIST_SNIPPETS, failures);
|
|
73
83
|
}
|
|
74
84
|
|
|
85
|
+
if (existsSync(resolve(REPOSITORY_ROOT, excellenceRubricPath))) {
|
|
86
|
+
const excellenceRubricContent = readFileSync(resolve(REPOSITORY_ROOT, excellenceRubricPath), 'utf8');
|
|
87
|
+
assertContains(
|
|
88
|
+
'Frontend excellence rubric',
|
|
89
|
+
excellenceRubricPath,
|
|
90
|
+
excellenceRubricContent,
|
|
91
|
+
REQUIRED_EXCELLENCE_RUBRIC_SNIPPETS,
|
|
92
|
+
failures
|
|
93
|
+
);
|
|
94
|
+
}
|
|
95
|
+
|
|
75
96
|
const reportPayload = {
|
|
76
97
|
generatedAt: new Date().toISOString(),
|
|
77
98
|
auditName: 'frontend-usability-audit',
|