@ryuenn3123/agentic-senior-core 2.0.26 → 2.0.27
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agent-context/state/benchmark-evidence-bundle.json +672 -22
- package/.agent-context/state/benchmark-history.json +75 -0
- package/.agent-context/state/benchmark-trend-report.csv +5 -0
- package/.agent-context/state/benchmark-trend-report.json +140 -0
- package/.agent-context/state/benchmark-watchlist.json +3 -3
- package/.agent-context/state/memory-adapter-contract.json +52 -0
- package/.agent-context/state/memory-continuity-benchmark.json +132 -0
- package/.agent-context/state/memory-schema-v1.json +88 -0
- package/.cursorrules +1 -1
- package/.windsurfrules +1 -1
- package/README.md +22 -0
- package/lib/cli/memory-continuity.mjs +266 -0
- package/package.json +2 -1
- package/scripts/benchmark-evidence-bundle.mjs +493 -16
- package/scripts/memory-continuity-benchmark.mjs +322 -0
- package/scripts/validate.mjs +3 -0
|
@@ -0,0 +1,322 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* memory-continuity-benchmark.mjs
|
|
5
|
+
*
|
|
6
|
+
* Provider-agnostic continuity benchmark for cross-agent memory retrieval.
|
|
7
|
+
* Measures new-session hydration relevance, token savings, and privacy redaction safety.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
import { existsSync, readFileSync } from 'node:fs';
|
|
11
|
+
import fs from 'node:fs/promises';
|
|
12
|
+
import { dirname, join, resolve } from 'node:path';
|
|
13
|
+
import { fileURLToPath } from 'node:url';
|
|
14
|
+
import {
|
|
15
|
+
MEMORY_SCHEMA_VERSION,
|
|
16
|
+
SUPPORTED_MEMORY_ADAPTER_IDS,
|
|
17
|
+
normalizeMemoryObservation,
|
|
18
|
+
buildSessionStartIndex,
|
|
19
|
+
hydrateIndexedObservations,
|
|
20
|
+
estimateTokenUsage,
|
|
21
|
+
} from '../lib/cli/memory-continuity.mjs';
|
|
22
|
+
|
|
23
|
+
// Anchor every path to this script's own location so the benchmark resolves the
// same files regardless of the caller's working directory.
const SCRIPT_FILE_PATH = fileURLToPath(import.meta.url);
const SCRIPT_DIR = dirname(SCRIPT_FILE_PATH);
const REPOSITORY_ROOT = resolve(SCRIPT_DIR, '..');

// Everything after the node binary and the script path is treated as a flag.
const [, , ...cliArguments] = process.argv;
const ARGUMENT_FLAGS = new Set(cliArguments);
const isStdoutOnlyMode = ARGUMENT_FLAGS.has('--stdout-only');

// All benchmark inputs and outputs live under <repo>/.agent-context/state.
const resolveStateFilePath = (stateFileName) => join(REPOSITORY_ROOT, '.agent-context', 'state', stateFileName);

const MEMORY_SCHEMA_PATH = resolveStateFilePath('memory-schema-v1.json');
const MEMORY_ADAPTER_CONTRACT_PATH = resolveStateFilePath('memory-adapter-contract.json');
const OUTPUT_PATH = resolveStateFilePath('memory-continuity-benchmark.json');
|
|
32
|
+
|
|
33
|
+
/**
 * Pass/fail gates applied to the aggregated benchmark results.
 *
 * Frozen because the same object is embedded by reference in the emitted
 * report; freezing guarantees the reported thresholds are the ones the checks
 * were actually evaluated against.
 */
const THRESHOLDS = Object.freeze({
  // Lowest acceptable average recall across all continuity scenarios (0..1).
  minimumRelevantRecall: 0.8,
  // Lowest acceptable average token savings versus loading every observation, in percent.
  minimumSessionStartTokenSavingsPercent: 35,
  // Highest tolerated number of observations that still carry a private marker.
  maximumUnsafeObservationCount: 0,
});
|
|
38
|
+
|
|
39
|
+
/**
 * Raw observation fixtures used as the benchmark corpus.
 *
 * Each entry is passed through normalizeMemoryObservation before indexing, and
 * the normalized title, summary, and detail feed the full-context token
 * baseline. Only obs-001..obs-003 are referenced by CONTINUITY_SCENARIOS;
 * obs-004 and obs-005 are never expected by a scenario (presumably they act as
 * distractors for the retrieval step).
 */
const MEMORY_FIXTURES = [
  {
    id: 'obs-001',
    adapterId: 'claude-code',
    eventType: 'decision',
    projectId: 'agentic-senior-core',
    sessionId: 'session-a',
    timestamp: '2026-04-17T02:30:00.000Z',
    title: 'Docker development lane agreed',
    summary: 'Team agreed Docker for development and production lanes with separate optimization priorities.',
    // NOTE(review): the inline `api_key=...` token looks intended to exercise inline secret redaction — confirm against normalizeMemoryObservation.
    detail: 'Use compose for local build loop. Production images must stay minimal. api_key=prod-key-placeholder should be stored outside code.',
    tags: ['docker', 'dev-prod-split', 'runtime'],
    privacyLevel: 'internal',
  },
  {
    id: 'obs-002',
    adapterId: 'gemini-cli',
    eventType: 'context',
    projectId: 'agentic-senior-core',
    sessionId: 'session-b',
    timestamp: '2026-04-17T02:31:00.000Z',
    title: 'Runtime environment policy',
    summary: 'Windows and Linux or WSL command context should be explicit in generated guidance.',
    // Carries a <private> block; the privacy check counts a normalized observation as unsafe if `<private>` is still present in its detail.
    detail: 'Runtime hint: prefer linux-wsl for container tasks on Windows hosts. <private>internal machine hostname and user alias</private>',
    tags: ['runtime', 'linux-wsl', 'windows'],
    privacyLevel: 'restricted',
  },
  {
    id: 'obs-003',
    adapterId: 'vscode-chat',
    eventType: 'summary',
    projectId: 'agentic-senior-core',
    sessionId: 'session-c',
    timestamp: '2026-04-17T02:32:00.000Z',
    title: 'Frontend excellence baseline',
    summary: 'Frontend rubric must enforce visual direction, typography intent, and responsive behavior checks.',
    detail: 'Use diversity checks for layout and style systems, and avoid template output repetition.',
    tags: ['frontend', 'rubric', 'quality'],
    privacyLevel: 'internal',
  },
  {
    id: 'obs-004',
    adapterId: 'claude-code',
    eventType: 'issue',
    projectId: 'agentic-senior-core',
    sessionId: 'session-d',
    timestamp: '2026-04-17T02:33:00.000Z',
    title: 'Nested template conditional issue',
    summary: 'Nested conditional blocks caused unresolved placeholders in generated docs.',
    detail: 'Fix by precomputing placeholder text before rendering template.',
    tags: ['template', 'rendering', 'fix'],
    privacyLevel: 'internal',
  },
  {
    id: 'obs-005',
    adapterId: 'gemini-cli',
    eventType: 'tool-use',
    projectId: 'agentic-senior-core',
    sessionId: 'session-e',
    timestamp: '2026-04-17T02:34:00.000Z',
    title: 'Benchmark trend report generated',
    summary: 'History and trend csv outputs are generated each run for release-over-release review.',
    detail: 'Artifacts include benchmark-history json and benchmark-trend-report csv for quick chart ingestion.',
    tags: ['benchmark', 'trend', 'history'],
    privacyLevel: 'internal',
  },
];
|
|
106
|
+
|
|
107
|
+
/**
 * New-session retrieval scenarios evaluated by the benchmark.
 *
 * For each scenario, `query` is passed as the queryText when building the
 * session-start index, and `expectedObservationIds` is compared against the
 * ids selected during hydration to compute recall.
 */
const CONTINUITY_SCENARIOS = [
  {
    scenarioId: 'docker-lane-hydration',
    query: 'what is docker strategy for development and production',
    expectedObservationIds: ['obs-001'],
  },
  {
    scenarioId: 'runtime-hydration',
    query: 'which runtime target should we prefer on windows with wsl',
    expectedObservationIds: ['obs-002'],
  },
  {
    scenarioId: 'frontend-quality-hydration',
    query: 'show frontend rubric quality decisions',
    expectedObservationIds: ['obs-003'],
  },
];
|
|
124
|
+
|
|
125
|
+
/**
 * Read and parse a JSON file, treating every failure as "not available".
 *
 * @param {string} filePath - Path of the JSON file to load.
 * @returns {*} The parsed JSON value, or `null` when the file is missing,
 *   unreadable, or does not contain valid JSON.
 */
function readJsonOrNull(filePath) {
  try {
    // Read directly instead of probing with existsSync first: the probe is
    // redundant (a missing file throws and is handled below) and leaves a
    // window in which the file can disappear between the check and the read.
    const rawText = readFileSync(filePath, 'utf8');

    // Node does not strip a UTF-8 byte order mark and JSON.parse rejects it,
    // so without this a valid BOM-prefixed file would be reported as absent.
    const jsonText = rawText.charCodeAt(0) === 0xfeff ? rawText.slice(1) : rawText;

    return JSON.parse(jsonText);
  } catch {
    // Best-effort by design: callers treat null as "input not present".
    return null;
  }
}
|
|
136
|
+
|
|
137
|
+
/**
 * Compute recall: the fraction of distinct expected ids that were selected.
 *
 * @param {string[]} expectedObservationIds - Ids that should have been retrieved.
 * @param {Iterable<string>|null|undefined} selectedIds - Ids that were actually retrieved.
 * @returns {number} Recall in the range 0..1, rounded to four decimals. Returns 1
 *   when there is nothing to expect (missing, non-array, or empty expectation).
 */
function calculateRecall(expectedObservationIds, selectedIds) {
  if (!Array.isArray(expectedObservationIds) || expectedObservationIds.length === 0) {
    return 1;
  }

  // Recall is defined over the set of relevant items, so a repeated expected id
  // must not be counted twice in either the numerator or the denominator.
  const distinctExpectedIds = [...new Set(expectedObservationIds)];
  const selectedIdSet = new Set(selectedIds);
  const matchedCount = distinctExpectedIds.filter((expectedId) => selectedIdSet.has(expectedId)).length;

  return Number((matchedCount / distinctExpectedIds.length).toFixed(4));
}
|
|
146
|
+
|
|
147
|
+
/**
 * Aggregate redaction statistics and detect observations that still leak a
 * private marker after normalization.
 *
 * An observation is counted as unsafe when a literal `<private>` tag
 * (case-insensitive) is still present in its title, summary, or detail. All
 * three text fields are scanned because each of them is surfaced to a new
 * session, so a marker surviving in any of them is a leak.
 *
 * @param {Array<object>} normalizedObservations - Observations exposing
 *   `title`, `summary`, `detail`, and a `privacy` object with
 *   `redactionApplied`, `privateTagRedactionCount`, and `inlineRedactionCount`.
 * @returns {{redactedObservationCount: number, privateTagRedactionCount: number,
 *   inlineRedactionCount: number, unsafeObservationCount: number}} Totals across
 *   all observations.
 */
function summarizePrivacyControls(normalizedObservations) {
  const privateMarkerPattern = /<private>/i;
  // Missing or non-numeric counters contribute 0 instead of turning the total into NaN.
  const toCount = (value) => {
    const numericValue = Number(value);
    return Number.isFinite(numericValue) ? numericValue : 0;
  };

  let redactedObservationCount = 0;
  let privateTagRedactionCount = 0;
  let inlineRedactionCount = 0;
  let unsafeObservationCount = 0;

  for (const normalizedObservation of normalizedObservations) {
    // Tolerate observations without privacy metadata rather than throwing mid-summary.
    const privacy = normalizedObservation?.privacy ?? {};

    if (privacy.redactionApplied) {
      redactedObservationCount += 1;
    }
    privateTagRedactionCount += toCount(privacy.privateTagRedactionCount);
    inlineRedactionCount += toCount(privacy.inlineRedactionCount);

    const hasPrivateMarker = [
      normalizedObservation?.title,
      normalizedObservation?.summary,
      normalizedObservation?.detail,
    ].some((fieldValue) => privateMarkerPattern.test(String(fieldValue || '')));

    if (hasPrivateMarker) {
      unsafeObservationCount += 1;
    }
  }

  return {
    redactedObservationCount,
    privateTagRedactionCount,
    inlineRedactionCount,
    unsafeObservationCount,
  };
}
|
|
174
|
+
|
|
175
|
+
/**
 * Check that every required adapter id is available.
 *
 * Available ids are the union of SUPPORTED_MEMORY_ADAPTER_IDS and the
 * `adapterId` of each entry declared in the contract. Required ids come from
 * the contract's `requiredAdapters` array, falling back to
 * SUPPORTED_MEMORY_ADAPTER_IDS when the contract is missing or has no such array.
 *
 * @param {object|null|undefined} adapterContract - Parsed adapter contract, if any.
 * @returns {{requiredAdapterIds: string[], availableAdapterIds: string[],
 *   missingAdapterIds: string[], passed: boolean}} Coverage result; `passed` is
 *   true when no required id is missing.
 */
function buildAdapterCoverage(adapterContract) {
  const requiredAdapterIds = Array.isArray(adapterContract?.requiredAdapters)
    ? adapterContract.requiredAdapters
    : [...SUPPORTED_MEMORY_ADAPTER_IDS];

  // Skip malformed entries (null entries, missing or non-string ids) so a bad
  // contract cannot throw here or leak `undefined` into the reported id list.
  const declaredAdapterIds = Array.isArray(adapterContract?.adapters)
    ? adapterContract.adapters
      .map((adapterEntry) => adapterEntry?.adapterId)
      .filter((adapterId) => typeof adapterId === 'string' && adapterId.length > 0)
    : [];

  const availableAdapterIdSet = new Set([
    ...SUPPORTED_MEMORY_ADAPTER_IDS,
    ...declaredAdapterIds,
  ]);
  const availableAdapterIds = [...availableAdapterIdSet].sort();

  const missingAdapterIds = requiredAdapterIds.filter(
    (requiredAdapterId) => !availableAdapterIdSet.has(requiredAdapterId)
  );

  return {
    requiredAdapterIds,
    availableAdapterIds,
    missingAdapterIds,
    passed: missingAdapterIds.length === 0,
  };
}
|
|
200
|
+
|
|
201
|
+
/**
 * Run the memory continuity benchmark end to end.
 *
 * Normalizes the fixtures, evaluates every continuity scenario (index build
 * followed by hydration), aggregates recall and token savings, evaluates the
 * threshold checks, then prints the JSON report to stdout. Unless
 * `--stdout-only` was passed, the same report is also written to OUTPUT_PATH.
 *
 * The outcome is signalled through `process.exitCode` (0 when every check
 * passed, 1 otherwise) rather than `process.exit()`, so that the report is
 * fully flushed to stdout before the process ends even when stdout is an
 * asynchronous pipe.
 *
 * @returns {Promise<void>} Resolves once the report has been emitted.
 */
async function runMemoryContinuityBenchmark() {
  // Both inputs are optional: a missing or unparsable file yields null.
  const memorySchema = readJsonOrNull(MEMORY_SCHEMA_PATH);
  const memoryAdapterContract = readJsonOrNull(MEMORY_ADAPTER_CONTRACT_PATH);

  const normalizedObservations = MEMORY_FIXTURES.map((memoryFixture) => normalizeMemoryObservation(memoryFixture));

  // Baseline cost: loading title, summary, and detail of every observation.
  const fullContextTokenEstimate = normalizedObservations.reduce(
    (tokenAccumulator, normalizedObservation) => tokenAccumulator
      + estimateTokenUsage(normalizedObservation.title)
      + estimateTokenUsage(normalizedObservation.summary)
      + estimateTokenUsage(normalizedObservation.detail),
    0
  );

  const scenarioResults = CONTINUITY_SCENARIOS.map((continuityScenario) => {
    const sessionStartIndex = buildSessionStartIndex(normalizedObservations, {
      queryText: continuityScenario.query,
      limit: 5,
    });

    const hydration = hydrateIndexedObservations(
      sessionStartIndex.indexEntries,
      normalizedObservations,
      { fullFetchLimit: 1 }
    );

    const recall = calculateRecall(continuityScenario.expectedObservationIds, hydration.selectedIds);
    // Session-start cost is the index itself plus whatever was hydrated from it.
    const sessionStartTokenEstimate = sessionStartIndex.totalTokenEstimate + hydration.hydrationTokenEstimate;
    // Guard the division: with an empty baseline there is nothing to save.
    const tokenSavingsPercent = fullContextTokenEstimate === 0
      ? 0
      : Number((((fullContextTokenEstimate - sessionStartTokenEstimate) / fullContextTokenEstimate) * 100).toFixed(2));

    return {
      scenarioId: continuityScenario.scenarioId,
      query: continuityScenario.query,
      expectedObservationIds: continuityScenario.expectedObservationIds,
      indexObservationIds: sessionStartIndex.indexEntries.map((indexEntry) => indexEntry.id),
      hydratedObservationIds: hydration.selectedIds,
      relevantRecall: recall,
      fullContextTokenEstimate,
      sessionStartTokenEstimate,
      sessionStartTokenSavingsPercent: tokenSavingsPercent,
    };
  });

  const averageRelevantRecall = Number((
    scenarioResults.reduce((sum, scenarioResult) => sum + scenarioResult.relevantRecall, 0)
    / scenarioResults.length
  ).toFixed(4));

  const averageSessionStartTokenSavingsPercent = Number((
    scenarioResults.reduce((sum, scenarioResult) => sum + scenarioResult.sessionStartTokenSavingsPercent, 0)
    / scenarioResults.length
  ).toFixed(2));

  const privacyControls = summarizePrivacyControls(normalizedObservations);
  const adapterCoverage = buildAdapterCoverage(memoryAdapterContract);

  const checks = [
    {
      checkName: 'adapter-coverage',
      passed: adapterCoverage.passed,
      details: `required=${adapterCoverage.requiredAdapterIds.length} missing=${adapterCoverage.missingAdapterIds.length}`,
    },
    {
      checkName: 'continuity-recall-threshold',
      passed: averageRelevantRecall >= THRESHOLDS.minimumRelevantRecall,
      details: `averageRelevantRecall=${averageRelevantRecall} minimum=${THRESHOLDS.minimumRelevantRecall}`,
    },
    {
      checkName: 'session-start-token-savings-threshold',
      passed: averageSessionStartTokenSavingsPercent >= THRESHOLDS.minimumSessionStartTokenSavingsPercent,
      details: `averageSessionStartTokenSavingsPercent=${averageSessionStartTokenSavingsPercent} minimum=${THRESHOLDS.minimumSessionStartTokenSavingsPercent}`,
    },
    {
      checkName: 'privacy-redaction-safety',
      passed: privacyControls.unsafeObservationCount <= THRESHOLDS.maximumUnsafeObservationCount,
      details: `unsafeObservationCount=${privacyControls.unsafeObservationCount} max=${THRESHOLDS.maximumUnsafeObservationCount}`,
    },
  ];

  const failureCount = checks.filter((checkResult) => !checkResult.passed).length;

  const reportPayload = {
    generatedAt: new Date().toISOString(),
    reportName: 'memory-continuity-benchmark',
    schemaVersion: MEMORY_SCHEMA_VERSION,
    passed: failureCount === 0,
    failureCount,
    thresholds: THRESHOLDS,
    adapterCoverage,
    privacyControls,
    continuitySummary: {
      totalObservationCount: normalizedObservations.length,
      scenarioCount: scenarioResults.length,
      averageRelevantRecall,
      averageSessionStartTokenSavingsPercent,
    },
    scenarios: scenarioResults,
    references: {
      memorySchemaPath: '.agent-context/state/memory-schema-v1.json',
      memoryAdapterContractPath: '.agent-context/state/memory-adapter-contract.json',
      benchmarkOutputPath: '.agent-context/state/memory-continuity-benchmark.json',
      schemaDeclaredVersion: memorySchema?.schemaVersion || null,
      adapterContractVersion: memoryAdapterContract?.schemaVersion || null,
    },
    checks,
  };

  // Serialize once; the same text goes to the state file and to stdout.
  const serializedReport = JSON.stringify(reportPayload, null, 2);

  if (!isStdoutOnlyMode) {
    // Create the state directory on demand so a fresh checkout does not fail the write.
    await fs.mkdir(dirname(OUTPUT_PATH), { recursive: true });
    await fs.writeFile(OUTPUT_PATH, `${serializedReport}\n`, 'utf8');
  }

  console.log(serializedReport);

  // Set the exit code instead of calling process.exit(): a forced exit can cut
  // off pending stdout writes, truncating the JSON a parent process is parsing.
  process.exitCode = reportPayload.passed ? 0 : 1;
}
|
|
317
|
+
|
|
318
|
+
// Script entry point: run the benchmark and turn any unexpected error into a
// one-line diagnostic plus a failing exit status.
runMemoryContinuityBenchmark().catch((benchmarkError) => {
  const errorMessage = benchmarkError instanceof Error ? benchmarkError.message : String(benchmarkError);
  console.error(`Memory continuity benchmark failed: ${errorMessage}`);
  // Prefer process.exitCode over process.exit(1) so the diagnostic above is
  // flushed even when stderr is an asynchronous pipe.
  process.exitCode = 1;
});
|
package/scripts/validate.mjs
CHANGED
|
@@ -166,6 +166,7 @@ async function validateRequiredFiles() {
|
|
|
166
166
|
'scripts/benchmark-writer-judge-matrix.mjs',
|
|
167
167
|
'scripts/benchmark-gate.mjs',
|
|
168
168
|
'scripts/benchmark-intelligence.mjs',
|
|
169
|
+
'scripts/memory-continuity-benchmark.mjs',
|
|
169
170
|
'scripts/docs-quality-drift-report.mjs',
|
|
170
171
|
'scripts/governance-weekly-report.mjs',
|
|
171
172
|
'scripts/mcp-server.mjs',
|
|
@@ -193,6 +194,8 @@ async function validateRequiredFiles() {
|
|
|
193
194
|
'.agent-context/state/benchmark-reproducibility.json',
|
|
194
195
|
'.agent-context/state/benchmark-writer-judge-config.json',
|
|
195
196
|
'.agent-context/state/benchmark-watchlist.json',
|
|
197
|
+
'.agent-context/state/memory-schema-v1.json',
|
|
198
|
+
'.agent-context/state/memory-adapter-contract.json',
|
|
196
199
|
'.agent-context/state/skill-platform.json',
|
|
197
200
|
'.agent-context/skills/index.json',
|
|
198
201
|
'.vscode/mcp.json',
|