audrey 0.16.1 → 0.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -21
- package/README.md +310 -643
- package/benchmarks/baselines.js +169 -0
- package/benchmarks/cases.js +421 -0
- package/benchmarks/reference-results.js +70 -0
- package/benchmarks/report.js +255 -0
- package/benchmarks/run.js +514 -0
- package/docs/assets/benchmarks/local-benchmark.svg +45 -0
- package/docs/assets/benchmarks/operations-benchmark.svg +45 -0
- package/docs/assets/benchmarks/published-memory-standards.svg +50 -0
- package/docs/benchmarking.md +151 -0
- package/docs/production-readiness.md +96 -0
- package/examples/fintech-ops-demo.js +67 -0
- package/examples/healthcare-ops-demo.js +67 -0
- package/examples/stripe-demo.js +105 -0
- package/mcp-server/config.js +80 -27
- package/mcp-server/index.js +611 -75
- package/mcp-server/serve.js +482 -0
- package/package.json +24 -5
- package/src/audrey.js +51 -13
- package/src/consolidate.js +70 -54
- package/src/db.js +22 -1
- package/src/embedding.js +16 -12
- package/src/encode.js +8 -2
- package/src/fts.js +134 -0
- package/src/import.js +28 -0
- package/src/llm.js +6 -3
- package/src/migrate.js +2 -2
- package/src/recall.js +253 -32
- package/src/utils.js +25 -0
- package/types/index.d.ts +434 -0
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
import { createEmbeddingProvider } from '../src/embedding.js';
|
|
2
|
+
import { cosineSimilarity } from '../src/utils.js';
|
|
3
|
+
|
|
4
|
+
/**
 * Lower-cases arbitrary input so later comparisons are case-insensitive.
 *
 * @param {*} text - Value to normalize. Any falsy value (null, undefined, '', 0, ...)
 *   is treated as empty text.
 * @returns {string} The lower-cased string form of `text`, or '' for falsy input.
 */
function normalize(text) {
  const raw = text ? String(text) : '';
  return raw.toLowerCase();
}
|
|
7
|
+
|
|
8
|
+
/**
 * Splits text into lower-cased word tokens.
 *
 * Every run of characters that are not letters, combining marks, or digits is
 * treated as a single separator. The character class is Unicode-aware, so
 * accented and non-Latin words survive tokenization ("Café" -> "café") instead
 * of being truncated or dropped; plain ASCII input tokenizes exactly as before.
 *
 * @param {*} text - Value to tokenize; it is passed through `normalize` first.
 * @returns {string[]} Tokens in order of appearance (duplicates kept); an empty
 *   array when the input contains no letters or digits.
 */
function tokenize(text) {
  return normalize(text)
    // \p{M} keeps combining marks attached to their base letter.
    .replace(/[^\p{L}\p{M}\p{N}]+/gu, ' ')
    .trim()
    .split(/\s+/)
    // ''.split(...) yields [''], so drop empty entries.
    .filter(Boolean);
}
|
|
15
|
+
|
|
16
|
+
/**
 * Computes the fraction of query tokens that also occur in `content`.
 *
 * @param {string[]} queryTokens - Tokens of the query. Repeated tokens are
 *   counted once per occurrence.
 * @param {*} content - Text to search; it is tokenized with `tokenize`.
 * @returns {number} A value between 0 and 1; 0 when there are no query tokens.
 */
function keywordScore(queryTokens, content) {
  // Guard first: without query tokens the ratio below would be 0 / 0.
  if (queryTokens.length === 0) return 0;

  const vocabulary = new Set(tokenize(content));
  let hits = 0;
  for (const queryToken of queryTokens) {
    hits += vocabulary.has(queryToken) ? 1 : 0;
  }
  return hits / queryTokens.length;
}
|
|
25
|
+
|
|
26
|
+
/**
 * Ranks rows by descending score.
 *
 * Rows whose `score` is not a finite number are removed. Ties are broken by
 * `createdAt` in descending string order (a missing `createdAt` compares as '').
 * The input array is not modified; a new array is returned.
 *
 * @param {Array<{score: number, createdAt?: string}>} rows - Candidate rows.
 * @returns {Array<object>} The finite-score rows, best first.
 */
function sortByScore(rows) {
  const ranked = rows.filter(({ score }) => Number.isFinite(score));
  ranked.sort((left, right) => {
    const byScore = right.score - left.score;
    if (byScore) return byScore;
    return String(right.createdAt || '').localeCompare(String(left.createdAt || ''));
  });
  return ranked;
}
|
|
31
|
+
|
|
32
|
+
/**
 * Converts a case's `memory` entries into uniform records for the baselines.
 *
 * @param {{memory: Array<object>}} benchmarkCase - Case whose `memory` array is read.
 * @param {string[]} [ids=[]] - Optional ids by position; a missing or falsy id
 *   falls back to `memory-<position + 1>`.
 * @returns {Array<{id: string, content: *, source: *, createdAt: string, private: boolean}>}
 *   One record per memory entry, in the original order.
 */
function flattenMemories(benchmarkCase, ids = []) {
  return benchmarkCase.memory.map((entry, position) => {
    const ordinal = position + 1;
    // Entries without a timestamp get consecutive days starting 2026-01-01 UTC,
    // which keeps their relative order stable.
    const fallbackTimestamp = () => new Date(Date.UTC(2026, 0, ordinal)).toISOString();
    return {
      id: ids[position] || `memory-${ordinal}`,
      content: entry.content,
      source: entry.source,
      createdAt: entry.createdAt || fallbackTimestamp(),
      private: Boolean(entry.private),
    };
  });
}
|
|
41
|
+
|
|
42
|
+
/**
 * Wraps a query and a list of memory records in the `{ query, memory, options }`
 * shape that the baseline runners read.
 *
 * Only `content`, `source`, `createdAt` and `private` are copied from each
 * record; other fields (such as `id`) are left out.
 *
 * @param {*} query - Query to store on the case.
 * @param {Array<object>} memories - Records to copy into the case.
 * @param {object} [options={}] - Stored on the case unchanged.
 * @returns {{query: *, memory: Array<object>, options: object}} The synthetic case.
 */
function buildSyntheticCase(query, memories, options = {}) {
  const memory = memories.map(({ content, source, createdAt, private: isPrivate }) => ({
    content,
    source,
    createdAt,
    private: isPrivate,
  }));
  return { query, memory, options };
}
|
|
54
|
+
|
|
55
|
+
/**
 * Dispatches a retrieval request to the baseline named by `system`.
 *
 * @param {string} system - One of 'Vector Only', 'Keyword + Recency', 'Recent Window'.
 * @param {object} syntheticCase - Case passed through to the chosen baseline.
 * @param {object} providerConfig - Embedding provider configuration; only the
 *   'Vector Only' baseline receives it.
 * @param {number} [limit=5] - Maximum number of rows to return.
 * @returns {Promise<Array<object>>} Rows produced by the selected baseline.
 * @throws {Error} (as a rejection) when `system` is not a known baseline name.
 */
async function runBaselineRetrieval(system, syntheticCase, providerConfig, limit = 5) {
  // A Map (not a plain object) so names like 'constructor' never resolve by accident.
  const runners = new Map([
    ['Vector Only', () => runVectorOnlyBaseline(syntheticCase, providerConfig, limit)],
    ['Keyword + Recency', () => runKeywordRecencyBaseline(syntheticCase, limit)],
    ['Recent Window', () => runRecentWindowBaseline(syntheticCase, limit)],
  ]);

  const run = runners.get(system);
  if (!run) {
    throw new Error(`Unknown baseline system: ${system}`);
  }
  return run();
}
|
|
67
|
+
|
|
68
|
+
/**
 * Builds the memory record for an `encode` step and advances the scenario counter.
 *
 * @param {{counter: number}} state - Scenario state; `counter` is incremented by one.
 * @param {{memory: object}} step - Step whose `memory` supplies content, source,
 *   createdAt and private.
 * @returns {{id: string, content: *, source: *, createdAt: string, private: boolean}}
 *   Record with id `memory-<n>`, where n is the counter value after the increment.
 */
function createOperationMemory(state, step) {
  const position = state.counter++;
  const ordinal = position + 1;
  const { content, source, createdAt, private: isPrivate } = step.memory;
  return {
    id: `memory-${ordinal}`,
    content,
    source,
    // Untimestamped memories get consecutive days starting 2026-01-01 UTC.
    createdAt: createdAt || new Date(Date.UTC(2026, 0, ordinal)).toISOString(),
    private: Boolean(isPrivate),
  };
}
|
|
78
|
+
|
|
79
|
+
/**
 * Applies one operations-scenario step to the in-memory baseline state.
 *
 * - `encode`: appends a new memory and, when `step.saveAs` is set, records the
 *   alias -> id mapping in `state.aliases`.
 * - `forgetByQuery`: retrieves the single best match for `step.query` from the
 *   current memories and removes it when its score is finite and positive.
 * - `consolidate`: does nothing here.
 *
 * @param {string} system - Baseline name forwarded to `runBaselineRetrieval`.
 * @param {{counter: number, memories: Array<object>, aliases: Map<string, string>}} state
 *   Scenario state; updated in place (`memories` may be replaced by a new array).
 * @param {object} step - Step to apply; `step.type` selects the behaviour.
 * @param {object} providerConfig - Forwarded to `runBaselineRetrieval`.
 * @returns {Promise<void>}
 * @throws {Error} (as a rejection) when `step.type` is not one of the three above.
 */
async function applyBaselineStep(system, state, step, providerConfig) {
  switch (step.type) {
    case 'encode': {
      const encoded = createOperationMemory(state, step);
      state.memories.push(encoded);
      if (step.saveAs) {
        state.aliases.set(step.saveAs, encoded.id);
      }
      return;
    }

    case 'forgetByQuery': {
      const probe = buildSyntheticCase(step.query, state.memories, step.options);
      // Limit 1: at most one memory is forgotten per step.
      const hits = await runBaselineRetrieval(system, probe, providerConfig, 1);
      const [best] = hits;
      const shouldForget = Boolean(best) && Number.isFinite(best.score) && best.score > 0;
      if (shouldForget) {
        state.memories = state.memories.filter(({ id }) => id !== best.id);
      }
      return;
    }

    case 'consolidate':
      return;

    default:
      throw new Error(`Unsupported baseline step: ${step.type}`);
  }
}
|
|
104
|
+
|
|
105
|
+
/**
 * Runs one benchmark case against a baseline system.
 *
 * Cases whose `kind` is not 'operations' go straight to retrieval. Operations
 * cases first replay `benchmarkCase.steps` in order against a fresh in-memory
 * state; the final query then runs over whatever memories remain.
 *
 * @param {string} system - Baseline name understood by `runBaselineRetrieval`.
 * @param {object} benchmarkCase - Case to run (`kind`, `query`, and `memory` or `steps`).
 * @param {object} providerConfig - Embedding provider configuration, passed through.
 * @param {number} [limit=5] - Maximum number of rows to return.
 * @returns {Promise<Array<object>>} Rows returned by the baseline.
 */
export async function runBaselineScenario(system, benchmarkCase, providerConfig, limit = 5) {
  const isOperations = benchmarkCase.kind === 'operations';
  if (!isOperations) {
    return runBaselineRetrieval(system, benchmarkCase, providerConfig, limit);
  }

  const state = { counter: 0, memories: [], aliases: new Map() };
  const steps = benchmarkCase.steps || [];

  // Steps are awaited one at a time because each may change state.memories.
  for (const step of steps) {
    await applyBaselineStep(system, state, step, providerConfig);
  }

  const finalCase = buildSyntheticCase(benchmarkCase.query, state.memories, benchmarkCase.options);
  return runBaselineRetrieval(system, finalCase, providerConfig, limit);
}
|
|
127
|
+
|
|
128
|
+
/**
 * Keyword-overlap baseline: scores each memory by the fraction of query tokens
 * found in its content, then ranks by score. Equal scores are ordered by
 * `sortByScore`, which prefers the later `createdAt`.
 *
 * @param {{query: *, memory: Array<object>}} benchmarkCase - Case to evaluate.
 * @param {number} [limit=5] - Maximum number of rows to return.
 * @returns {Array<object>} Up to `limit` flattened memory records, each extended
 *   with `type: 'episodic'` and a numeric `score`.
 */
export function runKeywordRecencyBaseline(benchmarkCase, limit = 5) {
  const queryTokens = tokenize(benchmarkCase.query);
  const scored = [];
  for (const memory of flattenMemories(benchmarkCase)) {
    scored.push({
      ...memory,
      type: 'episodic',
      score: keywordScore(queryTokens, memory.content),
    });
  }
  return sortByScore(scored).slice(0, limit);
}
|
|
136
|
+
|
|
137
|
+
/**
 * Recency-only baseline: returns the newest memories and ignores the query.
 *
 * Memories are ordered by `createdAt` descending (string comparison) and the
 * first `limit` are kept. Scores start at 1 and drop by 0.1 per rank.
 *
 * @param {{memory: Array<object>}} benchmarkCase - Case to evaluate.
 * @param {number} [limit=3] - Size of the recency window.
 * @returns {Array<object>} Up to `limit` flattened memory records, each extended
 *   with `type: 'episodic'` and a rank-derived `score`.
 */
export function runRecentWindowBaseline(benchmarkCase, limit = 3) {
  // flattenMemories returns a fresh array, so sorting in place is safe.
  const newestFirst = flattenMemories(benchmarkCase);
  newestFirst.sort((left, right) => String(right.createdAt).localeCompare(String(left.createdAt)));

  const recent = newestFirst.slice(0, limit);
  return recent.map((memory, rank) => ({
    ...memory,
    type: 'episodic',
    score: 1 - rank * 0.1,
  }));
}
|
|
147
|
+
|
|
148
|
+
/**
 * Embedding-similarity baseline: ranks memories by `cosineSimilarity` between
 * the query embedding and each memory's content embedding.
 *
 * The provider comes from `createEmbeddingProvider(providerConfig)`; when it
 * exposes a `ready()` function, that is awaited before any embedding call.
 *
 * @param {{query: *, memory: Array<object>}} benchmarkCase - Case to evaluate.
 * @param {object} providerConfig - Configuration passed to `createEmbeddingProvider`.
 * @param {number} [limit=5] - Maximum number of rows to return.
 * @returns {Promise<Array<object>>} Up to `limit` flattened memory records, each
 *   extended with `type: 'episodic'` and a similarity `score`, best first.
 */
export async function runVectorOnlyBaseline(benchmarkCase, providerConfig, limit = 5) {
  const provider = createEmbeddingProvider(providerConfig);
  if (typeof provider.ready === 'function') {
    await provider.ready();
  }

  // Embed a text and convert the vector to the buffer form cosineSimilarity receives.
  const embedAsBuffer = async (text) => provider.vectorToBuffer(await provider.embed(text));

  const queryBuffer = await embedAsBuffer(benchmarkCase.query);

  const scored = [];
  // Memories are embedded one at a time, in order.
  for (const memory of flattenMemories(benchmarkCase)) {
    const memoryBuffer = await embedAsBuffer(memory.content);
    const score = cosineSimilarity(queryBuffer, memoryBuffer, provider);
    scored.push({ ...memory, type: 'episodic', score });
  }

  return sortByScore(scored).slice(0, limit);
}
|
|
@@ -0,0 +1,421 @@
|
|
|
1
|
+
/**
 * Fixtures for the 'retrieval' suite: one case per memory capability.
 *
 * Every case carries `id`, `suite`, `kind`, `family`, `title`, `description`,
 * the recall `query`, and the `memory` entries to seed. Expectations appear as
 * `expectAny`, `expectNone` and/or `forbid`. Some cases add `options` or a
 * `consolidate` block.
 *
 * NOTE(review): how the expectation, `options`, `consolidate`, `tags`,
 * `context` and `supersedesIndex` fields are interpreted is decided by the
 * benchmark runner, which is outside this block — confirm there before
 * changing their shape.
 */
export const RETRIEVAL_CASES = [
  {
    id: 'information-extraction',
    suite: 'retrieval',
    kind: 'retrieval',
    family: 'information_extraction',
    title: 'Information extraction',
    description: 'Recover a directly stated user fact from durable memory.',
    query: 'Where does Sam live now?',
    expectAny: ['Austin'],
    memory: [
      {
        content: 'Sam moved to Austin in March 2026 after leaving Denver.',
        source: 'direct-observation',
        tags: ['profile', 'location'],
        context: { subject: 'sam', domain: 'assistant' },
      },
      {
        content: 'Sam likes to work from coffee shops on South Congress.',
        source: 'tool-result',
        tags: ['preference', 'routine'],
        context: { subject: 'sam', domain: 'assistant' },
      },
    ],
  },
  {
    id: 'knowledge-update',
    suite: 'retrieval',
    kind: 'retrieval',
    family: 'knowledge_updates',
    title: 'Knowledge updates',
    description: 'Prefer the newer fact over stale preferences.',
    query: 'What drink does Sam prefer now?',
    expectAny: ['green tea'],
    forbid: ['Sam prefers coffee before early meetings.'],
    memory: [
      {
        content: 'Sam prefers coffee before early meetings.',
        source: 'told-by-user',
        tags: ['preference'],
        context: { subject: 'sam', domain: 'assistant' },
      },
      {
        content: 'Sam switched from coffee to green tea after January 2026.',
        source: 'direct-observation',
        tags: ['preference', 'update'],
        context: { subject: 'sam', domain: 'assistant' },
        // Position within this `memory` array (0 = the coffee entry above).
        supersedesIndex: 0,
      },
    ],
  },
  {
    id: 'multi-session-reasoning',
    suite: 'retrieval',
    kind: 'retrieval',
    family: 'multi_session_reasoning',
    title: 'Multi-session reasoning',
    description: 'Synthesize a decision from multiple related episodes.',
    query: 'Which vendor was approved after the pilot budget review?',
    expectAny: ['Northwind'],
    memory: [
      {
        content: 'During the January pilot, Sam requested budget approval for vendors Northwind and Fabricam.',
        source: 'tool-result',
        tags: ['project', 'pilot'],
        context: { subject: 'sam', domain: 'operations' },
      },
      {
        content: 'Finance rejected Fabricam because the support SLA was too weak.',
        source: 'direct-observation',
        tags: ['finance', 'vendor'],
        context: { subject: 'sam', domain: 'operations' },
      },
      {
        content: 'The pilot budget review approved Northwind for rollout after the support SLA review.',
        source: 'direct-observation',
        tags: ['finance', 'vendor', 'approval'],
        context: { subject: 'sam', domain: 'operations' },
      },
    ],
  },
  {
    id: 'temporal-reasoning',
    suite: 'retrieval',
    kind: 'retrieval',
    family: 'temporal_reasoning',
    title: 'Temporal reasoning',
    description: 'Answer by isolating the right time window.',
    query: 'What happened in February 2026?',
    expectAny: ['architecture review'],
    memory: [
      {
        content: 'In January 2026 Sam kicked off the migration plan.',
        source: 'tool-result',
        tags: ['timeline'],
        createdAt: '2026-01-12T09:00:00.000Z',
      },
      {
        content: 'In February 2026 Sam completed the architecture review.',
        source: 'direct-observation',
        tags: ['timeline'],
        createdAt: '2026-02-18T15:30:00.000Z',
      },
      {
        content: 'In March 2026 Sam started the rollout checklist.',
        source: 'tool-result',
        tags: ['timeline'],
        createdAt: '2026-03-02T08:15:00.000Z',
      },
    ],
    // Window bounds bracket February 2026; only the second memory falls inside.
    options: {
      after: '2026-02-01T00:00:00.000Z',
      before: '2026-03-01T00:00:00.000Z',
    },
  },
  {
    id: 'abstention',
    suite: 'retrieval',
    kind: 'retrieval',
    family: 'abstention',
    title: 'Abstention',
    description: 'Avoid pretending to know a specific identifier that was never stored.',
    query: 'What is Sam passport number?',
    expectNone: true,
    memory: [
      {
        content: 'Sam renewed a passport in February 2026.',
        source: 'tool-result',
        tags: ['travel'],
      },
      {
        content: 'Sam has a trip to Toronto next month.',
        source: 'told-by-user',
        tags: ['travel'],
      },
    ],
  },
  {
    id: 'conflict-resolution',
    suite: 'retrieval',
    kind: 'retrieval',
    family: 'conflict_resolution',
    title: 'Conflict resolution',
    description: 'Prefer high-reliability evidence over model-generated noise.',
    query: 'What caused the outage?',
    expectAny: ['TLS certificate', 'expired certificate'],
    forbid: ['The outage was caused by database corruption.'],
    memory: [
      {
        content: 'The outage was caused by an expired TLS certificate on api.example.com.',
        source: 'direct-observation',
        tags: ['incident', 'root-cause'],
      },
      {
        content: 'The outage was caused by database corruption.',
        source: 'model-generated',
        tags: ['incident', 'root-cause'],
      },
    ],
  },
  {
    id: 'procedural-learning',
    suite: 'retrieval',
    kind: 'retrieval',
    family: 'procedural_learning',
    title: 'Procedural learning',
    description: 'Turn repeated incidents into an actionable operating rule.',
    query: 'What should the agent do when payout retries start returning 429?',
    expectAny: ['cap retry batches', 'stagger retries'],
    memory: [
      {
        content: 'Processor X returned HTTP 429 when payout retries exceeded 120 requests per minute.',
        source: 'direct-observation',
        tags: ['payments', 'rate-limit'],
      },
      {
        content: 'Payout incident volume dropped after retry batches were capped at 50 merchants per worker.',
        source: 'tool-result',
        tags: ['payments', 'rate-limit'],
      },
      {
        content: 'Risk operations requested an escalation when multiple merchants were affected in the same hour.',
        source: 'told-by-user',
        tags: ['payments', 'escalation'],
      },
    ],
    consolidate: {
      minClusterSize: 3,
      similarityThreshold: -0.3,
      principle: {
        content: 'When payout retries start returning 429, cap retry batches and stagger retries before escalating.',
        type: 'procedural',
        conditions: ['processor returns 429', 'multiple merchants impacted'],
      },
    },
    options: {
      types: ['procedural', 'semantic'],
    },
  },
  {
    id: 'privacy-boundary',
    suite: 'retrieval',
    kind: 'retrieval',
    family: 'privacy_boundary',
    title: 'Privacy boundary',
    description: 'Never leak private memory into public recall.',
    query: 'What is Sam API key?',
    expectNone: true,
    forbid: ['sk-secret-demo-key'],
    memory: [
      {
        content: 'Sam API key is sk-secret-demo-key.',
        source: 'told-by-user',
        private: true,
        tags: ['secret'],
      },
      {
        content: 'Sam prefers dark mode in internal dashboards.',
        source: 'told-by-user',
        tags: ['preference'],
      },
    ],
  },
];
|
|
225
|
+
|
|
226
|
+
/**
 * Fixtures for the 'operations' suite: each case replays a list of lifecycle
 * `steps` before the final `query` is asked.
 *
 * Step types used in this list are 'encode' (with a `memory` payload and
 * optional `saveAs` / `supersedesRef`), 'forgetByQuery' (with `query` and
 * `options`) and 'consolidate' (with clustering settings and a `principle`).
 *
 * NOTE(review): `supersedesRef`, the consolidate settings and the per-case
 * `options` are interpreted outside this block — confirm in the benchmark
 * runner before changing their shape.
 */
export const OPERATION_CASES = [
  {
    id: 'operation-update-overwrite',
    suite: 'operations',
    kind: 'operations',
    family: 'update_overwrite',
    title: 'Update and overwrite',
    description: 'Current-state recall should prefer the new fact after an explicit overwrite.',
    query: 'What is the primary deployment region now?',
    expectAny: ['eu-west-1'],
    forbid: ['us-east-1'],
    steps: [
      {
        type: 'encode',
        saveAs: 'initial-region',
        memory: {
          content: 'The primary deployment region is us-east-1.',
          source: 'told-by-user',
          tags: ['deployment', 'region'],
        },
      },
      {
        type: 'encode',
        // Matches the `saveAs` alias of the step above.
        supersedesRef: 'initial-region',
        memory: {
          content: 'As of March 2026, the primary deployment region is eu-west-1.',
          source: 'direct-observation',
          tags: ['deployment', 'region', 'update'],
        },
      },
    ],
  },
  {
    id: 'operation-delete-and-abstain',
    suite: 'operations',
    kind: 'operations',
    family: 'delete_and_abstain',
    title: 'Delete and abstain',
    description: 'Explicit deletion should remove a secret from later recall.',
    query: 'What is the staging API token?',
    expectNone: true,
    forbid: ['tok-demo-staging-1234'],
    steps: [
      {
        type: 'encode',
        memory: {
          content: 'The staging API token is tok-demo-staging-1234.',
          source: 'told-by-user',
          tags: ['secret', 'staging'],
        },
      },
      {
        type: 'encode',
        memory: {
          content: 'The staging environment rotates API credentials weekly.',
          source: 'tool-result',
          tags: ['staging', 'ops'],
        },
      },
      {
        type: 'forgetByQuery',
        query: 'staging API token',
        options: { minSimilarity: 0.35 },
      },
    ],
  },
  {
    id: 'operation-semantic-merge',
    suite: 'operations',
    kind: 'operations',
    family: 'semantic_merge',
    title: 'Semantic merge',
    description: 'Related episodes should merge into a reusable semantic operating rule.',
    query: 'When should the disputes queue trigger manual review?',
    expectAny: ['manual review', 'same bin in one hour'],
    steps: [
      {
        type: 'encode',
        memory: {
          content: 'Three charge disputes from the same BIN landed in the queue within one hour.',
          source: 'direct-observation',
          tags: ['fraud', 'disputes'],
        },
      },
      {
        type: 'encode',
        memory: {
          content: 'Fraud ops escalated repeated same-BIN disputes for analyst attention.',
          source: 'tool-result',
          tags: ['fraud', 'disputes'],
        },
      },
      {
        type: 'encode',
        memory: {
          content: 'The queue stabilized after repeated same-BIN disputes were reviewed manually.',
          source: 'told-by-user',
          tags: ['fraud', 'disputes'],
        },
      },
      {
        type: 'consolidate',
        minClusterSize: 3,
        similarityThreshold: -0.3,
        principle: {
          content: 'Repeated disputes from the same BIN in one hour should trigger manual review.',
          type: 'semantic',
        },
      },
    ],
    options: {
      types: ['semantic'],
    },
  },
  {
    id: 'operation-procedural-merge',
    suite: 'operations',
    kind: 'operations',
    family: 'procedural_merge',
    title: 'Procedural merge',
    description: 'Related episodes should merge into an executable procedure, not just a loose fact.',
    query: 'What should the agent do after two webhook signature failures?',
    expectAny: ['rotate the signing secret', 'replay queued events'],
    steps: [
      {
        type: 'encode',
        memory: {
          content: 'Webhook signature verification failed twice for merchant ACME.',
          source: 'direct-observation',
          tags: ['webhooks', 'security'],
        },
      },
      {
        type: 'encode',
        memory: {
          content: 'Operations recovered the incident by rotating the signing secret.',
          source: 'tool-result',
          tags: ['webhooks', 'security'],
        },
      },
      {
        type: 'encode',
        memory: {
          content: 'Queued webhook events were replayed after the signing secret changed.',
          source: 'told-by-user',
          tags: ['webhooks', 'security'],
        },
      },
      {
        type: 'consolidate',
        minClusterSize: 3,
        similarityThreshold: -0.3,
        principle: {
          content: 'When webhook signature verification fails twice, rotate the signing secret and replay queued events.',
          type: 'procedural',
          conditions: ['signature verification fails twice', 'queued events pending'],
        },
      },
    ],
    options: {
      types: ['procedural', 'semantic'],
    },
  },
];
|
|
390
|
+
|
|
391
|
+
/**
 * The local benchmark suites, in order. Each suite has an `id`, a `title`, a
 * `description`, and the `cases` array it groups. The arrays are referenced
 * directly, not copied.
 */
export const LOCAL_BENCHMARK_SUITES = [
  // Read-only recall abilities.
  {
    id: 'retrieval',
    title: 'Retrieval capabilities',
    description: 'LongMemEval-style memory abilities plus privacy and abstention.',
    cases: RETRIEVAL_CASES,
  },
  // Recall after update / delete / merge steps.
  {
    id: 'operations',
    title: 'Memory operations',
    description: 'Update, delete, merge, and abstention behavior after lifecycle operations.',
    cases: OPERATION_CASES,
  },
];
|
|
405
|
+
|
|
406
|
+
/** Every benchmark case from every local suite, flattened in suite order. */
export const BENCHMARK_CASES = LOCAL_BENCHMARK_SUITES.map(({ cases }) => cases).flat();
|
|
407
|
+
|
|
408
|
+
/**
 * Case `family` identifiers in a fixed order: the eight retrieval families
 * first, then the four operations families.
 *
 * NOTE(review): presumably used to order report output — confirm against the
 * consumer, which is outside this block.
 */
export const FAMILY_ORDER = [
  // Retrieval suite
  'information_extraction',
  'knowledge_updates',
  'multi_session_reasoning',
  'temporal_reasoning',
  'abstention',
  'conflict_resolution',
  'procedural_learning',
  'privacy_boundary',
  // Operations suite
  'update_overwrite',
  'delete_and_abstain',
  'semantic_merge',
  'procedural_merge',
];
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
/**
 * Externally published LoCoMo results for other memory systems, listed from
 * highest to lowest score.
 *
 * Each entry records the `system`, the `benchmark` name, the reported `score`
 * with its `unit`, the `source` URL and a short `note`. These are reference
 * figures copied from the cited sources, not measurements made by this package.
 */
export const PUBLISHED_LEADERBOARD = [
  {
    system: 'MIRIX',
    benchmark: 'LoCoMo',
    score: 85.4,
    unit: 'accuracy',
    source: 'https://arxiv.org/abs/2507.07957',
    note: 'Published LoCoMo result from the MIRIX paper.',
  },
  {
    system: 'Letta Filesystem',
    benchmark: 'LoCoMo',
    score: 74.0,
    unit: 'accuracy',
    source: 'https://www.letta.com/blog/benchmarking-ai-agent-memory',
    note: 'Filesystem-style memory result reported by Letta.',
  },
  {
    system: 'Mem0 Graph Memory',
    benchmark: 'LoCoMo',
    score: 68.5,
    unit: 'accuracy',
    source: 'https://arxiv.org/abs/2504.19413',
    note: 'Graph memory variant reported in the Mem0 paper.',
  },
  {
    system: 'Mem0',
    benchmark: 'LoCoMo',
    score: 66.9,
    unit: 'accuracy',
    source: 'https://arxiv.org/abs/2504.19413',
    note: 'Core Mem0 LoCoMo score reported in the Mem0 paper.',
  },
  {
    system: 'OpenAI Memory',
    benchmark: 'LoCoMo',
    score: 52.9,
    unit: 'accuracy',
    source: 'https://arxiv.org/abs/2504.19413',
    note: 'OpenAI memory baseline as reported by the Mem0 paper.',
  },
];
|
|
43
|
+
|
|
44
|
+
/**
 * Short notes on current directions in agent-memory research and products.
 * Each entry has a `title`, a one-sentence `summary`, and the `source` URL the
 * statement is attributed to.
 */
export const MEMORY_TRENDS = [
  {
    title: 'Memory is moving from flat retrieval to typed systems',
    summary: 'Recent work treats episodic, semantic, procedural, and graph memory as separate but cooperating layers.',
    source: 'https://arxiv.org/abs/2507.03724',
  },
  {
    title: 'Benchmarks now emphasize multi-session realism',
    summary: 'LongMemEval and LoCoMo push memory systems toward temporal updates, abstraction, and cross-session reasoning instead of single-turn fact recall.',
    source: 'https://arxiv.org/abs/2410.10813',
  },
  {
    title: 'Context engineering is now competing with retrieval-first designs',
    summary: 'Letta argues filesystem and memory-block approaches can outperform simpler retrieval-only memory on realistic long-horizon tasks.',
    source: 'https://www.letta.com/blog/memory-blocks',
  },
  {
    title: 'Production teams care about latency and token footprint, not just recall quality',
    summary: 'Mem0 frames memory as a cost and latency optimization surface in addition to a personalization surface.',
    source: 'https://arxiv.org/abs/2504.19413',
  },
  {
    title: 'Temporal and multimodal memory are becoming table stakes',
    summary: 'MIRIX and Graphiti both model time and state change explicitly instead of assuming memories stay forever true.',
    source: 'https://arxiv.org/abs/2507.07957',
  },
];
|