audrey 0.23.1 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +101 -15
- package/LICENSE +21 -21
- package/README.md +232 -6
- package/SECURITY.md +2 -1
- package/benchmarks/adapter-kit.mjs +20 -0
- package/benchmarks/adapter-self-test.mjs +166 -0
- package/benchmarks/adapters/example-allow.mjs +28 -0
- package/benchmarks/adapters/mem0-platform.mjs +267 -0
- package/benchmarks/adapters/registry.json +51 -0
- package/benchmarks/adapters/zep-cloud.mjs +280 -0
- package/benchmarks/baselines.js +169 -0
- package/benchmarks/build-leaderboard.mjs +170 -0
- package/benchmarks/cases.js +537 -0
- package/benchmarks/create-conformance-card.mjs +139 -0
- package/benchmarks/create-submission-bundle.mjs +176 -0
- package/benchmarks/dry-run-external-adapters.mjs +165 -0
- package/benchmarks/guardbench.js +1125 -0
- package/benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json +50 -0
- package/benchmarks/output/external/guardbench-external-dry-run.json +69 -0
- package/benchmarks/output/external/guardbench-external-evidence.json +56 -0
- package/benchmarks/output/guardbench-conformance-card.json +63 -0
- package/benchmarks/output/guardbench-manifest.json +414 -0
- package/benchmarks/output/guardbench-raw.json +1271 -0
- package/benchmarks/output/guardbench-summary.json +2107 -0
- package/benchmarks/output/leaderboard/guardbench-leaderboard.json +93 -0
- package/benchmarks/output/leaderboard/guardbench-leaderboard.md +7 -0
- package/benchmarks/output/submission-bundle/guardbench-conformance-card.json +63 -0
- package/benchmarks/output/submission-bundle/guardbench-manifest.json +414 -0
- package/benchmarks/output/submission-bundle/guardbench-raw.json +1271 -0
- package/benchmarks/output/submission-bundle/guardbench-summary.json +2107 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-adapter-registry.schema.json +69 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-adapter-self-test.schema.json +156 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-conformance-card.schema.json +184 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-external-dry-run.schema.json +74 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-external-evidence.schema.json +108 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-external-run.schema.json +160 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-leaderboard.schema.json +179 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-manifest.schema.json +213 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-publication-verification.schema.json +47 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-raw.schema.json +184 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-submission-manifest.schema.json +151 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-summary.schema.json +249 -0
- package/benchmarks/output/submission-bundle/submission-manifest.json +131 -0
- package/benchmarks/output/submission-bundle/validation-report.json +31 -0
- package/benchmarks/output/summary.json +2354 -0
- package/benchmarks/perf-snapshot.js +304 -0
- package/benchmarks/perf.bench.js +161 -0
- package/benchmarks/public-paths.mjs +78 -0
- package/benchmarks/reference-results.js +70 -0
- package/benchmarks/report.js +259 -0
- package/benchmarks/run-external-guardbench.mjs +281 -0
- package/benchmarks/run.js +682 -0
- package/benchmarks/schemas/guardbench-adapter-registry.schema.json +69 -0
- package/benchmarks/schemas/guardbench-adapter-self-test.schema.json +156 -0
- package/benchmarks/schemas/guardbench-conformance-card.schema.json +184 -0
- package/benchmarks/schemas/guardbench-external-dry-run.schema.json +74 -0
- package/benchmarks/schemas/guardbench-external-evidence.schema.json +108 -0
- package/benchmarks/schemas/guardbench-external-run.schema.json +160 -0
- package/benchmarks/schemas/guardbench-leaderboard.schema.json +179 -0
- package/benchmarks/schemas/guardbench-manifest.schema.json +213 -0
- package/benchmarks/schemas/guardbench-publication-verification.schema.json +47 -0
- package/benchmarks/schemas/guardbench-raw.schema.json +184 -0
- package/benchmarks/schemas/guardbench-submission-manifest.schema.json +151 -0
- package/benchmarks/schemas/guardbench-summary.schema.json +249 -0
- package/benchmarks/snapshots/perf-0.22.2.json +123 -0
- package/benchmarks/snapshots/perf-0.23.0.json +123 -0
- package/benchmarks/validate-adapter-module.mjs +104 -0
- package/benchmarks/validate-adapter-registry.mjs +134 -0
- package/benchmarks/validate-adapter-self-test.mjs +96 -0
- package/benchmarks/validate-guardbench-artifacts.mjs +343 -0
- package/benchmarks/verify-external-evidence.mjs +296 -0
- package/benchmarks/verify-publication-artifacts.mjs +286 -0
- package/benchmarks/verify-submission-bundle.mjs +167 -0
- package/dist/mcp-server/config.d.ts +1 -1
- package/dist/mcp-server/config.d.ts.map +1 -1
- package/dist/mcp-server/config.js +1 -1
- package/dist/mcp-server/config.js.map +1 -1
- package/dist/mcp-server/index.d.ts +65 -3
- package/dist/mcp-server/index.d.ts.map +1 -1
- package/dist/mcp-server/index.js +675 -157
- package/dist/mcp-server/index.js.map +1 -1
- package/dist/src/action-key.d.ts +9 -0
- package/dist/src/action-key.d.ts.map +1 -0
- package/dist/src/action-key.js +49 -0
- package/dist/src/action-key.js.map +1 -0
- package/dist/src/adaptive.js +5 -5
- package/dist/src/affect.js +8 -8
- package/dist/src/audrey.d.ts +13 -0
- package/dist/src/audrey.d.ts.map +1 -1
- package/dist/src/audrey.js +68 -3
- package/dist/src/audrey.js.map +1 -1
- package/dist/src/capsule.js +4 -4
- package/dist/src/causal.js +3 -3
- package/dist/src/consolidate.js +48 -48
- package/dist/src/controller.d.ts +78 -6
- package/dist/src/controller.d.ts.map +1 -1
- package/dist/src/controller.js +273 -53
- package/dist/src/controller.js.map +1 -1
- package/dist/src/db.js +172 -172
- package/dist/src/decay.js +8 -8
- package/dist/src/embedding.d.ts +2 -1
- package/dist/src/embedding.d.ts.map +1 -1
- package/dist/src/embedding.js +39 -29
- package/dist/src/embedding.js.map +1 -1
- package/dist/src/encode.js +6 -6
- package/dist/src/feedback.d.ts +6 -0
- package/dist/src/feedback.d.ts.map +1 -1
- package/dist/src/feedback.js +6 -0
- package/dist/src/feedback.js.map +1 -1
- package/dist/src/forget.js +12 -12
- package/dist/src/hybrid-recall.js +9 -9
- package/dist/src/impact.js +6 -6
- package/dist/src/import.d.ts +3 -3
- package/dist/src/import.js +41 -41
- package/dist/src/index.d.ts +5 -4
- package/dist/src/index.d.ts.map +1 -1
- package/dist/src/index.js +3 -3
- package/dist/src/index.js.map +1 -1
- package/dist/src/interference.js +14 -14
- package/dist/src/introspect.js +18 -18
- package/dist/src/preflight.d.ts.map +1 -1
- package/dist/src/preflight.js +41 -0
- package/dist/src/preflight.js.map +1 -1
- package/dist/src/promote.js +7 -7
- package/dist/src/prompts.js +118 -118
- package/dist/src/recall.js +30 -30
- package/dist/src/reflexes.d.ts +1 -0
- package/dist/src/reflexes.d.ts.map +1 -1
- package/dist/src/reflexes.js +3 -0
- package/dist/src/reflexes.js.map +1 -1
- package/dist/src/rollback.js +4 -4
- package/dist/src/routes.d.ts.map +1 -1
- package/dist/src/routes.js +71 -2
- package/dist/src/routes.js.map +1 -1
- package/dist/src/validate.js +25 -25
- package/docs/AUDREY_PAPER_OUTLINE.md +175 -0
- package/docs/MEMORY_BENCHMARKING.md +59 -0
- package/docs/PRODUCTION_BACKLOG.md +304 -0
- package/docs/paper/00-master.md +48 -0
- package/docs/paper/01-introduction.md +27 -0
- package/docs/paper/02-related-work.md +47 -0
- package/docs/paper/03-problem-definition.md +108 -0
- package/docs/paper/04-design.md +164 -0
- package/docs/paper/05-guardbench-spec.md +412 -0
- package/docs/paper/06-implementation.md +113 -0
- package/docs/paper/07-evaluation.md +168 -0
- package/docs/paper/08-discussion-limitations.md +61 -0
- package/docs/paper/09-conclusion.md +11 -0
- package/docs/paper/SUBMISSION_README.md +162 -0
- package/docs/paper/appendix-a-demo-transcript.md +114 -0
- package/docs/paper/arxiv-compile-report.schema.json +116 -0
- package/docs/paper/arxiv-source.schema.json +61 -0
- package/docs/paper/audrey-paper-v1.md +1106 -0
- package/docs/paper/browser-launch-plan.json +209 -0
- package/docs/paper/browser-launch-plan.schema.json +100 -0
- package/docs/paper/browser-launch-results.json +86 -0
- package/docs/paper/browser-launch-results.schema.json +66 -0
- package/docs/paper/claim-register.json +138 -0
- package/docs/paper/claim-register.schema.json +81 -0
- package/docs/paper/evidence-ledger.md +103 -0
- package/docs/paper/output/arxiv/README-arxiv.txt +8 -0
- package/docs/paper/output/arxiv/arxiv-manifest.json +41 -0
- package/docs/paper/output/arxiv/main.tex +949 -0
- package/docs/paper/output/arxiv/references.bib +222 -0
- package/docs/paper/output/arxiv-compile-report.json +24 -0
- package/docs/paper/output/submission-bundle/LICENSE +21 -0
- package/docs/paper/output/submission-bundle/README.md +555 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json +50 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/external/guardbench-external-dry-run.json +69 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/external/guardbench-external-evidence.json +56 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-conformance-card.json +63 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-manifest.json +414 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-raw.json +1271 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-summary.json +2107 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/leaderboard/guardbench-leaderboard.json +93 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/leaderboard/guardbench-leaderboard.md +7 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/submission-bundle/submission-manifest.json +131 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/submission-bundle/validation-report.json +31 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/summary.json +2354 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-adapter-registry.schema.json +69 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-adapter-self-test.schema.json +156 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-conformance-card.schema.json +184 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-external-dry-run.schema.json +74 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-external-evidence.schema.json +108 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-external-run.schema.json +160 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-leaderboard.schema.json +179 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-manifest.schema.json +213 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-publication-verification.schema.json +47 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-raw.schema.json +184 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-submission-manifest.schema.json +151 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-summary.schema.json +249 -0
- package/docs/paper/output/submission-bundle/docs/AUDREY_PAPER_OUTLINE.md +175 -0
- package/docs/paper/output/submission-bundle/docs/paper/00-master.md +48 -0
- package/docs/paper/output/submission-bundle/docs/paper/01-introduction.md +27 -0
- package/docs/paper/output/submission-bundle/docs/paper/02-related-work.md +47 -0
- package/docs/paper/output/submission-bundle/docs/paper/03-problem-definition.md +108 -0
- package/docs/paper/output/submission-bundle/docs/paper/04-design.md +164 -0
- package/docs/paper/output/submission-bundle/docs/paper/05-guardbench-spec.md +412 -0
- package/docs/paper/output/submission-bundle/docs/paper/06-implementation.md +113 -0
- package/docs/paper/output/submission-bundle/docs/paper/07-evaluation.md +168 -0
- package/docs/paper/output/submission-bundle/docs/paper/08-discussion-limitations.md +61 -0
- package/docs/paper/output/submission-bundle/docs/paper/09-conclusion.md +11 -0
- package/docs/paper/output/submission-bundle/docs/paper/SUBMISSION_README.md +162 -0
- package/docs/paper/output/submission-bundle/docs/paper/appendix-a-demo-transcript.md +114 -0
- package/docs/paper/output/submission-bundle/docs/paper/arxiv-compile-report.schema.json +116 -0
- package/docs/paper/output/submission-bundle/docs/paper/arxiv-source.schema.json +61 -0
- package/docs/paper/output/submission-bundle/docs/paper/audrey-paper-v1.md +1106 -0
- package/docs/paper/output/submission-bundle/docs/paper/browser-launch-plan.json +209 -0
- package/docs/paper/output/submission-bundle/docs/paper/browser-launch-plan.schema.json +100 -0
- package/docs/paper/output/submission-bundle/docs/paper/browser-launch-results.json +86 -0
- package/docs/paper/output/submission-bundle/docs/paper/browser-launch-results.schema.json +66 -0
- package/docs/paper/output/submission-bundle/docs/paper/claim-register.json +138 -0
- package/docs/paper/output/submission-bundle/docs/paper/claim-register.schema.json +81 -0
- package/docs/paper/output/submission-bundle/docs/paper/evidence-ledger.md +103 -0
- package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/README-arxiv.txt +8 -0
- package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/arxiv-manifest.json +41 -0
- package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/main.tex +949 -0
- package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/references.bib +222 -0
- package/docs/paper/output/submission-bundle/docs/paper/output/arxiv-compile-report.json +24 -0
- package/docs/paper/output/submission-bundle/docs/paper/paper-submission-bundle.schema.json +70 -0
- package/docs/paper/output/submission-bundle/docs/paper/publication-pack.json +81 -0
- package/docs/paper/output/submission-bundle/docs/paper/publication-pack.schema.json +60 -0
- package/docs/paper/output/submission-bundle/docs/paper/references.bib +222 -0
- package/docs/paper/output/submission-bundle/package.json +212 -0
- package/docs/paper/output/submission-bundle/paper-submission-manifest.json +379 -0
- package/docs/paper/paper-submission-bundle.schema.json +70 -0
- package/docs/paper/publication-pack.json +81 -0
- package/docs/paper/publication-pack.schema.json +60 -0
- package/docs/paper/references.bib +222 -0
- package/package.json +87 -4
- package/scripts/audit-release-completion.mjs +362 -0
- package/scripts/create-arxiv-source.mjs +362 -0
- package/scripts/create-paper-submission-bundle.mjs +210 -0
- package/scripts/finalize-release.mjs +526 -0
- package/scripts/prepare-release-cut.mjs +269 -0
- package/scripts/publish-release-bundle.mjs +209 -0
- package/scripts/publish-release-github-api.mjs +429 -0
- package/scripts/run-vitest.mjs +34 -0
- package/scripts/smoke-cli.js +92 -0
- package/scripts/sync-paper-artifacts.mjs +109 -0
- package/scripts/verify-arxiv-compile.mjs +440 -0
- package/scripts/verify-arxiv-source.mjs +194 -0
- package/scripts/verify-browser-launch-plan.mjs +237 -0
- package/scripts/verify-browser-launch-results.mjs +285 -0
- package/scripts/verify-paper-artifacts.mjs +338 -0
- package/scripts/verify-paper-claims.mjs +226 -0
- package/scripts/verify-paper-submission-bundle.mjs +207 -0
- package/scripts/verify-publication-pack.mjs +196 -0
- package/scripts/verify-python-package.py +201 -0
- package/scripts/verify-release-readiness.mjs +785 -0
|
@@ -0,0 +1,304 @@
|
|
|
1
|
+
// Honest performance snapshot for Audrey.
|
|
2
|
+
//
|
|
3
|
+
// What this measures
|
|
4
|
+
// - Encode latency at multiple corpus sizes
|
|
5
|
+
// - Hybrid recall latency at multiple corpus sizes
|
|
6
|
+
// - Post-encode queue processing time (consolidation/interference/validation pipeline)
|
|
7
|
+
//
|
|
8
|
+
// What this does NOT do
|
|
9
|
+
// - Compare against other memory systems. Synthetic head-to-head numbers are
|
|
10
|
+
// easy to game. If you want a real comparison, run the same workload against
|
|
11
|
+
// the system you care about and post your own results.
|
|
12
|
+
// - Use cloud embedding providers. Latency to a remote API is dominated by
|
|
13
|
+
// network round-trip and varies wildly by region and rate-limit state.
|
|
14
|
+
// We use the in-process mock embedding provider so the numbers reflect
|
|
15
|
+
// Audrey's own pipeline (SQLite, sqlite-vec, encode/recall logic, hybrid
|
|
16
|
+
// ranking) without third-party noise. Real-world recall p95 with a local
|
|
17
|
+
// 384-dim provider is typically 5-15x higher; with a hosted provider it is
|
|
18
|
+
// dominated by the API call.
|
|
19
|
+
//
|
|
20
|
+
// How to read the output
|
|
21
|
+
// - p50 / p95 / p99 are percentile latencies in milliseconds.
|
|
22
|
+
// - The numbers are wall-clock for a single call from a JS caller, including
|
|
23
|
+
// SQLite work and any post-encode queueing on encode rows.
|
|
24
|
+
// - Run on your own hardware and embedding provider before quoting numbers
|
|
25
|
+
// anywhere; results scale heavily with CPU, NVMe vs spinning disk, and
|
|
26
|
+
// embedding dimensionality.
|
|
27
|
+
|
|
28
|
+
import { performance } from 'node:perf_hooks';
|
|
29
|
+
import { mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs';
|
|
30
|
+
import { join, resolve } from 'node:path';
|
|
31
|
+
import { tmpdir, cpus, totalmem, arch, platform, release } from 'node:os';
|
|
32
|
+
import { execFileSync } from 'node:child_process';
|
|
33
|
+
import { pathToFileURL } from 'node:url';
|
|
34
|
+
import { Audrey } from '../dist/src/index.js';
|
|
35
|
+
|
|
36
|
+
// Corpus sizes (number of encoded memories) benchmarked when the caller does
// not supply any. Frozen because this array is also used directly as a default
// argument, so an in-place mutation would otherwise leak into later runs.
const DEFAULT_SIZES = Object.freeze([100, 1000, 5000]);

// Number of timed recall calls per corpus size when the caller supplies none.
const DEFAULT_RECALL_RUNS = 50;

// Recall queries; the recall loop cycles through them by index modulo length.
const QUERY_POOL = Object.freeze([
  'rate limit handling for HTTP 429 retries',
  'durable agent context across sessions',
  'safe shell behavior on Windows hosts',
  'official authority over inferred preferences',
  'lexical signal for exact identifiers',
  'deployment region migration steps',
  'webhook signature recovery procedure',
  'fraud queue manual review trigger',
]);

// Seed sentences for the encode loop; each encoded memory is one of these,
// chosen by index modulo length, with a "(sample N)" suffix appended.
const SEED_POOL = Object.freeze([
  'Stripe API returned HTTP 429 during checkout retry; needs exponential backoff.',
  'Project memory routing should prefer the local memory layer for durable context.',
  'Tool trace learning marks repeated spawn EPERM failures as risky on Windows shells.',
  'Calendar authority should come from the official source before inferred user notes.',
  'Vector recall is faster but loses BM25 lexical signal on exact identifiers.',
  'Webhook signature recovery requires rotating the signing secret and replaying queued events.',
  'Fraud queue stabilizes when repeated same-BIN disputes are escalated for manual review.',
  'Deployment region migrations should be coordinated against the provider rate-limit window.',
]);
|
|
59
|
+
|
|
60
|
+
/**
 * Nearest-rank percentile of a list of numbers.
 *
 * @param {number[]} values - Samples; the array is copied, never sorted in place.
 * @param {number} rank - Percentile to read, expected in the range 0-100.
 * @returns {number} The sample at the requested rank, or 0 when `values` is empty.
 */
function percentile(values, rank) {
  if (values.length === 0) return 0;
  const sorted = [...values].sort((a, b) => a - b);
  // Nearest-rank index is ceil(rank% * n) - 1. Clamp both ends: a rank of 0 (or
  // below) would otherwise produce index -1 and return undefined, and a rank
  // above 100 would run past the last element.
  const nearestRank = Math.ceil((rank / 100) * sorted.length) - 1;
  const index = Math.min(sorted.length - 1, Math.max(0, nearestRank));
  return sorted[index];
}
|
|
66
|
+
|
|
67
|
+
/**
 * Round a number to three decimal places.
 *
 * @param {number} value - Number to round.
 * @returns {number} `value` rounded to the nearest thousandth.
 */
function round(value) {
  const scale = 1000;
  const scaled = Math.round(value * scale);
  return scaled / scale;
}
|
|
70
|
+
|
|
71
|
+
/**
 * Summarize a list of latency samples.
 *
 * @param {number[]} values - Latency samples.
 * @returns {{samples: number, p50: number, p95: number, p99: number, min: number, max: number, mean: number}}
 *   Sample count plus percentile, min, max and mean figures, each rounded via
 *   `round`. Every field is 0 when `values` is empty.
 */
function summarize(values) {
  if (values.length === 0) {
    return { samples: 0, p50: 0, p95: 0, p99: 0, min: 0, max: 0, mean: 0 };
  }
  // Fold instead of Math.min(...values) / Math.max(...values): spreading passes
  // every sample as a separate call argument, which throws RangeError once the
  // sample count (driven by the corpus size) gets large enough.
  const total = values.reduce((acc, v) => acc + v, 0);
  const lowest = values.reduce((lo, v) => Math.min(lo, v), Infinity);
  const highest = values.reduce((hi, v) => Math.max(hi, v), -Infinity);
  return {
    samples: values.length,
    p50: round(percentile(values, 50)),
    p95: round(percentile(values, 95)),
    p99: round(percentile(values, 99)),
    min: round(lowest),
    max: round(highest),
    mean: round(total / values.length),
  };
}
|
|
86
|
+
|
|
87
|
+
/**
 * Short commit hash of the git checkout in the current working directory.
 *
 * Used for provenance only. Git is invoked directly (no shell) with a fixed
 * argument list, so no user input reaches the command line.
 *
 * @returns {string|null} Trimmed output of `git rev-parse --short HEAD`, or
 *   null when the command cannot be run or fails.
 */
function gitSha() {
  const gitArgs = ['rev-parse', '--short', 'HEAD'];
  // stdin and stderr are discarded; only stdout is captured.
  const spawnOptions = { stdio: ['ignore', 'pipe', 'ignore'] };
  try {
    const stdout = execFileSync('git', gitArgs, spawnOptions);
    return stdout.toString().trim();
  } catch {
    return null;
  }
}
|
|
99
|
+
|
|
100
|
+
/**
 * Work out which package version produced a snapshot.
 *
 * The `npm_package_version` environment variable wins when it is set and
 * non-empty. Otherwise the `version` field of `../package.json` (relative to
 * this module) is used.
 *
 * @returns {string|null} The version string, or null when the manifest cannot
 *   be read or parsed, or its `version` is not a string.
 */
export function resolveAudreyVersion() {
  const envVersion = process.env.npm_package_version;
  if (envVersion) {
    return envVersion;
  }

  try {
    const manifestUrl = new URL('../package.json', import.meta.url);
    const { version } = JSON.parse(readFileSync(manifestUrl, 'utf8'));
    if (typeof version === 'string') {
      return version;
    }
    return null;
  } catch {
    return null;
  }
}
|
|
112
|
+
|
|
113
|
+
/**
 * Describe the machine and runtime a snapshot was taken on.
 *
 * @returns {{node: string, v8: string, platform: string, arch: string, osRelease: string, cpuCount: number, cpuModel: string, memoryGb: number}}
 *   Runtime versions, OS identification, CPU count and model ('unknown' when
 *   no CPU entries are reported), and total memory in GiB to one decimal place.
 */
function machineProvenance() {
  const processors = cpus();
  const [firstCpu] = processors;
  const bytesPerGb = 1024 ** 3;
  return {
    node: process.versions.node,
    v8: process.versions.v8,
    platform: platform(),
    arch: arch(),
    osRelease: release(),
    cpuCount: processors.length,
    cpuModel: firstCpu ? firstCpu.model : 'unknown',
    memoryGb: Math.round((totalmem() / bytesPerGb) * 10) / 10,
  };
}
|
|
128
|
+
|
|
129
|
+
/**
 * Create a fresh, uniquely named scratch directory for one benchmark run.
 *
 * Parent locations are tried in order: the AUDREY_PERF_PARENT_DIR environment
 * variable (when set), the OS temp directory, then `benchmarks/.tmp` under the
 * current working directory. The first parent that can be created and written
 * to wins.
 *
 * @returns {string} Path of the new `audrey-perf-snapshot-*` directory.
 * @throws The error from the last parent tried when every candidate fails.
 */
function createDataDir() {
  const candidates = [
    process.env.AUDREY_PERF_PARENT_DIR,
    tmpdir(),
    join(process.cwd(), 'benchmarks', '.tmp'),
  ].filter((dir) => Boolean(dir));

  let failure;
  for (let i = 0; i < candidates.length; i += 1) {
    const parentDir = candidates[i];
    try {
      mkdirSync(parentDir, { recursive: true });
      const prefix = join(parentDir, 'audrey-perf-snapshot-');
      return mkdtempSync(prefix);
    } catch (err) {
      // Remember the failure and fall through to the next candidate parent.
      failure = err;
    }
  }

  if (failure) {
    throw failure;
  }
  throw new Error('Unable to create perf snapshot data directory');
}
|
|
147
|
+
|
|
148
|
+
/**
 * Parse the command-line flags of the perf snapshot script.
 *
 * Recognized flags:
 *   --sizes a,b,c      comma-separated positive integers (corpus sizes)
 *   --recall-runs n    non-negative integer (recall calls per corpus size)
 *   --out path         output file, resolved to an absolute path
 *   --json             print JSON instead of the markdown table
 *
 * Unknown tokens are ignored. A value flag without a following value is
 * ignored as well.
 *
 * @param {string[]} [argv] - Arguments to parse; defaults to the process arguments.
 * @returns {{sizes: number[], recallRuns: number, out: (string|null), json: boolean}}
 */
function parseArgs(argv = process.argv.slice(2)) {
  const args = {
    sizes: [...DEFAULT_SIZES],
    recallRuns: DEFAULT_RECALL_RUNS,
    out: null,
    json: false,
  };
  for (let i = 0; i < argv.length; i++) {
    const token = argv[i];
    if (token === '--sizes' && argv[i + 1]) {
      const raw = argv[++i];
      const sizes = raw
        .split(',')
        .map((s) => Number.parseInt(s.trim(), 10))
        .filter((n) => Number.isFinite(n) && n > 0);
      if (sizes.length > 0) {
        args.sizes = sizes;
      } else {
        // An empty list would make the run measure nothing and still exit 0.
        console.warn(`[audrey] ignoring --sizes "${raw}": no positive integers found; using defaults`);
      }
    } else if (token === '--recall-runs' && argv[i + 1]) {
      const raw = argv[++i];
      const runs = Number.parseInt(raw, 10);
      if (Number.isFinite(runs) && runs >= 0) {
        args.recallRuns = runs;
      } else {
        // NaN would make the recall loop condition false, silently skipping recall.
        console.warn(`[audrey] ignoring --recall-runs "${raw}": not a non-negative integer; using default`);
      }
    } else if (token === '--out' && argv[i + 1]) {
      args.out = resolve(argv[++i]);
    } else if (token === '--json') {
      args.json = true;
    }
  }
  return args;
}
|
|
172
|
+
|
|
173
|
+
/**
 * Benchmark one corpus size against a fresh Audrey instance in a throwaway
 * data directory.
 *
 * Steps: encode `size` seed memories (timing each call), wait for the
 * post-encode queue to drain, then issue `recallRuns` hybrid recall queries
 * (timing each call). Queue processing times are collected from the
 * `processing_ms` field of 'post-encode-complete' events.
 *
 * @param {{size: number, recallRuns: number}} options - Corpus size and number
 *   of timed recall calls.
 * @returns {Promise<{corpusSize: number, encodeMs: object, hybridRecallMs: object, postEncodeQueueMs: object, queueEvents: number}>}
 *   Latency summaries for this corpus size.
 * @throws {Error} When the post-encode queue reports it did not drain.
 */
async function runOneSize({ size, recallRuns }) {
  const dataDir = createDataDir();
  // Construct inside the try so the scratch directory is removed even when the
  // Audrey constructor itself throws.
  let audrey;
  try {
    audrey = new Audrey({
      dataDir,
      agent: 'perf-snapshot',
      embedding: { provider: 'mock', dimensions: 64 },
      llm: { provider: 'mock' },
    });

    const queueProcessingTimes = [];
    audrey.on('post-encode-complete', (event) => {
      queueProcessingTimes.push(event.processing_ms);
    });

    const encodeTimes = [];
    for (let i = 0; i < size; i++) {
      const content = `${SEED_POOL[i % SEED_POOL.length]} (sample ${i})`;
      const startedAt = performance.now();
      await audrey.encode({
        content,
        source: 'direct-observation',
        tags: ['perf-snapshot'],
      });
      encodeTimes.push(performance.now() - startedAt);
    }

    // NOTE(review): 60_000 is presumably a timeout in milliseconds — confirm
    // against drainPostEncodeQueue's signature.
    const drain = await audrey.drainPostEncodeQueue(60_000);
    if (!drain.drained) {
      throw new Error(`post-encode queue did not drain at size=${size}`);
    }

    const recallTimes = [];
    for (let i = 0; i < recallRuns; i++) {
      const query = QUERY_POOL[i % QUERY_POOL.length];
      const startedAt = performance.now();
      await audrey.recall(query, { limit: 5, retrieval: 'hybrid' });
      recallTimes.push(performance.now() - startedAt);
    }

    return {
      corpusSize: size,
      encodeMs: summarize(encodeTimes),
      hybridRecallMs: summarize(recallTimes),
      postEncodeQueueMs: summarize(queueProcessingTimes),
      queueEvents: queueProcessingTimes.length,
    };
  } finally {
    // Nested finally: a failing close() must not skip directory cleanup.
    try {
      audrey?.close();
    } finally {
      rmSync(dataDir, { recursive: true, force: true });
    }
  }
}
|
|
225
|
+
|
|
226
|
+
/**
 * Run the benchmark for every requested corpus size, one after another, and
 * assemble the snapshot document.
 *
 * @param {{sizes?: number[], recallRuns?: number}} [options] - Corpus sizes to
 *   measure and the number of timed recall calls per size; both fall back to
 *   the module defaults.
 * @returns {Promise<object>} Snapshot holding the generation timestamp, total
 *   duration, version and git provenance, a methodology description, machine
 *   details, and one result row per corpus size under `sizes`.
 */
export async function runPerfSnapshot({ sizes = DEFAULT_SIZES, recallRuns = DEFAULT_RECALL_RUNS } = {}) {
  const startedAt = Date.now();

  // Sizes run sequentially so the measurements do not compete with each other.
  const rows = [];
  for (const size of sizes) {
    const row = await runOneSize({ size, recallRuns });
    rows.push(row);
  }

  const notes = [
    'Latency is wall-clock for a single call from a JS caller. Cloud and ',
    'local 384-dim providers will report higher recall latency dominated by ',
    'embedding cost and network. Run on your own hardware before quoting.',
  ].join('');

  const methodology = {
    embedding: 'mock provider, 64 dimensions (in-process, no network)',
    llm: 'mock provider (in-process)',
    retrieval: 'hybrid (vector + lexical) with limit=5',
    sizes,
    recallRunsPerSize: recallRuns,
    notes,
  };

  return {
    generatedAt: new Date(startedAt).toISOString(),
    durationMs: Date.now() - startedAt,
    audreyVersion: resolveAudreyVersion(),
    gitSha: gitSha(),
    methodology,
    machine: machineProvenance(),
    sizes: rows,
  };
}
|
|
252
|
+
|
|
253
|
+
/**
 * Format a millisecond value for the markdown table.
 *
 * @param {number} value - Latency in milliseconds.
 * @returns {string} '0' for exactly zero, two decimals for values below 1,
 *   otherwise one decimal.
 */
function formatMs(value) {
  if (value === 0) {
    return '0';
  }
  const fractionDigits = value < 1 ? 2 : 1;
  return value.toFixed(fractionDigits);
}
|
|
258
|
+
|
|
259
|
+
/**
 * Render a snapshot as human-readable text: a title line, machine and
 * generation details, then a markdown table with one row per corpus size.
 *
 * @param {object} snapshot - Snapshot in the shape produced by runPerfSnapshot.
 * @returns {string} Newline-joined text without a trailing newline.
 */
export function formatMarkdownTable(snapshot) {
  const { machine } = snapshot;
  const versionLabel = snapshot.audreyVersion || 'dev';
  const shaSuffix = snapshot.gitSha ? ` (${snapshot.gitSha})` : '';

  const output = [
    `Audrey perf snapshot — ${versionLabel} on ${machine.platform}/${machine.arch}`,
    '',
    `Node ${machine.node} · ${machine.cpuCount}x ${machine.cpuModel} · ${machine.memoryGb} GB RAM`,
    `Generated ${snapshot.generatedAt}${shaSuffix}`,
    '',
    '| Corpus size | Encode p50 (ms) | Encode p95 (ms) | Recall p50 (ms) | Recall p95 (ms) | Recall p99 (ms) |',
    '|---|---|---|---|---|---|',
  ];

  for (const row of snapshot.sizes) {
    const cells = [
      row.corpusSize.toLocaleString(),
      formatMs(row.encodeMs.p50),
      formatMs(row.encodeMs.p95),
      formatMs(row.hybridRecallMs.p50),
      formatMs(row.hybridRecallMs.p95),
      formatMs(row.hybridRecallMs.p99),
    ];
    output.push(`| ${cells.join(' | ')} |`);
  }

  return output.join('\n');
}
|
|
286
|
+
|
|
287
|
+
// CLI entry point: runs only when this module is the script Node was started
// with, not when it is imported by another module.
if (process.argv[1] && import.meta.url === pathToFileURL(process.argv[1]).href) {
  const cli = parseArgs();

  // Write the snapshot to --out (when given), then print JSON or the table.
  const report = (snapshot) => {
    const toJson = () => JSON.stringify(snapshot, null, 2) + '\n';
    if (cli.out) {
      writeFileSync(cli.out, toJson());
    }
    const text = cli.json ? toJson() : formatMarkdownTable(snapshot) + '\n';
    process.stdout.write(text);
  };

  // Any failure, including one while writing output, ends with exit code 1.
  const fail = (err) => {
    console.error('[audrey] perf snapshot failed:', err);
    process.exit(1);
  };

  runPerfSnapshot({ sizes: cli.sizes, recallRuns: cli.recallRuns }).then(report).catch(fail);
}
|
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
import { performance } from 'node:perf_hooks';
|
|
2
|
+
import { mkdirSync, mkdtempSync, rmSync } from 'node:fs';
|
|
3
|
+
import { join } from 'node:path';
|
|
4
|
+
import { tmpdir } from 'node:os';
|
|
5
|
+
import { pathToFileURL } from 'node:url';
|
|
6
|
+
import { Audrey } from '../dist/src/index.js';
|
|
7
|
+
|
|
8
|
+
// Default number of timed encode calls, and of timed recall calls, per run.
const RUNS = 20;

// Latency budgets for the mock-provider perf gate.
// Source: CHANGELOG.md#0220---2026-04-28 (the Audrey/MemoryGym latency pass).
// The gate exists to catch mechanical regressions in Audrey CI before live GPU
// benchmarks or MemoryGym release gates find them.
export const PERF_BUDGETS = Object.freeze({
  encodeResponseP95Ms: 50,
  hybridRecallP95Ms: 25,
  queueProcessingP50Ms: 5,
});
|
|
18
|
+
|
|
19
|
+
/**
 * Round a millisecond value to three decimal places.
 *
 * @param {number} value - Number to round.
 * @returns {number} `value` rounded to the nearest thousandth.
 */
function roundMs(value) {
  const scale = 1000;
  const scaled = Math.round(value * scale);
  return scaled / scale;
}
|
|
22
|
+
|
|
23
|
+
/**
 * Nearest-rank percentile of a list of numbers.
 *
 * @param {number[]} values - Samples; the array is copied, never sorted in place.
 * @param {number} percentileRank - Percentile to read, expected in the range 0-100.
 * @returns {number} The sample at the requested rank, or 0 when `values` is empty.
 */
function percentile(values, percentileRank) {
  if (values.length === 0) return 0;
  const sorted = [...values].sort((a, b) => a - b);
  // Nearest-rank index is ceil(rank% * n) - 1. Clamp both ends: a rank of 0 (or
  // below) would otherwise produce index -1 and return undefined, and a rank
  // above 100 would run past the last element.
  const nearestRank = Math.ceil((percentileRank / 100) * sorted.length) - 1;
  const index = Math.min(sorted.length - 1, Math.max(0, nearestRank));
  return sorted[index];
}
|
|
29
|
+
|
|
30
|
+
/**
 * Summarize a list of latency samples for the perf gate.
 *
 * @param {number[]} values - Latency samples.
 * @returns {{p50: number, p95: number, min: number, max: number}} Figures
 *   rounded via `roundMs`; every field is 0 when `values` is empty.
 */
function stats(values) {
  if (values.length === 0) {
    return { p50: 0, p95: 0, min: 0, max: 0 };
  }
  // Fold instead of Math.min(...values) / Math.max(...values): spreading passes
  // every sample as a separate call argument, which throws RangeError when the
  // caller asks for a very large number of runs.
  const lowest = values.reduce((lo, v) => Math.min(lo, v), Infinity);
  const highest = values.reduce((hi, v) => Math.max(hi, v), -Infinity);
  return {
    p50: roundMs(percentile(values, 50)),
    p95: roundMs(percentile(values, 95)),
    min: roundMs(lowest),
    max: roundMs(highest),
  };
}
|
|
41
|
+
|
|
42
|
+
/**
 * Fail the perf gate when a measurement does not come in under its budget.
 *
 * @param {string} name - Label of the metric, used in the error message.
 * @param {number} actual - Measured latency in milliseconds.
 * @param {number} budget - Budget in milliseconds; `actual` must be strictly below it.
 * @throws {Error} When `actual` is not a finite number, or is at or above `budget`.
 */
function assertBudget(name, actual, budget) {
  // Fail closed: NaN or undefined compares false against any budget, so without
  // this check a missing measurement would silently pass the gate.
  if (!Number.isFinite(actual)) {
    throw new Error(`${name} measurement ${actual} is not a finite number (budget ${budget}ms)`);
  }
  if (actual >= budget) {
    throw new Error(`${name} ${actual}ms exceeded budget ${budget}ms`);
  }
}
|
|
47
|
+
|
|
48
|
+
/**
 * Build the text of the index-th benchmark memory.
 *
 * @param {number} index - Zero-based sample number.
 * @returns {string} One of five seed sentences (chosen by index modulo five)
 *   followed by " Perf sample <index>.".
 */
function seedContent(index) {
  const templates = [
    'Stripe API returned HTTP 429 during checkout retry and needs exponential backoff.',
    'Project memory routing should prefer Audrey MCP for durable agent context.',
    'Tool trace learning marks repeated npm spawn EPERM failures as risky on Windows shells.',
    'Calendar authority should come from the official source before inferred user notes.',
    'Vector recall is faster but loses BM25 lexical signal on exact identifiers.',
  ];
  const slot = index % templates.length;
  const sentence = templates[slot];
  return `${sentence} Perf sample ${index}.`;
}
|
|
58
|
+
|
|
59
|
+
/**
 * Create a fresh, uniquely named scratch directory for the perf gate.
 *
 * Parent locations are tried in order: the AUDREY_PERF_PARENT_DIR environment
 * variable (when set), the OS temp directory, then `benchmarks/.tmp` under the
 * current working directory. The first parent that can be created and written
 * to wins.
 *
 * @returns {string} Path of the new `audrey-perf-*` directory.
 * @throws The error from the last parent tried when every candidate fails.
 */
function createPerfDataDir() {
  const candidates = [
    process.env.AUDREY_PERF_PARENT_DIR,
    tmpdir(),
    join(process.cwd(), 'benchmarks', '.tmp'),
  ].filter((dir) => Boolean(dir));

  let failure;
  for (let i = 0; i < candidates.length; i += 1) {
    const parentDir = candidates[i];
    try {
      mkdirSync(parentDir, { recursive: true });
      const prefix = join(parentDir, 'audrey-perf-');
      return mkdtempSync(prefix);
    } catch (err) {
      // Remember the failure and fall through to the next candidate parent.
      failure = err;
    }
  }

  if (failure) {
    throw failure;
  }
  throw new Error('Unable to create Audrey perf benchmark data directory');
}
|
|
78
|
+
|
|
79
|
+
/**
 * Runs the Audrey performance gate: times `runs` encodes and `runs` hybrid
 * recalls against a throwaway Audrey instance configured with mock embedding
 * and LLM providers, collects post-encode queue processing times from
 * `post-encode-complete` events, and checks the results against `budgets`.
 *
 * @param {object} [options]
 * @param {number} [options.runs=RUNS] - Number of encode and recall iterations.
 * @param {object} [options.budgets=PERF_BUDGETS] - Limits read as
 *   `encodeResponseP95Ms`, `hybridRecallP95Ms` and `queueProcessingP50Ms`.
 * @param {(line: string) => void} [options.out=console.log] - Receives the
 *   one-line success summary.
 * @returns {Promise<object>} The measured result: `runs`, `budgets`, the three
 *   timing summaries, `queue_events`, and a `status` snapshot.
 * @throws {Error} When the post-encode queue does not drain, the number of
 *   queue events differs from `runs`, or a budget check fails.
 */
export async function runPerfBenchmark({
  runs = RUNS,
  budgets = PERF_BUDGETS,
  out = console.log,
} = {}) {
  const dataDir = createPerfDataDir();

  // Outer try/finally: the temporary data directory must be removed even when
  // constructing or closing the Audrey instance throws.
  try {
    const audrey = new Audrey({
      dataDir,
      agent: 'perf-bench',
      embedding: { provider: 'mock', dimensions: 64 },
      llm: { provider: 'mock' },
    });

    try {
      const queueProcessingTimes = [];
      audrey.on('post-encode-complete', event => {
        queueProcessingTimes.push(event.processing_ms);
      });

      // Encodes are awaited one at a time so each sample measures a single call.
      const encodeTimes = [];
      for (let i = 0; i < runs; i += 1) {
        const startedAt = performance.now();
        await audrey.encode({
          content: seedContent(i),
          source: 'direct-observation',
          tags: ['perf-gate'],
          affect: { valence: i % 2 === 0 ? 0.3 : -0.1, arousal: 0.2 },
        });
        encodeTimes.push(performance.now() - startedAt);
      }

      // NOTE(review): 5000 is presumably a timeout in milliseconds — confirm
      // against the Audrey API.
      const drain = await audrey.drainPostEncodeQueue(5000);
      if (!drain.drained) {
        throw new Error(`post-encode queue did not drain: ${drain.pendingIds.join(', ')}`);
      }

      const recallTimes = [];
      for (let i = 0; i < runs; i += 1) {
        const startedAt = performance.now();
        await audrey.recall('Stripe API 429 retry memory routing', {
          limit: 5,
          retrieval: 'hybrid',
        });
        recallTimes.push(performance.now() - startedAt);
      }

      // One snapshot so both reported status fields come from the same call.
      const memoryStatus = audrey.memoryStatus();
      const result = {
        runs,
        budgets,
        encode_response_ms: stats(encodeTimes),
        hybrid_recall_ms: stats(recallTimes),
        queue_processing_ms: stats(queueProcessingTimes),
        queue_events: queueProcessingTimes.length,
        status: {
          pending_consolidation_count: memoryStatus.pending_consolidation_count,
          default_retrieval_mode: memoryStatus.default_retrieval_mode,
        },
      };

      if (queueProcessingTimes.length !== runs) {
        throw new Error(`expected ${runs} post-encode queue events, got ${queueProcessingTimes.length}`);
      }

      assertBudget('encode response p95', result.encode_response_ms.p95, budgets.encodeResponseP95Ms);
      assertBudget('hybrid recall p95', result.hybrid_recall_ms.p95, budgets.hybridRecallP95Ms);
      assertBudget('queue processing p50', result.queue_processing_ms.p50, budgets.queueProcessingP50Ms);

      out(`Audrey perf gate passed: encode p95=${result.encode_response_ms.p95}ms, `
        + `hybrid recall p95=${result.hybrid_recall_ms.p95}ms, `
        + `queue p50=${result.queue_processing_ms.p50}ms`);
      return result;
    } finally {
      audrey.close();
    }
  } finally {
    rmSync(dataDir, { recursive: true, force: true });
  }
}
|
|
155
|
+
|
|
156
|
+
// Run the gate only when this module is the script Node was started with,
// not when it is imported by another module.
if (process.argv[1] && pathToFileURL(process.argv[1]).href === import.meta.url) {
  const reportFailure = (err) => {
    console.error('[audrey] perf gate failed:', err);
    process.exit(1);
  };
  runPerfBenchmark().catch(reportFailure);
}
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
import { existsSync, readFileSync, readdirSync } from 'node:fs';
|
|
2
|
+
import { basename, dirname, isAbsolute, join, relative, resolve } from 'node:path';
|
|
3
|
+
import { fileURLToPath } from 'node:url';
|
|
4
|
+
|
|
5
|
+
// Parent of the directory that contains this module; paths inside it are
// reported relative to it.
const MODULE_DIR = dirname(fileURLToPath(import.meta.url));
const ROOT = resolve(MODULE_DIR, '..');

// A drive letter followed by ":" and a slash or backslash, at the start of the
// text or after a non-letter character.
const WINDOWS_DRIVE_PATTERN = /(^|[^a-z])[A-Z]:[\\/]/i;

// The literal `\\?\` prefix.
const EXTENDED_PATH_PATTERN = /\\\\\?\\/;

// Any `file://` occurrence, case-insensitive.
const FILE_URL_PATTERN = /file:\/\//i;
|
|
9
|
+
|
|
10
|
+
/**
 * Tells whether a string starts with a URL scheme followed by "://".
 *
 * @param {string} value - Text to inspect.
 * @returns {boolean} True when the text begins with `<scheme>://`.
 */
function isUrl(value) {
  const schemePrefix = /^[a-z][a-z0-9+.-]*:\/\//i;
  return schemePrefix.test(value);
}
|
|
13
|
+
|
|
14
|
+
/**
 * Converts a value into a form that is safe to publish in benchmark artifacts.
 *
 * - Non-strings are returned unchanged.
 * - The current Node executable path becomes `'node'`.
 * - URLs are returned unchanged.
 * - Paths inside ROOT become ROOT-relative paths with forward slashes.
 * - Any other path-like string becomes `[LOCAL-PATH:<file name>]`.
 * - Everything else is returned unchanged.
 *
 * @param {*} value - Value to sanitise.
 * @returns {*} The sanitised value.
 */
export function publicPath(value) {
  if (typeof value !== 'string') return value;
  if (value === process.execPath) return 'node';
  if (isUrl(value)) return value;

  // A Windows-style absolute path handled on a non-Windows host is not
  // absolute for node:path. Resolving it would treat the whole string as a
  // file name under the working directory and could hand the full path back
  // as a "repo-relative" path, so it skips the relative branch entirely.
  const looksWindowsAbsolute = /^[a-z]:[\\/]/i.test(value) || value.startsWith('\\\\');
  const isForeignAbsolute = looksWindowsAbsolute && !isAbsolute(value);

  if (!isForeignAbsolute) {
    const rel = relative(ROOT, resolve(value));
    if (rel && !rel.startsWith('..') && !isAbsolute(rel)) {
      return rel.replaceAll('\\', '/');
    }
  }
  if (value.includes('\\') || value.includes('/') || isAbsolute(value)) {
    // POSIX basename() does not split on backslashes; normalise first so the
    // placeholder only ever carries the final path segment.
    const leaf = basename(value.replaceAll('\\', '/'));
    return `[LOCAL-PATH:${leaf || 'path'}]`;
  }
  return value;
}
|
|
29
|
+
|
|
30
|
+
/**
 * Sanitises every part of a command line with publicPath.
 *
 * @param {Array<*>} [command=[]] - Command and arguments.
 * @returns {Array<*>} New array with each part passed through publicPath.
 */
export function publicCommand(command = []) {
  const sanitisePart = (part) => publicPath(part);
  return command.map(sanitisePart);
}
|
|
33
|
+
|
|
34
|
+
/**
 * Recursively sanitises an artifact value: arrays and objects are rebuilt
 * with every nested value processed, and every leaf goes through publicPath.
 *
 * @param {*} value - Artifact value of any shape.
 * @returns {*} A sanitised copy for arrays and objects, or the sanitised leaf.
 */
export function publicArtifactValue(value) {
  const isContainer = value !== null && typeof value === 'object';
  if (!isContainer) {
    return publicPath(value);
  }
  if (Array.isArray(value)) {
    return value.map((item) => publicArtifactValue(item));
  }

  const sanitisedEntries = Object.entries(value).map(
    ([key, item]) => [key, publicArtifactValue(item)],
  );
  return Object.fromEntries(sanitisedEntries);
}
|
|
41
|
+
|
|
42
|
+
/**
 * Reports whether text matches any of the local-path patterns: a Windows
 * drive path, the `\\?\` prefix, or a `file://` URL.
 *
 * @param {string} text - Text to scan.
 * @returns {boolean} True when at least one pattern matches.
 */
export function containsLocalPath(text) {
  const detectors = [WINDOWS_DRIVE_PATTERN, EXTENDED_PATH_PATTERN, FILE_URL_PATTERN];
  return detectors.some((pattern) => pattern.test(text));
}
|
|
47
|
+
|
|
48
|
+
/**
 * Walks a value depth-first and lists every string that contains a local path.
 *
 * @param {*} value - Value to inspect; arrays and objects are traversed.
 * @param {string} [path='$'] - Location label of `value`, extended with
 *   `[index]` for array items and `.key` for object properties.
 * @returns {string[]} One `"<location>: <string>"` entry per leaking string,
 *   in traversal order.
 */
export function findLocalPathLeaks(value, path = '$') {
  const leaks = [];

  const visit = (node, location) => {
    if (typeof node === 'string') {
      if (containsLocalPath(node)) leaks.push(`${location}: ${node}`);
      return;
    }
    if (Array.isArray(node)) {
      node.forEach((item, index) => visit(item, `${location}[${index}]`));
      return;
    }
    if (node && typeof node === 'object') {
      for (const [key, item] of Object.entries(node)) {
        visit(item, `${location}.${key}`);
      }
    }
  };

  visit(value, path);
  return leaks;
}
|
|
60
|
+
|
|
61
|
+
/**
 * Recursively lists the non-directory entries below a directory.
 *
 * @param {string} dir - Directory to traverse.
 * @param {string} [root=dir] - Base directory the returned paths are relative to.
 * @returns {string[]} Paths relative to `root`, using forward slashes.
 */
export function walkFiles(dir, root = dir) {
  const collected = [];
  for (const entry of readdirSync(dir, { withFileTypes: true })) {
    const entryPath = join(dir, entry.name);
    if (entry.isDirectory()) {
      for (const nested of walkFiles(entryPath, root)) {
        collected.push(nested);
      }
    } else {
      collected.push(relative(root, entryPath).replaceAll('\\', '/'));
    }
  }
  return collected;
}
|
|
68
|
+
|
|
69
|
+
/**
 * Returns the listed files whose UTF-8 content contains a local path.
 *
 * @param {string} root - Directory the file names are relative to.
 * @param {Iterable<string>} files - File names to check; names that do not
 *   exist under `root` are skipped.
 * @returns {string[]} The leaking file names, in input order.
 */
export function scanFilesForLocalPaths(root, files) {
  return [...files].filter((file) => {
    const fullPath = join(root, file);
    if (!existsSync(fullPath)) return false;
    return containsLocalPath(readFileSync(fullPath, 'utf-8'));
  });
}
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
/**
 * Builds one leaderboard row for a LoCoMo accuracy score.
 *
 * @param {string} system - Name of the memory system.
 * @param {number} score - Reported accuracy score.
 * @param {string} source - URL of the publication reporting the score.
 * @param {string} note - Short provenance note.
 * @returns {{ system: string, benchmark: string, score: number, unit: string, source: string, note: string }}
 */
const locomoAccuracy = (system, score, source, note) => ({
  system,
  benchmark: 'LoCoMo',
  score,
  unit: 'accuracy',
  source,
  note,
});

/** Externally published LoCoMo scores, listed from highest to lowest. */
export const PUBLISHED_LEADERBOARD = [
  locomoAccuracy(
    'MIRIX',
    85.4,
    'https://arxiv.org/abs/2507.07957',
    'Published LoCoMo result from the MIRIX paper.',
  ),
  locomoAccuracy(
    'Letta Filesystem',
    74.0,
    'https://www.letta.com/blog/benchmarking-ai-agent-memory',
    'Filesystem-style memory result reported by Letta.',
  ),
  locomoAccuracy(
    'Mem0 Graph Memory',
    68.5,
    'https://arxiv.org/abs/2504.19413',
    'Graph memory variant reported in the Mem0 paper.',
  ),
  locomoAccuracy(
    'Mem0',
    66.9,
    'https://arxiv.org/abs/2504.19413',
    'Core Mem0 LoCoMo score reported in the Mem0 paper.',
  ),
  locomoAccuracy(
    'OpenAI Memory',
    52.9,
    'https://arxiv.org/abs/2504.19413',
    'OpenAI memory baseline as reported by the Mem0 paper.',
  ),
];
|
|
43
|
+
|
|
44
|
+
/**
 * Memory-system trends, each with a title, a one-sentence summary and a
 * source URL. Written as [title, summary, source] rows and expanded into
 * objects below.
 */
export const MEMORY_TRENDS = [
  [
    'Memory is moving from flat retrieval to typed systems',
    'Recent work treats episodic, semantic, procedural, and graph memory as separate but cooperating layers.',
    'https://arxiv.org/abs/2507.03724',
  ],
  [
    'Benchmarks now emphasize multi-session realism',
    'LongMemEval and LoCoMo push memory systems toward temporal updates, abstraction, and cross-session reasoning instead of single-turn fact recall.',
    'https://arxiv.org/abs/2410.10813',
  ],
  [
    'Context engineering is now competing with retrieval-first designs',
    'Letta argues filesystem and memory-block approaches can outperform simpler retrieval-only memory on realistic long-horizon tasks.',
    'https://www.letta.com/blog/memory-blocks',
  ],
  [
    'Production teams care about latency and token footprint, not just recall quality',
    'Mem0 frames memory as a cost and latency optimization surface in addition to a personalization surface.',
    'https://arxiv.org/abs/2504.19413',
  ],
  [
    'Temporal and multimodal memory are becoming table stakes',
    'MIRIX and Graphiti both model time and state change explicitly instead of assuming memories stay forever true.',
    'https://arxiv.org/abs/2507.07957',
  ],
].map(([title, summary, source]) => ({ title, summary, source }));
|