@1mbrain/benchmarks 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. package/README.md +85 -0
  2. package/fixtures/1mbrain-focused-mini/1mbrain-focused-mini.json +928 -0
  3. package/fixtures/1mbrain-focused-mini/README.md +45 -0
  4. package/fixtures/adversarial-memory/dataset_claude_adversarial.json +3333 -0
  5. package/fixtures/adversarial-memory/dataset_gemini_adversarial_memory.json +2984 -0
  6. package/fixtures/balanced-mini/dataset_claude_balanced_mini.json +2077 -0
  7. package/fixtures/balanced-mini/dataset_gemini_balanced_mini.json +1995 -0
  8. package/fixtures/generate_datasets.js +1741 -0
  9. package/fixtures/graph-stress-hard/README.md +43 -0
  10. package/fixtures/graph-stress-hard/dataset_graph_stress_hard.json +4374 -0
  11. package/fixtures/graph-stress-hard/generate_graph_stress_hard.js +526 -0
  12. package/fixtures/realistic-medium/dataset_claude_realistic_medium.json +7462 -0
  13. package/fixtures/realistic-medium/dataset_gemini_realistic_medium.json +7277 -0
  14. package/fixtures/realistic-medium/gen_claude_medium.js +600 -0
  15. package/package.json +22 -0
  16. package/reports/benchmark_report.md +48 -0
  17. package/reports/benchmark_report_claude_adversarial.md +42 -0
  18. package/reports/benchmark_report_claude_adversarial_adaptive.md +42 -0
  19. package/reports/benchmark_report_claude_adversarial_adaptive2_fast.md +42 -0
  20. package/reports/benchmark_report_claude_adversarial_adaptive_fast.md +42 -0
  21. package/reports/benchmark_report_claude_adversarial_rerank.md +42 -0
  22. package/reports/benchmark_report_claude_balanced_mini.md +42 -0
  23. package/reports/benchmark_report_claude_balanced_mini_adaptive.md +42 -0
  24. package/reports/benchmark_report_claude_balanced_mini_adaptive2_fast.md +42 -0
  25. package/reports/benchmark_report_claude_balanced_mini_adaptive_fast.md +42 -0
  26. package/reports/benchmark_report_claude_balanced_mini_rerank.md +42 -0
  27. package/reports/benchmark_report_claude_realistic_medium.md +42 -0
  28. package/reports/benchmark_report_claude_realistic_medium_adaptive.md +42 -0
  29. package/reports/benchmark_report_claude_realistic_medium_adaptive2_fast.md +42 -0
  30. package/reports/benchmark_report_claude_realistic_medium_adaptive_fast.md +42 -0
  31. package/reports/benchmark_report_claude_realistic_medium_evidence_rerank_local.md +42 -0
  32. package/reports/benchmark_report_claude_realistic_medium_openai_evidence_rerank.md +41 -0
  33. package/reports/benchmark_report_claude_realistic_medium_openai_multi_signal.md +41 -0
  34. package/reports/benchmark_report_claude_realistic_medium_openai_multi_signal_scoped.md +41 -0
  35. package/reports/benchmark_report_claude_realistic_medium_openai_phase8_no_judge.md +42 -0
  36. package/reports/benchmark_report_claude_realistic_medium_openai_rankingpolicy.md +41 -0
  37. package/reports/benchmark_report_claude_realistic_medium_openai_stale_filter.md +41 -0
  38. package/reports/benchmark_report_claude_realistic_medium_openai_stale_filter_absence_fix.md +41 -0
  39. package/reports/benchmark_report_claude_realistic_medium_openai_write_time_invalidation.md +41 -0
  40. package/reports/benchmark_report_claude_realistic_medium_rerank.md +42 -0
  41. package/reports/benchmark_report_claude_realistic_medium_stale_filter_local.md +42 -0
  42. package/reports/benchmark_report_graph_stress_hard.md +42 -0
  43. package/reports/benchmark_report_graph_stress_hard_absence_fix.md +42 -0
  44. package/reports/benchmark_report_graph_stress_hard_adaptive.md +42 -0
  45. package/reports/benchmark_report_graph_stress_hard_evidence_rerank.md +42 -0
  46. package/reports/benchmark_report_graph_stress_hard_multi_signal_current_guardrail.md +42 -0
  47. package/reports/benchmark_report_graph_stress_hard_multi_signal_guardrail_fixed.md +42 -0
  48. package/reports/benchmark_report_graph_stress_hard_multi_signal_local.md +42 -0
  49. package/reports/benchmark_report_graph_stress_hard_multi_signal_scoped_guardrail.md +42 -0
  50. package/reports/benchmark_report_graph_stress_hard_multi_signal_vector_pure_guardrail.md +42 -0
  51. package/reports/benchmark_report_graph_stress_hard_phase8_sdk_guardrail.md +42 -0
  52. package/reports/benchmark_report_graph_stress_hard_rerank.md +42 -0
  53. package/reports/benchmark_report_graph_stress_hard_stale_filter.md +42 -0
  54. package/reports/benchmark_report_graph_stress_hard_write_time_invalidation.md +42 -0
  55. package/results/.gitignore +2 -0
  56. package/src/adapters/1mbrain.ts +317 -0
  57. package/src/adapters/keyword-embedding.ts +48 -0
  58. package/src/adapters/mem0.ts +124 -0
  59. package/src/adapters/qdrant.ts +214 -0
  60. package/src/adapters/unavailable.ts +49 -0
  61. package/src/adapters/vector-baseline.ts +149 -0
  62. package/src/datasets/focused-mini.ts +158 -0
  63. package/src/datasets/synthetic-agent-memory.ts +532 -0
  64. package/src/llm-evaluator.ts +262 -0
  65. package/src/metrics.ts +482 -0
  66. package/src/provider.ts +151 -0
  67. package/src/runner.ts +635 -0
  68. package/tsconfig.json +10 -0
  69. package/tsconfig.tsbuildinfo +1 -0
@@ -0,0 +1,49 @@
1
+ import type {
2
+ BenchmarkMemoryRecord,
3
+ BenchmarkRecallRequest,
4
+ BenchmarkRecallResult,
5
+ MemoryProviderAdapter,
6
+ ProviderAvailability,
7
+ } from '../provider.js';
8
+
9
/**
 * Stand-in adapter for a provider that cannot take part in a benchmark run.
 *
 * `availability()` always reports `unsupported` together with the reason
 * supplied at construction time. `reset`, `remember` and `recall` reject with
 * an `Error` whose message is that same reason; `close` does nothing.
 */
export class UnavailableAdapter implements MemoryProviderAdapter {
  /** Every optional capability is switched off. */
  readonly capabilities = {
    associations: false,
    forget: false,
    decay: false,
    portability: false,
  } as const;

  /**
   * @param name   Provider identifier exposed on the adapter.
   * @param label  Display label exposed on the adapter.
   * @param reason Text reported by `availability()` and used as the error
   *               message of every rejected operation.
   */
  constructor(
    readonly name: string,
    readonly label: string,
    private readonly reason: string,
  ) {}

  /** Reports the provider as unsupported, echoing the configured reason. */
  async availability(): Promise<ProviderAvailability> {
    const { reason } = this;
    return { status: 'unsupported', reason };
  }

  /** Always rejects with the configured reason. */
  async reset(_agentId: string): Promise<void> {
    this.refuse();
  }

  /** Always rejects with the configured reason. */
  async remember(_memory: BenchmarkMemoryRecord, _agentId: string): Promise<void> {
    this.refuse();
  }

  /** Always rejects with the configured reason. */
  async recall(
    _request: BenchmarkRecallRequest & {
      agentId: string;
    },
  ): Promise<BenchmarkRecallResult[]> {
    return this.refuse();
  }

  /** Nothing to release. */
  async close(): Promise<void> {
    // No-op.
  }

  /** Single place where the "provider unavailable" error is raised. */
  private refuse(): never {
    throw new Error(this.reason);
  }
}
@@ -0,0 +1,149 @@
1
+ import { rm, stat } from 'node:fs/promises';
2
+ import { tmpdir } from 'node:os';
3
+ import { join } from 'node:path';
4
+ import { SqliteDatabaseProvider, logger } from '@1mbrain/core';
5
+ import type {
6
+ BenchmarkMemoryRecord,
7
+ BenchmarkRecallRequest,
8
+ BenchmarkRecallResult,
9
+ MemoryProviderAdapter,
10
+ ProviderAvailability,
11
+ ProviderStats,
12
+ } from '../provider.js';
13
+ import { KeywordEmbeddingProvider } from './keyword-embedding.js';
14
+
15
/**
 * Benchmark adapter backed by a throwaway SQLite database file in the OS temp
 * directory. Memories are embedded with `KeywordEmbeddingProvider` and recalled
 * through the database provider's `searchByVector`.
 *
 * Lifecycle: `reset()` must be called before `remember`, `recall`, `forget` or
 * `applyDecay`; those methods throw `"<name> is not initialized"` otherwise.
 * `close()` releases the database and deletes the temp files.
 */
export class VectorBaselineAdapter implements MemoryProviderAdapter {
  readonly name = 'vector_baseline';
  readonly label = 'Vector Baseline (SQLite)';
  readonly capabilities = {
    associations: false,
    forget: true,
    decay: true,
    portability: false,
  } as const;

  private readonly embedder = new KeywordEmbeddingProvider();
  private db: SqliteDatabaseProvider | null = null;
  private dbPath: string | null = null;

  /** This adapter has no external prerequisites and always reports `available`. */
  async availability(): Promise<ProviderAvailability> {
    return { status: 'available' };
  }

  /**
   * Discards any previous database and opens a fresh one at a new temp path.
   *
   * The new handle is only published on `this.db` after `initialize()` has
   * succeeded. If initialization fails, the partially created files are
   * removed and the error is rethrown, so the adapter stays in the
   * "not initialized" state instead of exposing a half-initialized database.
   *
   * @param _agentId Unused; the whole database is recreated on every reset.
   */
  async reset(_agentId: string): Promise<void> {
    await this.close();
    logger.level = 'silent';

    // The random suffix keeps paths distinct even when several adapters in the
    // same process reset within the same millisecond.
    const uniqueSuffix = Math.random().toString(36).slice(2, 10);
    const dbPath = join(
      tmpdir(),
      `${this.name}-${process.pid}-${Date.now()}-${uniqueSuffix}.sqlite`,
    );
    const db = new SqliteDatabaseProvider(dbPath);

    try {
      await db.initialize();
    } catch (error) {
      try {
        await db.close();
      } catch {
        // Best effort: the initialization error is the one worth reporting.
      }
      await removeSqliteArtifacts(dbPath);
      throw error;
    }

    this.db = db;
    this.dbPath = dbPath;
  }

  /**
   * Stores one benchmark memory for `agentId`.
   *
   * The content is embedded with the keyword embedder. `importance` defaults
   * to 0.75 and `decayScore` starts at 1. The benchmark id and timestamp are
   * copied into metadata as `benchId` / `benchTimestamp`.
   *
   * @throws Error when `reset()` has not been called.
   */
  async remember(memory: BenchmarkMemoryRecord, agentId: string): Promise<void> {
    const db = this.requireDb();

    await db.createMemory({
      id: memory.id,
      agentId,
      type: memory.type,
      content: memory.content,
      embeddingModel: this.embedder.model,
      embedding: await this.embedder.embed(memory.content),
      importance: memory.importance ?? 0.75,
      decayScore: 1,
      tags: memory.tags,
      metadata: {
        ...(memory.metadata ?? {}),
        benchId: memory.id,
        benchTimestamp: memory.timestamp,
      },
    });
  }

  /**
   * Embeds the query (empty string when absent) and runs a vector search for
   * the requesting agent. `limit` defaults to 5 and the similarity threshold
   * (`minScore`) to 0.08.
   *
   * @returns One result per hit, with `source` fixed to `'vector'` and
   *          `memoryId` taken from the `benchId` metadata entry when present,
   *          otherwise from the stored memory id.
   * @throws Error when `reset()` has not been called.
   */
  async recall(
    request: BenchmarkRecallRequest & {
      agentId: string;
    },
  ): Promise<BenchmarkRecallResult[]> {
    const db = this.requireDb();

    const results = await db.searchByVector(
      request.agentId,
      await this.embedder.embed(request.query ?? ''),
      {
        limit: request.limit ?? 5,
        threshold: request.minScore ?? 0.08,
      },
    );

    return results.map((result) => ({
      memoryId: String(result.memory.metadata?.['benchId'] ?? result.memory.id),
      content: result.memory.content,
      score: result.similarity,
      type: result.memory.type as BenchmarkMemoryRecord['type'],
      source: 'vector',
      metadata: result.memory.metadata,
    }));
  }

  /**
   * Deletes the associations of `memoryId` and then the memory itself.
   *
   * @throws Error when `reset()` has not been called.
   */
  async forget(memoryId: string, agentId: string): Promise<void> {
    const db = this.requireDb();

    await db.deleteAssociations(memoryId);
    await db.deleteMemory(memoryId, agentId);
  }

  /**
   * Delegates to the database provider's `applyDecay` and returns its result.
   *
   * @throws Error when `reset()` has not been called.
   */
  async applyDecay(decayRate: number, minScore: number): Promise<number> {
    return this.requireDb().applyDecay(decayRate, minScore);
  }

  /** Reports the current on-disk footprint (`null` when no database is open). */
  async getStats(): Promise<ProviderStats> {
    return {
      storageSizeBytes: await sqliteFootprint(this.dbPath),
    };
  }

  /**
   * Closes the database (if any) and removes its temp files.
   *
   * State is cleared up front and file removal runs in a `finally`, so a
   * failing `db.close()` can no longer leave a stale handle on the adapter or
   * leak the temp files. The close error still propagates to the caller.
   */
  async close(): Promise<void> {
    const db = this.db;
    const dbPath = this.dbPath;
    this.db = null;
    this.dbPath = null;

    try {
      if (db) {
        await db.close();
      }
    } finally {
      if (dbPath) {
        await removeSqliteArtifacts(dbPath);
      }
    }
  }

  /** Returns the open database or throws the shared "not initialized" error. */
  private requireDb(): SqliteDatabaseProvider {
    if (!this.db) {
      throw new Error(`${this.name} is not initialized`);
    }
    return this.db;
  }
}
128
+
129
/**
 * Adds up the sizes of a SQLite database file and its `-wal` / `-shm`
 * companions.
 *
 * @param dbPath Path of the main database file; a falsy value (`null` or an
 *               empty string) means there is nothing to measure.
 * @returns `null` for a falsy `dbPath`, otherwise the byte total. A file that
 *          cannot be stat'ed (for example because it does not exist)
 *          contributes 0.
 */
async function sqliteFootprint(dbPath: string | null): Promise<number | null> {
  if (!dbPath) {
    return null;
  }

  const sizes = await Promise.all(
    ['', '-wal', '-shm'].map(async (suffix) => {
      try {
        const info = await stat(`${dbPath}${suffix}`);
        return info.size;
      } catch {
        // Ignore files that do not exist.
        return 0;
      }
    }),
  );

  return sizes.reduce((sum, size) => sum + size, 0);
}
143
+
144
/**
 * Deletes a SQLite database file together with its `-wal` and `-shm`
 * companions, one after the other.
 *
 * Removal is best-effort: `force: true` makes a missing file a non-error, and
 * any other failure is swallowed so that cleanup never rejects.
 *
 * @param dbPath Path of the main database file.
 */
async function removeSqliteArtifacts(dbPath: string): Promise<void> {
  for (const suffix of ['', '-wal', '-shm']) {
    try {
      await rm(`${dbPath}${suffix}`, { force: true });
    } catch {
      // Deliberately ignored; a leftover temp file must not fail the caller.
    }
  }
}
@@ -0,0 +1,158 @@
1
+ import { readFileSync } from 'node:fs';
2
+ import { resolve } from 'node:path';
3
+ import type {
4
+ BenchmarkCase,
5
+ BenchmarkDataset,
6
+ BenchmarkMemoryRecord,
7
+ BenchmarkMemoryType,
8
+ BenchmarkOperation,
9
+ BenchmarkScenarioType,
10
+ } from '../provider.js';
11
+
12
/**
 * Association entry as written in fixture files. The target may be spelled
 * either `targetId` or `target_id`.
 */
type FixtureAssociation = {
  target_id?: string;
  targetId?: string;
  strength: number;
};

/** One memory record as stored in a fixture JSON file. */
type FixtureMemoryRecord = {
  id: string;
  type: BenchmarkMemoryType;
  timestamp: string;
  content: string;
  tags: string[];
  importance?: number;
  metadata?: Record<string, unknown>;
  associations?: FixtureAssociation[];
};

/** One benchmark question, with the memory ids a recall must / must not return. */
type FixtureQuestion = {
  question_id: string;
  category: string;
  question: string;
  expected_answer: string;
  required_memory_ids: string[];
  forbidden_memory_ids: string[];
};

/** A conversation: the memories to load plus the questions asked against them. */
type FixtureConversation = {
  conversation_id: string;
  agent_id: string;
  memory_records: FixtureMemoryRecord[];
  questions: FixtureQuestion[];
};

/** Top-level shape of a fixture JSON file. */
type FocusedMiniFixture = {
  name: string;
  generated_at: string;
  conversations: FixtureConversation[];
};
48
+
49
/**
 * Maps a fixture question `category` to the benchmark scenario type it is
 * scored under. Entries are grouped by target scenario.
 */
const CATEGORY_TO_SCENARIO: Record<string, BenchmarkScenarioType> = {
  // basic_semantic_recall
  atomic_fact_recall: 'basic_semantic_recall',
  paraphrased_semantic_recall: 'basic_semantic_recall',
  procedural_recall: 'basic_semantic_recall',

  // noise_resistance
  abstention: 'noise_resistance',
  entity_disambiguation: 'noise_resistance',
  noise_resistance: 'noise_resistance',

  // agent_task_context
  context_injection: 'agent_task_context',
  review_behavior: 'agent_task_context',

  // memory_update
  contradiction_resolution: 'memory_update',
  current_preference: 'memory_update',
  temporal_update: 'memory_update',

  // multi_hop_recall
  graph_traversal: 'multi_hop_recall',
  multi_hop: 'multi_hop_recall',
  multi_hop_association: 'multi_hop_recall',
  root_cause_recall: 'multi_hop_recall',

  // portability
  portability: 'portability',
};
67
+
68
/**
 * Loads the bundled "focused mini" fixture as a benchmark dataset.
 *
 * @param packageRoot Directory the fixture path is resolved against.
 * @returns The dataset produced by `createFixtureDataset` for that fixture.
 */
export function createFocusedMiniDataset(packageRoot: string): BenchmarkDataset {
  const focusedMiniFixture = 'fixtures/1mbrain-focused-mini/1mbrain-focused-mini.json';
  return createFixtureDataset(packageRoot, focusedMiniFixture);
}
74
+
75
/**
 * Reads a fixture JSON file and expands it into benchmark cases, one case per
 * question of every conversation.
 *
 * Each case gets its own agent id (`<agent_id>_<question_id>`), the full list
 * of the conversation's memories, and fixed recall options. Questions whose
 * category maps to the `portability` scenario additionally get a single
 * `export_import` operation. Categories that are not listed in
 * `CATEGORY_TO_SCENARIO` fall back to `basic_semantic_recall`.
 *
 * @param packageRoot  Directory that `relativePath` is resolved against.
 * @param relativePath Path of the fixture JSON file, relative to `packageRoot`.
 * @returns The dataset name, generation timestamp and generated cases.
 * @throws TypeError when the parsed file is not an object with a
 *         `conversations` array. File-system and JSON syntax errors from
 *         `readFileSync` / `JSON.parse` propagate unchanged.
 */
export function createFixtureDataset(packageRoot: string, relativePath: string): BenchmarkDataset {
  const fixturePath = resolve(packageRoot, relativePath);

  // Check the one property the loop below depends on before trusting the cast,
  // so a wrong or truncated file fails with a message that names the file.
  const parsed: unknown = JSON.parse(readFileSync(fixturePath, 'utf8'));
  if (
    typeof parsed !== 'object' ||
    parsed === null ||
    !Array.isArray((parsed as { conversations?: unknown }).conversations)
  ) {
    throw new TypeError(
      `Fixture ${fixturePath} must be a JSON object with a "conversations" array`,
    );
  }
  const fixture = parsed as FocusedMiniFixture;

  const cases: BenchmarkCase[] = [];

  for (const conversation of fixture.conversations) {
    // All cases of one conversation share this same array instance.
    const memories = conversation.memory_records.map(toBenchmarkMemory);

    for (const question of conversation.questions) {
      // Own-property lookup only: a plain indexed read would also find
      // inherited members such as "constructor" or "toString" and yield a
      // function instead of a scenario name.
      const mappedScenario = Object.prototype.hasOwnProperty.call(
        CATEGORY_TO_SCENARIO,
        question.category,
      )
        ? CATEGORY_TO_SCENARIO[question.category]
        : undefined;
      const scenarioType: BenchmarkScenarioType = mappedScenario ?? 'basic_semantic_recall';

      const operations: BenchmarkOperation[] =
        scenarioType === 'portability'
          ? [
              {
                kind: 'export_import',
                targetAgentId: `${conversation.agent_id}_${question.question_id}_imported`,
              },
            ]
          : [];

      cases.push({
        scenarioId: question.question_id,
        scenarioType,
        title: question.question_id,
        description: `${conversation.conversation_id}: ${question.category}`,
        agentId: `${conversation.agent_id}_${question.question_id}`,
        memories,
        operations,
        question: question.question,
        expectedAnswer: question.expected_answer,
        recallOptions: {
          limit: 10,
          minScore: 0.08,
          maxHops: 3,
          activationThreshold: 0.05,
          blendWeight: 0.45,
        },
        expectations: {
          requiredMemoryIds: question.required_memory_ids,
          forbiddenMemoryIds: question.forbidden_memory_ids,
          shouldAbstain: question.category === 'abstention' ? true : undefined,
          preferredOver: buildPreferredOver(question),
        },
      });
    }
  }

  return {
    name: fixture.name,
    generatedAt: fixture.generated_at,
    cases,
  };
}
128
+
129
/**
 * Converts a fixture memory record into the benchmark's memory record shape.
 *
 * Scalar fields are copied as-is. Each association's target is read from
 * `targetId`, falling back to `target_id`; associations without a usable
 * (truthy) target are dropped. When the fixture record has no `associations`,
 * the result's `associations` is `undefined`.
 */
function toBenchmarkMemory(memory: FixtureMemoryRecord): BenchmarkMemoryRecord {
  const { id, type, timestamp, content, tags, importance, metadata } = memory;

  const associations = memory.associations?.reduce<Array<{ targetId: string; strength: number }>>(
    (kept, link) => {
      const targetId = link.targetId ?? link.target_id;
      if (targetId) {
        kept.push({ targetId, strength: link.strength });
      }
      return kept;
    },
    [],
  );

  return { id, type, timestamp, content, tags, importance, metadata, associations };
}
144
+
145
/**
 * Builds the "preferred over" ranking expectations for a question.
 *
 * Expectations are only produced when the question lists at least one required
 * and one forbidden memory id AND its category is `current_preference`,
 * `contradiction_resolution` or `temporal_update`. In that case every required
 * id is paired with the full list of forbidden ids as its competitors.
 *
 * @returns One entry per required memory id, or `undefined` when the question
 *          does not qualify.
 */
function buildPreferredOver(question: FixtureQuestion): BenchmarkCase['expectations']['preferredOver'] {
  const { forbidden_memory_ids: competingIds, required_memory_ids: preferredIds } = question;

  const hasBothSides = competingIds.length !== 0 && preferredIds.length !== 0;
  if (!hasBothSides) {
    return undefined;
  }

  switch (question.category) {
    case 'current_preference':
    case 'contradiction_resolution':
    case 'temporal_update':
      return preferredIds.map((preferredId) => ({ preferredId, competingIds }));
    default:
      return undefined;
  }
}