@1mbrain/benchmarks 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. package/README.md +85 -0
  2. package/fixtures/1mbrain-focused-mini/1mbrain-focused-mini.json +928 -0
  3. package/fixtures/1mbrain-focused-mini/README.md +45 -0
  4. package/fixtures/adversarial-memory/dataset_claude_adversarial.json +3333 -0
  5. package/fixtures/adversarial-memory/dataset_gemini_adversarial_memory.json +2984 -0
  6. package/fixtures/balanced-mini/dataset_claude_balanced_mini.json +2077 -0
  7. package/fixtures/balanced-mini/dataset_gemini_balanced_mini.json +1995 -0
  8. package/fixtures/generate_datasets.js +1741 -0
  9. package/fixtures/graph-stress-hard/README.md +43 -0
  10. package/fixtures/graph-stress-hard/dataset_graph_stress_hard.json +4374 -0
  11. package/fixtures/graph-stress-hard/generate_graph_stress_hard.js +526 -0
  12. package/fixtures/realistic-medium/dataset_claude_realistic_medium.json +7462 -0
  13. package/fixtures/realistic-medium/dataset_gemini_realistic_medium.json +7277 -0
  14. package/fixtures/realistic-medium/gen_claude_medium.js +600 -0
  15. package/package.json +22 -0
  16. package/reports/benchmark_report.md +48 -0
  17. package/reports/benchmark_report_claude_adversarial.md +42 -0
  18. package/reports/benchmark_report_claude_adversarial_adaptive.md +42 -0
  19. package/reports/benchmark_report_claude_adversarial_adaptive2_fast.md +42 -0
  20. package/reports/benchmark_report_claude_adversarial_adaptive_fast.md +42 -0
  21. package/reports/benchmark_report_claude_adversarial_rerank.md +42 -0
  22. package/reports/benchmark_report_claude_balanced_mini.md +42 -0
  23. package/reports/benchmark_report_claude_balanced_mini_adaptive.md +42 -0
  24. package/reports/benchmark_report_claude_balanced_mini_adaptive2_fast.md +42 -0
  25. package/reports/benchmark_report_claude_balanced_mini_adaptive_fast.md +42 -0
  26. package/reports/benchmark_report_claude_balanced_mini_rerank.md +42 -0
  27. package/reports/benchmark_report_claude_realistic_medium.md +42 -0
  28. package/reports/benchmark_report_claude_realistic_medium_adaptive.md +42 -0
  29. package/reports/benchmark_report_claude_realistic_medium_adaptive2_fast.md +42 -0
  30. package/reports/benchmark_report_claude_realistic_medium_adaptive_fast.md +42 -0
  31. package/reports/benchmark_report_claude_realistic_medium_evidence_rerank_local.md +42 -0
  32. package/reports/benchmark_report_claude_realistic_medium_openai_evidence_rerank.md +41 -0
  33. package/reports/benchmark_report_claude_realistic_medium_openai_multi_signal.md +41 -0
  34. package/reports/benchmark_report_claude_realistic_medium_openai_multi_signal_scoped.md +41 -0
  35. package/reports/benchmark_report_claude_realistic_medium_openai_phase8_no_judge.md +42 -0
  36. package/reports/benchmark_report_claude_realistic_medium_openai_rankingpolicy.md +41 -0
  37. package/reports/benchmark_report_claude_realistic_medium_openai_stale_filter.md +41 -0
  38. package/reports/benchmark_report_claude_realistic_medium_openai_stale_filter_absence_fix.md +41 -0
  39. package/reports/benchmark_report_claude_realistic_medium_openai_write_time_invalidation.md +41 -0
  40. package/reports/benchmark_report_claude_realistic_medium_rerank.md +42 -0
  41. package/reports/benchmark_report_claude_realistic_medium_stale_filter_local.md +42 -0
  42. package/reports/benchmark_report_graph_stress_hard.md +42 -0
  43. package/reports/benchmark_report_graph_stress_hard_absence_fix.md +42 -0
  44. package/reports/benchmark_report_graph_stress_hard_adaptive.md +42 -0
  45. package/reports/benchmark_report_graph_stress_hard_evidence_rerank.md +42 -0
  46. package/reports/benchmark_report_graph_stress_hard_multi_signal_current_guardrail.md +42 -0
  47. package/reports/benchmark_report_graph_stress_hard_multi_signal_guardrail_fixed.md +42 -0
  48. package/reports/benchmark_report_graph_stress_hard_multi_signal_local.md +42 -0
  49. package/reports/benchmark_report_graph_stress_hard_multi_signal_scoped_guardrail.md +42 -0
  50. package/reports/benchmark_report_graph_stress_hard_multi_signal_vector_pure_guardrail.md +42 -0
  51. package/reports/benchmark_report_graph_stress_hard_phase8_sdk_guardrail.md +42 -0
  52. package/reports/benchmark_report_graph_stress_hard_rerank.md +42 -0
  53. package/reports/benchmark_report_graph_stress_hard_stale_filter.md +42 -0
  54. package/reports/benchmark_report_graph_stress_hard_write_time_invalidation.md +42 -0
  55. package/results/.gitignore +2 -0
  56. package/src/adapters/1mbrain.ts +317 -0
  57. package/src/adapters/keyword-embedding.ts +48 -0
  58. package/src/adapters/mem0.ts +124 -0
  59. package/src/adapters/qdrant.ts +214 -0
  60. package/src/adapters/unavailable.ts +49 -0
  61. package/src/adapters/vector-baseline.ts +149 -0
  62. package/src/datasets/focused-mini.ts +158 -0
  63. package/src/datasets/synthetic-agent-memory.ts +532 -0
  64. package/src/llm-evaluator.ts +262 -0
  65. package/src/metrics.ts +482 -0
  66. package/src/provider.ts +151 -0
  67. package/src/runner.ts +635 -0
  68. package/tsconfig.json +10 -0
  69. package/tsconfig.tsbuildinfo +1 -0
@@ -0,0 +1,262 @@
1
+ import type { BenchmarkCase, BenchmarkRecallResult } from './provider.js';
2
+ import type { CaseEvaluation } from './metrics.js';
3
+
4
/**
 * Verdict produced by an LLM judge for a single benchmark case.
 */
export interface LlmCaseEvaluation {
  /** Model identifier that was sent in the chat-completions request. */
  model: string;
  /** Answer text taken from the judge's `generated_answer` field. */
  generatedAnswer: string;
  /** Judge score taken from `score_0_to_5`, clamped into the range 0 to 5. */
  score0To5: number;
  /** Truthiness of the judge's `hallucination` field. */
  hallucination: boolean;
  /** Explanation text taken from the judge's `rationale` field. */
  rationale: string;
}
11
+
12
/** Assistant message carried by one completion choice; every field may be absent. */
type DeepSeekChatMessage = {
  content?: string;
  reasoning_content?: string;
};

/** One entry of the `choices` array in a chat-completions response. */
type DeepSeekChatChoice = {
  finish_reason?: string;
  message?: DeepSeekChatMessage;
};

/** The subset of the chat-completions response body that this module reads. */
type DeepSeekChatResponse = {
  choices?: DeepSeekChatChoice[];
};
21
+
22
/**
 * JSON object the judge is asked to return. Every key is optional because the
 * reply is model output and may omit any of them.
 */
type DeepSeekEvalPayload = Partial<{
  generated_answer: string;
  score_0_to_5: number;
  hallucination: boolean;
  rationale: string;
}>;
28
+
29
/**
 * Reads `BENCH_LLM_EVAL` and reports which LLM judge, if any, was requested.
 *
 * Matching ignores case and surrounding whitespace, so a value such as
 * `"DeepSeek "` coming from a .env file still selects the evaluator instead
 * of silently disabling it.
 *
 * @returns `'deepseek'` or `'openai'` when selected; `null` when the variable
 *   is unset, blank, or holds any other value.
 */
export function getLlmEvaluatorType(): 'deepseek' | 'openai' | null {
  const requested = (process.env['BENCH_LLM_EVAL'] ?? '').trim().toLowerCase();
  switch (requested) {
    case 'deepseek':
      return 'deepseek';
    case 'openai':
      return 'openai';
    default:
      return null;
  }
}
35
+
36
/**
 * Tells whether an LLM judge has been selected through `BENCH_LLM_EVAL`.
 *
 * @returns `true` when `getLlmEvaluatorType()` yields an evaluator, else `false`.
 */
export function shouldUseLlmEvaluation(): boolean {
  const evaluatorType = getLlmEvaluatorType();
  return evaluatorType !== null;
}
39
+
40
/**
 * Judges one benchmark case with a DeepSeek chat model.
 *
 * The first request asks for JSON mode. If it fails for any reason, the
 * request is repeated once without JSON mode, and the first failure is
 * recorded in the returned rationale.
 *
 * Environment:
 * - `DEEPSEEK_API_KEY` (required)
 * - `BENCH_LLM_MODEL` (default `deepseek-v4-flash`)
 * - `DEEPSEEK_BASE_URL`, then `DEEPSEEK_API_BASE` (default `https://api.deepseek.com`)
 *
 * @param benchmarkCase Case whose question and expectations go into the prompt.
 * @param recallResults Retrieved memories shown to the judge.
 * @returns The judge's verdict.
 * @throws Error when the API key is missing, or when both attempts fail. In the
 *   latter case the message carries the retry failure and the first failure.
 */
export async function evaluateWithDeepSeek(
  benchmarkCase: BenchmarkCase,
  recallResults: BenchmarkRecallResult[],
): Promise<LlmCaseEvaluation> {
  const apiKey = process.env['DEEPSEEK_API_KEY'];
  if (!apiKey) {
    throw new Error('DEEPSEEK_API_KEY is required when BENCH_LLM_EVAL=deepseek');
  }

  // `||` rather than `??`: a variable that is set but blank must fall through
  // to the next candidate instead of yielding an empty model or a relative URL.
  const model = process.env['BENCH_LLM_MODEL']?.trim() || 'deepseek-v4-flash';
  const baseUrl =
    process.env['DEEPSEEK_BASE_URL']?.trim() ||
    process.env['DEEPSEEK_API_BASE']?.trim() ||
    'https://api.deepseek.com';

  const request = { apiKey, baseUrl, model, benchmarkCase, recallResults };
  const messageOf = (error: unknown): string =>
    error instanceof Error ? error.message : String(error);

  try {
    return await requestDeepSeekEvaluation({ ...request, jsonMode: true });
  } catch (jsonModeError) {
    const firstFailure = messageOf(jsonModeError);

    let retry: LlmCaseEvaluation;
    try {
      retry = await requestDeepSeekEvaluation({ ...request, jsonMode: false });
    } catch (retryError) {
      // Keep the first failure visible; otherwise only the retry error surfaces.
      throw new Error(
        `${messageOf(retryError)} (retry_after_json_mode_error=${firstFailure})`,
      );
    }

    return {
      ...retry,
      rationale: `${retry.rationale} (retry_after_json_mode_error=${firstFailure})`,
    };
  }
}
77
+
78
/**
 * Judges one benchmark case with an OpenAI-compatible chat-completions endpoint.
 *
 * Environment:
 * - `OPENAI_API_KEY` (required)
 * - `BENCH_LLM_MODEL` (default `gpt-4o-mini`)
 * - `OPENAI_BASE_URL` (default `https://api.openai.com/v1`; trailing slashes are ignored)
 * - `BENCH_LLM_MAX_TOKENS` (default 1500; non-numeric or non-positive values fall back to it)
 *
 * @param benchmarkCase Case whose question and expectations go into the prompt.
 * @param recallResults Retrieved memories shown to the judge.
 * @returns The judge's verdict.
 * @throws Error when the API key is missing, the HTTP status is not OK, the
 *   response has no textual message content, or the content is not parseable.
 */
export async function evaluateWithOpenAI(
  benchmarkCase: BenchmarkCase,
  recallResults: BenchmarkRecallResult[],
): Promise<LlmCaseEvaluation> {
  const apiKey = process.env['OPENAI_API_KEY'];
  if (!apiKey) {
    throw new Error('OPENAI_API_KEY is required when BENCH_LLM_EVAL=openai');
  }

  // `||` rather than `??` so that a blank variable still gets the default.
  const model = process.env['BENCH_LLM_MODEL']?.trim() || 'gpt-4o-mini';
  const baseUrl = (
    process.env['OPENAI_BASE_URL']?.trim() || 'https://api.openai.com/v1'
  ).replace(/\/+$/, '');

  // NaN would be serialised as `null` by JSON.stringify, so validate first.
  const defaultMaxTokens = 1500;
  const requestedMaxTokens = Number(process.env['BENCH_LLM_MAX_TOKENS'] ?? defaultMaxTokens);
  const maxTokens =
    Number.isFinite(requestedMaxTokens) && requestedMaxTokens >= 1
      ? Math.floor(requestedMaxTokens)
      : defaultMaxTokens;

  const response = await fetch(`${baseUrl}/chat/completions`, {
    method: 'POST',
    headers: {
      authorization: `Bearer ${apiKey}`,
      'content-type': 'application/json',
    },
    body: JSON.stringify({
      model,
      temperature: 0,
      max_tokens: maxTokens,
      response_format: { type: 'json_object' },
      messages: [
        {
          role: 'system',
          content: 'You are a strict memory benchmark evaluator. Use only the provided retrieved memories. Return valid JSON only.',
        },
        {
          role: 'user',
          content: buildPrompt(benchmarkCase, recallResults),
        },
      ],
    }),
  });

  if (!response.ok) {
    const text = await response.text();
    throw new Error(`OpenAI ${response.status}: ${text}`);
  }

  const data = (await response.json()) as {
    choices?: Array<{ message?: { content?: unknown } }>;
  };
  const content = data.choices?.[0]?.message?.content;
  // Only a non-blank string can be parsed; anything else is reported with context.
  if (typeof content !== 'string' || content.trim() === '') {
    throw new Error(`OpenAI response did not include message content: ${JSON.stringify(data).slice(0, 500)}`);
  }

  const payload = parseJsonObject(content);
  return {
    model,
    generatedAnswer: String(payload.generated_answer ?? ''),
    score0To5: clampScore(payload.score_0_to_5),
    hallucination: Boolean(payload.hallucination),
    rationale: String(payload.rationale ?? ''),
  };
}
134
+
135
/**
 * Sends one chat-completions request to DeepSeek and converts the reply into
 * an `LlmCaseEvaluation`.
 *
 * The base URL may be given with or without a trailing `/v1` segment; the
 * request always goes to `<root>/v1/chat/completions`.
 *
 * @param options.apiKey Bearer token for the `authorization` header.
 * @param options.baseUrl API root; trailing slashes are ignored.
 * @param options.model Model identifier sent in the request and echoed in the result.
 * @param options.benchmarkCase Case used to build the user prompt.
 * @param options.recallResults Retrieved memories used to build the user prompt.
 * @param options.jsonMode When true, adds `response_format: { type: 'json_object' }`.
 * @returns The judge's verdict.
 * @throws Error when the HTTP status is not OK, when neither `content` nor
 *   `reasoning_content` holds text, or when the text is not parseable.
 */
async function requestDeepSeekEvaluation(options: {
  apiKey: string;
  baseUrl: string;
  model: string;
  benchmarkCase: BenchmarkCase;
  recallResults: BenchmarkRecallResult[];
  jsonMode: boolean;
}): Promise<LlmCaseEvaluation> {
  // Accept both `https://host` and `https://host/v1` without producing `/v1/v1`.
  const trimmedBase = options.baseUrl.replace(/\/+$/, '');
  const apiRoot = /\/v1$/i.test(trimmedBase) ? trimmedBase : `${trimmedBase}/v1`;

  // NaN would be serialised as `null` by JSON.stringify, so validate first.
  const defaultMaxTokens = 1500;
  const requestedMaxTokens = Number(process.env['BENCH_LLM_MAX_TOKENS'] ?? defaultMaxTokens);
  const maxTokens =
    Number.isFinite(requestedMaxTokens) && requestedMaxTokens >= 1
      ? Math.floor(requestedMaxTokens)
      : defaultMaxTokens;

  const response = await fetch(`${apiRoot}/chat/completions`, {
    method: 'POST',
    headers: {
      authorization: `Bearer ${options.apiKey}`,
      'content-type': 'application/json',
    },
    body: JSON.stringify({
      model: options.model,
      temperature: 0,
      max_tokens: maxTokens,
      ...(options.jsonMode ? { response_format: { type: 'json_object' } } : {}),
      messages: [
        {
          role: 'system',
          content:
            'You are a strict memory benchmark evaluator. Use only the provided retrieved memories. Return valid JSON only.',
        },
        {
          role: 'user',
          content: buildPrompt(options.benchmarkCase, options.recallResults),
        },
      ],
    }),
  });

  if (!response.ok) {
    const text = await response.text();
    throw new Error(`DeepSeek ${response.status}: ${text}`);
  }

  const data = (await response.json()) as DeepSeekChatResponse;
  const message = data.choices?.[0]?.message;
  // Prefer `content`; fall back to `reasoning_content` when `content` is blank.
  const content = message?.content?.trim()
    ? message.content
    : message?.reasoning_content?.trim()
      ? message.reasoning_content
      : undefined;
  if (!content) {
    throw new Error(`DeepSeek response did not include message content: ${JSON.stringify(data).slice(0, 500)}`);
  }

  const payload = parseJsonObject(content);
  return {
    model: options.model,
    generatedAnswer: String(payload.generated_answer ?? ''),
    score0To5: clampScore(payload.score_0_to_5),
    hallucination: Boolean(payload.hallucination),
    rationale: String(payload.rationale ?? ''),
  };
}
193
+
194
/**
 * Writes an LLM judge verdict into an existing case evaluation, in place.
 *
 * - `answerAccuracy` becomes the judge score.
 * - `hallucinationRate` becomes 1 when the judge flagged a hallucination, else 0.
 * - a `llm_evaluator=<model>` note is appended.
 * - `llm_answer_incorrect` is tagged for scores below 4 and `llm_hallucination`
 *   for flagged answers; the tag list is then de-duplicated, keeping first
 *   occurrences in order.
 *
 * @param evaluation Evaluation record to mutate.
 * @param llmEvaluation Verdict to apply.
 */
export function applyLlmEvaluation(
  evaluation: CaseEvaluation,
  llmEvaluation: LlmCaseEvaluation,
): void {
  const { model, score0To5, hallucination } = llmEvaluation;

  evaluation.notes.push(`llm_evaluator=${model}`);
  evaluation.answerAccuracy = score0To5;
  evaluation.hallucinationRate = hallucination ? 1 : 0;

  const answerIsWrong = score0To5 < 4;
  if (answerIsWrong) {
    evaluation.failureTags.push('llm_answer_incorrect');
  }
  if (hallucination) {
    evaluation.failureTags.push('llm_hallucination');
  }

  // A Set keeps insertion order, so the first occurrence of each tag survives.
  evaluation.failureTags = [...new Set(evaluation.failureTags)];
}
209
+
210
/**
 * Renders the user prompt shown to the LLM judge.
 *
 * The prompt lists the question, the expected answer, the required and
 * forbidden memory ids, the retrieved memories (at most
 * `recallOptions.limit`, or 10 when no limit is set), and the task
 * instructions with the required JSON keys.
 *
 * @param benchmarkCase Case supplying question, expectations and recall limit.
 * @param recallResults Retrieved memories, in the order they are shown.
 * @returns The prompt as a newline-joined string.
 */
function buildPrompt(
  benchmarkCase: BenchmarkCase,
  recallResults: BenchmarkRecallResult[],
): string {
  const visibleCount = benchmarkCase.recallOptions.limit ?? 10;
  const memoryBlocks: string[] = [];
  for (const [position, memory] of recallResults.slice(0, visibleCount).entries()) {
    const blockLines = [
      `Memory ${position + 1}`,
      `id: ${memory.memoryId}`,
      `score: ${memory.score}`,
      `content: ${memory.content}`,
    ];
    memoryBlocks.push(blockLines.join('\n'));
  }
  const memorySection = memoryBlocks.join('\n\n');

  const { requiredMemoryIds, forbiddenMemoryIds } = benchmarkCase.expectations;
  const promptLines = [
    `Question: ${benchmarkCase.question}`,
    `Expected answer: ${benchmarkCase.expectedAnswer}`,
    `Required memory ids: ${requiredMemoryIds.join(', ') || '(none)'}`,
    `Forbidden memory ids: ${forbiddenMemoryIds.join(', ') || '(none)'}`,
    '',
    'Retrieved memories:',
    memorySection || '(none)',
    '',
    'Task:',
    '1. Generate a concise answer to the question using only the retrieved memories.',
    '2. Score the generated answer from 0 to 5 against the expected answer.',
    '3. Set hallucination=true if the answer uses unsupported facts or relies on forbidden/stale memory.',
    '',
    'Return JSON with exactly these keys: generated_answer, score_0_to_5, hallucination, rationale.',
  ];
  return promptLines.join('\n');
}
243
+
244
/**
 * Extracts the judge's JSON object from a model reply.
 *
 * The whole reply is parsed first. If that does not yield a plain object
 * (invalid JSON, or valid JSON that is `null`, an array or a scalar), the
 * span from the first `{` to the last `}` is parsed instead, which covers
 * replies wrapped in prose or Markdown code fences.
 *
 * @param content Raw message text returned by the model.
 * @returns The parsed object. Its fields are not validated here.
 * @throws Error when no plain JSON object can be recovered; the message
 *   includes the first 200 characters of the reply.
 */
function parseJsonObject(content: string): DeepSeekEvalPayload {
  const isPlainObject = (value: unknown): value is Record<string, unknown> =>
    typeof value === 'object' && value !== null && !Array.isArray(value);

  const tryParse = (text: string): unknown => {
    try {
      return JSON.parse(text) as unknown;
    } catch {
      return undefined;
    }
  };

  const direct = tryParse(content);
  if (isPlainObject(direct)) {
    return direct as DeepSeekEvalPayload;
  }

  const embedded = content.match(/\{[\s\S]*\}/);
  const extracted = embedded ? tryParse(embedded[0]) : undefined;
  if (isPlainObject(extracted)) {
    return extracted as DeepSeekEvalPayload;
  }

  // Provider-neutral wording: both the DeepSeek and OpenAI paths use this helper.
  throw new Error(`LLM evaluator response was not a JSON object: ${content.slice(0, 200)}`);
}
255
+
256
/**
 * Normalises a judge-supplied score into the inclusive range 0 to 5.
 *
 * Only numbers and non-blank numeric strings count as scores. Any other
 * JSON type (boolean, array, object, null) or a non-finite number yields 0,
 * so malformed output is never coerced into a positive score.
 *
 * @param value Raw `score_0_to_5` value from the parsed reply.
 * @returns The score clamped to 0–5, or 0 when the value is not a usable number.
 */
function clampScore(value: unknown): number {
  let numeric: number;
  if (typeof value === 'number') {
    numeric = value;
  } else if (typeof value === 'string' && value.trim() !== '') {
    numeric = Number(value);
  } else {
    return 0;
  }

  if (!Number.isFinite(numeric)) {
    return 0;
  }
  return Math.max(0, Math.min(5, numeric));
}