@remnic/cli 1.0.3 → 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1144 @@
1
+ // ../bench/dist/index.js
2
+ import { mkdir, mkdtemp, rm } from "fs/promises";
3
+ import { tmpdir } from "os";
4
+ import path from "path";
5
+ import { Orchestrator, parseConfig } from "@remnic/core";
6
+ import fs from "fs";
7
+ import path4 from "path";
8
+ import { randomUUID } from "crypto";
9
+ import { readFile as readFile2 } from "fs/promises";
10
+ import path3 from "path";
11
+ import { execSync } from "child_process";
12
+ import { mkdir as mkdir2, readFile, writeFile } from "fs/promises";
13
+ import path2 from "path";
14
/**
 * Creates an Orchestrator rooted in a fresh temporary directory for one benchmark run.
 *
 * Most optional subsystems are switched off; LCM and chunking stay on. When `mode`
 * is "direct", extraction dedupe and the recall planner are enabled and the
 * extraction thresholds are low; for any other mode the thresholds are set to 1e6.
 * `overrides` is spread last, so it wins over every default below.
 *
 * @param {string} mode - Adapter mode; also embedded in the temp directory name.
 * @param {object} [overrides] - Extra config keys merged over the defaults.
 * @returns {Promise<{tempDir: string, orchestrator: object}>}
 * @throws {Error} When setup fails or the initialized orchestrator has no `lcmEngine`.
 *   The temporary directory is removed before the error propagates.
 */
async function createBenchOrchestrator(mode, overrides) {
  const tempDir = await mkdtemp(path.join(tmpdir(), `remnic-bench-${mode}-`));
  try {
    await mkdir(path.join(tempDir, "state"), { recursive: true });
    const orchestrator = new Orchestrator(
      parseConfig({
        memoryDir: tempDir,
        workspaceDir: tempDir,
        qmdEnabled: false,
        qmdColdTierEnabled: false,
        transcriptEnabled: false,
        hourlySummariesEnabled: false,
        daySummaryEnabled: false,
        identityEnabled: false,
        identityContinuityEnabled: false,
        namespacesEnabled: false,
        sharedContextEnabled: false,
        workTasksEnabled: false,
        workProjectsEnabled: false,
        commitmentLedgerEnabled: false,
        resumeBundlesEnabled: false,
        nativeKnowledge: { enabled: false },
        lcmEnabled: true,
        lcmLeafBatchSize: 4,
        lcmRollupFanIn: 3,
        lcmFreshTailTurns: 8,
        lcmMaxDepth: 4,
        lcmDeterministicMaxTokens: 512,
        lcmRecallBudgetShare: 1,
        extractionDedupeEnabled: mode === "direct",
        extractionMinChars: mode === "direct" ? 10 : 1e6,
        extractionMinUserTurns: mode === "direct" ? 0 : 1e6,
        recallPlannerEnabled: mode === "direct",
        queryExpansionEnabled: false,
        rerankEnabled: false,
        memoryBoxesEnabled: false,
        traceWeaverEnabled: false,
        threadingEnabled: false,
        factDeduplicationEnabled: false,
        knowledgeIndexEnabled: false,
        entityRetrievalEnabled: false,
        verifiedRecallEnabled: false,
        queryAwareIndexingEnabled: false,
        contradictionDetectionEnabled: false,
        memoryLinkingEnabled: false,
        topicExtractionEnabled: false,
        chunkingEnabled: true,
        episodeNoteModeEnabled: false,
        ...overrides
      })
    );
    await orchestrator.initialize();
    if (!orchestrator.lcmEngine) {
      throw new Error("Remnic benchmark adapter requires LCM to be enabled.");
    }
    return { tempDir, orchestrator };
  } catch (error) {
    // Do not leak the temp directory when setup fails; cleanup is best-effort
    // so the original failure is the one the caller sees.
    await rm(tempDir, { recursive: true, force: true }).catch(() => {});
    throw error;
  }
}
70
/**
 * Returns an async factory that builds a benchmark adapter for the given mode.
 *
 * Each adapter owns one orchestrator/temp-directory pair created through
 * `createBenchOrchestrator`; `reset` tears that pair down and builds a new one,
 * and `destroy` tears it down for good.
 *
 * @param {string} mode - Passed through to `createBenchOrchestrator`.
 * @returns {(options?: {configOverrides?: object}) => Promise<object>}
 */
function createAdapterFactory(mode) {
  return async function createAdapter(options = {}) {
    let state = await createBenchOrchestrator(mode, options.configOverrides);

    // Always read the engine from the current state: `reset` swaps it out.
    function getEngine() {
      const { lcmEngine } = state.orchestrator;
      if (lcmEngine) {
        return lcmEngine;
      }
      throw new Error("LCM engine unavailable for Remnic benchmark adapter.");
    }

    async function cleanup() {
      state.orchestrator.lcmEngine?.close();
      await rm(state.tempDir, { recursive: true, force: true });
    }

    async function rebuild() {
      await cleanup();
      state = await createBenchOrchestrator(mode, options.configOverrides);
    }

    const formatSearchHit = (hit) => `[turn ${hit.turn_index}, ${hit.role}]: ${hit.content}`;
    const formatRawMessage = (message) => `[${message.role}]: ${message.content}`;

    return {
      /** Forwards the messages (role and content only) to the engine. */
      async store(sessionId, messages) {
        const observed = messages.map(({ role, content }) => ({ role, content }));
        await getEngine().observeMessages(sessionId, observed);
      },

      /**
       * Assembles recall text for a session: search hits (when a query is given),
       * then the engine's assembled recall, falling back to raw messages only when
       * both are empty. The result is cut to `budgetChars` (default 32000).
       */
      async recall(sessionId, query, budgetChars) {
        const engine = getEngine();
        const budget = budgetChars ?? 32e3;
        const sections = [];

        if (query) {
          const hits = await engine.searchContextFull(query, 20, sessionId);
          if (hits.length > 0) {
            sections.push(`## Search results\n${hits.map(formatSearchHit).join("\n\n")}`);
          }
        }

        const assembled = await engine.assembleRecall(sessionId, budget);
        if (assembled) {
          sections.push(assembled);
        }

        if (sections.length === 0) {
          const stats = await engine.getStats(sessionId);
          if (stats.totalMessages > 0) {
            const rawMessages = await engine.expandContext(
              sessionId,
              0,
              stats.totalMessages - 1,
              Math.floor(budget / 4)
            );
            if (rawMessages.length > 0) {
              sections.push(`## Raw messages\n${rawMessages.map(formatRawMessage).join("\n")}`);
            }
          }
        }

        const combined = sections.join("\n\n");
        if (combined.length > budget) {
          return combined.slice(0, budget);
        }
        return combined;
      },

      /** Runs an engine search and renames the snake_case fields to camelCase. */
      async search(query, limit, sessionId) {
        const hits = await getEngine().searchContext(query, limit, sessionId);
        return hits.map((hit) => ({
          turnIndex: hit.turn_index,
          role: hit.role,
          snippet: hit.snippet,
          sessionId: hit.session_id
        }));
      },

      /** Discards all state; the session id argument is ignored. */
      async reset(_sessionId) {
        await rebuild();
      },

      async getStats(sessionId) {
        return getEngine().getStats(sessionId);
      },

      async destroy() {
        await cleanup();
      }
    };
  };
}
156
// Adapter factories for the two supported modes, in the order [lightweight, direct].
var [createLightweightAdapter, createRemnicAdapter] = ["lightweight", "direct"].map(
  (mode) => createAdapterFactory(mode)
);
158
/**
 * JSON-schema-style description of a benchmark result document.
 *
 * Built from small factories so every node is a distinct object; key order within
 * each node is `type`, `required`, `properties`.
 */
var BENCHMARK_RESULT_SCHEMA = (() => {
  const string = () => ({ type: "string" });
  const number = () => ({ type: "number" });
  const anyObject = () => ({ type: "object" });
  const stringEnum = (...values) => ({ type: "string", enum: values });
  const arrayOf = (items) => ({ type: "array", items });
  const record = (required, properties) => ({ type: "object", required, properties });
  // A provider slot is either null or an object naming provider and model.
  const nullableProvider = () => ({
    anyOf: [
      { type: "null" },
      record(["provider", "model"], {
        provider: string(),
        model: string(),
        baseUrl: string()
      })
    ]
  });

  const meta = record(
    [
      "id",
      "benchmark",
      "benchmarkTier",
      "version",
      "remnicVersion",
      "gitSha",
      "timestamp",
      "mode",
      "runCount",
      "seeds"
    ],
    {
      id: string(),
      benchmark: string(),
      benchmarkTier: stringEnum("published", "remnic", "custom"),
      version: string(),
      remnicVersion: string(),
      gitSha: string(),
      timestamp: string(),
      mode: stringEnum("full", "quick"),
      runCount: number(),
      seeds: arrayOf(number())
    }
  );

  const config = record(
    ["systemProvider", "judgeProvider", "adapterMode", "remnicConfig"],
    {
      systemProvider: nullableProvider(),
      judgeProvider: nullableProvider(),
      adapterMode: string(),
      remnicConfig: anyObject()
    }
  );

  const cost = record(
    [
      "totalTokens",
      "inputTokens",
      "outputTokens",
      "estimatedCostUsd",
      "totalLatencyMs",
      "meanQueryLatencyMs"
    ],
    {
      totalTokens: number(),
      inputTokens: number(),
      outputTokens: number(),
      estimatedCostUsd: number(),
      totalLatencyMs: number(),
      meanQueryLatencyMs: number()
    }
  );

  const task = record(
    ["taskId", "question", "expected", "actual", "scores", "latencyMs", "tokens"],
    {
      taskId: string(),
      question: string(),
      expected: string(),
      actual: string(),
      scores: anyObject(),
      latencyMs: number(),
      tokens: record(["input", "output"], { input: number(), output: number() })
    }
  );

  const results = record(["tasks", "aggregates"], {
    tasks: arrayOf(task),
    aggregates: anyObject(),
    statistics: anyObject()
  });

  const environment = record(["os", "nodeVersion"], {
    os: string(),
    nodeVersion: string(),
    hardware: string()
  });

  return record(["meta", "config", "cost", "results", "environment"], {
    meta,
    config,
    cost,
    results,
    environment
  });
})();
305
/**
 * Minimal client for an OpenAI-compatible HTTP API (chat completions and model
 * listing) that keeps a running total of the token usage reported by the server.
 */
var OpenAiCompatibleProvider = class {
  provider = "openai";
  id;
  name;
  config;
  usage = {
    inputTokens: 0,
    outputTokens: 0,
    totalTokens: 0
  };

  /** @param {{model: string, baseUrl?: string, apiKey?: string, headers?: object}} config */
  constructor(config) {
    const { model } = config;
    this.config = config;
    this.id = `openai:${model}`;
    this.name = model;
  }

  /**
   * Sends one chat-completion request.
   *
   * @param {string} prompt - User message content.
   * @param {{systemPrompt?: string, temperature?: number, maxTokens?: number, headers?: object}} [opts]
   * @returns {Promise<{text: string, tokens: {input: number, output: number}, latencyMs: number, model: string}>}
   * @throws {Error} When the response status is not OK.
   */
  async complete(prompt, opts = {}) {
    const startedAt = performance.now();

    const messages = [];
    if (opts.systemPrompt) {
      messages.push({ role: "system", content: opts.systemPrompt });
    }
    messages.push({ role: "user", content: prompt });

    const requestBody = {
      model: this.config.model,
      messages,
      temperature: opts.temperature,
      max_tokens: opts.maxTokens
    };
    const response = await fetch(this.urlFor("chat/completions"), {
      method: "POST",
      headers: this.headers(opts.headers),
      body: JSON.stringify(requestBody)
    });
    if (!response.ok) {
      throw new Error(
        `OpenAI-compatible completion failed: ${response.status} ${response.statusText}`
      );
    }

    const payload = await response.json();
    // Missing usage figures count as zero tokens.
    const input = payload.usage?.prompt_tokens ?? 0;
    const output = payload.usage?.completion_tokens ?? 0;
    this.recordUsage(input, output);

    return {
      text: readMessageText(payload),
      tokens: { input, output },
      latencyMs: Math.round(performance.now() - startedAt),
      model: payload.model ?? this.config.model
    };
  }

  /**
   * Lists the models exposed by the endpoint.
   *
   * @returns {Promise<object[]>} One descriptor per entry of `payload.data`.
   * @throws {Error} When the response status is not OK.
   */
  async discover() {
    const response = await fetch(this.urlFor("models"), {
      method: "GET",
      headers: this.headers()
    });
    if (!response.ok) {
      throw new Error(
        `OpenAI-compatible model discovery failed: ${response.status} ${response.statusText}`
      );
    }

    const payload = await response.json();
    const listed = payload.data ?? [];
    return listed.map((model) => {
      const descriptor = {
        id: model.id,
        name: model.name ?? model.id,
        contextLength: model.context_length ?? 0,
        capabilities: model.capabilities ?? ["completion"]
      };
      // Optional fields are only present when the server reported a truthy value.
      if (model.quantization) {
        descriptor.quantization = model.quantization;
      }
      if (model.parameter_count) {
        descriptor.parameterCount = model.parameter_count;
      }
      return descriptor;
    });
  }

  /** Returns a copy of the accumulated usage counters. */
  getUsage() {
    return { ...this.usage };
  }

  /** Zeroes the accumulated usage counters. */
  resetUsage() {
    this.usage = { inputTokens: 0, outputTokens: 0, totalTokens: 0 };
  }

  /** Builds request headers; later sources override earlier ones. */
  headers(extraHeaders = {}) {
    const { apiKey, headers: configuredHeaders } = this.config;
    const authorization = apiKey ? { authorization: `Bearer ${apiKey}` } : {};
    return {
      "content-type": "application/json",
      ...authorization,
      ...(configuredHeaders ?? {}),
      ...extraHeaders
    };
  }

  /** Adds one call's token counts to the running totals. */
  recordUsage(inputTokens, outputTokens) {
    const previous = this.usage;
    this.usage = {
      inputTokens: previous.inputTokens + inputTokens,
      outputTokens: previous.outputTokens + outputTokens,
      totalTokens: previous.totalTokens + inputTokens + outputTokens
    };
  }

  /** Joins the base URL and a path with exactly one slash between them. */
  urlFor(pathname) {
    const base = this.config.baseUrl ?? "https://api.openai.com/v1";
    const trimmedBase = base.replace(/\/$/, "");
    const trimmedPath = pathname.replace(/^\//, "");
    return `${trimmedBase}/${trimmedPath}`;
  }
};
406
/**
 * Extracts the text of the first choice from a chat-completion payload.
 *
 * String content is returned unchanged; array content has each part's `text`
 * concatenated and the result trimmed; anything else yields "".
 *
 * @param {object} payload - Parsed completion response.
 * @returns {string}
 */
function readMessageText(payload) {
  const content = payload.choices?.[0]?.message?.content;
  if (Array.isArray(content)) {
    let combined = "";
    for (const part of content) {
      combined += part.text ?? "";
    }
    return combined.trim();
  }
  return typeof content === "string" ? content : "";
}
416
/**
 * Convenience constructor wrapper for `OpenAiCompatibleProvider`.
 *
 * @param {object} config - Passed to the provider constructor unchanged.
 * @returns {object} A new provider instance.
 */
function createOpenAiCompatibleProvider(config) {
  const provider = new OpenAiCompatibleProvider(config);
  return provider;
}
419
/**
 * Single-question dataset bundled for quick-mode smoke runs when no dataset
 * directory is supplied.
 */
var LONG_MEM_EVAL_SMOKE_FIXTURE = (() => {
  const parisSession = [
    { role: "user", content: "I moved to Paris last year." },
    { role: "assistant", content: "Paris sounds great." }
  ];
  const parisQuestion = {
    question_id: 1,
    question_type: "single-session-user",
    question: "What city does the user live in?",
    answer: "Paris",
    question_date: "2025-01-01",
    haystack_dates: ["2024-12-01"],
    haystack_session_ids: ["session-1"],
    haystack_sessions: [parisSession],
    answer_session_ids: ["session-1"]
  };
  return [parisQuestion];
})();
437
/**
 * Scores 1 when both texts are equal after normalization, otherwise 0.
 *
 * @param {*} predicted
 * @param {*} expected
 * @returns {0|1}
 */
function exactMatch(predicted, expected) {
  const left = normalizeText(predicted);
  const right = normalizeText(expected);
  if (left === right) {
    return 1;
  }
  return 0;
}
440
/**
 * Token-level F1 between a prediction and the expected answer.
 *
 * Two empty token lists score 1; one empty list scores 0. Overlap counts each
 * shared token at most as often as it occurs on both sides.
 *
 * @param {*} predicted
 * @param {*} expected
 * @returns {number} A value between 0 and 1.
 */
function f1Score(predicted, expected) {
  const predictedTokens = tokenize(predicted);
  const expectedTokens = tokenize(expected);
  const predictedEmpty = predictedTokens.length === 0;
  const expectedEmpty = expectedTokens.length === 0;
  if (predictedEmpty && expectedEmpty) {
    return 1;
  }
  if (predictedEmpty || expectedEmpty) {
    return 0;
  }

  const predictedCounts = frequencyMap(predictedTokens);
  let overlap = 0;
  frequencyMap(expectedTokens).forEach((count, token) => {
    overlap += Math.min(count, predictedCounts.get(token) ?? 0);
  });
  if (overlap === 0) {
    return 0;
  }

  const precision = overlap / predictedTokens.length;
  const recall = overlap / expectedTokens.length;
  return 2 * precision * recall / (precision + recall);
}
456
/**
 * ROUGE-L style F-measure based on the longest common token subsequence.
 *
 * Two empty token lists score 1; one empty list scores 0.
 *
 * @param {*} predicted
 * @param {*} expected
 * @returns {number} A value between 0 and 1.
 */
function rougeL(predicted, expected) {
  const predictedTokens = tokenize(predicted);
  const expectedTokens = tokenize(expected);
  const emptySides = [predictedTokens, expectedTokens].filter((tokens) => tokens.length === 0).length;
  if (emptySides === 2) {
    return 1;
  }
  if (emptySides === 1) {
    return 0;
  }

  const shared = longestCommonSubsequence(predictedTokens, expectedTokens);
  const precision = shared / predictedTokens.length;
  const recall = shared / expectedTokens.length;
  const denominator = precision + recall;
  if (denominator === 0) {
    return 0;
  }
  return 2 * precision * recall / denominator;
}
467
/**
 * Fraction of distinct relevant items found among the first `k` retrieved items,
 * comparing normalized text. An empty `relevant` list scores 1.
 *
 * @param {Array} retrieved - Ranked candidates.
 * @param {Array} relevant - Ground-truth items.
 * @param {number} k - Cut-off rank.
 * @returns {number}
 */
function recallAtK(retrieved, relevant, k) {
  if (relevant.length === 0) {
    return 1;
  }
  const wanted = new Set(relevant.map((entry) => normalizeText(entry)));
  const found = new Set();
  for (const candidate of retrieved.slice(0, k)) {
    const normalized = normalizeText(candidate);
    if (wanted.has(normalized)) {
      found.add(normalized);
    }
  }
  return found.size / wanted.size;
}
476
/**
 * Scores 1 when the normalized prediction contains the normalized expected
 * answer as a substring. An empty expected answer always scores 0.
 *
 * @param {*} predicted
 * @param {*} expected
 * @returns {0|1}
 */
function containsAnswer(predicted, expected) {
  const needle = normalizeText(expected);
  if (needle.length === 0) {
    return 0;
  }
  const haystack = normalizeText(predicted);
  if (haystack.includes(needle)) {
    return 1;
  }
  return 0;
}
481
/**
 * Asks an optional judge to score a prediction.
 *
 * @param {{score: Function}|null|undefined} judge
 * @param {string} question
 * @param {string} predicted
 * @param {string} expected
 * @returns {Promise<number>} The judge's score, or -1 when there is no judge or
 *   the judge throws or rejects.
 */
async function llmJudgeScore(judge, question, predicted, expected) {
  const UNAVAILABLE = -1;
  if (!judge) {
    return UNAVAILABLE;
  }
  try {
    const verdict = await judge.score(question, predicted, expected);
    return verdict;
  } catch {
    // Judging is best-effort: a failure is reported as "no score".
    return UNAVAILABLE;
  }
}
489
/**
 * Awaits `fn` and reports how long it took.
 *
 * @param {() => any} fn - Function (sync or async) to run.
 * @returns {Promise<{result: any, durationMs: number}>} Duration is rounded to
 *   whole milliseconds. Errors from `fn` propagate unchanged.
 */
async function timed(fn) {
  const begin = performance.now();
  const result = await fn();
  const elapsed = performance.now() - begin;
  return { result, durationMs: Math.round(elapsed) };
}
497
/**
 * Summarizes every metric that appears in a list of per-task score objects.
 *
 * @param {object[]} metricsList - One `{metricName: number}` object per task.
 * @returns {object} Map from metric name to its summary statistics.
 */
function aggregateTaskScores(metricsList) {
  const summaries = Object.entries(collectMetricValues(metricsList)).map(
    ([metricName, values]) => [metricName, summarizeMetricValues(values)]
  );
  return Object.fromEntries(summaries);
}
505
/**
 * Groups numeric metric values by metric name.
 *
 * Non-number and NaN values are ignored; a metric with no usable values is
 * omitted. Each returned list is sorted ascending.
 *
 * @param {object[]} metricsList - One score object per task.
 * @returns {Object<string, number[]>} Keys follow first appearance in the input.
 */
function collectMetricValues(metricsList) {
  const names = new Set(metricsList.flatMap((metrics) => Object.keys(metrics)));
  const collected = {};
  names.forEach((name) => {
    const numeric = [];
    for (const metrics of metricsList) {
      const value = metrics[name];
      if (typeof value === "number" && !Number.isNaN(value)) {
        numeric.push(value);
      }
    }
    if (numeric.length === 0) {
      return;
    }
    numeric.sort((a, b) => a - b);
    collected[name] = numeric;
  });
  return collected;
}
522
/**
 * Computes mean, median, population standard deviation, min and max.
 *
 * The input is sorted into a private copy first, so callers may pass values in
 * any order and their array is never mutated. For an already ascending list the
 * result is the same as before this copy was introduced.
 *
 * @param {number[]} values - Numeric samples; expected to be non-empty (an empty
 *   list yields NaN/undefined fields).
 * @returns {{mean: number, median: number, stdDev: number, min: number, max: number}}
 */
function summarizeMetricValues(values) {
  const sorted = [...values].sort((left, right) => left - right);
  const count = sorted.length;
  const middle = Math.floor(count / 2);
  const mean = sorted.reduce((sum, value) => sum + value, 0) / count;
  const median = count % 2 === 0 ? (sorted[middle - 1] + sorted[middle]) / 2 : sorted[middle];
  // Population variance: divide by N, not N - 1.
  const variance = sorted.reduce((sum, value) => sum + (value - mean) ** 2, 0) / count;
  return {
    mean,
    median,
    stdDev: Math.sqrt(variance),
    min: sorted[0],
    max: sorted[count - 1]
  };
}
534
/**
 * Converts any value to trimmed, lower-cased text; null and undefined become "".
 *
 * @param {*} value
 * @returns {string}
 */
function normalizeText(value) {
  const text = value == null ? "" : String(value);
  const trimmed = text.trim();
  return trimmed.toLowerCase();
}
537
/**
 * Splits text into normalized word tokens.
 *
 * Everything that is not a letter, combining mark, digit, underscore or
 * whitespace is treated as a separator. The character classes are Unicode-aware:
 * with the ASCII-only `\w`, accented and non-Latin letters were stripped, so
 * e.g. CJK answers tokenized to an empty list. ASCII input tokenizes exactly as
 * it did with `\w`.
 *
 * @param {*} value - Any value; normalized through `normalizeText` first.
 * @returns {string[]} Non-empty tokens in order of appearance.
 */
function tokenize(value) {
  return normalizeText(value)
    .replace(/[^\p{L}\p{M}\p{N}_\s]/gu, " ")
    .split(/\s+/)
    .filter((token) => token.length > 0);
}
540
/**
 * Counts how often each token occurs.
 *
 * @param {Iterable<string>} tokens
 * @returns {Map<string, number>} Keys in order of first occurrence.
 */
function frequencyMap(tokens) {
  const tally = /* @__PURE__ */ new Map();
  for (const token of tokens) {
    const seenBefore = tally.has(token);
    tally.set(token, seenBefore ? tally.get(token) + 1 : 1);
  }
  return tally;
}
547
/**
 * Length of the longest common subsequence of two token lists.
 *
 * Uses the standard dynamic-programming recurrence while keeping only the row
 * above the one being filled.
 *
 * @param {Array} left
 * @param {Array} right
 * @returns {number}
 */
function longestCommonSubsequence(left, right) {
  const width = right.length + 1;
  let above = new Array(width).fill(0);
  for (let row = 0; row < left.length; row += 1) {
    const leftToken = left[row];
    const filling = new Array(width).fill(0);
    for (let column = 1; column < width; column += 1) {
      filling[column] = leftToken === right[column - 1]
        ? above[column - 1] + 1
        : Math.max(above[column], filling[column - 1]);
    }
    above = filling;
  }
  return above[right.length];
}
566
/**
 * Makes a string safe to embed in a filename: trims it and replaces every
 * character outside `a-z A-Z 0-9 . _ -` with an underscore.
 *
 * @param {string} value
 * @returns {string} The cleaned text, or "unknown" when nothing is left.
 */
function sanitizeFilenameSegment(value) {
  const cleaned = value.trim().replace(/[^a-zA-Z0-9._-]/g, "_");
  return cleaned || "unknown";
}
570
/**
 * Writes a benchmark result as pretty-printed JSON into `outputDir`.
 *
 * The filename is `<benchmark>-v<remnicVersion>-<timestamp>.json`. Both the
 * benchmark id and the version are sanitized so neither can introduce path
 * separators and make the file land outside `outputDir`; ids made only of
 * letters, digits, `.`, `_` and `-` produce the same filename as before.
 *
 * @param {object} result - Result document; reads `meta.benchmark`,
 *   `meta.remnicVersion` and `meta.timestamp`.
 * @param {string} outputDir - Directory to write into (created if missing).
 * @returns {Promise<string>} Path of the written file.
 */
async function writeBenchmarkResult(result, outputDir) {
  await mkdir2(outputDir, { recursive: true });
  // String() keeps non-string ids working the way template interpolation did.
  const safeBenchmark = sanitizeFilenameSegment(String(result.meta.benchmark));
  const safeRemnicVersion = sanitizeFilenameSegment(result.meta.remnicVersion);
  // ":" and "." from ISO timestamps are awkward in filenames on some platforms.
  const timestamp = result.meta.timestamp.replace(/[:.]/g, "-");
  const filePath = path2.join(
    outputDir,
    `${safeBenchmark}-v${safeRemnicVersion}-${timestamp}.json`
  );
  await writeFile(filePath, JSON.stringify(result, null, 2) + "\n");
  return filePath;
}
581
/**
 * Reads the `version` field of the package.json three directories above this
 * module.
 *
 * @returns {Promise<string>} The version string, or "unknown" when the file
 *   cannot be read or parsed, or has no string `version`.
 */
async function getRemnicVersion() {
  let version = "unknown";
  try {
    const manifestPath = path2.resolve(import.meta.dirname, "../../../package.json");
    const manifest = JSON.parse(await readFile(manifestPath, "utf8"));
    if (typeof manifest.version === "string") {
      version = manifest.version;
    }
  } catch {
    // Best-effort lookup: any failure leaves the "unknown" fallback in place.
  }
  return version;
}
594
/**
 * Returns the short SHA of the current git HEAD.
 *
 * stderr of the git child process is discarded: the "not a git repository" (or
 * "git not found") case is already handled by the fallback, so it should not
 * leak onto the parent's stderr.
 *
 * @returns {string} The short commit hash, or "unknown" when git fails.
 */
function getGitSha() {
  try {
    return execSync("git rev-parse --short HEAD", {
      encoding: "utf8",
      stdio: ["ignore", "pipe", "ignore"]
    }).trim();
  } catch {
    return "unknown";
  }
}
601
/** Registry entry (without the runner) for the LongMemEval benchmark. */
var longMemEvalDefinition = (() => {
  const meta = {
    name: "longmemeval",
    version: "2.0.0",
    description: "Long-term memory evaluation across information extraction, multi-session reasoning, temporal reasoning, and knowledge updates.",
    category: "retrieval",
    citation: "Wu et al. LongMemEval: Benchmarking Chat Assistants on Long-Term Interactive Memory. ICLR 2025."
  };
  return {
    id: "longmemeval",
    title: "LongMemEval",
    tier: "published",
    status: "ready",
    runnerAvailable: true,
    meta
  };
})();
615
/**
 * Runs the LongMemEval benchmark against a memory system.
 *
 * For every dataset item the system is reset, each haystack session is stored,
 * recall is performed per session for the question (timed as one unit), a
 * search is issued, and the concatenated recall text is scored against the
 * expected answer.
 *
 * @param {object} options - Reads `mode`, `datasetDir`, `limit`, `system`,
 *   `benchmark`, `seed`, `systemProvider`, `judgeProvider`, `adapterMode` and
 *   `remnicConfig`.
 * @returns {Promise<object>} A result document with `meta`, `config`, `cost`,
 *   `results` and `environment` sections.
 */
async function runLongMemEvalBenchmark(options) {
  const { system } = options;
  const dataset = await loadDataset(options.mode, options.datasetDir, options.limit);
  const tasks = [];

  // Stores every haystack session in order and returns the session ids used.
  const storeHaystack = async (item) => {
    const storedIds = [];
    for (let index = 0; index < item.haystack_sessions.length; index += 1) {
      const sessionId = item.haystack_session_ids[index] ?? `session-${index}`;
      const messages = item.haystack_sessions[index].map(({ role, content }) => ({ role, content }));
      storedIds.push(sessionId);
      if (messages.length > 0) {
        await system.store(sessionId, messages);
      }
    }
    return storedIds;
  };

  // Recalls from all sessions concurrently and joins the non-empty texts.
  const recallAcross = async (sessionIds, question) => {
    const perSession = await Promise.all(
      sessionIds.map((sessionId) => system.recall(sessionId, question))
    );
    return perSession.filter(Boolean).join("\n\n");
  };

  for (const item of dataset) {
    await system.reset();
    const sessionIds = await storeHaystack(item);
    const { result: recalledText, durationMs } = await timed(
      () => recallAcross(sessionIds, item.question)
    );
    const searchResults = await system.search(item.question, 10);
    const judgeScore = await llmJudgeScore(
      system.judge,
      item.question,
      recalledText,
      item.answer
    );

    const scores = {
      f1: f1Score(recalledText, item.answer),
      contains_answer: containsAnswer(recalledText, item.answer),
      search_hits: searchResults.length
    };
    // A negative judge score means "no judge verdict"; leave the metric out.
    if (judgeScore >= 0) {
      scores.llm_judge = judgeScore;
    }

    tasks.push({
      taskId: `q${item.question_id}`,
      question: item.question,
      expected: item.answer,
      actual: recalledText,
      scores,
      latencyMs: durationMs,
      tokens: { input: 0, output: 0 },
      details: {
        questionType: item.question_type,
        questionDate: item.question_date,
        haystackDates: item.haystack_dates,
        haystackSessionIds: item.haystack_session_ids,
        answerSessionIds: item.answer_session_ids
      }
    });
  }

  const remnicVersion = await getRemnicVersion();
  const sumOver = (pick) => tasks.reduce((sum, task) => sum + pick(task), 0);
  const totalLatencyMs = sumOver((task) => task.latencyMs);
  const totalInputTokens = sumOver((task) => task.tokens.input);
  const totalOutputTokens = sumOver((task) => task.tokens.output);

  const meta = {
    id: randomUUID(),
    benchmark: options.benchmark.id,
    benchmarkTier: options.benchmark.tier,
    version: options.benchmark.meta.version,
    remnicVersion,
    gitSha: getGitSha(),
    timestamp: (/* @__PURE__ */ new Date()).toISOString(),
    mode: options.mode,
    runCount: 1,
    seeds: [options.seed ?? 0]
  };
  const config = {
    systemProvider: options.systemProvider ?? null,
    judgeProvider: options.judgeProvider ?? null,
    adapterMode: options.adapterMode ?? "direct",
    remnicConfig: options.remnicConfig ?? {}
  };
  const cost = {
    totalTokens: totalInputTokens + totalOutputTokens,
    inputTokens: totalInputTokens,
    outputTokens: totalOutputTokens,
    estimatedCostUsd: 0,
    totalLatencyMs,
    meanQueryLatencyMs: tasks.length > 0 ? totalLatencyMs / tasks.length : 0
  };
  const results = {
    tasks,
    aggregates: aggregateTaskScores(tasks.map((task) => task.scores))
  };
  const environment = {
    os: process.platform,
    nodeVersion: process.version,
    hardware: process.arch
  };

  return { meta, config, cost, results, environment };
}
722
/**
 * Loads LongMemEval items from `datasetDir`, or the bundled smoke fixture when no
 * directory is given and the mode is not "full".
 *
 * Candidate files are tried in order; a file that is missing, unparsable, not a
 * JSON array, or empty after applying the limit is recorded and the next
 * candidate is tried. A falsy `limit` (including 0) means "no limit".
 *
 * @param {string} mode - "full" requires `datasetDir`.
 * @param {string} [datasetDir] - Directory containing one of the dataset files.
 * @param {number} [limit] - Maximum number of items to return.
 * @returns {Promise<object[]>} A non-empty array of dataset items.
 * @throws {Error} When no usable dataset is found, when full mode has no
 *   `datasetDir`, or when the limit leaves no items.
 */
async function loadDataset(mode, datasetDir, limit) {
  const applyLimit = (items) => (limit ? items.slice(0, limit) : items);
  const ensureDatasetItems = (items) => {
    if (items.length === 0) {
      throw new Error(
        "LongMemEval dataset is empty after applying the requested limit."
      );
    }
    return items;
  };
  if (datasetDir) {
    const datasetErrors = [];
    for (const filename of [
      "longmemeval_oracle.json",
      "longmemeval_s_cleaned.json",
      "longmemeval.json"
    ]) {
      try {
        const raw = await readFile2(path3.join(datasetDir, filename), "utf8");
        const parsed = JSON.parse(raw);
        // Without this check an object-shaped file passed through when no limit
        // was set and only failed later, far from the cause.
        if (!Array.isArray(parsed)) {
          throw new Error("expected a JSON array of dataset items");
        }
        return ensureDatasetItems(applyLimit(parsed));
      } catch (error) {
        datasetErrors.push(
          `${filename}: ${error instanceof Error ? error.message : String(error)}`
        );
        continue;
      }
    }
    throw new Error(
      `LongMemEval dataset not found under ${datasetDir}. Tried longmemeval_oracle.json, longmemeval_s_cleaned.json, and longmemeval.json. Errors: ${datasetErrors.join(" | ")}`
    );
  }
  if (mode === "full") {
    throw new Error(
      "LongMemEval full mode requires datasetDir. Pass a dataset path or use quick mode to run the bundled smoke fixture."
    );
  }
  if (LONG_MEM_EVAL_SMOKE_FIXTURE.length === 0) {
    throw new Error("LongMemEval dataset not found and bundled smoke fixture is empty.");
  }
  // A limit that removes every fixture item is reported as such, not as a
  // missing fixture.
  return ensureDatasetItems(applyLimit(LONG_MEM_EVAL_SMOKE_FIXTURE));
}
764
/**
 * All known benchmarks in listing order. Only LongMemEval carries a `run`
 * function; the others are placeholders marked as planned.
 */
var REGISTERED_BENCHMARKS = (() => {
  // Placeholder entry for a published benchmark that has no runner yet.
  const planned = (id, title, description, category) => ({
    id,
    title,
    tier: "published",
    status: "planned",
    runnerAvailable: false,
    meta: {
      name: id,
      version: "1.0.0",
      description,
      category
    }
  });
  return [
    planned("ama-bench", "AMA-Bench", "Long-horizon agentic memory benchmark.", "agentic"),
    planned("memory-arena", "MemoryArena", "Interdependent multi-session task benchmark.", "agentic"),
    planned("amemgym", "AMemGym", "Interactive personalization benchmark.", "agentic"),
    { ...longMemEvalDefinition, run: runLongMemEvalBenchmark },
    planned("locomo", "LoCoMo", "Long conversation memory benchmark.", "conversational")
  ];
})();
822
/**
 * Lists every registered benchmark without its runtime-only fields.
 *
 * @returns {object[]} Definitions in registry order.
 */
function listBenchmarks() {
  const definitions = [];
  for (let index = 0; index < REGISTERED_BENCHMARKS.length; index += 1) {
    definitions.push(stripRuntimeFields(REGISTERED_BENCHMARKS[index], index, REGISTERED_BENCHMARKS));
  }
  return definitions;
}
825
/**
 * Looks up a benchmark definition by id, without runtime-only fields.
 *
 * @param {string} id
 * @returns {object|undefined} The definition, or undefined when the id is unknown.
 */
function getBenchmark(id) {
  const match = REGISTERED_BENCHMARKS.find((candidate) => candidate.id === id);
  if (!match) {
    return void 0;
  }
  return stripRuntimeFields(match);
}
829
/**
 * Looks up the full registry entry (including `run`, when present) by id.
 *
 * @param {string} id
 * @returns {object|undefined}
 */
function getRegisteredBenchmark(id) {
  const hasRequestedId = (candidate) => candidate.id === id;
  return REGISTERED_BENCHMARKS.find(hasRequestedId);
}
832
/**
 * Copies only the descriptive fields of a registry entry, dropping anything
 * else (such as `run`).
 *
 * @param {object} benchmark
 * @returns {{id: *, title: *, tier: *, status: *, runnerAvailable: *, meta: *}}
 */
function stripRuntimeFields(benchmark) {
  const { id, title, tier, status, runnerAvailable, meta } = benchmark;
  return { id, title, tier, status, runnerAvailable, meta };
}
842
// Default baseline and report locations, both under "<cwd>/benchmarks" as
// resolved when this module is loaded.
var [DEFAULT_BASELINE_PATH, DEFAULT_REPORT_PATH] = ["baseline.json", "report.json"].map(
  (fileName) => path4.join(process.cwd(), "benchmarks", fileName)
);
// Baseline file format version that loadBaseline compares against.
var BASELINE_VERSION = 1;
// Default regression tolerance (used as the fallback for `config.regressionTolerance`).
var DEFAULT_TOLERANCE = 10;
// Queries used by the bench suite when the caller supplies none.
var DEFAULT_QUERIES = [
  "What is the storage?",
  "How do I access storage?",
  "What categories exist?",
  "How is memory organized?",
  "What is the recall budget?",
  "What is the extraction pipeline?",
  "What facts are stored about the project?",
  "What is the architecture?"
];
856
/**
 * Reads the high-resolution timer as whole milliseconds.
 *
 * @returns {number} `process.hrtime()` converted to milliseconds, with the
 *   nanosecond part rounded to the nearest millisecond.
 */
function hrTimeMs() {
  const stamp = process.hrtime();
  const wholeSecondsMs = stamp[0] * 1e3;
  const fractionMs = Math.round(stamp[1] / 1e6);
  return wholeSecondsMs + fractionMs;
}
860
/**
 * Runs a registered benchmark by id.
 *
 * @param {string} benchmarkId
 * @param {object} options - Forwarded to the runner; `mode` defaults to "quick"
 *   and `benchmark` is set to the stripped definition.
 * @returns {Promise<*>} Whatever the benchmark's `run` function resolves to.
 * @throws {Error} When the id is unknown or the entry has no runner.
 */
async function runBenchmark(benchmarkId, options) {
  const entry = getRegisteredBenchmark(benchmarkId);
  if (!entry) {
    const knownIds = listBenchmarks().map((benchmark) => benchmark.id).join(", ");
    throw new Error(
      `Unknown benchmark "${benchmarkId}". Available benchmarks: ${knownIds}`
    );
  }
  if (!entry.run) {
    throw new Error(
      `Benchmark "${benchmarkId}" is listed but has not been migrated into @remnic/bench yet.`
    );
  }
  const runOptions = {
    ...options,
    mode: options.mode ?? "quick",
    benchmark: benchmarkDefinition(entry.id)
  };
  return entry.run(runOptions);
}
878
/**
 * Like `getBenchmark`, but treats a missing definition as an error.
 *
 * @param {string} id
 * @returns {object} The stripped benchmark definition.
 * @throws {Error} When no definition exists for `id`.
 */
function benchmarkDefinition(id) {
  const found = getBenchmark(id);
  if (found) {
    return found;
  }
  throw new Error(`Benchmark definition disappeared for "${id}".`);
}
885
/**
 * Reads a baseline JSON file.
 *
 * @param {string} [baselinePath] - Defaults to `DEFAULT_BASELINE_PATH`.
 * @returns {object|undefined} The parsed baseline, or undefined when the file
 *   does not exist or cannot be read or parsed. A version mismatch only logs a
 *   warning; the baseline is still returned.
 */
function loadBaseline(baselinePath) {
  const target = baselinePath ?? DEFAULT_BASELINE_PATH;
  if (fs.existsSync(target)) {
    try {
      const contents = fs.readFileSync(target, "utf8");
      const baseline = JSON.parse(contents);
      const versionMatches = baseline.version === BASELINE_VERSION;
      if (!versionMatches) {
        console.warn(
          `Baseline version mismatch: expected ${BASELINE_VERSION}, got ${baseline.version}`
        );
      }
      return baseline;
    } catch {
      // Unreadable or malformed baselines are treated as absent.
      return void 0;
    }
  }
  return void 0;
}
902
/**
 * Writes a baseline as pretty-printed JSON followed by a newline, creating the
 * parent directory when needed.
 *
 * @param {string} baselinePath - Destination file.
 * @param {object} baseline - Value to serialize.
 */
function saveBaseline(baselinePath, baseline) {
  const parentDir = path4.dirname(baselinePath);
  fs.mkdirSync(parentDir, { recursive: true });
  const serialized = JSON.stringify(baseline, null, 2);
  fs.writeFileSync(baselinePath, `${serialized}\n`);
}
907
/**
 * Classifies which recall "tier" answers a query by issuing up to five recall
 * requests and stopping at the first tier whose condition holds.
 *
 * Tiers, in order: `exact_match` (a preview contains the whole query),
 * `category_match` (a preview contains any query word longer than two
 * characters), `high_confidence` (a result has tags), `semantic_search` (any
 * result), `full_search` (any result in "full" mode), then `no_results`.
 * The first four requests use mode "auto"; only the last uses "full".
 *
 * Results without a `preview` are treated as having an empty preview instead of
 * raising a TypeError, matching how missing `tags` are already tolerated.
 *
 * @param {{recall: Function}} service - Must expose `recall({query, mode})`
 *   resolving to an object with an optional `results` array.
 * @param {string} query
 * @returns {Promise<{tiers: string[], tierDetails: Array<{tier: string, latencyMs: number, resultsCount: number}>}>}
 */
async function recallWithTiers(service, query) {
  const loweredQuery = query.toLowerCase();
  const queryWords = loweredQuery.split(/\s+/).filter((word) => word.length > 2);
  const previewOf = (memory) => String(memory?.preview ?? "").toLowerCase();

  // Issues one recall request and measures it with the module's millisecond clock.
  const timedRecall = async (mode) => {
    const startedAt = hrTimeMs();
    const response = await service.recall({ query, mode });
    const latencyMs = hrTimeMs() - startedAt;
    return { results: response.results ?? [], latencyMs };
  };
  const outcome = (tier, latencyMs, resultsCount) => ({
    tiers: [tier],
    tierDetails: [{ tier, latencyMs, resultsCount }]
  });

  const exact = await timedRecall("auto");
  if (exact.results.some((memory) => previewOf(memory).includes(loweredQuery))) {
    return outcome("exact_match", exact.latencyMs, exact.results.length);
  }

  const keyword = await timedRecall("auto");
  const matchesAnyWord = (memory) => {
    const preview = previewOf(memory);
    return queryWords.some((word) => preview.includes(word));
  };
  if (keyword.results.some(matchesAnyWord)) {
    return outcome("category_match", keyword.latencyMs, keyword.results.length);
  }

  const confidence = await timedRecall("auto");
  const tagged = confidence.results.filter((memory) => memory?.tags?.length > 0);
  if (tagged.length > 0) {
    return outcome("high_confidence", confidence.latencyMs, tagged.length);
  }

  const semantic = await timedRecall("auto");
  if (semantic.results.length > 0) {
    return outcome("semantic_search", semantic.latencyMs, semantic.results.length);
  }

  const full = await timedRecall("full");
  if (full.results.length > 0) {
    return outcome("full_search", full.latencyMs, full.results.length);
  }

  // Nothing matched: report the time spent across all five attempts.
  const totalLatencyMs = exact.latencyMs + keyword.latencyMs + confidence.latencyMs + semantic.latencyMs + full.latencyMs;
  return outcome("no_results", totalLatencyMs, 0);
}
1001
/**
 * Run one query through the tiered recall helper and describe which tier
 * answered it.
 *
 * @param {object} service - Recall service handed straight to `recallWithTiers`.
 * @param {string} query - Query text to explain.
 * @returns {Promise<{query: string, tiersUsed: string[], tierResults: object[], durationMs: number, totalDurationMs: number}>}
 *   `durationMs` is the latency recorded on the first tier detail, falling
 *   back to the wall-clock time of the whole call when that latency is
 *   null/undefined; `totalDurationMs` is always the wall-clock time.
 */
async function runExplain(service, query) {
  const startedAt = hrTimeMs();
  const outcome = await recallWithTiers(service, query);
  const tierNames = outcome.tiers;
  const details = outcome.tierDetails;
  const elapsedMs = hrTimeMs() - startedAt;

  // Prefer the latency of the tier that produced the answer over the
  // end-to-end time, which also includes any tiers that were probed first.
  const leadingTierLatency = details[0]?.latencyMs;

  const explanation = {
    query,
    tiersUsed: tierNames,
    tierResults: details,
    durationMs: leadingTierLatency ?? elapsedMs,
    totalDurationMs: elapsedMs
  };
  return explanation;
}
1013
/**
 * Benchmark a single query: time one `recallWithTiers` call and summarise it.
 *
 * @param {object} service - Recall service handed straight to `recallWithTiers`.
 * @param {string} queryText - Query text to run.
 * @returns {Promise<object>} Result record with `query`, `latencyMs`,
 *   `tiersUsed`, `throughput` (queries per second, 0 when the elapsed time is
 *   not positive), `resultsCount` (sum over all tier details),
 *   `totalDurationMs` and `tierDetails`.
 */
async function runSingle(service, queryText) {
  const startedAt = hrTimeMs();
  const outcome = await recallWithTiers(service, queryText);
  const tierNames = outcome.tiers;
  const details = outcome.tierDetails;
  const elapsedMs = hrTimeMs() - startedAt;

  // Guard against a zero (or negative) elapsed time to avoid dividing by zero.
  let queriesPerSecond = 0;
  if (elapsedMs > 0) {
    queriesPerSecond = 1 / (elapsedMs / 1e3);
  }

  const matchedCount = details.reduce((running, detail) => running + detail.resultsCount, 0);

  return {
    query: queryText,
    latencyMs: elapsedMs,
    tiersUsed: tierNames,
    throughput: queriesPerSecond,
    resultsCount: matchedCount,
    totalDurationMs: elapsedMs,
    tierDetails: details
  };
}
1027
/**
 * Run every configured query sequentially, build a report, and compare the
 * measured latencies against a stored baseline.
 *
 * Queries run one after another (never in parallel). When no baseline can be
 * loaded, the current metrics are saved as the new baseline.
 *
 * @param {object} service - Recall service passed to the per-query runners.
 * @param {object} [config] - Optional overrides.
 * @param {string[]} [config.queries] - Queries to run (defaults to `DEFAULT_QUERIES`).
 * @param {number} [config.regressionTolerance] - Allowed slowdown in percent
 *   (defaults to `DEFAULT_TOLERANCE`).
 * @param {string} [config.baselinePath] - Baseline file location
 *   (defaults to `DEFAULT_BASELINE_PATH`).
 * @param {string} [config.reportPath] - Report file location
 *   (defaults to `DEFAULT_REPORT_PATH`).
 * @param {boolean} [config.explain] - Use `runExplain` instead of `runSingle`.
 * @returns {Promise<{results: object[], report: object, totalDurationMs: number, regressions: object[]}>}
 */
async function runBenchSuite(service, config = {}) {
  const queries = config.queries ?? DEFAULT_QUERIES;
  const regressionTolerance = config.regressionTolerance ?? DEFAULT_TOLERANCE;
  const baselinePath = config.baselinePath ?? DEFAULT_BASELINE_PATH;
  const reportPath = config.reportPath ?? DEFAULT_REPORT_PATH;
  const explain = config.explain ?? false;

  // Adapt an explain record to the same shape that `runSingle` produces.
  const explainAsResult = async (text) => {
    const explained = await runExplain(service, text);
    const elapsed = explained.totalDurationMs;
    return {
      query: explained.query,
      latencyMs: explained.durationMs,
      tiersUsed: explained.tiersUsed,
      throughput: elapsed > 0 ? 1 / (elapsed / 1e3) : 0,
      resultsCount: explained.tierResults.reduce(
        (running, detail) => running + detail.resultsCount,
        0
      ),
      totalDurationMs: elapsed,
      tierDetails: explained.tierResults
    };
  };

  const results = [];
  const suiteStart = hrTimeMs();
  for (const text of queries) {
    const record = explain ? await explainAsResult(text) : await runSingle(service, text);
    results.push(record);
  }
  const totalDurationMs = hrTimeMs() - suiteStart;

  // Latency per query text; a repeated query keeps its last measurement.
  const metrics = {};
  results.forEach((record) => {
    metrics[record.query] = record.latencyMs;
  });

  const report = generateReport(results, reportPath);
  const baseline = loadBaseline(baselinePath);
  const { regressions } = checkRegression(metrics, baseline, regressionTolerance);

  if (!baseline) {
    // First run: persist the current numbers so later runs have a reference.
    const snapshot = {
      version: BASELINE_VERSION,
      timestamp: new Date().toISOString(),
      metrics
    };
    saveBaseline(baselinePath, snapshot);
  }

  return { results, report, totalDurationMs, regressions };
}
1076
/**
 * Compare current metric values against a stored baseline.
 *
 * Only metrics that the baseline itself records (as own properties with a
 * finite numeric value) are compared. Metric names are free-form strings, so
 * a name such as "toString" or "constructor" must not be resolved through
 * `Object.prototype`, and a `null`/non-numeric baseline entry carries no
 * usable reference value; both are skipped just like a missing entry.
 *
 * @param {Record<string, number>} metrics - Current values keyed by metric name.
 * @param {{metrics?: Record<string, number>} | null | undefined} baseline -
 *   Previously saved baseline, or a falsy value when none exists.
 * @param {number} tolerance - Maximum allowed increase, in percent.
 * @returns {{passed: boolean, regressions: Array<{metric: string, currentValue: number, baselineValue: number, tolerance: number, passed: boolean}>}}
 *   One entry per compared metric (passing entries included); `passed` is
 *   true when every entry passed, and trivially true without a baseline.
 */
function checkRegression(metrics, baseline, tolerance) {
  if (!baseline) {
    return { passed: true, regressions: [] };
  }
  // A baseline object without a metrics map simply has nothing to compare.
  const baselineMetrics = baseline.metrics ?? {};
  const regressions = [];
  for (const [metric, currentValue] of Object.entries(metrics)) {
    // Own-property lookup only: never pick up inherited prototype members.
    if (!Object.prototype.hasOwnProperty.call(baselineMetrics, metric)) {
      continue;
    }
    const baselineValue = baselineMetrics[metric];
    // Number.isFinite does not coerce, so null/strings/NaN are rejected too.
    if (!Number.isFinite(baselineValue)) {
      continue;
    }
    // A non-positive baseline cannot yield a meaningful percentage; such a
    // metric is treated as unchanged (0%).
    const changePercent = baselineValue > 0 ? (currentValue - baselineValue) / baselineValue * 100 : 0;
    regressions.push({
      metric,
      currentValue,
      baselineValue,
      tolerance,
      passed: changePercent <= tolerance
    });
  }
  return {
    passed: regressions.every((regression) => regression.passed),
    regressions
  };
}
1100
/**
 * Build the benchmark report object and, when a path is given, write it to
 * disk as pretty-printed JSON followed by a trailing newline.
 *
 * @param {object[]} results - Per-query result records; each contributes its
 *   `query`, `tiersUsed`, `latencyMs` (reported as `durationMs`),
 *   `resultsCount`, `throughput`, `tierDetails` and `totalDurationMs`.
 * @param {string} [reportPath] - Destination file; parent directories are
 *   created as needed. A falsy value skips the write.
 * @returns {{timestamp: string, queries: object[], totalDurationMs: number}}
 *   The report, with `totalDurationMs` summed over all results.
 */
function generateReport(results, reportPath) {
  const timestamp = new Date().toISOString();

  const queries = results.map(
    ({ query, tiersUsed, latencyMs, resultsCount, throughput, tierDetails }) => ({
      query,
      tiersUsed,
      durationMs: latencyMs,
      resultsCount,
      throughput,
      tierDetails
    })
  );

  const totalDurationMs = results.reduce(
    (running, entry) => running + entry.totalDurationMs,
    0
  );

  const report = { timestamp, queries, totalDurationMs };

  if (!reportPath) {
    return report;
  }

  const serialized = JSON.stringify(report, null, 2) + "\n";
  fs.mkdirSync(path4.dirname(reportPath), { recursive: true });
  fs.writeFileSync(reportPath, serialized);
  return report;
}
1120
+
1121
+ export {
1122
+ createLightweightAdapter,
1123
+ createRemnicAdapter,
1124
+ BENCHMARK_RESULT_SCHEMA,
1125
+ createOpenAiCompatibleProvider,
1126
+ exactMatch,
1127
+ f1Score,
1128
+ rougeL,
1129
+ recallAtK,
1130
+ containsAnswer,
1131
+ llmJudgeScore,
1132
+ timed,
1133
+ aggregateTaskScores,
1134
+ writeBenchmarkResult,
1135
+ listBenchmarks,
1136
+ getBenchmark,
1137
+ runBenchmark,
1138
+ loadBaseline,
1139
+ saveBaseline,
1140
+ runExplain,
1141
+ runBenchSuite,
1142
+ checkRegression,
1143
+ generateReport
1144
+ };