@eidentic/bench 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -179,10 +179,263 @@ declare function loadLongMemEval(jsonPath: string, opts?: {
179
179
  * @param opts.maxBytes - Maximum allowed file size in bytes (default 256 MiB).
180
180
  * Increase this only if you are loading a vetted, trusted dataset file.
181
181
  */
182
- declare function loadLoCoMo(jsonPath: string, opts?: {
182
+ declare function loadLoCoMo$1(jsonPath: string, opts?: {
183
183
  maxBytes?: number;
184
184
  }): Promise<BenchDataset>;
185
185
 
186
+ /** One turn parsed from a session. */
187
+ interface LocomoTurn {
188
+ speaker: string;
189
+ diaId: string;
190
+ text: string;
191
+ }
192
+ /** One session parsed from the conversation object. */
193
+ interface LocomoSession {
194
+ /** 1-based session index from the dataset key. */
195
+ index: number;
196
+ /** Human-readable date-time string from the dataset, e.g. "1:56 pm on 8 May, 2023". */
197
+ dateTime: string;
198
+ /** Epoch milliseconds parsed from dateTime (0 when parse fails). */
199
+ dateTimeMs: number;
200
+ turns: LocomoTurn[];
201
+ }
202
+ /**
203
+ * Category definitions:
204
+ * 1 = multi-hop (282 questions)
205
+ * 2 = temporal (321 questions)
206
+ * 3 = open-domain (96 questions)
207
+ * 4 = single-hop (841 questions)
208
+ * 5 = adversarial/unanswerable (446 questions)
209
+ *
210
+ * NOTE: the numeric → semantic mapping above was derived from the per-category question counts in the actual dataset file.
211
+ * Primary score = categories 1–4 (denominator 1540). Category 5 is scored separately as refusal rate.
212
+ */
213
+ type LocomoCategory = 1 | 2 | 3 | 4 | 5;
214
+ /** One QA pair with typed evidence references. */
215
+ interface LocomoQA {
216
+ question: string;
217
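+ /** Gold answer, typed as a string (NOTE(review): numeric gold answers in the raw data are presumably stringified by the loader — confirm). Absent for category-5 (adversarial) questions. */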
218
+ answer?: string;
219
+ category: LocomoCategory;
220
+ /** Dia-id references into the conversation sessions, e.g. ["D1:9", "D3:2"]. */
221
+ evidence: string[];
222
+ /** The trap answer for category 5 — a plausible-sounding but wrong answer. */
223
+ adversarialAnswer?: string;
224
+ }
225
+ /** One LoCoMo sample with parsed sessions and QA. */
226
+ interface LocomoSample {
227
+ sampleId: string;
228
+ speakerA: string;
229
+ speakerB: string;
230
+ sessions: LocomoSession[];
231
+ qa: LocomoQA[];
232
+ }
233
+ /** Typed dataset returned by loadLoCoMo. */
234
+ interface LocomoDataset {
235
+ samples: LocomoSample[];
236
+ }
237
+ /**
238
+ * Resolve evidence dia-ids to their actual turn texts.
239
+ *
240
+ * @param sample - The LoCoMo sample.
241
+ * @param diaIds - Array of dia-id strings, e.g. ["D1:9", "D3:2"].
242
+ * @returns - Array of matched turn texts (unmatched ids are silently skipped).
243
+ */
244
+ declare function resolveEvidence(sample: LocomoSample, diaIds: string[]): string[];
245
+
246
+ /**
247
+ * LoCoMo dataset loader — real schema.
248
+ *
249
+ * The real locomo10.json root is a bare JSON array of samples. Sessions are stored as dynamic
250
+ * keys session_N / session_N_date_time in the conversation object (not a nested array).
251
+ *
252
+ * Dataset source: https://github.com/snap-research/locomo
253
+ * License: CC BY-NC 4.0 — raw data must NOT be committed; results are publishable.
254
+ *
255
+ * LOCOMO_SOURCE_SHA below records the upstream commit that produced the locomo10.json file
256
+ * used in this implementation, for provenance.
257
+ */
258
+
259
+ /** Upstream commit SHA of snap-research/locomo at the time the loader was written. */
260
+ declare const LOCOMO_SOURCE_SHA = "3eb6f2c585f5e1699204e3c3bdf7adc5c28cb376";
261
+ /**
262
+ * Load a LoCoMo JSON file (real schema — bare array of samples with dynamic session keys).
263
+ *
264
+ * @param jsonPath - Absolute or relative path to the locomo10.json file.
265
+ * **Security note:** callers must validate untrusted paths before passing them here.
266
+ * @param opts.maxBytes - Maximum allowed file size (default 256 MiB).
267
+ * @returns - Typed LocomoDataset with parsed sessions and QA.
268
+ */
269
+ declare function loadLoCoMo(jsonPath: string, opts?: {
270
+ maxBytes?: number;
271
+ }): Promise<LocomoDataset>;
272
+
273
+ /**
274
+ * Fair-run LoCoMo benchmark harness.
275
+ *
276
+ * Key fair-run rules (non-negotiable, documented for methodology transparency):
277
+ *
278
+ * 1. BOTH speakers are humans — never map to user/assistant roles.
279
+ * Turns are ingested as "[SpeakerName]: <text>" into one Memory scope per conversation.
280
+ *
281
+ * 2. Timestamps are preserved structurally — session headers are prepended ("Session N — <date>") so
282
+ * temporal questions are answerable; ingestedAt metadata carries the epoch-ms.
283
+ *
284
+ * 3. Memory-mode answer step — retrieve topK <= 10 snippets (never inflate topK to
285
+ * bypass retrieval quality), build a prompt from retrieved snippets, answer concisely.
286
+ *
287
+ * 4. Full-context mode — the MANDATORY baseline. Entire conversation fed as context.
288
+ *
289
+ * 5. Judging — strict LLM judge: correct only when the model answer contains the gold
290
+ * answer's specific information (paraphrase ok; vague/topical-only = wrong).
291
+ * For category 5: correct = model declined; adversarial_answer match = wrong.
292
+ *
293
+ * 6. Metrics — per-category accuracy, overall J(1-4) over the category 1–4 questions actually run (denominator 1540 on the full dataset),
294
+ * category-5 refusal rate separately, token/cost accounting, wall-clock.
295
+ *
296
+ * 7. Determinism — seed recorded; seeded shuffle used when sampleLimit/questionLimit set.
297
+ *
298
+ * 8. Resilience — per-question try/catch; checkpoint-resume via JSONL file.
299
+ */
300
+
301
+ /** Per-question scored row (also used as checkpoint entry). */
302
+ interface LocomoQuestionResult {
303
+ sampleId: string;
304
+ questionIndex: number;
305
+ question: string;
306
+ goldAnswer?: string;
307
+ category: 1 | 2 | 3 | 4 | 5;
308
+ modelAnswer: string;
309
+ correct: boolean;
310
+ /** True when category=5 and the model gave the adversarial trap answer. */
311
+ trapTriggered?: boolean;
312
+ error?: string;
313
+ /** Tokens used in the answer step for this question. */
314
+ answerInputTokens: number;
315
+ answerOutputTokens: number;
316
+ /** Tokens used in the judge step for this question. */
317
+ judgeInputTokens: number;
318
+ judgeOutputTokens: number;
319
+ }
320
+ /** Token / cost summary (no vendor pricing hard-coded — caller provides rates). */
321
+ interface TokenSummary {
322
+ ingestInputTokens: number;
323
+ ingestOutputTokens: number;
324
+ answerInputTokens: number;
325
+ answerOutputTokens: number;
326
+ judgeInputTokens: number;
327
+ judgeOutputTokens: number;
328
+ totalInputTokens: number;
329
+ totalOutputTokens: number;
330
+ }
331
+ /** Per-category accuracy. */
332
+ interface CategoryStats {
333
+ correct: number;
334
+ total: number;
335
+ accuracy: number;
336
+ }
337
+ /** The full benchmark report. */
338
+ interface LocomoReport {
339
+ /** Run configuration (included in every published result for transparency). */
340
+ config: {
341
+ mode: "memory" | "full-context";
342
+ topK: number;
343
+ answerModelId: string;
344
+ judgeModelId: string;
345
+ datasetSha: string;
346
+ seed: number;
347
+ categories: number[];
348
+ samplesRun: number;
349
+ questionsRun: number;
350
+ };
351
+ /** Overall J(1–4): accuracy on categories 1–4 only; denominator = questions actually run in 1–4. */
352
+ overallJ14: CategoryStats;
353
+ /** Per-category breakdown (keys "1" through "5"). */
354
+ byCategory: Record<string, CategoryStats>;
355
+ /** Category-5 refusal rate (separate from J14). */
356
+ cat5RefusalRate?: {
357
+ correct: number;
358
+ total: number;
359
+ rate: number;
360
+ };
361
+ /** Token usage accounting. */
362
+ tokens: TokenSummary;
363
+ /** Wall-clock duration of the run in milliseconds. */
364
+ wallClockMs: number;
365
+ /** Individual question results. */
366
+ questions: LocomoQuestionResult[];
367
+ /** Count of questions that threw errors (counted as wrong, not skipped). */
368
+ errorCount: number;
369
+ }
370
+ /** Factory for a fresh Memory instance, called once per sample. */
371
+ type MemoryFactory = (sampleId: string) => Memory | Promise<Memory>;
372
+ /** Options for runLocomoBench. */
373
+ interface LocomoBenchOptions {
374
+ /** Path to locomo10.json. */
375
+ dataPath: string;
376
+ /** Factory for a fresh Memory per conversation (used only when mode="memory"). */
377
+ memoryFactory?: MemoryFactory;
378
+ /** Model used to generate answers. */
379
+ answerModel: ModelPort;
380
+ /** Model used to judge correctness. */
381
+ judgeModel: ModelPort;
382
+ /** "memory" requires memoryFactory; "full-context" feeds the full conversation as context. */
383
+ mode: "memory" | "full-context";
384
+ /** Categories to include (default: [1,2,3,4,5]). */
385
+ categories?: number[];
386
+ /** Cap on samples to process (for quick pilot runs). */
387
+ sampleLimit?: number;
388
+ /** Cap on questions per sample (applied after seeded shuffle when set). */
389
+ questionLimit?: number;
390
+ /** Random seed for shuffle reproducibility. Default 42. */
391
+ seed?: number;
392
+ /** Max snippets retrieved per question in memory mode. MUST be <= 10 to avoid trivialising retrieval. */
393
+ topK?: number;
394
+ /** Concurrency (questions in flight simultaneously). Default 1. */
395
+ concurrency?: number;
396
+ /** Progress callback: (questionsCompleted, questionsTotal) */
397
+ onProgress?: (done: number, total: number) => void;
398
+ /** Path to a JSONL checkpoint file. Existing rows are skipped on resume. */
399
+ checkpointPath?: string;
400
+ /** Pre-loaded dataset (avoids re-reading the file if already loaded). */
401
+ dataset?: LocomoDataset;
402
+ }
403
+ /**
404
+ * Run the LoCoMo benchmark with the given options.
405
+ *
406
+ * @param opts - Configuration (see LocomoBenchOptions).
407
+ * @returns - Full LocomoReport with metrics, token accounting, and per-question details.
408
+ */
409
+ declare function runLocomoBench(opts: LocomoBenchOptions): Promise<LocomoReport>;
410
+
411
+ /**
412
+ * Markdown results renderer for LoCoMo benchmark reports.
413
+ *
414
+ * Produces a defensible, methodology-transparent table suitable for publication.
415
+ * Per the mandatory fair-run rules, results MUST include:
416
+ * - Model ids and judge model id
417
+ * - topK value
418
+ * - Dataset SHA (provenance)
419
+ * - Mode (memory | full-context)
420
+ * - Seed and n-questions
421
+ */
422
+
423
+ /** Optional price table for cost estimates (per 1M tokens, input/output). */
424
+ interface PriceTable {
425
+ /** Per-million input tokens in USD. */
426
+ inputPer1M: number;
427
+ /** Per-million output tokens in USD. */
428
+ outputPer1M: number;
429
+ }
430
+ /**
431
+ * Render one or more LoCoMo benchmark reports as a Markdown table with mandatory methodology notes.
432
+ *
433
+ * @param reports - Array of LocomoReport objects to compare.
434
+ * @param prices - Optional price table for cost-per-run estimates (per 1M tokens).
435
+ * @returns - Markdown string ready to write to a .md file.
436
+ */
437
+ declare function renderLocomoReportMarkdown(reports: LocomoReport[], prices?: PriceTable): string;
438
+
186
439
  /**
187
440
  * Write-quality benchmark for the Eidentic memory harness.
188
441
  *
@@ -493,4 +746,4 @@ interface TemporalBenchOptions {
493
746
  */
494
747
  declare function runTemporalBench(memory: Memory, dataset: SyntheticTemporalDataset, opts?: TemporalBenchOptions): Promise<TemporalBenchReport>;
495
748
 
496
- export { type BenchCase, type BenchDataset, type BenchOptions, type BenchQuestion, type BenchReport, type BenchTurn, CONTRADICTION_FIXTURES, type CaseResult, type ContradictionFixture, JUNK_STREAM_FIXTURES, type JunkItem, type QuestionResult, type StateTransition, type SyntheticTemporalDataset, type TemporalBenchOptions, type TemporalBenchReport, type TemporalEntity, type TemporalQuestion, type TemporalQuestionResult, type WriteQualityDetail, type WriteQualityOptions, type WriteQualityReport, factRecall, loadLoCoMo, loadLongMemEval, normalizeText, normalizedIncludes, recallAtK, runMemoryBench, runTemporalBench, runWriteQualityBench, syntheticDataset, syntheticTemporalDataset };
749
+ export { type BenchCase, type BenchDataset, type BenchOptions, type BenchQuestion, type BenchReport, type BenchTurn, CONTRADICTION_FIXTURES, type CaseResult, type CategoryStats, type ContradictionFixture, JUNK_STREAM_FIXTURES, type JunkItem, LOCOMO_SOURCE_SHA, type LocomoBenchOptions, type LocomoCategory, type LocomoDataset, type LocomoQA, type LocomoQuestionResult, type LocomoReport, type LocomoSample, type LocomoSession, type LocomoTurn, type MemoryFactory, type PriceTable, type QuestionResult, type StateTransition, type SyntheticTemporalDataset, type TemporalBenchOptions, type TemporalBenchReport, type TemporalEntity, type TemporalQuestion, type TemporalQuestionResult, type TokenSummary, type WriteQualityDetail, type WriteQualityOptions, type WriteQualityReport, factRecall, loadLoCoMo, loadLoCoMo$1 as loadLoCoMoLegacy, loadLongMemEval, normalizeText, normalizedIncludes, recallAtK, renderLocomoReportMarkdown, resolveEvidence, runLocomoBench, runMemoryBench, runTemporalBench, runWriteQualityBench, syntheticDataset, syntheticTemporalDataset };
package/dist/index.d.ts CHANGED
@@ -179,10 +179,263 @@ declare function loadLongMemEval(jsonPath: string, opts?: {
179
179
  * @param opts.maxBytes - Maximum allowed file size in bytes (default 256 MiB).
180
180
  * Increase this only if you are loading a vetted, trusted dataset file.
181
181
  */
182
- declare function loadLoCoMo(jsonPath: string, opts?: {
182
+ declare function loadLoCoMo$1(jsonPath: string, opts?: {
183
183
  maxBytes?: number;
184
184
  }): Promise<BenchDataset>;
185
185
 
186
+ /** One turn parsed from a session. */
187
+ interface LocomoTurn {
188
+ speaker: string;
189
+ diaId: string;
190
+ text: string;
191
+ }
192
+ /** One session parsed from the conversation object. */
193
+ interface LocomoSession {
194
+ /** 1-based session index from the dataset key. */
195
+ index: number;
196
+ /** Human-readable date-time string from the dataset, e.g. "1:56 pm on 8 May, 2023". */
197
+ dateTime: string;
198
+ /** Epoch milliseconds parsed from dateTime (0 when parse fails). */
199
+ dateTimeMs: number;
200
+ turns: LocomoTurn[];
201
+ }
202
+ /**
203
+ * Category definitions:
204
+ * 1 = multi-hop (282 questions)
205
+ * 2 = temporal (321 questions)
206
+ * 3 = open-domain (96 questions)
207
+ * 4 = single-hop (841 questions)
208
+ * 5 = adversarial/unanswerable (446 questions)
209
+ *
210
+ * NOTE: the numeric → semantic mapping above was derived from the per-category question counts in the actual dataset file.
211
+ * Primary score = categories 1–4 (denominator 1540). Category 5 is scored separately as refusal rate.
212
+ */
213
+ type LocomoCategory = 1 | 2 | 3 | 4 | 5;
214
+ /** One QA pair with typed evidence references. */
215
+ interface LocomoQA {
216
+ question: string;
217
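+ /** Gold answer, typed as a string (NOTE(review): numeric gold answers in the raw data are presumably stringified by the loader — confirm). Absent for category-5 (adversarial) questions. */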
218
+ answer?: string;
219
+ category: LocomoCategory;
220
+ /** Dia-id references into the conversation sessions, e.g. ["D1:9", "D3:2"]. */
221
+ evidence: string[];
222
+ /** The trap answer for category 5 — a plausible-sounding but wrong answer. */
223
+ adversarialAnswer?: string;
224
+ }
225
+ /** One LoCoMo sample with parsed sessions and QA. */
226
+ interface LocomoSample {
227
+ sampleId: string;
228
+ speakerA: string;
229
+ speakerB: string;
230
+ sessions: LocomoSession[];
231
+ qa: LocomoQA[];
232
+ }
233
+ /** Typed dataset returned by loadLoCoMo. */
234
+ interface LocomoDataset {
235
+ samples: LocomoSample[];
236
+ }
237
+ /**
238
+ * Resolve evidence dia-ids to their actual turn texts.
239
+ *
240
+ * @param sample - The LoCoMo sample.
241
+ * @param diaIds - Array of dia-id strings, e.g. ["D1:9", "D3:2"].
242
+ * @returns - Array of matched turn texts (unmatched ids are silently skipped).
243
+ */
244
+ declare function resolveEvidence(sample: LocomoSample, diaIds: string[]): string[];
245
+
246
+ /**
247
+ * LoCoMo dataset loader — real schema.
248
+ *
249
+ * The real locomo10.json root is a bare JSON array of samples. Sessions are stored as dynamic
250
+ * keys session_N / session_N_date_time in the conversation object (not a nested array).
251
+ *
252
+ * Dataset source: https://github.com/snap-research/locomo
253
+ * License: CC BY-NC 4.0 — raw data must NOT be committed; results are publishable.
254
+ *
255
+ * LOCOMO_SOURCE_SHA below records the upstream commit that produced the locomo10.json file
256
+ * used in this implementation, for provenance.
257
+ */
258
+
259
+ /** Upstream commit SHA of snap-research/locomo at the time the loader was written. */
260
+ declare const LOCOMO_SOURCE_SHA = "3eb6f2c585f5e1699204e3c3bdf7adc5c28cb376";
261
+ /**
262
+ * Load a LoCoMo JSON file (real schema — bare array of samples with dynamic session keys).
263
+ *
264
+ * @param jsonPath - Absolute or relative path to the locomo10.json file.
265
+ * **Security note:** callers must validate untrusted paths before passing them here.
266
+ * @param opts.maxBytes - Maximum allowed file size (default 256 MiB).
267
+ * @returns - Typed LocomoDataset with parsed sessions and QA.
268
+ */
269
+ declare function loadLoCoMo(jsonPath: string, opts?: {
270
+ maxBytes?: number;
271
+ }): Promise<LocomoDataset>;
272
+
273
+ /**
274
+ * Fair-run LoCoMo benchmark harness.
275
+ *
276
+ * Key fair-run rules (non-negotiable, documented for methodology transparency):
277
+ *
278
+ * 1. BOTH speakers are humans — never map to user/assistant roles.
279
+ * Turns are ingested as "[SpeakerName]: <text>" into one Memory scope per conversation.
280
+ *
281
+ * 2. Timestamps are preserved structurally — session headers are prepended ("Session N — <date>") so
282
+ * temporal questions are answerable; ingestedAt metadata carries the epoch-ms.
283
+ *
284
+ * 3. Memory-mode answer step — retrieve topK <= 10 snippets (never inflate topK to
285
+ * bypass retrieval quality), build a prompt from retrieved snippets, answer concisely.
286
+ *
287
+ * 4. Full-context mode — the MANDATORY baseline. Entire conversation fed as context.
288
+ *
289
+ * 5. Judging — strict LLM judge: correct only when the model answer contains the gold
290
+ * answer's specific information (paraphrase ok; vague/topical-only = wrong).
291
+ * For category 5: correct = model declined; adversarial_answer match = wrong.
292
+ *
293
+ * 6. Metrics — per-category accuracy, overall J(1-4) over the category 1–4 questions actually run (denominator 1540 on the full dataset),
294
+ * category-5 refusal rate separately, token/cost accounting, wall-clock.
295
+ *
296
+ * 7. Determinism — seed recorded; seeded shuffle used when sampleLimit/questionLimit set.
297
+ *
298
+ * 8. Resilience — per-question try/catch; checkpoint-resume via JSONL file.
299
+ */
300
+
301
+ /** Per-question scored row (also used as checkpoint entry). */
302
+ interface LocomoQuestionResult {
303
+ sampleId: string;
304
+ questionIndex: number;
305
+ question: string;
306
+ goldAnswer?: string;
307
+ category: 1 | 2 | 3 | 4 | 5;
308
+ modelAnswer: string;
309
+ correct: boolean;
310
+ /** True when category=5 and the model gave the adversarial trap answer. */
311
+ trapTriggered?: boolean;
312
+ error?: string;
313
+ /** Tokens used in the answer step for this question. */
314
+ answerInputTokens: number;
315
+ answerOutputTokens: number;
316
+ /** Tokens used in the judge step for this question. */
317
+ judgeInputTokens: number;
318
+ judgeOutputTokens: number;
319
+ }
320
+ /** Token / cost summary (no vendor pricing hard-coded — caller provides rates). */
321
+ interface TokenSummary {
322
+ ingestInputTokens: number;
323
+ ingestOutputTokens: number;
324
+ answerInputTokens: number;
325
+ answerOutputTokens: number;
326
+ judgeInputTokens: number;
327
+ judgeOutputTokens: number;
328
+ totalInputTokens: number;
329
+ totalOutputTokens: number;
330
+ }
331
+ /** Per-category accuracy. */
332
+ interface CategoryStats {
333
+ correct: number;
334
+ total: number;
335
+ accuracy: number;
336
+ }
337
+ /** The full benchmark report. */
338
+ interface LocomoReport {
339
+ /** Run configuration (included in every published result for transparency). */
340
+ config: {
341
+ mode: "memory" | "full-context";
342
+ topK: number;
343
+ answerModelId: string;
344
+ judgeModelId: string;
345
+ datasetSha: string;
346
+ seed: number;
347
+ categories: number[];
348
+ samplesRun: number;
349
+ questionsRun: number;
350
+ };
351
+ /** Overall J(1–4): accuracy on categories 1–4 only; denominator = questions actually run in 1–4. */
352
+ overallJ14: CategoryStats;
353
+ /** Per-category breakdown (keys "1" through "5"). */
354
+ byCategory: Record<string, CategoryStats>;
355
+ /** Category-5 refusal rate (separate from J14). */
356
+ cat5RefusalRate?: {
357
+ correct: number;
358
+ total: number;
359
+ rate: number;
360
+ };
361
+ /** Token usage accounting. */
362
+ tokens: TokenSummary;
363
+ /** Wall-clock duration of the run in milliseconds. */
364
+ wallClockMs: number;
365
+ /** Individual question results. */
366
+ questions: LocomoQuestionResult[];
367
+ /** Count of questions that threw errors (counted as wrong, not skipped). */
368
+ errorCount: number;
369
+ }
370
+ /** Factory for a fresh Memory instance, called once per sample. */
371
+ type MemoryFactory = (sampleId: string) => Memory | Promise<Memory>;
372
+ /** Options for runLocomoBench. */
373
+ interface LocomoBenchOptions {
374
+ /** Path to locomo10.json. */
375
+ dataPath: string;
376
+ /** Factory for a fresh Memory per conversation (used only when mode="memory"). */
377
+ memoryFactory?: MemoryFactory;
378
+ /** Model used to generate answers. */
379
+ answerModel: ModelPort;
380
+ /** Model used to judge correctness. */
381
+ judgeModel: ModelPort;
382
+ /** "memory" requires memoryFactory; "full-context" feeds the full conversation as context. */
383
+ mode: "memory" | "full-context";
384
+ /** Categories to include (default: [1,2,3,4,5]). */
385
+ categories?: number[];
386
+ /** Cap on samples to process (for quick pilot runs). */
387
+ sampleLimit?: number;
388
+ /** Cap on questions per sample (applied after seeded shuffle when set). */
389
+ questionLimit?: number;
390
+ /** Random seed for shuffle reproducibility. Default 42. */
391
+ seed?: number;
392
+ /** Max snippets retrieved per question in memory mode. MUST be <= 10 to avoid trivialising retrieval. */
393
+ topK?: number;
394
+ /** Concurrency (questions in flight simultaneously). Default 1. */
395
+ concurrency?: number;
396
+ /** Progress callback: (questionsCompleted, questionsTotal) */
397
+ onProgress?: (done: number, total: number) => void;
398
+ /** Path to a JSONL checkpoint file. Existing rows are skipped on resume. */
399
+ checkpointPath?: string;
400
+ /** Pre-loaded dataset (avoids re-reading the file if already loaded). */
401
+ dataset?: LocomoDataset;
402
+ }
403
+ /**
404
+ * Run the LoCoMo benchmark with the given options.
405
+ *
406
+ * @param opts - Configuration (see LocomoBenchOptions).
407
+ * @returns - Full LocomoReport with metrics, token accounting, and per-question details.
408
+ */
409
+ declare function runLocomoBench(opts: LocomoBenchOptions): Promise<LocomoReport>;
410
+
411
+ /**
412
+ * Markdown results renderer for LoCoMo benchmark reports.
413
+ *
414
+ * Produces a defensible, methodology-transparent table suitable for publication.
415
+ * Per the mandatory fair-run rules, results MUST include:
416
+ * - Model ids and judge model id
417
+ * - topK value
418
+ * - Dataset SHA (provenance)
419
+ * - Mode (memory | full-context)
420
+ * - Seed and n-questions
421
+ */
422
+
423
+ /** Optional price table for cost estimates (per 1M tokens, input/output). */
424
+ interface PriceTable {
425
+ /** Per-million input tokens in USD. */
426
+ inputPer1M: number;
427
+ /** Per-million output tokens in USD. */
428
+ outputPer1M: number;
429
+ }
430
+ /**
431
+ * Render one or more LoCoMo benchmark reports as a Markdown table with mandatory methodology notes.
432
+ *
433
+ * @param reports - Array of LocomoReport objects to compare.
434
+ * @param prices - Optional price table for cost-per-run estimates (per 1M tokens).
435
+ * @returns - Markdown string ready to write to a .md file.
436
+ */
437
+ declare function renderLocomoReportMarkdown(reports: LocomoReport[], prices?: PriceTable): string;
438
+
186
439
  /**
187
440
  * Write-quality benchmark for the Eidentic memory harness.
188
441
  *
@@ -493,4 +746,4 @@ interface TemporalBenchOptions {
493
746
  */
494
747
  declare function runTemporalBench(memory: Memory, dataset: SyntheticTemporalDataset, opts?: TemporalBenchOptions): Promise<TemporalBenchReport>;
495
748
 
496
- export { type BenchCase, type BenchDataset, type BenchOptions, type BenchQuestion, type BenchReport, type BenchTurn, CONTRADICTION_FIXTURES, type CaseResult, type ContradictionFixture, JUNK_STREAM_FIXTURES, type JunkItem, type QuestionResult, type StateTransition, type SyntheticTemporalDataset, type TemporalBenchOptions, type TemporalBenchReport, type TemporalEntity, type TemporalQuestion, type TemporalQuestionResult, type WriteQualityDetail, type WriteQualityOptions, type WriteQualityReport, factRecall, loadLoCoMo, loadLongMemEval, normalizeText, normalizedIncludes, recallAtK, runMemoryBench, runTemporalBench, runWriteQualityBench, syntheticDataset, syntheticTemporalDataset };
749
+ export { type BenchCase, type BenchDataset, type BenchOptions, type BenchQuestion, type BenchReport, type BenchTurn, CONTRADICTION_FIXTURES, type CaseResult, type CategoryStats, type ContradictionFixture, JUNK_STREAM_FIXTURES, type JunkItem, LOCOMO_SOURCE_SHA, type LocomoBenchOptions, type LocomoCategory, type LocomoDataset, type LocomoQA, type LocomoQuestionResult, type LocomoReport, type LocomoSample, type LocomoSession, type LocomoTurn, type MemoryFactory, type PriceTable, type QuestionResult, type StateTransition, type SyntheticTemporalDataset, type TemporalBenchOptions, type TemporalBenchReport, type TemporalEntity, type TemporalQuestion, type TemporalQuestionResult, type TokenSummary, type WriteQualityDetail, type WriteQualityOptions, type WriteQualityReport, factRecall, loadLoCoMo, loadLoCoMo$1 as loadLoCoMoLegacy, loadLongMemEval, normalizeText, normalizedIncludes, recallAtK, renderLocomoReportMarkdown, resolveEvidence, runLocomoBench, runMemoryBench, runTemporalBench, runWriteQualityBench, syntheticDataset, syntheticTemporalDataset };