@eidentic/bench 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -167,7 +167,7 @@ declare const syntheticDataset: BenchDataset;
167
167
  * @param opts.maxBytes - Maximum allowed file size in bytes (default 256 MiB).
168
168
  * Increase this only if you are loading a vetted, trusted dataset file.
169
169
  */
170
- declare function loadLongMemEval(jsonPath: string, opts?: {
170
+ declare function loadLongMemEval$1(jsonPath: string, opts?: {
171
171
  maxBytes?: number;
172
172
  }): Promise<BenchDataset>;
173
173
  /**
@@ -179,10 +179,542 @@ declare function loadLongMemEval(jsonPath: string, opts?: {
179
179
  * @param opts.maxBytes - Maximum allowed file size in bytes (default 256 MiB).
180
180
  * Increase this only if you are loading a vetted, trusted dataset file.
181
181
  */
182
- declare function loadLoCoMo(jsonPath: string, opts?: {
182
+ declare function loadLoCoMo$1(jsonPath: string, opts?: {
183
183
  maxBytes?: number;
184
184
  }): Promise<BenchDataset>;
185
185
 
186
+ /** One turn parsed from a session. */
187
+ interface LocomoTurn {
188
+ speaker: string;
189
+ diaId: string;
190
+ text: string;
191
+ }
192
+ /** One session parsed from the conversation object. */
193
+ interface LocomoSession {
194
+ /** 1-based session index from the dataset key. */
195
+ index: number;
196
+ /** Human-readable date-time string from the dataset, e.g. "1:56 pm on 8 May, 2023". */
197
+ dateTime: string;
198
+ /** Epoch milliseconds parsed from dateTime (0 when parse fails). */
199
+ dateTimeMs: number;
200
+ turns: LocomoTurn[];
201
+ }
202
+ /**
203
+ * Category definitions:
204
+ * 1 = multi-hop (282 questions)
205
+ * 2 = temporal (321 questions)
206
+ * 3 = open-domain (96 questions)
207
+ * 4 = single-hop (841 questions)
208
+ * 5 = adversarial/unanswerable (446 questions)
209
+ *
210
+ * NOTE: the numeric → semantic mapping above comes from the actual file counts.
211
+ * Primary score = categories 1–4 (denominator 1540). Category 5 is scored separately as refusal rate.
212
+ */
213
+ type LocomoCategory = 1 | 2 | 3 | 4 | 5;
214
+ /** One QA pair with typed evidence references. */
215
+ interface LocomoQA {
216
+ question: string;
217
+ /** Gold answer (a string or number in the raw dataset; typed here as a string). Absent for category-5 (adversarial) questions. */
218
+ answer?: string;
219
+ category: LocomoCategory;
220
+ /** Dia-id references into the conversation sessions, e.g. ["D1:9", "D3:2"]. */
221
+ evidence: string[];
222
+ /** The trap answer for category 5 — a plausible-sounding but wrong answer. */
223
+ adversarialAnswer?: string;
224
+ }
225
+ /** One LoCoMo sample with parsed sessions and QA. */
226
+ interface LocomoSample {
227
+ sampleId: string;
228
+ speakerA: string;
229
+ speakerB: string;
230
+ sessions: LocomoSession[];
231
+ qa: LocomoQA[];
232
+ }
233
+ /** Typed dataset returned by loadLoCoMo. */
234
+ interface LocomoDataset {
235
+ samples: LocomoSample[];
236
+ }
237
+ /**
238
+ * Resolve evidence dia-ids to their actual turn texts.
239
+ *
240
+ * @param sample - The LoCoMo sample.
241
+ * @param diaIds - Array of dia-id strings, e.g. ["D1:9", "D3:2"].
242
+ * @returns - Array of matched turn texts (unmatched ids are silently skipped).
243
+ */
244
+ declare function resolveEvidence(sample: LocomoSample, diaIds: string[]): string[];
245
+
246
+ /**
247
+ * LoCoMo dataset loader — real schema.
248
+ *
249
+ * The real locomo10.json root is a bare JSON array of samples. Sessions are stored as dynamic
250
+ * keys session_N / session_N_date_time in the conversation object (not a nested array).
251
+ *
252
+ * Dataset source: https://github.com/snap-research/locomo
253
+ * License: CC BY-NC 4.0 — raw data must NOT be committed; results are publishable.
254
+ *
255
+ * LOCOMO_SOURCE_SHA below records the upstream commit that produced the locomo10.json file
256
+ * used in this implementation, for provenance.
257
+ */
258
+
259
+ /** Upstream commit SHA of snap-research/locomo at the time the loader was written. */
260
+ declare const LOCOMO_SOURCE_SHA = "3eb6f2c585f5e1699204e3c3bdf7adc5c28cb376";
261
+ /**
262
+ * Load a LoCoMo JSON file (real schema — bare array of samples with dynamic session keys).
263
+ *
264
+ * @param jsonPath - Absolute or relative path to the locomo10.json file.
265
+ * **Security note:** callers must validate untrusted paths before passing them here.
266
+ * @param opts.maxBytes - Maximum allowed file size in bytes (default 256 MiB).
267
+ * @returns - Typed LocomoDataset with parsed sessions and QA.
268
+ */
269
+ declare function loadLoCoMo(jsonPath: string, opts?: {
270
+ maxBytes?: number;
271
+ }): Promise<LocomoDataset>;
272
+
273
+ /**
274
+ * Fair-run LoCoMo benchmark harness.
275
+ *
276
+ * Key fair-run rules (non-negotiable, documented for methodology transparency):
277
+ *
278
+ * 1. BOTH speakers are humans — never map to user/assistant roles.
279
+ * Turns are ingested as "[SpeakerName]: <text>" into one Memory scope per conversation.
280
+ *
281
+ * 2. Timestamps are carried structurally — session headers are prepended ("Session N — <date>") so
282
+ * temporal questions are answerable; ingestedAt metadata carries the epoch-ms.
283
+ *
284
+ * 3. Memory-mode answer step — retrieve topK <= 10 snippets (never inflate topK to
285
+ * bypass retrieval quality), build a prompt from retrieved snippets, answer concisely.
286
+ *
287
+ * 4. Full-context mode — the MANDATORY baseline. Entire conversation fed as context.
288
+ *
289
+ * 5. Judging — strict LLM judge: correct only when the model answer contains the gold
290
+ * answer's specific information (paraphrase ok; vague/topical-only = wrong).
291
+ * For category 5: correct = model declined; adversarial_answer match = wrong.
292
+ *
293
+ * 6. Metrics — per-category accuracy, overall J(1–4) over the category 1–4 questions actually run (1540 on a full run),
294
+ * category-5 refusal rate separately, token/cost accounting, wall-clock.
295
+ *
296
+ * 7. Determinism — seed recorded; seeded shuffle used when sampleLimit/questionLimit set.
297
+ *
298
+ * 8. Resilience — per-question try/catch; checkpoint-resume via JSONL file.
299
+ */
300
+
301
+ /** Per-question scored row (also used as checkpoint entry). */
302
+ interface LocomoQuestionResult {
303
+ sampleId: string;
304
+ questionIndex: number;
305
+ question: string;
306
+ goldAnswer?: string;
307
+ category: 1 | 2 | 3 | 4 | 5;
308
+ modelAnswer: string;
309
+ correct: boolean;
310
+ /** True when category=5 and the model gave the adversarial trap answer. */
311
+ trapTriggered?: boolean;
312
+ error?: string;
313
+ /** Tokens used in the answer step for this question. */
314
+ answerInputTokens: number;
315
+ answerOutputTokens: number;
316
+ /** Tokens used in the judge step for this question. */
317
+ judgeInputTokens: number;
318
+ judgeOutputTokens: number;
319
+ }
320
+ /** Token / cost summary (no vendor pricing hard-coded — caller provides rates). */
321
+ interface TokenSummary {
322
+ ingestInputTokens: number;
323
+ ingestOutputTokens: number;
324
+ answerInputTokens: number;
325
+ answerOutputTokens: number;
326
+ judgeInputTokens: number;
327
+ judgeOutputTokens: number;
328
+ totalInputTokens: number;
329
+ totalOutputTokens: number;
330
+ }
331
+ /** Per-category accuracy. */
332
+ interface CategoryStats {
333
+ correct: number;
334
+ total: number;
335
+ accuracy: number;
336
+ }
337
+ /** The full benchmark report. */
338
+ interface LocomoReport {
339
+ /** Run configuration (included in every published result for transparency). */
340
+ config: {
341
+ mode: "memory" | "full-context";
342
+ topK: number;
343
+ answerModelId: string;
344
+ judgeModelId: string;
345
+ datasetSha: string;
346
+ seed: number;
347
+ categories: number[];
348
+ samplesRun: number;
349
+ questionsRun: number;
350
+ };
351
+ /** Overall J(1–4): accuracy on categories 1–4 only; denominator = questions actually run in 1–4. */
352
+ overallJ14: CategoryStats;
353
+ /** Per-category breakdown (keys "1" through "5"). */
354
+ byCategory: Record<string, CategoryStats>;
355
+ /** Category-5 refusal rate (separate from J14). */
356
+ cat5RefusalRate?: {
357
+ correct: number;
358
+ total: number;
359
+ rate: number;
360
+ };
361
+ /** Token usage accounting. */
362
+ tokens: TokenSummary;
363
+ /** Wall-clock duration of the run in milliseconds. */
364
+ wallClockMs: number;
365
+ /** Individual question results. */
366
+ questions: LocomoQuestionResult[];
367
+ /** Count of questions that threw errors (counted as wrong, not skipped). */
368
+ errorCount: number;
369
+ }
370
+ /** Factory for a fresh Memory instance, called once per sample. */
371
+ type MemoryFactory = (sampleId: string) => Memory | Promise<Memory>;
372
+ /** Options for runLocomoBench. */
373
+ interface LocomoBenchOptions {
374
+ /** Path to locomo10.json. */
375
+ dataPath: string;
376
+ /** Factory for a fresh Memory per conversation (required when mode="memory"; unused in "full-context" mode). */
377
+ memoryFactory?: MemoryFactory;
378
+ /** Model used to generate answers. */
379
+ answerModel: ModelPort;
380
+ /** Model used to judge correctness. */
381
+ judgeModel: ModelPort;
382
+ /** "memory" requires memoryFactory; "full-context" feeds the full conversation as context. */
383
+ mode: "memory" | "full-context";
384
+ /** Categories to include (default: [1,2,3,4,5]). */
385
+ categories?: number[];
386
+ /** Cap on samples to process (for quick pilot runs). */
387
+ sampleLimit?: number;
388
+ /** Cap on questions per sample (applied after seeded shuffle when set). */
389
+ questionLimit?: number;
390
+ /** Random seed for shuffle reproducibility. Default 42. */
391
+ seed?: number;
392
+ /** Max snippets retrieved per question in memory mode. MUST be <= 10 to avoid trivialising retrieval. */
393
+ topK?: number;
394
+ /** Concurrency (questions in flight simultaneously). Default 1. */
395
+ concurrency?: number;
396
+ /** Progress callback: (questionsCompleted, questionsTotal) */
397
+ onProgress?: (done: number, total: number) => void;
398
+ /** Path to a JSONL checkpoint file. Existing rows are skipped on resume. */
399
+ checkpointPath?: string;
400
+ /** Pre-loaded dataset (avoids re-reading the file if already loaded). */
401
+ dataset?: LocomoDataset;
402
+ }
403
+ /**
404
+ * Run the LoCoMo benchmark with the given options.
405
+ *
406
+ * @param opts - Configuration (see LocomoBenchOptions).
407
+ * @returns - Full LocomoReport with metrics, token accounting, and per-question details.
408
+ */
409
+ declare function runLocomoBench(opts: LocomoBenchOptions): Promise<LocomoReport>;
410
+
411
+ /**
412
+ * Markdown results renderer for LoCoMo benchmark reports.
413
+ *
414
+ * Produces a defensible, methodology-transparent table suitable for publication.
415
+ * Per the mandatory fair-run rules, results MUST include:
416
+ * - Model ids and judge model id
417
+ * - topK value
418
+ * - Dataset SHA (provenance)
419
+ * - Mode (memory | full-context)
420
+ * - Seed and n-questions
421
+ */
422
+
423
+ /** Optional price table for cost estimates (per 1M tokens, input/output). */
424
+ interface PriceTable {
425
+ /** Per-million input tokens in USD. */
426
+ inputPer1M: number;
427
+ /** Per-million output tokens in USD. */
428
+ outputPer1M: number;
429
+ }
430
+ /**
431
+ * Render one or more LoCoMo benchmark reports as a Markdown table with mandatory methodology notes.
432
+ *
433
+ * @param reports - Array of LocomoReport objects to compare.
434
+ * @param prices - Optional price table for cost-per-run estimates (per 1M tokens).
435
+ * @returns - Markdown string ready to write to a .md file.
436
+ */
437
+ declare function renderLocomoReportMarkdown(reports: LocomoReport[], prices?: PriceTable): string;
438
+
439
+ /** One turn inside a parsed session. */
440
+ interface LmeTurn {
441
+ role: "user" | "assistant";
442
+ content: string;
443
+ /** True when the turn contains the gold answer (may be absent in all turns). */
444
+ hasAnswer: boolean;
445
+ }
446
+ /** One session inside a question's haystack. */
447
+ interface LmeSession {
448
+ /** The session's original id from the dataset. */
449
+ id: string;
450
+ /** Human-readable date-time string from the dataset, e.g. "2023/05/20 (Sat) 02:36". */
451
+ dateTime: string;
452
+ /** Epoch milliseconds parsed from dateTime (0 when parse fails). */
453
+ dateTimeMs: number;
454
+ turns: LmeTurn[];
455
+ }
456
+ /**
457
+ * LongMemEval question types (6 base types, plus `*_abs` abstention variants).
458
+ *
459
+ * Base types:
460
+ * single-session-user — fact stated by the user in one session
461
+ * single-session-assistant — fact stated by the assistant in one session
462
+ * single-session-preference — user preference expressed in one session
463
+ * multi-session — evidence spans multiple sessions
464
+ * temporal-reasoning — requires reasoning about time/dates
465
+ * knowledge-update — the fact was updated in a later session
466
+ *
467
+ * Abstention variants (append "_abs"):
468
+ * The correct answer is to recognize the information is not present / premise
469
+ * is flawed and respond with "no information" rather than fabricating an answer.
470
+ */
471
+ type LmeBaseType = "single-session-user" | "single-session-assistant" | "single-session-preference" | "multi-session" | "temporal-reasoning" | "knowledge-update";
472
+ type LmeQuestionType = LmeBaseType | `${LmeBaseType}_abs` | (string & Record<never, never>);
473
+ /** One parsed LongMemEval question with its haystack. */
474
+ interface LmeQuestion {
475
+ /** Original question_id from the dataset, e.g. "e47becba" or "gpt4_xxxx". */
476
+ id: string;
477
+ /** Full question_type string (may include "_abs" suffix). */
478
+ type: LmeQuestionType;
479
+ /** Base type without any "_abs" suffix. */
480
+ baseType: LmeBaseType | string;
481
+ /** Whether this is an abstention variant (type ends with "_abs"). */
482
+ isAbstention: boolean;
483
+ question: string;
484
+ /** Gold answer string. For abstention questions, the correct response is to abstain. */
485
+ answer: string;
486
+ /** Human-readable question date string from the dataset. */
487
+ questionDate: string;
488
+ /** Epoch ms for the question date (0 when parse fails). */
489
+ questionDateMs: number;
490
+ /** Haystack sessions in date order. */
491
+ sessions: LmeSession[];
492
+ /** Session ids that contain the answer evidence. */
493
+ answerSessionIds: string[];
494
+ }
495
+ /** Typed dataset returned by loadLongMemEval (new real-schema loader). */
496
+ interface LmeDataset {
497
+ questions: LmeQuestion[];
498
+ }
499
+
500
+ /**
501
+ * LongMemEval dataset loader — real schema.
502
+ *
503
+ * Parses the real longmemeval_s.json (or _m / _oracle) format into typed LmeDataset.
504
+ *
505
+ * Dataset source: https://github.com/xiaowu0162/LongMemEval
506
+ * HuggingFace: https://huggingface.co/datasets/xiaowu0162/longmemeval
507
+ * License: MIT — results are publishable; raw data must NOT be committed.
508
+ *
509
+ * LONGMEMEVAL_SOURCE below records the upstream HuggingFace dataset repo + snapshot sha
510
+ * used when this loader was written, for provenance.
511
+ */
512
+
513
+ /** Provenance constant for the LongMemEval dataset. */
514
+ declare const LONGMEMEVAL_SOURCE: {
515
+ readonly url: "https://huggingface.co/datasets/xiaowu0162/longmemeval";
516
+ readonly snapshotSha: "2ec2a557f339b6c0369619b1ed5793734cc87533";
517
+ readonly file: "longmemeval_s";
518
+ readonly license: "MIT";
519
+ };
520
+ /**
521
+ * Parse a LongMemEval date-time string to epoch milliseconds.
522
+ *
523
+ * Observed format: "2023/05/20 (Sat) 02:36"
524
+ * Normalise: strip the "(Weekday)" token, replace "/" separators.
525
+ *
526
+ * Returns 0 when parsing fails (the date is optional / informational).
527
+ */
528
+ declare function parseLmeDateTimeString(raw: string): number;
529
+ /**
530
+ * Load a LongMemEval JSON file (real schema — JSON array of questions).
531
+ *
532
+ * @param jsonPath - Absolute or relative path to the longmemeval_s.json file.
533
+ * **Security note:** callers must validate untrusted paths before passing them here.
534
+ * @param opts.maxBytes - Maximum allowed file size in bytes (default 512 MiB).
535
+ * @returns - Typed LmeDataset with parsed sessions and questions.
536
+ */
537
+ declare function loadLongMemEval(jsonPath: string, opts?: {
538
+ maxBytes?: number;
539
+ }): Promise<LmeDataset>;
540
+
541
+ /**
542
+ * Fair-run LongMemEval benchmark harness.
543
+ *
544
+ * Key fair-run rules (non-negotiable, documented for methodology transparency):
545
+ *
546
+ * 1. PER-QUESTION memory scope — each question gets its own fresh haystack ingested into
547
+ * a fresh Memory instance. (Unlike LoCoMo which is per-conversation, LongMemEval
548
+ * is per-question: each question has its own haystack of ~50 sessions.)
549
+ *
550
+ * 2. Dual-granularity ingest — per-turn entries carry the session date in text so
551
+ * retrieved snippets are temporally anchored; one per-session chunk entry preserves
552
+ * multi-turn context for questions whose evidence spans adjacent turns.
553
+ *
554
+ * 3. Current date context — the question_date is passed to the answer prompt so temporal
555
+ * questions can reason about recency (e.g. "last week").
556
+ *
557
+ * 4. Memory-mode answer step — retrieve topK <= 10 snippets (never inflate topK to
558
+ * bypass retrieval quality), build a prompt from retrieved snippets + question + current date.
559
+ *
560
+ * 5. Full-context mode — the MANDATORY baseline. Sessions are concatenated in date order
561
+ * with session headers; if a haystack exceeds the context cap (the `fullContextMaxChars` option,
562
+ * sized to fit gpt-4o-mini's 128k context), oldest sessions are truncated first and recorded.
563
+ *
564
+ * 6. Judging — strict LLM judge: correct only when model answer contains the gold answer's
565
+ * specific info (paraphrase ok; temporal: equivalent date expressions ok).
566
+ * For abstention questions: correct = model declined / said no-info / identified the
567
+ * flawed premise; fabricating a concrete answer = wrong.
568
+ *
569
+ * 7. Metrics — overall accuracy + per-question-type accuracy + abstention accuracy reported
570
+ * separately + token/cost accounting per phase + wall-clock. Full config disclosure in report.
571
+ *
572
+ * 8. Determinism — seed recorded; seeded shuffle used when questionLimit is set.
573
+ *
574
+ * 9. Resilience — per-question try/catch; checkpoint-resume via JSONL file.
575
+ */
576
+
577
+ /** Per-question scored row (also used as checkpoint entry). */
578
+ interface LmeQuestionResult {
579
+ questionId: string;
580
+ questionType: string;
581
+ isAbstention: boolean;
582
+ question: string;
583
+ goldAnswer: string;
584
+ modelAnswer: string;
585
+ correct: boolean;
586
+ /** True if the model appeared to abstain (no-info / declined). */
587
+ appearedToAbstain: boolean;
588
+ /** True if context was truncated in full-context mode. */
589
+ contextTruncated?: boolean;
590
+ error?: string;
591
+ /** Tokens used in the answer step. */
592
+ answerInputTokens: number;
593
+ answerOutputTokens: number;
594
+ /** Tokens used in the judge step. */
595
+ judgeInputTokens: number;
596
+ judgeOutputTokens: number;
597
+ }
598
+ /** Token / cost summary. */
599
+ interface LmeTokenSummary {
600
+ ingestEmbedTokens: number;
601
+ answerInputTokens: number;
602
+ answerOutputTokens: number;
603
+ judgeInputTokens: number;
604
+ judgeOutputTokens: number;
605
+ totalInputTokens: number;
606
+ totalOutputTokens: number;
607
+ }
608
+ /** Accuracy stats for a question type or overall. */
609
+ interface LmeTypeStats {
610
+ correct: number;
611
+ total: number;
612
+ accuracy: number;
613
+ }
614
+ /** Full benchmark report. */
615
+ interface LmeReport {
616
+ /** Run configuration (included in every published result for transparency). */
617
+ config: {
618
+ mode: "memory" | "full-context";
619
+ topK: number;
620
+ answerModelId: string;
621
+ judgeModelId: string;
622
+ datasetSource: typeof LONGMEMEVAL_SOURCE;
623
+ seed: number;
624
+ types: string[];
625
+ questionsRun: number;
626
+ };
627
+ /** Overall accuracy on all non-abstention questions. */
628
+ overall: LmeTypeStats;
629
+ /** Per-question-type accuracy (keys = type strings without "_abs" suffix). */
630
+ byType: Record<string, LmeTypeStats>;
631
+ /** Abstention accuracy: correct = model declined; wrong = fabricated concrete answer. */
632
+ abstentionAccuracy?: LmeTypeStats;
633
+ /** Token usage accounting. */
634
+ tokens: LmeTokenSummary;
635
+ /** Wall-clock duration in milliseconds. */
636
+ wallClockMs: number;
637
+ /** Individual question results. */
638
+ questions: LmeQuestionResult[];
639
+ /** Count of questions that threw errors (counted as wrong, not skipped). */
640
+ errorCount: number;
641
+ }
642
+ /** Factory for a fresh Memory instance, called once per question. */
643
+ type LmeMemoryFactory = (questionId: string) => Memory | Promise<Memory>;
644
+ /** Options for runLongMemEvalBench. */
645
+ interface LmeBenchOptions {
646
+ /** Path to longmemeval_s.json (required unless dataset is provided). */
647
+ dataPath?: string;
648
+ /** Pre-loaded dataset (avoids re-reading the file if already loaded). */
649
+ dataset?: LmeDataset;
650
+ /** Factory for a fresh Memory per question (required when mode="memory"). */
651
+ memoryFactory?: LmeMemoryFactory;
652
+ /** Model used to generate answers. */
653
+ answerModel: ModelPort;
654
+ /** Model used to judge correctness. */
655
+ judgeModel: ModelPort;
656
+ /** "memory" requires memoryFactory; "full-context" feeds the full haystack as context. */
657
+ mode: "memory" | "full-context";
658
+ /** Question types to include (default: all). */
659
+ types?: string[];
660
+ /** Cap on questions to process (for quick pilot runs). */
661
+ questionLimit?: number;
662
+ /** Random seed for shuffle reproducibility. Default 42. */
663
+ seed?: number;
664
+ /** Max snippets retrieved per question in memory mode. MUST be <= 10. Default 10. */
665
+ topK?: number;
666
+ /** Concurrency (questions in flight simultaneously). Default 1. */
667
+ concurrency?: number;
668
+ /** Progress callback: (questionsCompleted, questionsTotal) */
669
+ onProgress?: (done: number, total: number) => void;
670
+ /** Path to a JSONL checkpoint file. Existing rows are skipped on resume. */
671
+ checkpointPath?: string;
672
+ /**
673
+ * Max characters for full-context haystack (to avoid exceeding model context).
674
+ * Default 480000 chars (~120k tokens at ~4 chars/token, fitting gpt-4o-mini 128k).
675
+ * When exceeded, oldest sessions are dropped first and contextTruncated is recorded.
676
+ */
677
+ fullContextMaxChars?: number;
678
+ }
679
+ /**
680
+ * Run the LongMemEval benchmark with the given options.
681
+ *
682
+ * @param opts - Configuration (see LmeBenchOptions).
683
+ * @returns - Full LmeReport with metrics, token accounting, and per-question details.
684
+ */
685
+ declare function runLongMemEvalBench(opts: LmeBenchOptions): Promise<LmeReport>;
686
+
687
+ /**
688
+ * Markdown results renderer for LongMemEval benchmark reports.
689
+ *
690
+ * Produces a defensible, methodology-transparent table suitable for publication.
691
+ * Per the mandatory fair-run rules, results MUST include:
692
+ * - Model ids and judge model id
693
+ * - topK value (memory mode)
694
+ * - Dataset provenance (source URL + snapshot sha)
695
+ * - Mode (memory | full-context)
696
+ * - Seed and n-questions
697
+ * - Per-type accuracy breakdown
698
+ * - Abstention accuracy reported separately
699
+ */
700
+
701
+ /** Optional price table for cost estimates (per 1M tokens, input/output). */
702
+ interface LmePriceTable {
703
+ /** Per-million input tokens in USD. */
704
+ inputPer1M: number;
705
+ /** Per-million output tokens in USD. */
706
+ outputPer1M: number;
707
+ }
708
+ /**
709
+ * Render one or more LongMemEval benchmark reports as a Markdown table
710
+ * with mandatory methodology notes.
711
+ *
712
+ * @param reports - Array of LmeReport objects to compare.
713
+ * @param prices - Optional price table for cost-per-run estimates (per 1M tokens).
714
+ * @returns - Markdown string ready to write to a .md file.
715
+ */
716
+ declare function renderLongMemEvalReportMarkdown(reports: LmeReport[], prices?: LmePriceTable): string;
717
+
186
718
  /**
187
719
  * Write-quality benchmark for the Eidentic memory harness.
188
720
  *
@@ -493,4 +1025,4 @@ interface TemporalBenchOptions {
493
1025
  */
494
1026
  declare function runTemporalBench(memory: Memory, dataset: SyntheticTemporalDataset, opts?: TemporalBenchOptions): Promise<TemporalBenchReport>;
495
1027
 
496
- export { type BenchCase, type BenchDataset, type BenchOptions, type BenchQuestion, type BenchReport, type BenchTurn, CONTRADICTION_FIXTURES, type CaseResult, type ContradictionFixture, JUNK_STREAM_FIXTURES, type JunkItem, type QuestionResult, type StateTransition, type SyntheticTemporalDataset, type TemporalBenchOptions, type TemporalBenchReport, type TemporalEntity, type TemporalQuestion, type TemporalQuestionResult, type WriteQualityDetail, type WriteQualityOptions, type WriteQualityReport, factRecall, loadLoCoMo, loadLongMemEval, normalizeText, normalizedIncludes, recallAtK, runMemoryBench, runTemporalBench, runWriteQualityBench, syntheticDataset, syntheticTemporalDataset };
1028
+ export { type BenchCase, type BenchDataset, type BenchOptions, type BenchQuestion, type BenchReport, type BenchTurn, CONTRADICTION_FIXTURES, type CaseResult, type CategoryStats, type ContradictionFixture, JUNK_STREAM_FIXTURES, type JunkItem, LOCOMO_SOURCE_SHA, LONGMEMEVAL_SOURCE, type LmeBaseType, type LmeBenchOptions, type LmeDataset, type LmeMemoryFactory, type LmePriceTable, type LmeQuestion, type LmeQuestionResult, type LmeQuestionType, type LmeReport, type LmeSession, type LmeTokenSummary, type LmeTurn, type LmeTypeStats, type LocomoBenchOptions, type LocomoCategory, type LocomoDataset, type LocomoQA, type LocomoQuestionResult, type LocomoReport, type LocomoSample, type LocomoSession, type LocomoTurn, type MemoryFactory, type PriceTable, type QuestionResult, type StateTransition, type SyntheticTemporalDataset, type TemporalBenchOptions, type TemporalBenchReport, type TemporalEntity, type TemporalQuestion, type TemporalQuestionResult, type TokenSummary, type WriteQualityDetail, type WriteQualityOptions, type WriteQualityReport, factRecall, loadLoCoMo, loadLoCoMo$1 as loadLoCoMoLegacy, loadLongMemEval, loadLongMemEval$1 as loadLongMemEvalLegacy, normalizeText, normalizedIncludes, parseLmeDateTimeString, recallAtK, renderLocomoReportMarkdown, renderLongMemEvalReportMarkdown, resolveEvidence, runLocomoBench, runLongMemEvalBench, runMemoryBench, runTemporalBench, runWriteQualityBench, syntheticDataset, syntheticTemporalDataset };