@eidentic/bench 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -167,7 +167,7 @@ declare const syntheticDataset: BenchDataset;
167
167
  * @param opts.maxBytes - Maximum allowed file size in bytes (default 256 MiB).
168
168
  * Increase this only if you are loading a vetted, trusted dataset file.
169
169
  */
170
- declare function loadLongMemEval(jsonPath: string, opts?: {
170
+ declare function loadLongMemEval$1(jsonPath: string, opts?: {
171
171
  maxBytes?: number;
172
172
  }): Promise<BenchDataset>;
173
173
  /**
@@ -436,6 +436,285 @@ interface PriceTable {
436
436
  */
437
437
  declare function renderLocomoReportMarkdown(reports: LocomoReport[], prices?: PriceTable): string;
438
438
 
439
+ /** One turn inside a parsed session. */
440
+ interface LmeTurn {
441
+ role: "user" | "assistant";
442
+ content: string;
443
+ /** True when the turn contains the gold answer (may be absent in all turns). */
444
+ hasAnswer: boolean;
445
+ }
446
+ /** One session inside a question's haystack. */
447
+ interface LmeSession {
448
+ /** The session's original id from the dataset. */
449
+ id: string;
450
+ /** Human-readable date-time string from the dataset, e.g. "2023/05/20 (Sat) 02:36". */
451
+ dateTime: string;
452
+ /** Epoch milliseconds parsed from dateTime (0 when parse fails). */
453
+ dateTimeMs: number;
454
+ turns: LmeTurn[];
455
+ }
456
+ /**
457
+ * LongMemEval question types (6 base types, plus `*_abs` abstention variants).
458
+ *
459
+ * Base types:
460
+ * single-session-user — fact stated by the user in one session
461
+ * single-session-assistant — fact stated by the assistant in one session
462
+ * single-session-preference — user preference expressed in one session
463
+ * multi-session — evidence spans multiple sessions
464
+ * temporal-reasoning — requires reasoning about time/dates
465
+ * knowledge-update — the fact was updated in a later session
466
+ *
467
+ * Abstention variants (append "_abs"):
468
+ * The correct answer is to recognize the information is not present / premise
469
+ * is flawed and respond with "no information" rather than fabricating an answer.
470
+ */
471
+ type LmeBaseType = "single-session-user" | "single-session-assistant" | "single-session-preference" | "multi-session" | "temporal-reasoning" | "knowledge-update";
472
+ type LmeQuestionType = LmeBaseType | `${LmeBaseType}_abs` | (string & Record<never, never>);
473
+ /** One parsed LongMemEval question with its haystack. */
474
+ interface LmeQuestion {
475
+ /** Original question_id from the dataset, e.g. "e47becba" or "gpt4_xxxx". */
476
+ id: string;
477
+ /** Full question_type string (may include "_abs" suffix). */
478
+ type: LmeQuestionType;
479
+ /** Base type without any "_abs" suffix. */
480
+ baseType: LmeBaseType | string;
481
+ /** Whether this is an abstention variant (type ends with "_abs"). */
482
+ isAbstention: boolean;
483
+ question: string;
484
+ /** Gold answer string. For abstention questions, the correct response is to abstain. */
485
+ answer: string;
486
+ /** Human-readable question date string from the dataset. */
487
+ questionDate: string;
488
+ /** Epoch ms for the question date (0 when parse fails). */
489
+ questionDateMs: number;
490
+ /** Haystack sessions in date order. */
491
+ sessions: LmeSession[];
492
+ /** Session ids that contain the answer evidence. */
493
+ answerSessionIds: string[];
494
+ }
495
+ /** Typed dataset returned by loadLongMemEval (new real-schema loader). */
496
+ interface LmeDataset {
497
+ questions: LmeQuestion[];
498
+ }
499
+
500
+ /**
501
+ * LongMemEval dataset loader — real schema.
502
+ *
503
+ * Parses the real longmemeval_s.json (or _m / _oracle) format into typed LmeDataset.
504
+ *
505
+ * Dataset source: https://github.com/xiaowu0162/LongMemEval
506
+ * HuggingFace: https://huggingface.co/datasets/xiaowu0162/longmemeval
507
+ * License: MIT — results are publishable; raw data must NOT be committed.
508
+ *
509
+ * LONGMEMEVAL_SOURCE below records the upstream HuggingFace dataset repo + snapshot sha
510
+ * used when this loader was written, for provenance.
511
+ */
512
+
513
+ /** Provenance constant for the LongMemEval dataset. */
514
+ declare const LONGMEMEVAL_SOURCE: {
515
+ readonly url: "https://huggingface.co/datasets/xiaowu0162/longmemeval";
516
+ readonly snapshotSha: "2ec2a557f339b6c0369619b1ed5793734cc87533";
517
+ readonly file: "longmemeval_s";
518
+ readonly license: "MIT";
519
+ };
520
+ /**
521
+ * Parse a LongMemEval date-time string to epoch milliseconds.
522
+ *
523
+ * Observed format: "2023/05/20 (Sat) 02:36"
524
+ * Normalise: strip the "(Weekday)" token, replace "/" separators.
525
+ *
526
+ * Returns 0 when parsing fails (the date is optional / informational).
527
+ */
528
+ declare function parseLmeDateTimeString(raw: string): number;
529
+ /**
530
+ * Load a LongMemEval JSON file (real schema — JSON array of questions).
531
+ *
532
+ * @param jsonPath - Absolute or relative path to the longmemeval_s.json file.
533
+ * **Security note:** callers must validate untrusted paths before passing them here.
534
+ * @param opts.maxBytes - Maximum allowed file size in bytes (default 512 MiB).
535
+ * @returns - Typed LmeDataset with parsed sessions and questions.
536
+ */
537
+ declare function loadLongMemEval(jsonPath: string, opts?: {
538
+ maxBytes?: number;
539
+ }): Promise<LmeDataset>;
540
+
541
+ /**
542
+ * Fair-run LongMemEval benchmark harness.
543
+ *
544
+ * Key fair-run rules (non-negotiable, documented for methodology transparency):
545
+ *
546
+ * 1. PER-QUESTION memory scope — each question gets its own fresh haystack ingested into
547
+ * a fresh Memory instance. (Unlike LoCoMo which is per-conversation, LongMemEval
548
+ * is per-question: each question has its own haystack of ~50 sessions.)
549
+ *
550
+ * 2. Dual-granularity ingest — per-turn entries carry the session date in text so
551
+ * retrieved snippets are temporally anchored; one per-session chunk entry preserves
552
+ * multi-turn context for questions whose evidence spans adjacent turns.
553
+ *
554
+ * 3. Current date context — the question_date is passed to the answer prompt so temporal
555
+ * questions can reason about recency (e.g. "last week").
556
+ *
557
+ * 4. Memory-mode answer step — retrieve topK <= 10 snippets (never inflate topK to
558
+ * bypass retrieval quality), build a prompt from retrieved snippets + question + current date.
559
+ *
560
+ * 5. Full-context mode — the MANDATORY baseline. Sessions are concatenated in date order
561
+ * with session headers; if a haystack exceeds the context cap (`fullContextMaxChars`, documented
561
+ * default 480000 chars ≈ ~120k tokens for gpt-4o-mini's 128k context), oldest sessions are truncated first and recorded.
563
+ *
564
+ * 6. Judging — strict LLM judge: correct only when model answer contains the gold answer's
565
+ * specific info (paraphrase ok; temporal: equivalent date expressions ok).
566
+ * For abstention questions: correct = model declined / said no-info / identified the
567
+ * flawed premise; fabricating a concrete answer = wrong.
568
+ *
569
+ * 7. Metrics — overall accuracy + per-question-type accuracy + abstention accuracy reported
570
+ * separately + token/cost accounting per phase + wall-clock. Full config disclosure in report.
571
+ *
572
+ * 8. Determinism — seed recorded; seeded shuffle used when questionLimit is set.
573
+ *
574
+ * 9. Resilience — per-question try/catch; checkpoint-resume via JSONL file.
575
+ */
576
+
577
+ /** Per-question scored row (also used as checkpoint entry). */
578
+ interface LmeQuestionResult {
579
+ questionId: string;
580
+ questionType: string;
581
+ isAbstention: boolean;
582
+ question: string;
583
+ goldAnswer: string;
584
+ modelAnswer: string;
585
+ correct: boolean;
586
+ /** True if the model appeared to abstain (no-info / declined). */
587
+ appearedToAbstain: boolean;
588
+ /** True if context was truncated in full-context mode. */
589
+ contextTruncated?: boolean;
590
+ error?: string;
591
+ /** Tokens used in the answer step. */
592
+ answerInputTokens: number;
593
+ answerOutputTokens: number;
594
+ /** Tokens used in the judge step. */
595
+ judgeInputTokens: number;
596
+ judgeOutputTokens: number;
597
+ }
598
+ /** Token / cost summary. */
599
+ interface LmeTokenSummary {
600
+ ingestEmbedTokens: number;
601
+ answerInputTokens: number;
602
+ answerOutputTokens: number;
603
+ judgeInputTokens: number;
604
+ judgeOutputTokens: number;
605
+ totalInputTokens: number;
606
+ totalOutputTokens: number;
607
+ }
608
+ /** Accuracy stats for a question type or overall. */
609
+ interface LmeTypeStats {
610
+ correct: number;
611
+ total: number;
612
+ accuracy: number;
613
+ }
614
+ /** Full benchmark report. */
615
+ interface LmeReport {
616
+ /** Run configuration (included in every published result for transparency). */
617
+ config: {
618
+ mode: "memory" | "full-context";
619
+ topK: number;
620
+ answerModelId: string;
621
+ judgeModelId: string;
622
+ datasetSource: typeof LONGMEMEVAL_SOURCE;
623
+ seed: number;
624
+ types: string[];
625
+ questionsRun: number;
626
+ };
627
+ /** Overall accuracy on all non-abstention questions. */
628
+ overall: LmeTypeStats;
629
+ /** Per-question-type accuracy (keys = type strings without "_abs" suffix). */
630
+ byType: Record<string, LmeTypeStats>;
631
+ /** Abstention accuracy: correct = model declined; wrong = fabricated concrete answer. */
632
+ abstentionAccuracy?: LmeTypeStats;
633
+ /** Token usage accounting. */
634
+ tokens: LmeTokenSummary;
635
+ /** Wall-clock duration in milliseconds. */
636
+ wallClockMs: number;
637
+ /** Individual question results. */
638
+ questions: LmeQuestionResult[];
639
+ /** Count of questions that threw errors (counted as wrong, not skipped). */
640
+ errorCount: number;
641
+ }
642
+ /** Factory for a fresh Memory instance, called once per question. */
643
+ type LmeMemoryFactory = (questionId: string) => Memory | Promise<Memory>;
644
+ /** Options for runLongMemEvalBench. */
645
+ interface LmeBenchOptions {
646
+ /** Path to longmemeval_s.json (required unless dataset is provided). */
647
+ dataPath?: string;
648
+ /** Pre-loaded dataset (avoids re-reading the file if already loaded). */
649
+ dataset?: LmeDataset;
650
+ /** Factory for a fresh Memory per question (required when mode="memory"). */
651
+ memoryFactory?: LmeMemoryFactory;
652
+ /** Model used to generate answers. */
653
+ answerModel: ModelPort;
654
+ /** Model used to judge correctness. */
655
+ judgeModel: ModelPort;
656
+ /** "memory" requires memoryFactory; "full-context" feeds the full haystack as context. */
657
+ mode: "memory" | "full-context";
658
+ /** Question types to include (default: all). */
659
+ types?: string[];
660
+ /** Cap on questions to process (for quick pilot runs). */
661
+ questionLimit?: number;
662
+ /** Random seed for shuffle reproducibility. Default 42. */
663
+ seed?: number;
664
+ /** Max snippets retrieved per question in memory mode. MUST be <= 10. Default 10. */
665
+ topK?: number;
666
+ /** Concurrency (questions in flight simultaneously). Default 1. */
667
+ concurrency?: number;
668
+ /** Progress callback: (questionsCompleted, questionsTotal) */
669
+ onProgress?: (done: number, total: number) => void;
670
+ /** Path to a JSONL checkpoint file. Existing rows are skipped on resume. */
671
+ checkpointPath?: string;
672
+ /**
673
+ * Max characters for full-context haystack (to avoid exceeding model context).
674
+ * Default 480000 chars (~120k tokens at ~4 chars/token, fitting gpt-4o-mini 128k).
675
+ * When exceeded, oldest sessions are dropped first and contextTruncated is recorded.
676
+ */
677
+ fullContextMaxChars?: number;
678
+ }
679
+ /**
680
+ * Run the LongMemEval benchmark with the given options.
681
+ *
682
+ * @param opts - Configuration (see LmeBenchOptions).
683
+ * @returns - Full LmeReport with metrics, token accounting, and per-question details.
684
+ */
685
+ declare function runLongMemEvalBench(opts: LmeBenchOptions): Promise<LmeReport>;
686
+
687
+ /**
688
+ * Markdown results renderer for LongMemEval benchmark reports.
689
+ *
690
+ * Produces a defensible, methodology-transparent table suitable for publication.
691
+ * Per the mandatory fair-run rules, results MUST include:
692
+ * - Model ids and judge model id
693
+ * - topK value (memory mode)
694
+ * - Dataset provenance (source URL + snapshot sha)
695
+ * - Mode (memory | full-context)
696
+ * - Seed and n-questions
697
+ * - Per-type accuracy breakdown
698
+ * - Abstention accuracy reported separately
699
+ */
700
+
701
+ /** Optional price table for cost estimates (per 1M tokens, input/output). */
702
+ interface LmePriceTable {
703
+ /** Per-million input tokens in USD. */
704
+ inputPer1M: number;
705
+ /** Per-million output tokens in USD. */
706
+ outputPer1M: number;
707
+ }
708
+ /**
709
+ * Render one or more LongMemEval benchmark reports as a Markdown table
710
+ * with mandatory methodology notes.
711
+ *
712
+ * @param reports - Array of LmeReport objects to compare.
713
+ * @param prices - Optional price table for cost-per-run estimates (per 1M tokens).
714
+ * @returns - Markdown string ready to write to a .md file.
715
+ */
716
+ declare function renderLongMemEvalReportMarkdown(reports: LmeReport[], prices?: LmePriceTable): string;
717
+
439
718
  /**
440
719
  * Write-quality benchmark for the Eidentic memory harness.
441
720
  *
@@ -746,4 +1025,4 @@ interface TemporalBenchOptions {
746
1025
  */
747
1026
  declare function runTemporalBench(memory: Memory, dataset: SyntheticTemporalDataset, opts?: TemporalBenchOptions): Promise<TemporalBenchReport>;
748
1027
 
749
- export { type BenchCase, type BenchDataset, type BenchOptions, type BenchQuestion, type BenchReport, type BenchTurn, CONTRADICTION_FIXTURES, type CaseResult, type CategoryStats, type ContradictionFixture, JUNK_STREAM_FIXTURES, type JunkItem, LOCOMO_SOURCE_SHA, type LocomoBenchOptions, type LocomoCategory, type LocomoDataset, type LocomoQA, type LocomoQuestionResult, type LocomoReport, type LocomoSample, type LocomoSession, type LocomoTurn, type MemoryFactory, type PriceTable, type QuestionResult, type StateTransition, type SyntheticTemporalDataset, type TemporalBenchOptions, type TemporalBenchReport, type TemporalEntity, type TemporalQuestion, type TemporalQuestionResult, type TokenSummary, type WriteQualityDetail, type WriteQualityOptions, type WriteQualityReport, factRecall, loadLoCoMo, loadLoCoMo$1 as loadLoCoMoLegacy, loadLongMemEval, normalizeText, normalizedIncludes, recallAtK, renderLocomoReportMarkdown, resolveEvidence, runLocomoBench, runMemoryBench, runTemporalBench, runWriteQualityBench, syntheticDataset, syntheticTemporalDataset };
1028
+ export { type BenchCase, type BenchDataset, type BenchOptions, type BenchQuestion, type BenchReport, type BenchTurn, CONTRADICTION_FIXTURES, type CaseResult, type CategoryStats, type ContradictionFixture, JUNK_STREAM_FIXTURES, type JunkItem, LOCOMO_SOURCE_SHA, LONGMEMEVAL_SOURCE, type LmeBaseType, type LmeBenchOptions, type LmeDataset, type LmeMemoryFactory, type LmePriceTable, type LmeQuestion, type LmeQuestionResult, type LmeQuestionType, type LmeReport, type LmeSession, type LmeTokenSummary, type LmeTurn, type LmeTypeStats, type LocomoBenchOptions, type LocomoCategory, type LocomoDataset, type LocomoQA, type LocomoQuestionResult, type LocomoReport, type LocomoSample, type LocomoSession, type LocomoTurn, type MemoryFactory, type PriceTable, type QuestionResult, type StateTransition, type SyntheticTemporalDataset, type TemporalBenchOptions, type TemporalBenchReport, type TemporalEntity, type TemporalQuestion, type TemporalQuestionResult, type TokenSummary, type WriteQualityDetail, type WriteQualityOptions, type WriteQualityReport, factRecall, loadLoCoMo, loadLoCoMo$1 as loadLoCoMoLegacy, loadLongMemEval, loadLongMemEval$1 as loadLongMemEvalLegacy, normalizeText, normalizedIncludes, parseLmeDateTimeString, recallAtK, renderLocomoReportMarkdown, renderLongMemEvalReportMarkdown, resolveEvidence, runLocomoBench, runLongMemEvalBench, runMemoryBench, runTemporalBench, runWriteQualityBench, syntheticDataset, syntheticTemporalDataset };
package/dist/index.d.ts CHANGED
@@ -167,7 +167,7 @@ declare const syntheticDataset: BenchDataset;
167
167
  * @param opts.maxBytes - Maximum allowed file size in bytes (default 256 MiB).
168
168
  * Increase this only if you are loading a vetted, trusted dataset file.
169
169
  */
170
- declare function loadLongMemEval(jsonPath: string, opts?: {
170
+ declare function loadLongMemEval$1(jsonPath: string, opts?: {
171
171
  maxBytes?: number;
172
172
  }): Promise<BenchDataset>;
173
173
  /**
@@ -436,6 +436,285 @@ interface PriceTable {
436
436
  */
437
437
  declare function renderLocomoReportMarkdown(reports: LocomoReport[], prices?: PriceTable): string;
438
438
 
439
+ /** One turn inside a parsed session. */
440
+ interface LmeTurn {
441
+ role: "user" | "assistant";
442
+ content: string;
443
+ /** True when the turn contains the gold answer (may be absent in all turns). */
444
+ hasAnswer: boolean;
445
+ }
446
+ /** One session inside a question's haystack. */
447
+ interface LmeSession {
448
+ /** The session's original id from the dataset. */
449
+ id: string;
450
+ /** Human-readable date-time string from the dataset, e.g. "2023/05/20 (Sat) 02:36". */
451
+ dateTime: string;
452
+ /** Epoch milliseconds parsed from dateTime (0 when parse fails). */
453
+ dateTimeMs: number;
454
+ turns: LmeTurn[];
455
+ }
456
+ /**
457
+ * LongMemEval question types (6 base types, plus `*_abs` abstention variants).
458
+ *
459
+ * Base types:
460
+ * single-session-user — fact stated by the user in one session
461
+ * single-session-assistant — fact stated by the assistant in one session
462
+ * single-session-preference — user preference expressed in one session
463
+ * multi-session — evidence spans multiple sessions
464
+ * temporal-reasoning — requires reasoning about time/dates
465
+ * knowledge-update — the fact was updated in a later session
466
+ *
467
+ * Abstention variants (append "_abs"):
468
+ * The correct answer is to recognize the information is not present / premise
469
+ * is flawed and respond with "no information" rather than fabricating an answer.
470
+ */
471
+ type LmeBaseType = "single-session-user" | "single-session-assistant" | "single-session-preference" | "multi-session" | "temporal-reasoning" | "knowledge-update";
472
+ type LmeQuestionType = LmeBaseType | `${LmeBaseType}_abs` | (string & Record<never, never>);
473
+ /** One parsed LongMemEval question with its haystack. */
474
+ interface LmeQuestion {
475
+ /** Original question_id from the dataset, e.g. "e47becba" or "gpt4_xxxx". */
476
+ id: string;
477
+ /** Full question_type string (may include "_abs" suffix). */
478
+ type: LmeQuestionType;
479
+ /** Base type without any "_abs" suffix. */
480
+ baseType: LmeBaseType | string;
481
+ /** Whether this is an abstention variant (type ends with "_abs"). */
482
+ isAbstention: boolean;
483
+ question: string;
484
+ /** Gold answer string. For abstention questions, the correct response is to abstain. */
485
+ answer: string;
486
+ /** Human-readable question date string from the dataset. */
487
+ questionDate: string;
488
+ /** Epoch ms for the question date (0 when parse fails). */
489
+ questionDateMs: number;
490
+ /** Haystack sessions in date order. */
491
+ sessions: LmeSession[];
492
+ /** Session ids that contain the answer evidence. */
493
+ answerSessionIds: string[];
494
+ }
495
+ /** Typed dataset returned by loadLongMemEval (new real-schema loader). */
496
+ interface LmeDataset {
497
+ questions: LmeQuestion[];
498
+ }
499
+
500
+ /**
501
+ * LongMemEval dataset loader — real schema.
502
+ *
503
+ * Parses the real longmemeval_s.json (or _m / _oracle) format into typed LmeDataset.
504
+ *
505
+ * Dataset source: https://github.com/xiaowu0162/LongMemEval
506
+ * HuggingFace: https://huggingface.co/datasets/xiaowu0162/longmemeval
507
+ * License: MIT — results are publishable; raw data must NOT be committed.
508
+ *
509
+ * LONGMEMEVAL_SOURCE below records the upstream HuggingFace dataset repo + snapshot sha
510
+ * used when this loader was written, for provenance.
511
+ */
512
+
513
+ /** Provenance constant for the LongMemEval dataset. */
514
+ declare const LONGMEMEVAL_SOURCE: {
515
+ readonly url: "https://huggingface.co/datasets/xiaowu0162/longmemeval";
516
+ readonly snapshotSha: "2ec2a557f339b6c0369619b1ed5793734cc87533";
517
+ readonly file: "longmemeval_s";
518
+ readonly license: "MIT";
519
+ };
520
+ /**
521
+ * Parse a LongMemEval date-time string to epoch milliseconds.
522
+ *
523
+ * Observed format: "2023/05/20 (Sat) 02:36"
524
+ * Normalise: strip the "(Weekday)" token, replace "/" separators.
525
+ *
526
+ * Returns 0 when parsing fails (the date is optional / informational).
527
+ */
528
+ declare function parseLmeDateTimeString(raw: string): number;
529
+ /**
530
+ * Load a LongMemEval JSON file (real schema — JSON array of questions).
531
+ *
532
+ * @param jsonPath - Absolute or relative path to the longmemeval_s.json file.
533
+ * **Security note:** callers must validate untrusted paths before passing them here.
534
+ * @param opts.maxBytes - Maximum allowed file size in bytes (default 512 MiB).
535
+ * @returns - Typed LmeDataset with parsed sessions and questions.
536
+ */
537
+ declare function loadLongMemEval(jsonPath: string, opts?: {
538
+ maxBytes?: number;
539
+ }): Promise<LmeDataset>;
540
+
541
+ /**
542
+ * Fair-run LongMemEval benchmark harness.
543
+ *
544
+ * Key fair-run rules (non-negotiable, documented for methodology transparency):
545
+ *
546
+ * 1. PER-QUESTION memory scope — each question gets its own fresh haystack ingested into
547
+ * a fresh Memory instance. (Unlike LoCoMo which is per-conversation, LongMemEval
548
+ * is per-question: each question has its own haystack of ~50 sessions.)
549
+ *
550
+ * 2. Dual-granularity ingest — per-turn entries carry the session date in text so
551
+ * retrieved snippets are temporally anchored; one per-session chunk entry preserves
552
+ * multi-turn context for questions whose evidence spans adjacent turns.
553
+ *
554
+ * 3. Current date context — the question_date is passed to the answer prompt so temporal
555
+ * questions can reason about recency (e.g. "last week").
556
+ *
557
+ * 4. Memory-mode answer step — retrieve topK <= 10 snippets (never inflate topK to
558
+ * bypass retrieval quality), build a prompt from retrieved snippets + question + current date.
559
+ *
560
+ * 5. Full-context mode — the MANDATORY baseline. Sessions are concatenated in date order
561
+ * with session headers; if a haystack exceeds the context cap (`fullContextMaxChars`, documented
561
+ * default 480000 chars ≈ ~120k tokens for gpt-4o-mini's 128k context), oldest sessions are truncated first and recorded.
563
+ *
564
+ * 6. Judging — strict LLM judge: correct only when model answer contains the gold answer's
565
+ * specific info (paraphrase ok; temporal: equivalent date expressions ok).
566
+ * For abstention questions: correct = model declined / said no-info / identified the
567
+ * flawed premise; fabricating a concrete answer = wrong.
568
+ *
569
+ * 7. Metrics — overall accuracy + per-question-type accuracy + abstention accuracy reported
570
+ * separately + token/cost accounting per phase + wall-clock. Full config disclosure in report.
571
+ *
572
+ * 8. Determinism — seed recorded; seeded shuffle used when questionLimit is set.
573
+ *
574
+ * 9. Resilience — per-question try/catch; checkpoint-resume via JSONL file.
575
+ */
576
+
577
+ /** Per-question scored row (also used as checkpoint entry). */
578
+ interface LmeQuestionResult {
579
+ questionId: string;
580
+ questionType: string;
581
+ isAbstention: boolean;
582
+ question: string;
583
+ goldAnswer: string;
584
+ modelAnswer: string;
585
+ correct: boolean;
586
+ /** True if the model appeared to abstain (no-info / declined). */
587
+ appearedToAbstain: boolean;
588
+ /** True if context was truncated in full-context mode. */
589
+ contextTruncated?: boolean;
590
+ error?: string;
591
+ /** Tokens used in the answer step. */
592
+ answerInputTokens: number;
593
+ answerOutputTokens: number;
594
+ /** Tokens used in the judge step. */
595
+ judgeInputTokens: number;
596
+ judgeOutputTokens: number;
597
+ }
598
+ /** Token / cost summary. */
599
+ interface LmeTokenSummary {
600
+ ingestEmbedTokens: number;
601
+ answerInputTokens: number;
602
+ answerOutputTokens: number;
603
+ judgeInputTokens: number;
604
+ judgeOutputTokens: number;
605
+ totalInputTokens: number;
606
+ totalOutputTokens: number;
607
+ }
608
+ /** Accuracy stats for a question type or overall. */
609
+ interface LmeTypeStats {
610
+ correct: number;
611
+ total: number;
612
+ accuracy: number;
613
+ }
614
+ /** Full benchmark report. */
615
+ interface LmeReport {
616
+ /** Run configuration (included in every published result for transparency). */
617
+ config: {
618
+ mode: "memory" | "full-context";
619
+ topK: number;
620
+ answerModelId: string;
621
+ judgeModelId: string;
622
+ datasetSource: typeof LONGMEMEVAL_SOURCE;
623
+ seed: number;
624
+ types: string[];
625
+ questionsRun: number;
626
+ };
627
+ /** Overall accuracy on all non-abstention questions. */
628
+ overall: LmeTypeStats;
629
+ /** Per-question-type accuracy (keys = type strings without "_abs" suffix). */
630
+ byType: Record<string, LmeTypeStats>;
631
+ /** Abstention accuracy: correct = model declined; wrong = fabricated concrete answer. */
632
+ abstentionAccuracy?: LmeTypeStats;
633
+ /** Token usage accounting. */
634
+ tokens: LmeTokenSummary;
635
+ /** Wall-clock duration in milliseconds. */
636
+ wallClockMs: number;
637
+ /** Individual question results. */
638
+ questions: LmeQuestionResult[];
639
+ /** Count of questions that threw errors (counted as wrong, not skipped). */
640
+ errorCount: number;
641
+ }
642
+ /** Factory for a fresh Memory instance, called once per question. */
643
+ type LmeMemoryFactory = (questionId: string) => Memory | Promise<Memory>;
644
+ /** Options for runLongMemEvalBench. */
645
+ interface LmeBenchOptions {
646
+ /** Path to longmemeval_s.json (required unless dataset is provided). */
647
+ dataPath?: string;
648
+ /** Pre-loaded dataset (avoids re-reading the file if already loaded). */
649
+ dataset?: LmeDataset;
650
+ /** Factory for a fresh Memory per question (required when mode="memory"). */
651
+ memoryFactory?: LmeMemoryFactory;
652
+ /** Model used to generate answers. */
653
+ answerModel: ModelPort;
654
+ /** Model used to judge correctness. */
655
+ judgeModel: ModelPort;
656
+ /** "memory" requires memoryFactory; "full-context" feeds the full haystack as context. */
657
+ mode: "memory" | "full-context";
658
+ /** Question types to include (default: all). */
659
+ types?: string[];
660
+ /** Cap on questions to process (for quick pilot runs). */
661
+ questionLimit?: number;
662
+ /** Random seed for shuffle reproducibility. Default 42. */
663
+ seed?: number;
664
+ /** Max snippets retrieved per question in memory mode. MUST be <= 10. Default 10. */
665
+ topK?: number;
666
+ /** Concurrency (questions in flight simultaneously). Default 1. */
667
+ concurrency?: number;
668
+ /** Progress callback: (questionsCompleted, questionsTotal) */
669
+ onProgress?: (done: number, total: number) => void;
670
+ /** Path to a JSONL checkpoint file. Existing rows are skipped on resume. */
671
+ checkpointPath?: string;
672
+ /**
673
+ * Max characters for full-context haystack (to avoid exceeding model context).
674
+ * Default 480000 chars (~120k tokens at ~4 chars/token, fitting gpt-4o-mini 128k).
675
+ * When exceeded, oldest sessions are dropped first and contextTruncated is recorded.
676
+ */
677
+ fullContextMaxChars?: number;
678
+ }
679
+ /**
680
+ * Run the LongMemEval benchmark with the given options.
681
+ *
682
+ * @param opts - Configuration (see LmeBenchOptions).
683
+ * @returns - Full LmeReport with metrics, token accounting, and per-question details.
684
+ */
685
+ declare function runLongMemEvalBench(opts: LmeBenchOptions): Promise<LmeReport>;
686
+
687
+ /**
688
+ * Markdown results renderer for LongMemEval benchmark reports.
689
+ *
690
+ * Produces a defensible, methodology-transparent table suitable for publication.
691
+ * Per the mandatory fair-run rules, results MUST include:
692
+ * - Model ids and judge model id
693
+ * - topK value (memory mode)
694
+ * - Dataset provenance (source URL + snapshot sha)
695
+ * - Mode (memory | full-context)
696
+ * - Seed and n-questions
697
+ * - Per-type accuracy breakdown
698
+ * - Abstention accuracy reported separately
699
+ */
700
+
701
+ /** Optional price table for cost estimates (per 1M tokens, input/output). */
702
+ interface LmePriceTable {
703
+ /** Per-million input tokens in USD. */
704
+ inputPer1M: number;
705
+ /** Per-million output tokens in USD. */
706
+ outputPer1M: number;
707
+ }
708
+ /**
709
+ * Render one or more LongMemEval benchmark reports as a Markdown table
710
+ * with mandatory methodology notes.
711
+ *
712
+ * @param reports - Array of LmeReport objects to compare.
713
+ * @param prices - Optional price table for cost-per-run estimates (per 1M tokens).
714
+ * @returns - Markdown string ready to write to a .md file.
715
+ */
716
+ declare function renderLongMemEvalReportMarkdown(reports: LmeReport[], prices?: LmePriceTable): string;
717
+
439
718
  /**
440
719
  * Write-quality benchmark for the Eidentic memory harness.
441
720
  *
@@ -746,4 +1025,4 @@ interface TemporalBenchOptions {
746
1025
  */
747
1026
  declare function runTemporalBench(memory: Memory, dataset: SyntheticTemporalDataset, opts?: TemporalBenchOptions): Promise<TemporalBenchReport>;
748
1027
 
749
- export { type BenchCase, type BenchDataset, type BenchOptions, type BenchQuestion, type BenchReport, type BenchTurn, CONTRADICTION_FIXTURES, type CaseResult, type CategoryStats, type ContradictionFixture, JUNK_STREAM_FIXTURES, type JunkItem, LOCOMO_SOURCE_SHA, type LocomoBenchOptions, type LocomoCategory, type LocomoDataset, type LocomoQA, type LocomoQuestionResult, type LocomoReport, type LocomoSample, type LocomoSession, type LocomoTurn, type MemoryFactory, type PriceTable, type QuestionResult, type StateTransition, type SyntheticTemporalDataset, type TemporalBenchOptions, type TemporalBenchReport, type TemporalEntity, type TemporalQuestion, type TemporalQuestionResult, type TokenSummary, type WriteQualityDetail, type WriteQualityOptions, type WriteQualityReport, factRecall, loadLoCoMo, loadLoCoMo$1 as loadLoCoMoLegacy, loadLongMemEval, normalizeText, normalizedIncludes, recallAtK, renderLocomoReportMarkdown, resolveEvidence, runLocomoBench, runMemoryBench, runTemporalBench, runWriteQualityBench, syntheticDataset, syntheticTemporalDataset };
1028
+ export { type BenchCase, type BenchDataset, type BenchOptions, type BenchQuestion, type BenchReport, type BenchTurn, CONTRADICTION_FIXTURES, type CaseResult, type CategoryStats, type ContradictionFixture, JUNK_STREAM_FIXTURES, type JunkItem, LOCOMO_SOURCE_SHA, LONGMEMEVAL_SOURCE, type LmeBaseType, type LmeBenchOptions, type LmeDataset, type LmeMemoryFactory, type LmePriceTable, type LmeQuestion, type LmeQuestionResult, type LmeQuestionType, type LmeReport, type LmeSession, type LmeTokenSummary, type LmeTurn, type LmeTypeStats, type LocomoBenchOptions, type LocomoCategory, type LocomoDataset, type LocomoQA, type LocomoQuestionResult, type LocomoReport, type LocomoSample, type LocomoSession, type LocomoTurn, type MemoryFactory, type PriceTable, type QuestionResult, type StateTransition, type SyntheticTemporalDataset, type TemporalBenchOptions, type TemporalBenchReport, type TemporalEntity, type TemporalQuestion, type TemporalQuestionResult, type TokenSummary, type WriteQualityDetail, type WriteQualityOptions, type WriteQualityReport, factRecall, loadLoCoMo, loadLoCoMo$1 as loadLoCoMoLegacy, loadLongMemEval, loadLongMemEval$1 as loadLongMemEvalLegacy, normalizeText, normalizedIncludes, parseLmeDateTimeString, recallAtK, renderLocomoReportMarkdown, renderLongMemEvalReportMarkdown, resolveEvidence, runLocomoBench, runLongMemEvalBench, runMemoryBench, runTemporalBench, runWriteQualityBench, syntheticDataset, syntheticTemporalDataset };