npm - @eidentic/bench - Versions diffs - 0.1.1 → 0.1.2 - Mend

@eidentic/bench 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/dist/chunk-KOMVTEKE.js +98 -0
package/dist/index.cjs +684 -3
package/dist/index.d.cts +281 -2
package/dist/index.d.ts +281 -2
package/dist/index.js +575 -3
package/dist/lme-loader-WSJ72GEP.js +10 -0
package/package.json +4 -4

package/dist/index.d.cts CHANGED Viewed

@@ -167,7 +167,7 @@ declare const syntheticDataset: BenchDataset;
  * @param opts.maxBytes - Maximum allowed file size in bytes (default 256 MiB).
  *   Increase this only if you are loading a vetted, trusted dataset file.
  */
-declare function loadLongMemEval(jsonPath: string, opts?: {
+declare function loadLongMemEval$1(jsonPath: string, opts?: {
     maxBytes?: number;
 }): Promise<BenchDataset>;
 /**
@@ -436,6 +436,285 @@ interface PriceTable {
  */
 declare function renderLocomoReportMarkdown(reports: LocomoReport[], prices?: PriceTable): string;
+/** One turn inside a parsed session. */
+interface LmeTurn {
+    role: "user" | "assistant";
+    content: string;
+    /** True when the turn contains the gold answer (may be absent in all turns). */
+    hasAnswer: boolean;
+}
+/** One session inside a question's haystack. */
+interface LmeSession {
+    /** The session's original id from the dataset. */
+    id: string;
+    /** Human-readable date-time string from the dataset, e.g. "2023/05/20 (Sat) 02:36". */
+    dateTime: string;
+    /** Epoch milliseconds parsed from dateTime (0 when parse fails). */
+    dateTimeMs: number;
+    turns: LmeTurn[];
+}
+/**
+ * LongMemEval question types (6 base types, plus `*_abs` abstention variants).
+ *
+ * Base types:
+ *   single-session-user           — fact stated by the user in one session
+ *   single-session-assistant      — fact stated by the assistant in one session
+ *   single-session-preference     — user preference expressed in one session
+ *   multi-session                 — evidence spans multiple sessions
+ *   temporal-reasoning            — requires reasoning about time/dates
+ *   knowledge-update              — the fact was updated in a later session
+ *
+ * Abstention variants (append "_abs"):
+ *   The correct answer is to recognize the information is not present / premise
+ *   is flawed and respond with "no information" rather than fabricating an answer.
+ */
+type LmeBaseType = "single-session-user" | "single-session-assistant" | "single-session-preference" | "multi-session" | "temporal-reasoning" | "knowledge-update";
+type LmeQuestionType = LmeBaseType | `${LmeBaseType}_abs` | (string & Record<never, never>);
+/** One parsed LongMemEval question with its haystack. */
+interface LmeQuestion {
+    /** Original question_id from the dataset, e.g. "e47becba" or "gpt4_xxxx". */
+    id: string;
+    /** Full question_type string (may include "_abs" suffix). */
+    type: LmeQuestionType;
+    /** Base type without any "_abs" suffix. */
+    baseType: LmeBaseType | string;
+    /** Whether this is an abstention variant (type ends with "_abs"). */
+    isAbstention: boolean;
+    question: string;
+    /** Gold answer string. For abstention questions, the correct response is to abstain. */
+    answer: string;
+    /** Human-readable question date string from the dataset. */
+    questionDate: string;
+    /** Epoch ms for the question date (0 when parse fails). */
+    questionDateMs: number;
+    /** Haystack sessions in date order. */
+    sessions: LmeSession[];
+    /** Session ids that contain the answer evidence. */
+    answerSessionIds: string[];
+}
+/** Typed dataset returned by loadLongMemEval (new real-schema loader). */
+interface LmeDataset {
+    questions: LmeQuestion[];
+}
+/**
+ * LongMemEval dataset loader — real schema.
+ *
+ * Parses the real longmemeval_s.json (or _m / _oracle) format into typed LmeDataset.
+ *
+ * Dataset source:   https://github.com/xiaowu0162/LongMemEval
+ * HuggingFace:      https://huggingface.co/datasets/xiaowu0162/longmemeval
+ * License:          MIT — results are publishable; raw data must NOT be committed.
+ *
+ * LONGMEMEVAL_SOURCE below records the upstream HuggingFace dataset repo + snapshot sha
+ * used when this loader was written, for provenance.
+ */
+/** Provenance constant for the LongMemEval dataset. */
+declare const LONGMEMEVAL_SOURCE: {
+    readonly url: "https://huggingface.co/datasets/xiaowu0162/longmemeval";
+    readonly snapshotSha: "2ec2a557f339b6c0369619b1ed5793734cc87533";
+    readonly file: "longmemeval_s";
+    readonly license: "MIT";
+};
+/**
+ * Parse a LongMemEval date-time string to epoch milliseconds.
+ *
+ * Observed format: "2023/05/20 (Sat) 02:36"
+ * Normalise: strip the "(Weekday)" token, replace "/" separators.
+ *
+ * Returns 0 when parsing fails (the date is optional / informational).
+ */
+declare function parseLmeDateTimeString(raw: string): number;
+/**
+ * Load a LongMemEval JSON file (real schema — JSON array of questions).
+ *
+ * @param jsonPath  - Absolute or relative path to the longmemeval_s.json file.
+ *   **Security note:** callers must validate untrusted paths before passing them here.
+ * @param opts.maxBytes  - Maximum allowed file size in bytes (default 512 MiB).
+ * @returns          - Typed LmeDataset with parsed sessions and questions.
+ */
+declare function loadLongMemEval(jsonPath: string, opts?: {
+    maxBytes?: number;
+}): Promise<LmeDataset>;
+/**
+ * Fair-run LongMemEval benchmark harness.
+ *
+ * Key fair-run rules (non-negotiable, documented for methodology transparency):
+ *
+ * 1. PER-QUESTION memory scope — each question gets its own fresh haystack ingested into
+ *    a fresh Memory instance. (Unlike LoCoMo which is per-conversation, LongMemEval
+ *    is per-question: each question has its own haystack of ~50 sessions.)
+ *
+ * 2. Dual-granularity ingest — per-turn entries carry the session date in text so
+ *    retrieved snippets are temporally anchored; one per-session chunk entry preserves
+ *    multi-turn context for questions whose evidence spans adjacent turns.
+ *
+ * 3. Current date context — the question_date is passed to the answer prompt so temporal
+ *    questions can reason about recency (e.g. "last week").
+ *
+ * 4. Memory-mode answer step — retrieve topK <= 10 snippets (never inflate topK to
+ *    bypass retrieval quality), build a prompt from retrieved snippets + question + current date.
+ *
+ * 5. Full-context mode — the MANDATORY baseline. Sessions are concatenated in date order
+ *    with session headers; if a haystack exceeds the context cap (default 480000 chars ≈ ~120k
+ *    tokens, for gpt-4o-mini's 128k context), oldest sessions are truncated first and recorded.
+ *
+ * 6. Judging — strict LLM judge: correct only when model answer contains the gold answer's
+ *    specific info (paraphrase ok; temporal: equivalent date expressions ok).
+ *    For abstention questions: correct = model declined / said no-info / identified the
+ *    flawed premise; fabricating a concrete answer = wrong.
+ *
+ * 7. Metrics — overall accuracy + per-question-type accuracy + abstention accuracy reported
+ *    separately + token/cost accounting per phase + wall-clock. Full config disclosure in report.
+ *
+ * 8. Determinism — seed recorded; seeded shuffle used when questionLimit is set.
+ *
+ * 9. Resilience — per-question try/catch; checkpoint-resume via JSONL file.
+ */
+/** Per-question scored row (also used as checkpoint entry). */
+interface LmeQuestionResult {
+    questionId: string;
+    questionType: string;
+    isAbstention: boolean;
+    question: string;
+    goldAnswer: string;
+    modelAnswer: string;
+    correct: boolean;
+    /** True if the model appeared to abstain (no-info / declined). */
+    appearedToAbstain: boolean;
+    /** True if context was truncated in full-context mode. */
+    contextTruncated?: boolean;
+    error?: string;
+    /** Tokens used in the answer step. */
+    answerInputTokens: number;
+    answerOutputTokens: number;
+    /** Tokens used in the judge step. */
+    judgeInputTokens: number;
+    judgeOutputTokens: number;
+}
+/** Token / cost summary. */
+interface LmeTokenSummary {
+    ingestEmbedTokens: number;
+    answerInputTokens: number;
+    answerOutputTokens: number;
+    judgeInputTokens: number;
+    judgeOutputTokens: number;
+    totalInputTokens: number;
+    totalOutputTokens: number;
+}
+/** Accuracy stats for a question type or overall. */
+interface LmeTypeStats {
+    correct: number;
+    total: number;
+    accuracy: number;
+}
+/** Full benchmark report. */
+interface LmeReport {
+    /** Run configuration (included in every published result for transparency). */
+    config: {
+        mode: "memory" | "full-context";
+        topK: number;
+        answerModelId: string;
+        judgeModelId: string;
+        datasetSource: typeof LONGMEMEVAL_SOURCE;
+        seed: number;
+        types: string[];
+        questionsRun: number;
+    };
+    /** Overall accuracy on all non-abstention questions. */
+    overall: LmeTypeStats;
+    /** Per-question-type accuracy (keys = type strings without "_abs" suffix). */
+    byType: Record<string, LmeTypeStats>;
+    /** Abstention accuracy: correct = model declined; wrong = fabricated concrete answer. */
+    abstentionAccuracy?: LmeTypeStats;
+    /** Token usage accounting. */
+    tokens: LmeTokenSummary;
+    /** Wall-clock duration in milliseconds. */
+    wallClockMs: number;
+    /** Individual question results. */
+    questions: LmeQuestionResult[];
+    /** Count of questions that threw errors (counted as wrong, not skipped). */
+    errorCount: number;
+}
+/** Factory for a fresh Memory instance, called once per question. */
+type LmeMemoryFactory = (questionId: string) => Memory | Promise<Memory>;
+/** Options for runLongMemEvalBench. */
+interface LmeBenchOptions {
+    /** Path to longmemeval_s.json (required unless dataset is provided). */
+    dataPath?: string;
+    /** Pre-loaded dataset (avoids re-reading the file if already loaded). */
+    dataset?: LmeDataset;
+    /** Factory for a fresh Memory per question (required when mode="memory"). */
+    memoryFactory?: LmeMemoryFactory;
+    /** Model used to generate answers. */
+    answerModel: ModelPort;
+    /** Model used to judge correctness. */
+    judgeModel: ModelPort;
+    /** "memory" requires memoryFactory; "full-context" feeds the full haystack as context. */
+    mode: "memory" | "full-context";
+    /** Question types to include (default: all). */
+    types?: string[];
+    /** Cap on questions to process (for quick pilot runs). */
+    questionLimit?: number;
+    /** Random seed for shuffle reproducibility. Default 42. */
+    seed?: number;
+    /** Max snippets retrieved per question in memory mode. MUST be <= 10. Default 10. */
+    topK?: number;
+    /** Concurrency (questions in flight simultaneously). Default 1. */
+    concurrency?: number;
+    /** Progress callback: (questionsCompleted, questionsTotal) */
+    onProgress?: (done: number, total: number) => void;
+    /** Path to a JSONL checkpoint file. Existing rows are skipped on resume. */
+    checkpointPath?: string;
+    /**
+     * Max characters for full-context haystack (to avoid exceeding model context).
+     * Default 480000 chars (~120k tokens at ~4 chars/token, fitting gpt-4o-mini 128k).
+     * When exceeded, oldest sessions are dropped first and contextTruncated is recorded.
+     */
+    fullContextMaxChars?: number;
+}
+/**
+ * Run the LongMemEval benchmark with the given options.
+ *
+ * @param opts - Configuration (see LmeBenchOptions).
+ * @returns    - Full LmeReport with metrics, token accounting, and per-question details.
+ */
+declare function runLongMemEvalBench(opts: LmeBenchOptions): Promise<LmeReport>;
+/**
+ * Markdown results renderer for LongMemEval benchmark reports.
+ *
+ * Produces a defensible, methodology-transparent table suitable for publication.
+ * Per the mandatory fair-run rules, results MUST include:
+ *   - Model ids and judge model id
+ *   - topK value (memory mode)
+ *   - Dataset provenance (source URL + snapshot sha)
+ *   - Mode (memory | full-context)
+ *   - Seed and n-questions
+ *   - Per-type accuracy breakdown
+ *   - Abstention accuracy reported separately
+ */
+/** Optional price table for cost estimates (per 1M tokens, input/output). */
+interface LmePriceTable {
+    /** Per-million input tokens in USD. */
+    inputPer1M: number;
+    /** Per-million output tokens in USD. */
+    outputPer1M: number;
+}
+/**
+ * Render one or more LongMemEval benchmark reports as a Markdown table
+ * with mandatory methodology notes.
+ *
+ * @param reports  - Array of LmeReport objects to compare.
+ * @param prices   - Optional price table for cost-per-run estimates (per 1M tokens).
+ * @returns        - Markdown string ready to write to a .md file.
+ */
+declare function renderLongMemEvalReportMarkdown(reports: LmeReport[], prices?: LmePriceTable): string;
 /**
  * Write-quality benchmark for the Eidentic memory harness.
  *
@@ -746,4 +1025,4 @@ interface TemporalBenchOptions {
  */
 declare function runTemporalBench(memory: Memory, dataset: SyntheticTemporalDataset, opts?: TemporalBenchOptions): Promise<TemporalBenchReport>;
-export { type BenchCase, type BenchDataset, type BenchOptions, type BenchQuestion, type BenchReport, type BenchTurn, CONTRADICTION_FIXTURES, type CaseResult, type CategoryStats, type ContradictionFixture, JUNK_STREAM_FIXTURES, type JunkItem, LOCOMO_SOURCE_SHA, type LocomoBenchOptions, type LocomoCategory, type LocomoDataset, type LocomoQA, type LocomoQuestionResult, type LocomoReport, type LocomoSample, type LocomoSession, type LocomoTurn, type MemoryFactory, type PriceTable, type QuestionResult, type StateTransition, type SyntheticTemporalDataset, type TemporalBenchOptions, type TemporalBenchReport, type TemporalEntity, type TemporalQuestion, type TemporalQuestionResult, type TokenSummary, type WriteQualityDetail, type WriteQualityOptions, type WriteQualityReport, factRecall, loadLoCoMo, loadLoCoMo$1 as loadLoCoMoLegacy, loadLongMemEval, normalizeText, normalizedIncludes, recallAtK, renderLocomoReportMarkdown, resolveEvidence, runLocomoBench, runMemoryBench, runTemporalBench, runWriteQualityBench, syntheticDataset, syntheticTemporalDataset };
+export { type BenchCase, type BenchDataset, type BenchOptions, type BenchQuestion, type BenchReport, type BenchTurn, CONTRADICTION_FIXTURES, type CaseResult, type CategoryStats, type ContradictionFixture, JUNK_STREAM_FIXTURES, type JunkItem, LOCOMO_SOURCE_SHA, LONGMEMEVAL_SOURCE, type LmeBaseType, type LmeBenchOptions, type LmeDataset, type LmeMemoryFactory, type LmePriceTable, type LmeQuestion, type LmeQuestionResult, type LmeQuestionType, type LmeReport, type LmeSession, type LmeTokenSummary, type LmeTurn, type LmeTypeStats, type LocomoBenchOptions, type LocomoCategory, type LocomoDataset, type LocomoQA, type LocomoQuestionResult, type LocomoReport, type LocomoSample, type LocomoSession, type LocomoTurn, type MemoryFactory, type PriceTable, type QuestionResult, type StateTransition, type SyntheticTemporalDataset, type TemporalBenchOptions, type TemporalBenchReport, type TemporalEntity, type TemporalQuestion, type TemporalQuestionResult, type TokenSummary, type WriteQualityDetail, type WriteQualityOptions, type WriteQualityReport, factRecall, loadLoCoMo, loadLoCoMo$1 as loadLoCoMoLegacy, loadLongMemEval, loadLongMemEval$1 as loadLongMemEvalLegacy, normalizeText, normalizedIncludes, parseLmeDateTimeString, recallAtK, renderLocomoReportMarkdown, renderLongMemEvalReportMarkdown, resolveEvidence, runLocomoBench, runLongMemEvalBench, runMemoryBench, runTemporalBench, runWriteQualityBench, syntheticDataset, syntheticTemporalDataset };

package/dist/index.d.ts CHANGED Viewed

@@ -167,7 +167,7 @@ declare const syntheticDataset: BenchDataset;
  * @param opts.maxBytes - Maximum allowed file size in bytes (default 256 MiB).
  *   Increase this only if you are loading a vetted, trusted dataset file.
  */
-declare function loadLongMemEval(jsonPath: string, opts?: {
+declare function loadLongMemEval$1(jsonPath: string, opts?: {
     maxBytes?: number;
 }): Promise<BenchDataset>;
 /**
@@ -436,6 +436,285 @@ interface PriceTable {
  */
 declare function renderLocomoReportMarkdown(reports: LocomoReport[], prices?: PriceTable): string;
+/** One turn inside a parsed session. */
+interface LmeTurn {
+    role: "user" | "assistant";
+    content: string;
+    /** True when the turn contains the gold answer (may be absent in all turns). */
+    hasAnswer: boolean;
+}
+/** One session inside a question's haystack. */
+interface LmeSession {
+    /** The session's original id from the dataset. */
+    id: string;
+    /** Human-readable date-time string from the dataset, e.g. "2023/05/20 (Sat) 02:36". */
+    dateTime: string;
+    /** Epoch milliseconds parsed from dateTime (0 when parse fails). */
+    dateTimeMs: number;
+    turns: LmeTurn[];
+}
+/**
+ * LongMemEval question types (6 base types, plus `*_abs` abstention variants).
+ *
+ * Base types:
+ *   single-session-user           — fact stated by the user in one session
+ *   single-session-assistant      — fact stated by the assistant in one session
+ *   single-session-preference     — user preference expressed in one session
+ *   multi-session                 — evidence spans multiple sessions
+ *   temporal-reasoning            — requires reasoning about time/dates
+ *   knowledge-update              — the fact was updated in a later session
+ *
+ * Abstention variants (append "_abs"):
+ *   The correct answer is to recognize the information is not present / premise
+ *   is flawed and respond with "no information" rather than fabricating an answer.
+ */
+type LmeBaseType = "single-session-user" | "single-session-assistant" | "single-session-preference" | "multi-session" | "temporal-reasoning" | "knowledge-update";
+type LmeQuestionType = LmeBaseType | `${LmeBaseType}_abs` | (string & Record<never, never>);
+/** One parsed LongMemEval question with its haystack. */
+interface LmeQuestion {
+    /** Original question_id from the dataset, e.g. "e47becba" or "gpt4_xxxx". */
+    id: string;
+    /** Full question_type string (may include "_abs" suffix). */
+    type: LmeQuestionType;
+    /** Base type without any "_abs" suffix. */
+    baseType: LmeBaseType | string;
+    /** Whether this is an abstention variant (type ends with "_abs"). */
+    isAbstention: boolean;
+    question: string;
+    /** Gold answer string. For abstention questions, the correct response is to abstain. */
+    answer: string;
+    /** Human-readable question date string from the dataset. */
+    questionDate: string;
+    /** Epoch ms for the question date (0 when parse fails). */
+    questionDateMs: number;
+    /** Haystack sessions in date order. */
+    sessions: LmeSession[];
+    /** Session ids that contain the answer evidence. */
+    answerSessionIds: string[];
+}
+/** Typed dataset returned by loadLongMemEval (new real-schema loader). */
+interface LmeDataset {
+    questions: LmeQuestion[];
+}
+/**
+ * LongMemEval dataset loader — real schema.
+ *
+ * Parses the real longmemeval_s.json (or _m / _oracle) format into typed LmeDataset.
+ *
+ * Dataset source:   https://github.com/xiaowu0162/LongMemEval
+ * HuggingFace:      https://huggingface.co/datasets/xiaowu0162/longmemeval
+ * License:          MIT — results are publishable; raw data must NOT be committed.
+ *
+ * LONGMEMEVAL_SOURCE below records the upstream HuggingFace dataset repo + snapshot sha
+ * used when this loader was written, for provenance.
+ */
+/** Provenance constant for the LongMemEval dataset. */
+declare const LONGMEMEVAL_SOURCE: {
+    readonly url: "https://huggingface.co/datasets/xiaowu0162/longmemeval";
+    readonly snapshotSha: "2ec2a557f339b6c0369619b1ed5793734cc87533";
+    readonly file: "longmemeval_s";
+    readonly license: "MIT";
+};
+/**
+ * Parse a LongMemEval date-time string to epoch milliseconds.
+ *
+ * Observed format: "2023/05/20 (Sat) 02:36"
+ * Normalise: strip the "(Weekday)" token, replace "/" separators.
+ *
+ * Returns 0 when parsing fails (the date is optional / informational).
+ */
+declare function parseLmeDateTimeString(raw: string): number;
+/**
+ * Load a LongMemEval JSON file (real schema — JSON array of questions).
+ *
+ * @param jsonPath  - Absolute or relative path to the longmemeval_s.json file.
+ *   **Security note:** callers must validate untrusted paths before passing them here.
+ * @param opts.maxBytes  - Maximum allowed file size in bytes (default 512 MiB).
+ * @returns          - Typed LmeDataset with parsed sessions and questions.
+ */
+declare function loadLongMemEval(jsonPath: string, opts?: {
+    maxBytes?: number;
+}): Promise<LmeDataset>;
+/**
+ * Fair-run LongMemEval benchmark harness.
+ *
+ * Key fair-run rules (non-negotiable, documented for methodology transparency):
+ *
+ * 1. PER-QUESTION memory scope — each question gets its own fresh haystack ingested into
+ *    a fresh Memory instance. (Unlike LoCoMo which is per-conversation, LongMemEval
+ *    is per-question: each question has its own haystack of ~50 sessions.)
+ *
+ * 2. Dual-granularity ingest — per-turn entries carry the session date in text so
+ *    retrieved snippets are temporally anchored; one per-session chunk entry preserves
+ *    multi-turn context for questions whose evidence spans adjacent turns.
+ *
+ * 3. Current date context — the question_date is passed to the answer prompt so temporal
+ *    questions can reason about recency (e.g. "last week").
+ *
+ * 4. Memory-mode answer step — retrieve topK <= 10 snippets (never inflate topK to
+ *    bypass retrieval quality), build a prompt from retrieved snippets + question + current date.
+ *
+ * 5. Full-context mode — the MANDATORY baseline. Sessions are concatenated in date order
+ *    with session headers; if a haystack exceeds the context cap (default 480000 chars ≈ ~120k
+ *    tokens, for gpt-4o-mini's 128k context), oldest sessions are truncated first and recorded.
+ *
+ * 6. Judging — strict LLM judge: correct only when model answer contains the gold answer's
+ *    specific info (paraphrase ok; temporal: equivalent date expressions ok).
+ *    For abstention questions: correct = model declined / said no-info / identified the
+ *    flawed premise; fabricating a concrete answer = wrong.
+ *
+ * 7. Metrics — overall accuracy + per-question-type accuracy + abstention accuracy reported
+ *    separately + token/cost accounting per phase + wall-clock. Full config disclosure in report.
+ *
+ * 8. Determinism — seed recorded; seeded shuffle used when questionLimit is set.
+ *
+ * 9. Resilience — per-question try/catch; checkpoint-resume via JSONL file.
+ */
+/** Per-question scored row (also used as checkpoint entry). */
+interface LmeQuestionResult {
+    questionId: string;
+    questionType: string;
+    isAbstention: boolean;
+    question: string;
+    goldAnswer: string;
+    modelAnswer: string;
+    correct: boolean;
+    /** True if the model appeared to abstain (no-info / declined). */
+    appearedToAbstain: boolean;
+    /** True if context was truncated in full-context mode. */
+    contextTruncated?: boolean;
+    error?: string;
+    /** Tokens used in the answer step. */
+    answerInputTokens: number;
+    answerOutputTokens: number;
+    /** Tokens used in the judge step. */
+    judgeInputTokens: number;
+    judgeOutputTokens: number;
+}
+/** Token / cost summary. */
+interface LmeTokenSummary {
+    ingestEmbedTokens: number;
+    answerInputTokens: number;
+    answerOutputTokens: number;
+    judgeInputTokens: number;
+    judgeOutputTokens: number;
+    totalInputTokens: number;
+    totalOutputTokens: number;
+}
+/** Accuracy stats for a question type or overall. */
+interface LmeTypeStats {
+    correct: number;
+    total: number;
+    accuracy: number;
+}
+/** Full benchmark report. */
+interface LmeReport {
+    /** Run configuration (included in every published result for transparency). */
+    config: {
+        mode: "memory" | "full-context";
+        topK: number;
+        answerModelId: string;
+        judgeModelId: string;
+        datasetSource: typeof LONGMEMEVAL_SOURCE;
+        seed: number;
+        types: string[];
+        questionsRun: number;
+    };
+    /** Overall accuracy on all non-abstention questions. */
+    overall: LmeTypeStats;
+    /** Per-question-type accuracy (keys = type strings without "_abs" suffix). */
+    byType: Record<string, LmeTypeStats>;
+    /** Abstention accuracy: correct = model declined; wrong = fabricated concrete answer. */
+    abstentionAccuracy?: LmeTypeStats;
+    /** Token usage accounting. */
+    tokens: LmeTokenSummary;
+    /** Wall-clock duration in milliseconds. */
+    wallClockMs: number;
+    /** Individual question results. */
+    questions: LmeQuestionResult[];
+    /** Count of questions that threw errors (counted as wrong, not skipped). */
+    errorCount: number;
+}
+/** Factory for a fresh Memory instance, called once per question. */
+type LmeMemoryFactory = (questionId: string) => Memory | Promise<Memory>;
+/** Options for runLongMemEvalBench. */
+interface LmeBenchOptions {
+    /** Path to longmemeval_s.json (required unless dataset is provided). */
+    dataPath?: string;
+    /** Pre-loaded dataset (avoids re-reading the file if already loaded). */
+    dataset?: LmeDataset;
+    /** Factory for a fresh Memory per question (required when mode="memory"). */
+    memoryFactory?: LmeMemoryFactory;
+    /** Model used to generate answers. */
+    answerModel: ModelPort;
+    /** Model used to judge correctness. */
+    judgeModel: ModelPort;
+    /** "memory" requires memoryFactory; "full-context" feeds the full haystack as context. */
+    mode: "memory" | "full-context";
+    /** Question types to include (default: all). */
+    types?: string[];
+    /** Cap on questions to process (for quick pilot runs). */
+    questionLimit?: number;
+    /** Random seed for shuffle reproducibility. Default 42. */
+    seed?: number;
+    /** Max snippets retrieved per question in memory mode. MUST be <= 10. Default 10. */
+    topK?: number;
+    /** Concurrency (questions in flight simultaneously). Default 1. */
+    concurrency?: number;
+    /** Progress callback: (questionsCompleted, questionsTotal) */
+    onProgress?: (done: number, total: number) => void;
+    /** Path to a JSONL checkpoint file. Existing rows are skipped on resume. */
+    checkpointPath?: string;
+    /**
+     * Max characters for full-context haystack (to avoid exceeding model context).
+     * Default 480000 chars (~120k tokens at ~4 chars/token, fitting gpt-4o-mini 128k).
+     * When exceeded, oldest sessions are dropped first and contextTruncated is recorded.
+     */
+    fullContextMaxChars?: number;
+}
+/**
+ * Run the LongMemEval benchmark with the given options.
+ *
+ * @param opts - Configuration (see LmeBenchOptions).
+ * @returns    - Full LmeReport with metrics, token accounting, and per-question details.
+ */
+declare function runLongMemEvalBench(opts: LmeBenchOptions): Promise<LmeReport>;
+/**
+ * Markdown results renderer for LongMemEval benchmark reports.
+ *
+ * Produces a defensible, methodology-transparent table suitable for publication.
+ * Per the mandatory fair-run rules, results MUST include:
+ *   - Model ids and judge model id
+ *   - topK value (memory mode)
+ *   - Dataset provenance (source URL + snapshot sha)
+ *   - Mode (memory | full-context)
+ *   - Seed and n-questions
+ *   - Per-type accuracy breakdown
+ *   - Abstention accuracy reported separately
+ */
+/** Optional price table for cost estimates (per 1M tokens, input/output). */
+interface LmePriceTable {
+    /** Per-million input tokens in USD. */
+    inputPer1M: number;
+    /** Per-million output tokens in USD. */
+    outputPer1M: number;
+}
+/**
+ * Render one or more LongMemEval benchmark reports as a Markdown table
+ * with mandatory methodology notes.
+ *
+ * @param reports  - Array of LmeReport objects to compare.
+ * @param prices   - Optional price table for cost-per-run estimates (per 1M tokens).
+ * @returns        - Markdown string ready to write to a .md file.
+ */
+declare function renderLongMemEvalReportMarkdown(reports: LmeReport[], prices?: LmePriceTable): string;
 /**
  * Write-quality benchmark for the Eidentic memory harness.
  *
@@ -746,4 +1025,4 @@ interface TemporalBenchOptions {
  */
 declare function runTemporalBench(memory: Memory, dataset: SyntheticTemporalDataset, opts?: TemporalBenchOptions): Promise<TemporalBenchReport>;
-export { type BenchCase, type BenchDataset, type BenchOptions, type BenchQuestion, type BenchReport, type BenchTurn, CONTRADICTION_FIXTURES, type CaseResult, type CategoryStats, type ContradictionFixture, JUNK_STREAM_FIXTURES, type JunkItem, LOCOMO_SOURCE_SHA, type LocomoBenchOptions, type LocomoCategory, type LocomoDataset, type LocomoQA, type LocomoQuestionResult, type LocomoReport, type LocomoSample, type LocomoSession, type LocomoTurn, type MemoryFactory, type PriceTable, type QuestionResult, type StateTransition, type SyntheticTemporalDataset, type TemporalBenchOptions, type TemporalBenchReport, type TemporalEntity, type TemporalQuestion, type TemporalQuestionResult, type TokenSummary, type WriteQualityDetail, type WriteQualityOptions, type WriteQualityReport, factRecall, loadLoCoMo, loadLoCoMo$1 as loadLoCoMoLegacy, loadLongMemEval, normalizeText, normalizedIncludes, recallAtK, renderLocomoReportMarkdown, resolveEvidence, runLocomoBench, runMemoryBench, runTemporalBench, runWriteQualityBench, syntheticDataset, syntheticTemporalDataset };
+export { type BenchCase, type BenchDataset, type BenchOptions, type BenchQuestion, type BenchReport, type BenchTurn, CONTRADICTION_FIXTURES, type CaseResult, type CategoryStats, type ContradictionFixture, JUNK_STREAM_FIXTURES, type JunkItem, LOCOMO_SOURCE_SHA, LONGMEMEVAL_SOURCE, type LmeBaseType, type LmeBenchOptions, type LmeDataset, type LmeMemoryFactory, type LmePriceTable, type LmeQuestion, type LmeQuestionResult, type LmeQuestionType, type LmeReport, type LmeSession, type LmeTokenSummary, type LmeTurn, type LmeTypeStats, type LocomoBenchOptions, type LocomoCategory, type LocomoDataset, type LocomoQA, type LocomoQuestionResult, type LocomoReport, type LocomoSample, type LocomoSession, type LocomoTurn, type MemoryFactory, type PriceTable, type QuestionResult, type StateTransition, type SyntheticTemporalDataset, type TemporalBenchOptions, type TemporalBenchReport, type TemporalEntity, type TemporalQuestion, type TemporalQuestionResult, type TokenSummary, type WriteQualityDetail, type WriteQualityOptions, type WriteQualityReport, factRecall, loadLoCoMo, loadLoCoMo$1 as loadLoCoMoLegacy, loadLongMemEval, loadLongMemEval$1 as loadLongMemEvalLegacy, normalizeText, normalizedIncludes, parseLmeDateTimeString, recallAtK, renderLocomoReportMarkdown, renderLongMemEvalReportMarkdown, resolveEvidence, runLocomoBench, runLongMemEvalBench, runMemoryBench, runTemporalBench, runWriteQualityBench, syntheticDataset, syntheticTemporalDataset };