npm - @eidentic/bench - Versions diffs - 0.1.1 → 0.1.2 - Mend

@eidentic/bench 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

package/dist/chunk-KOMVTEKE.js +98 -0
package/dist/index.cjs +684 -3
package/dist/index.d.cts +281 -2
package/dist/index.d.ts +281 -2
package/dist/index.js +575 -3
package/dist/lme-loader-WSJ72GEP.js +10 -0
package/package.json +4 -4

package/dist/index.cjs CHANGED Viewed

@@ -136,22 +136,132 @@ var init_locomo_loader = __esm({
   }
 });
+// src/lme-loader.ts
+// Namespace object for src/lme-loader.ts. Each entry below is an arrow
+// function that returns the current value of the named binding; note that the
+// public name `loadLongMemEval` maps to the local function `loadLongMemEval2`.
+var lme_loader_exports = {};
+__export(lme_loader_exports, {
+  LONGMEMEVAL_SOURCE: () => LONGMEMEVAL_SOURCE,
+  loadLongMemEval: () => loadLongMemEval2,
+  parseLmeDateTimeString: () => parseLmeDateTimeString
+});
+/**
+ * Verifies that `filePath` can be stat'ed and is no larger than `maxBytes`.
+ *
+ * @param {string} filePath - Path of the file to check.
+ * @param {number} maxBytes - Largest accepted size in bytes (inclusive).
+ * @returns {Promise<void>} Resolves with no value when the file is within the cap.
+ * @throws {Error} When the file cannot be stat'ed (the underlying error is
+ *   attached as `cause`) or when its size exceeds `maxBytes`.
+ */
+async function assertFileSize3(filePath, maxBytes) {
+  let fileSize;
+  try {
+    const s = await (0, import_promises4.stat)(filePath);
+    fileSize = s.size;
+  } catch (err) {
+    // A thrown value is not guaranteed to be an Error; fall back to String().
+    const reason = err instanceof Error ? err.message : String(err);
+    throw new Error(
+      `bench loader: cannot stat file "${filePath}": ${reason}`,
+      // Keep the original error (code, stack) reachable for callers.
+      { cause: err }
+    );
+  }
+  if (fileSize > maxBytes) {
+    const mb = (fileSize / (1024 * 1024)).toFixed(1);
+    const capMb = (maxBytes / (1024 * 1024)).toFixed(0);
+    throw new Error(
+      `bench loader: file "${filePath}" is ${mb} MiB, which exceeds the ${capMb} MiB cap. Pass a larger maxBytes option if this is intentional.`
+    );
+  }
+}
+/**
+ * Converts a LongMemEval-style timestamp such as "2023/05/20 (Sat) 02:21"
+ * into epoch milliseconds.
+ *
+ * The parenthesised weekday is dropped and a leading "YYYY/MM/DD" is rewritten
+ * to "YYYY-MM-DD". A "YYYY-MM-DD HH:mm" value is first tried in the
+ * standardised "YYYY-MM-DDTHH:mm" form (a date-time without an offset is
+ * interpreted as local time); if that does not parse, the space-separated
+ * form is handed to `Date.parse` as a fallback.
+ *
+ * @param {string} raw - Timestamp text; empty, missing or non-string input yields 0.
+ * @returns {number} Epoch milliseconds, or 0 when the text cannot be parsed.
+ */
+function parseLmeDateTimeString(raw) {
+  // Non-string input used to reach `.replace` and throw a TypeError.
+  if (typeof raw !== "string" || raw === "") return 0;
+  const cleaned = raw.replace(/\s*\([A-Za-z]+\)\s*/, " ").trim();
+  const dashed = cleaned.replace(/^(\d{4})\/(\d{2})\/(\d{2})/, "$1-$2-$3");
+  // Prefer the spec-defined format over engine-specific lenient parsing.
+  const isoLocal = dashed.replace(/^(\d{4}-\d{2}-\d{2})\s+(\d{2}:\d{2})/, "$1T$2");
+  let ms = Date.parse(isoLocal);
+  if (Number.isNaN(ms)) ms = Date.parse(dashed);
+  return Number.isNaN(ms) ? 0 : ms;
+}
+/**
+ * Returns `rawType` without a trailing "_abs" marker; any other value is
+ * returned unchanged.
+ *
+ * @param {string} rawType - Question type as found in the dataset.
+ * @returns {string} The type with the "_abs" suffix removed, if it had one.
+ */
+function extractBaseType(rawType) {
+  const ABS_SUFFIX = "_abs";
+  return rawType.endsWith(ABS_SUFFIX) ? rawType.slice(0, -ABS_SUFFIX.length) : rawType;
+}
+/**
+ * Builds a normalised session record from raw dataset turns.
+ *
+ * Every role other than "assistant" is mapped to "user", missing content
+ * becomes "", and `hasAnswer` is true only when `has_answer` is exactly `true`.
+ * A null/undefined turn entry is treated as an empty user turn instead of
+ * crashing, so turn indices stay aligned with the raw data.
+ *
+ * @param {string} id - Session identifier.
+ * @param {string} dateTime - Raw session timestamp text.
+ * @param {Array<object>} rawTurns - Raw turn objects (`role`, `content`, `has_answer`).
+ * @returns {{id: string, dateTime: string, dateTimeMs: number, turns: Array<object>}}
+ */
+function parseSession(id, dateTime, rawTurns) {
+  const turns = rawTurns.map((t) => ({
+    role: t?.role === "assistant" ? "assistant" : "user",
+    content: t?.content ?? "",
+    hasAnswer: t?.has_answer === true
+  }));
+  return {
+    id,
+    dateTime,
+    dateTimeMs: parseLmeDateTimeString(dateTime),
+    turns
+  };
+}
+/**
+ * Loads a LongMemEval JSON file and normalises it into `{ questions }`.
+ *
+ * For each entry the haystack sessions are paired by index with
+ * `haystack_dates` and `haystack_session_ids`, parsed, and sorted by parsed
+ * timestamp (sessions whose date does not parse have `dateTimeMs` 0 and sort
+ * first). Missing fields fall back to "" / [] / the entry index.
+ *
+ * An entry is flagged as an abstention question when either its
+ * `question_type` or its `question_id` ends in "_abs"; the dataset's own
+ * convention marks abstention items through the `question_id` suffix.
+ *
+ * @param {string} jsonPath - Path to the dataset JSON file.
+ * @param {{maxBytes?: number}} [opts] - Optional size cap override.
+ * @returns {Promise<{questions: Array<object>}>}
+ * @throws {Error} When the file is too large, is not valid JSON, is not a JSON
+ *   array, or contains an entry that is not an object.
+ */
+async function loadLongMemEval2(jsonPath, opts) {
+  await assertFileSize3(jsonPath, opts?.maxBytes ?? DEFAULT_MAX_BYTES3);
+  const text = await (0, import_promises4.readFile)(jsonPath, "utf-8");
+  let raw;
+  try {
+    raw = JSON.parse(text);
+  } catch (err) {
+    // A bare SyntaxError does not say which file was malformed.
+    throw new Error(
+      `bench loader: file "${jsonPath}" is not valid JSON: ${err.message}`,
+      { cause: err }
+    );
+  }
+  if (!Array.isArray(raw)) {
+    throw new Error(
+      `bench loader: expected the LongMemEval JSON root to be an array, but got ${typeof raw}. Did you pass the correct file?`
+    );
+  }
+  const questions = raw.map((q, i) => {
+    if (q === null || typeof q !== "object") {
+      throw new Error(
+        `bench loader: LongMemEval entry ${i} is not an object (got ${q === null ? "null" : typeof q}).`
+      );
+    }
+    const id = q.question_id ?? String(i);
+    const rawType = q.question_type ?? "single-session-user";
+    const baseType = extractBaseType(rawType);
+    // Accept both markers: the type suffix and the question_id suffix.
+    const isAbstention = rawType.endsWith("_abs") || String(id).endsWith("_abs");
+    const rawSessions = Array.isArray(q.haystack_sessions) ? q.haystack_sessions : [];
+    const dates = Array.isArray(q.haystack_dates) ? q.haystack_dates : [];
+    const sessionIds = Array.isArray(q.haystack_session_ids) ? q.haystack_session_ids : [];
+    const sessions = rawSessions.map((turns, idx) => {
+      const sessId = sessionIds[idx] ?? `sess-${idx}`;
+      const dateTime = dates[idx] ?? "";
+      return parseSession(sessId, dateTime, Array.isArray(turns) ? turns : []);
+    });
+    sessions.sort((a, b) => a.dateTimeMs - b.dateTimeMs);
+    return {
+      id,
+      type: rawType,
+      baseType,
+      isAbstention,
+      question: q.question ?? "",
+      answer: q.answer ?? "",
+      questionDate: q.question_date ?? "",
+      questionDateMs: parseLmeDateTimeString(q.question_date ?? ""),
+      sessions,
+      answerSessionIds: Array.isArray(q.answer_session_ids) ? q.answer_session_ids : []
+    };
+  });
+  return { questions };
+}
+// Bindings used by the lme-loader functions. They are declared here and only
+// receive their values when the callback passed to __esm below runs.
+var import_promises4, LONGMEMEVAL_SOURCE, DEFAULT_MAX_BYTES3;
+var init_lme_loader = __esm({
+  "src/lme-loader.ts"() {
+    "use strict";
+    import_promises4 = require("node:fs/promises");
+    // Provenance record for the dataset: source URL, snapshot SHA, file name
+    // and licence. The report renderer prints these fields.
+    LONGMEMEVAL_SOURCE = {
+      url: "https://huggingface.co/datasets/xiaowu0162/longmemeval",
+      snapshotSha: "2ec2a557f339b6c0369619b1ed5793734cc87533",
+      file: "longmemeval_s",
+      license: "MIT"
+    };
+    // Default file-size cap for the loader: 512 MiB.
+    DEFAULT_MAX_BYTES3 = 512 * 1024 * 1024;
+  }
+});
 // src/index.ts
 var index_exports = {};
 __export(index_exports, {
   CONTRADICTION_FIXTURES: () => CONTRADICTION_FIXTURES,
   JUNK_STREAM_FIXTURES: () => JUNK_STREAM_FIXTURES,
   LOCOMO_SOURCE_SHA: () => LOCOMO_SOURCE_SHA,
+  LONGMEMEVAL_SOURCE: () => LONGMEMEVAL_SOURCE,
   factRecall: () => factRecall,
   loadLoCoMo: () => loadLoCoMo2,
   loadLoCoMoLegacy: () => loadLoCoMo,
-  loadLongMemEval: () => loadLongMemEval,
+  loadLongMemEval: () => loadLongMemEval2,
+  loadLongMemEvalLegacy: () => loadLongMemEval,
   normalizeText: () => normalizeText,
   normalizedIncludes: () => normalizedIncludes,
+  parseLmeDateTimeString: () => parseLmeDateTimeString,
   recallAtK: () => recallAtK,
   renderLocomoReportMarkdown: () => renderLocomoReportMarkdown,
+  renderLongMemEvalReportMarkdown: () => renderLongMemEvalReportMarkdown,
   resolveEvidence: () => resolveEvidence,
   runLocomoBench: () => runLocomoBench,
+  runLongMemEvalBench: () => runLongMemEvalBench,
   runMemoryBench: () => runMemoryBench,
   runTemporalBench: () => runTemporalBench,
   runWriteQualityBench: () => runWriteQualityBench,
@@ -1094,6 +1204,572 @@ function resolveEvidence(sample, diaIds) {
   return results;
 }
+// src/index.ts
+init_lme_loader();
+// src/lme-run.ts
+var import_promises5 = require("node:fs/promises");
+var import_node_fs2 = require("node:fs");
+init_lme_loader();
+/**
+ * Creates a deterministic pseudo-random generator driven by 32-bit
+ * shift-and-xor steps (shifts 13, 17, 5).
+ *
+ * @param {number} seed - Seed value; it is coerced to an unsigned 32-bit integer.
+ * @returns {() => number} Function returning the next value in [0, 1).
+ */
+function makeRng2(seed) {
+  // A state of 0 would stay 0 through every xor/shift step, so use 1 instead.
+  let state = (seed >>> 0) || 1;
+  return function next() {
+    state = state ^ (state << 13);
+    state = state ^ (state >>> 17);
+    state = state ^ (state << 5);
+    // Reinterpret the signed 32-bit result as unsigned before scaling.
+    state >>>= 0;
+    return state / 4294967296;
+  };
+}
+/**
+ * Returns a shuffled copy of `arr`, walking from the last index down to 1 and
+ * swapping each position with a position at or before it chosen via `rng`.
+ * The input array is left untouched.
+ *
+ * @param {Array} arr - Items to shuffle.
+ * @param {() => number} rng - Source of numbers in [0, 1).
+ * @returns {Array} New array holding the same items in shuffled order.
+ */
+function seededShuffle2(arr, rng) {
+  const shuffled = Array.from(arr);
+  let i = shuffled.length;
+  while (--i > 0) {
+    const j = Math.floor(rng() * (i + 1));
+    const held = shuffled[i];
+    shuffled[i] = shuffled[j];
+    shuffled[j] = held;
+  }
+  return shuffled;
+}
+// Default character budget for the rendered haystack (480,000 characters).
+var DEFAULT_FULL_CONTEXT_MAX_CHARS = 48e4;
+// Case-insensitive phrases that read as the model declining to answer.
+var DECLINE_PATTERNS2 = /\bno information available\b|\bi (don'?t|do not) (know|have)\b|\bcannot (find|answer|determine|provide)\b|\bnot (mentioned|stated|provided|found|available)\b|\bunable to (find|answer|determine)\b|\bno relevant (information|data)\b/i;
+/**
+ * Reports whether `answer` contains one of the decline phrases.
+ *
+ * @param {string} answer - Model answer text.
+ * @returns {boolean} True when a decline phrase is present.
+ */
+function appearsToDecline2(answer) {
+  const declined = DECLINE_PATTERNS2.test(answer);
+  return declined;
+}
+// Closing instruction appended to every answer prompt.
+var ANSWER_INSTRUCTION2 = "Answer the question concisely and specifically based only on the information provided. If the provided context does not contain the answer, reply exactly: No information available";
+/**
+ * Builds the answer prompt for memory mode from retrieved snippets.
+ *
+ * Snippets are numbered from 1 and separated by blank lines; with no snippets
+ * a placeholder line is used. A "Current date" line is added only when
+ * `currentDate` is truthy.
+ *
+ * @param {string[]} snippets - Retrieved context snippets.
+ * @param {string} question - Question text.
+ * @param {string} currentDate - Date text to show as the current date, or "".
+ * @returns {string} The assembled prompt.
+ */
+function buildMemoryAnswerPrompt2(snippets, question, currentDate) {
+  let context = "(no relevant context retrieved)";
+  if (snippets.length > 0) {
+    context = snippets.map((snippet, idx) => `[${idx + 1}] ${snippet}`).join("\n\n");
+  }
+  const dateNote = currentDate ? `\nCurrent date: ${currentDate}` : "";
+  return [
+    "Context from conversation history:",
+    `${context}${dateNote}`,
+    `Question: ${question}`,
+    `${ANSWER_INSTRUCTION2}`
+  ].join("\n");
+}
+/**
+ * Builds the answer prompt for full-context mode.
+ *
+ * @param {string} haystackText - Rendered conversation history.
+ * @param {string} question - Question text.
+ * @param {string} currentDate - Date text to show as the current date; the
+ *   line is omitted when this is falsy.
+ * @returns {string} The assembled prompt.
+ */
+function buildFullContextAnswerPrompt2(haystackText, question, currentDate) {
+  const dateNote = currentDate ? `\nCurrent date: ${currentDate}` : "";
+  return [
+    "Conversation history:",
+    `${haystackText}${dateNote}`,
+    `Question: ${question}`,
+    `${ANSWER_INSTRUCTION2}`
+  ].join("\n");
+}
+/**
+ * Builds the judge prompt for one answered question.
+ *
+ * Abstention questions get a prompt that rewards declining and does not show
+ * the gold answer. All other questions get a gold-answer comparison prompt,
+ * with an extra note on equivalent date expressions when `questionType` is
+ * "temporal-reasoning".
+ *
+ * @param {string} question - Question text.
+ * @param {string} goldAnswer - Reference answer (unused for abstention).
+ * @param {string} modelAnswer - Answer produced by the model under test.
+ * @param {string} questionType - Question type used to select the temporal note.
+ * @param {boolean} isAbstention - Whether the question expects an abstention.
+ * @returns {string} Prompt asking for a JSON verdict `{"correct": ...}`.
+ */
+function buildJudgePrompt2(question, goldAnswer, modelAnswer, questionType, isAbstention) {
+  if (!isAbstention) {
+    let temporalNote = "";
+    if (questionType === "temporal-reasoning") {
+      temporalNote = "\nFor time/date questions: equivalent date expressions referring to the same date or duration are correct.";
+    }
+    return `You are a strict judge evaluating whether a model answer is correct.
+CORRECT: the model answer contains the specific information in the gold answer (exact match or clear paraphrase is fine).${temporalNote}
+WRONG: the model answer is vague, only topically related, contradicts the gold answer, or says "no information" when a specific answer exists.
+Question: ${question}
+Gold answer: ${goldAnswer}
+Model answer: ${modelAnswer}
+Respond with JSON only: {"correct": true} or {"correct": false}`;
+  }
+  return `You are a strict judge. This question requires the model to recognize that the information is not present in the conversation history, or that the premise is flawed. A CORRECT response means the model abstained, said something equivalent to "No information available", "I don't know", or identified that the premise is wrong. An INCORRECT response means the model fabricated a specific, concrete answer.
+Question: ${question}
+Model answer: ${modelAnswer}
+Respond with JSON only: {"correct": true} if the model appropriately abstained, {"correct": false} if it fabricated a specific answer.`;
+}
+/**
+ * Renders sessions as plain text: a "Session N — <date or id>" header, one
+ * "[User]: ..." / "[Assistant]: ..." line per turn, and a blank line after
+ * each session. The final result is trimmed.
+ *
+ * @param {Array<object>} sessions - Sessions with `id`, `dateTime` and `turns`.
+ * @returns {string} The rendered text ("" for no sessions).
+ */
+function renderHaystack(sessions) {
+  const out = [];
+  for (const [idx, sess] of sessions.entries()) {
+    // Fall back to the session id when no date text is present.
+    out.push(`Session ${idx + 1} \u2014 ${sess.dateTime || sess.id}`);
+    for (const turn of sess.turns) {
+      const speaker = turn.role === "user" ? "User" : "Assistant";
+      out.push(`[${speaker}]: ${turn.content}`);
+    }
+    out.push("");
+  }
+  return out.join("\n").trim();
+}
+/**
+ * Renders sessions within a character budget.
+ *
+ * If the full rendering fits it is returned as-is. Otherwise sessions are
+ * dropped from the front of the array one at a time until the rendering fits;
+ * if even the last session alone is too long, its rendering is cut to
+ * `maxChars` characters.
+ *
+ * @param {Array<object>} sessions - Sessions to render.
+ * @param {number} maxChars - Maximum length of the returned text.
+ * @returns {{text: string, truncated: boolean}} Rendered text and whether
+ *   anything was dropped or cut.
+ */
+function renderHaystackCapped(sessions, maxChars) {
+  const everything = renderHaystack(sessions);
+  if (everything.length <= maxChars) {
+    return { text: everything, truncated: false };
+  }
+  for (let start = 1; start < sessions.length; start++) {
+    const candidate = renderHaystack(sessions.slice(start));
+    if (candidate.length <= maxChars) {
+      return { text: candidate, truncated: true };
+    }
+  }
+  // Nothing fits whole: hard-cut the rendering of the final session.
+  const lastOnly = sessions.slice(Math.max(sessions.length - 1, 0));
+  return { text: renderHaystack(lastOnly).slice(0, maxChars), truncated: true };
+}
+// Maximum number of characters kept per ingested text (20,000).
+var EMBED_CHAR_CAP = 2e4;
+/**
+ * Returns `text` unchanged when it is within EMBED_CHAR_CAP characters,
+ * otherwise its first EMBED_CHAR_CAP characters.
+ *
+ * @param {string} text - Text to cap.
+ * @returns {string} The possibly shortened text.
+ */
+function capForEmbedding(text) {
+  if (text.length <= EMBED_CHAR_CAP) {
+    return text;
+  }
+  return text.slice(0, EMBED_CHAR_CAP);
+}
+/**
+ * Ingests one question's haystack into `memory` with a single `ingest` call.
+ *
+ * For every session it emits one event per turn (text prefixed with the
+ * session label and speaker) followed by one "chunk" event holding the whole
+ * session. All texts go through `capForEmbedding`. `ingestedAt` is the
+ * session's parsed timestamp, or undefined when that timestamp is 0/falsy.
+ *
+ * @param {object} question - Question with `id` and `sessions`.
+ * @param {object} memory - Object exposing an async `ingest(events)` method.
+ * @param {object} scope - Scope value copied onto every event.
+ * @returns {Promise<void>}
+ */
+async function ingestQuestionIntoMemory(question, memory, scope) {
+  const speakerOf = (turn) => turn.role === "user" ? "User" : "Assistant";
+  const events = [];
+  for (const [sessIdx, sess] of question.sessions.entries()) {
+    const sessLabel = `Session ${sessIdx + 1} \u2014 ${sess.dateTime || sess.id}`;
+    const ingestedAt = sess.dateTimeMs || void 0;
+    // Turn-level events.
+    for (const [turnIdx, turn] of sess.turns.entries()) {
+      events.push({
+        id: `${question.id}:sess${sessIdx}:turn${turnIdx}`,
+        scope,
+        text: capForEmbedding(`[${sessLabel}] [${speakerOf(turn)}]: ${turn.content}`),
+        metadata: {
+          sessionId: sess.id,
+          sessionIndex: sessIdx,
+          turnRole: turn.role,
+          ingestedAt
+        }
+      });
+    }
+    // Session-level chunk event.
+    const chunkLines = [sessLabel];
+    for (const turn of sess.turns) {
+      chunkLines.push(`[${speakerOf(turn)}]: ${turn.content}`);
+    }
+    events.push({
+      id: `${question.id}:sess${sessIdx}:chunk`,
+      scope,
+      text: capForEmbedding(chunkLines.join("\n")),
+      metadata: {
+        sessionId: sess.id,
+        sessionIndex: sessIdx,
+        ingestedAt
+      }
+    });
+  }
+  await memory.ingest(events);
+}
+/**
+ * Sends `prompt` to the judge model and extracts a boolean verdict.
+ *
+ * The verdict is taken, in order of preference, from:
+ *   1. `response.object.correct` when it is a boolean;
+ *   2. a `"correct": true|false` pair found in the text blocks;
+ *   3. as a last resort, the standalone word "true" with no standalone
+ *      "false" in the text. Anything else counts as incorrect, so words such
+ *      as "untrue" or replies that mention both values never score a point.
+ *
+ * @param {object} judgeModel - Model exposing an async `complete(request)` method.
+ * @param {string} prompt - Judge prompt text.
+ * @returns {Promise<{correct: boolean, inputTokens: number, outputTokens: number}>}
+ *   Verdict plus token usage (0 when the response reports none).
+ */
+async function callJudge2(judgeModel, prompt) {
+  const response = await judgeModel.complete({
+    messages: [{ role: "user", content: prompt }],
+    tools: [],
+    outputSchema: {
+      type: "object",
+      properties: { correct: { type: "boolean" } },
+      required: ["correct"],
+      // OpenAI strict structured-output mode requires this to be explicit.
+      additionalProperties: false
+    }
+  });
+  let correct = false;
+  if (response.object && typeof response.object.correct === "boolean") {
+    correct = response.object.correct;
+  } else {
+    const text = response.content.filter((b) => b.type === "text").map((b) => b.text ?? "").join("").toLowerCase().trim();
+    if (/"correct"\s*:\s*true/.test(text)) correct = true;
+    else if (/"correct"\s*:\s*false/.test(text)) correct = false;
+    // Whole-word match only, and an ambiguous reply is scored as incorrect.
+    else correct = /\btrue\b/.test(text) && !/\bfalse\b/.test(text);
+  }
+  return {
+    correct,
+    inputTokens: response.usage?.inputTokens ?? 0,
+    outputTokens: response.usage?.outputTokens ?? 0
+  };
+}
+/**
+ * Reads a JSONL checkpoint file and returns the set of `questionId` values it
+ * contains.
+ *
+ * A missing file yields an empty set. Blank lines and lines that are not
+ * valid JSON are skipped on purpose, so a partially written last line does
+ * not prevent resuming. Any read failure other than "file not found" is
+ * rethrown instead of being treated as "no checkpoint".
+ *
+ * @param {string} path - Checkpoint file path.
+ * @returns {Promise<Set<string>>} Ids of questions already recorded.
+ */
+async function loadCheckpoint2(path) {
+  const done = /* @__PURE__ */ new Set();
+  let raw;
+  try {
+    // Read directly rather than checking existence first: avoids the window
+    // between the check and the read.
+    raw = await (0, import_promises5.readFile)(path, "utf-8");
+  } catch (err) {
+    if (err && err.code === "ENOENT") return done;
+    throw err;
+  }
+  for (const line of raw.split("\n")) {
+    const trimmed = line.trim();
+    if (!trimmed) continue;
+    try {
+      const row = JSON.parse(trimmed);
+      if (row.questionId) done.add(row.questionId);
+    } catch {
+      // Best-effort: ignore unparseable lines.
+    }
+  }
+  return done;
+}
+/**
+ * Appends `row` to the checkpoint file as one JSON line.
+ *
+ * @param {string} path - Checkpoint file path.
+ * @param {object} row - Result row to serialise.
+ * @returns {Promise<void>}
+ */
+async function appendCheckpointRow2(path, row) {
+  const line = `${JSON.stringify(row)}\n`;
+  await (0, import_promises5.appendFile)(path, line, "utf-8");
+}
+/**
+ * Runs the LongMemEval benchmark and returns an aggregate report.
+ *
+ * Options read from `opts`:
+ *   - `answerModel`, `judgeModel`: objects exposing async `complete(request)`.
+ *   - `mode`: "memory" uses `memoryFactory` + retrieval; any other value
+ *     renders the whole haystack into the prompt.
+ *   - `memoryFactory(questionId)`: required when `mode` is "memory".
+ *   - `dataset` or `dataPath`: pre-loaded dataset, or a file to load.
+ *   - `types`: keep only questions whose `type` or `baseType` is listed.
+ *   - `questionLimit`, `seed` (default 42): seeded sample of the questions.
+ *   - `topK` (default 10, clamped to at most 10).
+ *   - `concurrency` (default 1): questions processed per batch.
+ *   - `checkpointPath`: JSONL file; recorded question ids are skipped and
+ *     their stored rows are merged back into the report.
+ *   - `fullContextMaxChars`: character budget for full-context prompts.
+ *   - `onProgress(done, total)`: called after each processed question.
+ *
+ * A failure while answering or judging a question is recorded on that
+ * question's row (`error`) and counted in `errorCount`; the question is
+ * scored as incorrect.
+ *
+ * @param {object} opts - Run options as listed above.
+ * @returns {Promise<object>} Report with `config`, `overall`, `byType`,
+ *   optional `abstentionAccuracy`, `tokens`, `wallClockMs`, `questions`,
+ *   `errorCount`.
+ * @throws {Error} When `memoryFactory` is missing in memory mode or neither
+ *   `dataset` nor `dataPath` is given.
+ */
+async function runLongMemEvalBench(opts) {
+  const {
+    answerModel,
+    judgeModel,
+    mode,
+    types,
+    questionLimit,
+    seed = 42,
+    concurrency = 1,
+    onProgress,
+    checkpointPath,
+    fullContextMaxChars = DEFAULT_FULL_CONTEXT_MAX_CHARS
+  } = opts;
+  const topK = Math.min(opts.topK ?? 10, 10);
+  if (mode === "memory" && !opts.memoryFactory) {
+    throw new Error("runLongMemEvalBench: memoryFactory is required when mode='memory'");
+  }
+  let dataset;
+  if (opts.dataset) {
+    dataset = opts.dataset;
+  } else if (opts.dataPath) {
+    const { loadLongMemEval: loader } = await Promise.resolve().then(() => (init_lme_loader(), lme_loader_exports));
+    dataset = await loader(opts.dataPath);
+  } else {
+    throw new Error("runLongMemEvalBench: either dataPath or dataset must be provided");
+  }
+  let questions = dataset.questions;
+  if (types && types.length > 0) {
+    questions = questions.filter((q) => types.includes(q.type) || types.includes(q.baseType));
+  }
+  if (questionLimit !== void 0 && questionLimit < questions.length) {
+    questions = seededShuffle2(questions, makeRng2(seed)).slice(0, questionLimit);
+  }
+  const checkpoint = checkpointPath ? await loadCheckpoint2(checkpointPath) : /* @__PURE__ */ new Set();
+  // Ids in the current selection; the checkpoint may hold rows from runs that
+  // used a different type filter or limit.
+  const selectedIds = new Set(questions.map((q) => q.id));
+  const results = [];
+  let totalAnswerInputTokens = 0;
+  let totalAnswerOutputTokens = 0;
+  let totalJudgeInputTokens = 0;
+  let totalJudgeOutputTokens = 0;
+  // Embedding usage is not measured by this harness; reported as 0.
+  const ingestEmbedTokens = 0;
+  let errorCount = 0;
+  let done = 0;
+  // Count only the questions that will actually run. Subtracting
+  // checkpoint.size went wrong (even negative) when the checkpoint contained
+  // ids outside the current selection.
+  const total = questions.filter((q) => !checkpoint.has(q.id)).length;
+  const startTime = Date.now();
+  // Sends one prompt to the answer model and returns its text and usage.
+  const askAnswerModel = async (prompt) => {
+    const resp = await answerModel.complete({
+      messages: [{ role: "user", content: prompt }],
+      tools: []
+    });
+    return {
+      text: resp.content.filter((b) => b.type === "text").map((b) => b.text).join("").trim(),
+      inputTokens: resp.usage?.inputTokens ?? 0,
+      outputTokens: resp.usage?.outputTokens ?? 0
+    };
+  };
+  const processQuestion = async (q) => {
+    if (checkpoint.has(q.id)) return;
+    let modelAnswer = "";
+    let answerIn = 0;
+    let answerOut = 0;
+    let judgeIn = 0;
+    let judgeOut = 0;
+    let correct = false;
+    let contextTruncated = false;
+    let errorMsg;
+    try {
+      let prompt;
+      if (mode === "memory") {
+        const memory = await opts.memoryFactory(q.id);
+        const scope = { kind: "agent", agentId: `lme:${q.id}` };
+        await ingestQuestionIntoMemory(q, memory, scope);
+        const retrieved = await memory.retrieve({ text: q.question, scope, topK });
+        const snippets = retrieved.snippets.map((s) => s.text);
+        prompt = buildMemoryAnswerPrompt2(snippets, q.question, q.questionDate);
+      } else {
+        const { text, truncated } = renderHaystackCapped(q.sessions, fullContextMaxChars);
+        contextTruncated = truncated;
+        prompt = buildFullContextAnswerPrompt2(text, q.question, q.questionDate);
+      }
+      const answer = await askAnswerModel(prompt);
+      modelAnswer = answer.text;
+      answerIn = answer.inputTokens;
+      answerOut = answer.outputTokens;
+      const judgePrompt = buildJudgePrompt2(
+        q.question,
+        q.answer,
+        modelAnswer,
+        q.baseType,
+        q.isAbstention
+      );
+      const judgeResult = await callJudge2(judgeModel, judgePrompt);
+      correct = judgeResult.correct;
+      judgeIn = judgeResult.inputTokens;
+      judgeOut = judgeResult.outputTokens;
+    } catch (err) {
+      // Thrown values are not always Error instances.
+      errorMsg = err instanceof Error ? err.message : String(err);
+      errorCount++;
+      correct = false;
+    }
+    const appearedToAbstain = appearsToDecline2(modelAnswer);
+    const row = {
+      questionId: q.id,
+      questionType: q.type,
+      isAbstention: q.isAbstention,
+      question: q.question,
+      goldAnswer: q.answer,
+      modelAnswer,
+      correct,
+      appearedToAbstain,
+      ...contextTruncated ? { contextTruncated } : {},
+      ...errorMsg !== void 0 ? { error: errorMsg } : {},
+      answerInputTokens: answerIn,
+      answerOutputTokens: answerOut,
+      judgeInputTokens: judgeIn,
+      judgeOutputTokens: judgeOut
+    };
+    results.push(row);
+    totalAnswerInputTokens += answerIn;
+    totalAnswerOutputTokens += answerOut;
+    totalJudgeInputTokens += judgeIn;
+    totalJudgeOutputTokens += judgeOut;
+    if (checkpointPath) {
+      await appendCheckpointRow2(checkpointPath, row);
+    }
+    done++;
+    if (onProgress) onProgress(done, total);
+  };
+  // A NaN/Infinity concurrency would make the batch-size comparison below
+  // never fire and start every question at once; fall back to 1.
+  const concurrencyLimit = Number.isFinite(concurrency) ? Math.max(1, Math.floor(concurrency)) : 1;
+  const pending = [];
+  for (const q of questions) {
+    const p = processQuestion(q);
+    pending.push(p);
+    if (pending.length >= concurrencyLimit) {
+      await Promise.all(pending.splice(0, concurrencyLimit));
+    }
+  }
+  if (pending.length > 0) await Promise.all(pending);
+  if (checkpointPath && checkpoint.size > 0) {
+    // Best-effort read: an unreadable checkpoint just contributes no rows.
+    const raw = await (0, import_promises5.readFile)(checkpointPath, "utf-8").catch(() => "");
+    // Keyed by question id so a duplicated line is merged once (last wins).
+    const restored = /* @__PURE__ */ new Map();
+    for (const line of raw.split("\n")) {
+      const trimmed = line.trim();
+      if (!trimmed) continue;
+      let row;
+      try {
+        row = JSON.parse(trimmed);
+      } catch {
+        continue;
+      }
+      // Only rows that were skipped in this run and belong to the current
+      // selection are merged; rows appended by this run are already in
+      // `results`.
+      if (row && checkpoint.has(row.questionId) && selectedIds.has(row.questionId)) {
+        restored.set(row.questionId, row);
+      }
+    }
+    for (const row of restored.values()) {
+      results.push(row);
+      totalAnswerInputTokens += row.answerInputTokens ?? 0;
+      totalAnswerOutputTokens += row.answerOutputTokens ?? 0;
+      totalJudgeInputTokens += row.judgeInputTokens ?? 0;
+      totalJudgeOutputTokens += row.judgeOutputTokens ?? 0;
+      // Keep errorCount consistent with the rows included in the report.
+      if (row.error !== void 0) errorCount++;
+    }
+  }
+  const byTypeMap = {};
+  let overallCorrect = 0;
+  let overallTotal = 0;
+  let abstentionCorrect = 0;
+  let abstentionTotal = 0;
+  for (const row of results) {
+    if (row.isAbstention) {
+      // Abstention rows are tallied separately and excluded from `overall`.
+      abstentionTotal++;
+      if (row.correct) abstentionCorrect++;
+    } else {
+      const bt = row.questionType.endsWith("_abs") ? row.questionType.slice(0, -4) : row.questionType;
+      if (!byTypeMap[bt]) byTypeMap[bt] = { correct: 0, total: 0 };
+      overallTotal++;
+      if (row.correct) overallCorrect++;
+      byTypeMap[bt].total++;
+      if (row.correct) byTypeMap[bt].correct++;
+    }
+  }
+  const byType = {};
+  for (const [t, stats] of Object.entries(byTypeMap)) {
+    byType[t] = {
+      ...stats,
+      accuracy: stats.total > 0 ? stats.correct / stats.total : 0
+    };
+  }
+  const abstentionAccuracy = abstentionTotal > 0 ? {
+    correct: abstentionCorrect,
+    total: abstentionTotal,
+    accuracy: abstentionCorrect / abstentionTotal
+  } : void 0;
+  const wallClockMs = Date.now() - startTime;
+  const allTypes = [...new Set(questions.map((q) => q.type))].sort();
+  return {
+    config: {
+      mode,
+      topK,
+      answerModelId: answerModel.modelId ?? "(unknown)",
+      judgeModelId: judgeModel.modelId ?? "(unknown)",
+      datasetSource: LONGMEMEVAL_SOURCE,
+      seed,
+      types: allTypes,
+      questionsRun: results.length
+    },
+    overall: {
+      correct: overallCorrect,
+      total: overallTotal,
+      accuracy: overallTotal > 0 ? overallCorrect / overallTotal : 0
+    },
+    byType,
+    ...abstentionAccuracy !== void 0 ? { abstentionAccuracy } : {},
+    tokens: {
+      ingestEmbedTokens,
+      answerInputTokens: totalAnswerInputTokens,
+      answerOutputTokens: totalAnswerOutputTokens,
+      judgeInputTokens: totalJudgeInputTokens,
+      judgeOutputTokens: totalJudgeOutputTokens,
+      totalInputTokens: totalAnswerInputTokens + totalJudgeInputTokens,
+      totalOutputTokens: totalAnswerOutputTokens + totalJudgeOutputTokens
+    },
+    wallClockMs,
+    questions: results,
+    errorCount
+  };
+}
+// src/lme-render.ts
+// Column headings for the report table, keyed by question type. The renderer
+// falls back to the raw type string for any type that has no entry here.
+var QUESTION_TYPE_LABELS = {
+  "single-session-user": "Single-session (user)",
+  "single-session-assistant": "Single-session (asst.)",
+  "single-session-preference": "Single-session (pref.)",
+  "multi-session": "Multi-session",
+  "temporal-reasoning": "Temporal reasoning",
+  "knowledge-update": "Knowledge update"
+};
+/**
+ * Formats a fraction as a percentage with one decimal place.
+ *
+ * @param {number} n - Fraction, e.g. 0.5.
+ * @returns {string} Percentage text, e.g. "50.0%".
+ */
+function pct2(n) {
+  const scaled = n * 100;
+  return `${scaled.toFixed(1)}%`;
+}
+/**
+ * Formats a number using the "en-US" locale.
+ *
+ * @param {number} n - Number to format.
+ * @returns {string} Locale-formatted text, e.g. "1,234,567".
+ */
+function fmtNum2(n) {
+  const formatted = n.toLocaleString("en-US");
+  return formatted;
+}
+/**
+ * Estimates the run cost from token totals and per-million-token prices.
+ *
+ * @param {{totalInputTokens: number, totalOutputTokens: number}} tokens - Token totals.
+ * @param {{inputPer1M: number, outputPer1M: number}} [prices] - Prices per one
+ *   million tokens.
+ * @returns {string} "$" followed by the cost with four decimals, or an em dash
+ *   when no prices are given.
+ */
+function estimateCost2(tokens, prices) {
+  if (prices) {
+    const inputCost = tokens.totalInputTokens / 1e6 * prices.inputPer1M;
+    const outputCost = tokens.totalOutputTokens / 1e6 * prices.outputPer1M;
+    return "$" + (inputCost + outputCost).toFixed(4);
+  }
+  return "\u2014";
+}
+/**
+ * Formats an accuracy record as "<pct> (<correct>/<total>)".
+ *
+ * @param {{accuracy: number, correct: number, total: number}} [s] - Stat record.
+ * @returns {string} Formatted text, or an em dash when `s` is missing or its
+ *   total is 0.
+ */
+function fmtStat(s) {
+  const hasData = Boolean(s) && s.total !== 0;
+  if (hasData) {
+    return `${pct2(s.accuracy)} (${s.correct}/${s.total})`;
+  }
+  return "\u2014";
+}
+/**
+ * Renders one or more benchmark reports as a Markdown document with a results
+ * table, a per-run configuration section and methodology notes.
+ *
+ * Type columns are the union of the `byType` keys across all reports, with
+ * the six known types first in a fixed order and any others after them in
+ * alphabetical order. Table cells have "|" escaped so a model id or type name
+ * containing a pipe cannot break the table layout.
+ *
+ * @param {Array<object>} reports - Reports exposing `config`, `byType`,
+ *   `overall`, `abstentionAccuracy`, `tokens`, `questions`, `wallClockMs` and
+ *   `errorCount`.
+ * @param {{inputPer1M: number, outputPer1M: number}} [prices] - Optional prices
+ *   used for the cost estimate column.
+ * @returns {string} Markdown text.
+ */
+function renderLongMemEvalReportMarkdown(reports, prices) {
+  // Escapes the Markdown table column separator inside a cell value.
+  const escapeCell = (value) => String(value).replace(/\|/g, "\\|");
+  const lines = [];
+  lines.push("# LongMemEval Benchmark Results");
+  lines.push("");
+  lines.push(
+    "Dataset: [LongMemEval](https://github.com/xiaowu0162/LongMemEval) (Wu et al.) \xB7 MIT License"
+  );
+  lines.push("Raw data is not redistributed. Only aggregate results are published here.");
+  lines.push("");
+  if (reports.length === 0) {
+    lines.push("_No results yet._");
+    return lines.join("\n");
+  }
+  const allBaseTypes = /* @__PURE__ */ new Set();
+  for (const r of reports) {
+    for (const t of Object.keys(r.byType)) allBaseTypes.add(t);
+  }
+  const sortedTypes = [
+    "single-session-user",
+    "single-session-assistant",
+    "single-session-preference",
+    "multi-session",
+    "temporal-reasoning",
+    "knowledge-update"
+  ].filter((t) => allBaseTypes.has(t));
+  for (const t of [...allBaseTypes].sort()) {
+    if (!sortedTypes.includes(t)) sortedTypes.push(t);
+  }
+  const typeHeaders = sortedTypes.map(
+    (t) => QUESTION_TYPE_LABELS[t] ?? t
+  );
+  const headers = [
+    "System / Mode",
+    ...typeHeaders,
+    "Overall accuracy",
+    "Abstention accuracy",
+    "Tokens/query",
+    "Est. cost/run",
+    "Answer model",
+    "Judge model",
+    "topK",
+    "n-Q",
+    "Seed",
+    "Dataset provenance"
+  ];
+  lines.push("## Results");
+  lines.push("");
+  lines.push("| " + headers.map(escapeCell).join(" | ") + " |");
+  lines.push("| " + headers.map(() => "---").join(" | ") + " |");
+  for (const r of reports) {
+    const c = r.config;
+    const typeRow = sortedTypes.map((t) => fmtStat(r.byType[t]));
+    const totalQ = r.questions.length;
+    const tokensPerQuery = totalQ > 0 ? Math.round(
+      (r.tokens.totalInputTokens + r.tokens.totalOutputTokens) / totalQ
+    ) : 0;
+    const provenance = `${c.datasetSource.url.replace("https://", "")} @ ${c.datasetSource.snapshotSha.slice(0, 8)}`;
+    const row = [
+      `${c.answerModelId} / ${c.mode}`,
+      ...typeRow,
+      fmtStat(r.overall),
+      fmtStat(r.abstentionAccuracy),
+      fmtNum2(tokensPerQuery),
+      estimateCost2(r.tokens, prices),
+      c.answerModelId,
+      c.judgeModelId,
+      // topK only applies to memory mode.
+      c.mode === "memory" ? String(c.topK) : "\u2014",
+      fmtNum2(c.questionsRun),
+      String(c.seed),
+      provenance
+    ];
+    lines.push("| " + row.map(escapeCell).join(" | ") + " |");
+  }
+  lines.push("");
+  lines.push("## Run Configuration");
+  lines.push("");
+  for (const r of reports) {
+    const c = r.config;
+    lines.push(`### ${c.answerModelId} / ${c.mode}`);
+    lines.push("");
+    lines.push(`- **Mode**: ${c.mode}`);
+    lines.push(`- **Answer model**: ${c.answerModelId}`);
+    lines.push(`- **Judge model**: ${c.judgeModelId}`);
+    if (c.mode === "memory") lines.push(`- **topK**: ${c.topK}`);
+    lines.push(`- **Dataset source**: ${c.datasetSource.url}`);
+    lines.push(`- **Dataset snapshot SHA**: \`${c.datasetSource.snapshotSha}\``);
+    lines.push(`- **Dataset file**: ${c.datasetSource.file}`);
+    lines.push(`- **Dataset license**: ${c.datasetSource.license}`);
+    lines.push(`- **Seed**: ${c.seed}`);
+    lines.push(`- **Types**: ${c.types.join(", ") || "all"}`);
+    lines.push(`- **Questions run**: ${c.questionsRun}`);
+    lines.push(`- **Wall-clock**: ${(r.wallClockMs / 1e3).toFixed(1)}s`);
+    lines.push(`- **Errors**: ${r.errorCount}`);
+    lines.push(
+      `- **Tokens** (in/out): ${fmtNum2(r.tokens.totalInputTokens)} / ${fmtNum2(r.tokens.totalOutputTokens)}`
+    );
+    lines.push("");
+  }
+  lines.push("## Methodology Notes");
+  lines.push("");
+  lines.push(
+    "These results were produced using the Eidentic LongMemEval fair-run harness. The following rules apply:"
+  );
+  lines.push("");
+  lines.push(
+    "1. **Per-question memory scope.** Each question has its own haystack (~50 sessions on average). A fresh Memory instance is created per question; no cross-question contamination."
+  );
+  lines.push(
+    "2. **Dual-granularity ingest.** Each turn is ingested with its session date in the text (temporally anchored). An additional session-level chunk entry captures multi-turn context."
+  );
+  lines.push(
+    "3. **Current date in prompt.** The `question_date` is passed to the answer prompt so temporal questions can reason about recency."
+  );
+  lines.push(
+    "4. **topK \u2264 10 in memory mode.** Larger topK values trivialise retrieval quality and are not permitted."
+  );
+  lines.push(
+    "5. **Full-context baseline is required** alongside any memory-mode result."
+  );
+  lines.push(
+    "6. **Judge is strict**: a model answer is correct only when it contains the gold answer's specific information. Vague/topical-only answers are wrong. Equivalent date expressions for the same date/duration are correct (temporal-reasoning type)."
+  );
+  // The earlier wording claimed the standard split has no abstention
+  // questions; the dataset marks them with a question_id ending in "_abs".
+  lines.push(
+    "7. **Abstention questions**: correct = model declined / said no-info / identified a flawed premise; fabricating a specific answer = wrong. Abstention accuracy is reported separately and not folded into overall accuracy."
+  );
+  lines.push(
+    "8. **Dataset license**: MIT \u2014 raw data is not redistributed; only aggregate results are published."
+  );
+  lines.push("");
+  lines.push("> Per-type question counts in longmemeval_s.json (500 total):");
+  lines.push("> single-session-user 70, single-session-assistant 56, single-session-preference 30,");
+  lines.push("> multi-session 133, temporal-reasoning 133, knowledge-update 78.");
+  lines.push("> These counts include the dataset's abstention variants, which are marked by a question_id ending in `_abs`.");
+  lines.push("");
+  return lines.join("\n");
+}
 // src/write-quality.ts
 var CONTRADICTION_FIXTURES = [
   {
@@ -1471,7 +2147,7 @@ async function runTemporalBench(memory, dataset, opts = {}) {
 }
 // src/datasets/temporal.ts
-function makeRng2(seed) {
+function makeRng3(seed) {
   let s = seed >>> 0;
   if (s === 0) s = 1;
   return () => {
@@ -1554,7 +2230,7 @@ function syntheticTemporalDataset(opts = {}) {
   const entityCount = opts.entityCount ?? 4;
   const seed = opts.seed ?? 42;
   const changesPerProperty = opts.changesPerProperty ?? 3;
-  const rng = makeRng2(seed);
+  const rng = makeRng3(seed);
   const entities = [];
   const asserts = [];
   const questions = [];
@@ -1642,16 +2318,21 @@ function syntheticTemporalDataset(opts = {}) {
   CONTRADICTION_FIXTURES,
   JUNK_STREAM_FIXTURES,
   LOCOMO_SOURCE_SHA,
+  LONGMEMEVAL_SOURCE,
   factRecall,
   loadLoCoMo,
   loadLoCoMoLegacy,
   loadLongMemEval,
+  loadLongMemEvalLegacy,
   normalizeText,
   normalizedIncludes,
+  parseLmeDateTimeString,
   recallAtK,
   renderLocomoReportMarkdown,
+  renderLongMemEvalReportMarkdown,
   resolveEvidence,
   runLocomoBench,
+  runLongMemEvalBench,
   runMemoryBench,
   runTemporalBench,
   runWriteQualityBench,