@eidentic/bench 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,3 +1,8 @@
1
+ import {
2
+ LOCOMO_SOURCE_SHA,
3
+ loadLoCoMo
4
+ } from "./chunk-PVIWNXCY.js";
5
+
1
6
  // src/recall.ts
2
7
  function normalizeText(text) {
3
8
  return text.toLowerCase().replace(/[''`]/g, "").replace(/[^\w\s]/g, " ").replace(/\s+/g, " ").trim();
@@ -362,7 +367,7 @@ function mapLoCoMoType(t) {
362
367
  if (t.includes("multi")) return "multi-session";
363
368
  return "single-session";
364
369
  }
365
- async function loadLoCoMo(jsonPath, opts) {
370
+ async function loadLoCoMo2(jsonPath, opts) {
366
371
  await assertFileSize(jsonPath, opts?.maxBytes);
367
372
  const raw = JSON.parse(await readFile(jsonPath, "utf-8"));
368
373
  let cases;
@@ -397,6 +402,537 @@ async function loadLoCoMo(jsonPath, opts) {
397
402
  return { name: "LoCoMo", cases: benchCases };
398
403
  }
399
404
 
405
+ // src/locomo-run.ts
406
+ import { readFile as readFile2, appendFile } from "node:fs/promises";
407
+ import { existsSync } from "node:fs";
408
/**
 * Creates a deterministic pseudo-random number generator (32-bit xorshift).
 *
 * @param {number} seed - Seed value; coerced to an unsigned 32-bit integer.
 *   A coerced seed of 0 is replaced with 1, because a zero state would stay
 *   zero forever under the xor/shift steps below.
 * @returns {() => number} Function yielding the next value in [0, 1).
 */
function makeRng(seed) {
  const initial = seed >>> 0;
  let state = initial === 0 ? 1 : initial;
  return function next() {
    state = state ^ (state << 13);
    state = state ^ (state >>> 17);
    state = state ^ (state << 5);
    // Bitwise operators produce signed 32-bit values; fold back to unsigned.
    state >>>= 0;
    return state / 4294967296;
  };
}
419
/**
 * Returns a shuffled copy of `arr` (Fisher-Yates), drawing randomness from `rng`.
 * The input is not modified.
 *
 * @param {Iterable<*>} arr - Items to shuffle.
 * @param {() => number} rng - Source of numbers in [0, 1); called once per swap.
 * @returns {Array<*>} New array holding the same items in shuffled order.
 */
function seededShuffle(arr, rng) {
  const shuffled = [...arr];
  let slot = shuffled.length;
  while (slot > 1) {
    slot -= 1;
    // Pick a partner anywhere in the not-yet-fixed prefix [0, slot].
    const pick = Math.floor(rng() * (slot + 1));
    const held = shuffled[slot];
    shuffled[slot] = shuffled[pick];
    shuffled[pick] = held;
  }
  return shuffled;
}
427
// Closing instruction appended to every answer prompt; it fixes the exact
// refusal phrase the model should use when the context holds no answer.
const ANSWER_INSTRUCTION = "Answer the question concisely and specifically based only on the information provided. If the provided context contains no answer to the question, reply exactly: No information available";

/**
 * Builds the answer prompt for memory mode from retrieved snippets.
 *
 * @param {string[]} snippets - Retrieved snippet texts; numbered from [1] in the prompt.
 * @param {string} question - Question to answer.
 * @returns {string} Prompt text: context section, question, then ANSWER_INSTRUCTION.
 */
function buildMemoryAnswerPrompt(snippets, question) {
  let context = "(no relevant context retrieved)";
  if (snippets.length > 0) {
    const numbered = snippets.map((snippet, idx) => `[${idx + 1}] ${snippet}`);
    context = numbered.join("\n\n");
  }
  return [
    "Context from conversation:",
    context,
    "",
    `Question: ${question}`,
    "",
    ANSWER_INSTRUCTION
  ].join("\n");
}
437
/**
 * Builds the answer prompt for full-context mode, embedding the whole
 * rendered conversation ahead of the question.
 *
 * @param {string} conversationText - Full conversation transcript.
 * @param {string} question - Question to answer.
 * @returns {string} Prompt text: conversation, question, then ANSWER_INSTRUCTION.
 */
function buildFullContextAnswerPrompt(conversationText, question) {
  const sections = [
    "Conversation:",
    `${conversationText}`,
    "",
    `Question: ${question}`,
    "",
    `${ANSWER_INSTRUCTION}`
  ];
  return sections.join("\n");
}
445
/**
 * Builds the prompt sent to the judge model.
 *
 * Two variants exist: the category-5 variant asks whether the model declined
 * to answer (optionally warning about a known trap answer), and the standard
 * variant asks whether the model answer matches the gold answer.
 *
 * @param {string} question - The benchmark question.
 * @param {*} goldAnswer - Reference answer; rendered as "(none)" when null/undefined.
 *   Not used by the category-5 variant.
 * @param {string} modelAnswer - Answer produced by the model under test.
 * @param {string} [adversarialAnswer] - Trap answer; only used by the category-5 variant.
 * @param {boolean} isCategory5 - Selects the category-5 variant when truthy.
 * @returns {string} Judge prompt requesting a `{"correct": ...}` JSON reply.
 */
function buildJudgePrompt(question, goldAnswer, modelAnswer, adversarialAnswer, isCategory5) {
  if (!isCategory5) {
    return [
      "You are a strict judge evaluating whether a model answer is correct.",
      "CORRECT: the model answer contains the specific information in the gold answer (exact match or clear paraphrase).",
      "WRONG: the model answer is vague, only topically related, or contradicts the gold answer.",
      "For date/time questions: equivalent date expressions referring to the same date are correct.",
      "",
      `Question: ${question}`,
      `Gold answer: ${goldAnswer ?? "(none)"}`,
      `Model answer: ${modelAnswer}`,
      "",
      'Respond with JSON only: {"correct": true} or {"correct": false}'
    ].join("\n");
  }

  let trapNote = "";
  if (adversarialAnswer) {
    trapNote = `\nWARNING \u2014 trap answer to watch for: "${adversarialAnswer}". If the model answer closely matches this trap, mark incorrect.`;
  }
  return [
    `You are a strict judge. For this unanswerable question, a CORRECT response means the model refused to answer or said something equivalent to "No information available" or "I don't know".${trapNote}`,
    "",
    `Question: ${question}`,
    `Model answer: ${modelAnswer}`,
    "",
    'Respond with JSON only: {"correct": true} if the model declined (correct), {"correct": false} if it made a claim.'
  ].join("\n");
}
467
/**
 * Renders a sample's sessions as one plain-text transcript.
 *
 * Each session contributes a `Session <index>` header (followed by an em dash
 * and the date when `dateTime` is set), one `[speaker]: text` line per turn,
 * and a blank separator line. The result is trimmed.
 *
 * @param {{sessions: Iterable<{index: *, dateTime?: string, turns: Iterable<{speaker: string, text: string}>}>}} sample
 * @returns {string} The transcript text.
 */
function renderConversation(sample) {
  const out = [];
  for (const session of sample.sessions) {
    const header = session.dateTime
      ? `Session ${session.index} \u2014 ${session.dateTime}`
      : `Session ${session.index}`;
    const turnLines = Array.from(session.turns, ({ speaker, text }) => `[${speaker}]: ${text}`);
    out.push(header, ...turnLines, "");
  }
  return out.join("\n").trim();
}
479
/**
 * Converts a sample's conversation into memory events and ingests them with a
 * single `memory.ingest(events)` call.
 *
 * For every session this emits one event per turn (text prefixed with the
 * bracketed session header) followed by one "chunk" event holding the whole
 * session transcript.
 *
 * @param {object} sample - Sample with `sampleId` and `sessions` (each with
 *   `index`, optional `dateTime` / `dateTimeMs`, and `turns`).
 * @param {{ingest: (events: object[]) => Promise<*>}} memory - Target memory.
 * @param {object} scope - Scope attached unchanged to every event.
 * @returns {Promise<void>}
 */
async function ingestSampleIntoMemory(sample, memory, scope) {
  const events = [];
  for (const session of sample.sessions) {
    const header = session.dateTime
      ? `Session ${session.index} \u2014 ${session.dateTime}`
      : `Session ${session.index}`;
    // Any falsy timestamp (including 0) is stored as undefined.
    const ingestedAt = session.dateTimeMs || void 0;
    const transcript = [header];
    for (const { diaId, speaker, text } of session.turns) {
      const line = `[${speaker}]: ${text}`;
      transcript.push(line);
      events.push({
        id: `${sample.sampleId}:turn:${diaId}`,
        scope,
        text: `[${header}] ${line}`,
        metadata: { diaId, sessionIndex: session.index, ingestedAt }
      });
    }
    events.push({
      id: `${sample.sampleId}:sess:${session.index}:chunk`,
      scope,
      text: transcript.join("\n"),
      metadata: { sessionIndex: session.index, ingestedAt }
    });
  }
  await memory.ingest(events);
}
509
/**
 * Sends a judge prompt to `judgeModel` and extracts a boolean verdict.
 *
 * The structured `response.object.correct` value is used when it is a
 * boolean. Otherwise the text blocks of the response are scanned for a
 * `"correct": true|false` pair, and as a last resort the verdict is whether
 * the lower-cased text contains "true".
 *
 * @param {{complete: (req: object) => Promise<object>}} judgeModel - Judge model client.
 * @param {string} prompt - Judge prompt.
 * @returns {Promise<{correct: boolean, inputTokens: number, outputTokens: number}>}
 *   Verdict plus token usage (0 when the response carries no usage).
 */
async function callJudge(judgeModel, prompt) {
  const request = {
    messages: [{ role: "user", content: prompt }],
    tools: [],
    outputSchema: {
      type: "object",
      properties: { correct: { type: "boolean" } },
      required: ["correct"],
      // OpenAI strict structured-output mode requires this to be explicit.
      additionalProperties: false
    }
  };
  const response = await judgeModel.complete(request);

  // Fallback parser for responses without a usable structured object.
  const verdictFromText = () => {
    const text = response.content
      .filter((block) => block.type === "text")
      .map((block) => block.text ?? "")
      .join("")
      .toLowerCase()
      .trim();
    if (/"correct"\s*:\s*true/i.test(text)) return true;
    if (/"correct"\s*:\s*false/i.test(text)) return false;
    return text.includes("true");
  };

  const structured = response.object;
  const correct = structured && typeof structured.correct === "boolean"
    ? structured.correct
    : verdictFromText();

  return {
    correct,
    inputTokens: response.usage?.inputTokens ?? 0,
    outputTokens: response.usage?.outputTokens ?? 0
  };
}
536
/**
 * Reads a JSONL checkpoint file and returns the keys of recorded questions.
 *
 * @param {string} path - Checkpoint file path.
 * @returns {Promise<Set<string>>} Set of `"<sampleId>:<questionIndex>"` keys;
 *   empty when the file does not exist.
 */
async function loadCheckpoint(path) {
  const completed = /* @__PURE__ */ new Set();
  if (!existsSync(path)) return completed;
  const contents = await readFile2(path, "utf-8");
  const entries = contents
    .split("\n")
    .map((line) => line.trim())
    .filter((line) => line.length > 0);
  for (const entry of entries) {
    try {
      const row = JSON.parse(entry);
      const hasKey = row.sampleId && row.questionIndex !== void 0;
      if (hasKey) completed.add(`${row.sampleId}:${row.questionIndex}`);
    } catch {
      // Lines that fail to parse (or parse to null) are skipped silently.
    }
  }
  return completed;
}
553
/**
 * Appends one result row to the checkpoint file as a single JSON line.
 *
 * @param {string} path - Checkpoint file path.
 * @param {*} row - Row to serialise with JSON.stringify.
 * @returns {Promise<void>}
 */
async function appendCheckpointRow(path, row) {
  const jsonLine = `${JSON.stringify(row)}\n`;
  await appendFile(path, jsonLine, "utf-8");
}
556
// Case-insensitive phrases treated as the model declining to answer.
const DECLINE_PATTERNS = /\bno information available\b|\bi (don'?t|do not) know\b|\bcannot (find|answer|determine)\b|\bunavailable\b/i;

/**
 * Reports whether an answer contains one of the DECLINE_PATTERNS phrases.
 *
 * @param {string} answer - Model answer text.
 * @returns {boolean} True when a decline phrase is found.
 */
function appearsToDecline(answer) {
  const hit = DECLINE_PATTERNS.exec(answer);
  return hit !== null;
}
560
/**
 * Runs the LoCoMo question-answering benchmark and returns an aggregate report.
 *
 * For every selected question the answer model is prompted (with retrieved
 * memory snippets in "memory" mode, otherwise with the full rendered
 * conversation), the judge model scores the answer, and a result row is
 * recorded. When `checkpointPath` is set, each row is appended to that file
 * as it completes, and questions already present in the file are skipped and
 * merged back into the report.
 *
 * @param {object} opts
 * @param {object} opts.answerModel - Model under test; must expose `complete()`.
 * @param {object} opts.judgeModel - Judge model passed to `callJudge`.
 * @param {string} opts.mode - "memory" uses `memoryFactory` + retrieval; any
 *   other value uses the full-context prompt.
 * @param {number[]} [opts.categories=[1,2,3,4,5]] - Question categories to run.
 * @param {number} [opts.sampleLimit] - Max samples (seeded random subset).
 * @param {number} [opts.questionLimit] - Max questions per sample (seeded random subset).
 * @param {number} [opts.seed=42] - Seed for the subset shuffles.
 * @param {number} [opts.topK=10] - Retrieval depth; capped at 10.
 * @param {number} [opts.concurrency=1] - Questions processed per batch.
 * @param {(done: number, total: number) => void} [opts.onProgress] - Called after each processed question.
 * @param {string} [opts.checkpointPath] - JSONL checkpoint file for resuming.
 * @param {object} [opts.dataset] - Pre-loaded dataset; when absent the loader
 *   module is imported and called with `opts.dataPath`.
 * @param {string} [opts.dataPath] - Dataset path handed to the loader.
 * @param {(sampleId: string) => Promise<object>} [opts.memoryFactory] - Required in "memory" mode.
 * @returns {Promise<object>} Report with `config`, `overallJ14`, `byCategory`,
 *   `cat5RefusalRate`, `tokens`, `wallClockMs`, `questions` and `errorCount`.
 * @throws {Error} When mode is "memory" and no `memoryFactory` is supplied.
 */
async function runLocomoBench(opts) {
  const {
    answerModel,
    judgeModel,
    mode,
    categories = [1, 2, 3, 4, 5],
    sampleLimit,
    questionLimit,
    seed = 42,
    topK: rawTopK = 10,
    concurrency = 1,
    onProgress,
    checkpointPath
  } = opts;
  const topK = Math.min(rawTopK, 10);
  const keyOf = (sampleId, qaIndex) => `${sampleId}:${qaIndex}`;

  let dataset;
  if (opts.dataset) {
    dataset = opts.dataset;
  } else {
    const { loadLoCoMo: loadLoCoMo3 } = await import("./locomo-loader-YA3IEOND.js");
    dataset = await loadLoCoMo3(opts.dataPath);
  }

  let samples = [...dataset.samples];
  if (sampleLimit !== void 0 && sampleLimit < samples.length) {
    samples = seededShuffle(samples, makeRng(seed)).slice(0, sampleLimit);
  }

  const queue = [];
  const queueKeys = /* @__PURE__ */ new Set();
  for (const sample of samples) {
    // Capture each question's position in sample.qa up front instead of
    // searching for it again with indexOf after filtering/shuffling.
    let qaList = sample.qa
      .map((qa, qaIndex) => ({ qa, qaIndex }))
      .filter(({ qa }) => categories.includes(qa.category));
    if (questionLimit !== void 0 && questionLimit < qaList.length) {
      qaList = seededShuffle(qaList, makeRng(seed + 1)).slice(0, questionLimit);
    }
    for (const { qa, qaIndex } of qaList) {
      queue.push({ sample, qaIndex, qa });
      queueKeys.add(keyOf(sample.sampleId, qaIndex));
    }
  }

  const checkpoint = checkpointPath ? await loadCheckpoint(checkpointPath) : /* @__PURE__ */ new Set();

  const conversationText = /* @__PURE__ */ new Map();
  if (mode === "full-context") {
    for (const sample of samples) {
      conversationText.set(sample.sampleId, renderConversation(sample));
    }
  }

  const memories = /* @__PURE__ */ new Map();
  // Ingestion token usage is not measured by this runner; both stay 0.
  const ingestInputTokens = 0;
  const ingestOutputTokens = 0;
  if (mode === "memory") {
    if (!opts.memoryFactory) {
      throw new Error("runLocomoBench: memoryFactory is required when mode='memory'");
    }
    for (const sample of samples) {
      const memory = await opts.memoryFactory(sample.sampleId);
      const scope = { kind: "agent", agentId: `locomo:${sample.sampleId}` };
      await ingestSampleIntoMemory(sample, memory, scope);
      memories.set(sample.sampleId, memory);
    }
  }

  const results = [];
  let totalAnswerInputTokens = 0;
  let totalAnswerOutputTokens = 0;
  let totalJudgeInputTokens = 0;
  let totalJudgeOutputTokens = 0;
  let errorCount = 0;
  let done = 0;
  // Count only queued questions that still need work. The checkpoint may hold
  // keys outside this queue, so subtracting its size would be wrong.
  const total = queue.filter((item) => !checkpoint.has(keyOf(item.sample.sampleId, item.qaIndex))).length;
  const startTime = Date.now();

  const processItem = async (item) => {
    const key = keyOf(item.sample.sampleId, item.qaIndex);
    if (checkpoint.has(key)) return;
    let modelAnswer = "";
    let answerIn = 0;
    let answerOut = 0;
    let judgeIn = 0;
    let judgeOut = 0;
    let correct = false;
    let trapTriggered;
    let errorMsg;
    try {
      let prompt;
      if (mode === "memory") {
        const memory = memories.get(item.sample.sampleId);
        const scope = { kind: "agent", agentId: `locomo:${item.sample.sampleId}` };
        const retrieved = await memory.retrieve({ text: item.qa.question, scope, topK });
        const snippets = retrieved.snippets.map((s) => s.text);
        prompt = buildMemoryAnswerPrompt(snippets, item.qa.question);
      } else {
        const convText = conversationText.get(item.sample.sampleId) ?? "";
        prompt = buildFullContextAnswerPrompt(convText, item.qa.question);
      }
      const resp = await answerModel.complete({
        messages: [{ role: "user", content: prompt }],
        tools: []
      });
      modelAnswer = resp.content
        .filter((b) => b.type === "text")
        .map((b) => b.text)
        .join("")
        .trim();
      answerIn = resp.usage?.inputTokens ?? 0;
      answerOut = resp.usage?.outputTokens ?? 0;

      const isCategory5 = item.qa.category === 5;
      const judgePrompt = buildJudgePrompt(
        item.qa.question,
        item.qa.answer,
        modelAnswer,
        item.qa.adversarialAnswer,
        isCategory5
      );
      const judgeResult = await callJudge(judgeModel, judgePrompt);
      correct = judgeResult.correct;
      judgeIn = judgeResult.inputTokens;
      judgeOut = judgeResult.outputTokens;

      if (isCategory5 && item.qa.adversarialAnswer) {
        const adversarialLower = String(item.qa.adversarialAnswer).toLowerCase();
        const answerLower = modelAnswer.toLowerCase();
        const answerPrefix = answerLower.slice(0, 30);
        // An empty prefix is a substring of every string, so an empty model
        // answer must not count as matching the trap.
        trapTriggered = answerLower.includes(adversarialLower) ||
          (answerPrefix.length > 0 && adversarialLower.includes(answerPrefix));
        if (trapTriggered && correct) correct = false;
        // An explicit refusal overrides both the judge and the trap heuristic.
        if (!correct && appearsToDecline(modelAnswer)) {
          correct = true;
          trapTriggered = false;
        }
      }
    } catch (err) {
      // Non-Error throwables have no `.message`; stringify them so the row
      // always records why it failed.
      errorMsg = err instanceof Error ? err.message : String(err);
      errorCount++;
      correct = false;
    }
    const row = {
      sampleId: item.sample.sampleId,
      questionIndex: item.qaIndex,
      question: item.qa.question,
      goldAnswer: item.qa.answer,
      category: item.qa.category,
      modelAnswer,
      correct,
      ...trapTriggered !== void 0 ? { trapTriggered } : {},
      ...errorMsg !== void 0 ? { error: errorMsg } : {},
      answerInputTokens: answerIn,
      answerOutputTokens: answerOut,
      judgeInputTokens: judgeIn,
      judgeOutputTokens: judgeOut
    };
    results.push(row);
    totalAnswerInputTokens += answerIn;
    totalAnswerOutputTokens += answerOut;
    totalJudgeInputTokens += judgeIn;
    totalJudgeOutputTokens += judgeOut;
    if (checkpointPath) {
      await appendCheckpointRow(checkpointPath, row);
    }
    done++;
    if (onProgress) onProgress(done, total);
  };

  // `>= 1` is false for NaN, so an invalid value falls back to one at a time
  // instead of silently disabling the batch limit.
  const concurrencyLimit = concurrency >= 1 ? concurrency : 1;
  const pending = [];
  for (const item of queue) {
    pending.push(processItem(item));
    if (pending.length >= concurrencyLimit) {
      await Promise.all(pending.splice(0, concurrencyLimit));
    }
  }
  if (pending.length > 0) await Promise.all(pending);

  if (checkpointPath && checkpoint.size > 0) {
    const raw = await readFile2(checkpointPath, "utf-8").catch(() => "");
    const restored = /* @__PURE__ */ new Set();
    for (const line of raw.split("\n")) {
      const trimmed = line.trim();
      if (!trimmed) continue;
      try {
        const row = JSON.parse(trimmed);
        const key = keyOf(row.sampleId, row.questionIndex);
        // Restore only rows that were skipped in this run: present in the
        // pre-run checkpoint, part of the current queue, and not yet restored.
        if (!checkpoint.has(key) || !queueKeys.has(key) || restored.has(key)) continue;
        restored.add(key);
        results.push(row);
        totalAnswerInputTokens += row.answerInputTokens ?? 0;
        totalAnswerOutputTokens += row.answerOutputTokens ?? 0;
        totalJudgeInputTokens += row.judgeInputTokens ?? 0;
        totalJudgeOutputTokens += row.judgeOutputTokens ?? 0;
        // Keep errorCount consistent with the rows reported in `questions`.
        if (row.error !== void 0) errorCount++;
      } catch {
        // Unparseable checkpoint lines are ignored.
      }
    }
  }

  const byCategory = {};
  for (const row of results) {
    const cat = String(row.category);
    if (!byCategory[cat]) byCategory[cat] = { correct: 0, total: 0, accuracy: 0 };
    const stats = byCategory[cat];
    stats.total++;
    if (row.correct) stats.correct++;
    stats.accuracy = stats.correct / stats.total;
  }

  let j14Correct = 0;
  let j14Total = 0;
  for (const [cat, stats] of Object.entries(byCategory)) {
    const n = Number.parseInt(cat, 10);
    if (n >= 1 && n <= 4) {
      j14Correct += stats.correct;
      j14Total += stats.total;
    }
  }
  const cat5Stats = byCategory["5"];
  const cat5RefusalRate = cat5Stats
    ? { correct: cat5Stats.correct, total: cat5Stats.total, rate: cat5Stats.accuracy }
    : void 0;
  const wallClockMs = Date.now() - startTime;

  return {
    config: {
      mode,
      topK,
      answerModelId: answerModel.modelId ?? "(unknown)",
      judgeModelId: judgeModel.modelId ?? "(unknown)",
      datasetSha: LOCOMO_SOURCE_SHA,
      seed,
      categories: [...categories].sort((a, b) => a - b),
      samplesRun: samples.length,
      questionsRun: results.length
    },
    overallJ14: {
      correct: j14Correct,
      total: j14Total,
      accuracy: j14Total > 0 ? j14Correct / j14Total : 0
    },
    byCategory,
    cat5RefusalRate,
    tokens: {
      ingestInputTokens,
      ingestOutputTokens,
      answerInputTokens: totalAnswerInputTokens,
      answerOutputTokens: totalAnswerOutputTokens,
      judgeInputTokens: totalJudgeInputTokens,
      judgeOutputTokens: totalJudgeOutputTokens,
      totalInputTokens: ingestInputTokens + totalAnswerInputTokens + totalJudgeInputTokens,
      totalOutputTokens: ingestOutputTokens + totalAnswerOutputTokens + totalJudgeOutputTokens
    },
    wallClockMs,
    questions: results,
    errorCount
  };
}
807
+
808
+ // src/locomo-render.ts
809
/**
 * Formats a fraction as a percentage with one decimal place.
 *
 * @param {number} n - Fraction (1 means 100%).
 * @returns {string} For example "12.3%".
 */
function pct(n) {
  const scaled = n * 100;
  return `${scaled.toFixed(1)}%`;
}
812
/**
 * Formats a number using en-US locale conventions (thousands separators).
 *
 * @param {number} n - Value to format.
 * @returns {string} Locale-formatted number.
 */
function fmtNum(n) {
  const formatted = n.toLocaleString("en-US");
  return formatted;
}
815
/**
 * Estimates the cost of a run from total token counts and per-million prices.
 *
 * @param {{totalInputTokens: number, totalOutputTokens: number}} tokens - Token totals.
 * @param {{inputPer1M: number, outputPer1M: number}} [prices] - Price per one
 *   million input / output tokens.
 * @returns {string} Dollar amount with four decimals, or an em dash when no
 *   prices are given.
 */
function estimateCost(tokens, prices) {
  if (!prices) {
    return "\u2014";
  }
  const inputCost = tokens.totalInputTokens / 1e6 * prices.inputPer1M;
  const outputCost = tokens.totalOutputTokens / 1e6 * prices.outputPer1M;
  const cost = inputCost + outputCost;
  return "$" + cost.toFixed(4);
}
820
+ function renderLocomoReportMarkdown(reports, prices) {
821
+ const lines = [];
822
+ lines.push("# LoCoMo Benchmark Results");
823
+ lines.push("");
824
+ lines.push("Dataset: [LoCoMo](https://github.com/snap-research/locomo) (Snap Research) \xB7 CC BY-NC 4.0");
825
+ lines.push("Raw data is not redistributed. Only aggregate results are published here.");
826
+ lines.push("");
827
+ if (reports.length === 0) {
828
+ lines.push("_No results yet._");
829
+ return lines.join("\n");
830
+ }
831
+ const headers = [
832
+ "System / Mode",
833
+ "Cat1 (multi-hop)",
834
+ "Cat2 (temporal)",
835
+ "Cat3 (open-domain)",
836
+ "Cat4 (single-hop)",
837
+ "J(1\u20134) overall",
838
+ "Cat5 refusal rate",
839
+ "Tokens/query",
840
+ "Est. cost/run",
841
+ "Answer model",
842
+ "Judge model",
843
+ "topK",
844
+ "n-Q",
845
+ "Seed",
846
+ "Dataset SHA"
847
+ ];
848
+ lines.push("## Results");
849
+ lines.push("");
850
+ lines.push("| " + headers.join(" | ") + " |");
851
+ lines.push("| " + headers.map(() => "---").join(" | ") + " |");
852
+ for (const r of reports) {
853
+ const c = r.config;
854
+ const cat = (n) => {
855
+ const s = r.byCategory[String(n)];
856
+ if (!s) return "\u2014";
857
+ return `${pct(s.accuracy)} (${s.correct}/${s.total})`;
858
+ };
859
+ const cat5 = r.cat5RefusalRate ? `${pct(r.cat5RefusalRate.rate)} (${r.cat5RefusalRate.correct}/${r.cat5RefusalRate.total})` : "\u2014";
860
+ const totalQ = r.questions.length;
861
+ const tokensPerQuery = totalQ > 0 ? Math.round((r.tokens.totalInputTokens + r.tokens.totalOutputTokens) / totalQ) : 0;
862
+ const row = [
863
+ `${c.answerModelId} / ${c.mode}`,
864
+ cat(1),
865
+ cat(2),
866
+ cat(3),
867
+ cat(4),
868
+ `${pct(r.overallJ14.accuracy)} (${r.overallJ14.correct}/${r.overallJ14.total})`,
869
+ cat5,
870
+ fmtNum(tokensPerQuery),
871
+ estimateCost(r.tokens, prices),
872
+ c.answerModelId,
873
+ c.judgeModelId,
874
+ String(c.topK),
875
+ fmtNum(r.config.questionsRun),
876
+ String(c.seed),
877
+ c.datasetSha.slice(0, 8)
878
+ ];
879
+ lines.push("| " + row.join(" | ") + " |");
880
+ }
881
+ lines.push("");
882
+ lines.push("## Run Configuration");
883
+ lines.push("");
884
+ for (const r of reports) {
885
+ const c = r.config;
886
+ lines.push(`### ${c.answerModelId} / ${c.mode}`);
887
+ lines.push("");
888
+ lines.push(`- **Mode**: ${c.mode}`);
889
+ lines.push(`- **Answer model**: ${c.answerModelId}`);
890
+ lines.push(`- **Judge model**: ${c.judgeModelId}`);
891
+ lines.push(`- **topK**: ${c.topK}`);
892
+ lines.push(`- **Dataset SHA**: \`${c.datasetSha}\``);
893
+ lines.push(`- **Seed**: ${c.seed}`);
894
+ lines.push(`- **Categories**: ${c.categories.join(", ")}`);
895
+ lines.push(`- **Samples run**: ${c.samplesRun}`);
896
+ lines.push(`- **Questions run**: ${c.questionsRun}`);
897
+ lines.push(`- **Wall-clock**: ${(r.wallClockMs / 1e3).toFixed(1)}s`);
898
+ lines.push(`- **Errors**: ${r.errorCount}`);
899
+ lines.push(`- **Tokens** (in/out): ${fmtNum(r.tokens.totalInputTokens)} / ${fmtNum(r.tokens.totalOutputTokens)}`);
900
+ lines.push("");
901
+ }
902
+ lines.push("## Methodology Notes");
903
+ lines.push("");
904
+ lines.push("These results were produced using the Eidentic LoCoMo fair-run harness. The following rules apply:");
905
+ lines.push("");
906
+ lines.push("1. **Both speakers are treated as humans.** Turns are ingested as `[SpeakerName]: text` \u2014 never mapped to user/assistant roles.");
907
+ lines.push("2. **Timestamps are structural.** Each session is prefixed with a header line `Session N \u2014 <date>` and `ingestedAt` metadata carries the epoch-ms.");
908
+ lines.push("3. **topK \u2264 10 in memory mode.** Larger topK values trivialise retrieval quality and are not permitted.");
909
+ lines.push("4. **Full-context baseline is required** alongside any memory-mode result.");
910
+ lines.push("5. **Judge is strict**: a model answer is correct only when it contains the gold answer's specific information. Vague/topical-only answers are wrong.");
911
+ lines.push("6. **Category 5 (adversarial)**: correct = model declined; adversarial-trap match = wrong.");
912
+ lines.push("7. **Primary metric J(1\u20134)**: denominator is the number of cat 1\u20134 questions actually run (max 1540 on full dataset).");
913
+ lines.push("8. **Dataset license**: CC BY-NC 4.0 \u2014 raw data is not redistributed; only aggregate results are published.");
914
+ lines.push("");
915
+ lines.push("> Category mapping in locomo10.json: 1=multi-hop (282), 2=temporal (321), 3=open-domain (96), 4=single-hop (841), 5=adversarial (446).");
916
+ lines.push("");
917
+ return lines.join("\n");
918
+ }
919
+
920
+ // src/locomo-types.ts
921
/**
 * Looks up the text of the turns referenced by `diaIds`.
 *
 * @param {{sessions: Iterable<{turns: Iterable<{diaId: *, text: string}>}>}} sample -
 *   Sample whose turns are searched.
 * @param {Iterable<*>} diaIds - Turn ids to resolve, in the desired output order.
 * @returns {string[]} Texts of the turns that were found; unknown ids are
 *   skipped. When several turns share an id, the last one wins.
 */
function resolveEvidence(sample, diaIds) {
  const turnsById = /* @__PURE__ */ new Map();
  for (const { turns } of sample.sessions) {
    for (const turn of turns) {
      turnsById.set(turn.diaId, turn);
    }
  }
  return [...diaIds]
    .map((id) => turnsById.get(id))
    .filter((turn) => Boolean(turn))
    .map((turn) => turn.text);
}
935
+
400
936
  // src/write-quality.ts
401
937
  var CONTRADICTION_FIXTURES = [
402
938
  {
@@ -774,7 +1310,7 @@ async function runTemporalBench(memory, dataset, opts = {}) {
774
1310
  }
775
1311
 
776
1312
  // src/datasets/temporal.ts
777
- function makeRng(seed) {
1313
+ function makeRng2(seed) {
778
1314
  let s = seed >>> 0;
779
1315
  if (s === 0) s = 1;
780
1316
  return () => {
@@ -857,7 +1393,7 @@ function syntheticTemporalDataset(opts = {}) {
857
1393
  const entityCount = opts.entityCount ?? 4;
858
1394
  const seed = opts.seed ?? 42;
859
1395
  const changesPerProperty = opts.changesPerProperty ?? 3;
860
- const rng = makeRng(seed);
1396
+ const rng = makeRng2(seed);
861
1397
  const entities = [];
862
1398
  const asserts = [];
863
1399
  const questions = [];
@@ -943,12 +1479,17 @@ function syntheticTemporalDataset(opts = {}) {
943
1479
  export {
944
1480
  CONTRADICTION_FIXTURES,
945
1481
  JUNK_STREAM_FIXTURES,
1482
+ LOCOMO_SOURCE_SHA,
946
1483
  factRecall,
947
1484
  loadLoCoMo,
1485
+ loadLoCoMo2 as loadLoCoMoLegacy,
948
1486
  loadLongMemEval,
949
1487
  normalizeText,
950
1488
  normalizedIncludes,
951
1489
  recallAtK,
1490
+ renderLocomoReportMarkdown,
1491
+ resolveEvidence,
1492
+ runLocomoBench,
952
1493
  runMemoryBench,
953
1494
  runTemporalBench,
954
1495
  runWriteQualityBench,
@@ -0,0 +1,8 @@
1
+ import {
2
+ LOCOMO_SOURCE_SHA,
3
+ loadLoCoMo
4
+ } from "./chunk-PVIWNXCY.js";
5
+ export {
6
+ LOCOMO_SOURCE_SHA,
7
+ loadLoCoMo
8
+ };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@eidentic/bench",
3
- "version": "0.1.0",
3
+ "version": "0.1.1",
4
4
  "type": "module",
5
5
  "license": "Apache-2.0",
6
6
  "publishConfig": {
@@ -28,9 +28,9 @@
28
28
  "README.md"
29
29
  ],
30
30
  "dependencies": {
31
- "@eidentic/types": "0.1.0",
32
- "@eidentic/eval": "0.1.0",
33
- "@eidentic/memory": "0.1.0"
31
+ "@eidentic/memory": "0.1.1",
32
+ "@eidentic/eval": "0.1.1",
33
+ "@eidentic/types": "0.1.1"
34
34
  },
35
35
  "description": "Memory benchmark harness for Eidentic — run LongMemEval / LoCoMo / temporal-reasoning benchmarks with deterministic recall metrics.",
36
36
  "keywords": [