@eidentic/bench 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +39 -2
- package/dist/chunk-PVIWNXCY.js +99 -0
- package/dist/index.cjs +657 -3
- package/dist/index.d.cts +255 -2
- package/dist/index.d.ts +255 -2
- package/dist/index.js +544 -3
- package/dist/locomo-loader-YA3IEOND.js +8 -0
- package/package.json +4 -4
package/dist/index.js
CHANGED
|
@@ -1,3 +1,8 @@
|
|
|
1
|
+
import {
|
|
2
|
+
LOCOMO_SOURCE_SHA,
|
|
3
|
+
loadLoCoMo
|
|
4
|
+
} from "./chunk-PVIWNXCY.js";
|
|
5
|
+
|
|
1
6
|
// src/recall.ts
|
|
2
7
|
function normalizeText(text) {
|
|
3
8
|
return text.toLowerCase().replace(/[''`]/g, "").replace(/[^\w\s]/g, " ").replace(/\s+/g, " ").trim();
|
|
@@ -362,7 +367,7 @@ function mapLoCoMoType(t) {
|
|
|
362
367
|
if (t.includes("multi")) return "multi-session";
|
|
363
368
|
return "single-session";
|
|
364
369
|
}
|
|
365
|
-
async function loadLoCoMo(jsonPath, opts) {
|
|
370
|
+
async function loadLoCoMo2(jsonPath, opts) {
|
|
366
371
|
await assertFileSize(jsonPath, opts?.maxBytes);
|
|
367
372
|
const raw = JSON.parse(await readFile(jsonPath, "utf-8"));
|
|
368
373
|
let cases;
|
|
@@ -397,6 +402,537 @@ async function loadLoCoMo(jsonPath, opts) {
|
|
|
397
402
|
return { name: "LoCoMo", cases: benchCases };
|
|
398
403
|
}
|
|
399
404
|
|
|
405
|
+
// src/locomo-run.ts
|
|
406
|
+
import { readFile as readFile2, appendFile } from "node:fs/promises";
|
|
407
|
+
import { existsSync } from "node:fs";
|
|
408
|
+
/**
 * Builds a deterministic pseudo-random number generator (32-bit xorshift).
 *
 * @param {number} seed - Seed value; it is coerced to an unsigned 32-bit integer.
 * @returns {() => number} Function returning the next value in [0, 1).
 */
function makeRng(seed) {
  // A zero state would stay zero forever under xorshift, so fall back to 1.
  let state = (seed >>> 0) || 1;
  return () => {
    state ^= state << 13;
    state ^= state >>> 17;
    state ^= state << 5;
    // Bring the state back into the unsigned 32-bit range before scaling.
    state >>>= 0;
    return state / 4294967296;
  };
}
|
|
419
|
+
/**
 * Returns a shuffled copy of `arr` (Fisher-Yates, walking from the end).
 * The input is not modified.
 *
 * @param {Iterable<any>} arr - Items to shuffle.
 * @param {() => number} rng - Source of values in [0, 1), e.g. from makeRng.
 * @returns {any[]} New array holding the same items in shuffled order.
 */
function seededShuffle(arr, rng) {
  const shuffled = Array.from(arr);
  let remaining = shuffled.length;
  while (remaining > 1) {
    // Pick among the not-yet-fixed prefix, then fix the pick at its end.
    const pick = Math.floor(rng() * remaining);
    remaining -= 1;
    const held = shuffled[remaining];
    shuffled[remaining] = shuffled[pick];
    shuffled[pick] = held;
  }
  return shuffled;
}
|
|
427
|
+
// Shared closing instruction appended to every answer prompt.
var ANSWER_INSTRUCTION = "Answer the question concisely and specifically based only on the information provided. If the provided context contains no answer to the question, reply exactly: No information available";

/**
 * Builds the answer prompt for memory mode from retrieved snippets.
 *
 * @param {string[]} snippets - Retrieved snippet texts; they are numbered from 1.
 * @param {string} question - The question to answer.
 * @returns {string} Prompt: context section, question, then ANSWER_INSTRUCTION.
 */
function buildMemoryAnswerPrompt(snippets, question) {
  let context = "(no relevant context retrieved)";
  if (snippets.length > 0) {
    const numbered = snippets.map((snippet, idx) => `[${idx + 1}] ${snippet}`);
    context = numbered.join("\n\n");
  }
  return [
    "Context from conversation:",
    context,
    "",
    `Question: ${question}`,
    "",
    ANSWER_INSTRUCTION
  ].join("\n");
}
|
|
437
|
+
/**
 * Builds the answer prompt for the full-context baseline.
 *
 * @param {string} conversationText - The rendered conversation transcript.
 * @param {string} question - The question to answer.
 * @returns {string} Prompt: transcript, question, then ANSWER_INSTRUCTION.
 */
function buildFullContextAnswerPrompt(conversationText, question) {
  const sections = [
    "Conversation:",
    conversationText,
    "",
    `Question: ${question}`,
    "",
    ANSWER_INSTRUCTION
  ];
  return sections.join("\n");
}
|
|
445
|
+
/**
 * Builds the prompt sent to the judge model.
 *
 * @param {string} question - The benchmark question.
 * @param {string|undefined|null} goldAnswer - Reference answer; shown as "(none)" when nullish.
 *   Only used for non-category-5 questions.
 * @param {string} modelAnswer - The answer being judged.
 * @param {string|undefined} adversarialAnswer - Trap answer; only used for category 5,
 *   where it adds a warning line when truthy.
 * @param {boolean} isCategory5 - True for unanswerable questions, where declining is correct.
 * @returns {string} The judge prompt.
 */
function buildJudgePrompt(question, goldAnswer, modelAnswer, adversarialAnswer, isCategory5) {
  if (!isCategory5) {
    return [
      "You are a strict judge evaluating whether a model answer is correct.",
      "CORRECT: the model answer contains the specific information in the gold answer (exact match or clear paraphrase).",
      "WRONG: the model answer is vague, only topically related, or contradicts the gold answer.",
      "For date/time questions: equivalent date expressions referring to the same date are correct.",
      "",
      `Question: ${question}`,
      `Gold answer: ${goldAnswer ?? "(none)"}`,
      `Model answer: ${modelAnswer}`,
      "",
      'Respond with JSON only: {"correct": true} or {"correct": false}'
    ].join("\n");
  }

  // Unanswerable question: the warning is appended directly after the first sentence.
  let trapNote = "";
  if (adversarialAnswer) {
    trapNote = `\nWARNING \u2014 trap answer to watch for: "${adversarialAnswer}". If the model answer closely matches this trap, mark incorrect.`;
  }
  return [
    `You are a strict judge. For this unanswerable question, a CORRECT response means the model refused to answer or said something equivalent to "No information available" or "I don't know".${trapNote}`,
    "",
    `Question: ${question}`,
    `Model answer: ${modelAnswer}`,
    "",
    'Respond with JSON only: {"correct": true} if the model declined (correct), {"correct": false} if it made a claim.'
  ].join("\n");
}
|
|
467
|
+
/**
 * Renders a sample's sessions as a plain-text transcript.
 * Each session is a "Session N" header (with " — <dateTime>" when dateTime is
 * truthy) followed by one "[speaker]: text" line per turn; sessions are
 * separated by a blank line and the result is trimmed.
 *
 * @param {{sessions: Array<{index: any, dateTime?: string, turns: Array<{speaker: string, text: string}>}>}} sample
 * @returns {string} The transcript.
 */
function renderConversation(sample) {
  let transcript = "";
  for (const session of sample.sessions) {
    transcript += `Session ${session.index}`;
    if (session.dateTime) {
      transcript += ` \u2014 ${session.dateTime}`;
    }
    transcript += "\n";
    for (const turn of session.turns) {
      transcript += `[${turn.speaker}]: ${turn.text}\n`;
    }
    // Blank separator line after every session; the final trim removes the last one.
    transcript += "\n";
  }
  return transcript.trim();
}
|
|
479
|
+
/**
 * Ingests one sample into a memory store with a single `memory.ingest` call.
 * For every session it emits one event per turn (text prefixed with the
 * session header) followed by one event holding the whole session transcript.
 *
 * @param {object} sample - Sample with `sampleId` and `sessions` (each with
 *   `index`, optional `dateTime` / `dateTimeMs`, and `turns`).
 * @param {{ingest: (events: object[]) => Promise<any>}} memory - Target store.
 * @param {object} scope - Scope object copied onto every event.
 * @returns {Promise<void>}
 */
async function ingestSampleIntoMemory(sample, memory, scope) {
  const batch = [];
  for (const session of sample.sessions) {
    const header = session.dateTime
      ? `Session ${session.index} \u2014 ${session.dateTime}`
      : `Session ${session.index}`;
    // A falsy dateTimeMs (including 0) is recorded as undefined.
    const ingestedAt = session.dateTimeMs || void 0;
    const transcript = [header];

    for (const turn of session.turns) {
      const line = `[${turn.speaker}]: ${turn.text}`;
      transcript.push(line);
      batch.push({
        id: `${sample.sampleId}:turn:${turn.diaId}`,
        scope,
        text: `[${header}] ${line}`,
        metadata: {
          diaId: turn.diaId,
          sessionIndex: session.index,
          ingestedAt
        }
      });
    }

    // Session-level chunk goes after that session's turn events.
    batch.push({
      id: `${sample.sampleId}:sess:${session.index}:chunk`,
      scope,
      text: transcript.join("\n"),
      metadata: { sessionIndex: session.index, ingestedAt }
    });
  }
  await memory.ingest(batch);
}
|
|
509
|
+
/**
 * Asks the judge model for a verdict and normalises it to a boolean.
 *
 * Resolution order:
 *   1. `response.object.correct` when it is a boolean (structured output);
 *   2. a `"correct": true|false` pair found in the concatenated text blocks;
 *   3. a bare word `true` in the text, provided the word `false` is absent.
 * Anything else is treated as incorrect.
 *
 * @param {{complete: (req: object) => Promise<object>}} judgeModel - Judge model client.
 * @param {string} prompt - Prompt built by buildJudgePrompt.
 * @returns {Promise<{correct: boolean, inputTokens: number, outputTokens: number}>}
 *   Verdict plus token usage (0 when the response carries no usage).
 */
async function callJudge(judgeModel, prompt) {
  const response = await judgeModel.complete({
    messages: [{ role: "user", content: prompt }],
    tools: [],
    outputSchema: {
      type: "object",
      properties: { correct: { type: "boolean" } },
      required: ["correct"],
      // OpenAI strict structured-output mode requires this to be explicit.
      additionalProperties: false
    }
  });

  let correct = false;
  if (response.object && typeof response.object.correct === "boolean") {
    correct = response.object.correct;
  } else {
    // Tolerate responses without a content array instead of throwing.
    const blocks = Array.isArray(response.content) ? response.content : [];
    const text = blocks
      .filter((b) => b.type === "text")
      .map((b) => b.text ?? "")
      .join("")
      .toLowerCase()
      .trim();
    if (/"correct"\s*:\s*true/.test(text)) {
      correct = true;
    } else if (/"correct"\s*:\s*false/.test(text)) {
      correct = false;
    } else {
      // Last resort: require `true` as a whole word so "untrue"/"construed"
      // do not count, and treat a mixed true/false reply as incorrect.
      correct = /\btrue\b/.test(text) && !/\bfalse\b/.test(text);
    }
  }

  return {
    correct,
    inputTokens: response.usage?.inputTokens ?? 0,
    outputTokens: response.usage?.outputTokens ?? 0
  };
}
|
|
536
|
+
/**
 * Reads a JSONL checkpoint file and returns the keys of rows already recorded.
 *
 * @param {string} path - Path to the checkpoint file.
 * @returns {Promise<Set<string>>} Keys of the form `${sampleId}:${questionIndex}`;
 *   an empty set when the file does not exist.
 */
async function loadCheckpoint(path) {
  const completed = new Set();
  if (!existsSync(path)) {
    return completed;
  }
  const contents = await readFile2(path, "utf-8");
  const entries = contents
    .split("\n")
    .map((line) => line.trim())
    .filter((line) => line.length > 0);
  for (const entry of entries) {
    let parsed;
    try {
      parsed = JSON.parse(entry);
      if (parsed.sampleId && parsed.questionIndex !== void 0) {
        completed.add(`${parsed.sampleId}:${parsed.questionIndex}`);
      }
    } catch {
      // Best-effort: lines that are not usable JSON rows are skipped.
    }
  }
  return completed;
}
|
|
553
|
+
/**
 * Appends one result row to the checkpoint file as a single JSON line.
 *
 * @param {string} path - Path to the checkpoint file.
 * @param {object} row - Row to serialise with JSON.stringify.
 * @returns {Promise<void>}
 */
async function appendCheckpointRow(path, row) {
  const line = `${JSON.stringify(row)}\n`;
  await appendFile(path, line, "utf-8");
}
|
|
556
|
+
// Phrases treated as a refusal. The apostrophe in "don't" may be ASCII ('),
// typographic (\u2019), or missing, since model output often uses curly quotes.
var DECLINE_PATTERNS = /\bno information available\b|\bi (don['\u2019]?t|do not) know\b|\bcannot (find|answer|determine)\b|\bunavailable\b/i;

/**
 * Reports whether an answer contains one of the known refusal phrases.
 *
 * @param {string} answer - Model answer text.
 * @returns {boolean} True when any DECLINE_PATTERNS alternative matches (case-insensitive).
 */
function appearsToDecline(answer) {
  return DECLINE_PATTERNS.test(answer);
}
|
|
560
|
+
/**
 * Runs the LoCoMo question-answering benchmark and returns an aggregate report.
 *
 * Flow: load the dataset (from `opts.dataset` or via the loader chunk), pick
 * samples and questions (seeded shuffle when a limit applies), prepare either
 * rendered transcripts ("full-context") or per-sample memories ("memory"),
 * answer and judge every queued question, optionally checkpoint each row, then
 * aggregate per-category accuracy and token totals.
 *
 * @param {object} opts
 * @param {object} opts.answerModel - Model with `complete()` used to answer questions.
 * @param {object} opts.judgeModel - Model with `complete()` used by callJudge.
 * @param {string} opts.mode - "memory" uses retrieval; "full-context" renders the whole
 *   transcript. Any other value takes the non-memory path with an empty transcript.
 * @param {number[]} [opts.categories=[1,2,3,4,5]] - Question categories to include.
 * @param {number} [opts.sampleLimit] - Max samples; a seeded subset is taken when smaller than the dataset.
 * @param {number} [opts.questionLimit] - Max questions per sample (seeded subset).
 * @param {number} [opts.seed=42] - Seed for the sample/question shuffles.
 * @param {number} [opts.topK=10] - Retrieval depth; capped at 10.
 * @param {number} [opts.concurrency=1] - Batch size for concurrent question processing.
 * @param {(done: number, total: number) => void} [opts.onProgress] - Called after each processed question.
 * @param {string} [opts.checkpointPath] - JSONL file used to skip and re-merge already-processed questions.
 * @param {object} [opts.dataset] - Pre-loaded dataset; otherwise `opts.dataPath` is passed to the loader.
 * @param {(sampleId: any) => any} [opts.memoryFactory] - Required in "memory" mode.
 * @returns {Promise<object>} Report with `config`, `overallJ14`, `byCategory`,
 *   `cat5RefusalRate`, `tokens`, `wallClockMs`, `questions` and `errorCount`.
 * @throws {Error} When mode is "memory" and no memoryFactory is supplied.
 */
async function runLocomoBench(opts) {
  const {
    answerModel,
    judgeModel,
    mode,
    categories = [1, 2, 3, 4, 5],
    sampleLimit,
    questionLimit,
    seed = 42,
    topK: rawTopK = 10,
    concurrency = 1,
    onProgress,
    checkpointPath
  } = opts;
  const topK = Math.min(rawTopK, 10);

  let dataset;
  if (opts.dataset) {
    dataset = opts.dataset;
  } else {
    const { loadLoCoMo: loadLoCoMo3 } = await import("./locomo-loader-YA3IEOND.js");
    dataset = await loadLoCoMo3(opts.dataPath);
  }

  let samples = [...dataset.samples];
  if (sampleLimit !== void 0 && sampleLimit < samples.length) {
    samples = seededShuffle(samples, makeRng(seed)).slice(0, sampleLimit);
  }

  const queue = [];
  for (const sample of samples) {
    // Capture each question's index in the unfiltered list up front, instead
    // of an indexOf() lookup per queued question.
    let selected = sample.qa
      .map((qa, qaIndex) => ({ qa, qaIndex }))
      .filter((entry) => categories.includes(entry.qa.category));
    if (questionLimit !== void 0 && questionLimit < selected.length) {
      selected = seededShuffle(selected, makeRng(seed + 1)).slice(0, questionLimit);
    }
    for (const { qa, qaIndex } of selected) {
      queue.push({ sample, qaIndex, qa });
    }
  }

  const checkpoint = checkpointPath ? await loadCheckpoint(checkpointPath) : /* @__PURE__ */ new Set();
  const keyOf = (item) => `${item.sample.sampleId}:${item.qaIndex}`;
  const queueKeys = new Set(queue.map(keyOf));

  const conversationText = /* @__PURE__ */ new Map();
  if (mode === "full-context") {
    for (const sample of samples) {
      conversationText.set(sample.sampleId, renderConversation(sample));
    }
  }

  const memories = /* @__PURE__ */ new Map();
  // Ingest token usage is not measured anywhere in this function; both stay 0.
  const ingestInputTokens = 0;
  const ingestOutputTokens = 0;
  if (mode === "memory") {
    if (!opts.memoryFactory) {
      throw new Error("runLocomoBench: memoryFactory is required when mode='memory'");
    }
    for (const sample of samples) {
      const memory = await opts.memoryFactory(sample.sampleId);
      const scope = { kind: "agent", agentId: `locomo:${sample.sampleId}` };
      await ingestSampleIntoMemory(sample, memory, scope);
      memories.set(sample.sampleId, memory);
    }
  }

  const results = [];
  let totalAnswerInputTokens = 0;
  let totalAnswerOutputTokens = 0;
  let totalJudgeInputTokens = 0;
  let totalJudgeOutputTokens = 0;
  let errorCount = 0;
  let done = 0;
  // Only questions of THIS run that are not checkpointed remain to be done;
  // checkpoint rows from other configurations must not shrink the total.
  const total = queue.filter((item) => !checkpoint.has(keyOf(item))).length;
  const startTime = Date.now();

  const processItem = async (item) => {
    const key = keyOf(item);
    if (checkpoint.has(key)) return;
    const sampleId = item.sample.sampleId;
    let modelAnswer = "";
    let answerIn = 0;
    let answerOut = 0;
    let judgeIn = 0;
    let judgeOut = 0;
    let correct = false;
    let trapTriggered;
    let errorMsg;
    try {
      let prompt;
      if (mode === "memory") {
        const memory = memories.get(sampleId);
        const scope = { kind: "agent", agentId: `locomo:${sampleId}` };
        const retrieved = await memory.retrieve({ text: item.qa.question, scope, topK });
        const snippets = retrieved.snippets.map((s) => s.text);
        prompt = buildMemoryAnswerPrompt(snippets, item.qa.question);
      } else {
        const convText = conversationText.get(sampleId) ?? "";
        prompt = buildFullContextAnswerPrompt(convText, item.qa.question);
      }
      const resp = await answerModel.complete({
        messages: [{ role: "user", content: prompt }],
        tools: []
      });
      modelAnswer = resp.content
        .filter((b) => b.type === "text")
        .map((b) => b.text)
        .join("")
        .trim();
      answerIn = resp.usage?.inputTokens ?? 0;
      answerOut = resp.usage?.outputTokens ?? 0;

      const isCategory5 = item.qa.category === 5;
      const judgePrompt = buildJudgePrompt(
        item.qa.question,
        item.qa.answer,
        modelAnswer,
        item.qa.adversarialAnswer,
        isCategory5
      );
      const judgeResult = await callJudge(judgeModel, judgePrompt);
      correct = judgeResult.correct;
      judgeIn = judgeResult.inputTokens;
      judgeOut = judgeResult.outputTokens;

      if (isCategory5 && item.qa.adversarialAnswer) {
        const adversarialLower = item.qa.adversarialAnswer.toLowerCase();
        const answerLower = modelAnswer.toLowerCase();
        const answerPrefix = answerLower.slice(0, 30);
        // Every string includes "", so an empty answer must not count as
        // matching the trap.
        trapTriggered =
          answerLower.includes(adversarialLower) ||
          (answerPrefix.length > 0 && adversarialLower.includes(answerPrefix));
        if (trapTriggered && correct) correct = false;
        // An explicit refusal overrides both the judge and the trap heuristic.
        if (!correct && appearsToDecline(modelAnswer)) {
          correct = true;
          trapTriggered = false;
        }
      }
    } catch (err) {
      // Keep a message even when something other than an Error was thrown.
      errorMsg = err instanceof Error ? err.message : String(err);
      errorCount++;
      correct = false;
    }

    const row = {
      sampleId,
      questionIndex: item.qaIndex,
      question: item.qa.question,
      goldAnswer: item.qa.answer,
      category: item.qa.category,
      modelAnswer,
      correct,
      ...trapTriggered !== void 0 ? { trapTriggered } : {},
      ...errorMsg !== void 0 ? { error: errorMsg } : {},
      answerInputTokens: answerIn,
      answerOutputTokens: answerOut,
      judgeInputTokens: judgeIn,
      judgeOutputTokens: judgeOut
    };
    results.push(row);
    totalAnswerInputTokens += answerIn;
    totalAnswerOutputTokens += answerOut;
    totalJudgeInputTokens += judgeIn;
    totalJudgeOutputTokens += judgeOut;
    if (checkpointPath) {
      await appendCheckpointRow(checkpointPath, row);
    }
    done++;
    if (onProgress) onProgress(done, total);
  };

  // Whole-number batch size of at least 1; NaN or fractional input would
  // otherwise break the batch comparison / splice below.
  const concurrencyLimit = Math.max(1, Math.floor(Number(concurrency)) || 1);
  const pending = [];
  for (const item of queue) {
    pending.push(processItem(item));
    if (pending.length >= concurrencyLimit) {
      await Promise.all(pending.splice(0, concurrencyLimit));
    }
  }
  if (pending.length > 0) await Promise.all(pending);

  if (checkpointPath && checkpoint.size > 0) {
    // Merge rows recorded by earlier runs. Only rows that were checkpointed
    // before this run started AND belong to this run's queue are taken, each
    // key once, so the report matches the reported configuration.
    const raw = await readFile2(checkpointPath, "utf-8").catch(() => "");
    const merged = /* @__PURE__ */ new Set();
    for (const line of raw.split("\n")) {
      const trimmed = line.trim();
      if (!trimmed) continue;
      try {
        const row = JSON.parse(trimmed);
        const key = `${row.sampleId}:${row.questionIndex}`;
        if (!checkpoint.has(key) || !queueKeys.has(key) || merged.has(key)) continue;
        merged.add(key);
        results.push(row);
        totalAnswerInputTokens += row.answerInputTokens ?? 0;
        totalAnswerOutputTokens += row.answerOutputTokens ?? 0;
        totalJudgeInputTokens += row.judgeInputTokens ?? 0;
        totalJudgeOutputTokens += row.judgeOutputTokens ?? 0;
      } catch {
        // Best-effort: unreadable checkpoint lines are skipped.
      }
    }
  }

  const byCategoryMap = {};
  for (const row of results) {
    const cat = String(row.category);
    if (!byCategoryMap[cat]) byCategoryMap[cat] = { correct: 0, total: 0 };
    byCategoryMap[cat].total++;
    if (row.correct) byCategoryMap[cat].correct++;
  }
  const byCategory = {};
  for (const [cat, stats] of Object.entries(byCategoryMap)) {
    byCategory[cat] = {
      ...stats,
      accuracy: stats.total > 0 ? stats.correct / stats.total : 0
    };
  }

  // J(1-4): pooled accuracy over categories 1 through 4 only.
  let j14Correct = 0;
  let j14Total = 0;
  for (const [cat, stats] of Object.entries(byCategory)) {
    const n = Number.parseInt(cat, 10);
    if (n >= 1 && n <= 4) {
      j14Correct += stats.correct;
      j14Total += stats.total;
    }
  }
  const cat5Stats = byCategory["5"];
  const cat5RefusalRate = cat5Stats
    ? { correct: cat5Stats.correct, total: cat5Stats.total, rate: cat5Stats.accuracy }
    : void 0;
  const wallClockMs = Date.now() - startTime;

  return {
    config: {
      mode,
      topK,
      answerModelId: answerModel.modelId ?? "(unknown)",
      judgeModelId: judgeModel.modelId ?? "(unknown)",
      datasetSha: LOCOMO_SOURCE_SHA,
      seed,
      categories: [...categories].sort((a, b) => a - b),
      samplesRun: samples.length,
      questionsRun: results.length
    },
    overallJ14: {
      correct: j14Correct,
      total: j14Total,
      accuracy: j14Total > 0 ? j14Correct / j14Total : 0
    },
    byCategory,
    cat5RefusalRate,
    tokens: {
      ingestInputTokens,
      ingestOutputTokens,
      answerInputTokens: totalAnswerInputTokens,
      answerOutputTokens: totalAnswerOutputTokens,
      judgeInputTokens: totalJudgeInputTokens,
      judgeOutputTokens: totalJudgeOutputTokens,
      totalInputTokens: ingestInputTokens + totalAnswerInputTokens + totalJudgeInputTokens,
      totalOutputTokens: ingestOutputTokens + totalAnswerOutputTokens + totalJudgeOutputTokens
    },
    wallClockMs,
    questions: results,
    errorCount
  };
}
|
|
807
|
+
|
|
808
|
+
// src/locomo-render.ts
|
|
809
|
+
/**
 * Formats a fraction as a percentage with one decimal place.
 *
 * @param {number} n - Fraction, e.g. 0.5.
 * @returns {string} Percentage text, e.g. "50.0%".
 */
function pct(n) {
  const scaled = n * 100;
  return `${scaled.toFixed(1)}%`;
}
|
|
812
|
+
/**
 * Formats a number using en-US locale conventions.
 *
 * @param {number} n - Number to format.
 * @returns {string} Result of `n.toLocaleString("en-US")`.
 */
function fmtNum(n) {
  const formatted = n.toLocaleString("en-US");
  return formatted;
}
|
|
815
|
+
/**
 * Estimates the dollar cost of a run from its total token counts.
 *
 * @param {{totalInputTokens: number, totalOutputTokens: number}} tokens - Token totals.
 * @param {{inputPer1M: number, outputPer1M: number}} [prices] - Price per million tokens.
 * @returns {string} "$" followed by the cost to 4 decimals, or an em dash when no prices are given.
 */
function estimateCost(tokens, prices) {
  if (prices) {
    const inputCost = tokens.totalInputTokens / 1e6 * prices.inputPer1M;
    const outputCost = tokens.totalOutputTokens / 1e6 * prices.outputPer1M;
    const cost = inputCost + outputCost;
    return `$${cost.toFixed(4)}`;
  }
  return "\u2014";
}
|
|
820
|
+
/**
 * Renders benchmark reports as a Markdown document: a title and licence note,
 * a results table (one row per report), a per-report configuration section,
 * and fixed methodology notes.
 *
 * @param {object[]} reports - Report objects carrying `config`, `byCategory`, `overallJ14`,
 *   `cat5RefusalRate`, `tokens`, `questions`, `wallClockMs` and `errorCount`.
 * @param {{inputPer1M: number, outputPer1M: number}} [prices] - Optional prices for the cost column.
 * @returns {string} Markdown text; when `reports` is empty only the header and a
 *   "_No results yet._" line are produced.
 */
function renderLocomoReportMarkdown(reports, prices) {
  const out = [
    "# LoCoMo Benchmark Results",
    "",
    "Dataset: [LoCoMo](https://github.com/snap-research/locomo) (Snap Research) \xB7 CC BY-NC 4.0",
    "Raw data is not redistributed. Only aggregate results are published here.",
    ""
  ];
  if (reports.length === 0) {
    out.push("_No results yet._");
    return out.join("\n");
  }

  const NO_VALUE = "\u2014";
  const tableRow = (cells) => `| ${cells.join(" | ")} |`;
  const headers = [
    "System / Mode",
    "Cat1 (multi-hop)",
    "Cat2 (temporal)",
    "Cat3 (open-domain)",
    "Cat4 (single-hop)",
    "J(1\u20134) overall",
    "Cat5 refusal rate",
    "Tokens/query",
    "Est. cost/run",
    "Answer model",
    "Judge model",
    "topK",
    "n-Q",
    "Seed",
    "Dataset SHA"
  ];
  out.push("## Results", "", tableRow(headers), tableRow(headers.map(() => "---")));

  for (const report of reports) {
    const cfg = report.config;
    // Cell for one category, or a dash when that category has no stats.
    const categoryCell = (n) => {
      const stats = report.byCategory[String(n)];
      return stats ? `${pct(stats.accuracy)} (${stats.correct}/${stats.total})` : NO_VALUE;
    };
    const refusal = report.cat5RefusalRate;
    const refusalCell = refusal
      ? `${pct(refusal.rate)} (${refusal.correct}/${refusal.total})`
      : NO_VALUE;
    const questionCount = report.questions.length;
    let tokensPerQuery = 0;
    if (questionCount > 0) {
      tokensPerQuery = Math.round(
        (report.tokens.totalInputTokens + report.tokens.totalOutputTokens) / questionCount
      );
    }
    const overall = report.overallJ14;
    out.push(tableRow([
      `${cfg.answerModelId} / ${cfg.mode}`,
      categoryCell(1),
      categoryCell(2),
      categoryCell(3),
      categoryCell(4),
      `${pct(overall.accuracy)} (${overall.correct}/${overall.total})`,
      refusalCell,
      fmtNum(tokensPerQuery),
      estimateCost(report.tokens, prices),
      cfg.answerModelId,
      cfg.judgeModelId,
      String(cfg.topK),
      fmtNum(cfg.questionsRun),
      String(cfg.seed),
      cfg.datasetSha.slice(0, 8)
    ]));
  }

  out.push("", "## Run Configuration", "");
  for (const report of reports) {
    const cfg = report.config;
    out.push(
      `### ${cfg.answerModelId} / ${cfg.mode}`,
      "",
      `- **Mode**: ${cfg.mode}`,
      `- **Answer model**: ${cfg.answerModelId}`,
      `- **Judge model**: ${cfg.judgeModelId}`,
      `- **topK**: ${cfg.topK}`,
      `- **Dataset SHA**: \`${cfg.datasetSha}\``,
      `- **Seed**: ${cfg.seed}`,
      `- **Categories**: ${cfg.categories.join(", ")}`,
      `- **Samples run**: ${cfg.samplesRun}`,
      `- **Questions run**: ${cfg.questionsRun}`,
      `- **Wall-clock**: ${(report.wallClockMs / 1e3).toFixed(1)}s`,
      `- **Errors**: ${report.errorCount}`,
      `- **Tokens** (in/out): ${fmtNum(report.tokens.totalInputTokens)} / ${fmtNum(report.tokens.totalOutputTokens)}`,
      ""
    );
  }

  out.push(
    "## Methodology Notes",
    "",
    "These results were produced using the Eidentic LoCoMo fair-run harness. The following rules apply:",
    "",
    "1. **Both speakers are treated as humans.** Turns are ingested as `[SpeakerName]: text` \u2014 never mapped to user/assistant roles.",
    "2. **Timestamps are structural.** Each session is prefixed with a header line `Session N \u2014 <date>` and `ingestedAt` metadata carries the epoch-ms.",
    "3. **topK \u2264 10 in memory mode.** Larger topK values trivialise retrieval quality and are not permitted.",
    "4. **Full-context baseline is required** alongside any memory-mode result.",
    "5. **Judge is strict**: a model answer is correct only when it contains the gold answer's specific information. Vague/topical-only answers are wrong.",
    "6. **Category 5 (adversarial)**: correct = model declined; adversarial-trap match = wrong.",
    "7. **Primary metric J(1\u20134)**: denominator is the number of cat 1\u20134 questions actually run (max 1540 on full dataset).",
    "8. **Dataset license**: CC BY-NC 4.0 \u2014 raw data is not redistributed; only aggregate results are published.",
    "",
    "> Category mapping in locomo10.json: 1=multi-hop (282), 2=temporal (321), 3=open-domain (96), 4=single-hop (841), 5=adversarial (446).",
    ""
  );
  return out.join("\n");
}
|
|
919
|
+
|
|
920
|
+
// src/locomo-types.ts
|
|
921
|
+
/**
 * Looks up the text of the turns referenced by a list of dialogue ids.
 *
 * @param {{sessions: Array<{turns: Array<{diaId: any, text: string}>}>}} sample - Sample to search.
 * @param {Iterable<any>} diaIds - Dialogue ids, in the order the texts should be returned.
 * @returns {string[]} Texts of the matching turns; ids with no matching turn are skipped.
 */
function resolveEvidence(sample, diaIds) {
  // Index every turn by its dialogue id; a later turn with the same id replaces an earlier one.
  const turnsById = new Map();
  for (const { turns } of sample.sessions) {
    for (const turn of turns) {
      turnsById.set(turn.diaId, turn);
    }
  }

  const texts = [];
  for (const diaId of diaIds) {
    const match = turnsById.get(diaId);
    if (!match) continue;
    texts.push(match.text);
  }
  return texts;
}
|
|
935
|
+
|
|
400
936
|
// src/write-quality.ts
|
|
401
937
|
var CONTRADICTION_FIXTURES = [
|
|
402
938
|
{
|
|
@@ -774,7 +1310,7 @@ async function runTemporalBench(memory, dataset, opts = {}) {
|
|
|
774
1310
|
}
|
|
775
1311
|
|
|
776
1312
|
// src/datasets/temporal.ts
|
|
777
|
-
function
|
|
1313
|
+
function makeRng2(seed) {
|
|
778
1314
|
let s = seed >>> 0;
|
|
779
1315
|
if (s === 0) s = 1;
|
|
780
1316
|
return () => {
|
|
@@ -857,7 +1393,7 @@ function syntheticTemporalDataset(opts = {}) {
|
|
|
857
1393
|
const entityCount = opts.entityCount ?? 4;
|
|
858
1394
|
const seed = opts.seed ?? 42;
|
|
859
1395
|
const changesPerProperty = opts.changesPerProperty ?? 3;
|
|
860
|
-
const rng =
|
|
1396
|
+
const rng = makeRng2(seed);
|
|
861
1397
|
const entities = [];
|
|
862
1398
|
const asserts = [];
|
|
863
1399
|
const questions = [];
|
|
@@ -943,12 +1479,17 @@ function syntheticTemporalDataset(opts = {}) {
|
|
|
943
1479
|
export {
|
|
944
1480
|
CONTRADICTION_FIXTURES,
|
|
945
1481
|
JUNK_STREAM_FIXTURES,
|
|
1482
|
+
LOCOMO_SOURCE_SHA,
|
|
946
1483
|
factRecall,
|
|
947
1484
|
loadLoCoMo,
|
|
1485
|
+
loadLoCoMo2 as loadLoCoMoLegacy,
|
|
948
1486
|
loadLongMemEval,
|
|
949
1487
|
normalizeText,
|
|
950
1488
|
normalizedIncludes,
|
|
951
1489
|
recallAtK,
|
|
1490
|
+
renderLocomoReportMarkdown,
|
|
1491
|
+
resolveEvidence,
|
|
1492
|
+
runLocomoBench,
|
|
952
1493
|
runMemoryBench,
|
|
953
1494
|
runTemporalBench,
|
|
954
1495
|
runWriteQualityBench,
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@eidentic/bench",
|
|
3
|
-
"version": "0.1.0",
|
|
3
|
+
"version": "0.1.1",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"license": "Apache-2.0",
|
|
6
6
|
"publishConfig": {
|
|
@@ -28,9 +28,9 @@
|
|
|
28
28
|
"README.md"
|
|
29
29
|
],
|
|
30
30
|
"dependencies": {
|
|
31
|
-
"@eidentic/
|
|
32
|
-
"@eidentic/eval": "0.1.
|
|
33
|
-
"@eidentic/
|
|
31
|
+
"@eidentic/memory": "0.1.1",
|
|
32
|
+
"@eidentic/eval": "0.1.1",
|
|
33
|
+
"@eidentic/types": "0.1.1"
|
|
34
34
|
},
|
|
35
35
|
"description": "Memory benchmark harness for Eidentic — run LongMemEval / LoCoMo / temporal-reasoning benchmarks with deterministic recall metrics.",
|
|
36
36
|
"keywords": [
|