zidane 5.10.2 → 5.10.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -4
- package/dist/{agent-Bt123Fdy.d.ts → agent-MbmvNVAP.d.ts} +168 -3
- package/dist/agent-MbmvNVAP.d.ts.map +1 -0
- package/dist/chat/pure.d.ts +3 -3
- package/dist/chat.d.ts +6 -6
- package/dist/chat.js +2 -2
- package/dist/eval.d.ts +2 -2
- package/dist/eval.js +179 -11
- package/dist/eval.js.map +1 -1
- package/dist/{headless-Cn6XXmr3.js → headless-DHdHSA2s.js} +5 -5
- package/dist/{headless-Cn6XXmr3.js.map → headless-DHdHSA2s.js.map} +1 -1
- package/dist/headless.d.ts +1 -1
- package/dist/headless.js +1 -1
- package/dist/{index-BodGKXBV.d.ts → index-CCHh9Yca.d.ts} +2 -2
- package/dist/{index-BodGKXBV.d.ts.map → index-CCHh9Yca.d.ts.map} +1 -1
- package/dist/{index-C7BvI1Hi.d.ts → index-Iki5q09p.d.ts} +24 -3
- package/dist/index-Iki5q09p.d.ts.map +1 -0
- package/dist/index.d.ts +4 -4
- package/dist/index.js +9 -9
- package/dist/{login-DoGslmKC.js → login-D-SWsD7j.js} +3 -3
- package/dist/{login-DoGslmKC.js.map → login-D-SWsD7j.js.map} +1 -1
- package/dist/{mcp-BdN9UjTO.js → mcp-BVuDO44W.js} +3 -3
- package/dist/{mcp-BdN9UjTO.js.map → mcp-BVuDO44W.js.map} +1 -1
- package/dist/mcp.d.ts +1 -1
- package/dist/mcp.js +1 -1
- package/dist/{messages-DdfOKKx_.js → messages-_E1RxSxV.js} +135 -13
- package/dist/messages-_E1RxSxV.js.map +1 -0
- package/dist/output/stream-json.d.ts +2 -2
- package/dist/output/stream-json.js +1 -1
- package/dist/output/terminal.d.ts +2 -2
- package/dist/{presets-DCrQmY3b.js → presets-BtAXrPCY.js} +2 -2
- package/dist/{presets-DCrQmY3b.js.map → presets-BtAXrPCY.js.map} +1 -1
- package/dist/presets.d.ts +2 -2
- package/dist/presets.js +1 -1
- package/dist/{providers-BxHepM_P.js → providers-B6M0Oer3.js} +68 -7
- package/dist/providers-B6M0Oer3.js.map +1 -0
- package/dist/providers.d.ts +2 -2
- package/dist/providers.js +3 -3
- package/dist/restate.d.ts +1 -1
- package/dist/session/sqlite.d.ts +1 -1
- package/dist/{session-C0uGIWm_.js → session-CZniOWFD.js} +2 -2
- package/dist/{session-C0uGIWm_.js.map → session-CZniOWFD.js.map} +1 -1
- package/dist/session.d.ts +1 -1
- package/dist/session.js +2 -2
- package/dist/skills.d.ts +2 -2
- package/dist/{tool-formatters-BuB31L-c.d.ts → tool-formatters-CgE32BNa.d.ts} +2 -2
- package/dist/tool-formatters-CgE32BNa.d.ts.map +1 -0
- package/dist/tools/fetch-url.d.ts +1 -1
- package/dist/tools/web-search.d.ts +1 -1
- package/dist/{tools-Bk9TqmCV.js → tools-0IqJRRj8.js} +254 -18
- package/dist/tools-0IqJRRj8.js.map +1 -0
- package/dist/tools.d.ts +2 -2
- package/dist/tools.js +1 -1
- package/dist/{transcript-anchors-Bkuspqgn.js → transcript-anchors-CXheYWkt.js} +6 -6
- package/dist/{transcript-anchors-Bkuspqgn.js.map → transcript-anchors-CXheYWkt.js.map} +1 -1
- package/dist/{transcript-anchors-DhVgKmEl.d.ts → transcript-anchors-DkJotRvR.d.ts} +5 -5
- package/dist/{transcript-anchors-DhVgKmEl.d.ts.map → transcript-anchors-DkJotRvR.d.ts.map} +1 -1
- package/dist/tui.d.ts +3 -3
- package/dist/tui.js +7 -7
- package/dist/{turn-operations-DYKtoVd9.d.ts → turn-operations-BuL1RjGN.d.ts} +3 -3
- package/dist/{turn-operations-DYKtoVd9.d.ts.map → turn-operations-BuL1RjGN.d.ts.map} +1 -1
- package/dist/types-BiobHM1D.js.map +1 -1
- package/dist/types.d.ts +2 -2
- package/package.json +1 -1
- package/scripts/eval.ts +18 -1
- package/dist/agent-Bt123Fdy.d.ts.map +0 -1
- package/dist/index-C7BvI1Hi.d.ts.map +0 -1
- package/dist/messages-DdfOKKx_.js.map +0 -1
- package/dist/providers-BxHepM_P.js.map +0 -1
- package/dist/tool-formatters-BuB31L-c.d.ts.map +0 -1
- package/dist/tools-Bk9TqmCV.js.map +0 -1
package/dist/eval.js
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
import { _ as alwaysQuote } from "./tools-Bk9TqmCV.js";
|
|
1
|
+
import { _ as alwaysQuote } from "./tools-0IqJRRj8.js";
|
|
2
2
|
import { r as createProcessContext } from "./contexts-GKAWYq07.js";
|
|
3
|
-
import { a as headlessEventToJsonl, c as runHeadless } from "./headless-Cn6XXmr3.js";
|
|
4
|
-
import { i as createMemoryStore, t as createSession } from "./session-C0uGIWm_.js";
|
|
3
|
+
import { a as headlessEventToJsonl, c as runHeadless } from "./headless-DHdHSA2s.js";
|
|
4
|
+
import { i as createMemoryStore, t as createSession } from "./session-CZniOWFD.js";
|
|
5
5
|
import { join, relative, resolve } from "node:path";
|
|
6
6
|
import { tmpdir } from "node:os";
|
|
7
7
|
import { mkdir, mkdtemp, rm, writeFile } from "node:fs/promises";
|
|
@@ -327,10 +327,10 @@ const JUDGE_TOOL = {
|
|
|
327
327
|
}
|
|
328
328
|
};
|
|
329
329
|
async function runEvalCase(options) {
|
|
330
|
-
const { id, suite, tags = [], artifactDir, workspace, scorers = [], onEvent, ...headless } = options;
|
|
330
|
+
const { id, suite, variant, tags = [], artifactDir, workspace, scorers = [], onEvent, ...headless } = options;
|
|
331
331
|
const { metrics: declaredMetrics, sourceFile, ...headlessRest } = headless;
|
|
332
332
|
const events = [];
|
|
333
|
-
const caseArtifactDir = artifactDir ? join(artifactDir, safeSegment(suite ?? "eval"), safeSegment(id)) : void 0;
|
|
333
|
+
const caseArtifactDir = artifactDir ? join(artifactDir, ...variant ? [safeSegment(variant)] : [], safeSegment(suite ?? "eval"), safeSegment(id)) : void 0;
|
|
334
334
|
const workspaceState = await prepareWorkspace(headlessRest.execution, workspace);
|
|
335
335
|
const caseMetricIds = new Set(Object.keys(declaredMetrics ?? {}));
|
|
336
336
|
const hasCaseMetrics = caseMetricIds.size > 0;
|
|
@@ -373,6 +373,7 @@ async function runEvalCase(options) {
|
|
|
373
373
|
const evalResult = {
|
|
374
374
|
id,
|
|
375
375
|
...suite ? { suite } : {},
|
|
376
|
+
...variant ? { variant } : {},
|
|
376
377
|
tags,
|
|
377
378
|
result,
|
|
378
379
|
score,
|
|
@@ -557,6 +558,7 @@ function functionalityMetric(metricId, scorers, name = metricId) {
|
|
|
557
558
|
function formatEvalCaseSummary(result) {
|
|
558
559
|
const lines = [
|
|
559
560
|
`Eval ${result.suite ? `${result.suite}/` : ""}${result.id}`,
|
|
561
|
+
...result.variant ? [`variant: ${result.variant}`] : [],
|
|
560
562
|
`status: ${result.result.status}`,
|
|
561
563
|
`passed: ${result.passed}`,
|
|
562
564
|
`score: ${result.score.toFixed(2)}`,
|
|
@@ -602,7 +604,8 @@ function registerEvalTests(options) {
|
|
|
602
604
|
});
|
|
603
605
|
const work = [];
|
|
604
606
|
for (const evalCase of options.cases) {
|
|
605
|
-
const baseLabel = `${evalCase.suite ? `${evalCase.suite}/` : ""}${evalCase.id}`;
|
|
607
|
+
const caseLabel = `${evalCase.suite ? `${evalCase.suite}/` : ""}${evalCase.id}`;
|
|
608
|
+
const baseLabel = evalCase.variant ? `[${evalCase.variant}] ${caseLabel}` : caseLabel;
|
|
606
609
|
for (let run = 1; run <= repeat; run++) {
|
|
607
610
|
const label = repeat > 1 ? `${baseLabel} #${run}` : baseLabel;
|
|
608
611
|
const caseId = repeat > 1 ? `${evalCase.id}-repeat-${run}` : evalCase.id;
|
|
@@ -663,7 +666,12 @@ function createEvalRunReporter(options = {}) {
|
|
|
663
666
|
async record(result) {
|
|
664
667
|
results.push(result);
|
|
665
668
|
if (options.outputDir) {
|
|
666
|
-
const
|
|
669
|
+
const segments = [
|
|
670
|
+
...result.variant ? [safeSegment(result.variant)] : [],
|
|
671
|
+
safeSegment(result.suite ?? "eval"),
|
|
672
|
+
safeSegment(result.id)
|
|
673
|
+
];
|
|
674
|
+
const casePath = join(options.outputDir, "cases", `${segments.join("--")}.json`);
|
|
667
675
|
await mkdir(join(options.outputDir, "cases"), { recursive: true });
|
|
668
676
|
await writeFile(casePath, `${JSON.stringify(result, null, 2)}\n`);
|
|
669
677
|
}
|
|
@@ -681,14 +689,21 @@ function createEvalRunReporter(options = {}) {
|
|
|
681
689
|
}
|
|
682
690
|
};
|
|
683
691
|
}
|
|
684
|
-
/** Stable ordering by `suite/id` so parallel completion order doesn't shuffle output. */
|
|
692
|
+
/**
 * Return a sorted copy of `results`, ordered by `variant` first and then by
 * the combined `suite/id` key, so parallel completion order doesn't shuffle
 * output. Results without a variant (or suite) compare as the empty string.
 * The input array is not mutated.
 *
 * @param {Array<{ id: string, suite?: string, variant?: string }>} results
 * @returns {Array} a new array containing the same result objects, sorted.
 */
function sortCases(results) {
  return [...results].sort((a, b) => {
    const byVariant = (a.variant ?? "").localeCompare(b.variant ?? "");
    // Decide on the comparison result rather than on string inequality:
    // two distinct labels can still collate as equal, and in that case the
    // suite/id key must break the tie instead of returning 0 early.
    if (byVariant !== 0) return byVariant;
    const ka = `${a.suite ?? ""}/${a.id}`;
    const kb = `${b.suite ?? ""}/${b.id}`;
    return ka.localeCompare(kb);
  });
}
|
|
703
|
+
/**
 * Distinct variant labels across results, in sorted order.
 *
 * Falsy variants (missing or empty string) are ignored. Labels are ordered
 * with `localeCompare`, the same comparison `sortCases` applies to variants,
 * so anything iterating this list agrees with the order of the sorted cases.
 *
 * @param {Array<{ variant?: string }>} results
 * @returns {string[]} unique, sorted variant labels (empty when none are set).
 */
function distinctVariants(results) {
  const labels = new Set();
  for (const { variant } of results) {
    if (variant) labels.add(variant);
  }
  return [...labels].sort((a, b) => a.localeCompare(b));
}
|
|
692
707
|
function buildEvalRunSummary(input) {
|
|
693
708
|
const results = sortCases(input);
|
|
694
709
|
const usage = results.reduce((acc, result) => {
|
|
@@ -715,6 +730,7 @@ function buildEvalRunSummary(input) {
|
|
|
715
730
|
cases: results.map((result) => ({
|
|
716
731
|
id: result.id,
|
|
717
732
|
...result.suite ? { suite: result.suite } : {},
|
|
733
|
+
...result.variant ? { variant: result.variant } : {},
|
|
718
734
|
passed: result.passed,
|
|
719
735
|
score: result.score,
|
|
720
736
|
status: result.result.status,
|
|
@@ -725,9 +741,37 @@ function buildEvalRunSummary(input) {
|
|
|
725
741
|
trajectory: result.trajectory
|
|
726
742
|
})),
|
|
727
743
|
metrics: aggregateMetrics(results),
|
|
728
|
-
tagScores: aggregateTagScores(results)
|
|
744
|
+
tagScores: aggregateTagScores(results),
|
|
745
|
+
...distinctVariants(results).length > 0 ? { variants: aggregateVariants(results) } : {}
|
|
729
746
|
};
|
|
730
747
|
}
|
|
748
|
+
/**
 * Roll results up per variant.
 *
 * For every label returned by `distinctVariants(results)` this produces one
 * entry with the number of results, how many passed, the mean score, the
 * summed duration and the summed token/cost usage. Results without a variant
 * are not part of any group.
 *
 * @param {Array} results eval results; each is read for `variant`, `passed`,
 *   `score`, `result.durationMs` and `result.usage`.
 * @returns {Array<{ variant: string, count: number, passed: number,
 *   score: number, durationMs: number, usage: object }>}
 */
function aggregateVariants(results) {
  // Group once up front instead of re-filtering the full list per variant.
  const byVariant = new Map();
  for (const result of results) {
    if (!result.variant) continue;
    const bucket = byVariant.get(result.variant);
    if (bucket) bucket.push(result);
    else byVariant.set(result.variant, [result]);
  }
  return distinctVariants(results).map((variant) => {
    const group = byVariant.get(variant) ?? [];
    const usage = {
      input: 0,
      output: 0,
      cacheRead: 0,
      cacheCreation: 0,
      cost: 0
    };
    let durationMs = 0;
    let passed = 0;
    for (const entry of group) {
      const used = entry.result.usage;
      usage.input += used.input;
      usage.output += used.output;
      usage.cacheRead += used.cacheRead;
      usage.cacheCreation += used.cacheCreation;
      // cost is the only usage field the original treats as optional
      usage.cost += used.cost ?? 0;
      durationMs += entry.result.durationMs;
      if (entry.passed) passed += 1;
    }
    return {
      variant,
      count: group.length,
      passed,
      score: mean(group.map((r) => r.score)),
      durationMs,
      usage
    };
  });
}
|
|
731
775
|
function aggregateMetrics(results) {
|
|
732
776
|
const byId = /* @__PURE__ */ new Map();
|
|
733
777
|
for (const result of results) for (const metric of result.metrics) {
|
|
@@ -776,8 +820,17 @@ function formatEvalRunSummaryWithOptions(input, options = {}) {
|
|
|
776
820
|
if (results.length === 0) return `${color.heading("Eval run summary")}\n${color.muted("no evals ran")}`;
|
|
777
821
|
const summary = buildEvalRunSummary(results);
|
|
778
822
|
const out = [];
|
|
823
|
+
const variants = distinctVariants(results);
|
|
779
824
|
out.push(color.heading("Eval run summary"));
|
|
780
|
-
|
|
825
|
+
if (variants.length > 1) {
|
|
826
|
+
out.push(renderComparisonTable(results, variants, color));
|
|
827
|
+
for (const variant of variants) {
|
|
828
|
+
const group = results.filter((r) => r.variant === variant);
|
|
829
|
+
out.push("");
|
|
830
|
+
out.push(color.heading(variant));
|
|
831
|
+
out.push(renderResultsTable(group, buildEvalRunSummary(group), color));
|
|
832
|
+
}
|
|
833
|
+
} else out.push(renderResultsTable(results, summary, color));
|
|
781
834
|
const tagTable = renderTagTable(summary.tagScores, color);
|
|
782
835
|
if (tagTable) {
|
|
783
836
|
out.push("");
|
|
@@ -790,7 +843,8 @@ function formatEvalRunSummaryWithOptions(input, options = {}) {
|
|
|
790
843
|
return hyperlinks ? oscLink(abs, abs) : abs;
|
|
791
844
|
};
|
|
792
845
|
for (const result of results) {
|
|
793
|
-
const label = `${result.suite ? `${result.suite}/` : ""}${result.id}`;
|
|
846
|
+
const caseLabel = `${result.suite ? `${result.suite}/` : ""}${result.id}`;
|
|
847
|
+
const label = result.variant ? `[${result.variant}] ${caseLabel}` : caseLabel;
|
|
794
848
|
out.push(`${color.dot(result.passed)} ${color.caseStatus(result.passed, label)}`);
|
|
795
849
|
out.push(`${color.muted("score")} ${color.score(result.score)}`);
|
|
796
850
|
const tagEntries = Object.entries(result.tagScores);
|
|
@@ -848,6 +902,120 @@ function oscLink(target, text) {
|
|
|
848
902
|
/**
 * Turn a path or `file://` reference into an absolute filesystem path.
 *
 * A leading `file://` prefix is stripped textually (no URL decoding is done),
 * then the remainder is resolved with `path.resolve`, i.e. relative inputs are
 * resolved against the current working directory.
 *
 * @param {string} target path or `file://`-prefixed string.
 * @returns {string} absolute path.
 */
function absPath(target) {
  const withoutScheme = target.replace(/^file:\/\//, "");
  return resolve(withoutScheme);
}
|
|
905
|
+
/**
 * Case key used to line up the same eval across variants in the comparison
 * matrix. Strips the `-repeat-N` suffix `registerEvalTests` appends so a
 * repeated case aggregates into one row (mean score, x/y passed).
 *
 * Only a trailing `-repeat-<digits>` is removed; the suite, when present, is
 * prepended as `suite/`.
 *
 * @param {{ id: string, suite?: string }} result
 * @returns {string} `suite/id` (or just `id`) without the repeat suffix.
 */
function comparisonCaseKey(result) {
  const baseId = result.id.replace(/-repeat-\d+$/, "");
  const prefix = result.suite ? `${result.suite}/` : "";
  return `${prefix}${baseId}`;
}
|
|
914
|
+
/**
 * Cross-variant comparison matrix: one row per eval case, one column per
 * variant (provider/model target), plus rollup rows (score / passed / cost /
 * time) so two models can be read side by side at a glance.
 *
 * Repeated runs of a case share one row (see `comparisonCaseKey`): the cell
 * shows the mean score and `passed/total`; a single run shows `pass`/`fail`.
 * A case that did not run for a variant renders as `-`.
 *
 * @param {Array} input eval results to tabulate (sorted internally).
 * @param {string[]} variants column labels, in display order.
 * @param {object} color painter with `heading`, `muted`, `pass`, `fail`
 *   (string → string) and `score` (number → string).
 * @returns {string} the box-drawn table, lines joined with "\n".
 */
function renderComparisonTable(input, variants, color) {
  const results = sortCases(input);
  const caseKeys = [...new Set(results.map(comparisonCaseKey))].sort();

  // NUL cannot realistically occur in a case key or label, so it is a safe
  // separator for the composite (case, variant) lookup key.
  const cellKey = (caseKey, variant) => `${caseKey}\u0000${variant}`;
  const byCell = /* @__PURE__ */ new Map();
  for (const result of results) {
    const key = cellKey(comparisonCaseKey(result), result.variant ?? "");
    const list = byCell.get(key) ?? [];
    list.push(result);
    byCell.set(key, list);
  }

  const cellFor = (caseKey, variant) => {
    const group = byCell.get(cellKey(caseKey, variant));
    if (!group || group.length === 0) return { text: "-" };
    const score = mean(group.map((r) => r.score));
    const passedCount = group.filter((r) => r.passed).length;
    const allPassed = passedCount === group.length;
    const status = group.length > 1 ? `${passedCount}/${group.length}` : allPassed ? "pass" : "fail";
    return {
      text: `${score.toFixed(2)} ${status}`,
      score,
      passed: allPassed
    };
  };

  // Index the per-variant rollups once instead of scanning them per cell.
  const summaryByVariant = new Map(aggregateVariants(results).map((s) => [s.variant, s]));
  const rollupRow = (label, toCell) => ({
    label,
    rollup: true,
    cells: variants.map((variant) => {
      const s = summaryByVariant.get(variant);
      // Every rollup shows "-" for a column without data, COST included.
      return s ? toCell(s) : { text: "-" };
    })
  });

  const rows = caseKeys.map((key) => ({
    label: key,
    cells: variants.map((variant) => cellFor(key, variant))
  }));
  const rollups = [
    rollupRow("SCORE", (s) => ({ text: s.score.toFixed(2), score: s.score })),
    rollupRow("PASSED", (s) => ({ text: `${s.passed}/${s.count}`, passed: s.passed === s.count })),
    rollupRow("COST", (s) => ({ text: formatCost(s.usage.cost ?? 0) })),
    rollupRow("TIME", (s) => ({ text: formatDuration(s.durationMs) }))
  ];

  const header = ["EVAL", ...variants];
  const allRows = [...rows, ...rollups];
  // Column width = widest of the header and every cell's uncolored text.
  const widths = header.map((h, i) => Math.max(h.length, ...allRows.map((row) => (i === 0 ? row.label : row.cells[i - 1].text).length)));
  const rule = (left, joint, right) => `${left}${widths.map((w) => "─".repeat(w + 2)).join(joint)}${right}`;
  const top = rule("┌", "┬", "┐");
  const mid = rule("├", "┼", "┤");
  const bot = rule("└", "┴", "┘");

  const paintComparisonCell = (cell, padded) => {
    // Cells carrying neither a score nor a verdict (cost, time, "-") are muted whole.
    if (cell.score === void 0 && cell.passed === void 0) return color.muted(padded);
    let painted;
    if (cell.score !== void 0) {
      const scoreText = cell.score.toFixed(2);
      const rest = cell.text.slice(scoreText.length);
      const coloredRest = cell.passed === void 0 ? color.muted(rest) : cell.passed ? color.pass(rest) : color.fail(rest);
      painted = `${color.score(cell.score)}${coloredRest}`;
    } else {
      painted = cell.passed ? color.pass(cell.text) : color.fail(cell.text);
    }
    // Replacer function: the painted text is inserted verbatim, so `$&`, `$1`
    // and similar sequences produced by a painter are never reinterpreted as
    // replacement patterns. Padding around the text is left uncolored.
    return padded.replace(cell.text, () => painted);
  };

  const renderRow = (row) => {
    const labelPadded = padEnd(row.label, widths[0]);
    const labelPainted = row.rollup ? color.heading(labelPadded) : color.muted(labelPadded);
    const cells = row.cells.map((cell, i) => ` ${paintComparisonCell(cell, padStart(cell.text, widths[i + 1]))} `);
    return `│ ${labelPainted} │${cells.join("│")}│`;
  };

  const headerLine = `│ ${color.heading(padEnd(header[0], widths[0]))} │${variants.map((v, i) => ` ${color.heading(padStart(v, widths[i + 1]))} `).join("│")}│`;

  return [
    top,
    headerLine,
    mid,
    ...rows.map(renderRow),
    mid,
    ...rollups.map(renderRow),
    bot
  ].join("\n");
}
|
|
851
1019
|
function renderResultsTable(results, summary, color) {
|
|
852
1020
|
const header = [
|
|
853
1021
|
"EVAL",
|