zidane 5.10.2 → 5.10.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. package/README.md +6 -4
  2. package/dist/{agent-Bt123Fdy.d.ts → agent-Dbhh2fr7.d.ts} +216 -3
  3. package/dist/agent-Dbhh2fr7.d.ts.map +1 -0
  4. package/dist/chat/pure.d.ts +3 -3
  5. package/dist/chat.d.ts +6 -6
  6. package/dist/chat.js +2 -2
  7. package/dist/eval.d.ts +2 -2
  8. package/dist/eval.js +179 -11
  9. package/dist/eval.js.map +1 -1
  10. package/dist/{headless-Cn6XXmr3.js → headless-WsGaqG1W.js} +5 -5
  11. package/dist/{headless-Cn6XXmr3.js.map → headless-WsGaqG1W.js.map} +1 -1
  12. package/dist/headless.d.ts +1 -1
  13. package/dist/headless.js +1 -1
  14. package/dist/{index-C7BvI1Hi.d.ts → index-CgsSvsR5.d.ts} +24 -3
  15. package/dist/index-CgsSvsR5.d.ts.map +1 -0
  16. package/dist/{index-BodGKXBV.d.ts → index-DbMQsGZP.d.ts} +2 -2
  17. package/dist/{index-BodGKXBV.d.ts.map → index-DbMQsGZP.d.ts.map} +1 -1
  18. package/dist/index.d.ts +4 -4
  19. package/dist/index.js +9 -9
  20. package/dist/{login-DoGslmKC.js → login-D-SWsD7j.js} +3 -3
  21. package/dist/{login-DoGslmKC.js.map → login-D-SWsD7j.js.map} +1 -1
  22. package/dist/{mcp-BdN9UjTO.js → mcp-BVuDO44W.js} +3 -3
  23. package/dist/{mcp-BdN9UjTO.js.map → mcp-BVuDO44W.js.map} +1 -1
  24. package/dist/mcp.d.ts +1 -1
  25. package/dist/mcp.js +1 -1
  26. package/dist/{messages-DdfOKKx_.js → messages-_E1RxSxV.js} +135 -13
  27. package/dist/messages-_E1RxSxV.js.map +1 -0
  28. package/dist/output/stream-json.d.ts +2 -2
  29. package/dist/output/stream-json.js +1 -1
  30. package/dist/output/terminal.d.ts +2 -2
  31. package/dist/{presets-DCrQmY3b.js → presets-9NpXoxzg.js} +2 -2
  32. package/dist/{presets-DCrQmY3b.js.map → presets-9NpXoxzg.js.map} +1 -1
  33. package/dist/presets.d.ts +2 -2
  34. package/dist/presets.js +1 -1
  35. package/dist/{providers-BxHepM_P.js → providers-B6M0Oer3.js} +68 -7
  36. package/dist/providers-B6M0Oer3.js.map +1 -0
  37. package/dist/providers.d.ts +2 -2
  38. package/dist/providers.js +3 -3
  39. package/dist/restate.d.ts +1 -1
  40. package/dist/session/sqlite.d.ts +1 -1
  41. package/dist/{session-C0uGIWm_.js → session-CZniOWFD.js} +2 -2
  42. package/dist/{session-C0uGIWm_.js.map → session-CZniOWFD.js.map} +1 -1
  43. package/dist/session.d.ts +1 -1
  44. package/dist/session.js +2 -2
  45. package/dist/skills.d.ts +2 -2
  46. package/dist/{tool-formatters-BuB31L-c.d.ts → tool-formatters-DkcN6HZt.d.ts} +2 -2
  47. package/dist/tool-formatters-DkcN6HZt.d.ts.map +1 -0
  48. package/dist/tools/fetch-url.d.ts +1 -1
  49. package/dist/tools/web-search.d.ts +1 -1
  50. package/dist/{tools-Bk9TqmCV.js → tools-BbVXIpFo.js} +256 -18
  51. package/dist/tools-BbVXIpFo.js.map +1 -0
  52. package/dist/tools.d.ts +2 -2
  53. package/dist/tools.js +1 -1
  54. package/dist/{transcript-anchors-Bkuspqgn.js → transcript-anchors-BtRC9WEQ.js} +6 -6
  55. package/dist/{transcript-anchors-Bkuspqgn.js.map → transcript-anchors-BtRC9WEQ.js.map} +1 -1
  56. package/dist/{transcript-anchors-DhVgKmEl.d.ts → transcript-anchors-DS8yTwq3.d.ts} +5 -5
  57. package/dist/{transcript-anchors-DhVgKmEl.d.ts.map → transcript-anchors-DS8yTwq3.d.ts.map} +1 -1
  58. package/dist/tui.d.ts +3 -3
  59. package/dist/tui.js +7 -7
  60. package/dist/{turn-operations-DYKtoVd9.d.ts → turn-operations-CTZwjdxD.d.ts} +3 -3
  61. package/dist/{turn-operations-DYKtoVd9.d.ts.map → turn-operations-CTZwjdxD.d.ts.map} +1 -1
  62. package/dist/types-BiobHM1D.js.map +1 -1
  63. package/dist/types.d.ts +2 -2
  64. package/package.json +1 -1
  65. package/scripts/eval.ts +18 -1
  66. package/dist/agent-Bt123Fdy.d.ts.map +0 -1
  67. package/dist/index-C7BvI1Hi.d.ts.map +0 -1
  68. package/dist/messages-DdfOKKx_.js.map +0 -1
  69. package/dist/providers-BxHepM_P.js.map +0 -1
  70. package/dist/tool-formatters-BuB31L-c.d.ts.map +0 -1
  71. package/dist/tools-Bk9TqmCV.js.map +0 -1
package/dist/eval.js CHANGED
@@ -1,7 +1,7 @@
1
- import { _ as alwaysQuote } from "./tools-Bk9TqmCV.js";
1
+ import { _ as alwaysQuote } from "./tools-BbVXIpFo.js";
2
2
  import { r as createProcessContext } from "./contexts-GKAWYq07.js";
3
- import { a as headlessEventToJsonl, c as runHeadless } from "./headless-Cn6XXmr3.js";
4
- import { i as createMemoryStore, t as createSession } from "./session-C0uGIWm_.js";
3
+ import { a as headlessEventToJsonl, c as runHeadless } from "./headless-WsGaqG1W.js";
4
+ import { i as createMemoryStore, t as createSession } from "./session-CZniOWFD.js";
5
5
  import { join, relative, resolve } from "node:path";
6
6
  import { tmpdir } from "node:os";
7
7
  import { mkdir, mkdtemp, rm, writeFile } from "node:fs/promises";
@@ -327,10 +327,10 @@ const JUDGE_TOOL = {
327
327
  }
328
328
  };
329
329
  async function runEvalCase(options) {
330
- const { id, suite, tags = [], artifactDir, workspace, scorers = [], onEvent, ...headless } = options;
330
+ const { id, suite, variant, tags = [], artifactDir, workspace, scorers = [], onEvent, ...headless } = options;
331
331
  const { metrics: declaredMetrics, sourceFile, ...headlessRest } = headless;
332
332
  const events = [];
333
- const caseArtifactDir = artifactDir ? join(artifactDir, safeSegment(suite ?? "eval"), safeSegment(id)) : void 0;
333
+ const caseArtifactDir = artifactDir ? join(artifactDir, ...variant ? [safeSegment(variant)] : [], safeSegment(suite ?? "eval"), safeSegment(id)) : void 0;
334
334
  const workspaceState = await prepareWorkspace(headlessRest.execution, workspace);
335
335
  const caseMetricIds = new Set(Object.keys(declaredMetrics ?? {}));
336
336
  const hasCaseMetrics = caseMetricIds.size > 0;
@@ -373,6 +373,7 @@ async function runEvalCase(options) {
373
373
  const evalResult = {
374
374
  id,
375
375
  ...suite ? { suite } : {},
376
+ ...variant ? { variant } : {},
376
377
  tags,
377
378
  result,
378
379
  score,
@@ -557,6 +558,7 @@ function functionalityMetric(metricId, scorers, name = metricId) {
557
558
  function formatEvalCaseSummary(result) {
558
559
  const lines = [
559
560
  `Eval ${result.suite ? `${result.suite}/` : ""}${result.id}`,
561
+ ...result.variant ? [`variant: ${result.variant}`] : [],
560
562
  `status: ${result.result.status}`,
561
563
  `passed: ${result.passed}`,
562
564
  `score: ${result.score.toFixed(2)}`,
@@ -602,7 +604,8 @@ function registerEvalTests(options) {
602
604
  });
603
605
  const work = [];
604
606
  for (const evalCase of options.cases) {
605
- const baseLabel = `${evalCase.suite ? `${evalCase.suite}/` : ""}${evalCase.id}`;
607
+ const caseLabel = `${evalCase.suite ? `${evalCase.suite}/` : ""}${evalCase.id}`;
608
+ const baseLabel = evalCase.variant ? `[${evalCase.variant}] ${caseLabel}` : caseLabel;
606
609
  for (let run = 1; run <= repeat; run++) {
607
610
  const label = repeat > 1 ? `${baseLabel} #${run}` : baseLabel;
608
611
  const caseId = repeat > 1 ? `${evalCase.id}-repeat-${run}` : evalCase.id;
@@ -663,7 +666,12 @@ function createEvalRunReporter(options = {}) {
663
666
  async record(result) {
664
667
  results.push(result);
665
668
  if (options.outputDir) {
666
- const casePath = join(options.outputDir, "cases", `${safeSegment(result.suite ?? "eval")}--${safeSegment(result.id)}.json`);
669
+ const segments = [
670
+ ...result.variant ? [safeSegment(result.variant)] : [],
671
+ safeSegment(result.suite ?? "eval"),
672
+ safeSegment(result.id)
673
+ ];
674
+ const casePath = join(options.outputDir, "cases", `${segments.join("--")}.json`);
667
675
  await mkdir(join(options.outputDir, "cases"), { recursive: true });
668
676
  await writeFile(casePath, `${JSON.stringify(result, null, 2)}\n`);
669
677
  }
@@ -681,14 +689,21 @@ function createEvalRunReporter(options = {}) {
681
689
  }
682
690
  };
683
691
  }
684
- /** Stable ordering by `suite/id` so parallel completion order doesn't shuffle output. */
692
+ /** Stable ordering by `variant`, then `suite/id`, so parallel completion order doesn't shuffle output. */
685
693
  function sortCases(results) {
686
694
  return [...results].sort((a, b) => {
695
+ const va = a.variant ?? "";
696
+ const vb = b.variant ?? "";
697
+ if (va !== vb) return va.localeCompare(vb);
687
698
  const ka = `${a.suite ?? ""}/${a.id}`;
688
699
  const kb = `${b.suite ?? ""}/${b.id}`;
689
700
  return ka.localeCompare(kb);
690
701
  });
691
702
  }
703
+ /** Distinct variant labels across results, in sorted order. */
704
+ function distinctVariants(results) {
705
+ return [...new Set(results.map((r) => r.variant).filter((v) => Boolean(v)))].sort();
706
+ }
692
707
  function buildEvalRunSummary(input) {
693
708
  const results = sortCases(input);
694
709
  const usage = results.reduce((acc, result) => {
@@ -715,6 +730,7 @@ function buildEvalRunSummary(input) {
715
730
  cases: results.map((result) => ({
716
731
  id: result.id,
717
732
  ...result.suite ? { suite: result.suite } : {},
733
+ ...result.variant ? { variant: result.variant } : {},
718
734
  passed: result.passed,
719
735
  score: result.score,
720
736
  status: result.result.status,
@@ -725,9 +741,37 @@ function buildEvalRunSummary(input) {
725
741
  trajectory: result.trajectory
726
742
  })),
727
743
  metrics: aggregateMetrics(results),
728
- tagScores: aggregateTagScores(results)
744
+ tagScores: aggregateTagScores(results),
745
+ ...distinctVariants(results).length > 0 ? { variants: aggregateVariants(results) } : {}
729
746
  };
730
747
  }
748
+ function aggregateVariants(results) {
749
+ return distinctVariants(results).map((variant) => {
750
+ const group = results.filter((r) => r.variant === variant);
751
+ const usage = group.reduce((acc, result) => {
752
+ acc.input += result.result.usage.input;
753
+ acc.output += result.result.usage.output;
754
+ acc.cacheRead += result.result.usage.cacheRead;
755
+ acc.cacheCreation += result.result.usage.cacheCreation;
756
+ acc.cost += result.result.usage.cost ?? 0;
757
+ return acc;
758
+ }, {
759
+ input: 0,
760
+ output: 0,
761
+ cacheRead: 0,
762
+ cacheCreation: 0,
763
+ cost: 0
764
+ });
765
+ return {
766
+ variant,
767
+ count: group.length,
768
+ passed: group.filter((r) => r.passed).length,
769
+ score: mean(group.map((r) => r.score)),
770
+ durationMs: group.reduce((sum, r) => sum + r.result.durationMs, 0),
771
+ usage
772
+ };
773
+ });
774
+ }
731
775
  function aggregateMetrics(results) {
732
776
  const byId = /* @__PURE__ */ new Map();
733
777
  for (const result of results) for (const metric of result.metrics) {
@@ -776,8 +820,17 @@ function formatEvalRunSummaryWithOptions(input, options = {}) {
776
820
  if (results.length === 0) return `${color.heading("Eval run summary")}\n${color.muted("no evals ran")}`;
777
821
  const summary = buildEvalRunSummary(results);
778
822
  const out = [];
823
+ const variants = distinctVariants(results);
779
824
  out.push(color.heading("Eval run summary"));
780
- out.push(renderResultsTable(results, summary, color));
825
+ if (variants.length > 1) {
826
+ out.push(renderComparisonTable(results, variants, color));
827
+ for (const variant of variants) {
828
+ const group = results.filter((r) => r.variant === variant);
829
+ out.push("");
830
+ out.push(color.heading(variant));
831
+ out.push(renderResultsTable(group, buildEvalRunSummary(group), color));
832
+ }
833
+ } else out.push(renderResultsTable(results, summary, color));
781
834
  const tagTable = renderTagTable(summary.tagScores, color);
782
835
  if (tagTable) {
783
836
  out.push("");
@@ -790,7 +843,8 @@ function formatEvalRunSummaryWithOptions(input, options = {}) {
790
843
  return hyperlinks ? oscLink(abs, abs) : abs;
791
844
  };
792
845
  for (const result of results) {
793
- const label = `${result.suite ? `${result.suite}/` : ""}${result.id}`;
846
+ const caseLabel = `${result.suite ? `${result.suite}/` : ""}${result.id}`;
847
+ const label = result.variant ? `[${result.variant}] ${caseLabel}` : caseLabel;
794
848
  out.push(`${color.dot(result.passed)} ${color.caseStatus(result.passed, label)}`);
795
849
  out.push(`${color.muted("score")} ${color.score(result.score)}`);
796
850
  const tagEntries = Object.entries(result.tagScores);
@@ -848,6 +902,120 @@ function oscLink(target, text) {
848
902
  function absPath(target) {
849
903
  return resolve(target.replace(/^file:\/\//, ""));
850
904
  }
905
+ /**
906
+ * Case key used to line up the same eval across variants in the comparison
907
+ * matrix. Strips the `-repeat-N` suffix `registerEvalTests` appends so a
908
+ * repeated case aggregates into one row (mean score, x/y passed).
909
+ */
910
+ function comparisonCaseKey(result) {
911
+ const id = result.id.replace(/-repeat-\d+$/, "");
912
+ return `${result.suite ? `${result.suite}/` : ""}${id}`;
913
+ }
914
+ /**
915
+ * Cross-variant comparison matrix: one row per eval case, one column per
916
+ * variant (provider/model target), plus rollup rows (score / passed / cost /
917
+ * time) so two models can be read side by side at a glance.
918
+ */
919
+ function renderComparisonTable(input, variants, color) {
920
+ const results = sortCases(input);
921
+ const caseKeys = [...new Set(results.map(comparisonCaseKey))].sort();
922
+ const byCell = /* @__PURE__ */ new Map();
923
+ for (const result of results) {
924
+ const key = `${comparisonCaseKey(result)}\u0000${result.variant ?? ""}`;
925
+ const list = byCell.get(key) ?? [];
926
+ list.push(result);
927
+ byCell.set(key, list);
928
+ }
929
+ const cellFor = (caseKey, variant) => {
930
+ const group = byCell.get(`${caseKey}\u0000${variant}`);
931
+ if (!group || group.length === 0) return { text: "-" };
932
+ const score = mean(group.map((r) => r.score));
933
+ const passedCount = group.filter((r) => r.passed).length;
934
+ const allPassed = passedCount === group.length;
935
+ const status = group.length > 1 ? `${passedCount}/${group.length}` : allPassed ? "pass" : "fail";
936
+ return {
937
+ text: `${score.toFixed(2)} ${status}`,
938
+ score,
939
+ passed: allPassed
940
+ };
941
+ };
942
+ const variantSummaries = aggregateVariants(results);
943
+ const summaryOf = (variant) => variantSummaries.find((s) => s.variant === variant);
944
+ const rows = caseKeys.map((key) => ({
945
+ label: key,
946
+ cells: variants.map((variant) => cellFor(key, variant))
947
+ }));
948
+ const rollups = [
949
+ {
950
+ label: "SCORE",
951
+ rollup: true,
952
+ cells: variants.map((variant) => {
953
+ const s = summaryOf(variant);
954
+ return s ? {
955
+ text: s.score.toFixed(2),
956
+ score: s.score
957
+ } : { text: "-" };
958
+ })
959
+ },
960
+ {
961
+ label: "PASSED",
962
+ rollup: true,
963
+ cells: variants.map((variant) => {
964
+ const s = summaryOf(variant);
965
+ return s ? {
966
+ text: `${s.passed}/${s.count}`,
967
+ passed: s.passed === s.count
968
+ } : { text: "-" };
969
+ })
970
+ },
971
+ {
972
+ label: "COST",
973
+ rollup: true,
974
+ cells: variants.map((variant) => {
975
+ return { text: formatCost(summaryOf(variant)?.usage.cost ?? 0) };
976
+ })
977
+ },
978
+ {
979
+ label: "TIME",
980
+ rollup: true,
981
+ cells: variants.map((variant) => {
982
+ const s = summaryOf(variant);
983
+ return { text: s ? formatDuration(s.durationMs) : "-" };
984
+ })
985
+ }
986
+ ];
987
+ const header = ["EVAL", ...variants];
988
+ const allRows = [...rows, ...rollups];
989
+ const widths = header.map((h, i) => Math.max(h.length, ...allRows.map((row) => (i === 0 ? row.label : row.cells[i - 1].text).length)));
990
+ const top = `┌${widths.map((w) => "─".repeat(w + 2)).join("┬")}┐`;
991
+ const mid = `├${widths.map((w) => "─".repeat(w + 2)).join("┼")}┤`;
992
+ const bot = `└${widths.map((w) => "─".repeat(w + 2)).join("┴")}┘`;
993
+ const paintComparisonCell = (cell, padded) => {
994
+ if (cell.score === void 0 && cell.passed === void 0) return color.muted(padded);
995
+ if (cell.score !== void 0) {
996
+ const scoreText = cell.score.toFixed(2);
997
+ const rest = cell.text.slice(scoreText.length);
998
+ const coloredRest = cell.passed === void 0 ? color.muted(rest) : cell.passed ? color.pass(rest) : color.fail(rest);
999
+ return padded.replace(cell.text, `${color.score(cell.score)}${coloredRest}`);
1000
+ }
1001
+ return padded.replace(cell.text, cell.passed ? color.pass(cell.text) : color.fail(cell.text));
1002
+ };
1003
+ const renderRow = (row) => {
1004
+ const labelPadded = padEnd(row.label, widths[0]);
1005
+ return `│ ${row.rollup ? color.heading(labelPadded) : color.muted(labelPadded)} │${row.cells.map((cell, i) => {
1006
+ return ` ${paintComparisonCell(cell, padStart(cell.text, widths[i + 1]))} `;
1007
+ }).join("│")}│`;
1008
+ };
1009
+ return [
1010
+ top,
1011
+ `│ ${color.heading(padEnd(header[0], widths[0]))} │${variants.map((v, i) => ` ${color.heading(padStart(v, widths[i + 1]))} `).join("│")}│`,
1012
+ mid,
1013
+ ...rows.map(renderRow),
1014
+ mid,
1015
+ ...rollups.map(renderRow),
1016
+ bot
1017
+ ].join("\n");
1018
+ }
851
1019
  function renderResultsTable(results, summary, color) {
852
1020
  const header = [
853
1021
  "EVAL",