@m4trix/evals 0.25.0 → 0.25.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.cjs CHANGED
@@ -2,7 +2,7 @@
2
2
  'use strict';
3
3
 
4
4
  var fullscreenInk = require('fullscreen-ink');
5
- var React2 = require('react');
5
+ var React = require('react');
6
6
  var ink = require('ink');
7
7
  var jsxRuntime = require('react/jsx-runtime');
8
8
  var path = require('path');
@@ -37,7 +37,7 @@ function _interopNamespace(e) {
37
37
  return Object.freeze(n);
38
38
  }
39
39
 
40
- var React2__default = /*#__PURE__*/_interopDefault(React2);
40
+ var React__default = /*#__PURE__*/_interopDefault(React);
41
41
  var jitiModule__namespace = /*#__PURE__*/_interopNamespace(jitiModule);
42
42
  var stringify__default = /*#__PURE__*/_interopDefault(stringify);
43
43
 
@@ -117,11 +117,7 @@ function getFooterText(state) {
117
117
  }
118
118
  return "\u2191\u2193 move Enter add/remove S start run / search Esc cancel q quit";
119
119
  }
120
- function ListItem({
121
- selected,
122
- label,
123
- itemKey
124
- }) {
120
+ function ListItem({ selected, label, itemKey }) {
125
121
  return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: selected ? "cyan" : "gray", bold: selected, children: [
126
122
  selected ? "\u25B8 " : " ",
127
123
  label
@@ -148,9 +144,7 @@ function Pane({
148
144
  }
149
145
  );
150
146
  }
151
- function SectionHeader({
152
- children
153
- }) {
147
+ function SectionHeader({ children }) {
154
148
  return /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children });
155
149
  }
156
150
  function StatusText({ status }) {
@@ -162,10 +156,7 @@ function StatusText({ status }) {
162
156
  ] });
163
157
  }
164
158
  var LEFT_PANE_WIDTH = 44;
165
- function RunsSidebar({
166
- state,
167
- runs
168
- }) {
159
+ function RunsSidebar({ state, runs }) {
169
160
  const focused = state.focus === "left";
170
161
  return /* @__PURE__ */ jsxRuntime.jsxs(Pane, { width: LEFT_PANE_WIDTH, focused, children: [
171
162
  /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Runs" }),
@@ -194,11 +185,7 @@ function RunsSidebar({
194
185
  ] });
195
186
  }
196
187
  var BLOCKS = ["\u2581", "\u2582", "\u2583", "\u2584", "\u2585", "\u2586", "\u2587", "\u2588"];
197
- function Sparkline({
198
- data,
199
- width,
200
- label
201
- }) {
188
+ function Sparkline({ data, width, label }) {
202
189
  if (data.length === 0)
203
190
  return null;
204
191
  const max = Math.max(...data);
@@ -428,9 +415,7 @@ var data_mock_default = {
428
415
  { name: "contract_match", score: 100 },
429
416
  { name: "arg_validity", score: 100 }
430
417
  ],
431
- checks: [
432
- { name: "tool_calls", passed: true, detail: "0 unexpected" }
433
- ],
418
+ checks: [{ name: "tool_calls", passed: true, detail: "0 unexpected" }],
434
419
  failures: [],
435
420
  meta: {
436
421
  model: "gpt-4o-mini",
@@ -453,9 +438,21 @@ var data_mock_default = {
453
438
  }
454
439
  ],
455
440
  evaluators: [
456
- { id: "json-schema-validator", name: "JSON Schema Validator", configPreview: "strict=true" },
457
- { id: "tool-call-contract-checker", name: "Tool-call Contract Checker", configPreview: "unexpectedCalls=error" },
458
- { id: "rubric-judge", name: "Rubric Judge (LLM)", configPreview: "model=gpt-4o-mini; scale=0-100" },
441
+ {
442
+ id: "json-schema-validator",
443
+ name: "JSON Schema Validator",
444
+ configPreview: "strict=true"
445
+ },
446
+ {
447
+ id: "tool-call-contract-checker",
448
+ name: "Tool-call Contract Checker",
449
+ configPreview: "unexpectedCalls=error"
450
+ },
451
+ {
452
+ id: "rubric-judge",
453
+ name: "Rubric Judge (LLM)",
454
+ configPreview: "model=gpt-4o-mini; scale=0-100"
455
+ },
459
456
  { id: "pii-leak-detector", name: "PII Leak Detector", configPreview: "redact=false" }
460
457
  ]
461
458
  };
@@ -535,9 +532,7 @@ async function loadRunnerData(runner) {
535
532
  const memSnapshots = runner.getAllRunSnapshots();
536
533
  const seen = new Set(memSnapshots.map((s) => s.runId));
537
534
  const fromDisk = diskSnapshots.filter((s) => !seen.has(s.runId));
538
- const snapshots = [...memSnapshots, ...fromDisk].sort(
539
- (a, b) => b.queuedAt - a.queuedAt
540
- );
535
+ const snapshots = [...memSnapshots, ...fromDisk].sort((a, b) => b.queuedAt - a.queuedAt);
541
536
  if (datasets.length === 0 && evaluators.length === 0) {
542
537
  return loadMockData();
543
538
  }
@@ -659,7 +654,11 @@ function reduceCliState(state, action) {
659
654
  return { ...state, overviewScrollOffset: Math.max(0, state.overviewScrollOffset - 1) };
660
655
  }
661
656
  if (state.level === "datasets") {
662
- return { ...state, datasetMenuIndex: Math.max(0, state.datasetMenuIndex - 1), overviewScrollOffset: 0 };
657
+ return {
658
+ ...state,
659
+ datasetMenuIndex: Math.max(0, state.datasetMenuIndex - 1),
660
+ overviewScrollOffset: 0
661
+ };
663
662
  }
664
663
  if (state.level === "runs") {
665
664
  return { ...state, runMenuIndex: Math.max(0, state.runMenuIndex - 1) };
@@ -677,10 +676,17 @@ function reduceCliState(state, action) {
677
676
  return { ...state, detailsScrollOffset: Math.min(action.max, state.detailsScrollOffset + 1) };
678
677
  }
679
678
  if (state.level === "datasets" && state.focus === "right") {
680
- return { ...state, overviewScrollOffset: Math.min(action.max, state.overviewScrollOffset + 1) };
679
+ return {
680
+ ...state,
681
+ overviewScrollOffset: Math.min(action.max, state.overviewScrollOffset + 1)
682
+ };
681
683
  }
682
684
  if (state.level === "datasets") {
683
- return { ...state, datasetMenuIndex: Math.min(action.max, state.datasetMenuIndex + 1), overviewScrollOffset: 0 };
685
+ return {
686
+ ...state,
687
+ datasetMenuIndex: Math.min(action.max, state.datasetMenuIndex + 1),
688
+ overviewScrollOffset: 0
689
+ };
684
690
  }
685
691
  if (state.level === "runs") {
686
692
  return { ...state, runMenuIndex: Math.min(action.max, state.runMenuIndex + 1) };
@@ -762,18 +768,8 @@ var defaultRunnerConfig = {
762
768
  discovery: {
763
769
  rootDir: process.cwd(),
764
770
  datasetSuffixes: [".dataset.ts", ".dataset.tsx", ".dataset.js", ".dataset.mjs"],
765
- evaluatorSuffixes: [
766
- ".evaluator.ts",
767
- ".evaluator.tsx",
768
- ".evaluator.js",
769
- ".evaluator.mjs"
770
- ],
771
- testCaseSuffixes: [
772
- ".test-case.ts",
773
- ".test-case.tsx",
774
- ".test-case.js",
775
- ".test-case.mjs"
776
- ],
771
+ evaluatorSuffixes: [".evaluator.ts", ".evaluator.tsx", ".evaluator.js", ".evaluator.mjs"],
772
+ testCaseSuffixes: [".test-case.ts", ".test-case.tsx", ".test-case.js", ".test-case.mjs"],
777
773
  excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
778
774
  },
779
775
  artifactDirectory: ".eval-results",
@@ -840,14 +836,15 @@ function getJitiLoader() {
840
836
  }
841
837
  const createJiti2 = jitiModule__namespace.createJiti ?? jitiModule__namespace.default;
842
838
  if (typeof createJiti2 !== "function") {
843
- throw new Error(
844
- "Failed to initialize jiti for m4trix eval config loading."
845
- );
839
+ throw new Error("Failed to initialize jiti for m4trix eval config loading.");
846
840
  }
847
- cachedLoader = createJiti2((typeof document === 'undefined' ? require('u' + 'rl').pathToFileURL(__filename).href : (_documentCurrentScript && _documentCurrentScript.tagName.toUpperCase() === 'SCRIPT' && _documentCurrentScript.src || new URL('out.js', document.baseURI).href)), {
848
- interopDefault: true,
849
- moduleCache: true
850
- });
841
+ cachedLoader = createJiti2(
842
+ (typeof document === 'undefined' ? require('u' + 'rl').pathToFileURL(__filename).href : (_documentCurrentScript && _documentCurrentScript.tagName.toUpperCase() === 'SCRIPT' && _documentCurrentScript.src || new URL('out.js', document.baseURI).href)),
843
+ {
844
+ interopDefault: true,
845
+ moduleCache: true
846
+ }
847
+ );
851
848
  return cachedLoader;
852
849
  }
853
850
  function resolveConfigModuleExport(loadedModule) {
@@ -951,9 +948,7 @@ async function loadModuleExports(filePath) {
951
948
  }
952
949
  async function collectDatasetsFromFiles(config) {
953
950
  const files = await walkDirectory(config.rootDir, config.excludeDirectories);
954
- const matched = files.filter(
955
- (filePath) => hasOneSuffix(filePath, config.datasetSuffixes)
956
- );
951
+ const matched = files.filter((filePath) => hasOneSuffix(filePath, config.datasetSuffixes));
957
952
  const found = await Promise.all(
958
953
  matched.map(async (absolutePath) => {
959
954
  const exports = await loadModuleExports(absolutePath);
@@ -970,9 +965,7 @@ async function collectDatasetsFromFiles(config) {
970
965
  }
971
966
  async function collectEvaluatorsFromFiles(config) {
972
967
  const files = await walkDirectory(config.rootDir, config.excludeDirectories);
973
- const matched = files.filter(
974
- (filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes)
975
- );
968
+ const matched = files.filter((filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes));
976
969
  const found = await Promise.all(
977
970
  matched.map(async (absolutePath) => {
978
971
  const exports = await loadModuleExports(absolutePath);
@@ -989,9 +982,7 @@ async function collectEvaluatorsFromFiles(config) {
989
982
  }
990
983
  async function collectTestCasesFromFiles(config) {
991
984
  const files = await walkDirectory(config.rootDir, config.excludeDirectories);
992
- const matched = files.filter(
993
- (filePath) => hasOneSuffix(filePath, config.testCaseSuffixes)
994
- );
985
+ const matched = files.filter((filePath) => hasOneSuffix(filePath, config.testCaseSuffixes));
995
986
  const found = await Promise.all(
996
987
  matched.map(async (absolutePath) => {
997
988
  const exports = await loadModuleExports(absolutePath);
@@ -1063,16 +1054,8 @@ function createDiffString(expected, actual, diffOptions) {
1063
1054
  const expectedProcessed = preprocessForDiff(expected, diffOptions);
1064
1055
  const actualProcessed = preprocessForDiff(actual, diffOptions);
1065
1056
  if (diffOptions?.keysOnly) {
1066
- const expectedKeys = JSON.stringify(
1067
- extractKeys(expectedProcessed),
1068
- null,
1069
- 2
1070
- );
1071
- const actualKeys = JSON.stringify(
1072
- extractKeys(actualProcessed),
1073
- null,
1074
- 2
1075
- );
1057
+ const expectedKeys = JSON.stringify(extractKeys(expectedProcessed), null, 2);
1058
+ const actualKeys = JSON.stringify(extractKeys(actualProcessed), null, 2);
1076
1059
  const parts2 = diff.diffLines(expectedKeys, actualKeys);
1077
1060
  return formatDiffParts(parts2);
1078
1061
  }
@@ -1083,9 +1066,7 @@ function createDiffString(expected, actual, diffOptions) {
1083
1066
  }
1084
1067
  const parts = diff.diffLines(expectedStr, actualStr);
1085
1068
  if (diffOptions?.outputNewOnly) {
1086
- const filtered = parts.filter(
1087
- (p) => p.added === true
1088
- );
1069
+ const filtered = parts.filter((p) => p.added === true);
1089
1070
  return formatDiffParts(filtered);
1090
1071
  }
1091
1072
  return formatDiffParts(parts);
@@ -1187,10 +1168,7 @@ var ScoreAggregate = {
1187
1168
  const count = values.length || 1;
1188
1169
  const result = {};
1189
1170
  for (const field of fields) {
1190
- result[field] = values.reduce(
1191
- (s, v) => s + (v[field] ?? 0),
1192
- 0
1193
- ) / count;
1171
+ result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
1194
1172
  }
1195
1173
  return result;
1196
1174
  };
@@ -1224,13 +1202,10 @@ var ScoreAggregate = {
1224
1202
  (s, v) => s + (v[valueField] ?? 0),
1225
1203
  0
1226
1204
  );
1227
- const sumSq = values.reduce(
1228
- (s, v) => {
1229
- const value = v[valueField] ?? 0;
1230
- return s + value * value;
1231
- },
1232
- 0
1233
- );
1205
+ const sumSq = values.reduce((s, v) => {
1206
+ const value = v[valueField] ?? 0;
1207
+ return s + value * value;
1208
+ }, 0);
1234
1209
  const mean = sum / count;
1235
1210
  const variance = (sumSq - count * mean * mean) / (count - 1);
1236
1211
  stdDev = variance > 0 ? Math.sqrt(variance) : 0;
@@ -1461,20 +1436,14 @@ function nowIsoForFile() {
1461
1436
  return (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
1462
1437
  }
1463
1438
  function createArtifactPath(artifactDirectory, datasetId, runId) {
1464
- return path.join(
1465
- artifactDirectory,
1466
- `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
1467
- );
1439
+ return path.join(artifactDirectory, `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`);
1468
1440
  }
1469
1441
  function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
1470
1442
  const { testCaseItem, rerunIndex, rerunTotal } = unit;
1471
1443
  return effect.Effect.gen(function* () {
1472
1444
  const evaluatorRunId = `run-${crypto.randomUUID()}`;
1473
1445
  const started = Date.now();
1474
- const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [
1475
- n + 1,
1476
- n + 1
1477
- ]);
1446
+ const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [n + 1, n + 1]);
1478
1447
  yield* publishEvent({
1479
1448
  type: "TestCaseStarted",
1480
1449
  runId: task.runId,
@@ -1507,9 +1476,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1507
1476
  return error;
1508
1477
  };
1509
1478
  try {
1510
- const ctx = yield* effect.Effect.promise(
1511
- () => Promise.resolve(evaluator.resolveContext())
1512
- );
1479
+ const ctx = yield* effect.Effect.promise(() => Promise.resolve(evaluator.resolveContext()));
1513
1480
  const result = yield* effect.Effect.promise(
1514
1481
  () => Promise.resolve().then(
1515
1482
  () => evaluateFn({
@@ -1564,10 +1531,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1564
1531
  }
1565
1532
  }
1566
1533
  const rerunPassedThis = evaluatorScores.every((s) => s.passed);
1567
- const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [
1568
- n + 1,
1569
- n + 1
1570
- ]);
1534
+ const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [n + 1, n + 1]);
1571
1535
  const progressEvent = {
1572
1536
  type: "TestCaseProgress",
1573
1537
  runId: task.runId,
@@ -1616,10 +1580,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1616
1580
  } else {
1617
1581
  yield* effect.Ref.update(failedRef, (n) => n + 1);
1618
1582
  }
1619
- const [passed, failed] = yield* effect.Effect.all([
1620
- effect.Ref.get(passedRef),
1621
- effect.Ref.get(failedRef)
1622
- ]);
1583
+ const [passed, failed] = yield* effect.Effect.all([effect.Ref.get(passedRef), effect.Ref.get(failedRef)]);
1623
1584
  yield* updateSnapshot(task.runId, (snapshot) => ({
1624
1585
  ...snapshot,
1625
1586
  passedTestCases: passed,
@@ -1969,15 +1930,11 @@ var EffectRunner = class {
1969
1930
  this.persistenceQueue = effect.Effect.runSync(
1970
1931
  effect.Queue.unbounded()
1971
1932
  );
1972
- this.snapshotsRef = effect.Effect.runSync(
1973
- effect.Ref.make(/* @__PURE__ */ new Map())
1974
- );
1933
+ this.snapshotsRef = effect.Effect.runSync(effect.Ref.make(/* @__PURE__ */ new Map()));
1975
1934
  this.listeners = /* @__PURE__ */ new Set();
1976
1935
  this.datasetsById = /* @__PURE__ */ new Map();
1977
1936
  this.evaluatorsById = /* @__PURE__ */ new Map();
1978
- this.schedulerFiber = effect.Effect.runFork(
1979
- this.createSchedulerEffect()
1980
- );
1937
+ this.schedulerFiber = effect.Effect.runFork(this.createSchedulerEffect());
1981
1938
  this.persistenceFiber = effect.Effect.runFork(
1982
1939
  createPersistenceWorker(this.persistenceQueue)
1983
1940
  );
@@ -2124,9 +2081,9 @@ var EffectRunner = class {
2124
2081
  return effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).get(runId);
2125
2082
  }
2126
2083
  getAllRunSnapshots() {
2127
- return Array.from(
2128
- effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).values()
2129
- ).sort((a, b) => b.queuedAt - a.queuedAt);
2084
+ return Array.from(effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).values()).sort(
2085
+ (a, b) => b.queuedAt - a.queuedAt
2086
+ );
2130
2087
  }
2131
2088
  async loadRunSnapshotsFromArtifacts() {
2132
2089
  return loadRunSnapshotsFromArtifacts(this.config);
@@ -2238,9 +2195,9 @@ function DatasetsView({
2238
2195
  }) {
2239
2196
  const leftFocused = state.focus === "left";
2240
2197
  const rightFocused = state.focus === "right";
2241
- const [runScores, setRunScores] = React2.useState([]);
2242
- const [loading, setLoading] = React2.useState(false);
2243
- React2.useEffect(() => {
2198
+ const [runScores, setRunScores] = React.useState([]);
2199
+ const [loading, setLoading] = React.useState(false);
2200
+ React.useEffect(() => {
2244
2201
  if (!selectedDataset?.runs?.length) {
2245
2202
  setRunScores([]);
2246
2203
  return;
@@ -2252,7 +2209,7 @@ function DatasetsView({
2252
2209
  const barData = runScores.slice(0, MAX_RUNS_FOR_CHART).reverse();
2253
2210
  const trendValues = runScores.slice(0, MAX_RUNS_FOR_TREND).map((r) => r.value).reverse();
2254
2211
  const trendBatched = batchAverage(trendValues, TREND_BATCH_SIZE);
2255
- const overviewRows = React2.useMemo(() => {
2212
+ const overviewRows = React.useMemo(() => {
2256
2213
  const rows = [];
2257
2214
  rows.push(
2258
2215
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: selectedDataset?.overview ?? "Select a dataset to inspect prior runs." }, "overview")
@@ -2342,11 +2299,7 @@ function DatasetsView({
2342
2299
  ] })
2343
2300
  ] });
2344
2301
  }
2345
- function RunsView({
2346
- state,
2347
- dataset,
2348
- selectedRun
2349
- }) {
2302
+ function RunsView({ state, dataset, selectedRun }) {
2350
2303
  const runs = dataset?.runs ?? [];
2351
2304
  const rightFocused = state.focus === "right";
2352
2305
  return /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
@@ -2362,10 +2315,10 @@ function RunsView({
2362
2315
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
2363
2316
  "Commit: ",
2364
2317
  selectedRun.meta.commit,
2365
- " Branch: ",
2318
+ " Branch: ",
2366
2319
  selectedRun.meta.branch,
2320
+ " Seed:",
2367
2321
  " ",
2368
- "Seed: ",
2369
2322
  selectedRun.meta.seed
2370
2323
  ] }),
2371
2324
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }),
@@ -2378,23 +2331,10 @@ function RunsView({
2378
2331
  format: (v) => `${v}%`
2379
2332
  }
2380
2333
  ),
2381
- /* @__PURE__ */ jsxRuntime.jsx(
2382
- TextBar,
2383
- {
2384
- label: "avg score",
2385
- value: Math.round(selectedRun.performance.avgScore * 100)
2386
- }
2387
- ),
2334
+ /* @__PURE__ */ jsxRuntime.jsx(TextBar, { label: "avg score", value: Math.round(selectedRun.performance.avgScore * 100) }),
2388
2335
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }),
2389
2336
  /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Dimensions" }),
2390
- selectedRun.dimensions.map((dimension) => /* @__PURE__ */ jsxRuntime.jsx(
2391
- TextBar,
2392
- {
2393
- label: dimension.name,
2394
- value: dimension.score
2395
- },
2396
- dimension.name
2397
- )),
2337
+ selectedRun.dimensions.map((dimension) => /* @__PURE__ */ jsxRuntime.jsx(TextBar, { label: dimension.name, value: dimension.score }, dimension.name)),
2398
2338
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }),
2399
2339
  /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Latency trend" }),
2400
2340
  /* @__PURE__ */ jsxRuntime.jsx(
@@ -2497,15 +2437,7 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
2497
2437
  ...dimensions.map((d) => /* @__PURE__ */ jsxRuntime.jsx(TextBar, { label: d.name, value: d.score }, `dim-${d.name}`)),
2498
2438
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }, "sp2"),
2499
2439
  /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Checks (boolean)" }, "checks-h"),
2500
- ...checks.map((c) => /* @__PURE__ */ jsxRuntime.jsx(
2501
- CheckRow,
2502
- {
2503
- name: c.name,
2504
- passed: c.passed,
2505
- detail: c.detail
2506
- },
2507
- `chk-${c.name}`
2508
- )),
2440
+ ...checks.map((c) => /* @__PURE__ */ jsxRuntime.jsx(CheckRow, { name: c.name, passed: c.passed, detail: c.detail }, `chk-${c.name}`)),
2509
2441
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }, "sp3"),
2510
2442
  /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Performance" }, "perf-h"),
2511
2443
  /* @__PURE__ */ jsxRuntime.jsx(
@@ -2622,17 +2554,10 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
2622
2554
  }
2623
2555
  } else {
2624
2556
  rows.push(
2625
- /* @__PURE__ */ jsxRuntime.jsxs(
2626
- ink.Text,
2627
- {
2628
- color: "gray",
2629
- children: [
2630
- " ",
2631
- "n/a"
2632
- ]
2633
- },
2634
- `tc-${tc.testCaseId}-${item.evaluatorId}-n/a`
2635
- )
2557
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
2558
+ " ",
2559
+ "n/a"
2560
+ ] }, `tc-${tc.testCaseId}-${item.evaluatorId}-n/a`)
2636
2561
  );
2637
2562
  }
2638
2563
  if (!item.passed && item.logs && item.logs.length > 0) {
@@ -2689,12 +2614,12 @@ function RunDetailsView({
2689
2614
  }) {
2690
2615
  const runs = dataset?.runs ?? [];
2691
2616
  const rightFocused = state.focus === "right";
2692
- const [testCases, setTestCases] = React2.useState([]);
2693
- const evaluatorNameById = React2__default.default.useMemo(
2617
+ const [testCases, setTestCases] = React.useState([]);
2618
+ const evaluatorNameById = React__default.default.useMemo(
2694
2619
  () => new Map(evaluators.map((e) => [e.id, e.name])),
2695
2620
  [evaluators]
2696
2621
  );
2697
- React2.useEffect(() => {
2622
+ React.useEffect(() => {
2698
2623
  if (!selectedRun?.meta?.artifact) {
2699
2624
  setTestCases([]);
2700
2625
  return;
@@ -2713,7 +2638,7 @@ function RunDetailsView({
2713
2638
  const visible = rows.slice(offset, offset + DETAILS_PAGE_SIZE);
2714
2639
  return /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
2715
2640
  /* @__PURE__ */ jsxRuntime.jsx(RunsSidebar, { state, dataset, runs }),
2716
- /* @__PURE__ */ jsxRuntime.jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: visible.map((row, i) => /* @__PURE__ */ jsxRuntime.jsx(React2__default.default.Fragment, { children: row }, i)) }) })
2641
+ /* @__PURE__ */ jsxRuntime.jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: visible.map((row, i) => /* @__PURE__ */ jsxRuntime.jsx(React__default.default.Fragment, { children: row }, i)) }) })
2717
2642
  ] });
2718
2643
  }
2719
2644
  var LEFT_PANE_WIDTH3 = 44;
@@ -2736,19 +2661,11 @@ function NewEvaluationView({
2736
2661
  visibleEvaluators.map((evaluator, index) => {
2737
2662
  const selected = index === state.evaluatorMenuIndex;
2738
2663
  const inSelection = state.selectedEvaluatorIds.includes(evaluator.id);
2739
- return /* @__PURE__ */ jsxRuntime.jsxs(
2740
- ink.Text,
2741
- {
2742
- color: selected ? "cyan" : "gray",
2743
- bold: selected,
2744
- children: [
2745
- selected ? "\u25B8 " : " ",
2746
- inSelection ? "[x] " : "[ ] ",
2747
- evaluator.name
2748
- ]
2749
- },
2750
- evaluator.id
2751
- );
2664
+ return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: selected ? "cyan" : "gray", bold: selected, children: [
2665
+ selected ? "\u25B8 " : " ",
2666
+ inSelection ? "[x] " : "[ ] ",
2667
+ evaluator.name
2668
+ ] }, evaluator.id);
2752
2669
  })
2753
2670
  ] }),
2754
2671
  /* @__PURE__ */ jsxRuntime.jsxs(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: [
@@ -2780,30 +2697,20 @@ function clampCursor(state, filteredDatasetsLength, selectedRunCount) {
2780
2697
  ...state,
2781
2698
  datasetMenuIndex: Math.max(0, Math.min(state.datasetMenuIndex, datasetMax)),
2782
2699
  runMenuIndex: Math.max(0, Math.min(state.runMenuIndex, runMax)),
2783
- evaluatorMenuIndex: Math.max(
2784
- 0,
2785
- Math.min(state.evaluatorMenuIndex, evaluatorMax)
2786
- )
2700
+ evaluatorMenuIndex: Math.max(0, Math.min(state.evaluatorMenuIndex, evaluatorMax))
2787
2701
  };
2788
2702
  }
2789
- function EvalsCliApp({
2790
- data,
2791
- args,
2792
- runner
2793
- }) {
2703
+ function EvalsCliApp({ data, args, runner }) {
2794
2704
  const { exit } = ink.useApp();
2795
2705
  const { width: stdoutWidth, height: stdoutHeight } = fullscreenInk.useScreenSize();
2796
- const [liveData, setLiveData] = React2.useState(data);
2797
- const [runtimeMessage, setRuntimeMessage] = React2.useState();
2798
- const overviewRowCountRef = React2.useRef(0);
2799
- const [state, dispatch] = React2.useReducer(
2800
- reduceCliState,
2801
- createInitialState(data, args)
2802
- );
2803
- React2.useEffect(() => {
2706
+ const [liveData, setLiveData] = React.useState(data);
2707
+ const [runtimeMessage, setRuntimeMessage] = React.useState();
2708
+ const overviewRowCountRef = React.useRef(0);
2709
+ const [state, dispatch] = React.useReducer(reduceCliState, createInitialState(data, args));
2710
+ React.useEffect(() => {
2804
2711
  setLiveData(data);
2805
2712
  }, [data]);
2806
- React2.useEffect(() => {
2713
+ React.useEffect(() => {
2807
2714
  if (!runner) {
2808
2715
  return void 0;
2809
2716
  }
@@ -2822,7 +2729,7 @@ function EvalsCliApp({
2822
2729
  }
2823
2730
  });
2824
2731
  }, [runner]);
2825
- const filteredDatasets = React2.useMemo(
2732
+ const filteredDatasets = React.useMemo(
2826
2733
  () => getFilteredDatasets(liveData, state.searchQuery),
2827
2734
  [liveData, state.searchQuery]
2828
2735
  );
@@ -2831,14 +2738,8 @@ function EvalsCliApp({
2831
2738
  filteredDatasets.length,
2832
2739
  getDatasetByMenuIndex(filteredDatasets, state.datasetMenuIndex)?.runs.length ?? 0
2833
2740
  );
2834
- const selectedDataset = getDatasetByMenuIndex(
2835
- filteredDatasets,
2836
- clampedState.datasetMenuIndex
2837
- );
2838
- const selectedRun = getRunByMenuIndex(
2839
- selectedDataset,
2840
- clampedState.runMenuIndex
2841
- );
2741
+ const selectedDataset = getDatasetByMenuIndex(filteredDatasets, clampedState.datasetMenuIndex);
2742
+ const selectedRun = getRunByMenuIndex(selectedDataset, clampedState.runMenuIndex);
2842
2743
  const visibleEvaluators = liveData.evaluators.filter(
2843
2744
  (evaluator) => evaluator.name.toLowerCase().includes(clampedState.searchQuery.toLowerCase())
2844
2745
  );
@@ -2932,9 +2833,7 @@ function EvalsCliApp({
2932
2833
  `Started ${snapshot.runId} on ${selectedDataset.name} (${snapshot.totalTestCases} cases).`
2933
2834
  );
2934
2835
  }).catch((error) => {
2935
- setRuntimeMessage(
2936
- error instanceof Error ? error.message : "Failed to start evaluation."
2937
- );
2836
+ setRuntimeMessage(error instanceof Error ? error.message : "Failed to start evaluation.");
2938
2837
  });
2939
2838
  }
2940
2839
  });
@@ -2961,14 +2860,7 @@ function EvalsCliApp({
2961
2860
  );
2962
2861
  }
2963
2862
  if (clampedState.level === "runs") {
2964
- return /* @__PURE__ */ jsxRuntime.jsx(
2965
- RunsView,
2966
- {
2967
- state: clampedState,
2968
- dataset: selectedDataset,
2969
- selectedRun
2970
- }
2971
- );
2863
+ return /* @__PURE__ */ jsxRuntime.jsx(RunsView, { state: clampedState, dataset: selectedDataset, selectedRun });
2972
2864
  }
2973
2865
  return /* @__PURE__ */ jsxRuntime.jsx(
2974
2866
  RunDetailsView,
@@ -2980,82 +2872,44 @@ function EvalsCliApp({
2980
2872
  }
2981
2873
  );
2982
2874
  };
2983
- return /* @__PURE__ */ jsxRuntime.jsxs(
2984
- ink.Box,
2985
- {
2986
- flexDirection: "column",
2987
- flexGrow: 1,
2988
- width: stdoutWidth,
2989
- height: stdoutHeight,
2990
- children: [
2991
- /* @__PURE__ */ jsxRuntime.jsx(
2992
- ink.Box,
2993
- {
2994
- borderStyle: "round",
2995
- borderColor: "cyan",
2996
- paddingX: 1,
2997
- width: stdoutWidth,
2998
- children: /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: getBreadcrumbText(
2999
- clampedState,
3000
- selectedDataset?.name,
3001
- selectedRun?.label
3002
- ) })
3003
- }
3004
- ),
3005
- clampedState.startupWarnings.length > 0 && /* @__PURE__ */ jsxRuntime.jsxs(
3006
- ink.Box,
3007
- {
3008
- marginTop: 1,
3009
- borderStyle: "round",
3010
- borderColor: "yellow",
3011
- paddingX: 1,
3012
- flexDirection: "column",
3013
- width: stdoutWidth,
3014
- children: [
3015
- /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "yellow", children: "Startup warnings:" }),
3016
- clampedState.startupWarnings.map((warning, index) => /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: warning }, `${warning}-${index}`))
3017
- ]
3018
- }
3019
- ),
3020
- clampedState.searchMode && /* @__PURE__ */ jsxRuntime.jsxs(
3021
- ink.Box,
3022
- {
3023
- marginTop: 1,
3024
- borderStyle: "round",
3025
- borderColor: "magenta",
3026
- paddingX: 1,
3027
- width: stdoutWidth,
3028
- children: [
3029
- /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "magenta", bold: true, children: "Search: " }),
3030
- /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "white", children: clampedState.searchQuery })
3031
- ]
3032
- }
3033
- ),
3034
- runtimeMessage && /* @__PURE__ */ jsxRuntime.jsx(
3035
- ink.Box,
3036
- {
3037
- marginTop: 1,
3038
- borderStyle: "round",
3039
- borderColor: "blue",
3040
- paddingX: 1,
3041
- width: stdoutWidth,
3042
- children: /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "blue", children: runtimeMessage })
3043
- }
3044
- ),
3045
- /* @__PURE__ */ jsxRuntime.jsx(
3046
- ink.Box,
3047
- {
3048
- marginTop: 1,
3049
- flexGrow: 1,
3050
- width: stdoutWidth,
3051
- flexDirection: "row",
3052
- children: renderContent()
3053
- }
3054
- ),
3055
- /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginTop: 1, paddingX: 1, children: /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: getFooterText(clampedState) }) })
3056
- ]
3057
- }
3058
- );
2875
+ return /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", flexGrow: 1, width: stdoutWidth, height: stdoutHeight, children: [
2876
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { borderStyle: "round", borderColor: "cyan", paddingX: 1, width: stdoutWidth, children: /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: getBreadcrumbText(clampedState, selectedDataset?.name, selectedRun?.label) }) }),
2877
+ clampedState.startupWarnings.length > 0 && /* @__PURE__ */ jsxRuntime.jsxs(
2878
+ ink.Box,
2879
+ {
2880
+ marginTop: 1,
2881
+ borderStyle: "round",
2882
+ borderColor: "yellow",
2883
+ paddingX: 1,
2884
+ flexDirection: "column",
2885
+ width: stdoutWidth,
2886
+ children: [
2887
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "yellow", children: "Startup warnings:" }),
2888
+ clampedState.startupWarnings.map((warning, index) => /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: warning }, `${warning}-${index}`))
2889
+ ]
2890
+ }
2891
+ ),
2892
+ clampedState.searchMode && /* @__PURE__ */ jsxRuntime.jsxs(
2893
+ ink.Box,
2894
+ {
2895
+ marginTop: 1,
2896
+ borderStyle: "round",
2897
+ borderColor: "magenta",
2898
+ paddingX: 1,
2899
+ width: stdoutWidth,
2900
+ children: [
2901
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "magenta", bold: true, children: [
2902
+ "Search:",
2903
+ " "
2904
+ ] }),
2905
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "white", children: clampedState.searchQuery })
2906
+ ]
2907
+ }
2908
+ ),
2909
+ runtimeMessage && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginTop: 1, borderStyle: "round", borderColor: "blue", paddingX: 1, width: stdoutWidth, children: /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "blue", children: runtimeMessage }) }),
2910
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginTop: 1, flexGrow: 1, width: stdoutWidth, flexDirection: "row", children: renderContent() }),
2911
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginTop: 1, paddingX: 1, children: /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: getFooterText(clampedState) }) })
2912
+ ] });
3059
2913
  }
3060
2914
  async function main() {
3061
2915
  const args = parseStartupArgs(process.argv.slice(2));
@@ -3067,9 +2921,7 @@ async function main() {
3067
2921
  process.on("SIGTERM", () => {
3068
2922
  void runner.shutdown().finally(() => process.exit(0));
3069
2923
  });
3070
- fullscreenInk.withFullScreen(
3071
- /* @__PURE__ */ jsxRuntime.jsx(EvalsCliApp, { data, args, runner })
3072
- ).start();
2924
+ fullscreenInk.withFullScreen(/* @__PURE__ */ jsxRuntime.jsx(EvalsCliApp, { data, args, runner })).start();
3073
2925
  }
3074
2926
  void main();
3075
2927
  //# sourceMappingURL=out.js.map