@m4trix/evals 0.25.0 → 0.26.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +29 -7
- package/dist/cli-simple.cjs +911 -643
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +898 -630
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +688 -575
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +679 -566
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +959 -623
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +201 -7
- package/dist/index.js +947 -625
- package/dist/index.js.map +1 -1
- package/package.json +3 -5
package/dist/cli.js
CHANGED
|
@@ -1,18 +1,18 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
import { withFullScreen, useScreenSize } from 'fullscreen-ink';
|
|
3
|
-
import
|
|
3
|
+
import React, { useState, useRef, useReducer, useEffect, useMemo } from 'react';
|
|
4
4
|
import { useApp, useInput, Box, Text } from 'ink';
|
|
5
5
|
import { jsx, jsxs, Fragment } from 'react/jsx-runtime';
|
|
6
|
-
import {
|
|
7
|
-
import { LineGraph } from '@pppp606/ink-chart';
|
|
6
|
+
import { Schema, Effect, PubSub, Queue, Ref, Fiber, Either, ParseResult } from 'effect';
|
|
8
7
|
import { randomUUID } from 'crypto';
|
|
9
|
-
import {
|
|
8
|
+
import { readdir, readFile, mkdir, appendFile } from 'fs/promises';
|
|
9
|
+
import { resolve, join, relative, dirname } from 'path';
|
|
10
10
|
import { existsSync } from 'fs';
|
|
11
11
|
import * as jitiModule from 'jiti';
|
|
12
|
-
import { readdir, readFile, mkdir, appendFile } from 'fs/promises';
|
|
13
12
|
import { pathToFileURL } from 'url';
|
|
14
13
|
import { diffLines } from 'diff';
|
|
15
14
|
import stringify from 'fast-json-stable-stringify';
|
|
15
|
+
import { LineGraph } from '@pppp606/ink-chart';
|
|
16
16
|
|
|
17
17
|
var SEP = " ";
|
|
18
18
|
var ARROW = "\u203A";
|
|
@@ -90,11 +90,7 @@ function getFooterText(state) {
|
|
|
90
90
|
}
|
|
91
91
|
return "\u2191\u2193 move Enter add/remove S start run / search Esc cancel q quit";
|
|
92
92
|
}
|
|
93
|
-
function ListItem({
|
|
94
|
-
selected,
|
|
95
|
-
label,
|
|
96
|
-
itemKey
|
|
97
|
-
}) {
|
|
93
|
+
function ListItem({ selected, label, itemKey }) {
|
|
98
94
|
return /* @__PURE__ */ jsxs(Text, { color: selected ? "cyan" : "gray", bold: selected, children: [
|
|
99
95
|
selected ? "\u25B8 " : " ",
|
|
100
96
|
label
|
|
@@ -121,9 +117,7 @@ function Pane({
|
|
|
121
117
|
}
|
|
122
118
|
);
|
|
123
119
|
}
|
|
124
|
-
function SectionHeader({
|
|
125
|
-
children
|
|
126
|
-
}) {
|
|
120
|
+
function SectionHeader({ children }) {
|
|
127
121
|
return /* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children });
|
|
128
122
|
}
|
|
129
123
|
function StatusText({ status }) {
|
|
@@ -135,10 +129,7 @@ function StatusText({ status }) {
|
|
|
135
129
|
] });
|
|
136
130
|
}
|
|
137
131
|
var LEFT_PANE_WIDTH = 44;
|
|
138
|
-
function RunsSidebar({
|
|
139
|
-
state,
|
|
140
|
-
runs
|
|
141
|
-
}) {
|
|
132
|
+
function RunsSidebar({ state, runs }) {
|
|
142
133
|
const focused = state.focus === "left";
|
|
143
134
|
return /* @__PURE__ */ jsxs(Pane, { width: LEFT_PANE_WIDTH, focused, children: [
|
|
144
135
|
/* @__PURE__ */ jsx(SectionHeader, { children: "Runs" }),
|
|
@@ -167,11 +158,7 @@ function RunsSidebar({
|
|
|
167
158
|
] });
|
|
168
159
|
}
|
|
169
160
|
var BLOCKS = ["\u2581", "\u2582", "\u2583", "\u2584", "\u2585", "\u2586", "\u2587", "\u2588"];
|
|
170
|
-
function Sparkline({
|
|
171
|
-
data,
|
|
172
|
-
width,
|
|
173
|
-
label
|
|
174
|
-
}) {
|
|
161
|
+
function Sparkline({ data, width, label }) {
|
|
175
162
|
if (data.length === 0)
|
|
176
163
|
return null;
|
|
177
164
|
const max = Math.max(...data);
|
|
@@ -250,6 +237,50 @@ function isPrintableCharacter(input) {
|
|
|
250
237
|
function isBackKey(key) {
|
|
251
238
|
return key.backspace || key.delete;
|
|
252
239
|
}
|
|
240
|
+
var ENTITY_ID_PATTERN = /^[a-zA-Z0-9_-]+$/;
|
|
241
|
+
function makeEntityIdSchema(brand, label) {
|
|
242
|
+
return Schema.String.pipe(
|
|
243
|
+
Schema.trimmed(),
|
|
244
|
+
Schema.minLength(1, {
|
|
245
|
+
message: () => `${label} must be non-empty.`
|
|
246
|
+
}),
|
|
247
|
+
Schema.pattern(ENTITY_ID_PATTERN, {
|
|
248
|
+
message: () => `${label} may only contain letters, digits, underscores, and hyphens (no spaces). Examples: "my-nightly", "my_nightly", "myNightly".`
|
|
249
|
+
}),
|
|
250
|
+
Schema.brand(brand)
|
|
251
|
+
);
|
|
252
|
+
}
|
|
253
|
+
var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
|
|
254
|
+
makeEntityIdSchema("EvaluatorName", "Evaluator name");
|
|
255
|
+
makeEntityIdSchema("TestCaseName", "Test case name");
|
|
256
|
+
function validateWithSchema(schema, raw, context) {
|
|
257
|
+
const trimmed = raw.trim();
|
|
258
|
+
const decode = Schema.decodeUnknownEither(
|
|
259
|
+
schema
|
|
260
|
+
);
|
|
261
|
+
const result = decode(trimmed);
|
|
262
|
+
if (Either.isLeft(result)) {
|
|
263
|
+
throw new Error(`${context}: ${ParseResult.TreeFormatter.formatErrorSync(result.left)}`);
|
|
264
|
+
}
|
|
265
|
+
return result.right;
|
|
266
|
+
}
|
|
267
|
+
function validateRunConfigName(raw, context) {
|
|
268
|
+
return validateWithSchema(RunConfigNameSchema, raw, context);
|
|
269
|
+
}
|
|
270
|
+
|
|
271
|
+
// src/evals/evaluator.ts
|
|
272
|
+
function getEvaluatorDisplayLabel(evaluator) {
|
|
273
|
+
if (typeof evaluator.getDisplayLabel === "function") {
|
|
274
|
+
const label = evaluator.getDisplayLabel();
|
|
275
|
+
if (label !== void 0) {
|
|
276
|
+
return label;
|
|
277
|
+
}
|
|
278
|
+
}
|
|
279
|
+
return typeof evaluator.getName === "function" ? evaluator.getName() : void 0;
|
|
280
|
+
}
|
|
281
|
+
function getEvaluatorTagList(evaluator) {
|
|
282
|
+
return typeof evaluator.getTags === "function" ? [...evaluator.getTags()] : [];
|
|
283
|
+
}
|
|
253
284
|
|
|
254
285
|
// src/cli/data.mock.json
|
|
255
286
|
var data_mock_default = {
|
|
@@ -401,9 +432,7 @@ var data_mock_default = {
|
|
|
401
432
|
{ name: "contract_match", score: 100 },
|
|
402
433
|
{ name: "arg_validity", score: 100 }
|
|
403
434
|
],
|
|
404
|
-
checks: [
|
|
405
|
-
{ name: "tool_calls", passed: true, detail: "0 unexpected" }
|
|
406
|
-
],
|
|
435
|
+
checks: [{ name: "tool_calls", passed: true, detail: "0 unexpected" }],
|
|
407
436
|
failures: [],
|
|
408
437
|
meta: {
|
|
409
438
|
model: "gpt-4o-mini",
|
|
@@ -426,9 +455,21 @@ var data_mock_default = {
|
|
|
426
455
|
}
|
|
427
456
|
],
|
|
428
457
|
evaluators: [
|
|
429
|
-
{
|
|
430
|
-
|
|
431
|
-
|
|
458
|
+
{
|
|
459
|
+
id: "json-schema-validator",
|
|
460
|
+
name: "JSON Schema Validator",
|
|
461
|
+
configPreview: "strict=true"
|
|
462
|
+
},
|
|
463
|
+
{
|
|
464
|
+
id: "tool-call-contract-checker",
|
|
465
|
+
name: "Tool-call Contract Checker",
|
|
466
|
+
configPreview: "unexpectedCalls=error"
|
|
467
|
+
},
|
|
468
|
+
{
|
|
469
|
+
id: "rubric-judge",
|
|
470
|
+
name: "Rubric Judge (LLM)",
|
|
471
|
+
configPreview: "model=gpt-4o-mini; scale=0-100"
|
|
472
|
+
},
|
|
432
473
|
{ id: "pii-leak-detector", name: "PII Leak Detector", configPreview: "redact=false" }
|
|
433
474
|
]
|
|
434
475
|
};
|
|
@@ -495,7 +536,7 @@ function toEvalDataset(item, snapshots) {
|
|
|
495
536
|
function toEvaluatorOption(item) {
|
|
496
537
|
return {
|
|
497
538
|
id: item.id,
|
|
498
|
-
name: item.evaluator
|
|
539
|
+
name: getEvaluatorDisplayLabel(item.evaluator) ?? toSlug(item.id),
|
|
499
540
|
configPreview: `Source: ${item.filePath}`
|
|
500
541
|
};
|
|
501
542
|
}
|
|
@@ -508,9 +549,7 @@ async function loadRunnerData(runner) {
|
|
|
508
549
|
const memSnapshots = runner.getAllRunSnapshots();
|
|
509
550
|
const seen = new Set(memSnapshots.map((s) => s.runId));
|
|
510
551
|
const fromDisk = diskSnapshots.filter((s) => !seen.has(s.runId));
|
|
511
|
-
const snapshots = [...memSnapshots, ...fromDisk].sort(
|
|
512
|
-
(a, b) => b.queuedAt - a.queuedAt
|
|
513
|
-
);
|
|
552
|
+
const snapshots = [...memSnapshots, ...fromDisk].sort((a, b) => b.queuedAt - a.queuedAt);
|
|
514
553
|
if (datasets.length === 0 && evaluators.length === 0) {
|
|
515
554
|
return loadMockData();
|
|
516
555
|
}
|
|
@@ -632,7 +671,11 @@ function reduceCliState(state, action) {
|
|
|
632
671
|
return { ...state, overviewScrollOffset: Math.max(0, state.overviewScrollOffset - 1) };
|
|
633
672
|
}
|
|
634
673
|
if (state.level === "datasets") {
|
|
635
|
-
return {
|
|
674
|
+
return {
|
|
675
|
+
...state,
|
|
676
|
+
datasetMenuIndex: Math.max(0, state.datasetMenuIndex - 1),
|
|
677
|
+
overviewScrollOffset: 0
|
|
678
|
+
};
|
|
636
679
|
}
|
|
637
680
|
if (state.level === "runs") {
|
|
638
681
|
return { ...state, runMenuIndex: Math.max(0, state.runMenuIndex - 1) };
|
|
@@ -650,10 +693,17 @@ function reduceCliState(state, action) {
|
|
|
650
693
|
return { ...state, detailsScrollOffset: Math.min(action.max, state.detailsScrollOffset + 1) };
|
|
651
694
|
}
|
|
652
695
|
if (state.level === "datasets" && state.focus === "right") {
|
|
653
|
-
return {
|
|
696
|
+
return {
|
|
697
|
+
...state,
|
|
698
|
+
overviewScrollOffset: Math.min(action.max, state.overviewScrollOffset + 1)
|
|
699
|
+
};
|
|
654
700
|
}
|
|
655
701
|
if (state.level === "datasets") {
|
|
656
|
-
return {
|
|
702
|
+
return {
|
|
703
|
+
...state,
|
|
704
|
+
datasetMenuIndex: Math.min(action.max, state.datasetMenuIndex + 1),
|
|
705
|
+
overviewScrollOffset: 0
|
|
706
|
+
};
|
|
657
707
|
}
|
|
658
708
|
if (state.level === "runs") {
|
|
659
709
|
return { ...state, runMenuIndex: Math.min(action.max, state.runMenuIndex + 1) };
|
|
@@ -729,24 +779,168 @@ function reduceCliState(state, action) {
|
|
|
729
779
|
}
|
|
730
780
|
return state;
|
|
731
781
|
}
|
|
782
|
+
async function loadRunSnapshotsFromArtifacts(config) {
|
|
783
|
+
const baseDir = resolve(config.artifactDirectory);
|
|
784
|
+
let entries;
|
|
785
|
+
try {
|
|
786
|
+
entries = await readdir(baseDir);
|
|
787
|
+
} catch {
|
|
788
|
+
return [];
|
|
789
|
+
}
|
|
790
|
+
const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
|
|
791
|
+
const snapshots = [];
|
|
792
|
+
for (const fileName of jsonlFiles) {
|
|
793
|
+
const filePath = join(baseDir, fileName);
|
|
794
|
+
try {
|
|
795
|
+
const snapshot = await parseArtifactToSnapshot(filePath, config);
|
|
796
|
+
if (snapshot) {
|
|
797
|
+
snapshots.push(snapshot);
|
|
798
|
+
}
|
|
799
|
+
} catch {
|
|
800
|
+
}
|
|
801
|
+
}
|
|
802
|
+
return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
|
|
803
|
+
}
|
|
804
|
+
async function parseArtifactToSnapshot(filePath, _config) {
|
|
805
|
+
const content = await readFile(filePath, "utf8");
|
|
806
|
+
const lines = content.split("\n").filter((line) => line.trim().length > 0);
|
|
807
|
+
if (lines.length === 0) {
|
|
808
|
+
return null;
|
|
809
|
+
}
|
|
810
|
+
let runQueued = null;
|
|
811
|
+
let runCompleted = null;
|
|
812
|
+
let runFailed = null;
|
|
813
|
+
let runStarted = null;
|
|
814
|
+
for (const line of lines) {
|
|
815
|
+
try {
|
|
816
|
+
const event = JSON.parse(line);
|
|
817
|
+
const type = event.type;
|
|
818
|
+
if (type === "RunQueued") {
|
|
819
|
+
runQueued = {
|
|
820
|
+
runId: event.runId,
|
|
821
|
+
datasetId: event.datasetId,
|
|
822
|
+
datasetName: event.datasetName,
|
|
823
|
+
evaluatorIds: event.evaluatorIds,
|
|
824
|
+
totalTestCases: event.totalTestCases ?? 0,
|
|
825
|
+
artifactPath: event.artifactPath ?? filePath,
|
|
826
|
+
ts: event.ts
|
|
827
|
+
};
|
|
828
|
+
}
|
|
829
|
+
if (type === "RunStarted") {
|
|
830
|
+
runStarted = { startedAt: event.startedAt };
|
|
831
|
+
}
|
|
832
|
+
if (type === "RunCompleted") {
|
|
833
|
+
runCompleted = {
|
|
834
|
+
passedTestCases: event.passedTestCases,
|
|
835
|
+
failedTestCases: event.failedTestCases,
|
|
836
|
+
totalTestCases: event.totalTestCases,
|
|
837
|
+
finishedAt: event.finishedAt
|
|
838
|
+
};
|
|
839
|
+
}
|
|
840
|
+
if (type === "RunFailed") {
|
|
841
|
+
runFailed = {
|
|
842
|
+
finishedAt: event.finishedAt,
|
|
843
|
+
errorMessage: event.errorMessage
|
|
844
|
+
};
|
|
845
|
+
}
|
|
846
|
+
} catch {
|
|
847
|
+
}
|
|
848
|
+
}
|
|
849
|
+
if (!runQueued) {
|
|
850
|
+
return null;
|
|
851
|
+
}
|
|
852
|
+
const artifactPath = filePath;
|
|
853
|
+
const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
|
|
854
|
+
const progress = aggregateTestCaseProgress(lines);
|
|
855
|
+
const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
|
|
856
|
+
const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
|
|
857
|
+
const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
|
|
858
|
+
return {
|
|
859
|
+
runId: runQueued.runId,
|
|
860
|
+
datasetId: runQueued.datasetId,
|
|
861
|
+
datasetName: runQueued.datasetName,
|
|
862
|
+
evaluatorIds: runQueued.evaluatorIds,
|
|
863
|
+
queuedAt: runQueued.ts ?? 0,
|
|
864
|
+
startedAt: runStarted?.startedAt,
|
|
865
|
+
finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
|
|
866
|
+
totalTestCases: runQueued.totalTestCases,
|
|
867
|
+
completedTestCases,
|
|
868
|
+
passedTestCases,
|
|
869
|
+
failedTestCases,
|
|
870
|
+
status,
|
|
871
|
+
artifactPath,
|
|
872
|
+
errorMessage: runFailed?.errorMessage
|
|
873
|
+
};
|
|
874
|
+
}
|
|
875
|
+
function aggregateTestCaseProgress(lines) {
|
|
876
|
+
let completedTestCases = 0;
|
|
877
|
+
const testCasePassedBy = /* @__PURE__ */ new Map();
|
|
878
|
+
for (const line of lines) {
|
|
879
|
+
try {
|
|
880
|
+
const event = JSON.parse(line);
|
|
881
|
+
if (event.type === "TestCaseProgress") {
|
|
882
|
+
const ev = event;
|
|
883
|
+
completedTestCases = ev.completedTestCases ?? completedTestCases;
|
|
884
|
+
const id = ev.testCaseId;
|
|
885
|
+
const current = testCasePassedBy.get(id);
|
|
886
|
+
testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
|
|
887
|
+
}
|
|
888
|
+
} catch {
|
|
889
|
+
}
|
|
890
|
+
}
|
|
891
|
+
let passedTestCases = 0;
|
|
892
|
+
let failedTestCases = 0;
|
|
893
|
+
for (const passed of testCasePassedBy.values()) {
|
|
894
|
+
if (passed) {
|
|
895
|
+
passedTestCases += 1;
|
|
896
|
+
} else {
|
|
897
|
+
failedTestCases += 1;
|
|
898
|
+
}
|
|
899
|
+
}
|
|
900
|
+
return { completedTestCases, passedTestCases, failedTestCases };
|
|
901
|
+
}
|
|
902
|
+
async function parseArtifactFile(artifactPath) {
|
|
903
|
+
try {
|
|
904
|
+
const content = await readFile(artifactPath, "utf8");
|
|
905
|
+
const lines = content.split("\n").filter((line) => line.trim().length > 0);
|
|
906
|
+
const results = [];
|
|
907
|
+
for (const line of lines) {
|
|
908
|
+
try {
|
|
909
|
+
const event = JSON.parse(line);
|
|
910
|
+
if (event.type === "TestCaseProgress") {
|
|
911
|
+
const ev = event;
|
|
912
|
+
const repetitionIndex = ev.repetitionIndex ?? ev.rerunIndex;
|
|
913
|
+
const repetitionCount = ev.repetitionCount ?? ev.rerunTotal;
|
|
914
|
+
results.push({
|
|
915
|
+
testCaseId: ev.testCaseId,
|
|
916
|
+
testCaseName: ev.testCaseName,
|
|
917
|
+
completedTestCases: ev.completedTestCases,
|
|
918
|
+
totalTestCases: ev.totalTestCases,
|
|
919
|
+
repetitionId: ev.repetitionId,
|
|
920
|
+
repetitionIndex,
|
|
921
|
+
repetitionCount,
|
|
922
|
+
passed: ev.passed,
|
|
923
|
+
durationMs: ev.durationMs,
|
|
924
|
+
evaluatorScores: ev.evaluatorScores ?? []
|
|
925
|
+
});
|
|
926
|
+
}
|
|
927
|
+
} catch {
|
|
928
|
+
}
|
|
929
|
+
}
|
|
930
|
+
return results;
|
|
931
|
+
} catch {
|
|
932
|
+
return [];
|
|
933
|
+
}
|
|
934
|
+
}
|
|
732
935
|
|
|
733
936
|
// src/runner/config.ts
|
|
734
937
|
var defaultRunnerConfig = {
|
|
735
938
|
discovery: {
|
|
736
939
|
rootDir: process.cwd(),
|
|
737
940
|
datasetSuffixes: [".dataset.ts", ".dataset.tsx", ".dataset.js", ".dataset.mjs"],
|
|
738
|
-
evaluatorSuffixes: [
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
".evaluator.js",
|
|
742
|
-
".evaluator.mjs"
|
|
743
|
-
],
|
|
744
|
-
testCaseSuffixes: [
|
|
745
|
-
".test-case.ts",
|
|
746
|
-
".test-case.tsx",
|
|
747
|
-
".test-case.js",
|
|
748
|
-
".test-case.mjs"
|
|
749
|
-
],
|
|
941
|
+
evaluatorSuffixes: [".evaluator.ts", ".evaluator.tsx", ".evaluator.js", ".evaluator.mjs"],
|
|
942
|
+
runConfigSuffixes: [".run-config.ts", ".run-config.tsx", ".run-config.js", ".run-config.mjs"],
|
|
943
|
+
testCaseSuffixes: [".test-case.ts", ".test-case.tsx", ".test-case.js", ".test-case.mjs"],
|
|
750
944
|
excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
|
|
751
945
|
},
|
|
752
946
|
artifactDirectory: ".eval-results",
|
|
@@ -771,6 +965,11 @@ function toRunnerConfigOverrides(config) {
|
|
|
771
965
|
} else if (rawDiscovery?.evaluatorSuffixes !== void 0) {
|
|
772
966
|
discovery.evaluatorSuffixes = rawDiscovery.evaluatorSuffixes;
|
|
773
967
|
}
|
|
968
|
+
if (rawDiscovery?.runConfigFilePatterns !== void 0) {
|
|
969
|
+
discovery.runConfigSuffixes = rawDiscovery.runConfigFilePatterns;
|
|
970
|
+
} else if (rawDiscovery?.runConfigSuffixes !== void 0) {
|
|
971
|
+
discovery.runConfigSuffixes = rawDiscovery.runConfigSuffixes;
|
|
972
|
+
}
|
|
774
973
|
if (rawDiscovery?.testCaseFilePatterns !== void 0) {
|
|
775
974
|
discovery.testCaseSuffixes = rawDiscovery.testCaseFilePatterns;
|
|
776
975
|
} else if (rawDiscovery?.testCaseSuffixes !== void 0) {
|
|
@@ -813,14 +1012,15 @@ function getJitiLoader() {
|
|
|
813
1012
|
}
|
|
814
1013
|
const createJiti2 = jitiModule.createJiti ?? jitiModule.default;
|
|
815
1014
|
if (typeof createJiti2 !== "function") {
|
|
816
|
-
throw new Error(
|
|
817
|
-
"Failed to initialize jiti for m4trix eval config loading."
|
|
818
|
-
);
|
|
1015
|
+
throw new Error("Failed to initialize jiti for m4trix eval config loading.");
|
|
819
1016
|
}
|
|
820
|
-
cachedLoader = createJiti2(
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
|
|
1017
|
+
cachedLoader = createJiti2(
|
|
1018
|
+
import.meta.url,
|
|
1019
|
+
{
|
|
1020
|
+
interopDefault: true,
|
|
1021
|
+
moduleCache: true
|
|
1022
|
+
}
|
|
1023
|
+
);
|
|
824
1024
|
return cachedLoader;
|
|
825
1025
|
}
|
|
826
1026
|
function resolveConfigModuleExport(loadedModule) {
|
|
@@ -868,6 +1068,9 @@ function isDatasetLike(value) {
|
|
|
868
1068
|
function isEvaluatorLike(value) {
|
|
869
1069
|
return hasMethod(value, "getName") && hasMethod(value, "resolveContext") && hasMethod(value, "getEvaluateFn");
|
|
870
1070
|
}
|
|
1071
|
+
function isRunConfigLike(value) {
|
|
1072
|
+
return hasMethod(value, "getName") && hasMethod(value, "getRuns") && typeof value.getRuns === "function";
|
|
1073
|
+
}
|
|
871
1074
|
function isTestCaseLike(value) {
|
|
872
1075
|
return hasMethod(value, "getName") && hasMethod(value, "getTags") && hasMethod(value, "getInput");
|
|
873
1076
|
}
|
|
@@ -924,9 +1127,7 @@ async function loadModuleExports(filePath) {
|
|
|
924
1127
|
}
|
|
925
1128
|
async function collectDatasetsFromFiles(config) {
|
|
926
1129
|
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
927
|
-
const matched = files.filter(
|
|
928
|
-
(filePath) => hasOneSuffix(filePath, config.datasetSuffixes)
|
|
929
|
-
);
|
|
1130
|
+
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.datasetSuffixes));
|
|
930
1131
|
const found = await Promise.all(
|
|
931
1132
|
matched.map(async (absolutePath) => {
|
|
932
1133
|
const exports = await loadModuleExports(absolutePath);
|
|
@@ -943,9 +1144,7 @@ async function collectDatasetsFromFiles(config) {
|
|
|
943
1144
|
}
|
|
944
1145
|
async function collectEvaluatorsFromFiles(config) {
|
|
945
1146
|
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
946
|
-
const matched = files.filter(
|
|
947
|
-
(filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes)
|
|
948
|
-
);
|
|
1147
|
+
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes));
|
|
949
1148
|
const found = await Promise.all(
|
|
950
1149
|
matched.map(async (absolutePath) => {
|
|
951
1150
|
const exports = await loadModuleExports(absolutePath);
|
|
@@ -960,11 +1159,26 @@ async function collectEvaluatorsFromFiles(config) {
|
|
|
960
1159
|
);
|
|
961
1160
|
return found.flat();
|
|
962
1161
|
}
|
|
963
|
-
async function
|
|
1162
|
+
async function collectRunConfigsFromFiles(config) {
|
|
964
1163
|
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
965
|
-
const matched = files.filter(
|
|
966
|
-
|
|
1164
|
+
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.runConfigSuffixes));
|
|
1165
|
+
const found = await Promise.all(
|
|
1166
|
+
matched.map(async (absolutePath) => {
|
|
1167
|
+
const exports = await loadModuleExports(absolutePath);
|
|
1168
|
+
const runConfigs = exports.filter(isRunConfigLike);
|
|
1169
|
+
const relPath = relative(config.rootDir, absolutePath);
|
|
1170
|
+
return runConfigs.map((runConfig) => ({
|
|
1171
|
+
id: runConfig.getName(),
|
|
1172
|
+
filePath: relPath,
|
|
1173
|
+
runConfig
|
|
1174
|
+
}));
|
|
1175
|
+
})
|
|
967
1176
|
);
|
|
1177
|
+
return found.flat();
|
|
1178
|
+
}
|
|
1179
|
+
async function collectTestCasesFromFiles(config) {
|
|
1180
|
+
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
1181
|
+
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.testCaseSuffixes));
|
|
968
1182
|
const found = await Promise.all(
|
|
969
1183
|
matched.map(async (absolutePath) => {
|
|
970
1184
|
const exports = await loadModuleExports(absolutePath);
|
|
@@ -1036,16 +1250,8 @@ function createDiffString(expected, actual, diffOptions) {
|
|
|
1036
1250
|
const expectedProcessed = preprocessForDiff(expected, diffOptions);
|
|
1037
1251
|
const actualProcessed = preprocessForDiff(actual, diffOptions);
|
|
1038
1252
|
if (diffOptions?.keysOnly) {
|
|
1039
|
-
const expectedKeys = JSON.stringify(
|
|
1040
|
-
|
|
1041
|
-
null,
|
|
1042
|
-
2
|
|
1043
|
-
);
|
|
1044
|
-
const actualKeys = JSON.stringify(
|
|
1045
|
-
extractKeys(actualProcessed),
|
|
1046
|
-
null,
|
|
1047
|
-
2
|
|
1048
|
-
);
|
|
1253
|
+
const expectedKeys = JSON.stringify(extractKeys(expectedProcessed), null, 2);
|
|
1254
|
+
const actualKeys = JSON.stringify(extractKeys(actualProcessed), null, 2);
|
|
1049
1255
|
const parts2 = diffLines(expectedKeys, actualKeys);
|
|
1050
1256
|
return formatDiffParts(parts2);
|
|
1051
1257
|
}
|
|
@@ -1056,9 +1262,7 @@ function createDiffString(expected, actual, diffOptions) {
|
|
|
1056
1262
|
}
|
|
1057
1263
|
const parts = diffLines(expectedStr, actualStr);
|
|
1058
1264
|
if (diffOptions?.outputNewOnly) {
|
|
1059
|
-
const filtered = parts.filter(
|
|
1060
|
-
(p) => p.added === true
|
|
1061
|
-
);
|
|
1265
|
+
const filtered = parts.filter((p) => p.added === true);
|
|
1062
1266
|
return formatDiffParts(filtered);
|
|
1063
1267
|
}
|
|
1064
1268
|
return formatDiffParts(parts);
|
|
@@ -1125,6 +1329,17 @@ function getDiffLines(entry) {
|
|
|
1125
1329
|
});
|
|
1126
1330
|
}
|
|
1127
1331
|
|
|
1332
|
+
// src/evals/test-case.ts
|
|
1333
|
+
function getTestCaseDisplayLabel(testCase) {
|
|
1334
|
+
if (typeof testCase.getDisplayLabel === "function") {
|
|
1335
|
+
return testCase.getDisplayLabel();
|
|
1336
|
+
}
|
|
1337
|
+
return typeof testCase.getName === "function" ? testCase.getName() : "";
|
|
1338
|
+
}
|
|
1339
|
+
function getTestCaseTagList(testCase) {
|
|
1340
|
+
return typeof testCase.getTags === "function" ? [...testCase.getTags()] : [];
|
|
1341
|
+
}
|
|
1342
|
+
|
|
1128
1343
|
// src/evals/metric.ts
|
|
1129
1344
|
var registry = /* @__PURE__ */ new Map();
|
|
1130
1345
|
var Metric = {
|
|
@@ -1148,29 +1363,74 @@ function getMetricById(id) {
|
|
|
1148
1363
|
return registry.get(id);
|
|
1149
1364
|
}
|
|
1150
1365
|
|
|
1151
|
-
// src/evals/
|
|
1152
|
-
|
|
1153
|
-
|
|
1154
|
-
|
|
1366
|
+
// src/evals/aggregators.ts
|
|
1367
|
+
function aggregateTokenCountSum(values) {
|
|
1368
|
+
const initial = {
|
|
1369
|
+
input: 0,
|
|
1370
|
+
output: 0,
|
|
1371
|
+
inputCached: 0,
|
|
1372
|
+
outputCached: 0
|
|
1373
|
+
};
|
|
1374
|
+
return values.reduce(
|
|
1375
|
+
(acc, v) => ({
|
|
1376
|
+
input: acc.input + (v.input ?? 0),
|
|
1377
|
+
output: acc.output + (v.output ?? 0),
|
|
1378
|
+
inputCached: acc.inputCached + (v.inputCached ?? 0),
|
|
1379
|
+
outputCached: acc.outputCached + (v.outputCached ?? 0)
|
|
1380
|
+
}),
|
|
1381
|
+
initial
|
|
1382
|
+
);
|
|
1155
1383
|
}
|
|
1156
|
-
|
|
1157
|
-
|
|
1158
|
-
|
|
1159
|
-
|
|
1160
|
-
|
|
1161
|
-
|
|
1162
|
-
|
|
1163
|
-
|
|
1164
|
-
|
|
1165
|
-
|
|
1166
|
-
|
|
1167
|
-
|
|
1168
|
-
|
|
1169
|
-
|
|
1170
|
-
|
|
1171
|
-
|
|
1172
|
-
|
|
1173
|
-
|
|
1384
|
+
function aggregateLatencyAverage(values) {
|
|
1385
|
+
if (values.length === 0) {
|
|
1386
|
+
return { ms: 0 };
|
|
1387
|
+
}
|
|
1388
|
+
const sum = values.reduce((s, v) => s + v.ms, 0);
|
|
1389
|
+
return { ms: sum / values.length };
|
|
1390
|
+
}
|
|
1391
|
+
|
|
1392
|
+
// src/evals/metrics/standard.ts
|
|
1393
|
+
Metric.of({
|
|
1394
|
+
id: "token-count",
|
|
1395
|
+
name: "Tokens",
|
|
1396
|
+
aggregate: aggregateTokenCountSum,
|
|
1397
|
+
format: (data, options) => {
|
|
1398
|
+
const input = data.input ?? 0;
|
|
1399
|
+
const output = data.output ?? 0;
|
|
1400
|
+
const inputCached = data.inputCached ?? 0;
|
|
1401
|
+
const outputCached = data.outputCached ?? 0;
|
|
1402
|
+
const cached = inputCached + outputCached;
|
|
1403
|
+
const base = `in:${input} out:${output} cached:${cached}`;
|
|
1404
|
+
return options?.isAggregated ? `Total: ${base}` : base;
|
|
1405
|
+
}
|
|
1406
|
+
});
|
|
1407
|
+
Metric.of({
|
|
1408
|
+
id: "latency",
|
|
1409
|
+
name: "Latency",
|
|
1410
|
+
aggregate: aggregateLatencyAverage,
|
|
1411
|
+
format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
|
|
1412
|
+
});
|
|
1413
|
+
|
|
1414
|
+
// src/evals/score.ts
|
|
1415
|
+
var registry2 = /* @__PURE__ */ new Map();
|
|
1416
|
+
function formatScoreData(def, data, options) {
|
|
1417
|
+
return options?.isAggregated ? def.formatAggregate(data) : def.formatValue(data);
|
|
1418
|
+
}
|
|
1419
|
+
var ScoreAggregate = {
|
|
1420
|
+
/** Average numeric fields. Use for scores like { value, delta }. */
|
|
1421
|
+
averageFields(fields) {
|
|
1422
|
+
return (values) => {
|
|
1423
|
+
const count = values.length || 1;
|
|
1424
|
+
const result = {};
|
|
1425
|
+
for (const field of fields) {
|
|
1426
|
+
result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
|
|
1427
|
+
}
|
|
1428
|
+
return result;
|
|
1429
|
+
};
|
|
1430
|
+
},
|
|
1431
|
+
/** Average selected numeric fields, with sample std dev tracked for `value`. */
|
|
1432
|
+
averageWithVariance(fields) {
|
|
1433
|
+
return (values) => {
|
|
1174
1434
|
const count = values.length;
|
|
1175
1435
|
const result = {};
|
|
1176
1436
|
for (const field of fields) {
|
|
@@ -1197,13 +1457,10 @@ var ScoreAggregate = {
|
|
|
1197
1457
|
(s, v) => s + (v[valueField] ?? 0),
|
|
1198
1458
|
0
|
|
1199
1459
|
);
|
|
1200
|
-
const sumSq = values.reduce(
|
|
1201
|
-
|
|
1202
|
-
|
|
1203
|
-
|
|
1204
|
-
},
|
|
1205
|
-
0
|
|
1206
|
-
);
|
|
1460
|
+
const sumSq = values.reduce((s, v) => {
|
|
1461
|
+
const value = v[valueField] ?? 0;
|
|
1462
|
+
return s + value * value;
|
|
1463
|
+
}, 0);
|
|
1207
1464
|
const mean = sum / count;
|
|
1208
1465
|
const variance = (sumSq - count * mean * mean) / (count - 1);
|
|
1209
1466
|
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
@@ -1262,54 +1519,6 @@ function getScoreById(id) {
|
|
|
1262
1519
|
return registry2.get(id);
|
|
1263
1520
|
}
|
|
1264
1521
|
|
|
1265
|
-
// src/evals/aggregators.ts
|
|
1266
|
-
function aggregateTokenCountSum(values) {
|
|
1267
|
-
const initial = {
|
|
1268
|
-
input: 0,
|
|
1269
|
-
output: 0,
|
|
1270
|
-
inputCached: 0,
|
|
1271
|
-
outputCached: 0
|
|
1272
|
-
};
|
|
1273
|
-
return values.reduce(
|
|
1274
|
-
(acc, v) => ({
|
|
1275
|
-
input: acc.input + (v.input ?? 0),
|
|
1276
|
-
output: acc.output + (v.output ?? 0),
|
|
1277
|
-
inputCached: acc.inputCached + (v.inputCached ?? 0),
|
|
1278
|
-
outputCached: acc.outputCached + (v.outputCached ?? 0)
|
|
1279
|
-
}),
|
|
1280
|
-
initial
|
|
1281
|
-
);
|
|
1282
|
-
}
|
|
1283
|
-
function aggregateLatencyAverage(values) {
|
|
1284
|
-
if (values.length === 0) {
|
|
1285
|
-
return { ms: 0 };
|
|
1286
|
-
}
|
|
1287
|
-
const sum = values.reduce((s, v) => s + v.ms, 0);
|
|
1288
|
-
return { ms: sum / values.length };
|
|
1289
|
-
}
|
|
1290
|
-
|
|
1291
|
-
// src/evals/metrics/standard.ts
|
|
1292
|
-
Metric.of({
|
|
1293
|
-
id: "token-count",
|
|
1294
|
-
name: "Tokens",
|
|
1295
|
-
aggregate: aggregateTokenCountSum,
|
|
1296
|
-
format: (data, options) => {
|
|
1297
|
-
const input = data.input ?? 0;
|
|
1298
|
-
const output = data.output ?? 0;
|
|
1299
|
-
const inputCached = data.inputCached ?? 0;
|
|
1300
|
-
const outputCached = data.outputCached ?? 0;
|
|
1301
|
-
const cached = inputCached + outputCached;
|
|
1302
|
-
const base = `in:${input} out:${output} cached:${cached}`;
|
|
1303
|
-
return options?.isAggregated ? `Total: ${base}` : base;
|
|
1304
|
-
}
|
|
1305
|
-
});
|
|
1306
|
-
Metric.of({
|
|
1307
|
-
id: "latency",
|
|
1308
|
-
name: "Latency",
|
|
1309
|
-
aggregate: aggregateLatencyAverage,
|
|
1310
|
-
format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
|
|
1311
|
-
});
|
|
1312
|
-
|
|
1313
1522
|
// src/evals/scores/standard.ts
|
|
1314
1523
|
Score.of({
|
|
1315
1524
|
id: "percent",
|
|
@@ -1416,15 +1625,17 @@ function readOutput(testCase) {
|
|
|
1416
1625
|
}
|
|
1417
1626
|
return candidate.getOutput();
|
|
1418
1627
|
}
|
|
1419
|
-
function buildEvaluationUnits(testCases) {
|
|
1628
|
+
function buildEvaluationUnits(testCases, repetitionCount) {
|
|
1629
|
+
const count = Math.max(1, repetitionCount);
|
|
1420
1630
|
const units = [];
|
|
1421
1631
|
for (const testCaseItem of testCases) {
|
|
1422
|
-
const
|
|
1423
|
-
for (let r = 0; r <
|
|
1632
|
+
const repetitionId = `rep-${randomUUID()}`;
|
|
1633
|
+
for (let r = 0; r < count; r++) {
|
|
1424
1634
|
units.push({
|
|
1425
1635
|
testCaseItem,
|
|
1426
|
-
|
|
1427
|
-
|
|
1636
|
+
repetitionId,
|
|
1637
|
+
repetitionIndex: r + 1,
|
|
1638
|
+
repetitionCount: count
|
|
1428
1639
|
});
|
|
1429
1640
|
}
|
|
1430
1641
|
}
|
|
@@ -1434,29 +1645,24 @@ function nowIsoForFile() {
|
|
|
1434
1645
|
return (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
1435
1646
|
}
|
|
1436
1647
|
function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
1437
|
-
return join(
|
|
1438
|
-
artifactDirectory,
|
|
1439
|
-
`${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
|
|
1440
|
-
);
|
|
1648
|
+
return join(artifactDirectory, `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`);
|
|
1441
1649
|
}
|
|
1442
1650
|
function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
|
|
1443
|
-
const { testCaseItem,
|
|
1651
|
+
const { testCaseItem, repetitionId, repetitionIndex, repetitionCount } = unit;
|
|
1444
1652
|
return Effect.gen(function* () {
|
|
1445
1653
|
const evaluatorRunId = `run-${randomUUID()}`;
|
|
1446
1654
|
const started = Date.now();
|
|
1447
|
-
const startedEvaluations = yield* Ref.modify(startedRef, (n) => [
|
|
1448
|
-
n + 1,
|
|
1449
|
-
n + 1
|
|
1450
|
-
]);
|
|
1655
|
+
const startedEvaluations = yield* Ref.modify(startedRef, (n) => [n + 1, n + 1]);
|
|
1451
1656
|
yield* publishEvent({
|
|
1452
1657
|
type: "TestCaseStarted",
|
|
1453
1658
|
runId: task.runId,
|
|
1454
1659
|
testCaseId: testCaseItem.id,
|
|
1455
|
-
testCaseName: testCaseItem.testCase
|
|
1660
|
+
testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
|
|
1456
1661
|
startedTestCases: startedEvaluations,
|
|
1457
1662
|
totalTestCases: totalEvaluations,
|
|
1458
|
-
|
|
1459
|
-
|
|
1663
|
+
repetitionId,
|
|
1664
|
+
repetitionIndex,
|
|
1665
|
+
repetitionCount
|
|
1460
1666
|
});
|
|
1461
1667
|
const evaluatorScores = [];
|
|
1462
1668
|
let testCaseError;
|
|
@@ -1480,9 +1686,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1480
1686
|
return error;
|
|
1481
1687
|
};
|
|
1482
1688
|
try {
|
|
1483
|
-
const ctx = yield* Effect.promise(
|
|
1484
|
-
() => Promise.resolve(evaluator.resolveContext())
|
|
1485
|
-
);
|
|
1689
|
+
const ctx = yield* Effect.promise(() => Promise.resolve(evaluator.resolveContext()));
|
|
1486
1690
|
const result = yield* Effect.promise(
|
|
1487
1691
|
() => Promise.resolve().then(
|
|
1488
1692
|
() => evaluateFn({
|
|
@@ -1492,8 +1696,15 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1492
1696
|
meta: {
|
|
1493
1697
|
triggerId: task.triggerId,
|
|
1494
1698
|
runId: evaluatorRunId,
|
|
1495
|
-
datasetId: task.datasetId
|
|
1699
|
+
datasetId: task.datasetId,
|
|
1700
|
+
repetitionId,
|
|
1701
|
+
repetitionIndex,
|
|
1702
|
+
repetitionCount,
|
|
1703
|
+
runConfigName: task.runConfigName
|
|
1496
1704
|
},
|
|
1705
|
+
testCaseTags: getTestCaseTagList(testCaseItem.testCase),
|
|
1706
|
+
runConfigTags: task.runConfigTags,
|
|
1707
|
+
evaluatorTags: getEvaluatorTagList(evaluator),
|
|
1497
1708
|
logDiff,
|
|
1498
1709
|
log,
|
|
1499
1710
|
createError
|
|
@@ -1536,21 +1747,19 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1536
1747
|
});
|
|
1537
1748
|
}
|
|
1538
1749
|
}
|
|
1539
|
-
const
|
|
1540
|
-
const completedEvaluations = yield* Ref.modify(completedRef, (n) => [
|
|
1541
|
-
n + 1,
|
|
1542
|
-
n + 1
|
|
1543
|
-
]);
|
|
1750
|
+
const repetitionPassedThis = evaluatorScores.every((s) => s.passed);
|
|
1751
|
+
const completedEvaluations = yield* Ref.modify(completedRef, (n) => [n + 1, n + 1]);
|
|
1544
1752
|
const progressEvent = {
|
|
1545
1753
|
type: "TestCaseProgress",
|
|
1546
1754
|
runId: task.runId,
|
|
1547
1755
|
testCaseId: testCaseItem.id,
|
|
1548
|
-
testCaseName: testCaseItem.testCase
|
|
1756
|
+
testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
|
|
1549
1757
|
completedTestCases: completedEvaluations,
|
|
1550
1758
|
totalTestCases: totalEvaluations,
|
|
1551
|
-
|
|
1552
|
-
|
|
1553
|
-
|
|
1759
|
+
repetitionId,
|
|
1760
|
+
repetitionIndex,
|
|
1761
|
+
repetitionCount,
|
|
1762
|
+
passed: repetitionPassedThis,
|
|
1554
1763
|
durationMs: Date.now() - started,
|
|
1555
1764
|
evaluatorScores,
|
|
1556
1765
|
output,
|
|
@@ -1571,9 +1780,9 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1571
1780
|
(map) => {
|
|
1572
1781
|
const key = testCaseItem.id;
|
|
1573
1782
|
const existing = map.get(key) ?? { completedCount: 0, results: [] };
|
|
1574
|
-
const newResults = [...existing.results,
|
|
1783
|
+
const newResults = [...existing.results, repetitionPassedThis];
|
|
1575
1784
|
const newCompletedCount = existing.completedCount + 1;
|
|
1576
|
-
const isLast = newCompletedCount ===
|
|
1785
|
+
const isLast = newCompletedCount === repetitionCount;
|
|
1577
1786
|
const newMap = new Map(map);
|
|
1578
1787
|
newMap.set(key, {
|
|
1579
1788
|
completedCount: newCompletedCount,
|
|
@@ -1589,10 +1798,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1589
1798
|
} else {
|
|
1590
1799
|
yield* Ref.update(failedRef, (n) => n + 1);
|
|
1591
1800
|
}
|
|
1592
|
-
const [passed, failed] = yield* Effect.all([
|
|
1593
|
-
Ref.get(passedRef),
|
|
1594
|
-
Ref.get(failedRef)
|
|
1595
|
-
]);
|
|
1801
|
+
const [passed, failed] = yield* Effect.all([Ref.get(passedRef), Ref.get(failedRef)]);
|
|
1596
1802
|
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
1597
1803
|
...snapshot,
|
|
1598
1804
|
passedTestCases: passed,
|
|
@@ -1613,10 +1819,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
1613
1819
|
runId: task.runId,
|
|
1614
1820
|
startedAt
|
|
1615
1821
|
});
|
|
1616
|
-
const totalEvaluations = task.testCases.
|
|
1617
|
-
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
1618
|
-
0
|
|
1619
|
-
);
|
|
1822
|
+
const totalEvaluations = task.testCases.length * Math.max(1, task.repetitions);
|
|
1620
1823
|
const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
|
|
1621
1824
|
const completedRef = yield* Ref.make(0);
|
|
1622
1825
|
const startedRef = yield* Ref.make(0);
|
|
@@ -1625,7 +1828,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
1625
1828
|
const testCaseResultsRef = yield* Ref.make(
|
|
1626
1829
|
/* @__PURE__ */ new Map()
|
|
1627
1830
|
);
|
|
1628
|
-
const evaluationUnits = buildEvaluationUnits(task.testCases);
|
|
1831
|
+
const evaluationUnits = buildEvaluationUnits(task.testCases, task.repetitions);
|
|
1629
1832
|
const processEvaluation = (unit) => processOneEvaluation(
|
|
1630
1833
|
task,
|
|
1631
1834
|
unit,
|
|
@@ -1639,11 +1842,20 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
1639
1842
|
failedRef,
|
|
1640
1843
|
testCaseResultsRef
|
|
1641
1844
|
);
|
|
1642
|
-
|
|
1643
|
-
|
|
1644
|
-
|
|
1645
|
-
|
|
1646
|
-
|
|
1845
|
+
const globalSem = task.globalEvaluationSemaphore;
|
|
1846
|
+
if (globalSem !== void 0) {
|
|
1847
|
+
yield* Effect.forEach(
|
|
1848
|
+
evaluationUnits,
|
|
1849
|
+
(unit) => globalSem.withPermits(1)(processEvaluation(unit)),
|
|
1850
|
+
{ concurrency: "unbounded", discard: true }
|
|
1851
|
+
);
|
|
1852
|
+
} else {
|
|
1853
|
+
yield* Effect.forEach(
|
|
1854
|
+
evaluationUnits,
|
|
1855
|
+
processEvaluation,
|
|
1856
|
+
maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
|
|
1857
|
+
);
|
|
1858
|
+
}
|
|
1647
1859
|
const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* Effect.all([
|
|
1648
1860
|
Ref.get(completedRef),
|
|
1649
1861
|
Ref.get(passedRef),
|
|
@@ -1679,155 +1891,34 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
1679
1891
|
artifactPath: task.snapshot.artifactPath
|
|
1680
1892
|
});
|
|
1681
1893
|
});
|
|
1682
|
-
|
|
1683
|
-
|
|
1684
|
-
|
|
1685
|
-
|
|
1686
|
-
|
|
1687
|
-
} catch {
|
|
1688
|
-
return [];
|
|
1689
|
-
}
|
|
1690
|
-
const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
|
|
1691
|
-
const snapshots = [];
|
|
1692
|
-
for (const fileName of jsonlFiles) {
|
|
1693
|
-
const filePath = join(baseDir, fileName);
|
|
1694
|
-
try {
|
|
1695
|
-
const snapshot = await parseArtifactToSnapshot(filePath, config);
|
|
1696
|
-
if (snapshot) {
|
|
1697
|
-
snapshots.push(snapshot);
|
|
1698
|
-
}
|
|
1699
|
-
} catch {
|
|
1700
|
-
}
|
|
1701
|
-
}
|
|
1702
|
-
return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
|
|
1703
|
-
}
|
|
1704
|
-
async function parseArtifactToSnapshot(filePath, _config) {
|
|
1705
|
-
const content = await readFile(filePath, "utf8");
|
|
1706
|
-
const lines = content.split("\n").filter((line) => line.trim().length > 0);
|
|
1707
|
-
if (lines.length === 0) {
|
|
1708
|
-
return null;
|
|
1709
|
-
}
|
|
1710
|
-
let runQueued = null;
|
|
1711
|
-
let runCompleted = null;
|
|
1712
|
-
let runFailed = null;
|
|
1713
|
-
let runStarted = null;
|
|
1714
|
-
for (const line of lines) {
|
|
1715
|
-
try {
|
|
1716
|
-
const event = JSON.parse(line);
|
|
1717
|
-
const type = event.type;
|
|
1718
|
-
if (type === "RunQueued") {
|
|
1719
|
-
runQueued = {
|
|
1720
|
-
runId: event.runId,
|
|
1721
|
-
datasetId: event.datasetId,
|
|
1722
|
-
datasetName: event.datasetName,
|
|
1723
|
-
evaluatorIds: event.evaluatorIds,
|
|
1724
|
-
totalTestCases: event.totalTestCases ?? 0,
|
|
1725
|
-
artifactPath: event.artifactPath ?? filePath,
|
|
1726
|
-
ts: event.ts
|
|
1727
|
-
};
|
|
1728
|
-
}
|
|
1729
|
-
if (type === "RunStarted") {
|
|
1730
|
-
runStarted = { startedAt: event.startedAt };
|
|
1731
|
-
}
|
|
1732
|
-
if (type === "RunCompleted") {
|
|
1733
|
-
runCompleted = {
|
|
1734
|
-
passedTestCases: event.passedTestCases,
|
|
1735
|
-
failedTestCases: event.failedTestCases,
|
|
1736
|
-
totalTestCases: event.totalTestCases,
|
|
1737
|
-
finishedAt: event.finishedAt
|
|
1738
|
-
};
|
|
1739
|
-
}
|
|
1740
|
-
if (type === "RunFailed") {
|
|
1741
|
-
runFailed = {
|
|
1742
|
-
finishedAt: event.finishedAt,
|
|
1743
|
-
errorMessage: event.errorMessage
|
|
1744
|
-
};
|
|
1745
|
-
}
|
|
1746
|
-
} catch {
|
|
1747
|
-
}
|
|
1894
|
+
|
|
1895
|
+
// src/runner/name-pattern.ts
|
|
1896
|
+
function parseRegexLiteral(pattern) {
|
|
1897
|
+
if (!pattern.startsWith("/")) {
|
|
1898
|
+
return void 0;
|
|
1748
1899
|
}
|
|
1749
|
-
|
|
1750
|
-
|
|
1900
|
+
const lastSlash = pattern.lastIndexOf("/");
|
|
1901
|
+
if (lastSlash <= 0) {
|
|
1902
|
+
return void 0;
|
|
1751
1903
|
}
|
|
1752
|
-
const artifactPath = filePath;
|
|
1753
|
-
const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
|
|
1754
|
-
const progress = aggregateTestCaseProgress(lines);
|
|
1755
|
-
const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
|
|
1756
|
-
const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
|
|
1757
|
-
const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
|
|
1758
1904
|
return {
|
|
1759
|
-
|
|
1760
|
-
|
|
1761
|
-
datasetName: runQueued.datasetName,
|
|
1762
|
-
evaluatorIds: runQueued.evaluatorIds,
|
|
1763
|
-
queuedAt: runQueued.ts ?? 0,
|
|
1764
|
-
startedAt: runStarted?.startedAt,
|
|
1765
|
-
finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
|
|
1766
|
-
totalTestCases: runQueued.totalTestCases,
|
|
1767
|
-
completedTestCases,
|
|
1768
|
-
passedTestCases,
|
|
1769
|
-
failedTestCases,
|
|
1770
|
-
status,
|
|
1771
|
-
artifactPath,
|
|
1772
|
-
errorMessage: runFailed?.errorMessage
|
|
1905
|
+
source: pattern.slice(1, lastSlash),
|
|
1906
|
+
flags: pattern.slice(lastSlash + 1)
|
|
1773
1907
|
};
|
|
1774
1908
|
}
|
|
1775
|
-
function
|
|
1776
|
-
|
|
1777
|
-
const
|
|
1778
|
-
|
|
1779
|
-
|
|
1780
|
-
|
|
1781
|
-
if (event.type === "TestCaseProgress") {
|
|
1782
|
-
const ev = event;
|
|
1783
|
-
completedTestCases = ev.completedTestCases ?? completedTestCases;
|
|
1784
|
-
const id = ev.testCaseId;
|
|
1785
|
-
const current = testCasePassedBy.get(id);
|
|
1786
|
-
testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
|
|
1787
|
-
}
|
|
1788
|
-
} catch {
|
|
1789
|
-
}
|
|
1790
|
-
}
|
|
1791
|
-
let passedTestCases = 0;
|
|
1792
|
-
let failedTestCases = 0;
|
|
1793
|
-
for (const passed of testCasePassedBy.values()) {
|
|
1794
|
-
if (passed) {
|
|
1795
|
-
passedTestCases += 1;
|
|
1796
|
-
} else {
|
|
1797
|
-
failedTestCases += 1;
|
|
1798
|
-
}
|
|
1909
|
+
function createNameMatcher(pattern) {
|
|
1910
|
+
const normalizedPattern = pattern.trim();
|
|
1911
|
+
const regexLiteral = parseRegexLiteral(normalizedPattern);
|
|
1912
|
+
if (regexLiteral) {
|
|
1913
|
+
const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
|
|
1914
|
+
return (value) => regex.test(value);
|
|
1799
1915
|
}
|
|
1800
|
-
|
|
1801
|
-
}
|
|
1802
|
-
|
|
1803
|
-
|
|
1804
|
-
const content = await readFile(artifactPath, "utf8");
|
|
1805
|
-
const lines = content.split("\n").filter((line) => line.trim().length > 0);
|
|
1806
|
-
const results = [];
|
|
1807
|
-
for (const line of lines) {
|
|
1808
|
-
try {
|
|
1809
|
-
const event = JSON.parse(line);
|
|
1810
|
-
if (event.type === "TestCaseProgress") {
|
|
1811
|
-
const ev = event;
|
|
1812
|
-
results.push({
|
|
1813
|
-
testCaseId: ev.testCaseId,
|
|
1814
|
-
testCaseName: ev.testCaseName,
|
|
1815
|
-
completedTestCases: ev.completedTestCases,
|
|
1816
|
-
totalTestCases: ev.totalTestCases,
|
|
1817
|
-
rerunIndex: ev.rerunIndex,
|
|
1818
|
-
rerunTotal: ev.rerunTotal,
|
|
1819
|
-
passed: ev.passed,
|
|
1820
|
-
durationMs: ev.durationMs,
|
|
1821
|
-
evaluatorScores: ev.evaluatorScores ?? []
|
|
1822
|
-
});
|
|
1823
|
-
}
|
|
1824
|
-
} catch {
|
|
1825
|
-
}
|
|
1826
|
-
}
|
|
1827
|
-
return results;
|
|
1828
|
-
} catch {
|
|
1829
|
-
return [];
|
|
1916
|
+
if (normalizedPattern.includes("*")) {
|
|
1917
|
+
const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
|
|
1918
|
+
const regex = new RegExp(`^${escaped}$`, "i");
|
|
1919
|
+
return (value) => regex.test(value);
|
|
1830
1920
|
}
|
|
1921
|
+
return (value) => value.toLowerCase() === normalizedPattern.toLowerCase();
|
|
1831
1922
|
}
|
|
1832
1923
|
async function appendJsonLine(artifactPath, payload) {
|
|
1833
1924
|
await mkdir(dirname(artifactPath), { recursive: true });
|
|
@@ -1886,32 +1977,12 @@ function searchCollectedTestCases(all, query) {
|
|
|
1886
1977
|
}
|
|
1887
1978
|
|
|
1888
1979
|
// src/runner/api.ts
|
|
1889
|
-
function
|
|
1890
|
-
|
|
1891
|
-
|
|
1892
|
-
|
|
1893
|
-
const lastSlash = pattern.lastIndexOf("/");
|
|
1894
|
-
if (lastSlash <= 0) {
|
|
1895
|
-
return void 0;
|
|
1896
|
-
}
|
|
1897
|
-
return {
|
|
1898
|
-
source: pattern.slice(1, lastSlash),
|
|
1899
|
-
flags: pattern.slice(lastSlash + 1)
|
|
1900
|
-
};
|
|
1901
|
-
}
|
|
1902
|
-
function createNameMatcher(pattern) {
|
|
1903
|
-
const normalizedPattern = pattern.trim();
|
|
1904
|
-
const regexLiteral = parseRegexLiteral(normalizedPattern);
|
|
1905
|
-
if (regexLiteral) {
|
|
1906
|
-
const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
|
|
1907
|
-
return (value) => regex.test(value);
|
|
1980
|
+
function normalizeRunRepetitions(value) {
|
|
1981
|
+
const n = value ?? 1;
|
|
1982
|
+
if (!Number.isInteger(n) || n < 1) {
|
|
1983
|
+
throw new Error(`repetitions must be a positive integer, got ${String(value)}`);
|
|
1908
1984
|
}
|
|
1909
|
-
|
|
1910
|
-
const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
|
|
1911
|
-
const regex = new RegExp(`^${escaped}$`, "i");
|
|
1912
|
-
return (value) => regex.test(value);
|
|
1913
|
-
}
|
|
1914
|
-
return (value) => value.toLowerCase() === normalizedPattern.toLowerCase();
|
|
1985
|
+
return n;
|
|
1915
1986
|
}
|
|
1916
1987
|
function mergeRunnerOverrides(base, next) {
|
|
1917
1988
|
if (!base) {
|
|
@@ -1942,15 +2013,12 @@ var EffectRunner = class {
|
|
|
1942
2013
|
this.persistenceQueue = Effect.runSync(
|
|
1943
2014
|
Queue.unbounded()
|
|
1944
2015
|
);
|
|
1945
|
-
this.snapshotsRef = Effect.runSync(
|
|
1946
|
-
Ref.make(/* @__PURE__ */ new Map())
|
|
1947
|
-
);
|
|
2016
|
+
this.snapshotsRef = Effect.runSync(Ref.make(/* @__PURE__ */ new Map()));
|
|
1948
2017
|
this.listeners = /* @__PURE__ */ new Set();
|
|
1949
2018
|
this.datasetsById = /* @__PURE__ */ new Map();
|
|
1950
2019
|
this.evaluatorsById = /* @__PURE__ */ new Map();
|
|
1951
|
-
this.
|
|
1952
|
-
|
|
1953
|
-
);
|
|
2020
|
+
this.runConfigsById = /* @__PURE__ */ new Map();
|
|
2021
|
+
this.schedulerFiber = Effect.runFork(this.createSchedulerEffect());
|
|
1954
2022
|
this.persistenceFiber = Effect.runFork(
|
|
1955
2023
|
createPersistenceWorker(this.persistenceQueue)
|
|
1956
2024
|
);
|
|
@@ -1990,6 +2058,137 @@ var EffectRunner = class {
|
|
|
1990
2058
|
(item) => matcher(item.evaluator.getName() ?? "")
|
|
1991
2059
|
);
|
|
1992
2060
|
}
|
|
2061
|
+
async collectRunConfigs() {
|
|
2062
|
+
const runConfigs = await collectRunConfigsFromFiles(this.config.discovery);
|
|
2063
|
+
this.runConfigsById.clear();
|
|
2064
|
+
const byNameLower = /* @__PURE__ */ new Map();
|
|
2065
|
+
for (const item of runConfigs) {
|
|
2066
|
+
const id = item.runConfig.getName();
|
|
2067
|
+
const lower = id.toLowerCase();
|
|
2068
|
+
const prev = byNameLower.get(lower);
|
|
2069
|
+
if (prev !== void 0 && prev.filePath !== item.filePath) {
|
|
2070
|
+
throw new Error(
|
|
2071
|
+
`Duplicate RunConfig name "${id}" (matches "${prev.runConfig.getName()}" case-insensitively): ${prev.filePath} and ${item.filePath}`
|
|
2072
|
+
);
|
|
2073
|
+
}
|
|
2074
|
+
byNameLower.set(lower, item);
|
|
2075
|
+
this.runConfigsById.set(id, item);
|
|
2076
|
+
}
|
|
2077
|
+
return runConfigs;
|
|
2078
|
+
}
|
|
2079
|
+
async resolveRunConfigByName(name) {
|
|
2080
|
+
if (this.runConfigsById.size === 0) {
|
|
2081
|
+
await this.collectRunConfigs();
|
|
2082
|
+
}
|
|
2083
|
+
const key = validateRunConfigName(name, `RunConfig "${name.trim()}"`);
|
|
2084
|
+
const keyLower = key.toLowerCase();
|
|
2085
|
+
const matches = Array.from(this.runConfigsById.values()).filter(
|
|
2086
|
+
(item) => item.runConfig.getName().toLowerCase() === keyLower
|
|
2087
|
+
);
|
|
2088
|
+
if (matches.length === 0) {
|
|
2089
|
+
return void 0;
|
|
2090
|
+
}
|
|
2091
|
+
if (matches.length > 1) {
|
|
2092
|
+
throw new Error(
|
|
2093
|
+
`Multiple RunConfigs named "${name}": ${matches.map((m) => m.filePath).join(", ")}`
|
|
2094
|
+
);
|
|
2095
|
+
}
|
|
2096
|
+
return matches[0];
|
|
2097
|
+
}
|
|
2098
|
+
async expandRunConfigToJobs(collected) {
|
|
2099
|
+
if (this.datasetsById.size === 0) {
|
|
2100
|
+
await this.collectDatasets();
|
|
2101
|
+
}
|
|
2102
|
+
if (this.evaluatorsById.size === 0) {
|
|
2103
|
+
await this.collectEvaluators();
|
|
2104
|
+
}
|
|
2105
|
+
const rcName = collected.runConfig.getName();
|
|
2106
|
+
const jobs = [];
|
|
2107
|
+
const runs = collected.runConfig.getRuns();
|
|
2108
|
+
for (const [i, row] of runs.entries()) {
|
|
2109
|
+
const dsCollected = Array.from(this.datasetsById.values()).find(
|
|
2110
|
+
(d) => d.dataset === row.dataset
|
|
2111
|
+
);
|
|
2112
|
+
if (!dsCollected) {
|
|
2113
|
+
throw new Error(
|
|
2114
|
+
`RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getName()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
|
|
2115
|
+
);
|
|
2116
|
+
}
|
|
2117
|
+
let evaluatorIds;
|
|
2118
|
+
if ("evaluatorPattern" in row && typeof row.evaluatorPattern === "string") {
|
|
2119
|
+
const matcher = createNameMatcher(row.evaluatorPattern);
|
|
2120
|
+
const matched = Array.from(this.evaluatorsById.values()).filter(
|
|
2121
|
+
(item) => matcher(item.evaluator.getName() ?? "")
|
|
2122
|
+
);
|
|
2123
|
+
if (matched.length === 0) {
|
|
2124
|
+
throw new Error(
|
|
2125
|
+
`RunConfig "${rcName}" run[${i}]: no evaluator matched pattern "${row.evaluatorPattern}"`
|
|
2126
|
+
);
|
|
2127
|
+
}
|
|
2128
|
+
evaluatorIds = matched.map((item) => item.id);
|
|
2129
|
+
} else {
|
|
2130
|
+
const evaluators = row.evaluators;
|
|
2131
|
+
evaluatorIds = [];
|
|
2132
|
+
for (const ev of evaluators) {
|
|
2133
|
+
const found = Array.from(this.evaluatorsById.values()).find(
|
|
2134
|
+
(item) => item.evaluator === ev
|
|
2135
|
+
);
|
|
2136
|
+
if (!found) {
|
|
2137
|
+
throw new Error(
|
|
2138
|
+
`RunConfig "${rcName}" run[${i}]: evaluator "${getEvaluatorDisplayLabel(ev) ?? "unknown"}" was not found among discovered evaluator exports`
|
|
2139
|
+
);
|
|
2140
|
+
}
|
|
2141
|
+
evaluatorIds.push(found.id);
|
|
2142
|
+
}
|
|
2143
|
+
}
|
|
2144
|
+
const repetitions = "repetitions" in row && row.repetitions !== void 0 ? row.repetitions : 1;
|
|
2145
|
+
jobs.push({
|
|
2146
|
+
datasetId: dsCollected.id,
|
|
2147
|
+
evaluatorIds,
|
|
2148
|
+
runConfigName: rcName,
|
|
2149
|
+
runConfigDisplayLabel: collected.runConfig.getDisplayLabel(),
|
|
2150
|
+
runConfigTags: collected.runConfig.getTags(),
|
|
2151
|
+
repetitions
|
|
2152
|
+
});
|
|
2153
|
+
}
|
|
2154
|
+
return jobs;
|
|
2155
|
+
}
|
|
2156
|
+
async expandRunConfigNamesToJobs(names) {
|
|
2157
|
+
const jobs = [];
|
|
2158
|
+
for (const name of names) {
|
|
2159
|
+
const collected = await this.resolveRunConfigByName(name);
|
|
2160
|
+
if (!collected) {
|
|
2161
|
+
const known = await this.collectRunConfigs();
|
|
2162
|
+
const available = known.map((r) => r.runConfig.getName()).sort();
|
|
2163
|
+
throw new Error(
|
|
2164
|
+
available.length > 0 ? `RunConfig "${name}" not found. Available RunConfigs: ${available.join(", ")}` : `RunConfig "${name}" not found and no RunConfigs were discovered.`
|
|
2165
|
+
);
|
|
2166
|
+
}
|
|
2167
|
+
jobs.push(...await this.expandRunConfigToJobs(collected));
|
|
2168
|
+
}
|
|
2169
|
+
return jobs;
|
|
2170
|
+
}
|
|
2171
|
+
async runDatasetJobsWithSharedConcurrency(request) {
|
|
2172
|
+
const globalConcurrency = Math.max(1, request.globalConcurrency);
|
|
2173
|
+
const sem = Effect.unsafeMakeSemaphore(globalConcurrency);
|
|
2174
|
+
const triggerId = request.triggerId ?? `trg-${randomUUID()}`;
|
|
2175
|
+
const snapshots = [];
|
|
2176
|
+
for (const job of request.jobs) {
|
|
2177
|
+
snapshots.push(
|
|
2178
|
+
await this.startDatasetRun({
|
|
2179
|
+
datasetId: job.datasetId,
|
|
2180
|
+
evaluatorIds: job.evaluatorIds,
|
|
2181
|
+
triggerId,
|
|
2182
|
+
maxConcurrency: this.config.maxConcurrency ?? 1,
|
|
2183
|
+
globalEvaluationSemaphore: sem,
|
|
2184
|
+
runConfigName: job.runConfigName,
|
|
2185
|
+
runConfigTags: job.runConfigTags,
|
|
2186
|
+
repetitions: job.repetitions
|
|
2187
|
+
})
|
|
2188
|
+
);
|
|
2189
|
+
}
|
|
2190
|
+
return snapshots;
|
|
2191
|
+
}
|
|
1993
2192
|
async searchTestCases(query) {
|
|
1994
2193
|
const testCases = await collectTestCasesFromFiles(this.config.discovery);
|
|
1995
2194
|
return searchCollectedTestCases(testCases, query);
|
|
@@ -2008,35 +2207,45 @@ var EffectRunner = class {
|
|
|
2008
2207
|
);
|
|
2009
2208
|
}
|
|
2010
2209
|
async runDatasetWith(request) {
|
|
2210
|
+
const runConfigName = validateRunConfigName(
|
|
2211
|
+
request.runConfigName,
|
|
2212
|
+
"runDatasetWith.runConfigName"
|
|
2213
|
+
);
|
|
2214
|
+
return this.startDatasetRun({
|
|
2215
|
+
datasetId: request.datasetId,
|
|
2216
|
+
evaluatorIds: request.evaluatorIds,
|
|
2217
|
+
triggerId: request.triggerId,
|
|
2218
|
+
maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
|
|
2219
|
+
repetitions: request.repetitions,
|
|
2220
|
+
runConfigName,
|
|
2221
|
+
runConfigTags: request.runConfigTags
|
|
2222
|
+
});
|
|
2223
|
+
}
|
|
2224
|
+
async startDatasetRun(params) {
|
|
2011
2225
|
if (this.datasetsById.size === 0) {
|
|
2012
2226
|
await this.collectDatasets();
|
|
2013
2227
|
}
|
|
2014
2228
|
if (this.evaluatorsById.size === 0) {
|
|
2015
2229
|
await this.collectEvaluators();
|
|
2016
2230
|
}
|
|
2017
|
-
const dataset = this.datasetsById.get(
|
|
2231
|
+
const dataset = this.datasetsById.get(params.datasetId);
|
|
2018
2232
|
if (!dataset) {
|
|
2019
|
-
throw new Error(`Unknown dataset: ${
|
|
2233
|
+
throw new Error(`Unknown dataset: ${params.datasetId}`);
|
|
2020
2234
|
}
|
|
2021
|
-
const selectedEvaluators =
|
|
2235
|
+
const selectedEvaluators = params.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
|
|
2022
2236
|
if (selectedEvaluators.length === 0) {
|
|
2023
2237
|
throw new Error("No evaluators selected for run");
|
|
2024
2238
|
}
|
|
2025
|
-
const selectedTestCases = await this.collectDatasetTestCases(
|
|
2026
|
-
const
|
|
2027
|
-
|
|
2028
|
-
|
|
2029
|
-
)
|
|
2030
|
-
const triggerId = request.triggerId ?? `trg-${randomUUID()}`;
|
|
2239
|
+
const selectedTestCases = await this.collectDatasetTestCases(params.datasetId);
|
|
2240
|
+
const repetitions = normalizeRunRepetitions(params.repetitions);
|
|
2241
|
+
const totalEvaluations = selectedTestCases.length * repetitions;
|
|
2242
|
+
const runConfigTags = [...params.runConfigTags ?? []];
|
|
2243
|
+
const triggerId = params.triggerId ?? `trg-${randomUUID()}`;
|
|
2031
2244
|
const runId = `run-${randomUUID()}`;
|
|
2032
|
-
const artifactPath = createArtifactPath(
|
|
2033
|
-
this.config.artifactDirectory,
|
|
2034
|
-
request.datasetId,
|
|
2035
|
-
runId
|
|
2036
|
-
);
|
|
2245
|
+
const artifactPath = createArtifactPath(this.config.artifactDirectory, params.datasetId, runId);
|
|
2037
2246
|
const snapshot = {
|
|
2038
2247
|
runId,
|
|
2039
|
-
datasetId:
|
|
2248
|
+
datasetId: params.datasetId,
|
|
2040
2249
|
datasetName: dataset.dataset.getName(),
|
|
2041
2250
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
2042
2251
|
queuedAt: Date.now(),
|
|
@@ -2057,7 +2266,7 @@ var EffectRunner = class {
|
|
|
2057
2266
|
const queuedEvent = {
|
|
2058
2267
|
type: "RunQueued",
|
|
2059
2268
|
runId,
|
|
2060
|
-
datasetId:
|
|
2269
|
+
datasetId: params.datasetId,
|
|
2061
2270
|
datasetName: dataset.dataset.getName(),
|
|
2062
2271
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
2063
2272
|
totalTestCases: totalEvaluations,
|
|
@@ -2071,17 +2280,20 @@ var EffectRunner = class {
|
|
|
2071
2280
|
payload: queuedEvent
|
|
2072
2281
|
})
|
|
2073
2282
|
);
|
|
2074
|
-
const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
|
|
2075
2283
|
await Effect.runPromise(
|
|
2076
2284
|
Queue.offer(this.runQueue, {
|
|
2077
2285
|
runId,
|
|
2078
2286
|
triggerId,
|
|
2079
|
-
datasetId:
|
|
2287
|
+
datasetId: params.datasetId,
|
|
2080
2288
|
dataset: dataset.dataset,
|
|
2081
2289
|
evaluators: selectedEvaluators,
|
|
2082
2290
|
testCases: selectedTestCases,
|
|
2083
2291
|
snapshot,
|
|
2084
|
-
maxConcurrency
|
|
2292
|
+
maxConcurrency: params.maxConcurrency,
|
|
2293
|
+
globalEvaluationSemaphore: params.globalEvaluationSemaphore,
|
|
2294
|
+
runConfigName: params.runConfigName,
|
|
2295
|
+
runConfigTags,
|
|
2296
|
+
repetitions
|
|
2085
2297
|
})
|
|
2086
2298
|
);
|
|
2087
2299
|
return snapshot;
|
|
@@ -2097,9 +2309,9 @@ var EffectRunner = class {
|
|
|
2097
2309
|
return Effect.runSync(Ref.get(this.snapshotsRef)).get(runId);
|
|
2098
2310
|
}
|
|
2099
2311
|
getAllRunSnapshots() {
|
|
2100
|
-
return Array.from(
|
|
2101
|
-
|
|
2102
|
-
)
|
|
2312
|
+
return Array.from(Effect.runSync(Ref.get(this.snapshotsRef)).values()).sort(
|
|
2313
|
+
(a, b) => b.queuedAt - a.queuedAt
|
|
2314
|
+
);
|
|
2103
2315
|
}
|
|
2104
2316
|
async loadRunSnapshotsFromArtifacts() {
|
|
2105
2317
|
return loadRunSnapshotsFromArtifacts(this.config);
|
|
@@ -2152,6 +2364,11 @@ var EffectRunner = class {
|
|
|
2152
2364
|
);
|
|
2153
2365
|
}
|
|
2154
2366
|
};
|
|
2367
|
+
|
|
2368
|
+
// src/runner/events.ts
|
|
2369
|
+
var PROGRAMMATIC_RUN_CONFIG = {
|
|
2370
|
+
runConfigName: "programmatic"
|
|
2371
|
+
};
|
|
2155
2372
|
var LEFT_PANE_WIDTH2 = 44;
|
|
2156
2373
|
var MAX_RUNS_FOR_CHART = 12;
|
|
2157
2374
|
var MAX_RUNS_FOR_TREND = 20;
|
|
@@ -2315,11 +2532,7 @@ function DatasetsView({
|
|
|
2315
2532
|
] })
|
|
2316
2533
|
] });
|
|
2317
2534
|
}
|
|
2318
|
-
function RunsView({
|
|
2319
|
-
state,
|
|
2320
|
-
dataset,
|
|
2321
|
-
selectedRun
|
|
2322
|
-
}) {
|
|
2535
|
+
function RunsView({ state, dataset, selectedRun }) {
|
|
2323
2536
|
const runs = dataset?.runs ?? [];
|
|
2324
2537
|
const rightFocused = state.focus === "right";
|
|
2325
2538
|
return /* @__PURE__ */ jsxs(Fragment, { children: [
|
|
@@ -2335,10 +2548,10 @@ function RunsView({
|
|
|
2335
2548
|
/* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
2336
2549
|
"Commit: ",
|
|
2337
2550
|
selectedRun.meta.commit,
|
|
2338
|
-
"
|
|
2551
|
+
" Branch: ",
|
|
2339
2552
|
selectedRun.meta.branch,
|
|
2553
|
+
" Seed:",
|
|
2340
2554
|
" ",
|
|
2341
|
-
"Seed: ",
|
|
2342
2555
|
selectedRun.meta.seed
|
|
2343
2556
|
] }),
|
|
2344
2557
|
/* @__PURE__ */ jsx(Text, { children: " " }),
|
|
@@ -2351,23 +2564,10 @@ function RunsView({
|
|
|
2351
2564
|
format: (v) => `${v}%`
|
|
2352
2565
|
}
|
|
2353
2566
|
),
|
|
2354
|
-
/* @__PURE__ */ jsx(
|
|
2355
|
-
TextBar,
|
|
2356
|
-
{
|
|
2357
|
-
label: "avg score",
|
|
2358
|
-
value: Math.round(selectedRun.performance.avgScore * 100)
|
|
2359
|
-
}
|
|
2360
|
-
),
|
|
2567
|
+
/* @__PURE__ */ jsx(TextBar, { label: "avg score", value: Math.round(selectedRun.performance.avgScore * 100) }),
|
|
2361
2568
|
/* @__PURE__ */ jsx(Text, { children: " " }),
|
|
2362
2569
|
/* @__PURE__ */ jsx(SectionHeader, { children: "Dimensions" }),
|
|
2363
|
-
selectedRun.dimensions.map((dimension) => /* @__PURE__ */ jsx(
|
|
2364
|
-
TextBar,
|
|
2365
|
-
{
|
|
2366
|
-
label: dimension.name,
|
|
2367
|
-
value: dimension.score
|
|
2368
|
-
},
|
|
2369
|
-
dimension.name
|
|
2370
|
-
)),
|
|
2570
|
+
selectedRun.dimensions.map((dimension) => /* @__PURE__ */ jsx(TextBar, { label: dimension.name, value: dimension.score }, dimension.name)),
|
|
2371
2571
|
/* @__PURE__ */ jsx(Text, { children: " " }),
|
|
2372
2572
|
/* @__PURE__ */ jsx(SectionHeader, { children: "Latency trend" }),
|
|
2373
2573
|
/* @__PURE__ */ jsx(
|
|
@@ -2470,15 +2670,7 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
2470
2670
|
...dimensions.map((d) => /* @__PURE__ */ jsx(TextBar, { label: d.name, value: d.score }, `dim-${d.name}`)),
|
|
2471
2671
|
/* @__PURE__ */ jsx(Text, { children: " " }, "sp2"),
|
|
2472
2672
|
/* @__PURE__ */ jsx(SectionHeader, { children: "Checks (boolean)" }, "checks-h"),
|
|
2473
|
-
...checks.map((c) => /* @__PURE__ */ jsx(
|
|
2474
|
-
CheckRow,
|
|
2475
|
-
{
|
|
2476
|
-
name: c.name,
|
|
2477
|
-
passed: c.passed,
|
|
2478
|
-
detail: c.detail
|
|
2479
|
-
},
|
|
2480
|
-
`chk-${c.name}`
|
|
2481
|
-
)),
|
|
2673
|
+
...checks.map((c) => /* @__PURE__ */ jsx(CheckRow, { name: c.name, passed: c.passed, detail: c.detail }, `chk-${c.name}`)),
|
|
2482
2674
|
/* @__PURE__ */ jsx(Text, { children: " " }, "sp3"),
|
|
2483
2675
|
/* @__PURE__ */ jsx(SectionHeader, { children: "Performance" }, "perf-h"),
|
|
2484
2676
|
/* @__PURE__ */ jsx(
|
|
@@ -2524,7 +2716,7 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
2524
2716
|
rows.push(/* @__PURE__ */ jsx(Text, { children: " " }, "sp6"));
|
|
2525
2717
|
rows.push(/* @__PURE__ */ jsx(SectionHeader, { children: "Test cases" }, "tc-h"));
|
|
2526
2718
|
for (const tc of testCases) {
|
|
2527
|
-
const
|
|
2719
|
+
const repetitionPart = tc.repetitionCount != null && tc.repetitionCount > 1 && tc.repetitionIndex != null ? ` (${tc.repetitionIndex}/${tc.repetitionCount})` : "";
|
|
2528
2720
|
rows.push(
|
|
2529
2721
|
/* @__PURE__ */ jsxs(Text, { children: [
|
|
2530
2722
|
/* @__PURE__ */ jsxs(Text, { color: "cyan", children: [
|
|
@@ -2536,13 +2728,13 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
2536
2728
|
] }),
|
|
2537
2729
|
" ",
|
|
2538
2730
|
tc.testCaseName,
|
|
2539
|
-
|
|
2731
|
+
repetitionPart ? /* @__PURE__ */ jsx(Text, { color: "cyan", children: repetitionPart }) : null,
|
|
2540
2732
|
/* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
2541
2733
|
" (",
|
|
2542
2734
|
tc.durationMs,
|
|
2543
2735
|
"ms)"
|
|
2544
2736
|
] })
|
|
2545
|
-
] }, `tc-${tc.testCaseId}-${tc.
|
|
2737
|
+
] }, `tc-${tc.testCaseId}-${tc.repetitionId ?? "x"}-${tc.repetitionIndex ?? 0}`)
|
|
2546
2738
|
);
|
|
2547
2739
|
for (const item of tc.evaluatorScores) {
|
|
2548
2740
|
const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
|
|
@@ -2595,17 +2787,10 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
2595
2787
|
}
|
|
2596
2788
|
} else {
|
|
2597
2789
|
rows.push(
|
|
2598
|
-
/* @__PURE__ */ jsxs(
|
|
2599
|
-
|
|
2600
|
-
|
|
2601
|
-
|
|
2602
|
-
children: [
|
|
2603
|
-
" ",
|
|
2604
|
-
"n/a"
|
|
2605
|
-
]
|
|
2606
|
-
},
|
|
2607
|
-
`tc-${tc.testCaseId}-${item.evaluatorId}-n/a`
|
|
2608
|
-
)
|
|
2790
|
+
/* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
2791
|
+
" ",
|
|
2792
|
+
"n/a"
|
|
2793
|
+
] }, `tc-${tc.testCaseId}-${item.evaluatorId}-n/a`)
|
|
2609
2794
|
);
|
|
2610
2795
|
}
|
|
2611
2796
|
if (!item.passed && item.logs && item.logs.length > 0) {
|
|
@@ -2663,7 +2848,7 @@ function RunDetailsView({
|
|
|
2663
2848
|
const runs = dataset?.runs ?? [];
|
|
2664
2849
|
const rightFocused = state.focus === "right";
|
|
2665
2850
|
const [testCases, setTestCases] = useState([]);
|
|
2666
|
-
const evaluatorNameById =
|
|
2851
|
+
const evaluatorNameById = React.useMemo(
|
|
2667
2852
|
() => new Map(evaluators.map((e) => [e.id, e.name])),
|
|
2668
2853
|
[evaluators]
|
|
2669
2854
|
);
|
|
@@ -2686,7 +2871,7 @@ function RunDetailsView({
|
|
|
2686
2871
|
const visible = rows.slice(offset, offset + DETAILS_PAGE_SIZE);
|
|
2687
2872
|
return /* @__PURE__ */ jsxs(Fragment, { children: [
|
|
2688
2873
|
/* @__PURE__ */ jsx(RunsSidebar, { state, dataset, runs }),
|
|
2689
|
-
/* @__PURE__ */ jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: visible.map((row, i) => /* @__PURE__ */ jsx(
|
|
2874
|
+
/* @__PURE__ */ jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: visible.map((row, i) => /* @__PURE__ */ jsx(React.Fragment, { children: row }, i)) }) })
|
|
2690
2875
|
] });
|
|
2691
2876
|
}
|
|
2692
2877
|
var LEFT_PANE_WIDTH3 = 44;
|
|
@@ -2709,19 +2894,11 @@ function NewEvaluationView({
|
|
|
2709
2894
|
visibleEvaluators.map((evaluator, index) => {
|
|
2710
2895
|
const selected = index === state.evaluatorMenuIndex;
|
|
2711
2896
|
const inSelection = state.selectedEvaluatorIds.includes(evaluator.id);
|
|
2712
|
-
return /* @__PURE__ */ jsxs(
|
|
2713
|
-
|
|
2714
|
-
|
|
2715
|
-
|
|
2716
|
-
|
|
2717
|
-
children: [
|
|
2718
|
-
selected ? "\u25B8 " : " ",
|
|
2719
|
-
inSelection ? "[x] " : "[ ] ",
|
|
2720
|
-
evaluator.name
|
|
2721
|
-
]
|
|
2722
|
-
},
|
|
2723
|
-
evaluator.id
|
|
2724
|
-
);
|
|
2897
|
+
return /* @__PURE__ */ jsxs(Text, { color: selected ? "cyan" : "gray", bold: selected, children: [
|
|
2898
|
+
selected ? "\u25B8 " : " ",
|
|
2899
|
+
inSelection ? "[x] " : "[ ] ",
|
|
2900
|
+
evaluator.name
|
|
2901
|
+
] }, evaluator.id);
|
|
2725
2902
|
})
|
|
2726
2903
|
] }),
|
|
2727
2904
|
/* @__PURE__ */ jsxs(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: [
|
|
@@ -2753,26 +2930,16 @@ function clampCursor(state, filteredDatasetsLength, selectedRunCount) {
|
|
|
2753
2930
|
...state,
|
|
2754
2931
|
datasetMenuIndex: Math.max(0, Math.min(state.datasetMenuIndex, datasetMax)),
|
|
2755
2932
|
runMenuIndex: Math.max(0, Math.min(state.runMenuIndex, runMax)),
|
|
2756
|
-
evaluatorMenuIndex: Math.max(
|
|
2757
|
-
0,
|
|
2758
|
-
Math.min(state.evaluatorMenuIndex, evaluatorMax)
|
|
2759
|
-
)
|
|
2933
|
+
evaluatorMenuIndex: Math.max(0, Math.min(state.evaluatorMenuIndex, evaluatorMax))
|
|
2760
2934
|
};
|
|
2761
2935
|
}
|
|
2762
|
-
function EvalsCliApp({
|
|
2763
|
-
data,
|
|
2764
|
-
args,
|
|
2765
|
-
runner
|
|
2766
|
-
}) {
|
|
2936
|
+
function EvalsCliApp({ data, args, runner }) {
|
|
2767
2937
|
const { exit } = useApp();
|
|
2768
2938
|
const { width: stdoutWidth, height: stdoutHeight } = useScreenSize();
|
|
2769
2939
|
const [liveData, setLiveData] = useState(data);
|
|
2770
2940
|
const [runtimeMessage, setRuntimeMessage] = useState();
|
|
2771
2941
|
const overviewRowCountRef = useRef(0);
|
|
2772
|
-
const [state, dispatch] = useReducer(
|
|
2773
|
-
reduceCliState,
|
|
2774
|
-
createInitialState(data, args)
|
|
2775
|
-
);
|
|
2942
|
+
const [state, dispatch] = useReducer(reduceCliState, createInitialState(data, args));
|
|
2776
2943
|
useEffect(() => {
|
|
2777
2944
|
setLiveData(data);
|
|
2778
2945
|
}, [data]);
|
|
@@ -2804,14 +2971,8 @@ function EvalsCliApp({
|
|
|
2804
2971
|
filteredDatasets.length,
|
|
2805
2972
|
getDatasetByMenuIndex(filteredDatasets, state.datasetMenuIndex)?.runs.length ?? 0
|
|
2806
2973
|
);
|
|
2807
|
-
const selectedDataset = getDatasetByMenuIndex(
|
|
2808
|
-
|
|
2809
|
-
clampedState.datasetMenuIndex
|
|
2810
|
-
);
|
|
2811
|
-
const selectedRun = getRunByMenuIndex(
|
|
2812
|
-
selectedDataset,
|
|
2813
|
-
clampedState.runMenuIndex
|
|
2814
|
-
);
|
|
2974
|
+
const selectedDataset = getDatasetByMenuIndex(filteredDatasets, clampedState.datasetMenuIndex);
|
|
2975
|
+
const selectedRun = getRunByMenuIndex(selectedDataset, clampedState.runMenuIndex);
|
|
2815
2976
|
const visibleEvaluators = liveData.evaluators.filter(
|
|
2816
2977
|
(evaluator) => evaluator.name.toLowerCase().includes(clampedState.searchQuery.toLowerCase())
|
|
2817
2978
|
);
|
|
@@ -2899,15 +3060,14 @@ function EvalsCliApp({
|
|
|
2899
3060
|
}
|
|
2900
3061
|
void runner.runDatasetWith({
|
|
2901
3062
|
datasetId: selectedDataset.id,
|
|
2902
|
-
evaluatorIds: clampedState.selectedEvaluatorIds
|
|
3063
|
+
evaluatorIds: clampedState.selectedEvaluatorIds,
|
|
3064
|
+
...PROGRAMMATIC_RUN_CONFIG
|
|
2903
3065
|
}).then((snapshot) => {
|
|
2904
3066
|
setRuntimeMessage(
|
|
2905
3067
|
`Started ${snapshot.runId} on ${selectedDataset.name} (${snapshot.totalTestCases} cases).`
|
|
2906
3068
|
);
|
|
2907
3069
|
}).catch((error) => {
|
|
2908
|
-
setRuntimeMessage(
|
|
2909
|
-
error instanceof Error ? error.message : "Failed to start evaluation."
|
|
2910
|
-
);
|
|
3070
|
+
setRuntimeMessage(error instanceof Error ? error.message : "Failed to start evaluation.");
|
|
2911
3071
|
});
|
|
2912
3072
|
}
|
|
2913
3073
|
});
|
|
@@ -2934,14 +3094,7 @@ function EvalsCliApp({
|
|
|
2934
3094
|
);
|
|
2935
3095
|
}
|
|
2936
3096
|
if (clampedState.level === "runs") {
|
|
2937
|
-
return /* @__PURE__ */ jsx(
|
|
2938
|
-
RunsView,
|
|
2939
|
-
{
|
|
2940
|
-
state: clampedState,
|
|
2941
|
-
dataset: selectedDataset,
|
|
2942
|
-
selectedRun
|
|
2943
|
-
}
|
|
2944
|
-
);
|
|
3097
|
+
return /* @__PURE__ */ jsx(RunsView, { state: clampedState, dataset: selectedDataset, selectedRun });
|
|
2945
3098
|
}
|
|
2946
3099
|
return /* @__PURE__ */ jsx(
|
|
2947
3100
|
RunDetailsView,
|
|
@@ -2953,82 +3106,44 @@ function EvalsCliApp({
|
|
|
2953
3106
|
}
|
|
2954
3107
|
);
|
|
2955
3108
|
};
|
|
2956
|
-
return /* @__PURE__ */ jsxs(
|
|
2957
|
-
Box,
|
|
2958
|
-
|
|
2959
|
-
|
|
2960
|
-
|
|
2961
|
-
|
|
2962
|
-
|
|
2963
|
-
|
|
2964
|
-
|
|
2965
|
-
|
|
2966
|
-
|
|
2967
|
-
|
|
2968
|
-
|
|
2969
|
-
|
|
2970
|
-
|
|
2971
|
-
|
|
2972
|
-
|
|
2973
|
-
|
|
2974
|
-
|
|
2975
|
-
|
|
2976
|
-
|
|
2977
|
-
|
|
2978
|
-
|
|
2979
|
-
|
|
2980
|
-
|
|
2981
|
-
|
|
2982
|
-
|
|
2983
|
-
|
|
2984
|
-
|
|
2985
|
-
|
|
2986
|
-
|
|
2987
|
-
|
|
2988
|
-
|
|
2989
|
-
|
|
2990
|
-
|
|
2991
|
-
|
|
2992
|
-
|
|
2993
|
-
|
|
2994
|
-
Box,
|
|
2995
|
-
{
|
|
2996
|
-
marginTop: 1,
|
|
2997
|
-
borderStyle: "round",
|
|
2998
|
-
borderColor: "magenta",
|
|
2999
|
-
paddingX: 1,
|
|
3000
|
-
width: stdoutWidth,
|
|
3001
|
-
children: [
|
|
3002
|
-
/* @__PURE__ */ jsx(Text, { color: "magenta", bold: true, children: "Search: " }),
|
|
3003
|
-
/* @__PURE__ */ jsx(Text, { color: "white", children: clampedState.searchQuery })
|
|
3004
|
-
]
|
|
3005
|
-
}
|
|
3006
|
-
),
|
|
3007
|
-
runtimeMessage && /* @__PURE__ */ jsx(
|
|
3008
|
-
Box,
|
|
3009
|
-
{
|
|
3010
|
-
marginTop: 1,
|
|
3011
|
-
borderStyle: "round",
|
|
3012
|
-
borderColor: "blue",
|
|
3013
|
-
paddingX: 1,
|
|
3014
|
-
width: stdoutWidth,
|
|
3015
|
-
children: /* @__PURE__ */ jsx(Text, { color: "blue", children: runtimeMessage })
|
|
3016
|
-
}
|
|
3017
|
-
),
|
|
3018
|
-
/* @__PURE__ */ jsx(
|
|
3019
|
-
Box,
|
|
3020
|
-
{
|
|
3021
|
-
marginTop: 1,
|
|
3022
|
-
flexGrow: 1,
|
|
3023
|
-
width: stdoutWidth,
|
|
3024
|
-
flexDirection: "row",
|
|
3025
|
-
children: renderContent()
|
|
3026
|
-
}
|
|
3027
|
-
),
|
|
3028
|
-
/* @__PURE__ */ jsx(Box, { marginTop: 1, paddingX: 1, children: /* @__PURE__ */ jsx(Text, { color: "gray", children: getFooterText(clampedState) }) })
|
|
3029
|
-
]
|
|
3030
|
-
}
|
|
3031
|
-
);
|
|
3109
|
+
return /* @__PURE__ */ jsxs(Box, { flexDirection: "column", flexGrow: 1, width: stdoutWidth, height: stdoutHeight, children: [
|
|
3110
|
+
/* @__PURE__ */ jsx(Box, { borderStyle: "round", borderColor: "cyan", paddingX: 1, width: stdoutWidth, children: /* @__PURE__ */ jsx(Text, { children: getBreadcrumbText(clampedState, selectedDataset?.name, selectedRun?.label) }) }),
|
|
3111
|
+
clampedState.startupWarnings.length > 0 && /* @__PURE__ */ jsxs(
|
|
3112
|
+
Box,
|
|
3113
|
+
{
|
|
3114
|
+
marginTop: 1,
|
|
3115
|
+
borderStyle: "round",
|
|
3116
|
+
borderColor: "yellow",
|
|
3117
|
+
paddingX: 1,
|
|
3118
|
+
flexDirection: "column",
|
|
3119
|
+
width: stdoutWidth,
|
|
3120
|
+
children: [
|
|
3121
|
+
/* @__PURE__ */ jsx(Text, { color: "yellow", children: "Startup warnings:" }),
|
|
3122
|
+
clampedState.startupWarnings.map((warning, index) => /* @__PURE__ */ jsx(Text, { children: warning }, `${warning}-${index}`))
|
|
3123
|
+
]
|
|
3124
|
+
}
|
|
3125
|
+
),
|
|
3126
|
+
clampedState.searchMode && /* @__PURE__ */ jsxs(
|
|
3127
|
+
Box,
|
|
3128
|
+
{
|
|
3129
|
+
marginTop: 1,
|
|
3130
|
+
borderStyle: "round",
|
|
3131
|
+
borderColor: "magenta",
|
|
3132
|
+
paddingX: 1,
|
|
3133
|
+
width: stdoutWidth,
|
|
3134
|
+
children: [
|
|
3135
|
+
/* @__PURE__ */ jsxs(Text, { color: "magenta", bold: true, children: [
|
|
3136
|
+
"Search:",
|
|
3137
|
+
" "
|
|
3138
|
+
] }),
|
|
3139
|
+
/* @__PURE__ */ jsx(Text, { color: "white", children: clampedState.searchQuery })
|
|
3140
|
+
]
|
|
3141
|
+
}
|
|
3142
|
+
),
|
|
3143
|
+
runtimeMessage && /* @__PURE__ */ jsx(Box, { marginTop: 1, borderStyle: "round", borderColor: "blue", paddingX: 1, width: stdoutWidth, children: /* @__PURE__ */ jsx(Text, { color: "blue", children: runtimeMessage }) }),
|
|
3144
|
+
/* @__PURE__ */ jsx(Box, { marginTop: 1, flexGrow: 1, width: stdoutWidth, flexDirection: "row", children: renderContent() }),
|
|
3145
|
+
/* @__PURE__ */ jsx(Box, { marginTop: 1, paddingX: 1, children: /* @__PURE__ */ jsx(Text, { color: "gray", children: getFooterText(clampedState) }) })
|
|
3146
|
+
] });
|
|
3032
3147
|
}
|
|
3033
3148
|
async function main() {
|
|
3034
3149
|
const args = parseStartupArgs(process.argv.slice(2));
|
|
@@ -3040,9 +3155,7 @@ async function main() {
|
|
|
3040
3155
|
process.on("SIGTERM", () => {
|
|
3041
3156
|
void runner.shutdown().finally(() => process.exit(0));
|
|
3042
3157
|
});
|
|
3043
|
-
withFullScreen(
|
|
3044
|
-
/* @__PURE__ */ jsx(EvalsCliApp, { data, args, runner })
|
|
3045
|
-
).start();
|
|
3158
|
+
withFullScreen(/* @__PURE__ */ jsx(EvalsCliApp, { data, args, runner })).start();
|
|
3046
3159
|
}
|
|
3047
3160
|
void main();
|
|
3048
3161
|
//# sourceMappingURL=out.js.map
|