@m4trix/evals 0.25.0 → 0.26.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +29 -7
- package/dist/cli-simple.cjs +911 -643
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +898 -630
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +688 -575
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +679 -566
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +959 -623
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +201 -7
- package/dist/index.js +947 -625
- package/dist/index.js.map +1 -1
- package/package.json +3 -5
package/dist/cli.cjs
CHANGED
|
@@ -2,19 +2,19 @@
|
|
|
2
2
|
'use strict';
|
|
3
3
|
|
|
4
4
|
var fullscreenInk = require('fullscreen-ink');
|
|
5
|
-
var
|
|
5
|
+
var React = require('react');
|
|
6
6
|
var ink = require('ink');
|
|
7
7
|
var jsxRuntime = require('react/jsx-runtime');
|
|
8
|
-
var path = require('path');
|
|
9
|
-
var inkChart = require('@pppp606/ink-chart');
|
|
10
|
-
var crypto = require('crypto');
|
|
11
8
|
var effect = require('effect');
|
|
9
|
+
var crypto = require('crypto');
|
|
10
|
+
var promises = require('fs/promises');
|
|
11
|
+
var path = require('path');
|
|
12
12
|
var fs = require('fs');
|
|
13
13
|
var jitiModule = require('jiti');
|
|
14
|
-
var promises = require('fs/promises');
|
|
15
14
|
var url = require('url');
|
|
16
15
|
var diff = require('diff');
|
|
17
16
|
var stringify = require('fast-json-stable-stringify');
|
|
17
|
+
var inkChart = require('@pppp606/ink-chart');
|
|
18
18
|
|
|
19
19
|
var _documentCurrentScript = typeof document !== 'undefined' ? document.currentScript : null;
|
|
20
20
|
function _interopDefault (e) { return e && e.__esModule ? e : { default: e }; }
|
|
@@ -37,7 +37,7 @@ function _interopNamespace(e) {
|
|
|
37
37
|
return Object.freeze(n);
|
|
38
38
|
}
|
|
39
39
|
|
|
40
|
-
var
|
|
40
|
+
var React__default = /*#__PURE__*/_interopDefault(React);
|
|
41
41
|
var jitiModule__namespace = /*#__PURE__*/_interopNamespace(jitiModule);
|
|
42
42
|
var stringify__default = /*#__PURE__*/_interopDefault(stringify);
|
|
43
43
|
|
|
@@ -117,11 +117,7 @@ function getFooterText(state) {
|
|
|
117
117
|
}
|
|
118
118
|
return "\u2191\u2193 move Enter add/remove S start run / search Esc cancel q quit";
|
|
119
119
|
}
|
|
120
|
-
function ListItem({
|
|
121
|
-
selected,
|
|
122
|
-
label,
|
|
123
|
-
itemKey
|
|
124
|
-
}) {
|
|
120
|
+
function ListItem({ selected, label, itemKey }) {
|
|
125
121
|
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: selected ? "cyan" : "gray", bold: selected, children: [
|
|
126
122
|
selected ? "\u25B8 " : " ",
|
|
127
123
|
label
|
|
@@ -148,9 +144,7 @@ function Pane({
|
|
|
148
144
|
}
|
|
149
145
|
);
|
|
150
146
|
}
|
|
151
|
-
function SectionHeader({
|
|
152
|
-
children
|
|
153
|
-
}) {
|
|
147
|
+
function SectionHeader({ children }) {
|
|
154
148
|
return /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children });
|
|
155
149
|
}
|
|
156
150
|
function StatusText({ status }) {
|
|
@@ -162,10 +156,7 @@ function StatusText({ status }) {
|
|
|
162
156
|
] });
|
|
163
157
|
}
|
|
164
158
|
var LEFT_PANE_WIDTH = 44;
|
|
165
|
-
function RunsSidebar({
|
|
166
|
-
state,
|
|
167
|
-
runs
|
|
168
|
-
}) {
|
|
159
|
+
function RunsSidebar({ state, runs }) {
|
|
169
160
|
const focused = state.focus === "left";
|
|
170
161
|
return /* @__PURE__ */ jsxRuntime.jsxs(Pane, { width: LEFT_PANE_WIDTH, focused, children: [
|
|
171
162
|
/* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Runs" }),
|
|
@@ -194,11 +185,7 @@ function RunsSidebar({
|
|
|
194
185
|
] });
|
|
195
186
|
}
|
|
196
187
|
var BLOCKS = ["\u2581", "\u2582", "\u2583", "\u2584", "\u2585", "\u2586", "\u2587", "\u2588"];
|
|
197
|
-
function Sparkline({
|
|
198
|
-
data,
|
|
199
|
-
width,
|
|
200
|
-
label
|
|
201
|
-
}) {
|
|
188
|
+
function Sparkline({ data, width, label }) {
|
|
202
189
|
if (data.length === 0)
|
|
203
190
|
return null;
|
|
204
191
|
const max = Math.max(...data);
|
|
@@ -277,6 +264,50 @@ function isPrintableCharacter(input) {
|
|
|
277
264
|
function isBackKey(key) {
|
|
278
265
|
return key.backspace || key.delete;
|
|
279
266
|
}
|
|
267
|
+
var ENTITY_ID_PATTERN = /^[a-zA-Z0-9_-]+$/;
|
|
268
|
+
function makeEntityIdSchema(brand, label) {
|
|
269
|
+
return effect.Schema.String.pipe(
|
|
270
|
+
effect.Schema.trimmed(),
|
|
271
|
+
effect.Schema.minLength(1, {
|
|
272
|
+
message: () => `${label} must be non-empty.`
|
|
273
|
+
}),
|
|
274
|
+
effect.Schema.pattern(ENTITY_ID_PATTERN, {
|
|
275
|
+
message: () => `${label} may only contain letters, digits, underscores, and hyphens (no spaces). Examples: "my-nightly", "my_nightly", "myNightly".`
|
|
276
|
+
}),
|
|
277
|
+
effect.Schema.brand(brand)
|
|
278
|
+
);
|
|
279
|
+
}
|
|
280
|
+
var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
|
|
281
|
+
makeEntityIdSchema("EvaluatorName", "Evaluator name");
|
|
282
|
+
makeEntityIdSchema("TestCaseName", "Test case name");
|
|
283
|
+
function validateWithSchema(schema, raw, context) {
|
|
284
|
+
const trimmed = raw.trim();
|
|
285
|
+
const decode = effect.Schema.decodeUnknownEither(
|
|
286
|
+
schema
|
|
287
|
+
);
|
|
288
|
+
const result = decode(trimmed);
|
|
289
|
+
if (effect.Either.isLeft(result)) {
|
|
290
|
+
throw new Error(`${context}: ${effect.ParseResult.TreeFormatter.formatErrorSync(result.left)}`);
|
|
291
|
+
}
|
|
292
|
+
return result.right;
|
|
293
|
+
}
|
|
294
|
+
function validateRunConfigName(raw, context) {
|
|
295
|
+
return validateWithSchema(RunConfigNameSchema, raw, context);
|
|
296
|
+
}
|
|
297
|
+
|
|
298
|
+
// src/evals/evaluator.ts
|
|
299
|
+
function getEvaluatorDisplayLabel(evaluator) {
|
|
300
|
+
if (typeof evaluator.getDisplayLabel === "function") {
|
|
301
|
+
const label = evaluator.getDisplayLabel();
|
|
302
|
+
if (label !== void 0) {
|
|
303
|
+
return label;
|
|
304
|
+
}
|
|
305
|
+
}
|
|
306
|
+
return typeof evaluator.getName === "function" ? evaluator.getName() : void 0;
|
|
307
|
+
}
|
|
308
|
+
function getEvaluatorTagList(evaluator) {
|
|
309
|
+
return typeof evaluator.getTags === "function" ? [...evaluator.getTags()] : [];
|
|
310
|
+
}
|
|
280
311
|
|
|
281
312
|
// src/cli/data.mock.json
|
|
282
313
|
var data_mock_default = {
|
|
@@ -428,9 +459,7 @@ var data_mock_default = {
|
|
|
428
459
|
{ name: "contract_match", score: 100 },
|
|
429
460
|
{ name: "arg_validity", score: 100 }
|
|
430
461
|
],
|
|
431
|
-
checks: [
|
|
432
|
-
{ name: "tool_calls", passed: true, detail: "0 unexpected" }
|
|
433
|
-
],
|
|
462
|
+
checks: [{ name: "tool_calls", passed: true, detail: "0 unexpected" }],
|
|
434
463
|
failures: [],
|
|
435
464
|
meta: {
|
|
436
465
|
model: "gpt-4o-mini",
|
|
@@ -453,9 +482,21 @@ var data_mock_default = {
|
|
|
453
482
|
}
|
|
454
483
|
],
|
|
455
484
|
evaluators: [
|
|
456
|
-
{
|
|
457
|
-
|
|
458
|
-
|
|
485
|
+
{
|
|
486
|
+
id: "json-schema-validator",
|
|
487
|
+
name: "JSON Schema Validator",
|
|
488
|
+
configPreview: "strict=true"
|
|
489
|
+
},
|
|
490
|
+
{
|
|
491
|
+
id: "tool-call-contract-checker",
|
|
492
|
+
name: "Tool-call Contract Checker",
|
|
493
|
+
configPreview: "unexpectedCalls=error"
|
|
494
|
+
},
|
|
495
|
+
{
|
|
496
|
+
id: "rubric-judge",
|
|
497
|
+
name: "Rubric Judge (LLM)",
|
|
498
|
+
configPreview: "model=gpt-4o-mini; scale=0-100"
|
|
499
|
+
},
|
|
459
500
|
{ id: "pii-leak-detector", name: "PII Leak Detector", configPreview: "redact=false" }
|
|
460
501
|
]
|
|
461
502
|
};
|
|
@@ -522,7 +563,7 @@ function toEvalDataset(item, snapshots) {
|
|
|
522
563
|
function toEvaluatorOption(item) {
|
|
523
564
|
return {
|
|
524
565
|
id: item.id,
|
|
525
|
-
name: item.evaluator
|
|
566
|
+
name: getEvaluatorDisplayLabel(item.evaluator) ?? toSlug(item.id),
|
|
526
567
|
configPreview: `Source: ${item.filePath}`
|
|
527
568
|
};
|
|
528
569
|
}
|
|
@@ -535,9 +576,7 @@ async function loadRunnerData(runner) {
|
|
|
535
576
|
const memSnapshots = runner.getAllRunSnapshots();
|
|
536
577
|
const seen = new Set(memSnapshots.map((s) => s.runId));
|
|
537
578
|
const fromDisk = diskSnapshots.filter((s) => !seen.has(s.runId));
|
|
538
|
-
const snapshots = [...memSnapshots, ...fromDisk].sort(
|
|
539
|
-
(a, b) => b.queuedAt - a.queuedAt
|
|
540
|
-
);
|
|
579
|
+
const snapshots = [...memSnapshots, ...fromDisk].sort((a, b) => b.queuedAt - a.queuedAt);
|
|
541
580
|
if (datasets.length === 0 && evaluators.length === 0) {
|
|
542
581
|
return loadMockData();
|
|
543
582
|
}
|
|
@@ -659,7 +698,11 @@ function reduceCliState(state, action) {
|
|
|
659
698
|
return { ...state, overviewScrollOffset: Math.max(0, state.overviewScrollOffset - 1) };
|
|
660
699
|
}
|
|
661
700
|
if (state.level === "datasets") {
|
|
662
|
-
return {
|
|
701
|
+
return {
|
|
702
|
+
...state,
|
|
703
|
+
datasetMenuIndex: Math.max(0, state.datasetMenuIndex - 1),
|
|
704
|
+
overviewScrollOffset: 0
|
|
705
|
+
};
|
|
663
706
|
}
|
|
664
707
|
if (state.level === "runs") {
|
|
665
708
|
return { ...state, runMenuIndex: Math.max(0, state.runMenuIndex - 1) };
|
|
@@ -677,10 +720,17 @@ function reduceCliState(state, action) {
|
|
|
677
720
|
return { ...state, detailsScrollOffset: Math.min(action.max, state.detailsScrollOffset + 1) };
|
|
678
721
|
}
|
|
679
722
|
if (state.level === "datasets" && state.focus === "right") {
|
|
680
|
-
return {
|
|
723
|
+
return {
|
|
724
|
+
...state,
|
|
725
|
+
overviewScrollOffset: Math.min(action.max, state.overviewScrollOffset + 1)
|
|
726
|
+
};
|
|
681
727
|
}
|
|
682
728
|
if (state.level === "datasets") {
|
|
683
|
-
return {
|
|
729
|
+
return {
|
|
730
|
+
...state,
|
|
731
|
+
datasetMenuIndex: Math.min(action.max, state.datasetMenuIndex + 1),
|
|
732
|
+
overviewScrollOffset: 0
|
|
733
|
+
};
|
|
684
734
|
}
|
|
685
735
|
if (state.level === "runs") {
|
|
686
736
|
return { ...state, runMenuIndex: Math.min(action.max, state.runMenuIndex + 1) };
|
|
@@ -756,24 +806,168 @@ function reduceCliState(state, action) {
|
|
|
756
806
|
}
|
|
757
807
|
return state;
|
|
758
808
|
}
|
|
809
|
+
async function loadRunSnapshotsFromArtifacts(config) {
|
|
810
|
+
const baseDir = path.resolve(config.artifactDirectory);
|
|
811
|
+
let entries;
|
|
812
|
+
try {
|
|
813
|
+
entries = await promises.readdir(baseDir);
|
|
814
|
+
} catch {
|
|
815
|
+
return [];
|
|
816
|
+
}
|
|
817
|
+
const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
|
|
818
|
+
const snapshots = [];
|
|
819
|
+
for (const fileName of jsonlFiles) {
|
|
820
|
+
const filePath = path.join(baseDir, fileName);
|
|
821
|
+
try {
|
|
822
|
+
const snapshot = await parseArtifactToSnapshot(filePath, config);
|
|
823
|
+
if (snapshot) {
|
|
824
|
+
snapshots.push(snapshot);
|
|
825
|
+
}
|
|
826
|
+
} catch {
|
|
827
|
+
}
|
|
828
|
+
}
|
|
829
|
+
return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
|
|
830
|
+
}
|
|
831
|
+
async function parseArtifactToSnapshot(filePath, _config) {
|
|
832
|
+
const content = await promises.readFile(filePath, "utf8");
|
|
833
|
+
const lines = content.split("\n").filter((line) => line.trim().length > 0);
|
|
834
|
+
if (lines.length === 0) {
|
|
835
|
+
return null;
|
|
836
|
+
}
|
|
837
|
+
let runQueued = null;
|
|
838
|
+
let runCompleted = null;
|
|
839
|
+
let runFailed = null;
|
|
840
|
+
let runStarted = null;
|
|
841
|
+
for (const line of lines) {
|
|
842
|
+
try {
|
|
843
|
+
const event = JSON.parse(line);
|
|
844
|
+
const type = event.type;
|
|
845
|
+
if (type === "RunQueued") {
|
|
846
|
+
runQueued = {
|
|
847
|
+
runId: event.runId,
|
|
848
|
+
datasetId: event.datasetId,
|
|
849
|
+
datasetName: event.datasetName,
|
|
850
|
+
evaluatorIds: event.evaluatorIds,
|
|
851
|
+
totalTestCases: event.totalTestCases ?? 0,
|
|
852
|
+
artifactPath: event.artifactPath ?? filePath,
|
|
853
|
+
ts: event.ts
|
|
854
|
+
};
|
|
855
|
+
}
|
|
856
|
+
if (type === "RunStarted") {
|
|
857
|
+
runStarted = { startedAt: event.startedAt };
|
|
858
|
+
}
|
|
859
|
+
if (type === "RunCompleted") {
|
|
860
|
+
runCompleted = {
|
|
861
|
+
passedTestCases: event.passedTestCases,
|
|
862
|
+
failedTestCases: event.failedTestCases,
|
|
863
|
+
totalTestCases: event.totalTestCases,
|
|
864
|
+
finishedAt: event.finishedAt
|
|
865
|
+
};
|
|
866
|
+
}
|
|
867
|
+
if (type === "RunFailed") {
|
|
868
|
+
runFailed = {
|
|
869
|
+
finishedAt: event.finishedAt,
|
|
870
|
+
errorMessage: event.errorMessage
|
|
871
|
+
};
|
|
872
|
+
}
|
|
873
|
+
} catch {
|
|
874
|
+
}
|
|
875
|
+
}
|
|
876
|
+
if (!runQueued) {
|
|
877
|
+
return null;
|
|
878
|
+
}
|
|
879
|
+
const artifactPath = filePath;
|
|
880
|
+
const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
|
|
881
|
+
const progress = aggregateTestCaseProgress(lines);
|
|
882
|
+
const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
|
|
883
|
+
const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
|
|
884
|
+
const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
|
|
885
|
+
return {
|
|
886
|
+
runId: runQueued.runId,
|
|
887
|
+
datasetId: runQueued.datasetId,
|
|
888
|
+
datasetName: runQueued.datasetName,
|
|
889
|
+
evaluatorIds: runQueued.evaluatorIds,
|
|
890
|
+
queuedAt: runQueued.ts ?? 0,
|
|
891
|
+
startedAt: runStarted?.startedAt,
|
|
892
|
+
finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
|
|
893
|
+
totalTestCases: runQueued.totalTestCases,
|
|
894
|
+
completedTestCases,
|
|
895
|
+
passedTestCases,
|
|
896
|
+
failedTestCases,
|
|
897
|
+
status,
|
|
898
|
+
artifactPath,
|
|
899
|
+
errorMessage: runFailed?.errorMessage
|
|
900
|
+
};
|
|
901
|
+
}
|
|
902
|
+
function aggregateTestCaseProgress(lines) {
|
|
903
|
+
let completedTestCases = 0;
|
|
904
|
+
const testCasePassedBy = /* @__PURE__ */ new Map();
|
|
905
|
+
for (const line of lines) {
|
|
906
|
+
try {
|
|
907
|
+
const event = JSON.parse(line);
|
|
908
|
+
if (event.type === "TestCaseProgress") {
|
|
909
|
+
const ev = event;
|
|
910
|
+
completedTestCases = ev.completedTestCases ?? completedTestCases;
|
|
911
|
+
const id = ev.testCaseId;
|
|
912
|
+
const current = testCasePassedBy.get(id);
|
|
913
|
+
testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
|
|
914
|
+
}
|
|
915
|
+
} catch {
|
|
916
|
+
}
|
|
917
|
+
}
|
|
918
|
+
let passedTestCases = 0;
|
|
919
|
+
let failedTestCases = 0;
|
|
920
|
+
for (const passed of testCasePassedBy.values()) {
|
|
921
|
+
if (passed) {
|
|
922
|
+
passedTestCases += 1;
|
|
923
|
+
} else {
|
|
924
|
+
failedTestCases += 1;
|
|
925
|
+
}
|
|
926
|
+
}
|
|
927
|
+
return { completedTestCases, passedTestCases, failedTestCases };
|
|
928
|
+
}
|
|
929
|
+
async function parseArtifactFile(artifactPath) {
|
|
930
|
+
try {
|
|
931
|
+
const content = await promises.readFile(artifactPath, "utf8");
|
|
932
|
+
const lines = content.split("\n").filter((line) => line.trim().length > 0);
|
|
933
|
+
const results = [];
|
|
934
|
+
for (const line of lines) {
|
|
935
|
+
try {
|
|
936
|
+
const event = JSON.parse(line);
|
|
937
|
+
if (event.type === "TestCaseProgress") {
|
|
938
|
+
const ev = event;
|
|
939
|
+
const repetitionIndex = ev.repetitionIndex ?? ev.rerunIndex;
|
|
940
|
+
const repetitionCount = ev.repetitionCount ?? ev.rerunTotal;
|
|
941
|
+
results.push({
|
|
942
|
+
testCaseId: ev.testCaseId,
|
|
943
|
+
testCaseName: ev.testCaseName,
|
|
944
|
+
completedTestCases: ev.completedTestCases,
|
|
945
|
+
totalTestCases: ev.totalTestCases,
|
|
946
|
+
repetitionId: ev.repetitionId,
|
|
947
|
+
repetitionIndex,
|
|
948
|
+
repetitionCount,
|
|
949
|
+
passed: ev.passed,
|
|
950
|
+
durationMs: ev.durationMs,
|
|
951
|
+
evaluatorScores: ev.evaluatorScores ?? []
|
|
952
|
+
});
|
|
953
|
+
}
|
|
954
|
+
} catch {
|
|
955
|
+
}
|
|
956
|
+
}
|
|
957
|
+
return results;
|
|
958
|
+
} catch {
|
|
959
|
+
return [];
|
|
960
|
+
}
|
|
961
|
+
}
|
|
759
962
|
|
|
760
963
|
// src/runner/config.ts
|
|
761
964
|
var defaultRunnerConfig = {
|
|
762
965
|
discovery: {
|
|
763
966
|
rootDir: process.cwd(),
|
|
764
967
|
datasetSuffixes: [".dataset.ts", ".dataset.tsx", ".dataset.js", ".dataset.mjs"],
|
|
765
|
-
evaluatorSuffixes: [
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
".evaluator.js",
|
|
769
|
-
".evaluator.mjs"
|
|
770
|
-
],
|
|
771
|
-
testCaseSuffixes: [
|
|
772
|
-
".test-case.ts",
|
|
773
|
-
".test-case.tsx",
|
|
774
|
-
".test-case.js",
|
|
775
|
-
".test-case.mjs"
|
|
776
|
-
],
|
|
968
|
+
evaluatorSuffixes: [".evaluator.ts", ".evaluator.tsx", ".evaluator.js", ".evaluator.mjs"],
|
|
969
|
+
runConfigSuffixes: [".run-config.ts", ".run-config.tsx", ".run-config.js", ".run-config.mjs"],
|
|
970
|
+
testCaseSuffixes: [".test-case.ts", ".test-case.tsx", ".test-case.js", ".test-case.mjs"],
|
|
777
971
|
excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
|
|
778
972
|
},
|
|
779
973
|
artifactDirectory: ".eval-results",
|
|
@@ -798,6 +992,11 @@ function toRunnerConfigOverrides(config) {
|
|
|
798
992
|
} else if (rawDiscovery?.evaluatorSuffixes !== void 0) {
|
|
799
993
|
discovery.evaluatorSuffixes = rawDiscovery.evaluatorSuffixes;
|
|
800
994
|
}
|
|
995
|
+
if (rawDiscovery?.runConfigFilePatterns !== void 0) {
|
|
996
|
+
discovery.runConfigSuffixes = rawDiscovery.runConfigFilePatterns;
|
|
997
|
+
} else if (rawDiscovery?.runConfigSuffixes !== void 0) {
|
|
998
|
+
discovery.runConfigSuffixes = rawDiscovery.runConfigSuffixes;
|
|
999
|
+
}
|
|
801
1000
|
if (rawDiscovery?.testCaseFilePatterns !== void 0) {
|
|
802
1001
|
discovery.testCaseSuffixes = rawDiscovery.testCaseFilePatterns;
|
|
803
1002
|
} else if (rawDiscovery?.testCaseSuffixes !== void 0) {
|
|
@@ -840,14 +1039,15 @@ function getJitiLoader() {
|
|
|
840
1039
|
}
|
|
841
1040
|
const createJiti2 = jitiModule__namespace.createJiti ?? jitiModule__namespace.default;
|
|
842
1041
|
if (typeof createJiti2 !== "function") {
|
|
843
|
-
throw new Error(
|
|
844
|
-
"Failed to initialize jiti for m4trix eval config loading."
|
|
845
|
-
);
|
|
1042
|
+
throw new Error("Failed to initialize jiti for m4trix eval config loading.");
|
|
846
1043
|
}
|
|
847
|
-
cachedLoader = createJiti2(
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
|
|
1044
|
+
cachedLoader = createJiti2(
|
|
1045
|
+
(typeof document === 'undefined' ? require('u' + 'rl').pathToFileURL(__filename).href : (_documentCurrentScript && _documentCurrentScript.tagName.toUpperCase() === 'SCRIPT' && _documentCurrentScript.src || new URL('out.js', document.baseURI).href)),
|
|
1046
|
+
{
|
|
1047
|
+
interopDefault: true,
|
|
1048
|
+
moduleCache: true
|
|
1049
|
+
}
|
|
1050
|
+
);
|
|
851
1051
|
return cachedLoader;
|
|
852
1052
|
}
|
|
853
1053
|
function resolveConfigModuleExport(loadedModule) {
|
|
@@ -895,6 +1095,9 @@ function isDatasetLike(value) {
|
|
|
895
1095
|
function isEvaluatorLike(value) {
|
|
896
1096
|
return hasMethod(value, "getName") && hasMethod(value, "resolveContext") && hasMethod(value, "getEvaluateFn");
|
|
897
1097
|
}
|
|
1098
|
+
function isRunConfigLike(value) {
|
|
1099
|
+
return hasMethod(value, "getName") && hasMethod(value, "getRuns") && typeof value.getRuns === "function";
|
|
1100
|
+
}
|
|
898
1101
|
function isTestCaseLike(value) {
|
|
899
1102
|
return hasMethod(value, "getName") && hasMethod(value, "getTags") && hasMethod(value, "getInput");
|
|
900
1103
|
}
|
|
@@ -951,9 +1154,7 @@ async function loadModuleExports(filePath) {
|
|
|
951
1154
|
}
|
|
952
1155
|
async function collectDatasetsFromFiles(config) {
|
|
953
1156
|
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
954
|
-
const matched = files.filter(
|
|
955
|
-
(filePath) => hasOneSuffix(filePath, config.datasetSuffixes)
|
|
956
|
-
);
|
|
1157
|
+
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.datasetSuffixes));
|
|
957
1158
|
const found = await Promise.all(
|
|
958
1159
|
matched.map(async (absolutePath) => {
|
|
959
1160
|
const exports = await loadModuleExports(absolutePath);
|
|
@@ -970,9 +1171,7 @@ async function collectDatasetsFromFiles(config) {
|
|
|
970
1171
|
}
|
|
971
1172
|
async function collectEvaluatorsFromFiles(config) {
|
|
972
1173
|
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
973
|
-
const matched = files.filter(
|
|
974
|
-
(filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes)
|
|
975
|
-
);
|
|
1174
|
+
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes));
|
|
976
1175
|
const found = await Promise.all(
|
|
977
1176
|
matched.map(async (absolutePath) => {
|
|
978
1177
|
const exports = await loadModuleExports(absolutePath);
|
|
@@ -987,11 +1186,26 @@ async function collectEvaluatorsFromFiles(config) {
|
|
|
987
1186
|
);
|
|
988
1187
|
return found.flat();
|
|
989
1188
|
}
|
|
990
|
-
async function
|
|
1189
|
+
async function collectRunConfigsFromFiles(config) {
|
|
991
1190
|
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
992
|
-
const matched = files.filter(
|
|
993
|
-
|
|
1191
|
+
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.runConfigSuffixes));
|
|
1192
|
+
const found = await Promise.all(
|
|
1193
|
+
matched.map(async (absolutePath) => {
|
|
1194
|
+
const exports = await loadModuleExports(absolutePath);
|
|
1195
|
+
const runConfigs = exports.filter(isRunConfigLike);
|
|
1196
|
+
const relPath = path.relative(config.rootDir, absolutePath);
|
|
1197
|
+
return runConfigs.map((runConfig) => ({
|
|
1198
|
+
id: runConfig.getName(),
|
|
1199
|
+
filePath: relPath,
|
|
1200
|
+
runConfig
|
|
1201
|
+
}));
|
|
1202
|
+
})
|
|
994
1203
|
);
|
|
1204
|
+
return found.flat();
|
|
1205
|
+
}
|
|
1206
|
+
async function collectTestCasesFromFiles(config) {
|
|
1207
|
+
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
1208
|
+
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.testCaseSuffixes));
|
|
995
1209
|
const found = await Promise.all(
|
|
996
1210
|
matched.map(async (absolutePath) => {
|
|
997
1211
|
const exports = await loadModuleExports(absolutePath);
|
|
@@ -1063,16 +1277,8 @@ function createDiffString(expected, actual, diffOptions) {
|
|
|
1063
1277
|
const expectedProcessed = preprocessForDiff(expected, diffOptions);
|
|
1064
1278
|
const actualProcessed = preprocessForDiff(actual, diffOptions);
|
|
1065
1279
|
if (diffOptions?.keysOnly) {
|
|
1066
|
-
const expectedKeys = JSON.stringify(
|
|
1067
|
-
|
|
1068
|
-
null,
|
|
1069
|
-
2
|
|
1070
|
-
);
|
|
1071
|
-
const actualKeys = JSON.stringify(
|
|
1072
|
-
extractKeys(actualProcessed),
|
|
1073
|
-
null,
|
|
1074
|
-
2
|
|
1075
|
-
);
|
|
1280
|
+
const expectedKeys = JSON.stringify(extractKeys(expectedProcessed), null, 2);
|
|
1281
|
+
const actualKeys = JSON.stringify(extractKeys(actualProcessed), null, 2);
|
|
1076
1282
|
const parts2 = diff.diffLines(expectedKeys, actualKeys);
|
|
1077
1283
|
return formatDiffParts(parts2);
|
|
1078
1284
|
}
|
|
@@ -1083,9 +1289,7 @@ function createDiffString(expected, actual, diffOptions) {
|
|
|
1083
1289
|
}
|
|
1084
1290
|
const parts = diff.diffLines(expectedStr, actualStr);
|
|
1085
1291
|
if (diffOptions?.outputNewOnly) {
|
|
1086
|
-
const filtered = parts.filter(
|
|
1087
|
-
(p) => p.added === true
|
|
1088
|
-
);
|
|
1292
|
+
const filtered = parts.filter((p) => p.added === true);
|
|
1089
1293
|
return formatDiffParts(filtered);
|
|
1090
1294
|
}
|
|
1091
1295
|
return formatDiffParts(parts);
|
|
@@ -1152,6 +1356,17 @@ function getDiffLines(entry) {
|
|
|
1152
1356
|
});
|
|
1153
1357
|
}
|
|
1154
1358
|
|
|
1359
|
+
// src/evals/test-case.ts
|
|
1360
|
+
function getTestCaseDisplayLabel(testCase) {
|
|
1361
|
+
if (typeof testCase.getDisplayLabel === "function") {
|
|
1362
|
+
return testCase.getDisplayLabel();
|
|
1363
|
+
}
|
|
1364
|
+
return typeof testCase.getName === "function" ? testCase.getName() : "";
|
|
1365
|
+
}
|
|
1366
|
+
function getTestCaseTagList(testCase) {
|
|
1367
|
+
return typeof testCase.getTags === "function" ? [...testCase.getTags()] : [];
|
|
1368
|
+
}
|
|
1369
|
+
|
|
1155
1370
|
// src/evals/metric.ts
|
|
1156
1371
|
var registry = /* @__PURE__ */ new Map();
|
|
1157
1372
|
var Metric = {
|
|
@@ -1175,25 +1390,70 @@ function getMetricById(id) {
|
|
|
1175
1390
|
return registry.get(id);
|
|
1176
1391
|
}
|
|
1177
1392
|
|
|
1178
|
-
// src/evals/
|
|
1179
|
-
|
|
1180
|
-
|
|
1181
|
-
|
|
1393
|
+
// src/evals/aggregators.ts
|
|
1394
|
+
function aggregateTokenCountSum(values) {
|
|
1395
|
+
const initial = {
|
|
1396
|
+
input: 0,
|
|
1397
|
+
output: 0,
|
|
1398
|
+
inputCached: 0,
|
|
1399
|
+
outputCached: 0
|
|
1400
|
+
};
|
|
1401
|
+
return values.reduce(
|
|
1402
|
+
(acc, v) => ({
|
|
1403
|
+
input: acc.input + (v.input ?? 0),
|
|
1404
|
+
output: acc.output + (v.output ?? 0),
|
|
1405
|
+
inputCached: acc.inputCached + (v.inputCached ?? 0),
|
|
1406
|
+
outputCached: acc.outputCached + (v.outputCached ?? 0)
|
|
1407
|
+
}),
|
|
1408
|
+
initial
|
|
1409
|
+
);
|
|
1182
1410
|
}
|
|
1183
|
-
|
|
1184
|
-
|
|
1185
|
-
|
|
1186
|
-
|
|
1187
|
-
|
|
1188
|
-
|
|
1189
|
-
|
|
1190
|
-
|
|
1191
|
-
|
|
1192
|
-
|
|
1193
|
-
|
|
1194
|
-
|
|
1195
|
-
|
|
1196
|
-
|
|
1411
|
+
function aggregateLatencyAverage(values) {
|
|
1412
|
+
if (values.length === 0) {
|
|
1413
|
+
return { ms: 0 };
|
|
1414
|
+
}
|
|
1415
|
+
const sum = values.reduce((s, v) => s + v.ms, 0);
|
|
1416
|
+
return { ms: sum / values.length };
|
|
1417
|
+
}
|
|
1418
|
+
|
|
1419
|
+
// src/evals/metrics/standard.ts
|
|
1420
|
+
Metric.of({
|
|
1421
|
+
id: "token-count",
|
|
1422
|
+
name: "Tokens",
|
|
1423
|
+
aggregate: aggregateTokenCountSum,
|
|
1424
|
+
format: (data, options) => {
|
|
1425
|
+
const input = data.input ?? 0;
|
|
1426
|
+
const output = data.output ?? 0;
|
|
1427
|
+
const inputCached = data.inputCached ?? 0;
|
|
1428
|
+
const outputCached = data.outputCached ?? 0;
|
|
1429
|
+
const cached = inputCached + outputCached;
|
|
1430
|
+
const base = `in:${input} out:${output} cached:${cached}`;
|
|
1431
|
+
return options?.isAggregated ? `Total: ${base}` : base;
|
|
1432
|
+
}
|
|
1433
|
+
});
|
|
1434
|
+
Metric.of({
|
|
1435
|
+
id: "latency",
|
|
1436
|
+
name: "Latency",
|
|
1437
|
+
aggregate: aggregateLatencyAverage,
|
|
1438
|
+
format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
|
|
1439
|
+
});
|
|
1440
|
+
|
|
1441
|
+
// src/evals/score.ts
|
|
1442
|
+
var registry2 = /* @__PURE__ */ new Map();
|
|
1443
|
+
function formatScoreData(def, data, options) {
|
|
1444
|
+
return options?.isAggregated ? def.formatAggregate(data) : def.formatValue(data);
|
|
1445
|
+
}
|
|
1446
|
+
var ScoreAggregate = {
|
|
1447
|
+
/** Average numeric fields. Use for scores like { value, delta }. */
|
|
1448
|
+
averageFields(fields) {
|
|
1449
|
+
return (values) => {
|
|
1450
|
+
const count = values.length || 1;
|
|
1451
|
+
const result = {};
|
|
1452
|
+
for (const field of fields) {
|
|
1453
|
+
result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
|
|
1454
|
+
}
|
|
1455
|
+
return result;
|
|
1456
|
+
};
|
|
1197
1457
|
},
|
|
1198
1458
|
/** Average selected numeric fields, with sample std dev tracked for `value`. */
|
|
1199
1459
|
averageWithVariance(fields) {
|
|
@@ -1224,13 +1484,10 @@ var ScoreAggregate = {
|
|
|
1224
1484
|
(s, v) => s + (v[valueField] ?? 0),
|
|
1225
1485
|
0
|
|
1226
1486
|
);
|
|
1227
|
-
const sumSq = values.reduce(
|
|
1228
|
-
|
|
1229
|
-
|
|
1230
|
-
|
|
1231
|
-
},
|
|
1232
|
-
0
|
|
1233
|
-
);
|
|
1487
|
+
const sumSq = values.reduce((s, v) => {
|
|
1488
|
+
const value = v[valueField] ?? 0;
|
|
1489
|
+
return s + value * value;
|
|
1490
|
+
}, 0);
|
|
1234
1491
|
const mean = sum / count;
|
|
1235
1492
|
const variance = (sumSq - count * mean * mean) / (count - 1);
|
|
1236
1493
|
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
@@ -1289,54 +1546,6 @@ function getScoreById(id) {
|
|
|
1289
1546
|
return registry2.get(id);
|
|
1290
1547
|
}
|
|
1291
1548
|
|
|
1292
|
-
// src/evals/aggregators.ts
|
|
1293
|
-
function aggregateTokenCountSum(values) {
|
|
1294
|
-
const initial = {
|
|
1295
|
-
input: 0,
|
|
1296
|
-
output: 0,
|
|
1297
|
-
inputCached: 0,
|
|
1298
|
-
outputCached: 0
|
|
1299
|
-
};
|
|
1300
|
-
return values.reduce(
|
|
1301
|
-
(acc, v) => ({
|
|
1302
|
-
input: acc.input + (v.input ?? 0),
|
|
1303
|
-
output: acc.output + (v.output ?? 0),
|
|
1304
|
-
inputCached: acc.inputCached + (v.inputCached ?? 0),
|
|
1305
|
-
outputCached: acc.outputCached + (v.outputCached ?? 0)
|
|
1306
|
-
}),
|
|
1307
|
-
initial
|
|
1308
|
-
);
|
|
1309
|
-
}
|
|
1310
|
-
function aggregateLatencyAverage(values) {
|
|
1311
|
-
if (values.length === 0) {
|
|
1312
|
-
return { ms: 0 };
|
|
1313
|
-
}
|
|
1314
|
-
const sum = values.reduce((s, v) => s + v.ms, 0);
|
|
1315
|
-
return { ms: sum / values.length };
|
|
1316
|
-
}
|
|
1317
|
-
|
|
1318
|
-
// src/evals/metrics/standard.ts
|
|
1319
|
-
Metric.of({
|
|
1320
|
-
id: "token-count",
|
|
1321
|
-
name: "Tokens",
|
|
1322
|
-
aggregate: aggregateTokenCountSum,
|
|
1323
|
-
format: (data, options) => {
|
|
1324
|
-
const input = data.input ?? 0;
|
|
1325
|
-
const output = data.output ?? 0;
|
|
1326
|
-
const inputCached = data.inputCached ?? 0;
|
|
1327
|
-
const outputCached = data.outputCached ?? 0;
|
|
1328
|
-
const cached = inputCached + outputCached;
|
|
1329
|
-
const base = `in:${input} out:${output} cached:${cached}`;
|
|
1330
|
-
return options?.isAggregated ? `Total: ${base}` : base;
|
|
1331
|
-
}
|
|
1332
|
-
});
|
|
1333
|
-
Metric.of({
|
|
1334
|
-
id: "latency",
|
|
1335
|
-
name: "Latency",
|
|
1336
|
-
aggregate: aggregateLatencyAverage,
|
|
1337
|
-
format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
|
|
1338
|
-
});
|
|
1339
|
-
|
|
1340
1549
|
// src/evals/scores/standard.ts
|
|
1341
1550
|
Score.of({
|
|
1342
1551
|
id: "percent",
|
|
@@ -1443,15 +1652,17 @@ function readOutput(testCase) {
|
|
|
1443
1652
|
}
|
|
1444
1653
|
return candidate.getOutput();
|
|
1445
1654
|
}
|
|
1446
|
-
function buildEvaluationUnits(testCases) {
|
|
1655
|
+
function buildEvaluationUnits(testCases, repetitionCount) {
|
|
1656
|
+
const count = Math.max(1, repetitionCount);
|
|
1447
1657
|
const units = [];
|
|
1448
1658
|
for (const testCaseItem of testCases) {
|
|
1449
|
-
const
|
|
1450
|
-
for (let r = 0; r <
|
|
1659
|
+
const repetitionId = `rep-${crypto.randomUUID()}`;
|
|
1660
|
+
for (let r = 0; r < count; r++) {
|
|
1451
1661
|
units.push({
|
|
1452
1662
|
testCaseItem,
|
|
1453
|
-
|
|
1454
|
-
|
|
1663
|
+
repetitionId,
|
|
1664
|
+
repetitionIndex: r + 1,
|
|
1665
|
+
repetitionCount: count
|
|
1455
1666
|
});
|
|
1456
1667
|
}
|
|
1457
1668
|
}
|
|
@@ -1461,29 +1672,24 @@ function nowIsoForFile() {
|
|
|
1461
1672
|
return (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
1462
1673
|
}
|
|
1463
1674
|
function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
1464
|
-
return path.join(
|
|
1465
|
-
artifactDirectory,
|
|
1466
|
-
`${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
|
|
1467
|
-
);
|
|
1675
|
+
return path.join(artifactDirectory, `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`);
|
|
1468
1676
|
}
|
|
1469
1677
|
function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
|
|
1470
|
-
const { testCaseItem,
|
|
1678
|
+
const { testCaseItem, repetitionId, repetitionIndex, repetitionCount } = unit;
|
|
1471
1679
|
return effect.Effect.gen(function* () {
|
|
1472
1680
|
const evaluatorRunId = `run-${crypto.randomUUID()}`;
|
|
1473
1681
|
const started = Date.now();
|
|
1474
|
-
const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [
|
|
1475
|
-
n + 1,
|
|
1476
|
-
n + 1
|
|
1477
|
-
]);
|
|
1682
|
+
const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [n + 1, n + 1]);
|
|
1478
1683
|
yield* publishEvent({
|
|
1479
1684
|
type: "TestCaseStarted",
|
|
1480
1685
|
runId: task.runId,
|
|
1481
1686
|
testCaseId: testCaseItem.id,
|
|
1482
|
-
testCaseName: testCaseItem.testCase
|
|
1687
|
+
testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
|
|
1483
1688
|
startedTestCases: startedEvaluations,
|
|
1484
1689
|
totalTestCases: totalEvaluations,
|
|
1485
|
-
|
|
1486
|
-
|
|
1690
|
+
repetitionId,
|
|
1691
|
+
repetitionIndex,
|
|
1692
|
+
repetitionCount
|
|
1487
1693
|
});
|
|
1488
1694
|
const evaluatorScores = [];
|
|
1489
1695
|
let testCaseError;
|
|
@@ -1507,9 +1713,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1507
1713
|
return error;
|
|
1508
1714
|
};
|
|
1509
1715
|
try {
|
|
1510
|
-
const ctx = yield* effect.Effect.promise(
|
|
1511
|
-
() => Promise.resolve(evaluator.resolveContext())
|
|
1512
|
-
);
|
|
1716
|
+
const ctx = yield* effect.Effect.promise(() => Promise.resolve(evaluator.resolveContext()));
|
|
1513
1717
|
const result = yield* effect.Effect.promise(
|
|
1514
1718
|
() => Promise.resolve().then(
|
|
1515
1719
|
() => evaluateFn({
|
|
@@ -1519,8 +1723,15 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1519
1723
|
meta: {
|
|
1520
1724
|
triggerId: task.triggerId,
|
|
1521
1725
|
runId: evaluatorRunId,
|
|
1522
|
-
datasetId: task.datasetId
|
|
1726
|
+
datasetId: task.datasetId,
|
|
1727
|
+
repetitionId,
|
|
1728
|
+
repetitionIndex,
|
|
1729
|
+
repetitionCount,
|
|
1730
|
+
runConfigName: task.runConfigName
|
|
1523
1731
|
},
|
|
1732
|
+
testCaseTags: getTestCaseTagList(testCaseItem.testCase),
|
|
1733
|
+
runConfigTags: task.runConfigTags,
|
|
1734
|
+
evaluatorTags: getEvaluatorTagList(evaluator),
|
|
1524
1735
|
logDiff,
|
|
1525
1736
|
log,
|
|
1526
1737
|
createError
|
|
@@ -1563,21 +1774,19 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1563
1774
|
});
|
|
1564
1775
|
}
|
|
1565
1776
|
}
|
|
1566
|
-
const
|
|
1567
|
-
const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [
|
|
1568
|
-
n + 1,
|
|
1569
|
-
n + 1
|
|
1570
|
-
]);
|
|
1777
|
+
const repetitionPassedThis = evaluatorScores.every((s) => s.passed);
|
|
1778
|
+
const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [n + 1, n + 1]);
|
|
1571
1779
|
const progressEvent = {
|
|
1572
1780
|
type: "TestCaseProgress",
|
|
1573
1781
|
runId: task.runId,
|
|
1574
1782
|
testCaseId: testCaseItem.id,
|
|
1575
|
-
testCaseName: testCaseItem.testCase
|
|
1783
|
+
testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
|
|
1576
1784
|
completedTestCases: completedEvaluations,
|
|
1577
1785
|
totalTestCases: totalEvaluations,
|
|
1578
|
-
|
|
1579
|
-
|
|
1580
|
-
|
|
1786
|
+
repetitionId,
|
|
1787
|
+
repetitionIndex,
|
|
1788
|
+
repetitionCount,
|
|
1789
|
+
passed: repetitionPassedThis,
|
|
1581
1790
|
durationMs: Date.now() - started,
|
|
1582
1791
|
evaluatorScores,
|
|
1583
1792
|
output,
|
|
@@ -1598,9 +1807,9 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1598
1807
|
(map) => {
|
|
1599
1808
|
const key = testCaseItem.id;
|
|
1600
1809
|
const existing = map.get(key) ?? { completedCount: 0, results: [] };
|
|
1601
|
-
const newResults = [...existing.results,
|
|
1810
|
+
const newResults = [...existing.results, repetitionPassedThis];
|
|
1602
1811
|
const newCompletedCount = existing.completedCount + 1;
|
|
1603
|
-
const isLast = newCompletedCount ===
|
|
1812
|
+
const isLast = newCompletedCount === repetitionCount;
|
|
1604
1813
|
const newMap = new Map(map);
|
|
1605
1814
|
newMap.set(key, {
|
|
1606
1815
|
completedCount: newCompletedCount,
|
|
@@ -1616,10 +1825,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1616
1825
|
} else {
|
|
1617
1826
|
yield* effect.Ref.update(failedRef, (n) => n + 1);
|
|
1618
1827
|
}
|
|
1619
|
-
const [passed, failed] = yield* effect.Effect.all([
|
|
1620
|
-
effect.Ref.get(passedRef),
|
|
1621
|
-
effect.Ref.get(failedRef)
|
|
1622
|
-
]);
|
|
1828
|
+
const [passed, failed] = yield* effect.Effect.all([effect.Ref.get(passedRef), effect.Ref.get(failedRef)]);
|
|
1623
1829
|
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
1624
1830
|
...snapshot,
|
|
1625
1831
|
passedTestCases: passed,
|
|
@@ -1640,10 +1846,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
1640
1846
|
runId: task.runId,
|
|
1641
1847
|
startedAt
|
|
1642
1848
|
});
|
|
1643
|
-
const totalEvaluations = task.testCases.
|
|
1644
|
-
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
1645
|
-
0
|
|
1646
|
-
);
|
|
1849
|
+
const totalEvaluations = task.testCases.length * Math.max(1, task.repetitions);
|
|
1647
1850
|
const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
|
|
1648
1851
|
const completedRef = yield* effect.Ref.make(0);
|
|
1649
1852
|
const startedRef = yield* effect.Ref.make(0);
|
|
@@ -1652,7 +1855,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
1652
1855
|
const testCaseResultsRef = yield* effect.Ref.make(
|
|
1653
1856
|
/* @__PURE__ */ new Map()
|
|
1654
1857
|
);
|
|
1655
|
-
const evaluationUnits = buildEvaluationUnits(task.testCases);
|
|
1858
|
+
const evaluationUnits = buildEvaluationUnits(task.testCases, task.repetitions);
|
|
1656
1859
|
const processEvaluation = (unit) => processOneEvaluation(
|
|
1657
1860
|
task,
|
|
1658
1861
|
unit,
|
|
@@ -1666,11 +1869,20 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
1666
1869
|
failedRef,
|
|
1667
1870
|
testCaseResultsRef
|
|
1668
1871
|
);
|
|
1669
|
-
|
|
1670
|
-
|
|
1671
|
-
|
|
1672
|
-
|
|
1673
|
-
|
|
1872
|
+
const globalSem = task.globalEvaluationSemaphore;
|
|
1873
|
+
if (globalSem !== void 0) {
|
|
1874
|
+
yield* effect.Effect.forEach(
|
|
1875
|
+
evaluationUnits,
|
|
1876
|
+
(unit) => globalSem.withPermits(1)(processEvaluation(unit)),
|
|
1877
|
+
{ concurrency: "unbounded", discard: true }
|
|
1878
|
+
);
|
|
1879
|
+
} else {
|
|
1880
|
+
yield* effect.Effect.forEach(
|
|
1881
|
+
evaluationUnits,
|
|
1882
|
+
processEvaluation,
|
|
1883
|
+
maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
|
|
1884
|
+
);
|
|
1885
|
+
}
|
|
1674
1886
|
const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* effect.Effect.all([
|
|
1675
1887
|
effect.Ref.get(completedRef),
|
|
1676
1888
|
effect.Ref.get(passedRef),
|
|
@@ -1706,155 +1918,34 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
1706
1918
|
artifactPath: task.snapshot.artifactPath
|
|
1707
1919
|
});
|
|
1708
1920
|
});
|
|
1709
|
-
|
|
1710
|
-
|
|
1711
|
-
|
|
1712
|
-
|
|
1713
|
-
|
|
1714
|
-
} catch {
|
|
1715
|
-
return [];
|
|
1716
|
-
}
|
|
1717
|
-
const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
|
|
1718
|
-
const snapshots = [];
|
|
1719
|
-
for (const fileName of jsonlFiles) {
|
|
1720
|
-
const filePath = path.join(baseDir, fileName);
|
|
1721
|
-
try {
|
|
1722
|
-
const snapshot = await parseArtifactToSnapshot(filePath, config);
|
|
1723
|
-
if (snapshot) {
|
|
1724
|
-
snapshots.push(snapshot);
|
|
1725
|
-
}
|
|
1726
|
-
} catch {
|
|
1727
|
-
}
|
|
1728
|
-
}
|
|
1729
|
-
return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
|
|
1730
|
-
}
|
|
1731
|
-
async function parseArtifactToSnapshot(filePath, _config) {
|
|
1732
|
-
const content = await promises.readFile(filePath, "utf8");
|
|
1733
|
-
const lines = content.split("\n").filter((line) => line.trim().length > 0);
|
|
1734
|
-
if (lines.length === 0) {
|
|
1735
|
-
return null;
|
|
1736
|
-
}
|
|
1737
|
-
let runQueued = null;
|
|
1738
|
-
let runCompleted = null;
|
|
1739
|
-
let runFailed = null;
|
|
1740
|
-
let runStarted = null;
|
|
1741
|
-
for (const line of lines) {
|
|
1742
|
-
try {
|
|
1743
|
-
const event = JSON.parse(line);
|
|
1744
|
-
const type = event.type;
|
|
1745
|
-
if (type === "RunQueued") {
|
|
1746
|
-
runQueued = {
|
|
1747
|
-
runId: event.runId,
|
|
1748
|
-
datasetId: event.datasetId,
|
|
1749
|
-
datasetName: event.datasetName,
|
|
1750
|
-
evaluatorIds: event.evaluatorIds,
|
|
1751
|
-
totalTestCases: event.totalTestCases ?? 0,
|
|
1752
|
-
artifactPath: event.artifactPath ?? filePath,
|
|
1753
|
-
ts: event.ts
|
|
1754
|
-
};
|
|
1755
|
-
}
|
|
1756
|
-
if (type === "RunStarted") {
|
|
1757
|
-
runStarted = { startedAt: event.startedAt };
|
|
1758
|
-
}
|
|
1759
|
-
if (type === "RunCompleted") {
|
|
1760
|
-
runCompleted = {
|
|
1761
|
-
passedTestCases: event.passedTestCases,
|
|
1762
|
-
failedTestCases: event.failedTestCases,
|
|
1763
|
-
totalTestCases: event.totalTestCases,
|
|
1764
|
-
finishedAt: event.finishedAt
|
|
1765
|
-
};
|
|
1766
|
-
}
|
|
1767
|
-
if (type === "RunFailed") {
|
|
1768
|
-
runFailed = {
|
|
1769
|
-
finishedAt: event.finishedAt,
|
|
1770
|
-
errorMessage: event.errorMessage
|
|
1771
|
-
};
|
|
1772
|
-
}
|
|
1773
|
-
} catch {
|
|
1774
|
-
}
|
|
1921
|
+
|
|
1922
|
+
// src/runner/name-pattern.ts
|
|
1923
|
+
function parseRegexLiteral(pattern) {
|
|
1924
|
+
if (!pattern.startsWith("/")) {
|
|
1925
|
+
return void 0;
|
|
1775
1926
|
}
|
|
1776
|
-
|
|
1777
|
-
|
|
1927
|
+
const lastSlash = pattern.lastIndexOf("/");
|
|
1928
|
+
if (lastSlash <= 0) {
|
|
1929
|
+
return void 0;
|
|
1778
1930
|
}
|
|
1779
|
-
const artifactPath = filePath;
|
|
1780
|
-
const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
|
|
1781
|
-
const progress = aggregateTestCaseProgress(lines);
|
|
1782
|
-
const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
|
|
1783
|
-
const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
|
|
1784
|
-
const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
|
|
1785
1931
|
return {
|
|
1786
|
-
|
|
1787
|
-
|
|
1788
|
-
datasetName: runQueued.datasetName,
|
|
1789
|
-
evaluatorIds: runQueued.evaluatorIds,
|
|
1790
|
-
queuedAt: runQueued.ts ?? 0,
|
|
1791
|
-
startedAt: runStarted?.startedAt,
|
|
1792
|
-
finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
|
|
1793
|
-
totalTestCases: runQueued.totalTestCases,
|
|
1794
|
-
completedTestCases,
|
|
1795
|
-
passedTestCases,
|
|
1796
|
-
failedTestCases,
|
|
1797
|
-
status,
|
|
1798
|
-
artifactPath,
|
|
1799
|
-
errorMessage: runFailed?.errorMessage
|
|
1932
|
+
source: pattern.slice(1, lastSlash),
|
|
1933
|
+
flags: pattern.slice(lastSlash + 1)
|
|
1800
1934
|
};
|
|
1801
1935
|
}
|
|
1802
|
-
function
|
|
1803
|
-
|
|
1804
|
-
const
|
|
1805
|
-
|
|
1806
|
-
|
|
1807
|
-
|
|
1808
|
-
if (event.type === "TestCaseProgress") {
|
|
1809
|
-
const ev = event;
|
|
1810
|
-
completedTestCases = ev.completedTestCases ?? completedTestCases;
|
|
1811
|
-
const id = ev.testCaseId;
|
|
1812
|
-
const current = testCasePassedBy.get(id);
|
|
1813
|
-
testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
|
|
1814
|
-
}
|
|
1815
|
-
} catch {
|
|
1816
|
-
}
|
|
1817
|
-
}
|
|
1818
|
-
let passedTestCases = 0;
|
|
1819
|
-
let failedTestCases = 0;
|
|
1820
|
-
for (const passed of testCasePassedBy.values()) {
|
|
1821
|
-
if (passed) {
|
|
1822
|
-
passedTestCases += 1;
|
|
1823
|
-
} else {
|
|
1824
|
-
failedTestCases += 1;
|
|
1825
|
-
}
|
|
1936
|
+
function createNameMatcher(pattern) {
|
|
1937
|
+
const normalizedPattern = pattern.trim();
|
|
1938
|
+
const regexLiteral = parseRegexLiteral(normalizedPattern);
|
|
1939
|
+
if (regexLiteral) {
|
|
1940
|
+
const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
|
|
1941
|
+
return (value) => regex.test(value);
|
|
1826
1942
|
}
|
|
1827
|
-
|
|
1828
|
-
}
|
|
1829
|
-
|
|
1830
|
-
|
|
1831
|
-
const content = await promises.readFile(artifactPath, "utf8");
|
|
1832
|
-
const lines = content.split("\n").filter((line) => line.trim().length > 0);
|
|
1833
|
-
const results = [];
|
|
1834
|
-
for (const line of lines) {
|
|
1835
|
-
try {
|
|
1836
|
-
const event = JSON.parse(line);
|
|
1837
|
-
if (event.type === "TestCaseProgress") {
|
|
1838
|
-
const ev = event;
|
|
1839
|
-
results.push({
|
|
1840
|
-
testCaseId: ev.testCaseId,
|
|
1841
|
-
testCaseName: ev.testCaseName,
|
|
1842
|
-
completedTestCases: ev.completedTestCases,
|
|
1843
|
-
totalTestCases: ev.totalTestCases,
|
|
1844
|
-
rerunIndex: ev.rerunIndex,
|
|
1845
|
-
rerunTotal: ev.rerunTotal,
|
|
1846
|
-
passed: ev.passed,
|
|
1847
|
-
durationMs: ev.durationMs,
|
|
1848
|
-
evaluatorScores: ev.evaluatorScores ?? []
|
|
1849
|
-
});
|
|
1850
|
-
}
|
|
1851
|
-
} catch {
|
|
1852
|
-
}
|
|
1853
|
-
}
|
|
1854
|
-
return results;
|
|
1855
|
-
} catch {
|
|
1856
|
-
return [];
|
|
1943
|
+
if (normalizedPattern.includes("*")) {
|
|
1944
|
+
const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
|
|
1945
|
+
const regex = new RegExp(`^${escaped}$`, "i");
|
|
1946
|
+
return (value) => regex.test(value);
|
|
1857
1947
|
}
|
|
1948
|
+
return (value) => value.toLowerCase() === normalizedPattern.toLowerCase();
|
|
1858
1949
|
}
|
|
1859
1950
|
async function appendJsonLine(artifactPath, payload) {
|
|
1860
1951
|
await promises.mkdir(path.dirname(artifactPath), { recursive: true });
|
|
@@ -1913,32 +2004,12 @@ function searchCollectedTestCases(all, query) {
|
|
|
1913
2004
|
}
|
|
1914
2005
|
|
|
1915
2006
|
// src/runner/api.ts
|
|
1916
|
-
function
|
|
1917
|
-
|
|
1918
|
-
|
|
1919
|
-
|
|
1920
|
-
const lastSlash = pattern.lastIndexOf("/");
|
|
1921
|
-
if (lastSlash <= 0) {
|
|
1922
|
-
return void 0;
|
|
2007
|
+
function normalizeRunRepetitions(value) {
|
|
2008
|
+
const n = value ?? 1;
|
|
2009
|
+
if (!Number.isInteger(n) || n < 1) {
|
|
2010
|
+
throw new Error(`repetitions must be a positive integer, got ${String(value)}`);
|
|
1923
2011
|
}
|
|
1924
|
-
return
|
|
1925
|
-
source: pattern.slice(1, lastSlash),
|
|
1926
|
-
flags: pattern.slice(lastSlash + 1)
|
|
1927
|
-
};
|
|
1928
|
-
}
|
|
1929
|
-
function createNameMatcher(pattern) {
|
|
1930
|
-
const normalizedPattern = pattern.trim();
|
|
1931
|
-
const regexLiteral = parseRegexLiteral(normalizedPattern);
|
|
1932
|
-
if (regexLiteral) {
|
|
1933
|
-
const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
|
|
1934
|
-
return (value) => regex.test(value);
|
|
1935
|
-
}
|
|
1936
|
-
if (normalizedPattern.includes("*")) {
|
|
1937
|
-
const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
|
|
1938
|
-
const regex = new RegExp(`^${escaped}$`, "i");
|
|
1939
|
-
return (value) => regex.test(value);
|
|
1940
|
-
}
|
|
1941
|
-
return (value) => value.toLowerCase() === normalizedPattern.toLowerCase();
|
|
2012
|
+
return n;
|
|
1942
2013
|
}
|
|
1943
2014
|
function mergeRunnerOverrides(base, next) {
|
|
1944
2015
|
if (!base) {
|
|
@@ -1969,15 +2040,12 @@ var EffectRunner = class {
|
|
|
1969
2040
|
this.persistenceQueue = effect.Effect.runSync(
|
|
1970
2041
|
effect.Queue.unbounded()
|
|
1971
2042
|
);
|
|
1972
|
-
this.snapshotsRef = effect.Effect.runSync(
|
|
1973
|
-
effect.Ref.make(/* @__PURE__ */ new Map())
|
|
1974
|
-
);
|
|
2043
|
+
this.snapshotsRef = effect.Effect.runSync(effect.Ref.make(/* @__PURE__ */ new Map()));
|
|
1975
2044
|
this.listeners = /* @__PURE__ */ new Set();
|
|
1976
2045
|
this.datasetsById = /* @__PURE__ */ new Map();
|
|
1977
2046
|
this.evaluatorsById = /* @__PURE__ */ new Map();
|
|
1978
|
-
this.
|
|
1979
|
-
|
|
1980
|
-
);
|
|
2047
|
+
this.runConfigsById = /* @__PURE__ */ new Map();
|
|
2048
|
+
this.schedulerFiber = effect.Effect.runFork(this.createSchedulerEffect());
|
|
1981
2049
|
this.persistenceFiber = effect.Effect.runFork(
|
|
1982
2050
|
createPersistenceWorker(this.persistenceQueue)
|
|
1983
2051
|
);
|
|
@@ -2017,6 +2085,137 @@ var EffectRunner = class {
|
|
|
2017
2085
|
(item) => matcher(item.evaluator.getName() ?? "")
|
|
2018
2086
|
);
|
|
2019
2087
|
}
|
|
2088
|
+
async collectRunConfigs() {
|
|
2089
|
+
const runConfigs = await collectRunConfigsFromFiles(this.config.discovery);
|
|
2090
|
+
this.runConfigsById.clear();
|
|
2091
|
+
const byNameLower = /* @__PURE__ */ new Map();
|
|
2092
|
+
for (const item of runConfigs) {
|
|
2093
|
+
const id = item.runConfig.getName();
|
|
2094
|
+
const lower = id.toLowerCase();
|
|
2095
|
+
const prev = byNameLower.get(lower);
|
|
2096
|
+
if (prev !== void 0 && prev.filePath !== item.filePath) {
|
|
2097
|
+
throw new Error(
|
|
2098
|
+
`Duplicate RunConfig name "${id}" (matches "${prev.runConfig.getName()}" case-insensitively): ${prev.filePath} and ${item.filePath}`
|
|
2099
|
+
);
|
|
2100
|
+
}
|
|
2101
|
+
byNameLower.set(lower, item);
|
|
2102
|
+
this.runConfigsById.set(id, item);
|
|
2103
|
+
}
|
|
2104
|
+
return runConfigs;
|
|
2105
|
+
}
|
|
2106
|
+
async resolveRunConfigByName(name) {
|
|
2107
|
+
if (this.runConfigsById.size === 0) {
|
|
2108
|
+
await this.collectRunConfigs();
|
|
2109
|
+
}
|
|
2110
|
+
const key = validateRunConfigName(name, `RunConfig "${name.trim()}"`);
|
|
2111
|
+
const keyLower = key.toLowerCase();
|
|
2112
|
+
const matches = Array.from(this.runConfigsById.values()).filter(
|
|
2113
|
+
(item) => item.runConfig.getName().toLowerCase() === keyLower
|
|
2114
|
+
);
|
|
2115
|
+
if (matches.length === 0) {
|
|
2116
|
+
return void 0;
|
|
2117
|
+
}
|
|
2118
|
+
if (matches.length > 1) {
|
|
2119
|
+
throw new Error(
|
|
2120
|
+
`Multiple RunConfigs named "${name}": ${matches.map((m) => m.filePath).join(", ")}`
|
|
2121
|
+
);
|
|
2122
|
+
}
|
|
2123
|
+
return matches[0];
|
|
2124
|
+
}
|
|
2125
|
+
async expandRunConfigToJobs(collected) {
|
|
2126
|
+
if (this.datasetsById.size === 0) {
|
|
2127
|
+
await this.collectDatasets();
|
|
2128
|
+
}
|
|
2129
|
+
if (this.evaluatorsById.size === 0) {
|
|
2130
|
+
await this.collectEvaluators();
|
|
2131
|
+
}
|
|
2132
|
+
const rcName = collected.runConfig.getName();
|
|
2133
|
+
const jobs = [];
|
|
2134
|
+
const runs = collected.runConfig.getRuns();
|
|
2135
|
+
for (const [i, row] of runs.entries()) {
|
|
2136
|
+
const dsCollected = Array.from(this.datasetsById.values()).find(
|
|
2137
|
+
(d) => d.dataset === row.dataset
|
|
2138
|
+
);
|
|
2139
|
+
if (!dsCollected) {
|
|
2140
|
+
throw new Error(
|
|
2141
|
+
`RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getName()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
|
|
2142
|
+
);
|
|
2143
|
+
}
|
|
2144
|
+
let evaluatorIds;
|
|
2145
|
+
if ("evaluatorPattern" in row && typeof row.evaluatorPattern === "string") {
|
|
2146
|
+
const matcher = createNameMatcher(row.evaluatorPattern);
|
|
2147
|
+
const matched = Array.from(this.evaluatorsById.values()).filter(
|
|
2148
|
+
(item) => matcher(item.evaluator.getName() ?? "")
|
|
2149
|
+
);
|
|
2150
|
+
if (matched.length === 0) {
|
|
2151
|
+
throw new Error(
|
|
2152
|
+
`RunConfig "${rcName}" run[${i}]: no evaluator matched pattern "${row.evaluatorPattern}"`
|
|
2153
|
+
);
|
|
2154
|
+
}
|
|
2155
|
+
evaluatorIds = matched.map((item) => item.id);
|
|
2156
|
+
} else {
|
|
2157
|
+
const evaluators = row.evaluators;
|
|
2158
|
+
evaluatorIds = [];
|
|
2159
|
+
for (const ev of evaluators) {
|
|
2160
|
+
const found = Array.from(this.evaluatorsById.values()).find(
|
|
2161
|
+
(item) => item.evaluator === ev
|
|
2162
|
+
);
|
|
2163
|
+
if (!found) {
|
|
2164
|
+
throw new Error(
|
|
2165
|
+
`RunConfig "${rcName}" run[${i}]: evaluator "${getEvaluatorDisplayLabel(ev) ?? "unknown"}" was not found among discovered evaluator exports`
|
|
2166
|
+
);
|
|
2167
|
+
}
|
|
2168
|
+
evaluatorIds.push(found.id);
|
|
2169
|
+
}
|
|
2170
|
+
}
|
|
2171
|
+
const repetitions = "repetitions" in row && row.repetitions !== void 0 ? row.repetitions : 1;
|
|
2172
|
+
jobs.push({
|
|
2173
|
+
datasetId: dsCollected.id,
|
|
2174
|
+
evaluatorIds,
|
|
2175
|
+
runConfigName: rcName,
|
|
2176
|
+
runConfigDisplayLabel: collected.runConfig.getDisplayLabel(),
|
|
2177
|
+
runConfigTags: collected.runConfig.getTags(),
|
|
2178
|
+
repetitions
|
|
2179
|
+
});
|
|
2180
|
+
}
|
|
2181
|
+
return jobs;
|
|
2182
|
+
}
|
|
2183
|
+
async expandRunConfigNamesToJobs(names) {
|
|
2184
|
+
const jobs = [];
|
|
2185
|
+
for (const name of names) {
|
|
2186
|
+
const collected = await this.resolveRunConfigByName(name);
|
|
2187
|
+
if (!collected) {
|
|
2188
|
+
const known = await this.collectRunConfigs();
|
|
2189
|
+
const available = known.map((r) => r.runConfig.getName()).sort();
|
|
2190
|
+
throw new Error(
|
|
2191
|
+
available.length > 0 ? `RunConfig "${name}" not found. Available RunConfigs: ${available.join(", ")}` : `RunConfig "${name}" not found and no RunConfigs were discovered.`
|
|
2192
|
+
);
|
|
2193
|
+
}
|
|
2194
|
+
jobs.push(...await this.expandRunConfigToJobs(collected));
|
|
2195
|
+
}
|
|
2196
|
+
return jobs;
|
|
2197
|
+
}
|
|
2198
|
+
async runDatasetJobsWithSharedConcurrency(request) {
|
|
2199
|
+
const globalConcurrency = Math.max(1, request.globalConcurrency);
|
|
2200
|
+
const sem = effect.Effect.unsafeMakeSemaphore(globalConcurrency);
|
|
2201
|
+
const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
|
|
2202
|
+
const snapshots = [];
|
|
2203
|
+
for (const job of request.jobs) {
|
|
2204
|
+
snapshots.push(
|
|
2205
|
+
await this.startDatasetRun({
|
|
2206
|
+
datasetId: job.datasetId,
|
|
2207
|
+
evaluatorIds: job.evaluatorIds,
|
|
2208
|
+
triggerId,
|
|
2209
|
+
maxConcurrency: this.config.maxConcurrency ?? 1,
|
|
2210
|
+
globalEvaluationSemaphore: sem,
|
|
2211
|
+
runConfigName: job.runConfigName,
|
|
2212
|
+
runConfigTags: job.runConfigTags,
|
|
2213
|
+
repetitions: job.repetitions
|
|
2214
|
+
})
|
|
2215
|
+
);
|
|
2216
|
+
}
|
|
2217
|
+
return snapshots;
|
|
2218
|
+
}
|
|
2020
2219
|
async searchTestCases(query) {
|
|
2021
2220
|
const testCases = await collectTestCasesFromFiles(this.config.discovery);
|
|
2022
2221
|
return searchCollectedTestCases(testCases, query);
|
|
@@ -2035,35 +2234,45 @@ var EffectRunner = class {
|
|
|
2035
2234
|
);
|
|
2036
2235
|
}
|
|
2037
2236
|
async runDatasetWith(request) {
|
|
2237
|
+
const runConfigName = validateRunConfigName(
|
|
2238
|
+
request.runConfigName,
|
|
2239
|
+
"runDatasetWith.runConfigName"
|
|
2240
|
+
);
|
|
2241
|
+
return this.startDatasetRun({
|
|
2242
|
+
datasetId: request.datasetId,
|
|
2243
|
+
evaluatorIds: request.evaluatorIds,
|
|
2244
|
+
triggerId: request.triggerId,
|
|
2245
|
+
maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
|
|
2246
|
+
repetitions: request.repetitions,
|
|
2247
|
+
runConfigName,
|
|
2248
|
+
runConfigTags: request.runConfigTags
|
|
2249
|
+
});
|
|
2250
|
+
}
|
|
2251
|
+
async startDatasetRun(params) {
|
|
2038
2252
|
if (this.datasetsById.size === 0) {
|
|
2039
2253
|
await this.collectDatasets();
|
|
2040
2254
|
}
|
|
2041
2255
|
if (this.evaluatorsById.size === 0) {
|
|
2042
2256
|
await this.collectEvaluators();
|
|
2043
2257
|
}
|
|
2044
|
-
const dataset = this.datasetsById.get(
|
|
2258
|
+
const dataset = this.datasetsById.get(params.datasetId);
|
|
2045
2259
|
if (!dataset) {
|
|
2046
|
-
throw new Error(`Unknown dataset: ${
|
|
2260
|
+
throw new Error(`Unknown dataset: ${params.datasetId}`);
|
|
2047
2261
|
}
|
|
2048
|
-
const selectedEvaluators =
|
|
2262
|
+
const selectedEvaluators = params.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
|
|
2049
2263
|
if (selectedEvaluators.length === 0) {
|
|
2050
2264
|
throw new Error("No evaluators selected for run");
|
|
2051
2265
|
}
|
|
2052
|
-
const selectedTestCases = await this.collectDatasetTestCases(
|
|
2053
|
-
const
|
|
2054
|
-
|
|
2055
|
-
|
|
2056
|
-
)
|
|
2057
|
-
const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
|
|
2266
|
+
const selectedTestCases = await this.collectDatasetTestCases(params.datasetId);
|
|
2267
|
+
const repetitions = normalizeRunRepetitions(params.repetitions);
|
|
2268
|
+
const totalEvaluations = selectedTestCases.length * repetitions;
|
|
2269
|
+
const runConfigTags = [...params.runConfigTags ?? []];
|
|
2270
|
+
const triggerId = params.triggerId ?? `trg-${crypto.randomUUID()}`;
|
|
2058
2271
|
const runId = `run-${crypto.randomUUID()}`;
|
|
2059
|
-
const artifactPath = createArtifactPath(
|
|
2060
|
-
this.config.artifactDirectory,
|
|
2061
|
-
request.datasetId,
|
|
2062
|
-
runId
|
|
2063
|
-
);
|
|
2272
|
+
const artifactPath = createArtifactPath(this.config.artifactDirectory, params.datasetId, runId);
|
|
2064
2273
|
const snapshot = {
|
|
2065
2274
|
runId,
|
|
2066
|
-
datasetId:
|
|
2275
|
+
datasetId: params.datasetId,
|
|
2067
2276
|
datasetName: dataset.dataset.getName(),
|
|
2068
2277
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
2069
2278
|
queuedAt: Date.now(),
|
|
@@ -2084,7 +2293,7 @@ var EffectRunner = class {
|
|
|
2084
2293
|
const queuedEvent = {
|
|
2085
2294
|
type: "RunQueued",
|
|
2086
2295
|
runId,
|
|
2087
|
-
datasetId:
|
|
2296
|
+
datasetId: params.datasetId,
|
|
2088
2297
|
datasetName: dataset.dataset.getName(),
|
|
2089
2298
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
2090
2299
|
totalTestCases: totalEvaluations,
|
|
@@ -2098,17 +2307,20 @@ var EffectRunner = class {
|
|
|
2098
2307
|
payload: queuedEvent
|
|
2099
2308
|
})
|
|
2100
2309
|
);
|
|
2101
|
-
const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
|
|
2102
2310
|
await effect.Effect.runPromise(
|
|
2103
2311
|
effect.Queue.offer(this.runQueue, {
|
|
2104
2312
|
runId,
|
|
2105
2313
|
triggerId,
|
|
2106
|
-
datasetId:
|
|
2314
|
+
datasetId: params.datasetId,
|
|
2107
2315
|
dataset: dataset.dataset,
|
|
2108
2316
|
evaluators: selectedEvaluators,
|
|
2109
2317
|
testCases: selectedTestCases,
|
|
2110
2318
|
snapshot,
|
|
2111
|
-
maxConcurrency
|
|
2319
|
+
maxConcurrency: params.maxConcurrency,
|
|
2320
|
+
globalEvaluationSemaphore: params.globalEvaluationSemaphore,
|
|
2321
|
+
runConfigName: params.runConfigName,
|
|
2322
|
+
runConfigTags,
|
|
2323
|
+
repetitions
|
|
2112
2324
|
})
|
|
2113
2325
|
);
|
|
2114
2326
|
return snapshot;
|
|
@@ -2124,9 +2336,9 @@ var EffectRunner = class {
|
|
|
2124
2336
|
return effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).get(runId);
|
|
2125
2337
|
}
|
|
2126
2338
|
getAllRunSnapshots() {
|
|
2127
|
-
return Array.from(
|
|
2128
|
-
|
|
2129
|
-
)
|
|
2339
|
+
return Array.from(effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).values()).sort(
|
|
2340
|
+
(a, b) => b.queuedAt - a.queuedAt
|
|
2341
|
+
);
|
|
2130
2342
|
}
|
|
2131
2343
|
async loadRunSnapshotsFromArtifacts() {
|
|
2132
2344
|
return loadRunSnapshotsFromArtifacts(this.config);
|
|
@@ -2179,6 +2391,11 @@ var EffectRunner = class {
|
|
|
2179
2391
|
);
|
|
2180
2392
|
}
|
|
2181
2393
|
};
|
|
2394
|
+
|
|
2395
|
+
// src/runner/events.ts
|
|
2396
|
+
var PROGRAMMATIC_RUN_CONFIG = {
|
|
2397
|
+
runConfigName: "programmatic"
|
|
2398
|
+
};
|
|
2182
2399
|
var LEFT_PANE_WIDTH2 = 44;
|
|
2183
2400
|
var MAX_RUNS_FOR_CHART = 12;
|
|
2184
2401
|
var MAX_RUNS_FOR_TREND = 20;
|
|
@@ -2238,9 +2455,9 @@ function DatasetsView({
|
|
|
2238
2455
|
}) {
|
|
2239
2456
|
const leftFocused = state.focus === "left";
|
|
2240
2457
|
const rightFocused = state.focus === "right";
|
|
2241
|
-
const [runScores, setRunScores] =
|
|
2242
|
-
const [loading, setLoading] =
|
|
2243
|
-
|
|
2458
|
+
const [runScores, setRunScores] = React.useState([]);
|
|
2459
|
+
const [loading, setLoading] = React.useState(false);
|
|
2460
|
+
React.useEffect(() => {
|
|
2244
2461
|
if (!selectedDataset?.runs?.length) {
|
|
2245
2462
|
setRunScores([]);
|
|
2246
2463
|
return;
|
|
@@ -2252,7 +2469,7 @@ function DatasetsView({
|
|
|
2252
2469
|
const barData = runScores.slice(0, MAX_RUNS_FOR_CHART).reverse();
|
|
2253
2470
|
const trendValues = runScores.slice(0, MAX_RUNS_FOR_TREND).map((r) => r.value).reverse();
|
|
2254
2471
|
const trendBatched = batchAverage(trendValues, TREND_BATCH_SIZE);
|
|
2255
|
-
const overviewRows =
|
|
2472
|
+
const overviewRows = React.useMemo(() => {
|
|
2256
2473
|
const rows = [];
|
|
2257
2474
|
rows.push(
|
|
2258
2475
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: selectedDataset?.overview ?? "Select a dataset to inspect prior runs." }, "overview")
|
|
@@ -2342,11 +2559,7 @@ function DatasetsView({
|
|
|
2342
2559
|
] })
|
|
2343
2560
|
] });
|
|
2344
2561
|
}
|
|
2345
|
-
function RunsView({
|
|
2346
|
-
state,
|
|
2347
|
-
dataset,
|
|
2348
|
-
selectedRun
|
|
2349
|
-
}) {
|
|
2562
|
+
function RunsView({ state, dataset, selectedRun }) {
|
|
2350
2563
|
const runs = dataset?.runs ?? [];
|
|
2351
2564
|
const rightFocused = state.focus === "right";
|
|
2352
2565
|
return /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
|
|
@@ -2362,10 +2575,10 @@ function RunsView({
|
|
|
2362
2575
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
2363
2576
|
"Commit: ",
|
|
2364
2577
|
selectedRun.meta.commit,
|
|
2365
|
-
"
|
|
2578
|
+
" Branch: ",
|
|
2366
2579
|
selectedRun.meta.branch,
|
|
2580
|
+
" Seed:",
|
|
2367
2581
|
" ",
|
|
2368
|
-
"Seed: ",
|
|
2369
2582
|
selectedRun.meta.seed
|
|
2370
2583
|
] }),
|
|
2371
2584
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }),
|
|
@@ -2378,23 +2591,10 @@ function RunsView({
|
|
|
2378
2591
|
format: (v) => `${v}%`
|
|
2379
2592
|
}
|
|
2380
2593
|
),
|
|
2381
|
-
/* @__PURE__ */ jsxRuntime.jsx(
|
|
2382
|
-
TextBar,
|
|
2383
|
-
{
|
|
2384
|
-
label: "avg score",
|
|
2385
|
-
value: Math.round(selectedRun.performance.avgScore * 100)
|
|
2386
|
-
}
|
|
2387
|
-
),
|
|
2594
|
+
/* @__PURE__ */ jsxRuntime.jsx(TextBar, { label: "avg score", value: Math.round(selectedRun.performance.avgScore * 100) }),
|
|
2388
2595
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }),
|
|
2389
2596
|
/* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Dimensions" }),
|
|
2390
|
-
selectedRun.dimensions.map((dimension) => /* @__PURE__ */ jsxRuntime.jsx(
|
|
2391
|
-
TextBar,
|
|
2392
|
-
{
|
|
2393
|
-
label: dimension.name,
|
|
2394
|
-
value: dimension.score
|
|
2395
|
-
},
|
|
2396
|
-
dimension.name
|
|
2397
|
-
)),
|
|
2597
|
+
selectedRun.dimensions.map((dimension) => /* @__PURE__ */ jsxRuntime.jsx(TextBar, { label: dimension.name, value: dimension.score }, dimension.name)),
|
|
2398
2598
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }),
|
|
2399
2599
|
/* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Latency trend" }),
|
|
2400
2600
|
/* @__PURE__ */ jsxRuntime.jsx(
|
|
@@ -2497,15 +2697,7 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
2497
2697
|
...dimensions.map((d) => /* @__PURE__ */ jsxRuntime.jsx(TextBar, { label: d.name, value: d.score }, `dim-${d.name}`)),
|
|
2498
2698
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }, "sp2"),
|
|
2499
2699
|
/* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Checks (boolean)" }, "checks-h"),
|
|
2500
|
-
...checks.map((c) => /* @__PURE__ */ jsxRuntime.jsx(
|
|
2501
|
-
CheckRow,
|
|
2502
|
-
{
|
|
2503
|
-
name: c.name,
|
|
2504
|
-
passed: c.passed,
|
|
2505
|
-
detail: c.detail
|
|
2506
|
-
},
|
|
2507
|
-
`chk-${c.name}`
|
|
2508
|
-
)),
|
|
2700
|
+
...checks.map((c) => /* @__PURE__ */ jsxRuntime.jsx(CheckRow, { name: c.name, passed: c.passed, detail: c.detail }, `chk-${c.name}`)),
|
|
2509
2701
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }, "sp3"),
|
|
2510
2702
|
/* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Performance" }, "perf-h"),
|
|
2511
2703
|
/* @__PURE__ */ jsxRuntime.jsx(
|
|
@@ -2551,7 +2743,7 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
2551
2743
|
rows.push(/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }, "sp6"));
|
|
2552
2744
|
rows.push(/* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Test cases" }, "tc-h"));
|
|
2553
2745
|
for (const tc of testCases) {
|
|
2554
|
-
const
|
|
2746
|
+
const repetitionPart = tc.repetitionCount != null && tc.repetitionCount > 1 && tc.repetitionIndex != null ? ` (${tc.repetitionIndex}/${tc.repetitionCount})` : "";
|
|
2555
2747
|
rows.push(
|
|
2556
2748
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
2557
2749
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", children: [
|
|
@@ -2563,13 +2755,13 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
2563
2755
|
] }),
|
|
2564
2756
|
" ",
|
|
2565
2757
|
tc.testCaseName,
|
|
2566
|
-
|
|
2758
|
+
repetitionPart ? /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", children: repetitionPart }) : null,
|
|
2567
2759
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
2568
2760
|
" (",
|
|
2569
2761
|
tc.durationMs,
|
|
2570
2762
|
"ms)"
|
|
2571
2763
|
] })
|
|
2572
|
-
] }, `tc-${tc.testCaseId}-${tc.
|
|
2764
|
+
] }, `tc-${tc.testCaseId}-${tc.repetitionId ?? "x"}-${tc.repetitionIndex ?? 0}`)
|
|
2573
2765
|
);
|
|
2574
2766
|
for (const item of tc.evaluatorScores) {
|
|
2575
2767
|
const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
|
|
@@ -2622,17 +2814,10 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
2622
2814
|
}
|
|
2623
2815
|
} else {
|
|
2624
2816
|
rows.push(
|
|
2625
|
-
/* @__PURE__ */ jsxRuntime.jsxs(
|
|
2626
|
-
|
|
2627
|
-
|
|
2628
|
-
|
|
2629
|
-
children: [
|
|
2630
|
-
" ",
|
|
2631
|
-
"n/a"
|
|
2632
|
-
]
|
|
2633
|
-
},
|
|
2634
|
-
`tc-${tc.testCaseId}-${item.evaluatorId}-n/a`
|
|
2635
|
-
)
|
|
2817
|
+
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
2818
|
+
" ",
|
|
2819
|
+
"n/a"
|
|
2820
|
+
] }, `tc-${tc.testCaseId}-${item.evaluatorId}-n/a`)
|
|
2636
2821
|
);
|
|
2637
2822
|
}
|
|
2638
2823
|
if (!item.passed && item.logs && item.logs.length > 0) {
|
|
@@ -2689,12 +2874,12 @@ function RunDetailsView({
|
|
|
2689
2874
|
}) {
|
|
2690
2875
|
const runs = dataset?.runs ?? [];
|
|
2691
2876
|
const rightFocused = state.focus === "right";
|
|
2692
|
-
const [testCases, setTestCases] =
|
|
2693
|
-
const evaluatorNameById =
|
|
2877
|
+
const [testCases, setTestCases] = React.useState([]);
|
|
2878
|
+
const evaluatorNameById = React__default.default.useMemo(
|
|
2694
2879
|
() => new Map(evaluators.map((e) => [e.id, e.name])),
|
|
2695
2880
|
[evaluators]
|
|
2696
2881
|
);
|
|
2697
|
-
|
|
2882
|
+
React.useEffect(() => {
|
|
2698
2883
|
if (!selectedRun?.meta?.artifact) {
|
|
2699
2884
|
setTestCases([]);
|
|
2700
2885
|
return;
|
|
@@ -2713,7 +2898,7 @@ function RunDetailsView({
|
|
|
2713
2898
|
const visible = rows.slice(offset, offset + DETAILS_PAGE_SIZE);
|
|
2714
2899
|
return /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
|
|
2715
2900
|
/* @__PURE__ */ jsxRuntime.jsx(RunsSidebar, { state, dataset, runs }),
|
|
2716
|
-
/* @__PURE__ */ jsxRuntime.jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: visible.map((row, i) => /* @__PURE__ */ jsxRuntime.jsx(
|
|
2901
|
+
/* @__PURE__ */ jsxRuntime.jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: visible.map((row, i) => /* @__PURE__ */ jsxRuntime.jsx(React__default.default.Fragment, { children: row }, i)) }) })
|
|
2717
2902
|
] });
|
|
2718
2903
|
}
|
|
2719
2904
|
var LEFT_PANE_WIDTH3 = 44;
|
|
@@ -2736,19 +2921,11 @@ function NewEvaluationView({
|
|
|
2736
2921
|
visibleEvaluators.map((evaluator, index) => {
|
|
2737
2922
|
const selected = index === state.evaluatorMenuIndex;
|
|
2738
2923
|
const inSelection = state.selectedEvaluatorIds.includes(evaluator.id);
|
|
2739
|
-
return /* @__PURE__ */ jsxRuntime.jsxs(
|
|
2740
|
-
|
|
2741
|
-
|
|
2742
|
-
|
|
2743
|
-
|
|
2744
|
-
children: [
|
|
2745
|
-
selected ? "\u25B8 " : " ",
|
|
2746
|
-
inSelection ? "[x] " : "[ ] ",
|
|
2747
|
-
evaluator.name
|
|
2748
|
-
]
|
|
2749
|
-
},
|
|
2750
|
-
evaluator.id
|
|
2751
|
-
);
|
|
2924
|
+
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: selected ? "cyan" : "gray", bold: selected, children: [
|
|
2925
|
+
selected ? "\u25B8 " : " ",
|
|
2926
|
+
inSelection ? "[x] " : "[ ] ",
|
|
2927
|
+
evaluator.name
|
|
2928
|
+
] }, evaluator.id);
|
|
2752
2929
|
})
|
|
2753
2930
|
] }),
|
|
2754
2931
|
/* @__PURE__ */ jsxRuntime.jsxs(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: [
|
|
@@ -2780,30 +2957,20 @@ function clampCursor(state, filteredDatasetsLength, selectedRunCount) {
|
|
|
2780
2957
|
...state,
|
|
2781
2958
|
datasetMenuIndex: Math.max(0, Math.min(state.datasetMenuIndex, datasetMax)),
|
|
2782
2959
|
runMenuIndex: Math.max(0, Math.min(state.runMenuIndex, runMax)),
|
|
2783
|
-
evaluatorMenuIndex: Math.max(
|
|
2784
|
-
0,
|
|
2785
|
-
Math.min(state.evaluatorMenuIndex, evaluatorMax)
|
|
2786
|
-
)
|
|
2960
|
+
evaluatorMenuIndex: Math.max(0, Math.min(state.evaluatorMenuIndex, evaluatorMax))
|
|
2787
2961
|
};
|
|
2788
2962
|
}
|
|
2789
|
-
function EvalsCliApp({
|
|
2790
|
-
data,
|
|
2791
|
-
args,
|
|
2792
|
-
runner
|
|
2793
|
-
}) {
|
|
2963
|
+
function EvalsCliApp({ data, args, runner }) {
|
|
2794
2964
|
const { exit } = ink.useApp();
|
|
2795
2965
|
const { width: stdoutWidth, height: stdoutHeight } = fullscreenInk.useScreenSize();
|
|
2796
|
-
const [liveData, setLiveData] =
|
|
2797
|
-
const [runtimeMessage, setRuntimeMessage] =
|
|
2798
|
-
const overviewRowCountRef =
|
|
2799
|
-
const [state, dispatch] =
|
|
2800
|
-
|
|
2801
|
-
createInitialState(data, args)
|
|
2802
|
-
);
|
|
2803
|
-
React2.useEffect(() => {
|
|
2966
|
+
const [liveData, setLiveData] = React.useState(data);
|
|
2967
|
+
const [runtimeMessage, setRuntimeMessage] = React.useState();
|
|
2968
|
+
const overviewRowCountRef = React.useRef(0);
|
|
2969
|
+
const [state, dispatch] = React.useReducer(reduceCliState, createInitialState(data, args));
|
|
2970
|
+
React.useEffect(() => {
|
|
2804
2971
|
setLiveData(data);
|
|
2805
2972
|
}, [data]);
|
|
2806
|
-
|
|
2973
|
+
React.useEffect(() => {
|
|
2807
2974
|
if (!runner) {
|
|
2808
2975
|
return void 0;
|
|
2809
2976
|
}
|
|
@@ -2822,7 +2989,7 @@ function EvalsCliApp({
|
|
|
2822
2989
|
}
|
|
2823
2990
|
});
|
|
2824
2991
|
}, [runner]);
|
|
2825
|
-
const filteredDatasets =
|
|
2992
|
+
const filteredDatasets = React.useMemo(
|
|
2826
2993
|
() => getFilteredDatasets(liveData, state.searchQuery),
|
|
2827
2994
|
[liveData, state.searchQuery]
|
|
2828
2995
|
);
|
|
@@ -2831,14 +2998,8 @@ function EvalsCliApp({
|
|
|
2831
2998
|
filteredDatasets.length,
|
|
2832
2999
|
getDatasetByMenuIndex(filteredDatasets, state.datasetMenuIndex)?.runs.length ?? 0
|
|
2833
3000
|
);
|
|
2834
|
-
const selectedDataset = getDatasetByMenuIndex(
|
|
2835
|
-
|
|
2836
|
-
clampedState.datasetMenuIndex
|
|
2837
|
-
);
|
|
2838
|
-
const selectedRun = getRunByMenuIndex(
|
|
2839
|
-
selectedDataset,
|
|
2840
|
-
clampedState.runMenuIndex
|
|
2841
|
-
);
|
|
3001
|
+
const selectedDataset = getDatasetByMenuIndex(filteredDatasets, clampedState.datasetMenuIndex);
|
|
3002
|
+
const selectedRun = getRunByMenuIndex(selectedDataset, clampedState.runMenuIndex);
|
|
2842
3003
|
const visibleEvaluators = liveData.evaluators.filter(
|
|
2843
3004
|
(evaluator) => evaluator.name.toLowerCase().includes(clampedState.searchQuery.toLowerCase())
|
|
2844
3005
|
);
|
|
@@ -2926,15 +3087,14 @@ function EvalsCliApp({
|
|
|
2926
3087
|
}
|
|
2927
3088
|
void runner.runDatasetWith({
|
|
2928
3089
|
datasetId: selectedDataset.id,
|
|
2929
|
-
evaluatorIds: clampedState.selectedEvaluatorIds
|
|
3090
|
+
evaluatorIds: clampedState.selectedEvaluatorIds,
|
|
3091
|
+
...PROGRAMMATIC_RUN_CONFIG
|
|
2930
3092
|
}).then((snapshot) => {
|
|
2931
3093
|
setRuntimeMessage(
|
|
2932
3094
|
`Started ${snapshot.runId} on ${selectedDataset.name} (${snapshot.totalTestCases} cases).`
|
|
2933
3095
|
);
|
|
2934
3096
|
}).catch((error) => {
|
|
2935
|
-
setRuntimeMessage(
|
|
2936
|
-
error instanceof Error ? error.message : "Failed to start evaluation."
|
|
2937
|
-
);
|
|
3097
|
+
setRuntimeMessage(error instanceof Error ? error.message : "Failed to start evaluation.");
|
|
2938
3098
|
});
|
|
2939
3099
|
}
|
|
2940
3100
|
});
|
|
@@ -2961,14 +3121,7 @@ function EvalsCliApp({
|
|
|
2961
3121
|
);
|
|
2962
3122
|
}
|
|
2963
3123
|
if (clampedState.level === "runs") {
|
|
2964
|
-
return /* @__PURE__ */ jsxRuntime.jsx(
|
|
2965
|
-
RunsView,
|
|
2966
|
-
{
|
|
2967
|
-
state: clampedState,
|
|
2968
|
-
dataset: selectedDataset,
|
|
2969
|
-
selectedRun
|
|
2970
|
-
}
|
|
2971
|
-
);
|
|
3124
|
+
return /* @__PURE__ */ jsxRuntime.jsx(RunsView, { state: clampedState, dataset: selectedDataset, selectedRun });
|
|
2972
3125
|
}
|
|
2973
3126
|
return /* @__PURE__ */ jsxRuntime.jsx(
|
|
2974
3127
|
RunDetailsView,
|
|
@@ -2980,82 +3133,44 @@ function EvalsCliApp({
|
|
|
2980
3133
|
}
|
|
2981
3134
|
);
|
|
2982
3135
|
};
|
|
2983
|
-
return /* @__PURE__ */ jsxRuntime.jsxs(
|
|
2984
|
-
ink.Box,
|
|
2985
|
-
|
|
2986
|
-
|
|
2987
|
-
|
|
2988
|
-
|
|
2989
|
-
|
|
2990
|
-
|
|
2991
|
-
|
|
2992
|
-
|
|
2993
|
-
|
|
2994
|
-
|
|
2995
|
-
|
|
2996
|
-
|
|
2997
|
-
|
|
2998
|
-
|
|
2999
|
-
|
|
3000
|
-
|
|
3001
|
-
|
|
3002
|
-
|
|
3003
|
-
|
|
3004
|
-
|
|
3005
|
-
|
|
3006
|
-
|
|
3007
|
-
|
|
3008
|
-
|
|
3009
|
-
|
|
3010
|
-
|
|
3011
|
-
|
|
3012
|
-
|
|
3013
|
-
|
|
3014
|
-
|
|
3015
|
-
|
|
3016
|
-
|
|
3017
|
-
|
|
3018
|
-
|
|
3019
|
-
|
|
3020
|
-
|
|
3021
|
-
ink.Box,
|
|
3022
|
-
{
|
|
3023
|
-
marginTop: 1,
|
|
3024
|
-
borderStyle: "round",
|
|
3025
|
-
borderColor: "magenta",
|
|
3026
|
-
paddingX: 1,
|
|
3027
|
-
width: stdoutWidth,
|
|
3028
|
-
children: [
|
|
3029
|
-
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "magenta", bold: true, children: "Search: " }),
|
|
3030
|
-
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "white", children: clampedState.searchQuery })
|
|
3031
|
-
]
|
|
3032
|
-
}
|
|
3033
|
-
),
|
|
3034
|
-
runtimeMessage && /* @__PURE__ */ jsxRuntime.jsx(
|
|
3035
|
-
ink.Box,
|
|
3036
|
-
{
|
|
3037
|
-
marginTop: 1,
|
|
3038
|
-
borderStyle: "round",
|
|
3039
|
-
borderColor: "blue",
|
|
3040
|
-
paddingX: 1,
|
|
3041
|
-
width: stdoutWidth,
|
|
3042
|
-
children: /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "blue", children: runtimeMessage })
|
|
3043
|
-
}
|
|
3044
|
-
),
|
|
3045
|
-
/* @__PURE__ */ jsxRuntime.jsx(
|
|
3046
|
-
ink.Box,
|
|
3047
|
-
{
|
|
3048
|
-
marginTop: 1,
|
|
3049
|
-
flexGrow: 1,
|
|
3050
|
-
width: stdoutWidth,
|
|
3051
|
-
flexDirection: "row",
|
|
3052
|
-
children: renderContent()
|
|
3053
|
-
}
|
|
3054
|
-
),
|
|
3055
|
-
/* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginTop: 1, paddingX: 1, children: /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: getFooterText(clampedState) }) })
|
|
3056
|
-
]
|
|
3057
|
-
}
|
|
3058
|
-
);
|
|
3136
|
+
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", flexGrow: 1, width: stdoutWidth, height: stdoutHeight, children: [
|
|
3137
|
+
/* @__PURE__ */ jsxRuntime.jsx(ink.Box, { borderStyle: "round", borderColor: "cyan", paddingX: 1, width: stdoutWidth, children: /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: getBreadcrumbText(clampedState, selectedDataset?.name, selectedRun?.label) }) }),
|
|
3138
|
+
clampedState.startupWarnings.length > 0 && /* @__PURE__ */ jsxRuntime.jsxs(
|
|
3139
|
+
ink.Box,
|
|
3140
|
+
{
|
|
3141
|
+
marginTop: 1,
|
|
3142
|
+
borderStyle: "round",
|
|
3143
|
+
borderColor: "yellow",
|
|
3144
|
+
paddingX: 1,
|
|
3145
|
+
flexDirection: "column",
|
|
3146
|
+
width: stdoutWidth,
|
|
3147
|
+
children: [
|
|
3148
|
+
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "yellow", children: "Startup warnings:" }),
|
|
3149
|
+
clampedState.startupWarnings.map((warning, index) => /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: warning }, `${warning}-${index}`))
|
|
3150
|
+
]
|
|
3151
|
+
}
|
|
3152
|
+
),
|
|
3153
|
+
clampedState.searchMode && /* @__PURE__ */ jsxRuntime.jsxs(
|
|
3154
|
+
ink.Box,
|
|
3155
|
+
{
|
|
3156
|
+
marginTop: 1,
|
|
3157
|
+
borderStyle: "round",
|
|
3158
|
+
borderColor: "magenta",
|
|
3159
|
+
paddingX: 1,
|
|
3160
|
+
width: stdoutWidth,
|
|
3161
|
+
children: [
|
|
3162
|
+
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "magenta", bold: true, children: [
|
|
3163
|
+
"Search:",
|
|
3164
|
+
" "
|
|
3165
|
+
] }),
|
|
3166
|
+
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "white", children: clampedState.searchQuery })
|
|
3167
|
+
]
|
|
3168
|
+
}
|
|
3169
|
+
),
|
|
3170
|
+
runtimeMessage && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginTop: 1, borderStyle: "round", borderColor: "blue", paddingX: 1, width: stdoutWidth, children: /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "blue", children: runtimeMessage }) }),
|
|
3171
|
+
/* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginTop: 1, flexGrow: 1, width: stdoutWidth, flexDirection: "row", children: renderContent() }),
|
|
3172
|
+
/* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginTop: 1, paddingX: 1, children: /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: getFooterText(clampedState) }) })
|
|
3173
|
+
] });
|
|
3059
3174
|
}
|
|
3060
3175
|
async function main() {
|
|
3061
3176
|
const args = parseStartupArgs(process.argv.slice(2));
|
|
@@ -3067,9 +3182,7 @@ async function main() {
|
|
|
3067
3182
|
process.on("SIGTERM", () => {
|
|
3068
3183
|
void runner.shutdown().finally(() => process.exit(0));
|
|
3069
3184
|
});
|
|
3070
|
-
fullscreenInk.withFullScreen(
|
|
3071
|
-
/* @__PURE__ */ jsxRuntime.jsx(EvalsCliApp, { data, args, runner })
|
|
3072
|
-
).start();
|
|
3185
|
+
fullscreenInk.withFullScreen(/* @__PURE__ */ jsxRuntime.jsx(EvalsCliApp, { data, args, runner })).start();
|
|
3073
3186
|
}
|
|
3074
3187
|
void main();
|
|
3075
3188
|
//# sourceMappingURL=out.js.map
|