@m4trix/evals 0.21.0 → 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +352 -184
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +350 -185
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +294 -155
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +294 -156
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +296 -155
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +1 -2
- package/dist/index.js +294 -156
- package/dist/index.js.map +1 -1
- package/package.json +3 -3
package/dist/cli-simple.cjs
CHANGED
|
@@ -8,12 +8,16 @@ var path = require('path');
|
|
|
8
8
|
var jitiModule = require('jiti');
|
|
9
9
|
var promises = require('fs/promises');
|
|
10
10
|
var url = require('url');
|
|
11
|
-
var
|
|
11
|
+
var diff = require('diff');
|
|
12
|
+
var stringify = require('fast-json-stable-stringify');
|
|
13
|
+
var os = require('os');
|
|
12
14
|
var React2 = require('react');
|
|
13
15
|
var ink = require('ink');
|
|
14
16
|
var jsxRuntime = require('react/jsx-runtime');
|
|
15
17
|
|
|
16
18
|
var _documentCurrentScript = typeof document !== 'undefined' ? document.currentScript : null;
|
|
19
|
+
/**
 * Exposes a required module through a `default` property.
 * A module already flagged with `__esModule` is handed back untouched;
 * anything else (including falsy values) is wrapped as `{ default: e }`.
 */
function _interopDefault (e) {
  if (e && e.__esModule) {
    return e;
  }
  return { default: e };
}
|
|
20
|
+
|
|
17
21
|
function _interopNamespace(e) {
|
|
18
22
|
if (e && e.__esModule) return e;
|
|
19
23
|
var n = Object.create(null);
|
|
@@ -33,6 +37,7 @@ function _interopNamespace(e) {
|
|
|
33
37
|
}
|
|
34
38
|
|
|
35
39
|
var jitiModule__namespace = /*#__PURE__*/_interopNamespace(jitiModule);
|
|
40
|
+
var stringify__default = /*#__PURE__*/_interopDefault(stringify);
|
|
36
41
|
var React2__namespace = /*#__PURE__*/_interopNamespace(React2);
|
|
37
42
|
|
|
38
43
|
// src/runner/config.ts
|
|
@@ -284,10 +289,102 @@ async function collectTestCasesFromFiles(config) {
|
|
|
284
289
|
);
|
|
285
290
|
return found.flat();
|
|
286
291
|
}
|
|
292
|
+
/**
 * Produces a normalized copy of `value` for serialization and diffing.
 * The input is never mutated; arrays and objects are rebuilt.
 *
 * Options (all optional):
 *  - `sort`: arrays are ordered by the stable JSON text of their processed items.
 *  - `excludeKeys`: array, or comma-separated string, of object keys to drop.
 *  - `precision`: numbers are rounded to this many decimal places.
 *
 * @param {*} value - Value to normalize.
 * @param {object} [options] - Diff options described above.
 * @returns {*} The normalized copy (primitives are returned as-is or rounded).
 */
function preprocessForDiff(value, options) {
  if (Array.isArray(value)) {
    // Fix: always descend into arrays. Previously elements were only processed
    // when `sort` was set, so `excludeKeys`/`precision` silently skipped
    // anything nested inside an array.
    // Array.from (rather than map) turns holes into undefined, matching the
    // spread copy the sort path has always used.
    const items = Array.from(value, (item) => preprocessForDiff(item, options));
    if (!options?.sort) {
      return items;
    }
    // Items are processed once up front; the comparator only serializes them.
    return items.sort(
      (a, b) => stringify__default.default(a).localeCompare(stringify__default.default(b))
    );
  }
  if (value !== null && typeof value === "object") {
    const excludeKeys = options?.excludeKeys;
    let excluded = [];
    if (excludeKeys) {
      excluded = Array.isArray(excludeKeys) ? excludeKeys : excludeKeys.split(",").map((k) => k.trim());
    }
    const skip = new Set(excluded);
    const result = {};
    for (const [k, v] of Object.entries(value)) {
      if (!skip.has(k)) {
        result[k] = preprocessForDiff(v, options);
      }
    }
    return result;
  }
  if (typeof value === "number" && options?.precision !== void 0) {
    return Number(value.toFixed(options.precision));
  }
  return value;
}
|
|
322
|
+
/**
 * Serializes `value` with the stable stringifier (`stringify__default.default`)
 * and re-indents the result with two spaces.
 *
 * Always returns a string: when the stringifier produces no text (as
 * `JSON.stringify` does for `undefined`), the literal text "undefined" is
 * returned so callers can diff the result line by line.
 *
 * @param {*} value - Value to serialize.
 * @returns {string} Pretty-printed JSON, or the compact text if re-parsing fails.
 */
function toPrettyJson(value) {
  const compact = stringify__default.default(value);
  if (typeof compact !== "string") {
    // Fix: previously JSON.parse(undefined) threw and the catch handed the
    // non-string back to the caller.
    return String(compact);
  }
  try {
    return JSON.stringify(JSON.parse(compact), null, 2);
  } catch {
    // Best effort: keep the compact form when it cannot be re-parsed.
    return compact;
  }
}
|
|
331
|
+
/**
 * Renders diff parts as text, one output line per source line.
 * Lines from added parts are prefixed with "+ ", removed parts with "- ",
 * and unchanged parts get no prefix. A single trailing empty segment
 * (produced by a terminating newline) is dropped from each part.
 *
 * @param {Iterable<{value: string, added?: boolean, removed?: boolean}>} parts
 * @returns {string} The rendered lines joined with "\n".
 */
function formatDiffParts(parts) {
  const rendered = Array.from(parts).flatMap((part) => {
    let marker = "";
    if (part.added) {
      marker = "+ ";
    } else if (part.removed) {
      marker = "- ";
    }
    const segments = part.value.split("\n");
    if (segments[segments.length - 1] === "") {
      segments.pop();
    }
    return segments.map((segment) => marker + segment);
  });
  return rendered.join("\n");
}
|
|
287
345
|
/**
 * Builds a line-based textual diff between `expected` and `actual`.
 *
 * Both sides are first normalized with `preprocessForDiff`. With
 * `diffOptions.keysOnly` the diff is computed over the key structure
 * (`extractKeys`) only. Otherwise the pretty JSON of both sides is compared:
 * identical text yields "", and `diffOptions.outputNewOnly` keeps only the
 * added parts.
 *
 * @param {*} expected - Reference value.
 * @param {*} actual - Value to compare against the reference.
 * @param {object} [diffOptions] - Options forwarded to `preprocessForDiff`,
 *   plus `keysOnly` and `outputNewOnly`.
 * @returns {string} Formatted diff text.
 */
function createDiffString(expected, actual, diffOptions) {
  const left = preprocessForDiff(expected, diffOptions);
  const right = preprocessForDiff(actual, diffOptions);

  if (diffOptions?.keysOnly) {
    const renderKeys = (side) => JSON.stringify(extractKeys(side), null, 2);
    const leftKeys = renderKeys(left);
    const rightKeys = renderKeys(right);
    return formatDiffParts(diff.diffLines(leftKeys, rightKeys));
  }

  const leftText = toPrettyJson(left);
  const rightText = toPrettyJson(right);
  if (leftText === rightText) {
    return "";
  }

  const changes = diff.diffLines(leftText, rightText);
  const visible = diffOptions?.outputNewOnly ? changes.filter((change) => change.added === true) : changes;
  return formatDiffParts(visible);
}
|
|
376
|
+
/**
 * Reduces a value to its key structure: every non-object leaf (and null)
 * becomes the placeholder "\xB7", arrays keep their length, and objects keep
 * their own enumerable keys.
 *
 * @param {*} value - Value to summarize.
 * @returns {*} Placeholder string, array of shapes, or object of shapes.
 */
function extractKeys(value) {
  const isContainer = typeof value === "object" && value !== null;
  if (!isContainer) {
    return "\xB7";
  }
  if (Array.isArray(value)) {
    return value.map((element) => extractKeys(element));
  }
  const shape = {};
  for (const key of Object.keys(value)) {
    shape[key] = extractKeys(value[key]);
  }
  return shape;
}
|
|
292
389
|
function formatLogMessage(msg) {
|
|
293
390
|
if (typeof msg === "string")
|
|
@@ -666,6 +763,20 @@ function readOutput(testCase) {
|
|
|
666
763
|
}
|
|
667
764
|
return candidate.getOutput();
|
|
668
765
|
}
|
|
766
|
+
/**
 * Expands test cases into one evaluation unit per rerun.
 * The rerun count comes from `testCase.getReruns()` when that method exists
 * and defaults to 1 otherwise; `rerunIndex` is 1-based.
 *
 * @param {Iterable<{testCase: object}>} testCases - Collected test case items.
 * @returns {Array<{testCaseItem: object, rerunIndex: number, rerunTotal: number}>}
 */
function buildEvaluationUnits(testCases) {
  const units = [];
  for (const testCaseItem of testCases) {
    const { testCase } = testCaseItem;
    let rerunTotal = 1;
    if (typeof testCase.getReruns === "function") {
      rerunTotal = testCase.getReruns();
    }
    let rerunIndex = 0;
    while (rerunIndex < rerunTotal) {
      rerunIndex += 1;
      units.push({ testCaseItem, rerunIndex, rerunTotal });
    }
  }
  return units;
}
|
|
669
780
|
/**
 * Returns the current time as an ISO-8601 string with every ":" and "."
 * replaced by "-", so it can be embedded in a file name.
 *
 * @returns {string} e.g. "2024-01-02T03-04-05-678Z"
 */
function nowIsoForFile() {
  const isoStamp = new Date().toISOString();
  return isoStamp.replace(/:|\./g, "-");
}
|
|
@@ -675,157 +786,171 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
|
675
786
|
`${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
|
|
676
787
|
);
|
|
677
788
|
}
|
|
678
|
-
function
|
|
789
|
+
/**
 * Builds the Effect that executes one evaluation unit (a single rerun of a
 * single test case) against every evaluator of the task.
 *
 * Steps, in order:
 *  1. bump the started counter and publish "TestCaseStarted";
 *  2. run each evaluator that exposes an evaluate function, collecting its
 *     scores, metrics and logs (failures are recorded as `passed: false`);
 *  3. bump the completed counter, update the snapshot, publish
 *     "TestCaseProgress" and enqueue the same event for persistence;
 *  4. record this rerun's outcome per test case id; once all reruns of the
 *     test case are in, bump the passed/failed counter and refresh the
 *     snapshot totals.
 *
 * @param task Run task (runId, triggerId, datasetId, evaluators, snapshot).
 * @param unit `{ testCaseItem, rerunIndex, rerunTotal }`.
 * @param totalEvaluations Total number of evaluation units in the run.
 * @param publishEvent Function returning an Effect that publishes an event.
 * @param persistenceQueue Queue receiving progress events for persistence.
 * @param updateSnapshot Function returning an Effect that updates the run snapshot.
 * @param startedRef Ref counting started evaluations.
 * @param completedRef Ref counting completed evaluations.
 * @param passedRef Ref counting fully passed test cases.
 * @param failedRef Ref counting failed test cases.
 * @param testCaseResultsRef Ref holding a Map of per-test-case rerun results.
 * @returns An Effect performing the evaluation.
 */
function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
  const { testCaseItem, rerunIndex, rerunTotal } = unit;
  // Ref.modify callback: stores count + 1 and also returns it.
  const increment = (count) => {
    const next = count + 1;
    return [next, next];
  };
  return effect.Effect.gen(function* () {
    const evaluatorRunId = `run-${crypto.randomUUID()}`;
    const started = Date.now();
    const startedEvaluations = yield* effect.Ref.modify(startedRef, increment);
    yield* publishEvent({
      type: "TestCaseStarted",
      runId: task.runId,
      testCaseId: testCaseItem.id,
      testCaseName: testCaseItem.testCase.getName(),
      startedTestCases: startedEvaluations,
      totalTestCases: totalEvaluations,
      rerunIndex,
      rerunTotal
    });

    const evaluatorScores = [];
    let testCaseError;
    const output = readOutput(testCaseItem.testCase);

    for (const { id: evaluatorId, evaluator } of task.evaluators) {
      const evaluateFn = evaluator.getEvaluateFn();
      if (!evaluateFn) {
        continue;
      }
      const logs = [];
      // Logs are only attached to a score entry when at least one was written.
      const collectedLogs = () => (logs.length > 0 ? logs : void 0);
      const recordFailure = () => {
        evaluatorScores.push({
          evaluatorId,
          scores: [],
          passed: false,
          logs: collectedLogs()
        });
      };
      const logDiff = (expected, actual, options) => {
        logs.push(createDiffLogEntry(expected, actual, options));
      };
      const log = (message, options) => {
        logs.push(createLogEntry(message, options));
      };
      // Creates (or tags) an Error carrying its log entry so it can be reported later.
      const createError = (message, options) => {
        const entry = createLogEntry(message, options);
        const error = message instanceof Error ? message : new Error(entry.message);
        error[evaluatorErrorLogEntryKey] = entry;
        return error;
      };
      try {
        // NOTE(review): whether a rejection surfaced through Effect.promise
        // reaches this try/catch depends on Effect's generator semantics - confirm.
        const ctx = yield* effect.Effect.promise(
          () => Promise.resolve(evaluator.resolveContext())
        );
        const invokeEvaluator = () => evaluateFn({
          input: testCaseItem.testCase.getInput(),
          ctx,
          output,
          meta: {
            triggerId: task.triggerId,
            runId: evaluatorRunId,
            datasetId: task.datasetId
          },
          logDiff,
          log,
          createError
        });
        const result = yield* effect.Effect.promise(
          () => Promise.resolve().then(invokeEvaluator)
        );
        if (result instanceof Error) {
          // An Error returned (not thrown) by the evaluator counts as a failure.
          logs.push(result[evaluatorErrorLogEntryKey] ?? createLogEntry(result));
          testCaseError = result.message;
          recordFailure();
          continue;
        }
        const { scores, metrics } = normalizeResult(result);
        const passed = computeEvaluatorPassed(evaluator, result, scores);
        evaluatorScores.push({
          evaluatorId,
          scores,
          passed,
          metrics,
          logs: collectedLogs()
        });
      } catch (error) {
        const isError = error instanceof Error;
        if (isError) {
          logs.push(error[evaluatorErrorLogEntryKey] ?? createLogEntry(error));
        }
        testCaseError = isError ? error.message : "Evaluator execution failed";
        recordFailure();
      }
    }

    const rerunPassedThis = evaluatorScores.every((score) => score.passed);
    const completedEvaluations = yield* effect.Ref.modify(completedRef, increment);
    const progressEvent = {
      type: "TestCaseProgress",
      runId: task.runId,
      testCaseId: testCaseItem.id,
      testCaseName: testCaseItem.testCase.getName(),
      completedTestCases: completedEvaluations,
      totalTestCases: totalEvaluations,
      rerunIndex,
      rerunTotal,
      passed: rerunPassedThis,
      durationMs: Date.now() - started,
      evaluatorScores,
      output,
      errorMessage: testCaseError
    };
    yield* updateSnapshot(task.runId, (snapshot) => ({
      ...snapshot,
      completedTestCases: completedEvaluations
    }));
    yield* publishEvent(progressEvent);
    yield* effect.Queue.offer(persistenceQueue, {
      runId: task.runId,
      artifactPath: task.snapshot.artifactPath,
      payload: progressEvent
    });

    // Outcome is null until the last rerun of this test case has reported;
    // then it is true only if every rerun passed.
    const testCaseCompleted = yield* effect.Ref.modify(
      testCaseResultsRef,
      (resultsByTestCase) => {
        const key = testCaseItem.id;
        const previous = resultsByTestCase.get(key) ?? { completedCount: 0, results: [] };
        const results = [...previous.results, rerunPassedThis];
        const completedCount = previous.completedCount + 1;
        const updated = new Map(resultsByTestCase);
        updated.set(key, { completedCount, results });
        const outcome = completedCount === rerunTotal ? results.every(Boolean) : null;
        return [outcome, updated];
      }
    );
    if (testCaseCompleted === null) {
      return;
    }
    yield* effect.Ref.update(testCaseCompleted ? passedRef : failedRef, (n) => n + 1);
    const [passed, failed] = yield* effect.Effect.all([
      effect.Ref.get(passedRef),
      effect.Ref.get(failedRef)
    ]);
    yield* updateSnapshot(task.runId, (snapshot) => ({
      ...snapshot,
      passedTestCases: passed,
      failedTestCases: failed
    }));
  });
}
|
|
826
951
|
var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => effect.Effect.gen(function* () {
|
|
827
952
|
const startedAt = Date.now();
|
|
828
|
-
updateSnapshot(task.runId, (snapshot) => ({
|
|
953
|
+
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
829
954
|
...snapshot,
|
|
830
955
|
status: "running",
|
|
831
956
|
startedAt
|
|
@@ -844,9 +969,13 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
844
969
|
const startedRef = yield* effect.Ref.make(0);
|
|
845
970
|
const passedRef = yield* effect.Ref.make(0);
|
|
846
971
|
const failedRef = yield* effect.Ref.make(0);
|
|
847
|
-
const
|
|
972
|
+
const testCaseResultsRef = yield* effect.Ref.make(
|
|
973
|
+
/* @__PURE__ */ new Map()
|
|
974
|
+
);
|
|
975
|
+
const evaluationUnits = buildEvaluationUnits(task.testCases);
|
|
976
|
+
const processEvaluation = (unit) => processOneEvaluation(
|
|
848
977
|
task,
|
|
849
|
-
|
|
978
|
+
unit,
|
|
850
979
|
totalEvaluations,
|
|
851
980
|
publishEvent,
|
|
852
981
|
persistenceQueue,
|
|
@@ -854,11 +983,12 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
854
983
|
startedRef,
|
|
855
984
|
completedRef,
|
|
856
985
|
passedRef,
|
|
857
|
-
failedRef
|
|
986
|
+
failedRef,
|
|
987
|
+
testCaseResultsRef
|
|
858
988
|
);
|
|
859
989
|
yield* effect.Effect.forEach(
|
|
860
|
-
|
|
861
|
-
|
|
990
|
+
evaluationUnits,
|
|
991
|
+
processEvaluation,
|
|
862
992
|
maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
|
|
863
993
|
);
|
|
864
994
|
const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* effect.Effect.all([
|
|
@@ -876,7 +1006,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
876
1006
|
totalTestCases: task.testCases.length,
|
|
877
1007
|
artifactPath: task.snapshot.artifactPath
|
|
878
1008
|
};
|
|
879
|
-
updateSnapshot(task.runId, (snapshot) => ({
|
|
1009
|
+
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
880
1010
|
...snapshot,
|
|
881
1011
|
status: "completed",
|
|
882
1012
|
completedTestCases: completedEvaluations,
|
|
@@ -1129,7 +1259,9 @@ var EffectRunner = class {
|
|
|
1129
1259
|
this.persistenceQueue = effect.Effect.runSync(
|
|
1130
1260
|
effect.Queue.unbounded()
|
|
1131
1261
|
);
|
|
1132
|
-
this.
|
|
1262
|
+
this.snapshotsRef = effect.Effect.runSync(
|
|
1263
|
+
effect.Ref.make(/* @__PURE__ */ new Map())
|
|
1264
|
+
);
|
|
1133
1265
|
this.listeners = /* @__PURE__ */ new Set();
|
|
1134
1266
|
this.datasetsById = /* @__PURE__ */ new Map();
|
|
1135
1267
|
this.evaluatorsById = /* @__PURE__ */ new Map();
|
|
@@ -1232,7 +1364,13 @@ var EffectRunner = class {
|
|
|
1232
1364
|
status: "queued",
|
|
1233
1365
|
artifactPath
|
|
1234
1366
|
};
|
|
1235
|
-
|
|
1367
|
+
await effect.Effect.runPromise(
|
|
1368
|
+
effect.Ref.update(this.snapshotsRef, (map) => {
|
|
1369
|
+
const next = new Map(map);
|
|
1370
|
+
next.set(runId, snapshot);
|
|
1371
|
+
return next;
|
|
1372
|
+
})
|
|
1373
|
+
);
|
|
1236
1374
|
const queuedEvent = {
|
|
1237
1375
|
type: "RunQueued",
|
|
1238
1376
|
runId,
|
|
@@ -1273,12 +1411,12 @@ var EffectRunner = class {
|
|
|
1273
1411
|
};
|
|
1274
1412
|
}
|
|
1275
1413
|
getRunSnapshot(runId) {
|
|
1276
|
-
return this.
|
|
1414
|
+
return effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).get(runId);
|
|
1277
1415
|
}
|
|
1278
1416
|
getAllRunSnapshots() {
|
|
1279
|
-
return Array.from(
|
|
1280
|
-
(
|
|
1281
|
-
);
|
|
1417
|
+
return Array.from(
|
|
1418
|
+
effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).values()
|
|
1419
|
+
).sort((a, b) => b.queuedAt - a.queuedAt);
|
|
1282
1420
|
}
|
|
1283
1421
|
async loadRunSnapshotsFromArtifacts() {
|
|
1284
1422
|
return loadRunSnapshotsFromArtifacts(this.config);
|
|
@@ -1307,11 +1445,15 @@ var EffectRunner = class {
|
|
|
1307
1445
|
);
|
|
1308
1446
|
}
|
|
1309
1447
|
updateSnapshot(runId, updater) {
|
|
1310
|
-
|
|
1311
|
-
|
|
1312
|
-
|
|
1313
|
-
|
|
1314
|
-
|
|
1448
|
+
return effect.Ref.modify(this.snapshotsRef, (map) => {
|
|
1449
|
+
const existing = map.get(runId);
|
|
1450
|
+
if (!existing) {
|
|
1451
|
+
return [void 0, map];
|
|
1452
|
+
}
|
|
1453
|
+
const next = new Map(map);
|
|
1454
|
+
next.set(runId, updater(existing));
|
|
1455
|
+
return [void 0, next];
|
|
1456
|
+
}).pipe(effect.Effect.asVoid);
|
|
1315
1457
|
}
|
|
1316
1458
|
publishEvent(event) {
|
|
1317
1459
|
return effect.Effect.sync(() => {
|
|
@@ -1327,8 +1469,9 @@ var EffectRunner = class {
|
|
|
1327
1469
|
);
|
|
1328
1470
|
}
|
|
1329
1471
|
};
|
|
1330
|
-
|
|
1331
|
-
|
|
1472
|
+
/**
 * Default number of concurrent evaluations: the number of CPU entries
 * reported by `os.cpus()`, but never less than 1.
 *
 * @returns {number}
 */
function getDefaultConcurrency() {
  const cpuCount = os.cpus().length;
  return cpuCount >= 1 ? cpuCount : 1;
}
|
|
1332
1475
|
function parseSimpleCliArgs(argv) {
|
|
1333
1476
|
const args = {
|
|
1334
1477
|
help: false,
|
|
@@ -1355,6 +1498,14 @@ function parseSimpleCliArgs(argv) {
|
|
|
1355
1498
|
index += 1;
|
|
1356
1499
|
continue;
|
|
1357
1500
|
}
|
|
1501
|
+
if ((token === "--concurrency" || token === "-c") && argv[index + 1]) {
|
|
1502
|
+
const n = parseInt(argv[index + 1], 10);
|
|
1503
|
+
if (!Number.isNaN(n) && n >= 1) {
|
|
1504
|
+
args.concurrency = n;
|
|
1505
|
+
}
|
|
1506
|
+
index += 1;
|
|
1507
|
+
continue;
|
|
1508
|
+
}
|
|
1358
1509
|
args.unknownArgs.push(token);
|
|
1359
1510
|
}
|
|
1360
1511
|
return args;
|
|
@@ -1362,9 +1513,12 @@ function parseSimpleCliArgs(argv) {
|
|
|
1362
1513
|
function getSimpleCliUsage() {
|
|
1363
1514
|
return [
|
|
1364
1515
|
"Usage:",
|
|
1365
|
-
" eval-agents-simple run --dataset <datasetName> --evaluator <name-or-pattern>",
|
|
1516
|
+
" eval-agents-simple run --dataset <datasetName> --evaluator <name-or-pattern> [--concurrency N]",
|
|
1366
1517
|
" eval-agents-simple generate --dataset <datasetName>",
|
|
1367
1518
|
"",
|
|
1519
|
+
"Options:",
|
|
1520
|
+
" --concurrency, -c N Max concurrent test cases (default: CPU count). Use 1 for sequential.",
|
|
1521
|
+
"",
|
|
1368
1522
|
"Pattern examples for --evaluator:",
|
|
1369
1523
|
" score-evaluator exact name (case-insensitive)",
|
|
1370
1524
|
' "*score*" wildcard pattern',
|
|
@@ -1653,6 +1807,7 @@ function RunView({
|
|
|
1653
1807
|
runner,
|
|
1654
1808
|
datasetName,
|
|
1655
1809
|
evaluatorPattern,
|
|
1810
|
+
concurrency,
|
|
1656
1811
|
onComplete
|
|
1657
1812
|
}) {
|
|
1658
1813
|
const [phase, setPhase] = React2.useState(
|
|
@@ -1800,7 +1955,8 @@ function RunView({
|
|
|
1800
1955
|
});
|
|
1801
1956
|
const snapshot = await runner.runDatasetWith({
|
|
1802
1957
|
datasetId: dataset.id,
|
|
1803
|
-
evaluatorIds: evaluators.map((item) => item.id)
|
|
1958
|
+
evaluatorIds: evaluators.map((item) => item.id),
|
|
1959
|
+
concurrency
|
|
1804
1960
|
});
|
|
1805
1961
|
setRunInfo({
|
|
1806
1962
|
runId: snapshot.runId,
|
|
@@ -1828,7 +1984,7 @@ function RunView({
|
|
|
1828
1984
|
});
|
|
1829
1985
|
setPhase("completed");
|
|
1830
1986
|
setTimeout(() => onComplete(), 200);
|
|
1831
|
-
}, [runner, datasetName, evaluatorPattern, onComplete]);
|
|
1987
|
+
}, [runner, datasetName, evaluatorPattern, concurrency, onComplete]);
|
|
1832
1988
|
React2.useEffect(() => {
|
|
1833
1989
|
void runEval();
|
|
1834
1990
|
}, [runEval]);
|
|
@@ -1871,22 +2027,30 @@ function RunView({
|
|
|
1871
2027
|
label: `Evaluations ${completedEvaluations}/${runInfo?.totalTestCases ?? 0} completed \u2022 ${startedEvaluations}/${runInfo?.totalTestCases ?? 0} started`
|
|
1872
2028
|
}
|
|
1873
2029
|
),
|
|
1874
|
-
runningEvaluations.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginTop: 1, children: runningEvaluations.map((item) => /* @__PURE__ */ jsxRuntime.jsxs(
|
|
1875
|
-
|
|
1876
|
-
|
|
1877
|
-
|
|
1878
|
-
|
|
1879
|
-
|
|
1880
|
-
|
|
1881
|
-
|
|
1882
|
-
|
|
1883
|
-
|
|
1884
|
-
|
|
1885
|
-
|
|
1886
|
-
|
|
1887
|
-
|
|
1888
|
-
|
|
1889
|
-
|
|
2030
|
+
runningEvaluations.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginTop: 1, children: runningEvaluations.map((item) => /* @__PURE__ */ jsxRuntime.jsxs(
|
|
2031
|
+
ink.Text,
|
|
2032
|
+
{
|
|
2033
|
+
color: "yellow",
|
|
2034
|
+
children: [
|
|
2035
|
+
"[running ",
|
|
2036
|
+
item.startedTestCases,
|
|
2037
|
+
"/",
|
|
2038
|
+
item.totalTestCases,
|
|
2039
|
+
"]",
|
|
2040
|
+
" ",
|
|
2041
|
+
item.name,
|
|
2042
|
+
" ",
|
|
2043
|
+
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
2044
|
+
"(",
|
|
2045
|
+
item.rerunIndex,
|
|
2046
|
+
"/",
|
|
2047
|
+
item.rerunTotal,
|
|
2048
|
+
")"
|
|
2049
|
+
] })
|
|
2050
|
+
]
|
|
2051
|
+
},
|
|
2052
|
+
`${item.testCaseId}:${item.rerunIndex}`
|
|
2053
|
+
)) })
|
|
1890
2054
|
] }),
|
|
1891
2055
|
testCases.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 0, children: [
|
|
1892
2056
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
@@ -1968,7 +2132,7 @@ function RunView({
|
|
|
1968
2132
|
},
|
|
1969
2133
|
`${item.evaluatorId}-${s.id}-${idx}`
|
|
1970
2134
|
);
|
|
1971
|
-
}) : /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "
|
|
2135
|
+
}) : /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: " n/a" }),
|
|
1972
2136
|
!item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
|
|
1973
2137
|
(log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: getDiffLines(log).map(
|
|
1974
2138
|
({ type, line }, lineIdx) => /* @__PURE__ */ jsxRuntime.jsx(
|
|
@@ -2026,9 +2190,9 @@ function RunView({
|
|
|
2026
2190
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "magenta", children: "evaluator averages" }),
|
|
2027
2191
|
Array.from(evaluatorNameById.entries()).map(([id, name]) => {
|
|
2028
2192
|
const agg = summary.aggregates.get(id);
|
|
2029
|
-
const scoreKeys = [
|
|
2030
|
-
(
|
|
2031
|
-
);
|
|
2193
|
+
const scoreKeys = [
|
|
2194
|
+
...summary.scoreItemsByEvaluatorScore?.keys() ?? []
|
|
2195
|
+
].filter((k) => k.startsWith(`${id}:`));
|
|
2032
2196
|
if (scoreKeys.length === 0) {
|
|
2033
2197
|
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
2034
2198
|
"- ",
|
|
@@ -2336,7 +2500,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
|
2336
2500
|
}
|
|
2337
2501
|
return lines;
|
|
2338
2502
|
}
|
|
2339
|
-
async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern) {
|
|
2503
|
+
async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern, concurrency) {
|
|
2340
2504
|
const dataset = await runner.resolveDatasetByName(datasetName);
|
|
2341
2505
|
if (!dataset) {
|
|
2342
2506
|
const known = await runner.collectDatasets();
|
|
@@ -2526,7 +2690,8 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2526
2690
|
});
|
|
2527
2691
|
const snapshot = await runner.runDatasetWith({
|
|
2528
2692
|
datasetId: dataset.id,
|
|
2529
|
-
evaluatorIds: evaluators.map((item) => item.id)
|
|
2693
|
+
evaluatorIds: evaluators.map((item) => item.id),
|
|
2694
|
+
concurrency
|
|
2530
2695
|
});
|
|
2531
2696
|
totalCount = snapshot.totalTestCases;
|
|
2532
2697
|
console.log(colorize("=== Eval Run Started ===", `${ansi2.bold}${ansi2.cyan}`));
|
|
@@ -2615,13 +2780,14 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2615
2780
|
}
|
|
2616
2781
|
console.log(`- artifact: ${colorize(completed.artifactPath, ansi2.dim)}`);
|
|
2617
2782
|
}
|
|
2618
|
-
async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
|
|
2783
|
+
async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern, concurrency) {
|
|
2619
2784
|
return new Promise((resolve5, reject) => {
|
|
2620
2785
|
const app = ink.render(
|
|
2621
2786
|
React2__namespace.createElement(RunView, {
|
|
2622
2787
|
runner,
|
|
2623
2788
|
datasetName,
|
|
2624
2789
|
evaluatorPattern,
|
|
2790
|
+
concurrency,
|
|
2625
2791
|
onComplete: (err) => {
|
|
2626
2792
|
app.unmount();
|
|
2627
2793
|
if (err) {
|
|
@@ -2668,10 +2834,12 @@ async function main() {
|
|
|
2668
2834
|
const runner = createRunner();
|
|
2669
2835
|
try {
|
|
2670
2836
|
if (args.command === "run") {
|
|
2837
|
+
const concurrency = args.concurrency ?? getDefaultConcurrency();
|
|
2671
2838
|
await (useInk ? runSimpleEvalCommandInk : runSimpleEvalCommandPlain)(
|
|
2672
2839
|
runner,
|
|
2673
2840
|
args.datasetName,
|
|
2674
|
-
args.evaluatorPattern
|
|
2841
|
+
args.evaluatorPattern,
|
|
2842
|
+
concurrency
|
|
2675
2843
|
);
|
|
2676
2844
|
return;
|
|
2677
2845
|
}
|