@m4trix/evals 0.21.0 → 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +352 -184
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +350 -185
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +294 -155
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +294 -156
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +296 -155
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +1 -2
- package/dist/index.js +294 -156
- package/dist/index.js.map +1 -1
- package/package.json +3 -3
package/dist/cli-simple.js
CHANGED
|
@@ -1,12 +1,14 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
import { randomUUID } from 'crypto';
|
|
3
|
-
import { Effect, PubSub, Queue,
|
|
3
|
+
import { Effect, PubSub, Queue, Ref, Fiber } from 'effect';
|
|
4
4
|
import { existsSync } from 'fs';
|
|
5
5
|
import { resolve, relative, join, parse, dirname } from 'path';
|
|
6
6
|
import * as jitiModule from 'jiti';
|
|
7
7
|
import { writeFile, readdir, readFile, mkdir, appendFile } from 'fs/promises';
|
|
8
8
|
import { pathToFileURL } from 'url';
|
|
9
|
-
import {
|
|
9
|
+
import { diffLines } from 'diff';
|
|
10
|
+
import stringify from 'fast-json-stable-stringify';
|
|
11
|
+
import { cpus } from 'os';
|
|
10
12
|
import * as React2 from 'react';
|
|
11
13
|
import React2__default, { useState, useEffect, useCallback } from 'react';
|
|
12
14
|
import { render, Box, Text } from 'ink';
|
|
@@ -261,10 +263,102 @@ async function collectTestCasesFromFiles(config) {
|
|
|
261
263
|
);
|
|
262
264
|
return found.flat();
|
|
263
265
|
}
|
|
266
|
+
/**
 * Normalizes a value before it is serialized for diffing.
 *
 * The value is walked recursively (arrays and objects) and a new structure is
 * returned; the input is never mutated. Supported options:
 *  - `sort`: arrays are ordered by the stable-stringified form of their
 *    (already processed) elements, so element order does not show up as a diff.
 *  - `excludeKeys`: keys to drop from every object, given as an array or as a
 *    comma-separated string (entries of the string form are trimmed).
 *  - `precision`: numbers are rounded to this many decimal places via `toFixed`.
 *
 * @param {*} value - Value to normalize.
 * @param {{sort?: boolean, excludeKeys?: string[]|string, precision?: number}} [options]
 * @returns {*} The normalized copy (primitives are returned as-is unless rounded).
 */
function preprocessForDiff(value, options) {
  const sortArrays = Boolean(options?.sort);
  const precision = options?.precision;

  // Parse the exclusion list once instead of once per visited object.
  const rawExcluded = options?.excludeKeys;
  let excludedKeys = null;
  if (rawExcluded) {
    const list = Array.isArray(rawExcluded)
      ? rawExcluded
      : String(rawExcluded).split(",").map((key) => key.trim());
    excludedKeys = new Set(list);
  }

  // Elements without a JSON form (undefined, functions) have no sort key;
  // keep them at the end rather than throwing on `undefined.localeCompare`.
  const compareSortKeys = (left, right) => {
    if (left.sortKey === undefined) {
      return right.sortKey === undefined ? 0 : 1;
    }
    if (right.sortKey === undefined) {
      return -1;
    }
    return left.sortKey.localeCompare(right.sortKey);
  };

  const visit = (node) => {
    if (Array.isArray(node)) {
      // Always recurse into elements: excludeKeys/precision must reach values
      // nested inside arrays even when sorting is disabled.
      const items = node.map((item) => visit(item));
      if (!sortArrays) {
        return items;
      }
      // Serialize each element once, not once per comparison.
      return items
        .map((item) => ({ item, sortKey: stringify(item) }))
        .sort(compareSortKeys)
        .map((entry) => entry.item);
    }
    if (node !== null && typeof node === "object") {
      const result = {};
      for (const [key, child] of Object.entries(node)) {
        if (excludedKeys?.has(key)) {
          continue;
        }
        result[key] = visit(child);
      }
      return result;
    }
    if (typeof node === "number" && precision !== undefined) {
      return Number(node.toFixed(precision));
    }
    return node;
  };

  return visit(value);
}
|
|
296
|
+
/**
 * Renders a value as 2-space indented JSON.
 *
 * The value is first serialized with the file's stable `stringify` import and
 * then re-parsed and pretty-printed, so the indented output is derived from
 * the stable serialization.
 *
 * @param {*} value - Value to render.
 * @returns {string} Pretty JSON; the compact serialization if it cannot be
 *   re-parsed; or `String(value)` when the value has no JSON form at all.
 */
function toPrettyJson(value) {
  const compact = stringify(value);
  if (compact === undefined) {
    // undefined / functions / symbols serialize to nothing. Always hand back a
    // string so callers can compare and line-diff the result safely.
    return String(value);
  }
  try {
    return JSON.stringify(JSON.parse(compact), null, 2);
  } catch {
    // Best effort: fall back to the compact form when it is not parseable.
    return compact;
  }
}
|
|
305
|
+
/**
 * Turns a list of diff parts into a single printable string.
 *
 * Every line of an added part is prefixed with "+ ", every line of a removed
 * part with "- ", and lines of unchanged parts get no prefix. The empty
 * segment produced by a part's trailing newline is dropped.
 *
 * @param {Array<{value: string, added?: boolean, removed?: boolean}>} parts
 * @returns {string} The prefixed lines joined with "\n".
 */
function formatDiffParts(parts) {
  const rendered = parts.flatMap((part) => {
    let marker = "";
    if (part.added) {
      marker = "+ ";
    } else if (part.removed) {
      marker = "- ";
    }

    const segments = part.value.split("\n");
    // A value ending in "\n" splits into a final empty segment; skip it.
    if (segments[segments.length - 1] === "") {
      segments.pop();
    }
    return segments.map((segment) => marker + segment);
  });
  return rendered.join("\n");
}
|
|
264
319
|
/**
 * Builds a line-based textual diff between an expected and an actual value.
 *
 * Both values are normalized with `preprocessForDiff` first. With
 * `diffOptions.keysOnly` the diff is computed over the key structure only
 * (see `extractKeys`) and the formatted result is returned as-is. Otherwise
 * the pretty JSON of both values is compared: identical text yields "", and
 * with `diffOptions.outputNewOnly` only the added parts are kept.
 *
 * @param {*} expected - Reference value.
 * @param {*} actual - Value under test.
 * @param {object} [diffOptions] - Options forwarded to `preprocessForDiff`,
 *   plus the `keysOnly` and `outputNewOnly` flags read here.
 * @returns {string} The formatted diff.
 */
function createDiffString(expected, actual, diffOptions) {
  const normalizedExpected = preprocessForDiff(expected, diffOptions);
  const normalizedActual = preprocessForDiff(actual, diffOptions);

  if (diffOptions?.keysOnly) {
    const renderKeys = (processed) => JSON.stringify(extractKeys(processed), null, 2);
    const keyChanges = diffLines(
      renderKeys(normalizedExpected),
      renderKeys(normalizedActual)
    );
    return formatDiffParts(keyChanges);
  }

  const expectedJson = toPrettyJson(normalizedExpected);
  const actualJson = toPrettyJson(normalizedActual);
  if (expectedJson === actualJson) {
    return "";
  }

  const changes = diffLines(expectedJson, actualJson);
  const visible = diffOptions?.outputNewOnly
    ? changes.filter((change) => change.added === true)
    : changes;
  return formatDiffParts(visible);
}
|
|
350
|
+
/**
 * Produces a "shape only" copy of a value: every non-object leaf (including
 * null) is replaced by the placeholder "\xB7", while arrays and object keys
 * are kept, recursively.
 *
 * @param {*} value - Value whose structure should be extracted.
 * @returns {*} The placeholder string, an array of shapes, or an object of shapes.
 */
function extractKeys(value) {
  if (Array.isArray(value)) {
    return value.map(extractKeys);
  }
  if (value !== null && typeof value === "object") {
    const shape = {};
    Object.entries(value).forEach(([key, child]) => {
      shape[key] = extractKeys(child);
    });
    return shape;
  }
  // Anything that is not an object or array collapses to the placeholder.
  return "\xB7";
}
|
|
269
363
|
function formatLogMessage(msg) {
|
|
270
364
|
if (typeof msg === "string")
|
|
@@ -643,6 +737,20 @@ function readOutput(testCase) {
|
|
|
643
737
|
}
|
|
644
738
|
return candidate.getOutput();
|
|
645
739
|
}
|
|
740
|
+
/**
 * Expands test cases into one evaluation unit per rerun.
 *
 * For each item, the rerun count comes from `testCase.getReruns()` when that
 * method exists and defaults to 1 otherwise. The count is normalized to a
 * whole number so that `rerunTotal` always equals the number of units emitted
 * for the test case: fractional counts are rounded up (matching how many loop
 * iterations a fractional bound produces), numeric strings are converted, and
 * non-positive or non-finite counts emit no units.
 *
 * @param {Array<{testCase: {getReruns?: () => number}}>} testCases
 * @returns {Array<{testCaseItem: object, rerunIndex: number, rerunTotal: number}>}
 *   Units in input order; `rerunIndex` is 1-based.
 */
function buildEvaluationUnits(testCases) {
  const units = [];
  for (const testCaseItem of testCases) {
    const { testCase } = testCaseItem;
    const requested = typeof testCase.getReruns === "function" ? Number(testCase.getReruns()) : 1;
    // Keep rerunTotal an exact unit count; a fractional or string total would
    // never strictly equal a per-test-case completion counter.
    const rerunTotal = Number.isFinite(requested) && requested > 0 ? Math.ceil(requested) : 0;
    for (let rerunIndex = 1; rerunIndex <= rerunTotal; rerunIndex++) {
      units.push({ testCaseItem, rerunIndex, rerunTotal });
    }
  }
  return units;
}
|
|
646
754
|
function nowIsoForFile() {
|
|
647
755
|
return (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
648
756
|
}
|
|
@@ -652,157 +760,171 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
|
652
760
|
`${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
|
|
653
761
|
);
|
|
654
762
|
}
|
|
655
|
-
function
|
|
763
|
+
function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
|
|
764
|
+
const { testCaseItem, rerunIndex, rerunTotal } = unit;
|
|
656
765
|
return Effect.gen(function* () {
|
|
657
|
-
const
|
|
658
|
-
const
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
|
|
721
|
-
logs.push(taggedEntry ?? createLogEntry(result));
|
|
722
|
-
testCaseError = result.message;
|
|
723
|
-
evaluatorScores.push({
|
|
724
|
-
evaluatorId,
|
|
725
|
-
scores: [],
|
|
726
|
-
passed: false,
|
|
727
|
-
logs: logs.length > 0 ? logs : void 0
|
|
728
|
-
});
|
|
729
|
-
continue;
|
|
730
|
-
}
|
|
731
|
-
const { scores, metrics } = normalizeResult(result);
|
|
732
|
-
const passed2 = computeEvaluatorPassed(evaluator, result, scores);
|
|
733
|
-
evaluatorScores.push({
|
|
734
|
-
evaluatorId,
|
|
735
|
-
scores,
|
|
736
|
-
passed: passed2,
|
|
737
|
-
metrics,
|
|
738
|
-
logs: logs.length > 0 ? logs : void 0
|
|
739
|
-
});
|
|
740
|
-
} catch (error) {
|
|
741
|
-
if (error instanceof Error) {
|
|
742
|
-
const taggedEntry = error[evaluatorErrorLogEntryKey];
|
|
743
|
-
logs.push(taggedEntry ?? createLogEntry(error));
|
|
744
|
-
}
|
|
745
|
-
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
766
|
+
const evaluatorRunId = `run-${randomUUID()}`;
|
|
767
|
+
const started = Date.now();
|
|
768
|
+
const startedEvaluations = yield* Ref.modify(startedRef, (n) => [
|
|
769
|
+
n + 1,
|
|
770
|
+
n + 1
|
|
771
|
+
]);
|
|
772
|
+
yield* publishEvent({
|
|
773
|
+
type: "TestCaseStarted",
|
|
774
|
+
runId: task.runId,
|
|
775
|
+
testCaseId: testCaseItem.id,
|
|
776
|
+
testCaseName: testCaseItem.testCase.getName(),
|
|
777
|
+
startedTestCases: startedEvaluations,
|
|
778
|
+
totalTestCases: totalEvaluations,
|
|
779
|
+
rerunIndex,
|
|
780
|
+
rerunTotal
|
|
781
|
+
});
|
|
782
|
+
const evaluatorScores = [];
|
|
783
|
+
let testCaseError;
|
|
784
|
+
const output = readOutput(testCaseItem.testCase);
|
|
785
|
+
for (const { id: evaluatorId, evaluator } of task.evaluators) {
|
|
786
|
+
const evaluateFn = evaluator.getEvaluateFn();
|
|
787
|
+
if (!evaluateFn) {
|
|
788
|
+
continue;
|
|
789
|
+
}
|
|
790
|
+
const logs = [];
|
|
791
|
+
const logDiff = (expected, actual, options) => {
|
|
792
|
+
logs.push(createDiffLogEntry(expected, actual, options));
|
|
793
|
+
};
|
|
794
|
+
const log = (message, options) => {
|
|
795
|
+
logs.push(createLogEntry(message, options));
|
|
796
|
+
};
|
|
797
|
+
const createError = (message, options) => {
|
|
798
|
+
const entry = createLogEntry(message, options);
|
|
799
|
+
const error = message instanceof Error ? message : new Error(entry.message);
|
|
800
|
+
error[evaluatorErrorLogEntryKey] = entry;
|
|
801
|
+
return error;
|
|
802
|
+
};
|
|
803
|
+
try {
|
|
804
|
+
const ctx = yield* Effect.promise(
|
|
805
|
+
() => Promise.resolve(evaluator.resolveContext())
|
|
806
|
+
);
|
|
807
|
+
const result = yield* Effect.promise(
|
|
808
|
+
() => Promise.resolve().then(
|
|
809
|
+
() => evaluateFn({
|
|
810
|
+
input: testCaseItem.testCase.getInput(),
|
|
811
|
+
ctx,
|
|
812
|
+
output,
|
|
813
|
+
meta: {
|
|
814
|
+
triggerId: task.triggerId,
|
|
815
|
+
runId: evaluatorRunId,
|
|
816
|
+
datasetId: task.datasetId
|
|
817
|
+
},
|
|
818
|
+
logDiff,
|
|
819
|
+
log,
|
|
820
|
+
createError
|
|
821
|
+
})
|
|
822
|
+
)
|
|
823
|
+
);
|
|
824
|
+
if (result instanceof Error) {
|
|
825
|
+
const evaluatorError = result;
|
|
826
|
+
const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
|
|
827
|
+
logs.push(taggedEntry ?? createLogEntry(result));
|
|
828
|
+
testCaseError = result.message;
|
|
746
829
|
evaluatorScores.push({
|
|
747
830
|
evaluatorId,
|
|
748
831
|
scores: [],
|
|
749
832
|
passed: false,
|
|
750
833
|
logs: logs.length > 0 ? logs : void 0
|
|
751
834
|
});
|
|
835
|
+
continue;
|
|
836
|
+
}
|
|
837
|
+
const { scores, metrics } = normalizeResult(result);
|
|
838
|
+
const passed = computeEvaluatorPassed(evaluator, result, scores);
|
|
839
|
+
evaluatorScores.push({
|
|
840
|
+
evaluatorId,
|
|
841
|
+
scores,
|
|
842
|
+
passed,
|
|
843
|
+
metrics,
|
|
844
|
+
logs: logs.length > 0 ? logs : void 0
|
|
845
|
+
});
|
|
846
|
+
} catch (error) {
|
|
847
|
+
if (error instanceof Error) {
|
|
848
|
+
const taggedEntry = error[evaluatorErrorLogEntryKey];
|
|
849
|
+
logs.push(taggedEntry ?? createLogEntry(error));
|
|
752
850
|
}
|
|
851
|
+
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
852
|
+
evaluatorScores.push({
|
|
853
|
+
evaluatorId,
|
|
854
|
+
scores: [],
|
|
855
|
+
passed: false,
|
|
856
|
+
logs: logs.length > 0 ? logs : void 0
|
|
857
|
+
});
|
|
753
858
|
}
|
|
754
|
-
const rerunPassedThis = evaluatorScores.every((s) => s.passed);
|
|
755
|
-
rerunPassed.push(rerunPassedThis);
|
|
756
|
-
const completedEvaluations = yield* Ref.modify(completedRef, (n) => [
|
|
757
|
-
n + 1,
|
|
758
|
-
n + 1
|
|
759
|
-
]);
|
|
760
|
-
const progressEvent = {
|
|
761
|
-
type: "TestCaseProgress",
|
|
762
|
-
runId: task.runId,
|
|
763
|
-
testCaseId: testCaseItem.id,
|
|
764
|
-
testCaseName: testCaseItem.testCase.getName(),
|
|
765
|
-
completedTestCases: completedEvaluations,
|
|
766
|
-
totalTestCases: totalEvaluations,
|
|
767
|
-
rerunIndex: r + 1,
|
|
768
|
-
rerunTotal: reruns,
|
|
769
|
-
passed: rerunPassedThis,
|
|
770
|
-
durationMs: Date.now() - started,
|
|
771
|
-
evaluatorScores,
|
|
772
|
-
output,
|
|
773
|
-
errorMessage: testCaseError
|
|
774
|
-
};
|
|
775
|
-
updateSnapshot(task.runId, (snapshot) => ({
|
|
776
|
-
...snapshot,
|
|
777
|
-
completedTestCases: completedEvaluations
|
|
778
|
-
}));
|
|
779
|
-
yield* publishEvent(progressEvent);
|
|
780
|
-
yield* Queue.offer(persistenceQueue, {
|
|
781
|
-
runId: task.runId,
|
|
782
|
-
artifactPath: task.snapshot.artifactPath,
|
|
783
|
-
payload: progressEvent
|
|
784
|
-
});
|
|
785
859
|
}
|
|
786
|
-
const
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
yield* Ref.update(failedRef, (n) => n + 1);
|
|
791
|
-
}
|
|
792
|
-
const [passed, failed] = yield* Effect.all([
|
|
793
|
-
Ref.get(passedRef),
|
|
794
|
-
Ref.get(failedRef)
|
|
860
|
+
const rerunPassedThis = evaluatorScores.every((s) => s.passed);
|
|
861
|
+
const completedEvaluations = yield* Ref.modify(completedRef, (n) => [
|
|
862
|
+
n + 1,
|
|
863
|
+
n + 1
|
|
795
864
|
]);
|
|
796
|
-
|
|
865
|
+
const progressEvent = {
|
|
866
|
+
type: "TestCaseProgress",
|
|
867
|
+
runId: task.runId,
|
|
868
|
+
testCaseId: testCaseItem.id,
|
|
869
|
+
testCaseName: testCaseItem.testCase.getName(),
|
|
870
|
+
completedTestCases: completedEvaluations,
|
|
871
|
+
totalTestCases: totalEvaluations,
|
|
872
|
+
rerunIndex,
|
|
873
|
+
rerunTotal,
|
|
874
|
+
passed: rerunPassedThis,
|
|
875
|
+
durationMs: Date.now() - started,
|
|
876
|
+
evaluatorScores,
|
|
877
|
+
output,
|
|
878
|
+
errorMessage: testCaseError
|
|
879
|
+
};
|
|
880
|
+
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
797
881
|
...snapshot,
|
|
798
|
-
|
|
799
|
-
failedTestCases: failed
|
|
882
|
+
completedTestCases: completedEvaluations
|
|
800
883
|
}));
|
|
884
|
+
yield* publishEvent(progressEvent);
|
|
885
|
+
yield* Queue.offer(persistenceQueue, {
|
|
886
|
+
runId: task.runId,
|
|
887
|
+
artifactPath: task.snapshot.artifactPath,
|
|
888
|
+
payload: progressEvent
|
|
889
|
+
});
|
|
890
|
+
const testCaseCompleted = yield* Ref.modify(
|
|
891
|
+
testCaseResultsRef,
|
|
892
|
+
(map) => {
|
|
893
|
+
const key = testCaseItem.id;
|
|
894
|
+
const existing = map.get(key) ?? { completedCount: 0, results: [] };
|
|
895
|
+
const newResults = [...existing.results, rerunPassedThis];
|
|
896
|
+
const newCompletedCount = existing.completedCount + 1;
|
|
897
|
+
const isLast = newCompletedCount === rerunTotal;
|
|
898
|
+
const newMap = new Map(map);
|
|
899
|
+
newMap.set(key, {
|
|
900
|
+
completedCount: newCompletedCount,
|
|
901
|
+
results: newResults
|
|
902
|
+
});
|
|
903
|
+
const outcome = isLast ? newResults.every(Boolean) : null;
|
|
904
|
+
return [outcome, newMap];
|
|
905
|
+
}
|
|
906
|
+
);
|
|
907
|
+
if (testCaseCompleted !== null) {
|
|
908
|
+
if (testCaseCompleted) {
|
|
909
|
+
yield* Ref.update(passedRef, (n) => n + 1);
|
|
910
|
+
} else {
|
|
911
|
+
yield* Ref.update(failedRef, (n) => n + 1);
|
|
912
|
+
}
|
|
913
|
+
const [passed, failed] = yield* Effect.all([
|
|
914
|
+
Ref.get(passedRef),
|
|
915
|
+
Ref.get(failedRef)
|
|
916
|
+
]);
|
|
917
|
+
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
918
|
+
...snapshot,
|
|
919
|
+
passedTestCases: passed,
|
|
920
|
+
failedTestCases: failed
|
|
921
|
+
}));
|
|
922
|
+
}
|
|
801
923
|
});
|
|
802
924
|
}
|
|
803
925
|
var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => Effect.gen(function* () {
|
|
804
926
|
const startedAt = Date.now();
|
|
805
|
-
updateSnapshot(task.runId, (snapshot) => ({
|
|
927
|
+
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
806
928
|
...snapshot,
|
|
807
929
|
status: "running",
|
|
808
930
|
startedAt
|
|
@@ -821,9 +943,13 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
821
943
|
const startedRef = yield* Ref.make(0);
|
|
822
944
|
const passedRef = yield* Ref.make(0);
|
|
823
945
|
const failedRef = yield* Ref.make(0);
|
|
824
|
-
const
|
|
946
|
+
const testCaseResultsRef = yield* Ref.make(
|
|
947
|
+
/* @__PURE__ */ new Map()
|
|
948
|
+
);
|
|
949
|
+
const evaluationUnits = buildEvaluationUnits(task.testCases);
|
|
950
|
+
const processEvaluation = (unit) => processOneEvaluation(
|
|
825
951
|
task,
|
|
826
|
-
|
|
952
|
+
unit,
|
|
827
953
|
totalEvaluations,
|
|
828
954
|
publishEvent,
|
|
829
955
|
persistenceQueue,
|
|
@@ -831,11 +957,12 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
831
957
|
startedRef,
|
|
832
958
|
completedRef,
|
|
833
959
|
passedRef,
|
|
834
|
-
failedRef
|
|
960
|
+
failedRef,
|
|
961
|
+
testCaseResultsRef
|
|
835
962
|
);
|
|
836
963
|
yield* Effect.forEach(
|
|
837
|
-
|
|
838
|
-
|
|
964
|
+
evaluationUnits,
|
|
965
|
+
processEvaluation,
|
|
839
966
|
maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
|
|
840
967
|
);
|
|
841
968
|
const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* Effect.all([
|
|
@@ -853,7 +980,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
853
980
|
totalTestCases: task.testCases.length,
|
|
854
981
|
artifactPath: task.snapshot.artifactPath
|
|
855
982
|
};
|
|
856
|
-
updateSnapshot(task.runId, (snapshot) => ({
|
|
983
|
+
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
857
984
|
...snapshot,
|
|
858
985
|
status: "completed",
|
|
859
986
|
completedTestCases: completedEvaluations,
|
|
@@ -1106,7 +1233,9 @@ var EffectRunner = class {
|
|
|
1106
1233
|
this.persistenceQueue = Effect.runSync(
|
|
1107
1234
|
Queue.unbounded()
|
|
1108
1235
|
);
|
|
1109
|
-
this.
|
|
1236
|
+
this.snapshotsRef = Effect.runSync(
|
|
1237
|
+
Ref.make(/* @__PURE__ */ new Map())
|
|
1238
|
+
);
|
|
1110
1239
|
this.listeners = /* @__PURE__ */ new Set();
|
|
1111
1240
|
this.datasetsById = /* @__PURE__ */ new Map();
|
|
1112
1241
|
this.evaluatorsById = /* @__PURE__ */ new Map();
|
|
@@ -1209,7 +1338,13 @@ var EffectRunner = class {
|
|
|
1209
1338
|
status: "queued",
|
|
1210
1339
|
artifactPath
|
|
1211
1340
|
};
|
|
1212
|
-
|
|
1341
|
+
await Effect.runPromise(
|
|
1342
|
+
Ref.update(this.snapshotsRef, (map) => {
|
|
1343
|
+
const next = new Map(map);
|
|
1344
|
+
next.set(runId, snapshot);
|
|
1345
|
+
return next;
|
|
1346
|
+
})
|
|
1347
|
+
);
|
|
1213
1348
|
const queuedEvent = {
|
|
1214
1349
|
type: "RunQueued",
|
|
1215
1350
|
runId,
|
|
@@ -1250,12 +1385,12 @@ var EffectRunner = class {
|
|
|
1250
1385
|
};
|
|
1251
1386
|
}
|
|
1252
1387
|
getRunSnapshot(runId) {
|
|
1253
|
-
return this.
|
|
1388
|
+
return Effect.runSync(Ref.get(this.snapshotsRef)).get(runId);
|
|
1254
1389
|
}
|
|
1255
1390
|
getAllRunSnapshots() {
|
|
1256
|
-
return Array.from(
|
|
1257
|
-
(
|
|
1258
|
-
);
|
|
1391
|
+
return Array.from(
|
|
1392
|
+
Effect.runSync(Ref.get(this.snapshotsRef)).values()
|
|
1393
|
+
).sort((a, b) => b.queuedAt - a.queuedAt);
|
|
1259
1394
|
}
|
|
1260
1395
|
async loadRunSnapshotsFromArtifacts() {
|
|
1261
1396
|
return loadRunSnapshotsFromArtifacts(this.config);
|
|
@@ -1284,11 +1419,15 @@ var EffectRunner = class {
|
|
|
1284
1419
|
);
|
|
1285
1420
|
}
|
|
1286
1421
|
updateSnapshot(runId, updater) {
|
|
1287
|
-
|
|
1288
|
-
|
|
1289
|
-
|
|
1290
|
-
|
|
1291
|
-
|
|
1422
|
+
return Ref.modify(this.snapshotsRef, (map) => {
|
|
1423
|
+
const existing = map.get(runId);
|
|
1424
|
+
if (!existing) {
|
|
1425
|
+
return [void 0, map];
|
|
1426
|
+
}
|
|
1427
|
+
const next = new Map(map);
|
|
1428
|
+
next.set(runId, updater(existing));
|
|
1429
|
+
return [void 0, next];
|
|
1430
|
+
}).pipe(Effect.asVoid);
|
|
1292
1431
|
}
|
|
1293
1432
|
publishEvent(event) {
|
|
1294
1433
|
return Effect.sync(() => {
|
|
@@ -1304,8 +1443,9 @@ var EffectRunner = class {
|
|
|
1304
1443
|
);
|
|
1305
1444
|
}
|
|
1306
1445
|
};
|
|
1307
|
-
|
|
1308
|
-
|
|
1446
|
+
/**
 * Default number of evaluations to run concurrently: the number of entries
 * reported by `os.cpus()`, but never less than 1.
 *
 * @returns {number} Concurrency to use when none is given explicitly.
 */
function getDefaultConcurrency() {
  const coreCount = cpus().length;
  // cpus() can report an empty list; fall back to sequential execution.
  return coreCount < 1 ? 1 : coreCount;
}
|
|
1309
1449
|
function parseSimpleCliArgs(argv) {
|
|
1310
1450
|
const args = {
|
|
1311
1451
|
help: false,
|
|
@@ -1332,6 +1472,14 @@ function parseSimpleCliArgs(argv) {
|
|
|
1332
1472
|
index += 1;
|
|
1333
1473
|
continue;
|
|
1334
1474
|
}
|
|
1475
|
+
if ((token === "--concurrency" || token === "-c") && argv[index + 1]) {
|
|
1476
|
+
const n = parseInt(argv[index + 1], 10);
|
|
1477
|
+
if (!Number.isNaN(n) && n >= 1) {
|
|
1478
|
+
args.concurrency = n;
|
|
1479
|
+
}
|
|
1480
|
+
index += 1;
|
|
1481
|
+
continue;
|
|
1482
|
+
}
|
|
1335
1483
|
args.unknownArgs.push(token);
|
|
1336
1484
|
}
|
|
1337
1485
|
return args;
|
|
@@ -1339,9 +1487,12 @@ function parseSimpleCliArgs(argv) {
|
|
|
1339
1487
|
function getSimpleCliUsage() {
|
|
1340
1488
|
return [
|
|
1341
1489
|
"Usage:",
|
|
1342
|
-
" eval-agents-simple run --dataset <datasetName> --evaluator <name-or-pattern>",
|
|
1490
|
+
" eval-agents-simple run --dataset <datasetName> --evaluator <name-or-pattern> [--concurrency N]",
|
|
1343
1491
|
" eval-agents-simple generate --dataset <datasetName>",
|
|
1344
1492
|
"",
|
|
1493
|
+
"Options:",
|
|
1494
|
+
" --concurrency, -c N Max concurrent test cases (default: CPU count). Use 1 for sequential.",
|
|
1495
|
+
"",
|
|
1345
1496
|
"Pattern examples for --evaluator:",
|
|
1346
1497
|
" score-evaluator exact name (case-insensitive)",
|
|
1347
1498
|
' "*score*" wildcard pattern',
|
|
@@ -1630,6 +1781,7 @@ function RunView({
|
|
|
1630
1781
|
runner,
|
|
1631
1782
|
datasetName,
|
|
1632
1783
|
evaluatorPattern,
|
|
1784
|
+
concurrency,
|
|
1633
1785
|
onComplete
|
|
1634
1786
|
}) {
|
|
1635
1787
|
const [phase, setPhase] = useState(
|
|
@@ -1777,7 +1929,8 @@ function RunView({
|
|
|
1777
1929
|
});
|
|
1778
1930
|
const snapshot = await runner.runDatasetWith({
|
|
1779
1931
|
datasetId: dataset.id,
|
|
1780
|
-
evaluatorIds: evaluators.map((item) => item.id)
|
|
1932
|
+
evaluatorIds: evaluators.map((item) => item.id),
|
|
1933
|
+
concurrency
|
|
1781
1934
|
});
|
|
1782
1935
|
setRunInfo({
|
|
1783
1936
|
runId: snapshot.runId,
|
|
@@ -1805,7 +1958,7 @@ function RunView({
|
|
|
1805
1958
|
});
|
|
1806
1959
|
setPhase("completed");
|
|
1807
1960
|
setTimeout(() => onComplete(), 200);
|
|
1808
|
-
}, [runner, datasetName, evaluatorPattern, onComplete]);
|
|
1961
|
+
}, [runner, datasetName, evaluatorPattern, concurrency, onComplete]);
|
|
1809
1962
|
useEffect(() => {
|
|
1810
1963
|
void runEval();
|
|
1811
1964
|
}, [runEval]);
|
|
@@ -1848,22 +2001,30 @@ function RunView({
|
|
|
1848
2001
|
label: `Evaluations ${completedEvaluations}/${runInfo?.totalTestCases ?? 0} completed \u2022 ${startedEvaluations}/${runInfo?.totalTestCases ?? 0} started`
|
|
1849
2002
|
}
|
|
1850
2003
|
),
|
|
1851
|
-
runningEvaluations.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginTop: 1, children: runningEvaluations.map((item) => /* @__PURE__ */ jsxs(
|
|
1852
|
-
|
|
1853
|
-
|
|
1854
|
-
|
|
1855
|
-
|
|
1856
|
-
|
|
1857
|
-
|
|
1858
|
-
|
|
1859
|
-
|
|
1860
|
-
|
|
1861
|
-
|
|
1862
|
-
|
|
1863
|
-
|
|
1864
|
-
|
|
1865
|
-
|
|
1866
|
-
|
|
2004
|
+
runningEvaluations.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginTop: 1, children: runningEvaluations.map((item) => /* @__PURE__ */ jsxs(
|
|
2005
|
+
Text,
|
|
2006
|
+
{
|
|
2007
|
+
color: "yellow",
|
|
2008
|
+
children: [
|
|
2009
|
+
"[running ",
|
|
2010
|
+
item.startedTestCases,
|
|
2011
|
+
"/",
|
|
2012
|
+
item.totalTestCases,
|
|
2013
|
+
"]",
|
|
2014
|
+
" ",
|
|
2015
|
+
item.name,
|
|
2016
|
+
" ",
|
|
2017
|
+
/* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
2018
|
+
"(",
|
|
2019
|
+
item.rerunIndex,
|
|
2020
|
+
"/",
|
|
2021
|
+
item.rerunTotal,
|
|
2022
|
+
")"
|
|
2023
|
+
] })
|
|
2024
|
+
]
|
|
2025
|
+
},
|
|
2026
|
+
`${item.testCaseId}:${item.rerunIndex}`
|
|
2027
|
+
)) })
|
|
1867
2028
|
] }),
|
|
1868
2029
|
testCases.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc) => /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginBottom: 0, children: [
|
|
1869
2030
|
/* @__PURE__ */ jsxs(Text, { children: [
|
|
@@ -1945,7 +2106,7 @@ function RunView({
|
|
|
1945
2106
|
},
|
|
1946
2107
|
`${item.evaluatorId}-${s.id}-${idx}`
|
|
1947
2108
|
);
|
|
1948
|
-
}) : /* @__PURE__ */ jsx(Text, { color: "gray", children: "
|
|
2109
|
+
}) : /* @__PURE__ */ jsx(Text, { color: "gray", children: " n/a" }),
|
|
1949
2110
|
!item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsx(Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
|
|
1950
2111
|
(log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: getDiffLines(log).map(
|
|
1951
2112
|
({ type, line }, lineIdx) => /* @__PURE__ */ jsx(
|
|
@@ -2003,9 +2164,9 @@ function RunView({
|
|
|
2003
2164
|
/* @__PURE__ */ jsx(Text, { color: "magenta", children: "evaluator averages" }),
|
|
2004
2165
|
Array.from(evaluatorNameById.entries()).map(([id, name]) => {
|
|
2005
2166
|
const agg = summary.aggregates.get(id);
|
|
2006
|
-
const scoreKeys = [
|
|
2007
|
-
(
|
|
2008
|
-
);
|
|
2167
|
+
const scoreKeys = [
|
|
2168
|
+
...summary.scoreItemsByEvaluatorScore?.keys() ?? []
|
|
2169
|
+
].filter((k) => k.startsWith(`${id}:`));
|
|
2009
2170
|
if (scoreKeys.length === 0) {
|
|
2010
2171
|
return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
2011
2172
|
"- ",
|
|
@@ -2313,7 +2474,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
|
2313
2474
|
}
|
|
2314
2475
|
return lines;
|
|
2315
2476
|
}
|
|
2316
|
-
async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern) {
|
|
2477
|
+
async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern, concurrency) {
|
|
2317
2478
|
const dataset = await runner.resolveDatasetByName(datasetName);
|
|
2318
2479
|
if (!dataset) {
|
|
2319
2480
|
const known = await runner.collectDatasets();
|
|
@@ -2503,7 +2664,8 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2503
2664
|
});
|
|
2504
2665
|
const snapshot = await runner.runDatasetWith({
|
|
2505
2666
|
datasetId: dataset.id,
|
|
2506
|
-
evaluatorIds: evaluators.map((item) => item.id)
|
|
2667
|
+
evaluatorIds: evaluators.map((item) => item.id),
|
|
2668
|
+
concurrency
|
|
2507
2669
|
});
|
|
2508
2670
|
totalCount = snapshot.totalTestCases;
|
|
2509
2671
|
console.log(colorize("=== Eval Run Started ===", `${ansi2.bold}${ansi2.cyan}`));
|
|
@@ -2592,13 +2754,14 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2592
2754
|
}
|
|
2593
2755
|
console.log(`- artifact: ${colorize(completed.artifactPath, ansi2.dim)}`);
|
|
2594
2756
|
}
|
|
2595
|
-
async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
|
|
2757
|
+
async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern, concurrency) {
|
|
2596
2758
|
return new Promise((resolve5, reject) => {
|
|
2597
2759
|
const app = render(
|
|
2598
2760
|
React2.createElement(RunView, {
|
|
2599
2761
|
runner,
|
|
2600
2762
|
datasetName,
|
|
2601
2763
|
evaluatorPattern,
|
|
2764
|
+
concurrency,
|
|
2602
2765
|
onComplete: (err) => {
|
|
2603
2766
|
app.unmount();
|
|
2604
2767
|
if (err) {
|
|
@@ -2645,10 +2808,12 @@ async function main() {
|
|
|
2645
2808
|
const runner = createRunner();
|
|
2646
2809
|
try {
|
|
2647
2810
|
if (args.command === "run") {
|
|
2811
|
+
const concurrency = args.concurrency ?? getDefaultConcurrency();
|
|
2648
2812
|
await (useInk ? runSimpleEvalCommandInk : runSimpleEvalCommandPlain)(
|
|
2649
2813
|
runner,
|
|
2650
2814
|
args.datasetName,
|
|
2651
|
-
args.evaluatorPattern
|
|
2815
|
+
args.evaluatorPattern,
|
|
2816
|
+
concurrency
|
|
2652
2817
|
);
|
|
2653
2818
|
return;
|
|
2654
2819
|
}
|