@m4trix/evals 0.15.0 → 0.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +44 -27
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +44 -27
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +56 -26
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +56 -26
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +43 -27
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +67 -4
- package/dist/index.js +42 -28
- package/dist/index.js.map +1 -1
- package/package.json +3 -2
package/dist/index.d.ts
CHANGED
|
@@ -167,6 +167,30 @@ declare class Dataset {
|
|
|
167
167
|
matchesTestCase(testCase: TestCase<unknown>, filePath: string): boolean;
|
|
168
168
|
}
|
|
169
169
|
|
|
170
|
+
/**
|
|
171
|
+
* Options for customizing JSON diff output. Passed to logDiff, createDiffLogEntry, and printJsonDiff.
|
|
172
|
+
* @see https://www.npmjs.com/package/json-diff
|
|
173
|
+
*/
|
|
174
|
+
interface JsonDiffOptions {
|
|
175
|
+
/** Include equal sections of the document, not just deltas */
|
|
176
|
+
full?: boolean;
|
|
177
|
+
/** Sort primitive values in arrays before comparing */
|
|
178
|
+
sort?: boolean;
|
|
179
|
+
/** Compare only keys, ignore value differences */
|
|
180
|
+
keysOnly?: boolean;
|
|
181
|
+
/** Always output these keys when their parent object has any diff (comma-separated or array) */
|
|
182
|
+
outputKeys?: string | string[];
|
|
183
|
+
/** Output only new/updated values (no - lines) */
|
|
184
|
+
outputNewOnly?: boolean;
|
|
185
|
+
/** Exclude these keys from comparison (comma-separated or array) */
|
|
186
|
+
excludeKeys?: string | string[];
|
|
187
|
+
/** Include unchanged values in output */
|
|
188
|
+
keepUnchangedValues?: boolean;
|
|
189
|
+
/** Round floats to this many decimals before comparing */
|
|
190
|
+
precision?: number;
|
|
191
|
+
/** Max ... elisions in a row before collapsing */
|
|
192
|
+
maxElisions?: number;
|
|
193
|
+
}
|
|
170
194
|
interface DiffLogEntry {
|
|
171
195
|
type: 'diff';
|
|
172
196
|
label?: string;
|
|
@@ -174,7 +198,26 @@ interface DiffLogEntry {
|
|
|
174
198
|
actual: unknown;
|
|
175
199
|
diff: string;
|
|
176
200
|
}
|
|
177
|
-
interface PrintJsonDiffOptions {
|
|
201
|
+
interface LogEntry {
|
|
202
|
+
type: 'log';
|
|
203
|
+
label?: string;
|
|
204
|
+
message: string;
|
|
205
|
+
}
|
|
206
|
+
type EvaluatorLogEntry = DiffLogEntry | LogEntry;
|
|
207
|
+
/**
|
|
208
|
+
* Creates a LogEntry for storage in run artifacts. Use for logging objects or text.
|
|
209
|
+
*/
|
|
210
|
+
declare function createLogEntry(message: unknown, options?: {
|
|
211
|
+
label?: string;
|
|
212
|
+
}): LogEntry;
|
|
213
|
+
/**
|
|
214
|
+
* Returns lines from a log entry for display.
|
|
215
|
+
*/
|
|
216
|
+
declare function getLogLines(entry: LogEntry): string[];
|
|
217
|
+
interface CreateDiffLogEntryOptions extends JsonDiffOptions {
|
|
218
|
+
label?: string;
|
|
219
|
+
}
|
|
220
|
+
interface PrintJsonDiffOptions extends JsonDiffOptions {
|
|
178
221
|
/** Enable ANSI colors (default: true) */
|
|
179
222
|
color?: boolean;
|
|
180
223
|
}
|
|
@@ -191,12 +234,27 @@ interface EvalMiddleware<TCtx> {
|
|
|
191
234
|
name: string;
|
|
192
235
|
resolve: () => TCtx | Promise<TCtx>;
|
|
193
236
|
}
|
|
237
|
+
interface EvaluateMeta {
|
|
238
|
+
/** Identifier of the trigger that started the run (for example, a CLI invocation). */
|
|
239
|
+
triggerId: string;
|
|
240
|
+
/**
|
|
241
|
+
* Identifier of the current test-case execution shared across all evaluators
|
|
242
|
+
* for this specific test-case run.
|
|
243
|
+
*/
|
|
244
|
+
runId: string;
|
|
245
|
+
/** Identifier of the dataset currently being evaluated. */
|
|
246
|
+
datasetId: string;
|
|
247
|
+
}
|
|
194
248
|
interface EvaluateArgs<TInput, TOutput = unknown, TCtx = Record<string, never>> {
|
|
195
249
|
input: TInput;
|
|
196
250
|
ctx: TCtx;
|
|
197
251
|
output?: TOutput;
|
|
252
|
+
/** Metadata about the current evaluator invocation. */
|
|
253
|
+
meta: EvaluateMeta;
|
|
198
254
|
/** Records a diff for this test case; stored in run artifact and shown by CLI */
|
|
199
|
-
logDiff: (expected: unknown, actual: unknown, options?: {
|
|
255
|
+
logDiff: (expected: unknown, actual: unknown, options?: CreateDiffLogEntryOptions) => void;
|
|
256
|
+
/** Logs a message or object for this test case; stored in run artifact and shown by CLI */
|
|
257
|
+
log: (message: unknown, options?: {
|
|
200
258
|
label?: string;
|
|
201
259
|
}) => void;
|
|
202
260
|
}
|
|
@@ -304,6 +362,11 @@ interface SearchTestCasesQuery {
|
|
|
304
362
|
excludedPaths?: ReadonlyArray<string | RegExp>;
|
|
305
363
|
}
|
|
306
364
|
interface RunDatasetRequest {
|
|
365
|
+
/**
|
|
366
|
+
* Identifier for what triggered the run request (for example, a CLI command).
|
|
367
|
+
* When omitted, the runner generates one in the format `trg-[uuid]`.
|
|
368
|
+
*/
|
|
369
|
+
triggerId?: string;
|
|
307
370
|
datasetId: string;
|
|
308
371
|
evaluatorIds: ReadonlyArray<string>;
|
|
309
372
|
concurrency?: number;
|
|
@@ -352,7 +415,7 @@ type RunnerEvent = {
|
|
|
352
415
|
scores: ReadonlyArray<ScoreItem>;
|
|
353
416
|
passed: boolean;
|
|
354
417
|
metrics?: ReadonlyArray<MetricItem>;
|
|
355
|
-
logs?: ReadonlyArray<DiffLogEntry>;
|
|
418
|
+
logs?: ReadonlyArray<EvaluatorLogEntry>;
|
|
356
419
|
}>;
|
|
357
420
|
output?: unknown;
|
|
358
421
|
errorMessage?: string;
|
|
@@ -424,4 +487,4 @@ interface BinaryScoreData {
|
|
|
424
487
|
}
|
|
425
488
|
declare const binaryScore: ScoreDef<BinaryScoreData>;
|
|
426
489
|
|
|
427
|
-
export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedTestCase, ConfigType, Dataset, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, Evaluator, EvaluatorOption, FormatMetricOptions, FormatScoreOptions, LatencyData, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TestCase, TokenCountData, ViewLevel, binaryScore, createRunner, defaultRunnerConfig, defineConfig, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
|
|
490
|
+
export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedTestCase, ConfigType, CreateDiffLogEntryOptions, Dataset, DiffLogEntry, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, EvaluateMeta, Evaluator, EvaluatorLogEntry, EvaluatorOption, FormatMetricOptions, FormatScoreOptions, JsonDiffOptions, LatencyData, LogEntry, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TestCase, TokenCountData, ViewLevel, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, getLogLines, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
|
package/dist/index.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { Effect, PubSub, Queue, Fiber, Ref } from 'effect';
|
|
2
2
|
export { Schema as S } from 'effect';
|
|
3
|
-
import {
|
|
3
|
+
import { diffString } from 'json-diff';
|
|
4
4
|
import { randomUUID } from 'crypto';
|
|
5
5
|
import { existsSync } from 'fs';
|
|
6
6
|
import { resolve as resolve$1, relative, join, dirname } from 'path';
|
|
@@ -638,46 +638,48 @@ var binaryScore = Score.of({
|
|
|
638
638
|
},
|
|
639
639
|
aggregate: aggregateAll
|
|
640
640
|
});
|
|
641
|
-
function
|
|
641
|
+
function createDiffString(expected, actual, diffOptions) {
|
|
642
|
+
const opts = { ...diffOptions, color: false };
|
|
643
|
+
const result = diffString(expected, actual, opts);
|
|
644
|
+
return typeof result === "string" ? result : "";
|
|
645
|
+
}
|
|
646
|
+
function formatLogMessage(msg) {
|
|
647
|
+
if (typeof msg === "string")
|
|
648
|
+
return msg;
|
|
642
649
|
try {
|
|
643
|
-
|
|
650
|
+
if (msg !== null && typeof msg === "object") {
|
|
651
|
+
return JSON.stringify(msg, null, 2);
|
|
652
|
+
}
|
|
653
|
+
return String(msg);
|
|
644
654
|
} catch {
|
|
645
|
-
return String(
|
|
655
|
+
return String(msg);
|
|
646
656
|
}
|
|
647
657
|
}
|
|
648
|
-
function
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
partLines.pop();
|
|
655
|
-
}
|
|
656
|
-
for (const line of partLines) {
|
|
657
|
-
lines.push(`${prefix} ${line}`);
|
|
658
|
-
}
|
|
659
|
-
}
|
|
660
|
-
return lines.join("\n");
|
|
658
|
+
function createLogEntry(message, options) {
|
|
659
|
+
return {
|
|
660
|
+
type: "log",
|
|
661
|
+
label: options?.label,
|
|
662
|
+
message: formatLogMessage(message)
|
|
663
|
+
};
|
|
661
664
|
}
|
|
662
|
-
function
|
|
663
|
-
|
|
664
|
-
const actualStr = toJsonLines(actual);
|
|
665
|
-
const changes = diffLines(expectedStr, actualStr);
|
|
666
|
-
return formatDiffString(changes);
|
|
665
|
+
function getLogLines(entry) {
|
|
666
|
+
return entry.message.split("\n");
|
|
667
667
|
}
|
|
668
668
|
function createDiffLogEntry(expected, actual, options) {
|
|
669
|
-
const
|
|
669
|
+
const { label, ...diffOpts } = options ?? {};
|
|
670
|
+
const diff = createDiffString(expected, actual, diffOpts);
|
|
670
671
|
return {
|
|
671
672
|
type: "diff",
|
|
672
|
-
label
|
|
673
|
+
label,
|
|
673
674
|
expected,
|
|
674
675
|
actual,
|
|
675
676
|
diff: diff || "(no differences)"
|
|
676
677
|
};
|
|
677
678
|
}
|
|
678
679
|
function printJsonDiff(expected, actual, options = {}) {
|
|
679
|
-
const
|
|
680
|
-
|
|
680
|
+
const { color = true, ...diffOpts } = options;
|
|
681
|
+
const diff = createDiffString(expected, actual, diffOpts);
|
|
682
|
+
if (color) {
|
|
681
683
|
const lines = diff.split("\n").map((line) => {
|
|
682
684
|
const trimmed = line.trimStart();
|
|
683
685
|
if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
|
|
@@ -1033,6 +1035,7 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1033
1035
|
const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
|
|
1034
1036
|
const rerunPassed = [];
|
|
1035
1037
|
for (let r = 0; r < reruns; r++) {
|
|
1038
|
+
const evaluatorRunId = `run-${randomUUID()}`;
|
|
1036
1039
|
const started = Date.now();
|
|
1037
1040
|
const evaluatorScores = [];
|
|
1038
1041
|
let testCaseError;
|
|
@@ -1047,6 +1050,9 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1047
1050
|
const logDiff = (expected, actual, options) => {
|
|
1048
1051
|
logs.push(createDiffLogEntry(expected, actual, options));
|
|
1049
1052
|
};
|
|
1053
|
+
const log = (message, options) => {
|
|
1054
|
+
logs.push(createLogEntry(message, options));
|
|
1055
|
+
};
|
|
1050
1056
|
const ctx = yield* Effect.promise(
|
|
1051
1057
|
() => Promise.resolve(evaluator.resolveContext())
|
|
1052
1058
|
);
|
|
@@ -1056,7 +1062,13 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1056
1062
|
input: testCaseItem.testCase.getInput(),
|
|
1057
1063
|
ctx,
|
|
1058
1064
|
output,
|
|
1059
|
-
|
|
1065
|
+
meta: {
|
|
1066
|
+
triggerId: task.triggerId,
|
|
1067
|
+
runId: evaluatorRunId,
|
|
1068
|
+
datasetId: task.datasetId
|
|
1069
|
+
},
|
|
1070
|
+
logDiff,
|
|
1071
|
+
log
|
|
1060
1072
|
})
|
|
1061
1073
|
)
|
|
1062
1074
|
);
|
|
@@ -1514,6 +1526,7 @@ var EffectRunner = class {
|
|
|
1514
1526
|
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
1515
1527
|
0
|
|
1516
1528
|
);
|
|
1529
|
+
const triggerId = request.triggerId ?? `trg-${randomUUID()}`;
|
|
1517
1530
|
const runId = `run-${randomUUID()}`;
|
|
1518
1531
|
const artifactPath = createArtifactPath(
|
|
1519
1532
|
this.config.artifactDirectory,
|
|
@@ -1555,6 +1568,7 @@ var EffectRunner = class {
|
|
|
1555
1568
|
await Effect.runPromise(
|
|
1556
1569
|
Queue.offer(this.runQueue, {
|
|
1557
1570
|
runId,
|
|
1571
|
+
triggerId,
|
|
1558
1572
|
datasetId: request.datasetId,
|
|
1559
1573
|
dataset: dataset.dataset,
|
|
1560
1574
|
evaluators: selectedEvaluators,
|
|
@@ -1628,6 +1642,6 @@ var EffectRunner = class {
|
|
|
1628
1642
|
}
|
|
1629
1643
|
};
|
|
1630
1644
|
|
|
1631
|
-
export { Dataset, Evaluator, Metric, Score, TestCase, binaryScore, createRunner, defaultRunnerConfig, defineConfig, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
|
|
1645
|
+
export { Dataset, Evaluator, Metric, Score, TestCase, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, getLogLines, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
|
|
1632
1646
|
//# sourceMappingURL=out.js.map
|
|
1633
1647
|
//# sourceMappingURL=index.js.map
|