@m4trix/evals 0.14.0 → 0.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +200 -111
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +200 -111
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +100 -44
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +100 -44
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +35 -27
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +49 -4
- package/dist/index.js +34 -28
- package/dist/index.js.map +1 -1
- package/package.json +3 -2
package/dist/index.d.ts
CHANGED
|
@@ -167,6 +167,30 @@ declare class Dataset {
|
|
|
167
167
|
matchesTestCase(testCase: TestCase<unknown>, filePath: string): boolean;
|
|
168
168
|
}
|
|
169
169
|
|
|
170
|
+
/**
|
|
171
|
+
* Options for customizing JSON diff output. Passed to logDiff, createDiffLogEntry, and printJsonDiff.
|
|
172
|
+
* @see https://www.npmjs.com/package/json-diff
|
|
173
|
+
*/
|
|
174
|
+
interface JsonDiffOptions {
|
|
175
|
+
/** Include equal sections of the document, not just deltas */
|
|
176
|
+
full?: boolean;
|
|
177
|
+
/** Sort primitive values in arrays before comparing */
|
|
178
|
+
sort?: boolean;
|
|
179
|
+
/** Compare only keys, ignore value differences */
|
|
180
|
+
keysOnly?: boolean;
|
|
181
|
+
/** Always output these keys when their parent object has any diff (comma-separated or array) */
|
|
182
|
+
outputKeys?: string | string[];
|
|
183
|
+
/** Output only new/updated values (no - lines) */
|
|
184
|
+
outputNewOnly?: boolean;
|
|
185
|
+
/** Exclude these keys from comparison (comma-separated or array) */
|
|
186
|
+
excludeKeys?: string | string[];
|
|
187
|
+
/** Include unchanged values in output */
|
|
188
|
+
keepUnchangedValues?: boolean;
|
|
189
|
+
/** Round floats to this many decimals before comparing */
|
|
190
|
+
precision?: number;
|
|
191
|
+
/** Max ... elisions in a row before collapsing */
|
|
192
|
+
maxElisions?: number;
|
|
193
|
+
}
|
|
170
194
|
interface DiffLogEntry {
|
|
171
195
|
type: 'diff';
|
|
172
196
|
label?: string;
|
|
@@ -174,7 +198,26 @@ interface DiffLogEntry {
|
|
|
174
198
|
actual: unknown;
|
|
175
199
|
diff: string;
|
|
176
200
|
}
|
|
177
|
-
interface
|
|
201
|
+
interface LogEntry {
|
|
202
|
+
type: 'log';
|
|
203
|
+
label?: string;
|
|
204
|
+
message: string;
|
|
205
|
+
}
|
|
206
|
+
type EvaluatorLogEntry = DiffLogEntry | LogEntry;
|
|
207
|
+
/**
|
|
208
|
+
* Creates a LogEntry for storage in run artifacts. Use for logging objects or text.
|
|
209
|
+
*/
|
|
210
|
+
declare function createLogEntry(message: unknown, options?: {
|
|
211
|
+
label?: string;
|
|
212
|
+
}): LogEntry;
|
|
213
|
+
/**
|
|
214
|
+
* Returns lines from a log entry for display.
|
|
215
|
+
*/
|
|
216
|
+
declare function getLogLines(entry: LogEntry): string[];
|
|
217
|
+
interface CreateDiffLogEntryOptions extends JsonDiffOptions {
|
|
218
|
+
label?: string;
|
|
219
|
+
}
|
|
220
|
+
interface PrintJsonDiffOptions extends JsonDiffOptions {
|
|
178
221
|
/** Enable ANSI colors (default: true) */
|
|
179
222
|
color?: boolean;
|
|
180
223
|
}
|
|
@@ -196,7 +239,9 @@ interface EvaluateArgs<TInput, TOutput = unknown, TCtx = Record<string, never>>
|
|
|
196
239
|
ctx: TCtx;
|
|
197
240
|
output?: TOutput;
|
|
198
241
|
/** Records a diff for this test case; stored in run artifact and shown by CLI */
|
|
199
|
-
logDiff: (expected: unknown, actual: unknown, options?:
|
|
242
|
+
logDiff: (expected: unknown, actual: unknown, options?: CreateDiffLogEntryOptions) => void;
|
|
243
|
+
/** Logs a message or object for this test case; stored in run artifact and shown by CLI */
|
|
244
|
+
log: (message: unknown, options?: {
|
|
200
245
|
label?: string;
|
|
201
246
|
}) => void;
|
|
202
247
|
}
|
|
@@ -352,7 +397,7 @@ type RunnerEvent = {
|
|
|
352
397
|
scores: ReadonlyArray<ScoreItem>;
|
|
353
398
|
passed: boolean;
|
|
354
399
|
metrics?: ReadonlyArray<MetricItem>;
|
|
355
|
-
logs?: ReadonlyArray<
|
|
400
|
+
logs?: ReadonlyArray<EvaluatorLogEntry>;
|
|
356
401
|
}>;
|
|
357
402
|
output?: unknown;
|
|
358
403
|
errorMessage?: string;
|
|
@@ -424,4 +469,4 @@ interface BinaryScoreData {
|
|
|
424
469
|
}
|
|
425
470
|
declare const binaryScore: ScoreDef<BinaryScoreData>;
|
|
426
471
|
|
|
427
|
-
export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedTestCase, ConfigType, Dataset, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, Evaluator, EvaluatorOption, FormatMetricOptions, FormatScoreOptions, LatencyData, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TestCase, TokenCountData, ViewLevel, binaryScore, createRunner, defaultRunnerConfig, defineConfig, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
|
|
472
|
+
export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedTestCase, ConfigType, CreateDiffLogEntryOptions, Dataset, DiffLogEntry, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, Evaluator, EvaluatorLogEntry, EvaluatorOption, FormatMetricOptions, FormatScoreOptions, JsonDiffOptions, LatencyData, LogEntry, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TestCase, TokenCountData, ViewLevel, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, getLogLines, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
|
package/dist/index.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { Effect, PubSub, Queue, Fiber, Ref } from 'effect';
|
|
2
2
|
export { Schema as S } from 'effect';
|
|
3
|
-
import {
|
|
3
|
+
import { diffString } from 'json-diff';
|
|
4
4
|
import { randomUUID } from 'crypto';
|
|
5
5
|
import { existsSync } from 'fs';
|
|
6
6
|
import { resolve as resolve$1, relative, join, dirname } from 'path';
|
|
@@ -638,46 +638,48 @@ var binaryScore = Score.of({
|
|
|
638
638
|
},
|
|
639
639
|
aggregate: aggregateAll
|
|
640
640
|
});
|
|
641
|
-
function
|
|
641
|
+
function createDiffString(expected, actual, diffOptions) {
|
|
642
|
+
const opts = { ...diffOptions, color: false };
|
|
643
|
+
const result = diffString(expected, actual, opts);
|
|
644
|
+
return typeof result === "string" ? result : "";
|
|
645
|
+
}
|
|
646
|
+
function formatLogMessage(msg) {
|
|
647
|
+
if (typeof msg === "string")
|
|
648
|
+
return msg;
|
|
642
649
|
try {
|
|
643
|
-
|
|
650
|
+
if (msg !== null && typeof msg === "object") {
|
|
651
|
+
return JSON.stringify(msg, null, 2);
|
|
652
|
+
}
|
|
653
|
+
return String(msg);
|
|
644
654
|
} catch {
|
|
645
|
-
return String(
|
|
655
|
+
return String(msg);
|
|
646
656
|
}
|
|
647
657
|
}
|
|
648
|
-
function
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
partLines.pop();
|
|
655
|
-
}
|
|
656
|
-
for (const line of partLines) {
|
|
657
|
-
lines.push(`${prefix} ${line}`);
|
|
658
|
-
}
|
|
659
|
-
}
|
|
660
|
-
return lines.join("\n");
|
|
658
|
+
function createLogEntry(message, options) {
|
|
659
|
+
return {
|
|
660
|
+
type: "log",
|
|
661
|
+
label: options?.label,
|
|
662
|
+
message: formatLogMessage(message)
|
|
663
|
+
};
|
|
661
664
|
}
|
|
662
|
-
function
|
|
663
|
-
|
|
664
|
-
const actualStr = toJsonLines(actual);
|
|
665
|
-
const changes = diffLines(expectedStr, actualStr);
|
|
666
|
-
return formatDiffString(changes);
|
|
665
|
+
function getLogLines(entry) {
|
|
666
|
+
return entry.message.split("\n");
|
|
667
667
|
}
|
|
668
668
|
function createDiffLogEntry(expected, actual, options) {
|
|
669
|
-
const
|
|
669
|
+
const { label, ...diffOpts } = options ?? {};
|
|
670
|
+
const diff = createDiffString(expected, actual, diffOpts);
|
|
670
671
|
return {
|
|
671
672
|
type: "diff",
|
|
672
|
-
label
|
|
673
|
+
label,
|
|
673
674
|
expected,
|
|
674
675
|
actual,
|
|
675
676
|
diff: diff || "(no differences)"
|
|
676
677
|
};
|
|
677
678
|
}
|
|
678
679
|
function printJsonDiff(expected, actual, options = {}) {
|
|
679
|
-
const
|
|
680
|
-
|
|
680
|
+
const { color = true, ...diffOpts } = options;
|
|
681
|
+
const diff = createDiffString(expected, actual, diffOpts);
|
|
682
|
+
if (color) {
|
|
681
683
|
const lines = diff.split("\n").map((line) => {
|
|
682
684
|
const trimmed = line.trimStart();
|
|
683
685
|
if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
|
|
@@ -1047,6 +1049,9 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1047
1049
|
const logDiff = (expected, actual, options) => {
|
|
1048
1050
|
logs.push(createDiffLogEntry(expected, actual, options));
|
|
1049
1051
|
};
|
|
1052
|
+
const log = (message, options) => {
|
|
1053
|
+
logs.push(createLogEntry(message, options));
|
|
1054
|
+
};
|
|
1050
1055
|
const ctx = yield* Effect.promise(
|
|
1051
1056
|
() => Promise.resolve(evaluator.resolveContext())
|
|
1052
1057
|
);
|
|
@@ -1056,7 +1061,8 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1056
1061
|
input: testCaseItem.testCase.getInput(),
|
|
1057
1062
|
ctx,
|
|
1058
1063
|
output,
|
|
1059
|
-
logDiff
|
|
1064
|
+
logDiff,
|
|
1065
|
+
log
|
|
1060
1066
|
})
|
|
1061
1067
|
)
|
|
1062
1068
|
);
|
|
@@ -1628,6 +1634,6 @@ var EffectRunner = class {
|
|
|
1628
1634
|
}
|
|
1629
1635
|
};
|
|
1630
1636
|
|
|
1631
|
-
export { Dataset, Evaluator, Metric, Score, TestCase, binaryScore, createRunner, defaultRunnerConfig, defineConfig, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
|
|
1637
|
+
export { Dataset, Evaluator, Metric, Score, TestCase, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, getLogLines, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
|
|
1632
1638
|
//# sourceMappingURL=out.js.map
|
|
1633
1639
|
//# sourceMappingURL=index.js.map
|