@m4trix/evals 0.16.0 → 0.17.0
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +8 -0
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +8 -0
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +8 -0
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +8 -0
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +8 -0
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +19 -1
- package/dist/index.js +8 -0
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
|
@@ -234,10 +234,23 @@ interface EvalMiddleware<TCtx> {
|
|
|
234
234
|
name: string;
|
|
235
235
|
resolve: () => TCtx | Promise<TCtx>;
|
|
236
236
|
}
|
|
237
|
+
interface EvaluateMeta {
|
|
238
|
+
/** Identifier of the trigger that started the run (for example, a CLI invocation). */
|
|
239
|
+
triggerId: string;
|
|
240
|
+
/**
|
|
241
|
+
* Identifier of the current test-case execution shared across all evaluators
|
|
242
|
+
* for this specific test-case run.
|
|
243
|
+
*/
|
|
244
|
+
runId: string;
|
|
245
|
+
/** Identifier of the dataset currently being evaluated. */
|
|
246
|
+
datasetId: string;
|
|
247
|
+
}
|
|
237
248
|
interface EvaluateArgs<TInput, TOutput = unknown, TCtx = Record<string, never>> {
|
|
238
249
|
input: TInput;
|
|
239
250
|
ctx: TCtx;
|
|
240
251
|
output?: TOutput;
|
|
252
|
+
/** Metadata about the current evaluator invocation. */
|
|
253
|
+
meta: EvaluateMeta;
|
|
241
254
|
/** Records a diff for this test case; stored in run artifact and shown by CLI */
|
|
242
255
|
logDiff: (expected: unknown, actual: unknown, options?: CreateDiffLogEntryOptions) => void;
|
|
243
256
|
/** Logs a message or object for this test case; stored in run artifact and shown by CLI */
|
|
@@ -349,6 +362,11 @@ interface SearchTestCasesQuery {
|
|
|
349
362
|
excludedPaths?: ReadonlyArray<string | RegExp>;
|
|
350
363
|
}
|
|
351
364
|
interface RunDatasetRequest {
|
|
365
|
+
/**
|
|
366
|
+
* Identifier for what triggered the run request (for example, a CLI command).
|
|
367
|
+
* When omitted, the runner generates one in the format `trg-[uuid]`.
|
|
368
|
+
*/
|
|
369
|
+
triggerId?: string;
|
|
352
370
|
datasetId: string;
|
|
353
371
|
evaluatorIds: ReadonlyArray<string>;
|
|
354
372
|
concurrency?: number;
|
|
@@ -469,4 +487,4 @@ interface BinaryScoreData {
|
|
|
469
487
|
}
|
|
470
488
|
declare const binaryScore: ScoreDef<BinaryScoreData>;
|
|
471
489
|
|
|
472
|
-
export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedTestCase, ConfigType, CreateDiffLogEntryOptions, Dataset, DiffLogEntry, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, Evaluator, EvaluatorLogEntry, EvaluatorOption, FormatMetricOptions, FormatScoreOptions, JsonDiffOptions, LatencyData, LogEntry, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TestCase, TokenCountData, ViewLevel, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, getLogLines, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
|
|
490
|
+
export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedTestCase, ConfigType, CreateDiffLogEntryOptions, Dataset, DiffLogEntry, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, EvaluateMeta, Evaluator, EvaluatorLogEntry, EvaluatorOption, FormatMetricOptions, FormatScoreOptions, JsonDiffOptions, LatencyData, LogEntry, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TestCase, TokenCountData, ViewLevel, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, getLogLines, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
|
package/dist/index.js
CHANGED
|
@@ -1035,6 +1035,7 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1035
1035
|
const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
|
|
1036
1036
|
const rerunPassed = [];
|
|
1037
1037
|
for (let r = 0; r < reruns; r++) {
|
|
1038
|
+
const evaluatorRunId = `run-${randomUUID()}`;
|
|
1038
1039
|
const started = Date.now();
|
|
1039
1040
|
const evaluatorScores = [];
|
|
1040
1041
|
let testCaseError;
|
|
@@ -1061,6 +1062,11 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1061
1062
|
input: testCaseItem.testCase.getInput(),
|
|
1062
1063
|
ctx,
|
|
1063
1064
|
output,
|
|
1065
|
+
meta: {
|
|
1066
|
+
triggerId: task.triggerId,
|
|
1067
|
+
runId: evaluatorRunId,
|
|
1068
|
+
datasetId: task.datasetId
|
|
1069
|
+
},
|
|
1064
1070
|
logDiff,
|
|
1065
1071
|
log
|
|
1066
1072
|
})
|
|
@@ -1520,6 +1526,7 @@ var EffectRunner = class {
|
|
|
1520
1526
|
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
1521
1527
|
0
|
|
1522
1528
|
);
|
|
1529
|
+
const triggerId = request.triggerId ?? `trg-${randomUUID()}`;
|
|
1523
1530
|
const runId = `run-${randomUUID()}`;
|
|
1524
1531
|
const artifactPath = createArtifactPath(
|
|
1525
1532
|
this.config.artifactDirectory,
|
|
@@ -1561,6 +1568,7 @@ var EffectRunner = class {
|
|
|
1561
1568
|
await Effect.runPromise(
|
|
1562
1569
|
Queue.offer(this.runQueue, {
|
|
1563
1570
|
runId,
|
|
1571
|
+
triggerId,
|
|
1564
1572
|
datasetId: request.datasetId,
|
|
1565
1573
|
dataset: dataset.dataset,
|
|
1566
1574
|
evaluators: selectedEvaluators,
|