@m4trix/evals 0.4.0 → 0.7.0

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -121,23 +121,24 @@ type TagMatcher = string | RegExp;
121
121
  type PathMatcher = string | RegExp;
122
122
 
123
123
  type InputOrBuilder<T> = T | (() => T);
124
- interface TestCaseDescribeConfig<TI extends Schema.Schema.Any, TOutputDefinition = unknown> {
124
+ interface TestCaseDescribeConfig<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any = Schema.Schema<unknown>> {
125
125
  name: string;
126
126
  tags: string[];
127
127
  inputSchema: TI;
128
128
  input: InputOrBuilder<Schema.Schema.Type<TI>>;
129
- outputDefinition?: InputOrBuilder<TOutputDefinition>;
130
- outputDefintion?: InputOrBuilder<TOutputDefinition>;
129
+ outputSchema?: TO;
130
+ output?: InputOrBuilder<Schema.Schema.Type<TO>>;
131
131
  }
132
- declare class TestCase<TInput = unknown, TOutputDefinition = unknown> {
132
+ declare class TestCase<TInput = unknown, TOutput = unknown> {
133
133
  private readonly _config;
134
134
  private constructor();
135
- static describe<TI extends Schema.Schema.Any, TOutputDefinition = unknown>(config: TestCaseDescribeConfig<TI, TOutputDefinition>): TestCase<Schema.Schema.Type<TI>, TOutputDefinition>;
135
+ static describe<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any = Schema.Schema<unknown>>(config: TestCaseDescribeConfig<TI, TO>): TestCase<Schema.Schema.Type<TI>, Schema.Schema.Type<TO>>;
136
136
  getName(): string;
137
137
  getTags(): string[];
138
138
  getInputSchema(): Schema.Schema.Any;
139
139
  getInput(): TInput;
140
- getOutputDefinition(): TOutputDefinition | undefined;
140
+ getOutputSchema(): Schema.Schema.Any | undefined;
141
+ getOutput(): TOutput | undefined;
141
142
  }
142
143
 
143
144
  interface DatasetDefineConfig {
@@ -209,7 +210,7 @@ interface MetricDef<TData = unknown> {
209
210
  declare const Metric: {
210
211
  of<TData>(config: {
211
212
  id: string;
212
- name?: string;
213
+ name?: string | undefined;
213
214
  format: (data: TData) => string;
214
215
  }): MetricDef<TData>;
215
216
  };
@@ -233,7 +234,7 @@ interface ScoreDef<TData = unknown> {
233
234
  declare const Score: {
234
235
  of<TData>(config: {
235
236
  id: string;
236
- name?: string;
237
+ name?: string | undefined;
237
238
  displayStrategy: ScoreDisplayStrategy;
238
239
  format: (data: TData) => string;
239
240
  }): ScoreDef<TData>;
@@ -309,7 +310,7 @@ type RunnerEvent = {
309
310
  passed: boolean;
310
311
  metrics?: ReadonlyArray<MetricItem>;
311
312
  }>;
312
- outputDefinition?: unknown;
313
+ output?: unknown;
313
314
  errorMessage?: string;
314
315
  } | {
315
316
  type: 'RunCompleted';
@@ -374,4 +375,4 @@ interface BinaryScoreData {
374
375
  }
375
376
  declare const binaryScore: ScoreDef<BinaryScoreData>;
376
377
 
377
- export { type BinaryScoreData, type CliState, type CollectedDataset, type CollectedEvaluator, type CollectedTestCase, type ConfigType, Dataset, type EvalDataset, type EvalMiddleware, type EvalRun, type EvalsData, type EvaluateArgs, Evaluator, type EvaluatorOption, type LatencyData, type M4trixEvalConfig, type M4trixEvalConfigDiscovery, Metric, type MetricDef, type MetricItem, type PathMatcher, type PercentScoreData, type RunDatasetRequest, type RunSnapshot, type RunnerApi, type RunnerConfig, type RunnerConfigOverrides, type RunnerDiscoveryConfig, type RunnerEvent, Score, type ScoreDef, type ScoreDisplayStrategy, type ScoreItem, type SearchTestCasesQuery, type StartupArgs, type TagMatcher, TestCase, type TokenCountData, type ViewLevel, binaryScore, createRunner, defaultRunnerConfig, defineConfig, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, tokenCountMetric, withRunnerConfig };
378
+ export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedTestCase, ConfigType, Dataset, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, Evaluator, EvaluatorOption, LatencyData, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PathMatcher, PercentScoreData, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TestCase, TokenCountData, ViewLevel, binaryScore, createRunner, defaultRunnerConfig, defineConfig, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, tokenCountMetric, withRunnerConfig };
package/dist/index.js CHANGED
@@ -307,8 +307,8 @@ var TestCase = class _TestCase {
307
307
  tags: config.tags,
308
308
  inputSchema: config.inputSchema,
309
309
  input: config.input,
310
- outputDefinition: config.outputDefinition,
311
- outputDefintion: config.outputDefintion
310
+ outputSchema: config.outputSchema,
311
+ output: config.output
312
312
  });
313
313
  }
314
314
  getName() {
@@ -323,12 +323,14 @@ var TestCase = class _TestCase {
323
323
  getInput() {
324
324
  return resolve(this._config.input);
325
325
  }
326
- getOutputDefinition() {
327
- const value = this._config.outputDefinition ?? this._config.outputDefintion;
328
- if (value === void 0) {
326
+ getOutputSchema() {
327
+ return this._config.outputSchema;
328
+ }
329
+ getOutput() {
330
+ if (this._config.output === void 0) {
329
331
  return void 0;
330
332
  }
331
- return resolve(value);
333
+ return resolve(this._config.output);
332
334
  }
333
335
  };
334
336
 
@@ -864,12 +866,12 @@ function normalizeResult(result) {
864
866
  const metrics = Array.isArray(obj.metrics) ? obj.metrics : void 0;
865
867
  return { scores, metrics };
866
868
  }
867
- function readOutputDefinition(testCase) {
869
+ function readOutput(testCase) {
868
870
  const candidate = testCase;
869
- if (typeof candidate.getOutputDefinition !== "function") {
871
+ if (typeof candidate.getOutput !== "function") {
870
872
  return void 0;
871
873
  }
872
- return candidate.getOutputDefinition();
874
+ return candidate.getOutput();
873
875
  }
874
876
  function nowIsoForFile() {
875
877
  return (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
@@ -899,7 +901,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
899
901
  const started = Date.now();
900
902
  const evaluatorScores = [];
901
903
  let testCaseError;
902
- const outputDefinition = readOutputDefinition(testCaseItem.testCase);
904
+ const output = readOutput(testCaseItem.testCase);
903
905
  for (const { id: evaluatorId, evaluator } of task.evaluators) {
904
906
  const evaluateFn = evaluator.getEvaluateFn();
905
907
  if (!evaluateFn) {
@@ -914,7 +916,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
914
916
  evaluateFn({
915
917
  input: testCaseItem.testCase.getInput(),
916
918
  ctx,
917
- output: outputDefinition
919
+ output
918
920
  })
919
921
  )
920
922
  );
@@ -947,7 +949,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
947
949
  passed: testCasePassed,
948
950
  durationMs: Date.now() - started,
949
951
  evaluatorScores,
950
- outputDefinition,
952
+ output,
951
953
  errorMessage: testCaseError
952
954
  };
953
955
  updateSnapshot(task.runId, (snapshot) => ({