@m4trix/evals 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -1,4 +1,5 @@
  import { Schema } from 'effect';
+ export { Schema as S } from 'effect';

  type EvalStatus = 'PASS' | 'FAILED' | 'RUNNING';
  interface EvalDimension {
@@ -110,7 +111,7 @@ interface M4trixEvalConfig {
  }
  type ConfigType = M4trixEvalConfig;
  type M4trixEvalConfigFactory<TConfig extends ConfigType = ConfigType> = () => TConfig;
- declare function defineConfigFunction<TConfig extends ConfigType>(factory: M4trixEvalConfigFactory<TConfig>): M4trixEvalConfigFactory<TConfig>;
+ declare function defineConfig<TConfig extends ConfigType>(factory: M4trixEvalConfigFactory<TConfig>): M4trixEvalConfigFactory<TConfig>;
  declare const defaultRunnerConfig: RunnerConfig;
  declare function withRunnerConfig(overrides?: RunnerConfigOverrides): RunnerConfig;

@@ -120,20 +121,23 @@ type TagMatcher = string | RegExp;
  type PathMatcher = string | RegExp;

  type InputOrBuilder<T> = T | (() => T);
- interface TestCaseDescribeConfig<TI extends Schema.Schema.Any> {
+ interface TestCaseDescribeConfig<TI extends Schema.Schema.Any, TOutputDefinition = unknown> {
  name: string;
  tags: string[];
  inputSchema: TI;
  input: InputOrBuilder<Schema.Schema.Type<TI>>;
+ outputDefinition?: InputOrBuilder<TOutputDefinition>;
+ outputDefintion?: InputOrBuilder<TOutputDefinition>;
  }
- declare class TestCase<TInput = unknown> {
+ declare class TestCase<TInput = unknown, TOutputDefinition = unknown> {
  private readonly _config;
  private constructor();
- static describe<TI extends Schema.Schema.Any>(config: TestCaseDescribeConfig<TI>): TestCase<Schema.Schema.Type<TI>>;
+ static describe<TI extends Schema.Schema.Any, TOutputDefinition = unknown>(config: TestCaseDescribeConfig<TI, TOutputDefinition>): TestCase<Schema.Schema.Type<TI>, TOutputDefinition>;
  getName(): string;
  getTags(): string[];
  getInputSchema(): Schema.Schema.Any;
  getInput(): TInput;
+ getOutputDefinition(): TOutputDefinition | undefined;
  }

  interface DatasetDefineConfig {
@@ -159,7 +163,12 @@ interface EvalMiddleware<TCtx> {
  name: string;
  resolve: () => TCtx | Promise<TCtx>;
  }
- type EvaluateFn<TInput, TScore, TCtx> = (input: TInput, ctx: TCtx) => TScore | Promise<TScore>;
+ interface EvaluateArgs<TInput, TCtx> {
+ input: TInput;
+ ctx: TCtx;
+ output?: unknown;
+ }
+ type EvaluateFn<TInput, TScore, TCtx> = (args: EvaluateArgs<TInput, TCtx>) => TScore | Promise<TScore>;
  interface EvaluatorDefineConfig<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any, TS extends Schema.Schema.Any> {
  name: string;
  inputSchema: TI;
@@ -244,7 +253,7 @@ interface CollectedEvaluator {
  interface CollectedTestCase {
  id: string;
  filePath: string;
- testCase: TestCase<unknown>;
+ testCase: TestCase<unknown, unknown>;
  }
  interface SearchTestCasesQuery {
  includedTags?: ReadonlyArray<string | RegExp>;
@@ -300,6 +309,7 @@ type RunnerEvent = {
  passed: boolean;
  metrics?: ReadonlyArray<MetricItem>;
  }>;
+ outputDefinition?: unknown;
  errorMessage?: string;
  } | {
  type: 'RunCompleted';
@@ -364,4 +374,4 @@ interface BinaryScoreData {
  }
  declare const binaryScore: ScoreDef<BinaryScoreData>;

- export { type BinaryScoreData, type CliState, type CollectedDataset, type CollectedEvaluator, type CollectedTestCase, type ConfigType, Dataset, type EvalDataset, type EvalMiddleware, type EvalRun, type EvalsData, Evaluator, type EvaluatorOption, type LatencyData, type M4trixEvalConfig, type M4trixEvalConfigDiscovery, Metric, type MetricDef, type MetricItem, type PathMatcher, type PercentScoreData, type RunDatasetRequest, type RunSnapshot, type RunnerApi, type RunnerConfig, type RunnerConfigOverrides, type RunnerDiscoveryConfig, type RunnerEvent, Score, type ScoreDef, type ScoreDisplayStrategy, type ScoreItem, type SearchTestCasesQuery, type StartupArgs, type TagMatcher, TestCase, type TokenCountData, type ViewLevel, binaryScore, createRunner, defaultRunnerConfig, defineConfigFunction, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, tokenCountMetric, withRunnerConfig };
+ export { type BinaryScoreData, type CliState, type CollectedDataset, type CollectedEvaluator, type CollectedTestCase, type ConfigType, Dataset, type EvalDataset, type EvalMiddleware, type EvalRun, type EvalsData, type EvaluateArgs, Evaluator, type EvaluatorOption, type LatencyData, type M4trixEvalConfig, type M4trixEvalConfigDiscovery, Metric, type MetricDef, type MetricItem, type PathMatcher, type PercentScoreData, type RunDatasetRequest, type RunSnapshot, type RunnerApi, type RunnerConfig, type RunnerConfigOverrides, type RunnerDiscoveryConfig, type RunnerEvent, Score, type ScoreDef, type ScoreDisplayStrategy, type ScoreItem, type SearchTestCasesQuery, type StartupArgs, type TagMatcher, TestCase, type TokenCountData, type ViewLevel, binaryScore, createRunner, defaultRunnerConfig, defineConfig, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, tokenCountMetric, withRunnerConfig };
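The declaration changes above replace the positional (input, ctx) evaluator signature with a single EvaluateArgs object that also carries the test case's optional output definition. A minimal migration sketch against the 0.4.0 typings follows; MyCtx and the score shape are illustrative assumptions, not part of the package:

import type { EvaluateArgs, EvaluateFn } from '@m4trix/evals';

// Hypothetical middleware context, for illustration only.
interface MyCtx {
  expected: string;
}

// 0.2.0: const evaluate: EvaluateFn<string, { passed: boolean }, MyCtx> = (input, ctx) => ...
// 0.4.0: the evaluator receives one EvaluateArgs object instead.
const evaluate: EvaluateFn<string, { passed: boolean }, MyCtx> = ({ input, ctx, output }) => ({
  // `output` surfaces the test case's outputDefinition and is typed `unknown`
  // in 0.4.0, so narrow it before use; fall back to the context otherwise.
  passed: typeof output === 'string' ? input === output : input === ctx.expected,
});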
package/dist/index.d.ts CHANGED
@@ -1,4 +1,5 @@
  import { Schema } from 'effect';
+ export { Schema as S } from 'effect';

  type EvalStatus = 'PASS' | 'FAILED' | 'RUNNING';
  interface EvalDimension {
@@ -110,7 +111,7 @@ interface M4trixEvalConfig {
  }
  type ConfigType = M4trixEvalConfig;
  type M4trixEvalConfigFactory<TConfig extends ConfigType = ConfigType> = () => TConfig;
- declare function defineConfigFunction<TConfig extends ConfigType>(factory: M4trixEvalConfigFactory<TConfig>): M4trixEvalConfigFactory<TConfig>;
+ declare function defineConfig<TConfig extends ConfigType>(factory: M4trixEvalConfigFactory<TConfig>): M4trixEvalConfigFactory<TConfig>;
  declare const defaultRunnerConfig: RunnerConfig;
  declare function withRunnerConfig(overrides?: RunnerConfigOverrides): RunnerConfig;

@@ -120,20 +121,23 @@ type TagMatcher = string | RegExp;
  type PathMatcher = string | RegExp;

  type InputOrBuilder<T> = T | (() => T);
- interface TestCaseDescribeConfig<TI extends Schema.Schema.Any> {
+ interface TestCaseDescribeConfig<TI extends Schema.Schema.Any, TOutputDefinition = unknown> {
  name: string;
  tags: string[];
  inputSchema: TI;
  input: InputOrBuilder<Schema.Schema.Type<TI>>;
+ outputDefinition?: InputOrBuilder<TOutputDefinition>;
+ outputDefintion?: InputOrBuilder<TOutputDefinition>;
  }
- declare class TestCase<TInput = unknown> {
+ declare class TestCase<TInput = unknown, TOutputDefinition = unknown> {
  private readonly _config;
  private constructor();
- static describe<TI extends Schema.Schema.Any>(config: TestCaseDescribeConfig<TI>): TestCase<Schema.Schema.Type<TI>>;
+ static describe<TI extends Schema.Schema.Any, TOutputDefinition = unknown>(config: TestCaseDescribeConfig<TI, TOutputDefinition>): TestCase<Schema.Schema.Type<TI>, TOutputDefinition>;
  getName(): string;
  getTags(): string[];
  getInputSchema(): Schema.Schema.Any;
  getInput(): TInput;
+ getOutputDefinition(): TOutputDefinition | undefined;
  }

  interface DatasetDefineConfig {
@@ -159,7 +163,12 @@ interface EvalMiddleware<TCtx> {
  name: string;
  resolve: () => TCtx | Promise<TCtx>;
  }
- type EvaluateFn<TInput, TScore, TCtx> = (input: TInput, ctx: TCtx) => TScore | Promise<TScore>;
+ interface EvaluateArgs<TInput, TCtx> {
+ input: TInput;
+ ctx: TCtx;
+ output?: unknown;
+ }
+ type EvaluateFn<TInput, TScore, TCtx> = (args: EvaluateArgs<TInput, TCtx>) => TScore | Promise<TScore>;
  interface EvaluatorDefineConfig<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any, TS extends Schema.Schema.Any> {
  name: string;
  inputSchema: TI;
@@ -244,7 +253,7 @@ interface CollectedEvaluator {
  interface CollectedTestCase {
  id: string;
  filePath: string;
- testCase: TestCase<unknown>;
+ testCase: TestCase<unknown, unknown>;
  }
  interface SearchTestCasesQuery {
  includedTags?: ReadonlyArray<string | RegExp>;
@@ -300,6 +309,7 @@ type RunnerEvent = {
  passed: boolean;
  metrics?: ReadonlyArray<MetricItem>;
  }>;
+ outputDefinition?: unknown;
  errorMessage?: string;
  } | {
  type: 'RunCompleted';
@@ -364,4 +374,4 @@ interface BinaryScoreData {
  }
  declare const binaryScore: ScoreDef<BinaryScoreData>;

- export { type BinaryScoreData, type CliState, type CollectedDataset, type CollectedEvaluator, type CollectedTestCase, type ConfigType, Dataset, type EvalDataset, type EvalMiddleware, type EvalRun, type EvalsData, Evaluator, type EvaluatorOption, type LatencyData, type M4trixEvalConfig, type M4trixEvalConfigDiscovery, Metric, type MetricDef, type MetricItem, type PathMatcher, type PercentScoreData, type RunDatasetRequest, type RunSnapshot, type RunnerApi, type RunnerConfig, type RunnerConfigOverrides, type RunnerDiscoveryConfig, type RunnerEvent, Score, type ScoreDef, type ScoreDisplayStrategy, type ScoreItem, type SearchTestCasesQuery, type StartupArgs, type TagMatcher, TestCase, type TokenCountData, type ViewLevel, binaryScore, createRunner, defaultRunnerConfig, defineConfigFunction, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, tokenCountMetric, withRunnerConfig };
+ export { type BinaryScoreData, type CliState, type CollectedDataset, type CollectedEvaluator, type CollectedTestCase, type ConfigType, Dataset, type EvalDataset, type EvalMiddleware, type EvalRun, type EvalsData, type EvaluateArgs, Evaluator, type EvaluatorOption, type LatencyData, type M4trixEvalConfig, type M4trixEvalConfigDiscovery, Metric, type MetricDef, type MetricItem, type PathMatcher, type PercentScoreData, type RunDatasetRequest, type RunSnapshot, type RunnerApi, type RunnerConfig, type RunnerConfigOverrides, type RunnerDiscoveryConfig, type RunnerEvent, Score, type ScoreDef, type ScoreDisplayStrategy, type ScoreItem, type SearchTestCasesQuery, type StartupArgs, type TagMatcher, TestCase, type TokenCountData, type ViewLevel, binaryScore, createRunner, defaultRunnerConfig, defineConfig, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, tokenCountMetric, withRunnerConfig };
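The same declarations (the ESM copy of the typings above) also re-export Effect's Schema as S and let a test case carry an optional output definition. A sketch of how a 0.4.0 test case might use both; the schema, tag, and expected-output shape below are illustrative assumptions, not from the package:

import { S, TestCase } from '@m4trix/evals';

const greetingCase = TestCase.describe({
  name: 'greets the user',
  tags: ['smoke'],
  inputSchema: S.Struct({ userName: S.String }),
  input: () => ({ userName: 'Ada' }),
  // New in 0.4.0: optional expected-output definition, handed to evaluators as
  // `output` and readable via getOutputDefinition(). Per the typings above, the
  // misspelled `outputDefintion` key is accepted as an alias.
  outputDefinition: { mustContain: 'Hello, Ada' },
});

// Resolves the definition (or its builder thunk); returns undefined when absent.
const expectedOutput = greetingCase.getOutputDefinition();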
package/dist/index.js CHANGED
@@ -1,5 +1,6 @@
- import { randomUUID } from 'crypto';
  import { Effect, PubSub, Queue, Fiber } from 'effect';
+ export { Schema as S } from 'effect';
+ import { randomUUID } from 'crypto';
  import { existsSync } from 'fs';
  import { resolve as resolve$1, relative, join, dirname } from 'path';
  import * as jitiModule from 'jiti';
@@ -305,7 +306,9 @@ var TestCase = class _TestCase {
  name: config.name,
  tags: config.tags,
  inputSchema: config.inputSchema,
- input: config.input
+ input: config.input,
+ outputDefinition: config.outputDefinition,
+ outputDefintion: config.outputDefintion
  });
  }
  getName() {
@@ -320,6 +323,13 @@ var TestCase = class _TestCase {
  getInput() {
  return resolve(this._config.input);
  }
+ getOutputDefinition() {
+ const value = this._config.outputDefinition ?? this._config.outputDefintion;
+ if (value === void 0) {
+ return void 0;
+ }
+ return resolve(value);
+ }
  };

  // src/evals/evaluator.ts
@@ -543,7 +553,7 @@ var binaryScore = Score.of({
  });

  // src/runner/config.ts
- function defineConfigFunction(factory) {
+ function defineConfig(factory) {
  return factory;
  }
  var defaultRunnerConfig = {
@@ -624,7 +634,9 @@ function getJitiLoader() {
  }
  const createJiti2 = jitiModule.createJiti ?? jitiModule.default;
  if (typeof createJiti2 !== "function") {
- throw new Error("Failed to initialize jiti for m4trix eval config loading.");
+ throw new Error(
+ "Failed to initialize jiti for m4trix eval config loading."
+ );
  }
  cachedLoader = createJiti2(import.meta.url, {
  interopDefault: true,
@@ -647,7 +659,7 @@ function resolveConfigValue(value) {
  }
  if (typeof value !== "object") {
  throw new Error(
- "Invalid m4trix eval config export. Expected an object or defineConfigFunction(() => config)."
+ "Invalid m4trix eval config export. Expected an object or defineConfig(() => config)."
  );
  }
  return value;
@@ -852,6 +864,13 @@ function normalizeResult(result) {
  const metrics = Array.isArray(obj.metrics) ? obj.metrics : void 0;
  return { scores, metrics };
  }
+ function readOutputDefinition(testCase) {
+ const candidate = testCase;
+ if (typeof candidate.getOutputDefinition !== "function") {
+ return void 0;
+ }
+ return candidate.getOutputDefinition();
+ }
  function nowIsoForFile() {
  return (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
  }
@@ -880,6 +899,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
  const started = Date.now();
  const evaluatorScores = [];
  let testCaseError;
+ const outputDefinition = readOutputDefinition(testCaseItem.testCase);
  for (const { id: evaluatorId, evaluator } of task.evaluators) {
  const evaluateFn = evaluator.getEvaluateFn();
  if (!evaluateFn) {
@@ -890,7 +910,13 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
  () => Promise.resolve(evaluator.resolveContext())
  );
  const result = yield* Effect.promise(
- () => Promise.resolve(evaluateFn(testCaseItem.testCase.getInput(), ctx))
+ () => Promise.resolve(
+ evaluateFn({
+ input: testCaseItem.testCase.getInput(),
+ ctx,
+ output: outputDefinition
+ })
+ )
  );
  const { scores, metrics } = normalizeResult(result);
  const passed = computeEvaluatorPassed(evaluator, result, scores);
@@ -921,6 +947,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
  passed: testCasePassed,
  durationMs: Date.now() - started,
  evaluatorScores,
+ outputDefinition,
  errorMessage: testCaseError
  };
  updateSnapshot(task.runId, (snapshot) => ({
@@ -1267,6 +1294,6 @@ var EffectRunner = class {
  }
  };

- export { Dataset, Evaluator, Metric, Score, TestCase, binaryScore, createRunner, defaultRunnerConfig, defineConfigFunction, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, tokenCountMetric, withRunnerConfig };
+ export { Dataset, Evaluator, Metric, Score, TestCase, binaryScore, createRunner, defaultRunnerConfig, defineConfig, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, tokenCountMetric, withRunnerConfig };
  //# sourceMappingURL=out.js.map
  //# sourceMappingURL=index.js.map
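In the runtime bundle the only breaking rename is defineConfigFunction to defineConfig; the helper still just returns the factory it is given, as the updated error message in resolveConfigValue also reflects. A config-file migration sketch, assuming a hypothetical buildConfig that assembles the project's M4trixEvalConfig (its fields are not part of this diff):

import { defineConfig, type M4trixEvalConfig } from '@m4trix/evals';

// Placeholder: stands in for however the project builds its config object;
// the empty-object cast is purely for illustration.
const buildConfig = (): M4trixEvalConfig => ({} as M4trixEvalConfig);

// 0.2.0: export default defineConfigFunction(buildConfig);
// 0.4.0: only the name changes; the factory is returned as-is.
export default defineConfig(buildConfig);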