npm - @m4trix/evals - Versions diffs - 0.2.0 → 0.4.0 - Mend

@m4trix/evals 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16)

package/dist/index.d.cts CHANGED

@@ -1,4 +1,5 @@
 import { Schema } from 'effect';
+export { Schema as S } from 'effect';
 type EvalStatus = 'PASS' | 'FAILED' | 'RUNNING';
 interface EvalDimension {
@@ -110,7 +111,7 @@ interface M4trixEvalConfig {
 }
 type ConfigType = M4trixEvalConfig;
 type M4trixEvalConfigFactory<TConfig extends ConfigType = ConfigType> = () => TConfig;
-declare function defineConfigFunction<TConfig extends ConfigType>(factory: M4trixEvalConfigFactory<TConfig>): M4trixEvalConfigFactory<TConfig>;
+declare function defineConfig<TConfig extends ConfigType>(factory: M4trixEvalConfigFactory<TConfig>): M4trixEvalConfigFactory<TConfig>;
 declare const defaultRunnerConfig: RunnerConfig;
 declare function withRunnerConfig(overrides?: RunnerConfigOverrides): RunnerConfig;
@@ -120,20 +121,23 @@ type TagMatcher = string | RegExp;
 type PathMatcher = string | RegExp;
 type InputOrBuilder<T> = T | (() => T);
-interface TestCaseDescribeConfig<TI extends Schema.Schema.Any> {
+interface TestCaseDescribeConfig<TI extends Schema.Schema.Any, TOutputDefinition = unknown> {
     name: string;
     tags: string[];
     inputSchema: TI;
     input: InputOrBuilder<Schema.Schema.Type<TI>>;
+    outputDefinition?: InputOrBuilder<TOutputDefinition>;
+    outputDefintion?: InputOrBuilder<TOutputDefinition>;
 }
-declare class TestCase<TInput = unknown> {
+declare class TestCase<TInput = unknown, TOutputDefinition = unknown> {
     private readonly _config;
     private constructor();
-    static describe<TI extends Schema.Schema.Any>(config: TestCaseDescribeConfig<TI>): TestCase<Schema.Schema.Type<TI>>;
+    static describe<TI extends Schema.Schema.Any, TOutputDefinition = unknown>(config: TestCaseDescribeConfig<TI, TOutputDefinition>): TestCase<Schema.Schema.Type<TI>, TOutputDefinition>;
     getName(): string;
     getTags(): string[];
     getInputSchema(): Schema.Schema.Any;
     getInput(): TInput;
+    getOutputDefinition(): TOutputDefinition | undefined;
 }
 interface DatasetDefineConfig {
@@ -159,7 +163,12 @@ interface EvalMiddleware<TCtx> {
     name: string;
     resolve: () => TCtx | Promise<TCtx>;
 }
-type EvaluateFn<TInput, TScore, TCtx> = (input: TInput, ctx: TCtx) => TScore | Promise<TScore>;
+interface EvaluateArgs<TInput, TCtx> {
+    input: TInput;
+    ctx: TCtx;
+    output?: unknown;
+}
+type EvaluateFn<TInput, TScore, TCtx> = (args: EvaluateArgs<TInput, TCtx>) => TScore | Promise<TScore>;
 interface EvaluatorDefineConfig<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any, TS extends Schema.Schema.Any> {
     name: string;
     inputSchema: TI;
@@ -244,7 +253,7 @@ interface CollectedEvaluator {
 interface CollectedTestCase {
     id: string;
     filePath: string;
-    testCase: TestCase<unknown>;
+    testCase: TestCase<unknown, unknown>;
 }
 interface SearchTestCasesQuery {
     includedTags?: ReadonlyArray<string | RegExp>;
@@ -300,6 +309,7 @@ type RunnerEvent = {
         passed: boolean;
         metrics?: ReadonlyArray<MetricItem>;
     }>;
+    outputDefinition?: unknown;
     errorMessage?: string;
 } | {
     type: 'RunCompleted';
@@ -364,4 +374,4 @@ interface BinaryScoreData {
 }
 declare const binaryScore: ScoreDef<BinaryScoreData>;
-export { type BinaryScoreData, type CliState, type CollectedDataset, type CollectedEvaluator, type CollectedTestCase, type ConfigType, Dataset, type EvalDataset, type EvalMiddleware, type EvalRun, type EvalsData, Evaluator, type EvaluatorOption, type LatencyData, type M4trixEvalConfig, type M4trixEvalConfigDiscovery, Metric, type MetricDef, type MetricItem, type PathMatcher, type PercentScoreData, type RunDatasetRequest, type RunSnapshot, type RunnerApi, type RunnerConfig, type RunnerConfigOverrides, type RunnerDiscoveryConfig, type RunnerEvent, Score, type ScoreDef, type ScoreDisplayStrategy, type ScoreItem, type SearchTestCasesQuery, type StartupArgs, type TagMatcher, TestCase, type TokenCountData, type ViewLevel, binaryScore, createRunner, defaultRunnerConfig, defineConfigFunction, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, tokenCountMetric, withRunnerConfig };
+export { type BinaryScoreData, type CliState, type CollectedDataset, type CollectedEvaluator, type CollectedTestCase, type ConfigType, Dataset, type EvalDataset, type EvalMiddleware, type EvalRun, type EvalsData, type EvaluateArgs, Evaluator, type EvaluatorOption, type LatencyData, type M4trixEvalConfig, type M4trixEvalConfigDiscovery, Metric, type MetricDef, type MetricItem, type PathMatcher, type PercentScoreData, type RunDatasetRequest, type RunSnapshot, type RunnerApi, type RunnerConfig, type RunnerConfigOverrides, type RunnerDiscoveryConfig, type RunnerEvent, Score, type ScoreDef, type ScoreDisplayStrategy, type ScoreItem, type SearchTestCasesQuery, type StartupArgs, type TagMatcher, TestCase, type TokenCountData, type ViewLevel, binaryScore, createRunner, defaultRunnerConfig, defineConfig, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, tokenCountMetric, withRunnerConfig };

package/dist/index.d.ts CHANGED

@@ -1,4 +1,5 @@
 import { Schema } from 'effect';
+export { Schema as S } from 'effect';
 type EvalStatus = 'PASS' | 'FAILED' | 'RUNNING';
 interface EvalDimension {
@@ -110,7 +111,7 @@ interface M4trixEvalConfig {
 }
 type ConfigType = M4trixEvalConfig;
 type M4trixEvalConfigFactory<TConfig extends ConfigType = ConfigType> = () => TConfig;
-declare function defineConfigFunction<TConfig extends ConfigType>(factory: M4trixEvalConfigFactory<TConfig>): M4trixEvalConfigFactory<TConfig>;
+declare function defineConfig<TConfig extends ConfigType>(factory: M4trixEvalConfigFactory<TConfig>): M4trixEvalConfigFactory<TConfig>;
 declare const defaultRunnerConfig: RunnerConfig;
 declare function withRunnerConfig(overrides?: RunnerConfigOverrides): RunnerConfig;
@@ -120,20 +121,23 @@ type TagMatcher = string | RegExp;
 type PathMatcher = string | RegExp;
 type InputOrBuilder<T> = T | (() => T);
-interface TestCaseDescribeConfig<TI extends Schema.Schema.Any> {
+interface TestCaseDescribeConfig<TI extends Schema.Schema.Any, TOutputDefinition = unknown> {
     name: string;
     tags: string[];
     inputSchema: TI;
     input: InputOrBuilder<Schema.Schema.Type<TI>>;
+    outputDefinition?: InputOrBuilder<TOutputDefinition>;
+    outputDefintion?: InputOrBuilder<TOutputDefinition>;
 }
-declare class TestCase<TInput = unknown> {
+declare class TestCase<TInput = unknown, TOutputDefinition = unknown> {
     private readonly _config;
     private constructor();
-    static describe<TI extends Schema.Schema.Any>(config: TestCaseDescribeConfig<TI>): TestCase<Schema.Schema.Type<TI>>;
+    static describe<TI extends Schema.Schema.Any, TOutputDefinition = unknown>(config: TestCaseDescribeConfig<TI, TOutputDefinition>): TestCase<Schema.Schema.Type<TI>, TOutputDefinition>;
     getName(): string;
     getTags(): string[];
     getInputSchema(): Schema.Schema.Any;
     getInput(): TInput;
+    getOutputDefinition(): TOutputDefinition | undefined;
 }
 interface DatasetDefineConfig {
@@ -159,7 +163,12 @@ interface EvalMiddleware<TCtx> {
     name: string;
     resolve: () => TCtx | Promise<TCtx>;
 }
-type EvaluateFn<TInput, TScore, TCtx> = (input: TInput, ctx: TCtx) => TScore | Promise<TScore>;
+interface EvaluateArgs<TInput, TCtx> {
+    input: TInput;
+    ctx: TCtx;
+    output?: unknown;
+}
+type EvaluateFn<TInput, TScore, TCtx> = (args: EvaluateArgs<TInput, TCtx>) => TScore | Promise<TScore>;
 interface EvaluatorDefineConfig<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any, TS extends Schema.Schema.Any> {
     name: string;
     inputSchema: TI;
@@ -244,7 +253,7 @@ interface CollectedEvaluator {
 interface CollectedTestCase {
     id: string;
     filePath: string;
-    testCase: TestCase<unknown>;
+    testCase: TestCase<unknown, unknown>;
 }
 interface SearchTestCasesQuery {
     includedTags?: ReadonlyArray<string | RegExp>;
@@ -300,6 +309,7 @@ type RunnerEvent = {
         passed: boolean;
         metrics?: ReadonlyArray<MetricItem>;
     }>;
+    outputDefinition?: unknown;
     errorMessage?: string;
 } | {
     type: 'RunCompleted';
@@ -364,4 +374,4 @@ interface BinaryScoreData {
 }
 declare const binaryScore: ScoreDef<BinaryScoreData>;
-export { type BinaryScoreData, type CliState, type CollectedDataset, type CollectedEvaluator, type CollectedTestCase, type ConfigType, Dataset, type EvalDataset, type EvalMiddleware, type EvalRun, type EvalsData, Evaluator, type EvaluatorOption, type LatencyData, type M4trixEvalConfig, type M4trixEvalConfigDiscovery, Metric, type MetricDef, type MetricItem, type PathMatcher, type PercentScoreData, type RunDatasetRequest, type RunSnapshot, type RunnerApi, type RunnerConfig, type RunnerConfigOverrides, type RunnerDiscoveryConfig, type RunnerEvent, Score, type ScoreDef, type ScoreDisplayStrategy, type ScoreItem, type SearchTestCasesQuery, type StartupArgs, type TagMatcher, TestCase, type TokenCountData, type ViewLevel, binaryScore, createRunner, defaultRunnerConfig, defineConfigFunction, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, tokenCountMetric, withRunnerConfig };
+export { type BinaryScoreData, type CliState, type CollectedDataset, type CollectedEvaluator, type CollectedTestCase, type ConfigType, Dataset, type EvalDataset, type EvalMiddleware, type EvalRun, type EvalsData, type EvaluateArgs, Evaluator, type EvaluatorOption, type LatencyData, type M4trixEvalConfig, type M4trixEvalConfigDiscovery, Metric, type MetricDef, type MetricItem, type PathMatcher, type PercentScoreData, type RunDatasetRequest, type RunSnapshot, type RunnerApi, type RunnerConfig, type RunnerConfigOverrides, type RunnerDiscoveryConfig, type RunnerEvent, Score, type ScoreDef, type ScoreDisplayStrategy, type ScoreItem, type SearchTestCasesQuery, type StartupArgs, type TagMatcher, TestCase, type TokenCountData, type ViewLevel, binaryScore, createRunner, defaultRunnerConfig, defineConfig, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, tokenCountMetric, withRunnerConfig };

package/dist/index.js CHANGED

@@ -1,5 +1,6 @@
-import { randomUUID } from 'crypto';
 import { Effect, PubSub, Queue, Fiber } from 'effect';
+export { Schema as S } from 'effect';
+import { randomUUID } from 'crypto';
 import { existsSync } from 'fs';
 import { resolve as resolve$1, relative, join, dirname } from 'path';
 import * as jitiModule from 'jiti';
@@ -305,7 +306,9 @@ var TestCase = class _TestCase {
       name: config.name,
       tags: config.tags,
       inputSchema: config.inputSchema,
-      input: config.input
+      input: config.input,
+      outputDefinition: config.outputDefinition,
+      outputDefintion: config.outputDefintion
     });
   }
   getName() {
@@ -320,6 +323,13 @@ var TestCase = class _TestCase {
   getInput() {
     return resolve(this._config.input);
   }
+  getOutputDefinition() {
+    const value = this._config.outputDefinition ?? this._config.outputDefintion;
+    if (value === void 0) {
+      return void 0;
+    }
+    return resolve(value);
+  }
 };
 // src/evals/evaluator.ts
@@ -543,7 +553,7 @@ var binaryScore = Score.of({
 });
 // src/runner/config.ts
-function defineConfigFunction(factory) {
+function defineConfig(factory) {
   return factory;
 }
 var defaultRunnerConfig = {
@@ -624,7 +634,9 @@ function getJitiLoader() {
   }
   const createJiti2 = jitiModule.createJiti ?? jitiModule.default;
   if (typeof createJiti2 !== "function") {
-    throw new Error("Failed to initialize jiti for m4trix eval config loading.");
+    throw new Error(
+      "Failed to initialize jiti for m4trix eval config loading."
+    );
   }
   cachedLoader = createJiti2(import.meta.url, {
     interopDefault: true,
@@ -647,7 +659,7 @@ function resolveConfigValue(value) {
   }
   if (typeof value !== "object") {
     throw new Error(
-      "Invalid m4trix eval config export. Expected an object or defineConfigFunction(() => config)."
+      "Invalid m4trix eval config export. Expected an object or defineConfig(() => config)."
     );
   }
   return value;
@@ -852,6 +864,13 @@ function normalizeResult(result) {
   const metrics = Array.isArray(obj.metrics) ? obj.metrics : void 0;
   return { scores, metrics };
 }
+function readOutputDefinition(testCase) {
+  const candidate = testCase;
+  if (typeof candidate.getOutputDefinition !== "function") {
+    return void 0;
+  }
+  return candidate.getOutputDefinition();
+}
 function nowIsoForFile() {
   return (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
 }
@@ -880,6 +899,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
     const started = Date.now();
     const evaluatorScores = [];
     let testCaseError;
+    const outputDefinition = readOutputDefinition(testCaseItem.testCase);
     for (const { id: evaluatorId, evaluator } of task.evaluators) {
       const evaluateFn = evaluator.getEvaluateFn();
       if (!evaluateFn) {
@@ -890,7 +910,13 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
           () => Promise.resolve(evaluator.resolveContext())
         );
         const result = yield* Effect.promise(
-          () => Promise.resolve(evaluateFn(testCaseItem.testCase.getInput(), ctx))
+          () => Promise.resolve(
+            evaluateFn({
+              input: testCaseItem.testCase.getInput(),
+              ctx,
+              output: outputDefinition
+            })
+          )
         );
         const { scores, metrics } = normalizeResult(result);
         const passed = computeEvaluatorPassed(evaluator, result, scores);
@@ -921,6 +947,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
       passed: testCasePassed,
       durationMs: Date.now() - started,
       evaluatorScores,
+      outputDefinition,
       errorMessage: testCaseError
     };
     updateSnapshot(task.runId, (snapshot) => ({
@@ -1267,6 +1294,6 @@ var EffectRunner = class {
   }
 };
-export { Dataset, Evaluator, Metric, Score, TestCase, binaryScore, createRunner, defaultRunnerConfig, defineConfigFunction, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, tokenCountMetric, withRunnerConfig };
+export { Dataset, Evaluator, Metric, Score, TestCase, binaryScore, createRunner, defaultRunnerConfig, defineConfig, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, tokenCountMetric, withRunnerConfig };
 //# sourceMappingURL=out.js.map
 //# sourceMappingURL=index.js.map