npm - evalsense - Versions diffs - 0.2.1 → 0.3.1 - Mend

evalsense 0.2.1 → 0.3.1

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36)

package/README.md +99 -82
package/dist/{chunk-HDJID3GC.cjs → chunk-BE7CB3AM.cjs} +39 -28
package/dist/chunk-BE7CB3AM.cjs.map +1 -0
package/dist/chunk-DGUM43GV.js +10 -0
package/dist/chunk-DGUM43GV.js.map +1 -0
package/dist/chunk-JEQ2X3Z6.cjs +12 -0
package/dist/chunk-JEQ2X3Z6.cjs.map +1 -0
package/dist/{chunk-5P7LNNO6.js → chunk-K6QPJ2NO.js} +39 -28
package/dist/chunk-K6QPJ2NO.js.map +1 -0
package/dist/{chunk-Y23VHTD3.cjs → chunk-RZFLCWTW.cjs} +2 -2
package/dist/chunk-RZFLCWTW.cjs.map +1 -0
package/dist/{chunk-BRPM6AB6.js → chunk-Z3U6AUWX.js} +2 -2
package/dist/chunk-Z3U6AUWX.js.map +1 -0
package/dist/cli.cjs +39 -36
package/dist/cli.cjs.map +1 -1
package/dist/cli.js +37 -34
package/dist/cli.js.map +1 -1
package/dist/index.cjs +320 -104
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +93 -7
package/dist/index.d.ts +93 -7
package/dist/index.js +242 -26
package/dist/index.js.map +1 -1
package/dist/metrics/index.cjs +257 -17
package/dist/metrics/index.cjs.map +1 -1
package/dist/metrics/index.d.cts +252 -1
package/dist/metrics/index.d.ts +252 -1
package/dist/metrics/index.js +240 -2
package/dist/metrics/index.js.map +1 -1
package/dist/metrics/opinionated/index.cjs +6 -5
package/dist/metrics/opinionated/index.js +2 -1
package/package.json +4 -3
package/dist/chunk-5P7LNNO6.js.map +0 -1
package/dist/chunk-BRPM6AB6.js.map +0 -1
package/dist/chunk-HDJID3GC.cjs.map +0 -1
package/dist/chunk-Y23VHTD3.cjs.map +0 -1

package/dist/index.d.ts CHANGED

@@ -143,8 +143,12 @@ declare function runModelParallel<T extends Record<string, unknown>>(dataset: Da
 interface AlignOptions {
     /** Whether to throw on missing IDs (default: false) */
     strict?: boolean;
-    /** Field to use as ID (default: "id") */
+    /** Field to use as ID in both arrays (default: "id") - legacy option */
     idField?: string;
+    /** Field to use as ID in predictions array (default: "id") */
+    predictionIdField?: string;
+    /** Field to use as ID in expected/ground truth array (default: "id") */
+    expectedIdField?: string;
 }
 /**
  * Aligns predictions with expected values by ID
@@ -321,6 +325,52 @@ declare class FieldSelector {
      *   .toHavePercentageAbove(0.7, 0.8)
      */
     toHavePercentageAbove(valueThreshold: number, percentageThreshold: number): this;
+    /**
+     * Validates that ground truth exists and both arrays contain numeric values.
+     * Returns the filtered numeric arrays for regression metrics.
+     */
+    private validateRegressionInputs;
+    /**
+     * Asserts that Mean Absolute Error is below a threshold.
+     * Requires numeric values in both actual and expected.
+     *
+     * @param threshold - Maximum allowed MAE
+     * @returns this for method chaining
+     *
+     * @example
+     * expectStats(predictions, groundTruth)
+     *   .field("score")
+     *   .toHaveMAEBelow(0.1)
+     */
+    toHaveMAEBelow(threshold: number): this;
+    /**
+     * Asserts that Root Mean Squared Error is below a threshold.
+     * Requires numeric values in both actual and expected.
+     *
+     * @param threshold - Maximum allowed RMSE
+     * @returns this for method chaining
+     *
+     * @example
+     * expectStats(predictions, groundTruth)
+     *   .field("score")
+     *   .toHaveRMSEBelow(0.15)
+     */
+    toHaveRMSEBelow(threshold: number): this;
+    /**
+     * Asserts that R-squared (coefficient of determination) is above a threshold.
+     * R² measures how well the predictions explain the variance in expected values.
+     * R² = 1.0 means perfect prediction, R² = 0 means prediction is no better than mean.
+     * Requires numeric values in both actual and expected.
+     *
+     * @param threshold - Minimum required R² value (0-1)
+     * @returns this for method chaining
+     *
+     * @example
+     * expectStats(predictions, groundTruth)
+     *   .field("score")
+     *   .toHaveR2Above(0.8)
+     */
+    toHaveR2Above(threshold: number): this;
     /**
      * Gets the computed metrics for this field
      */
@@ -339,15 +389,40 @@ declare class FieldSelector {
  * Input types that expectStats() accepts
  */
 type StatsInput = ModelRunResult | Prediction[] | AlignedRecord[];
+/**
+ * Options for expectStats when using two-argument form
+ */
+interface ExpectStatsOptions {
+    /**
+     * Field to use as ID in both arrays (default: "id") - legacy option
+     * Also checks "_id" as fallback for expected records.
+     */
+    idField?: string;
+    /**
+     * Field to use as ID in predictions array (default: "id")
+     */
+    predictionIdField?: string;
+    /**
+     * Field to use as ID in expected/ground truth array (default: "id")
+     */
+    expectedIdField?: string;
+    /**
+     * Whether to throw on missing IDs (default: false)
+     * When true, throws if any prediction has no matching expected record.
+     */
+    strict?: boolean;
+}
 /**
  * Entry point for statistical assertions.
  *
- * Supports two usage patterns:
+ * Supports multiple usage patterns:
  * 1. Single argument: predictions without ground truth (for distribution assertions)
- * 2. Two arguments: predictions with ground truth (for classification metrics)
+ * 2. Two arguments: predictions with ground truth (for classification/regression metrics)
+ * 3. Three arguments: predictions with ground truth and options (for custom ID field)
  *
- * @param inputOrActual - Either StatsInput (one-arg) or Prediction[] (two-arg)
- * @param expected - Ground truth data (optional, only for two-arg usage)
+ * @param inputOrActual - Either StatsInput (one-arg) or Prediction[] (two/three-arg)
+ * @param expected - Ground truth data (optional, only for two/three-arg usage)
+ * @param options - Alignment options (optional, only for three-arg usage)
  * @returns ExpectStats instance for chaining assertions
  *
  * @example
@@ -357,14 +432,21 @@ type StatsInput = ModelRunResult | Prediction[] | AlignedRecord[];
  *   .toHavePercentageBelow(0.5, 0.9);
  *
  * @example
- * // Pattern 1b: Judge validation (with ground truth)
+ * // Pattern 2: Classification with ground truth
  * expectStats(judgeOutputs, humanLabels)
  *   .field("hallucinated")
  *   .toHaveRecallAbove(true, 0.85)
  *   .toHavePrecisionAbove(true, 0.8);
+ *
+ * @example
+ * // Pattern 3: Custom ID field
+ * expectStats(predictions, groundTruth, { idField: 'uuid' })
+ *   .field("score")
+ *   .toHaveAccuracyAbove(0.8);
  */
 declare function expectStats(input: StatsInput): ExpectStats;
 declare function expectStats(actual: Prediction[], expected: Array<Record<string, unknown>>): ExpectStats;
+declare function expectStats(actual: Prediction[], expected: Array<Record<string, unknown>>, options: ExpectStatsOptions): ExpectStats;
 /**
  * Main stats expectation class
  */
@@ -520,6 +602,10 @@ declare class ConsoleReporter {
      * Applies color if enabled
      */
     private color;
+    /**
+     * Formats a value for display
+     */
+    private formatValue;
     /**
      * Logs a line
      */
@@ -601,4 +687,4 @@ declare class TestExecutionError extends EvalSenseError {
     constructor(message: string, testName: string, originalError?: Error);
 }
-export { AlignedRecord, AssertionError, AssertionResult, ClassificationMetrics, ConfigurationError, ConfusionMatrix, ConsoleReporter, Dataset, DatasetError, EvalReport, EvalSenseError, FieldMetricResult, IntegrityError, IntegrityResult, JsonReporter, Prediction, TestExecutionError, TestFn, afterAll, afterEach, alignByKey, beforeAll, beforeEach, buildConfusionMatrix, checkIntegrity, computeAccuracy, computeClassificationMetrics, computeF1, computePrecision, computeRecall, createDataset, describe, discoverEvalFiles, evalTest, executeEvalFiles, expectStats, extractFieldValues, filterComplete, formatConfusionMatrix, getExitCode, it, loadDataset, parseReport, runModel, runModelParallel, test, validatePredictions };
+export { AlignedRecord, AssertionError, AssertionResult, ClassificationMetrics, ConfigurationError, ConfusionMatrix, ConsoleReporter, Dataset, DatasetError, EvalReport, EvalSenseError, type ExpectStatsOptions, FieldMetricResult, IntegrityError, IntegrityResult, JsonReporter, Prediction, TestExecutionError, TestFn, afterAll, afterEach, alignByKey, beforeAll, beforeEach, buildConfusionMatrix, checkIntegrity, computeAccuracy, computeClassificationMetrics, computeF1, computePrecision, computeRecall, createDataset, describe, discoverEvalFiles, evalTest, executeEvalFiles, expectStats, extractFieldValues, filterComplete, formatConfusionMatrix, getExitCode, it, loadDataset, parseReport, runModel, runModelParallel, test, validatePredictions };

package/dist/index.js CHANGED

@@ -1,5 +1,6 @@
-import { getCurrentSuite, setCurrentSuite, addSuite, addTestToCurrentSuite, DatasetError, IntegrityError, buildConfusionMatrix, getTruePositives, getFalsePositives, getFalseNegatives, getSupport, AssertionError, recordAssertion, recordFieldMetrics } from './chunk-5P7LNNO6.js';
-export { AssertionError, ConfigurationError, ConsoleReporter, DatasetError, EvalSenseError, ExitCodes, IntegrityError, JsonReporter, TestExecutionError, buildConfusionMatrix, discoverEvalFiles, executeEvalFiles, formatConfusionMatrix, getExitCode, parseReport } from './chunk-5P7LNNO6.js';
+import { getCurrentSuite, setCurrentSuite, addSuite, addTestToCurrentSuite, DatasetError, IntegrityError, buildConfusionMatrix, getTruePositives, getFalsePositives, getFalseNegatives, getSupport, AssertionError, recordAssertion, recordFieldMetrics } from './chunk-K6QPJ2NO.js';
+export { AssertionError, ConfigurationError, ConsoleReporter, DatasetError, EvalSenseError, ExitCodes, IntegrityError, JsonReporter, TestExecutionError, buildConfusionMatrix, discoverEvalFiles, executeEvalFiles, formatConfusionMatrix, getExitCode, parseReport } from './chunk-K6QPJ2NO.js';
+import './chunk-DGUM43GV.js';
 import { readFileSync } from 'fs';
 import { resolve, extname } from 'path';
@@ -183,9 +184,7 @@ async function runModel(dataset, modelFn) {
 function getRecordId(record) {
   const id = record.id ?? record._id;
   if (id === void 0 || id === null) {
-    throw new DatasetError(
-      'Dataset records must have an "id" or "_id" field for alignment'
-    );
+    throw new DatasetError('Dataset records must have an "id" or "_id" field for alignment');
   }
   return String(id);
 }
@@ -207,9 +206,7 @@ async function runModelParallel(dataset, modelFn, concurrency = 10) {
   for (const { prediction, record } of results) {
     const id = getRecordId(record);
     if (prediction.id !== id) {
-      throw new DatasetError(
-        `Prediction ID mismatch: expected "${id}", got "${prediction.id}".`
-      );
+      throw new DatasetError(`Prediction ID mismatch: expected "${id}", got "${prediction.id}".`);
     }
     predictions.push(prediction);
     aligned.push({
@@ -227,16 +224,28 @@ async function runModelParallel(dataset, modelFn, concurrency = 10) {
 // src/dataset/alignment.ts
 function alignByKey(predictions, expected, options = {}) {
-  const { strict = false, idField = "id" } = options;
+  const { strict = false, idField, predictionIdField, expectedIdField } = options;
+  const predIdField = predictionIdField ?? idField ?? "id";
+  const expIdField = expectedIdField ?? idField ?? "id";
   const expectedMap = /* @__PURE__ */ new Map();
   for (const record of expected) {
-    const id = String(record[idField] ?? record._id);
+    const id = String(record[expIdField] ?? record._id);
+    if (!id || id === "undefined") {
+      throw new IntegrityError(
+        `Expected record missing ${expIdField} field: ${JSON.stringify(record)}`
+      );
+    }
     expectedMap.set(id, record);
   }
   const aligned = [];
   const missingIds = [];
   for (const prediction of predictions) {
-    const id = prediction.id;
+    const id = String(prediction[predIdField]);
+    if (!id || id === "undefined") {
+      throw new IntegrityError(
+        `Prediction missing ${predIdField} field: ${JSON.stringify(prediction)}`
+      );
+    }
     const expectedRecord = expectedMap.get(id);
     if (!expectedRecord) {
       missingIds.push(id);
@@ -306,9 +315,7 @@ function checkIntegrity(dataset, options = {}) {
       }
     }
     if (requiredFields.length > 0) {
-      const missing = requiredFields.filter(
-        (field) => record[field] === void 0
-      );
+      const missing = requiredFields.filter((field) => record[field] === void 0);
       if (missing.length > 0) {
         missingFields.push({
           id: String(id ?? `record[${i}]`),
@@ -331,7 +338,9 @@ function checkIntegrity(dataset, options = {}) {
       issues.push(`${missingIds.length} record(s) missing ID`);
     }
     if (duplicateIds.length > 0) {
-      issues.push(`${duplicateIds.length} duplicate ID(s): ${duplicateIds.slice(0, 3).join(", ")}${duplicateIds.length > 3 ? "..." : ""}`);
+      issues.push(
+        `${duplicateIds.length} duplicate ID(s): ${duplicateIds.slice(0, 3).join(", ")}${duplicateIds.length > 3 ? "..." : ""}`
+      );
     }
     if (missingFields.length > 0) {
       issues.push(`${missingFields.length} record(s) missing required fields`);
@@ -429,6 +438,67 @@ function computeAccuracy(actual, expected) {
   return total > 0 ? correct / total : 0;
 }
+// src/statistics/regression.ts
+function computeRegressionMetrics(actual, expected) {
+  if (actual.length !== expected.length) {
+    throw new Error(
+      `Array length mismatch: actual has ${actual.length} elements, expected has ${expected.length}`
+    );
+  }
+  const n = actual.length;
+  if (n === 0) {
+    return { mae: 0, mse: 0, rmse: 0, r2: 0 };
+  }
+  const mae = computeMAE(actual, expected);
+  const mse = computeMSE(actual, expected);
+  const rmse = Math.sqrt(mse);
+  const r2 = computeR2(actual, expected);
+  return { mae, mse, rmse, r2 };
+}
+function computeMAE(actual, expected) {
+  if (actual.length !== expected.length || actual.length === 0) {
+    return 0;
+  }
+  let sum = 0;
+  for (let i = 0; i < actual.length; i++) {
+    sum += Math.abs((actual[i] ?? 0) - (expected[i] ?? 0));
+  }
+  return sum / actual.length;
+}
+function computeMSE(actual, expected) {
+  if (actual.length !== expected.length || actual.length === 0) {
+    return 0;
+  }
+  let sum = 0;
+  for (let i = 0; i < actual.length; i++) {
+    const diff = (actual[i] ?? 0) - (expected[i] ?? 0);
+    sum += diff * diff;
+  }
+  return sum / actual.length;
+}
+function computeR2(actual, expected) {
+  if (actual.length !== expected.length || actual.length === 0) {
+    return 0;
+  }
+  let meanExpected = 0;
+  for (const val of expected) {
+    meanExpected += val ?? 0;
+  }
+  meanExpected /= expected.length;
+  let ssTotal = 0;
+  let ssResidual = 0;
+  for (let i = 0; i < actual.length; i++) {
+    const exp = expected[i] ?? 0;
+    const act = actual[i] ?? 0;
+    ssTotal += (exp - meanExpected) ** 2;
+    ssResidual += (exp - act) ** 2;
+  }
+  if (ssTotal === 0) {
+    return ssResidual === 0 ? 1 : 0;
+  }
+  return 1 - ssResidual / ssTotal;
+}
 // src/statistics/distribution.ts
 function filterNumericValues(values) {
   return values.filter(
@@ -693,9 +763,7 @@ var FieldSelector = class {
    * Throws a clear error if expected values are missing.
    */
   validateGroundTruth() {
-    const hasExpected = this.expectedValues.some(
-      (v) => v !== void 0 && v !== null
-    );
+    const hasExpected = this.expectedValues.some((v) => v !== void 0 && v !== null);
     if (!hasExpected) {
       throw new AssertionError(
         `Classification metric requires ground truth, but field "${this.fieldName}" has no expected values. Use expectStats(predictions, groundTruth) to provide expected values.`,
@@ -920,7 +988,12 @@ var FieldSelector = class {
     this.assertions.push(result);
     recordAssertion(result);
     if (!passed) {
-      throw new AssertionError(result.message, percentageThreshold, actualPercentage, this.fieldName);
+      throw new AssertionError(
+        result.message,
+        percentageThreshold,
+        actualPercentage,
+        this.fieldName
+      );
     }
     return this;
   }
@@ -961,7 +1034,144 @@ var FieldSelector = class {
     this.assertions.push(result);
     recordAssertion(result);
     if (!passed) {
-      throw new AssertionError(result.message, percentageThreshold, actualPercentage, this.fieldName);
+      throw new AssertionError(
+        result.message,
+        percentageThreshold,
+        actualPercentage,
+        this.fieldName
+      );
+    }
+    return this;
+  }
+  // ============================================================================
+  // Regression Assertions
+  // ============================================================================
+  /**
+   * Validates that ground truth exists and both arrays contain numeric values.
+   * Returns the filtered numeric arrays for regression metrics.
+   */
+  validateRegressionInputs() {
+    this.validateGroundTruth();
+    const numericActual = filterNumericValues(this.actualValues);
+    const numericExpected = filterNumericValues(this.expectedValues);
+    if (numericActual.length === 0) {
+      throw new AssertionError(
+        `Regression metric requires numeric values, but field "${this.fieldName}" has no numeric actual values.`,
+        void 0,
+        void 0,
+        this.fieldName
+      );
+    }
+    if (numericExpected.length === 0) {
+      throw new AssertionError(
+        `Regression metric requires numeric values, but field "${this.fieldName}" has no numeric expected values.`,
+        void 0,
+        void 0,
+        this.fieldName
+      );
+    }
+    if (numericActual.length !== numericExpected.length) {
+      throw new AssertionError(
+        `Regression metric requires equal-length arrays, but got ${numericActual.length} actual and ${numericExpected.length} expected values.`,
+        numericExpected.length,
+        numericActual.length,
+        this.fieldName
+      );
+    }
+    return { actual: numericActual, expected: numericExpected };
+  }
+  /**
+   * Asserts that Mean Absolute Error is below a threshold.
+   * Requires numeric values in both actual and expected.
+   *
+   * @param threshold - Maximum allowed MAE
+   * @returns this for method chaining
+   *
+   * @example
+   * expectStats(predictions, groundTruth)
+   *   .field("score")
+   *   .toHaveMAEBelow(0.1)
+   */
+  toHaveMAEBelow(threshold) {
+    const { actual, expected } = this.validateRegressionInputs();
+    const metrics = computeRegressionMetrics(actual, expected);
+    const passed = metrics.mae <= threshold;
+    const result = {
+      type: "mae",
+      passed,
+      message: passed ? `MAE ${metrics.mae.toFixed(4)} is below ${threshold}` : `MAE ${metrics.mae.toFixed(4)} exceeds threshold ${threshold}`,
+      expected: threshold,
+      actual: metrics.mae,
+      field: this.fieldName
+    };
+    this.assertions.push(result);
+    recordAssertion(result);
+    if (!passed) {
+      throw new AssertionError(result.message, threshold, metrics.mae, this.fieldName);
+    }
+    return this;
+  }
+  /**
+   * Asserts that Root Mean Squared Error is below a threshold.
+   * Requires numeric values in both actual and expected.
+   *
+   * @param threshold - Maximum allowed RMSE
+   * @returns this for method chaining
+   *
+   * @example
+   * expectStats(predictions, groundTruth)
+   *   .field("score")
+   *   .toHaveRMSEBelow(0.15)
+   */
+  toHaveRMSEBelow(threshold) {
+    const { actual, expected } = this.validateRegressionInputs();
+    const metrics = computeRegressionMetrics(actual, expected);
+    const passed = metrics.rmse <= threshold;
+    const result = {
+      type: "rmse",
+      passed,
+      message: passed ? `RMSE ${metrics.rmse.toFixed(4)} is below ${threshold}` : `RMSE ${metrics.rmse.toFixed(4)} exceeds threshold ${threshold}`,
+      expected: threshold,
+      actual: metrics.rmse,
+      field: this.fieldName
+    };
+    this.assertions.push(result);
+    recordAssertion(result);
+    if (!passed) {
+      throw new AssertionError(result.message, threshold, metrics.rmse, this.fieldName);
+    }
+    return this;
+  }
+  /**
+   * Asserts that R-squared (coefficient of determination) is above a threshold.
+   * R² measures how well the predictions explain the variance in expected values.
+   * R² = 1.0 means perfect prediction, R² = 0 means prediction is no better than mean.
+   * Requires numeric values in both actual and expected.
+   *
+   * @param threshold - Minimum required R² value (0-1)
+   * @returns this for method chaining
+   *
+   * @example
+   * expectStats(predictions, groundTruth)
+   *   .field("score")
+   *   .toHaveR2Above(0.8)
+   */
+  toHaveR2Above(threshold) {
+    const { actual, expected } = this.validateRegressionInputs();
+    const metrics = computeRegressionMetrics(actual, expected);
+    const passed = metrics.r2 >= threshold;
+    const result = {
+      type: "r2",
+      passed,
+      message: passed ? `R\xB2 ${metrics.r2.toFixed(4)} is above ${threshold}` : `R\xB2 ${metrics.r2.toFixed(4)} is below threshold ${threshold}`,
+      expected: threshold,
+      actual: metrics.r2,
+      field: this.fieldName
+    };
+    this.assertions.push(result);
+    recordAssertion(result);
+    if (!passed) {
+      throw new AssertionError(result.message, threshold, metrics.r2, this.fieldName);
     }
     return this;
   }
@@ -998,16 +1208,22 @@ function normalizeInput(input) {
       expected: {}
     }));
   }
-  throw new Error("Invalid input to expectStats(): expected ModelRunResult, Prediction[], or AlignedRecord[]");
+  throw new Error(
+    "Invalid input to expectStats(): expected ModelRunResult, Prediction[], or AlignedRecord[]"
+  );
 }
-function expectStats(inputOrActual, expected) {
+function expectStats(inputOrActual, expected, options) {
   if (expected !== void 0) {
     if (!Array.isArray(inputOrActual)) {
-      throw new Error(
-        "When using two-argument expectStats(), first argument must be Prediction[]"
-      );
+      throw new Error("When using two-argument expectStats(), first argument must be Prediction[]");
     }
-    const aligned2 = alignByKey(inputOrActual, expected);
+    const alignOptions = options ? {
+      idField: options.idField,
+      predictionIdField: options.predictionIdField,
+      expectedIdField: options.expectedIdField,
+      strict: options.strict
+    } : void 0;
+    const aligned2 = alignByKey(inputOrActual, expected, alignOptions);
     return new ExpectStats(aligned2);
   }
   const aligned = normalizeInput(inputOrActual);