npm - evalsense - Versions diffs - 0.3.2 → 0.4.0 - Mend

evalsense 0.3.2 → 0.4.0

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (38) hide show

package/README.md +235 -98
package/dist/{chunk-BFGA2NUB.cjs → chunk-4BKZPVY4.cjs} +13 -6
package/dist/chunk-4BKZPVY4.cjs.map +1 -0
package/dist/{chunk-IYLSY7NX.js → chunk-IUVDDMJ3.js} +13 -6
package/dist/chunk-IUVDDMJ3.js.map +1 -0
package/dist/chunk-NCCQRZ2Y.cjs +1141 -0
package/dist/chunk-NCCQRZ2Y.cjs.map +1 -0
package/dist/chunk-TDGWDK2L.js +1108 -0
package/dist/chunk-TDGWDK2L.js.map +1 -0
package/dist/cli.cjs +11 -11
package/dist/cli.js +1 -1
package/dist/index-CATqAHNK.d.cts +416 -0
package/dist/index-CoMpaW-K.d.ts +416 -0
package/dist/index.cjs +507 -580
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +210 -161
package/dist/index.d.ts +210 -161
package/dist/index.js +455 -524
package/dist/index.js.map +1 -1
package/dist/metrics/index.cjs +103 -342
package/dist/metrics/index.cjs.map +1 -1
package/dist/metrics/index.d.cts +260 -31
package/dist/metrics/index.d.ts +260 -31
package/dist/metrics/index.js +24 -312
package/dist/metrics/index.js.map +1 -1
package/dist/metrics/opinionated/index.cjs +5 -5
package/dist/metrics/opinionated/index.d.cts +2 -163
package/dist/metrics/opinionated/index.d.ts +2 -163
package/dist/metrics/opinionated/index.js +1 -1
package/dist/{types-C71p0wzM.d.cts → types-D0hzfyKm.d.cts} +1 -13
package/dist/{types-C71p0wzM.d.ts → types-D0hzfyKm.d.ts} +1 -13
package/package.json +1 -1
package/dist/chunk-BFGA2NUB.cjs.map +0 -1
package/dist/chunk-IYLSY7NX.js.map +0 -1
package/dist/chunk-RZFLCWTW.cjs +0 -942
package/dist/chunk-RZFLCWTW.cjs.map +0 -1
package/dist/chunk-Z3U6AUWX.js +0 -925
package/dist/chunk-Z3U6AUWX.js.map +0 -1

package/dist/index.d.ts CHANGED Viewed

@@ -1,5 +1,5 @@
-import { T as TestFn, D as Dataset, P as Prediction, A as AlignedRecord, I as IntegrityResult, C as ClassificationMetrics, c as AssertionResult, d as ConfusionMatrix, E as EvalReport, F as FieldMetricResult } from './types-C71p0wzM.js';
-export { e as CLIOptions, f as ClassMetrics, g as DatasetMetadata, h as EvalTest, i as ExitCode, j as ExitCodes, a as MetricConfig, M as MetricFn, b as MetricOutput, R as RegressionMetrics, S as Suite, k as SuiteResult, l as TestContext, m as TestResult } from './types-C71p0wzM.js';
+import { T as TestFn, P as Prediction, A as AlignedRecord, I as IntegrityResult, b as AssertionResult, C as ClassificationMetrics, c as ConfusionMatrix, E as EvalReport, F as FieldMetricResult } from './types-D0hzfyKm.js';
+export { d as CLIOptions, e as ClassMetrics, f as EvalTest, g as ExitCode, h as ExitCodes, i as MetricConfig, M as MetricFn, a as MetricOutput, R as RegressionMetrics, S as Suite, j as SuiteResult, k as TestContext, l as TestResult } from './types-D0hzfyKm.js';
 /**
  * describe() implementation - Jest-like test suite grouping
@@ -73,65 +73,6 @@ declare function evalTestSkip(name: string, _fn: TestFn): void;
  */
 declare function evalTestOnly(name: string, fn: TestFn): void;
-/**
- * Dataset loading functionality
- */
-/**
- * Loads a dataset from a JSON or NDJSON file
- *
- * @param path - Path to the dataset file (relative to cwd or absolute)
- * @returns Dataset with records and metadata
- *
- * @example
- * ```ts
- * const dataset = loadDataset("./fixtures/sentiment.json");
- * // dataset.records = [{ id: "1", text: "...", sentiment: "positive" }, ...]
- * ```
- */
-declare function loadDataset<T extends Record<string, unknown> = Record<string, unknown>>(path: string): Dataset<T>;
-/**
- * Creates a dataset from an array of records (for testing/programmatic use)
- */
-declare function createDataset<T extends Record<string, unknown>>(records: T[], source?: string): Dataset<T>;
-/**
- * runModel() - executes a model function against a dataset
- */
-/**
- * Model function signature - takes a record and returns a prediction
- */
-type ModelFn<T> = (record: T) => Prediction | Promise<Prediction>;
-/**
- * Result of running a model on a dataset
- */
-interface ModelRunResult {
-    predictions: Prediction[];
-    aligned: AlignedRecord[];
-    duration: number;
-}
-/**
- * Runs a model function against each record in a dataset
- *
- * @param dataset - The dataset to process
- * @param modelFn - Function that processes each record and returns a prediction
- * @returns Aligned predictions with actual vs expected values
- *
- * @example
- * ```ts
- * const result = await runModel(dataset, (record) => ({
- *   id: record.id,
- *   sentiment: classify(record.text)
- * }));
- * ```
- */
-declare function runModel<T extends Record<string, unknown>>(dataset: Dataset<T>, modelFn: ModelFn<T>): Promise<ModelRunResult>;
-/**
- * Runs model in parallel with concurrency limit
- */
-declare function runModelParallel<T extends Record<string, unknown>>(dataset: Dataset<T>, modelFn: ModelFn<T>, concurrency?: number): Promise<ModelRunResult>;
 /**
  * Dataset alignment utilities
  * Aligns predictions with ground truth by ID
@@ -192,11 +133,11 @@ interface IntegrityOptions {
 /**
  * Checks dataset integrity - validates IDs and required fields
  *
- * @param dataset - Dataset to check
+ * @param records - Array of records to check
  * @param options - Integrity check options
  * @returns Integrity result with details
  */
-declare function checkIntegrity<T extends Record<string, unknown>>(dataset: Dataset<T>, options?: IntegrityOptions): IntegrityResult;
+declare function checkIntegrity<T extends Record<string, unknown>>(records: T[], options?: IntegrityOptions): IntegrityResult;
 /**
  * Validates predictions against a dataset
  */
@@ -206,6 +147,51 @@ declare function validatePredictions(predictions: Prediction[], expectedIds: str
     extra: string[];
 };
+/**
+ * MetricMatcher - provides Jest-like assertion methods for metrics
+ */
+interface MetricMatcherContext<TParent> {
+    parent: TParent;
+    metricName: string;
+    metricValue: number;
+    fieldName: string;
+    targetClass?: string;
+    assertions: AssertionResult[];
+    formatValue?: (value: number) => string;
+}
+/**
+ * Matcher class for individual metric assertions
+ * Returns the parent selector to enable fluent chaining
+ */
+declare class MetricMatcher<TParent> {
+    private context;
+    constructor(context: MetricMatcherContext<TParent>);
+    private formatMetricValue;
+    private createAssertion;
+    private recordAndReturn;
+    /**
+     * Assert that the metric is greater than or equal to the threshold (>=)
+     */
+    toBeAtLeast(threshold: number): TParent;
+    /**
+     * Assert that the metric is strictly greater than the threshold (>)
+     */
+    toBeAbove(threshold: number): TParent;
+    /**
+     * Assert that the metric is less than or equal to the threshold (<=)
+     */
+    toBeAtMost(threshold: number): TParent;
+    /**
+     * Assert that the metric is strictly less than the threshold (<)
+     */
+    toBeBelow(threshold: number): TParent;
+    /**
+     * Assert that the metric equals the expected value (with optional tolerance for floats)
+     */
+    toEqual(expected: number, tolerance?: number): TParent;
+}
 /**
  * Selector for binarized fields (continuous → binary threshold)
  */
@@ -217,29 +203,48 @@ declare class BinarizeSelector {
     private assertions;
     constructor(aligned: AlignedRecord[], fieldName: string, threshold: number);
     /**
-     * Asserts that accuracy is above a threshold
+     * Access accuracy metric for assertions
+     * @example
+     * expectStats(predictions, groundTruth)
+     *   .field("score")
+     *   .binarize(0.5)
+     *   .accuracy.toBeAtLeast(0.8)
      */
-    toHaveAccuracyAbove(threshold: number): this;
+    get accuracy(): MetricMatcher<this>;
     /**
-     * Asserts that precision is above a threshold
-     * @param classOrThreshold - Either the class (true/false) or threshold
-     * @param threshold - Threshold when class is specified
+     * Access F1 score metric for assertions (macro average)
+     * @example
+     * expectStats(predictions, groundTruth)
+     *   .field("score")
+     *   .binarize(0.5)
+     *   .f1.toBeAtLeast(0.75)
      */
-    toHavePrecisionAbove(classOrThreshold: boolean | number, threshold?: number): this;
+    get f1(): MetricMatcher<this>;
     /**
-     * Asserts that recall is above a threshold
-     * @param classOrThreshold - Either the class (true/false) or threshold
-     * @param threshold - Threshold when class is specified
+     * Access precision metric for assertions
+     * @param targetClass - Optional boolean class (true/false). If omitted, uses macro average
+     * @example
+     * expectStats(predictions, groundTruth)
+     *   .field("score")
+     *   .binarize(0.5)
+     *   .precision(true).toBeAtLeast(0.7)
      */
-    toHaveRecallAbove(classOrThreshold: boolean | number, threshold?: number): this;
+    precision(targetClass?: boolean): MetricMatcher<this>;
     /**
-     * Asserts that F1 score is above a threshold
+     * Access recall metric for assertions
+     * @param targetClass - Optional boolean class (true/false). If omitted, uses macro average
+     * @example
+     * expectStats(predictions, groundTruth)
+     *   .field("score")
+     *   .binarize(0.5)
+     *   .recall(true).toBeAtLeast(0.7)
      */
-    toHaveF1Above(classOrThreshold: boolean | number, threshold?: number): this;
+    recall(targetClass?: boolean): MetricMatcher<this>;
     /**
-     * Includes the confusion matrix in the report
+     * Displays the confusion matrix in the report
+     * This is not an assertion - it always passes and just records the matrix for display
      */
-    toHaveConfusionMatrix(): this;
+    displayConfusionMatrix(): this;
     /**
      * Gets computed metrics
      */
@@ -250,6 +255,47 @@ declare class BinarizeSelector {
     getAssertions(): AssertionResult[];
 }
+/**
+ * PercentageMatcher - provides assertion methods for percentage-based distribution checks
+ */
+type PercentageDirection = "above" | "below";
+interface PercentageMatcherContext<TParent> {
+    parent: TParent;
+    fieldName: string;
+    valueThreshold: number;
+    direction: PercentageDirection;
+    actualPercentage: number;
+    assertions: AssertionResult[];
+}
+/**
+ * Matcher class for percentage-based distribution assertions
+ * Returns the parent selector to enable fluent chaining
+ */
+declare class PercentageMatcher<TParent> {
+    private context;
+    constructor(context: PercentageMatcherContext<TParent>);
+    private formatPercentage;
+    private createAssertion;
+    private recordAndReturn;
+    /**
+     * Assert that the percentage is greater than or equal to the threshold (>=)
+     */
+    toBeAtLeast(percentageThreshold: number): TParent;
+    /**
+     * Assert that the percentage is strictly greater than the threshold (>)
+     */
+    toBeAbove(percentageThreshold: number): TParent;
+    /**
+     * Assert that the percentage is less than or equal to the threshold (<=)
+     */
+    toBeAtMost(percentageThreshold: number): TParent;
+    /**
+     * Assert that the percentage is strictly less than the threshold (<)
+     */
+    toBeBelow(percentageThreshold: number): TParent;
+}
 /**
  * Field selector for building assertions on a specific field
  */
@@ -270,109 +316,98 @@ declare class FieldSelector {
      */
     private validateGroundTruth;
     /**
-     * Asserts that accuracy is above a threshold
-     */
-    toHaveAccuracyAbove(threshold: number): this;
-    /**
-     * Asserts that precision is above a threshold
-     * @param classOrThreshold - Either the class name or threshold (if class is omitted, uses macro average)
-     * @param threshold - Threshold when class is specified
-     */
-    toHavePrecisionAbove(classOrThreshold: string | number, threshold?: number): this;
-    /**
-     * Asserts that recall is above a threshold
-     * @param classOrThreshold - Either the class name or threshold (if class is omitted, uses macro average)
-     * @param threshold - Threshold when class is specified
+     * Validates that ground truth exists and both arrays contain numeric values.
+     * Returns the filtered numeric arrays for regression metrics.
      */
-    toHaveRecallAbove(classOrThreshold: string | number, threshold?: number): this;
+    private validateRegressionInputs;
     /**
-     * Asserts that F1 score is above a threshold
-     * @param classOrThreshold - Either the class name or threshold (if class is omitted, uses macro average)
-     * @param threshold - Threshold when class is specified
+     * Access accuracy metric for assertions
+     * @example
+     * expectStats(predictions, groundTruth)
+     *   .field("sentiment")
+     *   .accuracy.toBeAtLeast(0.8)
      */
-    toHaveF1Above(classOrThreshold: string | number, threshold?: number): this;
+    get accuracy(): MetricMatcher<this>;
     /**
-     * Includes the confusion matrix in the report
+     * Access F1 score metric for assertions (macro average)
+     * @example
+     * expectStats(predictions, groundTruth)
+     *   .field("sentiment")
+     *   .f1.toBeAtLeast(0.75)
      */
-    toHaveConfusionMatrix(): this;
+    get f1(): MetricMatcher<this>;
     /**
-     * Asserts that a percentage of values are below or equal to a threshold.
-     * This is a distributional assertion that only looks at actual values (no ground truth required).
-     *
-     * @param valueThreshold - The value threshold to compare against
-     * @param percentageThreshold - The minimum percentage (0-1) of values that should be <= valueThreshold
-     * @returns this for method chaining
-     *
+     * Access precision metric for assertions
+     * @param targetClass - Optional class name. If omitted, uses macro average
      * @example
-     * // Assert that 90% of confidence scores are below 0.5
-     * expectStats(predictions)
-     *   .field("confidence")
-     *   .toHavePercentageBelow(0.5, 0.9)
+     * expectStats(predictions, groundTruth)
+     *   .field("sentiment")
+     *   .precision("positive").toBeAtLeast(0.7)
      */
-    toHavePercentageBelow(valueThreshold: number, percentageThreshold: number): this;
+    precision(targetClass?: string): MetricMatcher<this>;
     /**
-     * Asserts that a percentage of values are above a threshold.
-     * This is a distributional assertion that only looks at actual values (no ground truth required).
-     *
-     * @param valueThreshold - The value threshold to compare against
-     * @param percentageThreshold - The minimum percentage (0-1) of values that should be > valueThreshold
-     * @returns this for method chaining
-     *
+     * Access recall metric for assertions
+     * @param targetClass - Optional class name. If omitted, uses macro average
      * @example
-     * // Assert that 80% of quality scores are above 0.7
-     * expectStats(predictions)
-     *   .field("quality")
-     *   .toHavePercentageAbove(0.7, 0.8)
+     * expectStats(predictions, groundTruth)
+     *   .field("sentiment")
+     *   .recall("positive").toBeAtLeast(0.7)
      */
-    toHavePercentageAbove(valueThreshold: number, percentageThreshold: number): this;
+    recall(targetClass?: string): MetricMatcher<this>;
     /**
-     * Validates that ground truth exists and both arrays contain numeric values.
-     * Returns the filtered numeric arrays for regression metrics.
+     * Access Mean Absolute Error metric for assertions
+     * @example
+     * expectStats(predictions, groundTruth)
+     *   .field("score")
+     *   .mae.toBeAtMost(0.1)
      */
-    private validateRegressionInputs;
+    get mae(): MetricMatcher<this>;
     /**
-     * Asserts that Mean Absolute Error is below a threshold.
-     * Requires numeric values in both actual and expected.
-     *
-     * @param threshold - Maximum allowed MAE
-     * @returns this for method chaining
-     *
+     * Access Root Mean Squared Error metric for assertions
      * @example
      * expectStats(predictions, groundTruth)
      *   .field("score")
-     *   .toHaveMAEBelow(0.1)
+     *   .rmse.toBeAtMost(0.15)
      */
-    toHaveMAEBelow(threshold: number): this;
+    get rmse(): MetricMatcher<this>;
     /**
-     * Asserts that Root Mean Squared Error is below a threshold.
-     * Requires numeric values in both actual and expected.
-     *
-     * @param threshold - Maximum allowed RMSE
-     * @returns this for method chaining
-     *
+     * Access R-squared (coefficient of determination) metric for assertions
      * @example
      * expectStats(predictions, groundTruth)
      *   .field("score")
-     *   .toHaveRMSEBelow(0.15)
-     */
-    toHaveRMSEBelow(threshold: number): this;
-    /**
-     * Asserts that R-squared (coefficient of determination) is above a threshold.
-     * R² measures how well the predictions explain the variance in expected values.
-     * R² = 1.0 means perfect prediction, R² = 0 means prediction is no better than mean.
-     * Requires numeric values in both actual and expected.
-     *
-     * @param threshold - Minimum required R² value (0-1)
-     * @returns this for method chaining
-     *
+     *   .r2.toBeAtLeast(0.8)
+     */
+    get r2(): MetricMatcher<this>;
+    /**
+     * Assert on the percentage of values below or equal to a threshold
+     * @param valueThreshold - The value threshold to compare against
+     * @example
+     * expectStats(predictions)
+     *   .field("confidence")
+     *   .percentageBelow(0.5).toBeAtLeast(0.9)
+     */
+    percentageBelow(valueThreshold: number): PercentageMatcher<this>;
+    /**
+     * Assert on the percentage of values above a threshold
+     * @param valueThreshold - The value threshold to compare against
+     * @example
+     * expectStats(predictions)
+     *   .field("quality")
+     *   .percentageAbove(0.7).toBeAtLeast(0.8)
+     */
+    percentageAbove(valueThreshold: number): PercentageMatcher<this>;
+    /**
+     * Displays the confusion matrix in the report
+     * This is not an assertion - it always passes and just records the matrix for display
      * @example
      * expectStats(predictions, groundTruth)
-     *   .field("score")
-     *   .toHaveR2Above(0.8)
+     *   .field("sentiment")
+     *   .accuracy.toBeAtLeast(0.8)
+     *   .displayConfusionMatrix()
      */
-    toHaveR2Above(threshold: number): this;
+    displayConfusionMatrix(): this;
     /**
-     * Gets the computed metrics for this field
+     * Gets the computed classification metrics for this field
      */
     getMetrics(): ClassificationMetrics;
     /**
@@ -385,10 +420,16 @@ declare class FieldSelector {
  * expectStats() - fluent assertion API for statistical evaluation
  */
+/**
+ * Object with aligned records (e.g., from custom model execution)
+ */
+interface AlignedRecordsInput {
+    aligned: AlignedRecord[];
+}
 /**
  * Input types that expectStats() accepts
  */
-type StatsInput = ModelRunResult | Prediction[] | AlignedRecord[];
+type StatsInput = AlignedRecordsInput | Prediction[] | AlignedRecord[];
 /**
  * Options for expectStats when using two-argument form
  */
@@ -429,20 +470,20 @@ interface ExpectStatsOptions {
  * // Pattern 1: Distribution assertions (no ground truth)
  * expectStats(predictions)
  *   .field("confidence")
- *   .toHavePercentageBelow(0.5, 0.9);
+ *   .percentageBelow(0.5).toBeAtLeast(0.9);
  *
  * @example
  * // Pattern 2: Classification with ground truth
  * expectStats(judgeOutputs, humanLabels)
  *   .field("hallucinated")
- *   .toHaveRecallAbove(true, 0.85)
- *   .toHavePrecisionAbove(true, 0.8);
+ *   .recall(true).toBeAtLeast(0.85)
+ *   .precision(true).toBeAtLeast(0.8);
  *
  * @example
  * // Pattern 3: Custom ID field
  * expectStats(predictions, groundTruth, { idField: 'uuid' })
  *   .field("score")
- *   .toHaveAccuracyAbove(0.8);
+ *   .accuracy.toBeAtLeast(0.8);
  */
 declare function expectStats(input: StatsInput): ExpectStats;
 declare function expectStats(actual: Prediction[], expected: Array<Record<string, unknown>>): ExpectStats;
@@ -489,7 +530,15 @@ declare class ExpectStats {
  */
 declare function buildConfusionMatrix(actual: unknown[], expected: unknown[]): ConfusionMatrix;
 /**
- * Formats a confusion matrix as a string table
+ * Formats a confusion matrix as a string table with axis labels
+ *
+ * Output format:
+ * ```
+ * Predicted →   negative positive
+ * Actual ↓
+ *   negative           5        1
+ *   positive           2        7
+ * ```
  */
 declare function formatConfusionMatrix(cm: ConfusionMatrix): string;
@@ -687,4 +736,4 @@ declare class TestExecutionError extends EvalSenseError {
     constructor(message: string, testName: string, originalError?: Error);
 }
-export { AlignedRecord, AssertionError, AssertionResult, ClassificationMetrics, ConfigurationError, ConfusionMatrix, ConsoleReporter, Dataset, DatasetError, EvalReport, EvalSenseError, type ExpectStatsOptions, FieldMetricResult, IntegrityError, IntegrityResult, JsonReporter, Prediction, TestExecutionError, TestFn, afterAll, afterEach, alignByKey, beforeAll, beforeEach, buildConfusionMatrix, checkIntegrity, computeAccuracy, computeClassificationMetrics, computeF1, computePrecision, computeRecall, createDataset, describe, discoverEvalFiles, evalTest, executeEvalFiles, expectStats, extractFieldValues, filterComplete, formatConfusionMatrix, getExitCode, it, loadDataset, parseReport, runModel, runModelParallel, test, validatePredictions };
+export { AlignedRecord, type AlignedRecordsInput, AssertionError, AssertionResult, ClassificationMetrics, ConfigurationError, ConfusionMatrix, ConsoleReporter, DatasetError, EvalReport, EvalSenseError, type ExpectStatsOptions, FieldMetricResult, IntegrityError, IntegrityResult, JsonReporter, Prediction, type StatsInput, TestExecutionError, TestFn, afterAll, afterEach, alignByKey, beforeAll, beforeEach, buildConfusionMatrix, checkIntegrity, computeAccuracy, computeClassificationMetrics, computeF1, computePrecision, computeRecall, describe, discoverEvalFiles, evalTest, executeEvalFiles, expectStats, extractFieldValues, filterComplete, formatConfusionMatrix, getExitCode, it, parseReport, test, validatePredictions };