npm - evalsense - Versions diffs - 0.2.0 - Mend

evalsense 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

package/README.md +678 -0
package/bin/evalsense.js +3 -0
package/dist/chunk-5P7LNNO6.js +747 -0
package/dist/chunk-5P7LNNO6.js.map +1 -0
package/dist/chunk-BRPM6AB6.js +925 -0
package/dist/chunk-BRPM6AB6.js.map +1 -0
package/dist/chunk-HDJID3GC.cjs +779 -0
package/dist/chunk-HDJID3GC.cjs.map +1 -0
package/dist/chunk-Y23VHTD3.cjs +942 -0
package/dist/chunk-Y23VHTD3.cjs.map +1 -0
package/dist/cli.cjs +65 -0
package/dist/cli.cjs.map +1 -0
package/dist/cli.d.cts +1 -0
package/dist/cli.d.ts +1 -0
package/dist/cli.js +63 -0
package/dist/cli.js.map +1 -0
package/dist/index.cjs +1126 -0
package/dist/index.cjs.map +1 -0
package/dist/index.d.cts +604 -0
package/dist/index.d.ts +604 -0
package/dist/index.js +1043 -0
package/dist/index.js.map +1 -0
package/dist/metrics/index.cjs +275 -0
package/dist/metrics/index.cjs.map +1 -0
package/dist/metrics/index.d.cts +299 -0
package/dist/metrics/index.d.ts +299 -0
package/dist/metrics/index.js +191 -0
package/dist/metrics/index.js.map +1 -0
package/dist/metrics/opinionated/index.cjs +24 -0
package/dist/metrics/opinionated/index.cjs.map +1 -0
package/dist/metrics/opinionated/index.d.cts +163 -0
package/dist/metrics/opinionated/index.d.ts +163 -0
package/dist/metrics/opinionated/index.js +3 -0
package/dist/metrics/opinionated/index.js.map +1 -0
package/dist/types-C71p0wzM.d.cts +265 -0
package/dist/types-C71p0wzM.d.ts +265 -0
package/package.json +91 -0

package/dist/index.d.cts ADDED Viewed

@@ -0,0 +1,604 @@
+import { T as TestFn, D as Dataset, P as Prediction, A as AlignedRecord, I as IntegrityResult, C as ClassificationMetrics, c as AssertionResult, d as ConfusionMatrix, E as EvalReport, F as FieldMetricResult } from './types-C71p0wzM.cjs';
+export { e as CLIOptions, f as ClassMetrics, g as DatasetMetadata, h as EvalTest, i as ExitCode, j as ExitCodes, a as MetricConfig, M as MetricFn, b as MetricOutput, R as RegressionMetrics, S as Suite, k as SuiteResult, l as TestContext, m as TestResult } from './types-C71p0wzM.cjs';
+/**
+ * describe() implementation - Jest-like test suite grouping
+ */
+/**
+ * Creates a test suite that groups related eval tests
+ *
+ * @example
+ * ```ts
+ * describe("Sentiment classifier", () => {
+ *   evalTest("accuracy above 80%", async () => {
+ *     // test implementation
+ *   });
+ * });
+ * ```
+ */
+declare function describe(name: string, fn: () => void): void;
+/**
+ * Lifecycle hook - runs once before all tests in the suite
+ */
+declare function beforeAll(fn: TestFn): void;
+/**
+ * Lifecycle hook - runs once after all tests in the suite
+ */
+declare function afterAll(fn: TestFn): void;
+/**
+ * Lifecycle hook - runs before each test in the suite
+ */
+declare function beforeEach(fn: TestFn): void;
+/**
+ * Lifecycle hook - runs after each test in the suite
+ */
+declare function afterEach(fn: TestFn): void;
+/**
+ * evalTest() implementation - defines an individual evaluation test
+ */
+/**
+ * Defines an individual evaluation test within a describe() block
+ *
+ * @example
+ * ```ts
+ * evalTest("accuracy above 80%", async () => {
+ *   const dataset = loadDataset("./data.json");
+ *   const predictions = await runModel(dataset, classify);
+ *
+ *   expectStats(predictions)
+ *     .field("sentiment")
+ *     .toHaveAccuracyAbove(0.8);
+ * });
+ * ```
+ */
+declare function evalTest(name: string, fn: TestFn): void;
+declare namespace evalTest {
+    var skip: typeof evalTestSkip;
+    var only: typeof evalTestOnly;
+}
+/**
+ * Aliases for evalTest - some users may prefer "test" or "it"
+ */
+declare const test: typeof evalTest;
+declare const it: typeof evalTest;
+/**
+ * Skipped test - registers but doesn't run
+ */
+declare function evalTestSkip(name: string, _fn: TestFn): void;
+/**
+ * Focused test - only runs this test (TODO: implement filtering)
+ */
+declare function evalTestOnly(name: string, fn: TestFn): void;
+/**
+ * Dataset loading functionality
+ */
+/**
+ * Loads a dataset from a JSON or NDJSON file
+ *
+ * @param path - Path to the dataset file (relative to cwd or absolute)
+ * @returns Dataset with records and metadata
+ *
+ * @example
+ * ```ts
+ * const dataset = loadDataset("./fixtures/sentiment.json");
+ * // dataset.records = [{ id: "1", text: "...", sentiment: "positive" }, ...]
+ * ```
+ */
+declare function loadDataset<T extends Record<string, unknown> = Record<string, unknown>>(path: string): Dataset<T>;
+/**
+ * Creates a dataset from an array of records (for testing/programmatic use)
+ */
+declare function createDataset<T extends Record<string, unknown>>(records: T[], source?: string): Dataset<T>;
+/**
+ * runModel() - executes a model function against a dataset
+ */
+/**
+ * Model function signature - takes a record and returns a prediction (or a promise of one)
+ */
+type ModelFn<T> = (record: T) => Prediction | Promise<Prediction>;
+/**
+ * Result of running a model on a dataset
+ */
+interface ModelRunResult {
+    predictions: Prediction[];
+    aligned: AlignedRecord[];
+    duration: number;
+}
+/**
+ * Runs a model function against each record in a dataset
+ *
+ * @param dataset - The dataset to process
+ * @param modelFn - Function that processes each record and returns a prediction
+ * @returns Run result holding the predictions, the aligned records, and the run duration
+ *
+ * @example
+ * ```ts
+ * const result = await runModel(dataset, (record) => ({
+ *   id: record.id,
+ *   sentiment: classify(record.text)
+ * }));
+ * ```
+ */
+declare function runModel<T extends Record<string, unknown>>(dataset: Dataset<T>, modelFn: ModelFn<T>): Promise<ModelRunResult>;
+/**
+ * Runs the model over a dataset in parallel, bounded by the optional `concurrency` limit
+ */
+declare function runModelParallel<T extends Record<string, unknown>>(dataset: Dataset<T>, modelFn: ModelFn<T>, concurrency?: number): Promise<ModelRunResult>;
+/**
+ * Dataset alignment utilities
+ * Aligns predictions with ground truth by ID
+ */
+/**
+ * Options for alignment
+ */
+interface AlignOptions {
+    /** Whether to throw on missing IDs (default: false) */
+    strict?: boolean;
+    /** Field to use as ID (default: "id") */
+    idField?: string;
+}
+/**
+ * Aligns predictions with expected values by ID
+ *
+ * @param predictions - Model predictions with IDs
+ * @param expected - Ground truth records with IDs
+ * @param options - Alignment options
+ * @returns Array of aligned records
+ */
+declare function alignByKey(predictions: Prediction[], expected: Array<Record<string, unknown>>, options?: AlignOptions): AlignedRecord[];
+/**
+ * Extracts field values from aligned records for statistical analysis
+ *
+ * @param aligned - Aligned records
+ * @param field - Field name to extract
+ * @returns Object with actual, expected, and ids arrays
+ */
+declare function extractFieldValues(aligned: AlignedRecord[], field: string): {
+    actual: unknown[];
+    expected: unknown[];
+    ids: string[];
+};
+/**
+ * Filters aligned records to only those with values in both actual and expected
+ */
+declare function filterComplete(aligned: AlignedRecord[], field: string): AlignedRecord[];
+/**
+ * Dataset integrity checks
+ */
+/**
+ * Options for integrity checks
+ */
+interface IntegrityOptions {
+    /** Required fields that must be present in each record */
+    requiredFields?: string[];
+    /** Whether to throw on integrity failures (default: false) */
+    throwOnFailure?: boolean;
+}
+/**
+ * Checks dataset integrity - validates IDs and required fields
+ *
+ * @param dataset - Dataset to check
+ * @param options - Integrity check options
+ * @returns Integrity result with details
+ */
+declare function checkIntegrity<T extends Record<string, unknown>>(dataset: Dataset<T>, options?: IntegrityOptions): IntegrityResult;
+/**
+ * Validates predictions against a list of expected IDs; the result carries `valid`, `missing`, and `extra`
+ */
+declare function validatePredictions(predictions: Prediction[], expectedIds: string[]): {
+    valid: boolean;
+    missing: string[];
+    extra: string[];
+};
+/**
+ * Selector for binarized fields (continuous → binary threshold)
+ */
+declare class BinarizeSelector {
+    private fieldName;
+    private threshold;
+    private binaryActual;
+    private binaryExpected;
+    private assertions;
+    constructor(aligned: AlignedRecord[], fieldName: string, threshold: number);
+    /**
+     * Asserts that accuracy is above a threshold
+     */
+    toHaveAccuracyAbove(threshold: number): this;
+    /**
+     * Asserts that precision is above a threshold
+     * @param classOrThreshold - Either the class (true/false) or threshold
+     * @param threshold - Threshold when class is specified
+     */
+    toHavePrecisionAbove(classOrThreshold: boolean | number, threshold?: number): this;
+    /**
+     * Asserts that recall is above a threshold
+     * @param classOrThreshold - Either the class (true/false) or threshold
+     * @param threshold - Threshold when class is specified
+     */
+    toHaveRecallAbove(classOrThreshold: boolean | number, threshold?: number): this;
+    /**
+     * Asserts that F1 score is above a threshold; first argument is either the class (true/false) or the threshold
+     */
+    toHaveF1Above(classOrThreshold: boolean | number, threshold?: number): this;
+    /**
+     * Includes the confusion matrix in the report
+     */
+    toHaveConfusionMatrix(): this;
+    /**
+     * Gets computed metrics
+     */
+    getMetrics(): ClassificationMetrics;
+    /**
+     * Gets all assertions made
+     */
+    getAssertions(): AssertionResult[];
+}
+/**
+ * Field selector for building assertions on a specific field
+ */
+declare class FieldSelector {
+    private aligned;
+    private fieldName;
+    private actualValues;
+    private expectedValues;
+    private assertions;
+    constructor(aligned: AlignedRecord[], fieldName: string);
+    /**
+     * Transforms continuous scores to binary classification using a threshold
+     */
+    binarize(threshold: number): BinarizeSelector;
+    /**
+     * Validates that ground truth exists for classification metrics.
+     * Throws a clear error if expected values are missing.
+     */
+    private validateGroundTruth;
+    /**
+     * Asserts that accuracy is above a threshold
+     */
+    toHaveAccuracyAbove(threshold: number): this;
+    /**
+     * Asserts that precision is above a threshold
+     * @param classOrThreshold - Either the class name or threshold (if class is omitted, uses macro average)
+     * @param threshold - Threshold when class is specified
+     */
+    toHavePrecisionAbove(classOrThreshold: string | number, threshold?: number): this;
+    /**
+     * Asserts that recall is above a threshold
+     * @param classOrThreshold - Either the class name or threshold (if class is omitted, uses macro average)
+     * @param threshold - Threshold when class is specified
+     */
+    toHaveRecallAbove(classOrThreshold: string | number, threshold?: number): this;
+    /**
+     * Asserts that F1 score is above a threshold
+     * @param classOrThreshold - Either the class name or threshold (if class is omitted, uses macro average)
+     * @param threshold - Threshold when class is specified
+     */
+    toHaveF1Above(classOrThreshold: string | number, threshold?: number): this;
+    /**
+     * Includes the confusion matrix in the report
+     */
+    toHaveConfusionMatrix(): this;
+    /**
+     * Asserts that at least a given percentage of values are below or equal to a threshold.
+     * This is a distributional assertion that only looks at actual values (no ground truth required).
+     *
+     * @param valueThreshold - The value threshold to compare against
+     * @param percentageThreshold - The minimum percentage (0-1) of values that should be <= valueThreshold
+     * @returns this for method chaining
+     *
+     * @example
+     * // Assert that at least 90% of confidence scores are at or below 0.5
+     * expectStats(predictions)
+     *   .field("confidence")
+     *   .toHavePercentageBelow(0.5, 0.9)
+     */
+    toHavePercentageBelow(valueThreshold: number, percentageThreshold: number): this;
+    /**
+     * Asserts that at least a given percentage of values are above a threshold.
+     * This is a distributional assertion that only looks at actual values (no ground truth required).
+     *
+     * @param valueThreshold - The value threshold to compare against
+     * @param percentageThreshold - The minimum percentage (0-1) of values that should be > valueThreshold
+     * @returns this for method chaining
+     *
+     * @example
+     * // Assert that at least 80% of quality scores are above 0.7
+     * expectStats(predictions)
+     *   .field("quality")
+     *   .toHavePercentageAbove(0.7, 0.8)
+     */
+    toHavePercentageAbove(valueThreshold: number, percentageThreshold: number): this;
+    /**
+     * Gets the computed metrics for this field
+     */
+    getMetrics(): ClassificationMetrics;
+    /**
+     * Gets all assertions made on this field
+     */
+    getAssertions(): AssertionResult[];
+}
+/**
+ * expectStats() - fluent assertion API for statistical evaluation
+ */
+/**
+ * Input types that expectStats() accepts
+ */
+type StatsInput = ModelRunResult | Prediction[] | AlignedRecord[];
+/**
+ * Entry point for statistical assertions.
+ *
+ * Supports two usage patterns:
+ * 1. Single argument: predictions without ground truth (for distribution assertions)
+ * 2. Two arguments: predictions with ground truth (for classification metrics)
+ *
+ * @param inputOrActual - Either StatsInput (one-arg) or Prediction[] (two-arg)
+ * @param expected - Ground truth data (optional, only for two-arg usage)
+ * @returns ExpectStats instance for chaining assertions
+ *
+ * @example
+ * // Pattern 1: Distribution assertions (no ground truth)
+ * expectStats(predictions)
+ *   .field("confidence")
+ *   .toHavePercentageBelow(0.5, 0.9);
+ *
+ * @example
+ * // Pattern 2: Judge validation (with ground truth)
+ * expectStats(judgeOutputs, humanLabels)
+ *   .field("hallucinated")
+ *   .toHaveRecallAbove(true, 0.85)
+ *   .toHavePrecisionAbove(true, 0.8);
+ */
+declare function expectStats(input: StatsInput): ExpectStats;
+declare function expectStats(actual: Prediction[], expected: Array<Record<string, unknown>>): ExpectStats;
+/**
+ * Main stats expectation class
+ */
+declare class ExpectStats {
+    private aligned;
+    constructor(aligned: AlignedRecord[]);
+    /**
+     * Selects a field to evaluate
+     */
+    field(fieldName: string): FieldSelector;
+    /**
+     * Gets the raw aligned records (for advanced use)
+     */
+    getAligned(): AlignedRecord[];
+    /**
+     * Gets the count of records
+     */
+    count(): number;
+}
+/**
+ * Confusion matrix computation
+ */
+/**
+ * Builds a confusion matrix from actual (model-predicted) and expected (ground-truth) values
+ *
+ * @param actual - Actual/predicted values from the model
+ * @param expected - Expected/ground truth values
+ * @returns ConfusionMatrix with matrix, labels, and total
+ *
+ * @example
+ * ```ts
+ * const matrix = buildConfusionMatrix(
+ *   ["positive", "negative", "positive"],
+ *   ["positive", "positive", "positive"]
+ * );
+ * // matrix.matrix[i][j] = count of expected[i] predicted as actual[j]
+ * ```
+ */
+declare function buildConfusionMatrix(actual: unknown[], expected: unknown[]): ConfusionMatrix;
+/**
+ * Formats a confusion matrix as a string table
+ */
+declare function formatConfusionMatrix(cm: ConfusionMatrix): string;
+/**
+ * Classification metrics computation
+ */
+/**
+ * Computes all classification metrics from actual and expected values
+ */
+declare function computeClassificationMetrics(actual: unknown[], expected: unknown[]): ClassificationMetrics;
+/**
+ * Computes precision for a specific class
+ */
+declare function computePrecision(actual: unknown[], expected: unknown[], targetClass: string): number;
+/**
+ * Computes recall for a specific class
+ */
+declare function computeRecall(actual: unknown[], expected: unknown[], targetClass: string): number;
+/**
+ * Computes F1 score for a specific class
+ */
+declare function computeF1(actual: unknown[], expected: unknown[], targetClass: string): number;
+/**
+ * Computes overall accuracy
+ */
+declare function computeAccuracy(actual: unknown[], expected: unknown[]): number;
+/**
+ * JSON Reporter - deterministic JSON output
+ */
+/**
+ * JSON Reporter for machine-readable output
+ */
+declare class JsonReporter {
+    /**
+     * Formats a report as deterministic JSON
+     */
+    format(report: EvalReport): string;
+    /**
+     * Writes report to a file
+     */
+    writeToFile(report: EvalReport, path: string): void;
+    /**
+     * Converts report to a JSON-serializable format
+     */
+    private toSerializable;
+}
+/**
+ * Parses a JSON report back into an EvalReport
+ */
+declare function parseReport(json: string): EvalReport;
+/**
+ * Console Reporter - human-readable output
+ */
+/**
+ * Console reporter for human-readable output
+ */
+declare class ConsoleReporter {
+    private useColors;
+    constructor(useColors?: boolean);
+    /**
+     * Prints the run header
+     */
+    printHeader(fileCount: number): void;
+    /**
+     * Prints the full report
+     */
+    printReport(report: EvalReport): void;
+    /**
+     * Prints a suite's results
+     */
+    private printSuite;
+    /**
+     * Prints a single test result
+     */
+    private printTest;
+    /**
+     * Prints field metrics summary
+     */
+    private printFieldMetrics;
+    /**
+     * Prints the summary
+     */
+    private printSummary;
+    /**
+     * Prints a confusion matrix
+     */
+    printConfusionMatrix(fm: FieldMetricResult): void;
+    /**
+     * Formats a percentage
+     */
+    private pct;
+    /**
+     * Formats duration
+     */
+    private formatDuration;
+    /**
+     * Gets status symbol
+     */
+    private getStatusSymbol;
+    /**
+     * Gets status color
+     */
+    private getStatusColor;
+    /**
+     * Applies color if enabled
+     */
+    private color;
+    /**
+     * Logs a line
+     */
+    private log;
+}
+/**
+ * Options for file discovery
+ */
+interface DiscoveryOptions {
+    /** Patterns to match (default: *.eval.{js,ts,mjs}) */
+    patterns?: string[];
+    /** Patterns to ignore */
+    ignore?: string[];
+    /** Base directory to search from */
+    cwd?: string;
+    /** Filter pattern for test names */
+    filter?: string;
+}
+/**
+ * Discovers eval files matching the patterns
+ *
+ * @param options - Discovery options
+ * @returns Array of absolute file paths
+ */
+declare function discoverEvalFiles(options?: DiscoveryOptions): Promise<string[]>;
+/**
+ * Test executor - runs discovered eval files
+ */
+/**
+ * Options for test execution
+ */
+interface ExecutorOptions {
+    /** Stop on first failure */
+    bail?: boolean;
+    /** Test timeout in ms */
+    timeout?: number;
+    /** Filter pattern for test names */
+    filter?: string;
+}
+/**
+ * Executes all eval files and returns results
+ */
+declare function executeEvalFiles(files: string[], options?: ExecutorOptions): Promise<EvalReport>;
+/**
+ * Determines exit code from report
+ */
+declare function getExitCode(report: EvalReport): number;
+/**
+ * Custom error classes for EvalSense
+ */
+declare class EvalSenseError extends Error {
+    constructor(message: string);
+}
+declare class AssertionError extends EvalSenseError {
+    readonly expected: unknown;
+    readonly actual: unknown;
+    readonly field?: string;
+    constructor(message: string, expected?: unknown, actual?: unknown, field?: string);
+}
+declare class DatasetError extends EvalSenseError {
+    readonly source?: string;
+    constructor(message: string, source?: string);
+}
+declare class IntegrityError extends EvalSenseError {
+    readonly missingIds?: string[];
+    readonly duplicateIds?: string[];
+    constructor(message: string, missingIds?: string[], duplicateIds?: string[]);
+}
+declare class ConfigurationError extends EvalSenseError {
+    constructor(message: string);
+}
+declare class TestExecutionError extends EvalSenseError {
+    readonly testName: string;
+    readonly originalError?: Error;
+    constructor(message: string, testName: string, originalError?: Error);
+}
+export { AlignedRecord, AssertionError, AssertionResult, ClassificationMetrics, ConfigurationError, ConfusionMatrix, ConsoleReporter, Dataset, DatasetError, EvalReport, EvalSenseError, FieldMetricResult, IntegrityError, IntegrityResult, JsonReporter, Prediction, TestExecutionError, TestFn, afterAll, afterEach, alignByKey, beforeAll, beforeEach, buildConfusionMatrix, checkIntegrity, computeAccuracy, computeClassificationMetrics, computeF1, computePrecision, computeRecall, createDataset, describe, discoverEvalFiles, evalTest, executeEvalFiles, expectStats, extractFieldValues, filterComplete, formatConfusionMatrix, getExitCode, it, loadDataset, parseReport, runModel, runModelParallel, test, validatePredictions };