npm - evalsense - Versions diffs - 0.2.1 → 0.3.1 - Mend

evalsense 0.2.1 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36)

package/README.md +99 -82
package/dist/{chunk-HDJID3GC.cjs → chunk-BE7CB3AM.cjs} +39 -28
package/dist/chunk-BE7CB3AM.cjs.map +1 -0
package/dist/chunk-DGUM43GV.js +10 -0
package/dist/chunk-DGUM43GV.js.map +1 -0
package/dist/chunk-JEQ2X3Z6.cjs +12 -0
package/dist/chunk-JEQ2X3Z6.cjs.map +1 -0
package/dist/{chunk-5P7LNNO6.js → chunk-K6QPJ2NO.js} +39 -28
package/dist/chunk-K6QPJ2NO.js.map +1 -0
package/dist/{chunk-Y23VHTD3.cjs → chunk-RZFLCWTW.cjs} +2 -2
package/dist/chunk-RZFLCWTW.cjs.map +1 -0
package/dist/{chunk-BRPM6AB6.js → chunk-Z3U6AUWX.js} +2 -2
package/dist/chunk-Z3U6AUWX.js.map +1 -0
package/dist/cli.cjs +39 -36
package/dist/cli.cjs.map +1 -1
package/dist/cli.js +37 -34
package/dist/cli.js.map +1 -1
package/dist/index.cjs +320 -104
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +93 -7
package/dist/index.d.ts +93 -7
package/dist/index.js +242 -26
package/dist/index.js.map +1 -1
package/dist/metrics/index.cjs +257 -17
package/dist/metrics/index.cjs.map +1 -1
package/dist/metrics/index.d.cts +252 -1
package/dist/metrics/index.d.ts +252 -1
package/dist/metrics/index.js +240 -2
package/dist/metrics/index.js.map +1 -1
package/dist/metrics/opinionated/index.cjs +6 -5
package/dist/metrics/opinionated/index.js +2 -1
package/package.json +4 -3
package/dist/chunk-5P7LNNO6.js.map +0 -1
package/dist/chunk-BRPM6AB6.js.map +0 -1
package/dist/chunk-HDJID3GC.cjs.map +0 -1
package/dist/chunk-Y23VHTD3.cjs.map +0 -1

package/README.md CHANGED

@@ -3,10 +3,12 @@
 > JS-native LLM evaluation framework with Jest-like API and statistical assertions
 [![npm version](https://img.shields.io/npm/v/evalsense.svg)](https://www.npmjs.com/package/evalsense)
-[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
+[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
 **evalsense** brings classical ML-style statistical evaluation to LLM systems in JavaScript. Instead of evaluating individual test cases, evalsense evaluates entire datasets and computes confusion matrices, precision/recall, F1 scores, and other statistical metrics.
+> **New in v0.3.0:** Regression assertions (MAE, RMSE, R²) and flexible ID matching for custom identifier fields! [See migration guide](./docs/migration-v0.3.0.md).
+> **New in v0.2.x:** Built-in adapters for OpenAI, Anthropic, and OpenRouter - no boilerplate needed!
 > **New in v0.2.0:** LLM-powered metrics for hallucination, relevance, faithfulness, and toxicity detection. [See migration guide](./docs/migration-v0.2.md).
 ## Why evalsense?
@@ -57,7 +59,7 @@ function classifySentiment(record) {
   return {
     id: record.id,
-    sentiment: hasPositive && !hasNegative ? "positive" : "negative"
+    sentiment: hasPositive && !hasNegative ? "positive" : "negative",
   };
 }
@@ -109,14 +111,14 @@ describe("Spam classifier", () => {
     const result = await runModel(dataset, (record) => ({
       id: record.id,
-      isSpam: classifyEmail(record.text)
+      isSpam: classifyEmail(record.text),
     }));
     expectStats(result)
       .field("isSpam")
       .toHaveAccuracyAbove(0.9)
-      .toHavePrecisionAbove(true, 0.85)  // Precision for spam=true
-      .toHaveRecallAbove(true, 0.85)     // Recall for spam=true
+      .toHavePrecisionAbove(true, 0.85) // Precision for spam=true
+      .toHaveRecallAbove(true, 0.85) // Recall for spam=true
       .toHaveConfusionMatrix();
   });
 });
@@ -134,13 +136,13 @@ describe("Hallucination detector", () => {
     // Your model returns a continuous score
     const result = await runModel(dataset, (record) => ({
       id: record.id,
-      hallucinated: computeHallucinationScore(record.output)  // 0.0 to 1.0
+      hallucinated: computeHallucinationScore(record.output), // 0.0 to 1.0
     }));
     // Binarize the score at threshold 0.3
     expectStats(result)
       .field("hallucinated")
-      .binarize(0.3)  // >= 0.3 means hallucinated
+      .binarize(0.3) // >= 0.3 means hallucinated
       .toHaveRecallAbove(true, 0.7)
       .toHavePrecisionAbove(true, 0.6)
       .toHaveConfusionMatrix();
@@ -159,7 +161,7 @@ describe("Intent classifier", () => {
     const result = await runModel(dataset, (record) => ({
       id: record.id,
-      intent: classifyIntent(record.query)
+      intent: classifyIntent(record.query),
     }));
     expectStats(result)
@@ -191,12 +193,10 @@ describe("LLM classifier", () => {
         const response = await callLLM(record.text);
         return { id: record.id, category: response.category };
       },
-      5  // concurrency limit
+      5 // concurrency limit
     );
-    expectStats(result)
-      .field("category")
-      .toHaveAccuracyAbove(0.9);
+    expectStats(result).field("category").toHaveAccuracyAbove(0.9);
   });
 });
 ```
@@ -283,6 +283,7 @@ npx evalsense list tests/
 ### Core API
 #### `describe(name, fn)`
 Groups related evaluation tests (like Jest's describe).
 ```javascript
@@ -292,6 +293,7 @@ describe("My model", () => {
 ```
 #### `evalTest(name, fn)` / `test(name, fn)` / `it(name, fn)`
 Defines an evaluation test.
 ```javascript
@@ -303,6 +305,7 @@ evalTest("should have 90% accuracy", async () => {
 ### Dataset Functions
 #### `loadDataset(path)`
 Loads a dataset from a JSON file. Records must have an `id` or `_id` field.
 ```javascript
@@ -310,44 +313,45 @@ const dataset = loadDataset("./data.json");
 ```
 #### `runModel(dataset, modelFn)`
 Runs a model function on each record sequentially.
 ```javascript
 const result = await runModel(dataset, (record) => ({
   id: record.id,
-  prediction: classify(record.text)
+  prediction: classify(record.text),
 }));
 ```
 #### `runModelParallel(dataset, modelFn, concurrency)`
 Runs a model function with parallel execution.
 ```javascript
-const result = await runModelParallel(dataset, modelFn, 10);  // concurrency=10
+const result = await runModelParallel(dataset, modelFn, 10); // concurrency=10
 ```
 ### Assertions
 #### `expectStats(result)`
 Creates a statistical assertion chain from model results.
 ```javascript
-expectStats(result)
-  .field("prediction")
-  .toHaveAccuracyAbove(0.8);
+expectStats(result).field("prediction").toHaveAccuracyAbove(0.8);
 ```
 #### `expectStats(predictions, groundTruth)`
 Two-argument form for judge validation. Aligns predictions with ground truth by `id` field.
 ```javascript
 // Validate judge outputs against human labels
-expectStats(judgeOutputs, humanLabels)
-  .field("label")
-  .toHaveAccuracyAbove(0.85);
+expectStats(judgeOutputs, humanLabels).field("label").toHaveAccuracyAbove(0.85);
 ```
 **When to use:**
 - Validating LLM judges against human labels
 - Evaluating metric quality
 - Testing automated detection systems
@@ -355,19 +359,21 @@ expectStats(judgeOutputs, humanLabels)
 ### Field Selection
 #### `.field(fieldName)`
 Selects a field for evaluation.
 ```javascript
-expectStats(result).field("sentiment")
+expectStats(result).field("sentiment");
 ```
 #### `.binarize(threshold)`
 Converts continuous scores to binary (>=threshold is true).
 ```javascript
 expectStats(result)
   .field("score")
-  .binarize(0.5)  // score >= 0.5 is true
+  .binarize(0.5) // score >= 0.5 is true
   .toHaveAccuracyAbove(0.8);
 ```
@@ -403,23 +409,20 @@ Distribution assertions validate output distributions **without requiring ground
 ```javascript
 // Assert that at least 80% of confidence scores are above 0.7
-expectStats(predictions)
-  .field("confidence")
-  .toHavePercentageAbove(0.7, 0.8);
+expectStats(predictions).field("confidence").toHavePercentageAbove(0.7, 0.8);
 // Assert that at least 90% of toxicity scores are below 0.3
-expectStats(predictions)
-  .field("toxicity")
-  .toHavePercentageBelow(0.3, 0.9);
+expectStats(predictions).field("toxicity").toHavePercentageBelow(0.3, 0.9);
 // Chain multiple distribution assertions
 expectStats(predictions)
   .field("score")
-  .toHavePercentageAbove(0.5, 0.6)  // At least 60% above 0.5
+  .toHavePercentageAbove(0.5, 0.6) // At least 60% above 0.5
   .toHavePercentageBelow(0.9, 0.8); // At least 80% below 0.9
 ```
 **Use cases:**
 - Monitor confidence score distributions
 - Validate schema compliance rates
 - Check output range constraints
@@ -436,35 +439,35 @@ Validate judge outputs against human-labeled ground truth using the **two-argume
 const judgeOutputs = [
   { id: "1", hallucinated: true },
   { id: "2", hallucinated: false },
-  { id: "3", hallucinated: true }
+  { id: "3", hallucinated: true },
 ];
 // Human labels (ground truth)
 const humanLabels = [
   { id: "1", hallucinated: true },
   { id: "2", hallucinated: false },
-  { id: "3", hallucinated: false }
+  { id: "3", hallucinated: false },
 ];
 // Validate judge performance
 expectStats(judgeOutputs, humanLabels)
   .field("hallucinated")
-  .toHaveRecallAbove(true, 0.9)       // Don't miss hallucinations
-  .toHavePrecisionAbove(true, 0.7)    // Some false positives OK
+  .toHaveRecallAbove(true, 0.9) // Don't miss hallucinations
+  .toHavePrecisionAbove(true, 0.7) // Some false positives OK
   .toHaveConfusionMatrix();
 ```
 **Use cases:**
 - Evaluate LLM-as-judge accuracy
 - Validate heuristic metrics against human labels
 - Test automated detection systems (refusal, policy compliance)
 - Calibrate metric thresholds
 **Two-argument expectStats:**
 ```javascript
-expectStats(actual, expected)
-  .field("fieldName")
-  .toHaveAccuracyAbove(0.8);
+expectStats(actual, expected).field("fieldName").toHaveAccuracyAbove(0.8);
 ```
 The first argument is your predictions (judge outputs), the second is ground truth (human labels). Both must have matching `id` fields for alignment.
@@ -493,6 +496,7 @@ Datasets must be JSON arrays where each record has an `id` or `_id` field:
 ```
 **Requirements:**
 - Each record MUST have `id` or `_id` for alignment
 - Ground truth fields (e.g., `label`, `sentiment`, `category`) are compared against model outputs
 - Model functions must return predictions with matching `id`
@@ -522,6 +526,7 @@ project/
 ```
 Run with:
 ```bash
 npx evalsense run tests/
 ```
@@ -551,26 +556,25 @@ evalsense includes LLM-powered metrics for hallucination detection, relevance as
 ### Quick Setup
 ```javascript
-import { setLLMClient } from "evalsense/metrics";
+import { setLLMClient, createOpenAIAdapter } from "evalsense/metrics";
 import { hallucination, relevance, faithfulness, toxicity } from "evalsense/metrics/opinionated";
 // 1. Configure your LLM client (one-time setup)
-setLLMClient({
-  async complete(prompt) {
-    // Call your LLM API (OpenAI, Anthropic, local model, etc.)
-    const response = await yourLLM.generate(prompt);
-    return response.text;
-  }
-});
+setLLMClient(
+  createOpenAIAdapter(process.env.OPENAI_API_KEY, {
+    model: "gpt-4-turbo-preview",
+    temperature: 0,
+  })
+);
 // 2. Use metrics in evaluations
 const results = await hallucination({
   outputs: [{ id: "1", output: "Paris has 50 million people." }],
-  context: ["Paris has approximately 2.1 million residents."]
+  context: ["Paris has approximately 2.1 million residents."],
 });
-console.log(results[0].score);      // 0.9 (high hallucination)
-console.log(results[0].reasoning);  // "Output claims 50M, context says 2.1M"
+console.log(results[0].score); // 0.9 (high hallucination)
+console.log(results[0].reasoning); // "Output claims 50M, context says 2.1M"
 ```
 ### Available Metrics
@@ -589,62 +593,74 @@ Choose between accuracy and cost:
 await hallucination({
   outputs,
   context,
-  evaluationMode: "per-row"  // default
+  evaluationMode: "per-row", // default
 });
 // Batch: Lower cost, single API call
 await hallucination({
   outputs,
   context,
-  evaluationMode: "batch"
+  evaluationMode: "batch",
 });
 ```
-### Provider Examples
+### Built-in Provider Adapters
+evalsense includes ready-to-use adapters for popular LLM providers:
+**OpenAI (GPT-4, GPT-3.5)**
-**OpenAI:**
 ```javascript
-import OpenAI from "openai";
+import { createOpenAIAdapter } from "evalsense/metrics";
-const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });
-setLLMClient({
-  async complete(prompt) {
-    const response = await openai.chat.completions.create({
-      model: "gpt-4-turbo-preview",
-      messages: [{ role: "user", content: prompt }],
-    });
-    return response.choices[0]?.message?.content ?? "";
-  }
-});
+// npm install openai
+setLLMClient(
+  createOpenAIAdapter(process.env.OPENAI_API_KEY, {
+    model: "gpt-4-turbo-preview", // or "gpt-3.5-turbo" for lower cost
+    temperature: 0,
+    maxTokens: 4096,
+  })
+);
 ```
-**Anthropic:**
+**Anthropic (Claude)**
 ```javascript
-import Anthropic from "@anthropic-ai/sdk";
+import { createAnthropicAdapter } from "evalsense/metrics";
-const anthropic = new Anthropic({ apiKey: process.env.ANTHROPIC_API_KEY });
-setLLMClient({
-  async complete(prompt) {
-    const message = await anthropic.messages.create({
-      model: "claude-3-5-sonnet-20241022",
-      max_tokens: 4096,
-      messages: [{ role: "user", content: prompt }],
-    });
-    return message.content[0].text;
-  }
-});
+// npm install @anthropic-ai/sdk
+setLLMClient(
+  createAnthropicAdapter(process.env.ANTHROPIC_API_KEY, {
+    model: "claude-3-5-sonnet-20241022", // or "claude-3-haiku-20240307" for speed
+    maxTokens: 4096,
+  })
+);
+```
+**OpenRouter (100+ models from one API)**
+```javascript
+import { createOpenRouterAdapter } from "evalsense/metrics";
+// No SDK needed - uses fetch
+setLLMClient(
+  createOpenRouterAdapter(process.env.OPENROUTER_API_KEY, {
+    model: "anthropic/claude-3.5-sonnet", // or "openai/gpt-3.5-turbo", etc.
+    temperature: 0,
+    appName: "my-eval-system",
+  })
+);
 ```
-**Local (Ollama):**
+**Custom Adapter (for any provider)**
 ```javascript
 setLLMClient({
   async complete(prompt) {
-    const response = await fetch("http://localhost:11434/api/generate", {
-      method: "POST",
-      body: JSON.stringify({ model: "llama2", prompt }),
-    });
-    return (await response.json()).response;
-  }
+    // Implement for your LLM provider
+    const response = await yourLLM.generate(prompt);
+    return response.text;
+  },
 });
 ```
@@ -660,6 +676,7 @@ setLLMClient({
 evalsense is built on the principle that **metrics are predictions, not facts**.
 Instead of treating LLM-as-judge metrics (relevance, hallucination, etc.) as ground truth, evalsense:
 - Treats them as **weak labels** from a model
 - Validates them statistically against human references when available
 - Computes confusion matrices to reveal bias and systematic errors

package/dist/{chunk-HDJID3GC.cjs → chunk-BE7CB3AM.cjs} RENAMED

@@ -146,10 +146,7 @@ function getSupport(cm, label) {
 }
 function formatConfusionMatrix(cm) {
   const maxLabelLen = Math.max(...cm.labels.map((l) => l.length), 8);
-  const colWidth = Math.max(
-    ...cm.matrix.flat().map((n) => String(n).length),
-    maxLabelLen
-  );
+  const colWidth = Math.max(...cm.matrix.flat().map((n) => String(n).length), maxLabelLen);
   const header = " ".repeat(maxLabelLen + 2) + cm.labels.map((l) => l.padStart(colWidth)).join(" ");
   const rows = cm.labels.map((label, i) => {
     const rowData = cm.matrix[i].map((n) => String(n).padStart(colWidth)).join(" ");
@@ -261,7 +258,7 @@ var ConsoleReporter = class {
    */
   printHeader(fileCount) {
     this.log("");
-    this.log(this.color("bold", `EvalSense v0.1.0`));
+    this.log(this.color("bold", `EvalSense v0.3.1`));
     this.log(this.color("dim", `Running ${fileCount} eval file(s)...`));
     this.log("");
   }
@@ -296,14 +293,23 @@ var ConsoleReporter = class {
     for (const fm of test.fieldMetrics) {
       this.printFieldMetrics(fm);
     }
-    if (test.error && test.status === "failed") {
-      this.log(this.color("red", `      ${test.error.message}`));
-    } else if (test.error && test.status === "error") {
-      this.log(this.color("red", `      Error: ${test.error.message}`));
+    for (const fm of test.fieldMetrics) {
+      if (fm.metrics.confusionMatrix && Object.keys(fm.metrics.confusionMatrix).length > 0) {
+        this.printConfusionMatrix(fm);
+      }
+    }
+    if (test.error) {
+      const prefix = test.status === "error" ? "Error" : "Assertion Failed";
+      this.log(this.color("red", `      ${prefix}: ${test.error.message}`));
+      this.log("");
     }
     for (const assertion of test.assertions) {
       if (!assertion.passed) {
-        this.log(this.color("red", `      ${assertion.message}`));
+        this.log(this.color("red", `      \u2717 ${assertion.message}`));
+        if (assertion.expected !== void 0 && assertion.actual !== void 0) {
+          this.log(this.color("dim", `        Expected: ${this.formatValue(assertion.expected)}`));
+          this.log(this.color("dim", `        Actual:   ${this.formatValue(assertion.actual)}`));
+        }
       }
     }
   }
@@ -419,6 +425,24 @@ var ConsoleReporter = class {
     }
     return `${colors[colorName]}${text}${colors.reset}`;
   }
+  /**
+   * Formats a value for display
+   */
+  formatValue(value) {
+    if (typeof value === "number") {
+      if (value >= 0 && value <= 1) {
+        return `${(value * 100).toFixed(1)}%`;
+      }
+      return value.toFixed(4);
+    }
+    if (typeof value === "string") {
+      return `"${value}"`;
+    }
+    if (Array.isArray(value)) {
+      return `[${value.join(", ")}]`;
+    }
+    return String(value);
+  }
   /**
    * Logs a line
    */
@@ -426,23 +450,10 @@ var ConsoleReporter = class {
     console.log(message);
   }
 };
-var DEFAULT_PATTERNS = [
-  "**/*.eval.js",
-  "**/*.eval.ts",
-  "**/*.eval.mjs"
-];
-var DEFAULT_IGNORE = [
-  "**/node_modules/**",
-  "**/dist/**",
-  "**/build/**",
-  "**/.git/**"
-];
+var DEFAULT_PATTERNS = ["**/*.eval.js", "**/*.eval.ts", "**/*.eval.mjs"];
+var DEFAULT_IGNORE = ["**/node_modules/**", "**/dist/**", "**/build/**", "**/.git/**"];
 async function discoverEvalFiles(options = {}) {
-  const {
-    patterns = DEFAULT_PATTERNS,
-    ignore = DEFAULT_IGNORE,
-    cwd = process.cwd()
-  } = options;
+  const { patterns = DEFAULT_PATTERNS, ignore = DEFAULT_IGNORE, cwd = process.cwd() } = options;
   const files = [];
   for (const pattern of patterns) {
     const matches = await glob.glob(pattern, {
@@ -775,5 +786,5 @@ exports.parseReport = parseReport;
 exports.recordAssertion = recordAssertion;
 exports.recordFieldMetrics = recordFieldMetrics;
 exports.setCurrentSuite = setCurrentSuite;
-//# sourceMappingURL=chunk-HDJID3GC.cjs.map
-//# sourceMappingURL=chunk-HDJID3GC.cjs.map
+//# sourceMappingURL=chunk-BE7CB3AM.cjs.map
+//# sourceMappingURL=chunk-BE7CB3AM.cjs.map