evalsense 0.2.1 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -3,10 +3,12 @@
3
3
  > JS-native LLM evaluation framework with Jest-like API and statistical assertions
4
4
 
5
5
  [![npm version](https://img.shields.io/npm/v/evalsense.svg)](https://www.npmjs.com/package/evalsense)
6
- [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
6
+ [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
7
7
 
8
8
  **evalsense** brings classical ML-style statistical evaluation to LLM systems in JavaScript. Instead of evaluating individual test cases, evalsense evaluates entire datasets and computes confusion matrices, precision/recall, F1 scores, and other statistical metrics.
9
9
 
10
+ > **New in v0.3.0:** Regression assertions (MAE, RMSE, R²) and flexible ID matching for custom identifier fields! [See migration guide](./docs/migration-v0.3.0.md).
11
+ > **New in v0.2.x:** Built-in adapters for OpenAI, Anthropic, and OpenRouter - no boilerplate needed!
10
12
  > **New in v0.2.0:** LLM-powered metrics for hallucination, relevance, faithfulness, and toxicity detection. [See migration guide](./docs/migration-v0.2.md).
11
13
 
12
14
  ## Why evalsense?
@@ -57,7 +59,7 @@ function classifySentiment(record) {
57
59
 
58
60
  return {
59
61
  id: record.id,
60
- sentiment: hasPositive && !hasNegative ? "positive" : "negative"
62
+ sentiment: hasPositive && !hasNegative ? "positive" : "negative",
61
63
  };
62
64
  }
63
65
 
@@ -109,14 +111,14 @@ describe("Spam classifier", () => {
109
111
 
110
112
  const result = await runModel(dataset, (record) => ({
111
113
  id: record.id,
112
- isSpam: classifyEmail(record.text)
114
+ isSpam: classifyEmail(record.text),
113
115
  }));
114
116
 
115
117
  expectStats(result)
116
118
  .field("isSpam")
117
119
  .toHaveAccuracyAbove(0.9)
118
- .toHavePrecisionAbove(true, 0.85) // Precision for spam=true
119
- .toHaveRecallAbove(true, 0.85) // Recall for spam=true
120
+ .toHavePrecisionAbove(true, 0.85) // Precision for spam=true
121
+ .toHaveRecallAbove(true, 0.85) // Recall for spam=true
120
122
  .toHaveConfusionMatrix();
121
123
  });
122
124
  });
@@ -134,13 +136,13 @@ describe("Hallucination detector", () => {
134
136
  // Your model returns a continuous score
135
137
  const result = await runModel(dataset, (record) => ({
136
138
  id: record.id,
137
- hallucinated: computeHallucinationScore(record.output) // 0.0 to 1.0
139
+ hallucinated: computeHallucinationScore(record.output), // 0.0 to 1.0
138
140
  }));
139
141
 
140
142
  // Binarize the score at threshold 0.3
141
143
  expectStats(result)
142
144
  .field("hallucinated")
143
- .binarize(0.3) // >= 0.3 means hallucinated
145
+ .binarize(0.3) // >= 0.3 means hallucinated
144
146
  .toHaveRecallAbove(true, 0.7)
145
147
  .toHavePrecisionAbove(true, 0.6)
146
148
  .toHaveConfusionMatrix();
@@ -159,7 +161,7 @@ describe("Intent classifier", () => {
159
161
 
160
162
  const result = await runModel(dataset, (record) => ({
161
163
  id: record.id,
162
- intent: classifyIntent(record.query)
164
+ intent: classifyIntent(record.query),
163
165
  }));
164
166
 
165
167
  expectStats(result)
@@ -191,12 +193,10 @@ describe("LLM classifier", () => {
191
193
  const response = await callLLM(record.text);
192
194
  return { id: record.id, category: response.category };
193
195
  },
194
- 5 // concurrency limit
196
+ 5 // concurrency limit
195
197
  );
196
198
 
197
- expectStats(result)
198
- .field("category")
199
- .toHaveAccuracyAbove(0.9);
199
+ expectStats(result).field("category").toHaveAccuracyAbove(0.9);
200
200
  });
201
201
  });
202
202
  ```
@@ -283,6 +283,7 @@ npx evalsense list tests/
283
283
  ### Core API
284
284
 
285
285
  #### `describe(name, fn)`
286
+
286
287
  Groups related evaluation tests (like Jest's describe).
287
288
 
288
289
  ```javascript
@@ -292,6 +293,7 @@ describe("My model", () => {
292
293
  ```
293
294
 
294
295
  #### `evalTest(name, fn)` / `test(name, fn)` / `it(name, fn)`
296
+
295
297
  Defines an evaluation test.
296
298
 
297
299
  ```javascript
@@ -303,6 +305,7 @@ evalTest("should have 90% accuracy", async () => {
303
305
  ### Dataset Functions
304
306
 
305
307
  #### `loadDataset(path)`
308
+
306
309
  Loads a dataset from a JSON file. Records must have an `id` or `_id` field.
307
310
 
308
311
  ```javascript
@@ -310,44 +313,45 @@ const dataset = loadDataset("./data.json");
310
313
  ```
311
314
 
312
315
  #### `runModel(dataset, modelFn)`
316
+
313
317
  Runs a model function on each record sequentially.
314
318
 
315
319
  ```javascript
316
320
  const result = await runModel(dataset, (record) => ({
317
321
  id: record.id,
318
- prediction: classify(record.text)
322
+ prediction: classify(record.text),
319
323
  }));
320
324
  ```
321
325
 
322
326
  #### `runModelParallel(dataset, modelFn, concurrency)`
327
+
323
328
  Runs a model function over the dataset in parallel, using `concurrency` as the limit on simultaneous calls.
324
329
 
325
330
  ```javascript
326
- const result = await runModelParallel(dataset, modelFn, 10); // concurrency=10
331
+ const result = await runModelParallel(dataset, modelFn, 10); // concurrency=10
327
332
  ```
328
333
 
329
334
  ### Assertions
330
335
 
331
336
  #### `expectStats(result)`
337
+
332
338
  Creates a statistical assertion chain from model results.
333
339
 
334
340
  ```javascript
335
- expectStats(result)
336
- .field("prediction")
337
- .toHaveAccuracyAbove(0.8);
341
+ expectStats(result).field("prediction").toHaveAccuracyAbove(0.8);
338
342
  ```
339
343
 
340
344
  #### `expectStats(predictions, groundTruth)`
345
+
341
346
  Two-argument form for judge validation. Aligns predictions with ground truth by their `id` field.
342
347
 
343
348
  ```javascript
344
349
  // Validate judge outputs against human labels
345
- expectStats(judgeOutputs, humanLabels)
346
- .field("label")
347
- .toHaveAccuracyAbove(0.85);
350
+ expectStats(judgeOutputs, humanLabels).field("label").toHaveAccuracyAbove(0.85);
348
351
  ```
349
352
 
350
353
  **When to use:**
354
+
351
355
  - Validating LLM judges against human labels
352
356
  - Evaluating metric quality
353
357
  - Testing automated detection systems
@@ -355,19 +359,21 @@ expectStats(judgeOutputs, humanLabels)
355
359
  ### Field Selection
356
360
 
357
361
  #### `.field(fieldName)`
362
+
358
363
  Selects a field for evaluation.
359
364
 
360
365
  ```javascript
361
- expectStats(result).field("sentiment")
366
+ expectStats(result).field("sentiment");
362
367
  ```
363
368
 
364
369
  #### `.binarize(threshold)`
370
+
365
371
  Converts continuous scores to binary values (a score `>= threshold` counts as `true`).
366
372
 
367
373
  ```javascript
368
374
  expectStats(result)
369
375
  .field("score")
370
- .binarize(0.5) // score >= 0.5 is true
376
+ .binarize(0.5) // score >= 0.5 is true
371
377
  .toHaveAccuracyAbove(0.8);
372
378
  ```
373
379
 
@@ -403,23 +409,20 @@ Distribution assertions validate output distributions **without requiring ground
403
409
 
404
410
  ```javascript
405
411
  // Assert that at least 80% of confidence scores are above 0.7
406
- expectStats(predictions)
407
- .field("confidence")
408
- .toHavePercentageAbove(0.7, 0.8);
412
+ expectStats(predictions).field("confidence").toHavePercentageAbove(0.7, 0.8);
409
413
 
410
414
  // Assert that at least 90% of toxicity scores are below 0.3
411
- expectStats(predictions)
412
- .field("toxicity")
413
- .toHavePercentageBelow(0.3, 0.9);
415
+ expectStats(predictions).field("toxicity").toHavePercentageBelow(0.3, 0.9);
414
416
 
415
417
  // Chain multiple distribution assertions
416
418
  expectStats(predictions)
417
419
  .field("score")
418
- .toHavePercentageAbove(0.5, 0.6) // At least 60% above 0.5
420
+ .toHavePercentageAbove(0.5, 0.6) // At least 60% above 0.5
419
421
  .toHavePercentageBelow(0.9, 0.8); // At least 80% below 0.9
420
422
  ```
421
423
 
422
424
  **Use cases:**
425
+
423
426
  - Monitor confidence score distributions
424
427
  - Validate schema compliance rates
425
428
  - Check output range constraints
@@ -436,35 +439,35 @@ Validate judge outputs against human-labeled ground truth using the **two-argume
436
439
  const judgeOutputs = [
437
440
  { id: "1", hallucinated: true },
438
441
  { id: "2", hallucinated: false },
439
- { id: "3", hallucinated: true }
442
+ { id: "3", hallucinated: true },
440
443
  ];
441
444
 
442
445
  // Human labels (ground truth)
443
446
  const humanLabels = [
444
447
  { id: "1", hallucinated: true },
445
448
  { id: "2", hallucinated: false },
446
- { id: "3", hallucinated: false }
449
+ { id: "3", hallucinated: false },
447
450
  ];
448
451
 
449
452
  // Validate judge performance
450
453
  expectStats(judgeOutputs, humanLabels)
451
454
  .field("hallucinated")
452
- .toHaveRecallAbove(true, 0.9) // Don't miss hallucinations
453
- .toHavePrecisionAbove(true, 0.7) // Some false positives OK
455
+ .toHaveRecallAbove(true, 0.9) // Don't miss hallucinations
456
+ .toHavePrecisionAbove(true, 0.7) // Some false positives OK
454
457
  .toHaveConfusionMatrix();
455
458
  ```
456
459
 
457
460
  **Use cases:**
461
+
458
462
  - Evaluate LLM-as-judge accuracy
459
463
  - Validate heuristic metrics against human labels
460
464
  - Test automated detection systems (refusal, policy compliance)
461
465
  - Calibrate metric thresholds
462
466
 
463
467
  **Two-argument expectStats:**
468
+
464
469
  ```javascript
465
- expectStats(actual, expected)
466
- .field("fieldName")
467
- .toHaveAccuracyAbove(0.8);
470
+ expectStats(actual, expected).field("fieldName").toHaveAccuracyAbove(0.8);
468
471
  ```
469
472
 
470
473
  The first argument is your predictions (judge outputs); the second is the ground truth (human labels). Records in both must have matching `id` fields so they can be aligned.
@@ -493,6 +496,7 @@ Datasets must be JSON arrays where each record has an `id` or `_id` field:
493
496
  ```
494
497
 
495
498
  **Requirements:**
499
+
496
500
  - Each record MUST have `id` or `_id` for alignment
497
501
  - Ground truth fields (e.g., `label`, `sentiment`, `category`) are compared against model outputs
498
502
  - Model functions must return predictions with matching `id`
@@ -522,6 +526,7 @@ project/
522
526
  ```
523
527
 
524
528
  Run with:
529
+
525
530
  ```bash
526
531
  npx evalsense run tests/
527
532
  ```
@@ -551,26 +556,25 @@ evalsense includes LLM-powered metrics for hallucination detection, relevance as
551
556
  ### Quick Setup
552
557
 
553
558
  ```javascript
554
- import { setLLMClient } from "evalsense/metrics";
559
+ import { setLLMClient, createOpenAIAdapter } from "evalsense/metrics";
555
560
  import { hallucination, relevance, faithfulness, toxicity } from "evalsense/metrics/opinionated";
556
561
 
557
562
  // 1. Configure your LLM client (one-time setup)
558
- setLLMClient({
559
- async complete(prompt) {
560
- // Call your LLM API (OpenAI, Anthropic, local model, etc.)
561
- const response = await yourLLM.generate(prompt);
562
- return response.text;
563
- }
564
- });
563
+ setLLMClient(
564
+ createOpenAIAdapter(process.env.OPENAI_API_KEY, {
565
+ model: "gpt-4-turbo-preview",
566
+ temperature: 0,
567
+ })
568
+ );
565
569
 
566
570
  // 2. Use metrics in evaluations
567
571
  const results = await hallucination({
568
572
  outputs: [{ id: "1", output: "Paris has 50 million people." }],
569
- context: ["Paris has approximately 2.1 million residents."]
573
+ context: ["Paris has approximately 2.1 million residents."],
570
574
  });
571
575
 
572
- console.log(results[0].score); // 0.9 (high hallucination)
573
- console.log(results[0].reasoning); // "Output claims 50M, context says 2.1M"
576
+ console.log(results[0].score); // 0.9 (high hallucination)
577
+ console.log(results[0].reasoning); // "Output claims 50M, context says 2.1M"
574
578
  ```
575
579
 
576
580
  ### Available Metrics
@@ -589,62 +593,74 @@ Choose between accuracy and cost:
589
593
  await hallucination({
590
594
  outputs,
591
595
  context,
592
- evaluationMode: "per-row" // default
596
+ evaluationMode: "per-row", // default
593
597
  });
594
598
 
595
599
  // Batch: Lower cost, single API call
596
600
  await hallucination({
597
601
  outputs,
598
602
  context,
599
- evaluationMode: "batch"
603
+ evaluationMode: "batch",
600
604
  });
601
605
  ```
602
606
 
603
- ### Provider Examples
607
+ ### Built-in Provider Adapters
608
+
609
+ evalsense includes ready-to-use adapters for popular LLM providers:
610
+
611
+ **OpenAI (GPT-4, GPT-3.5)**
604
612
 
605
- **OpenAI:**
606
613
  ```javascript
607
- import OpenAI from "openai";
614
+ import { createOpenAIAdapter } from "evalsense/metrics";
608
615
 
609
- const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });
610
- setLLMClient({
611
- async complete(prompt) {
612
- const response = await openai.chat.completions.create({
613
- model: "gpt-4-turbo-preview",
614
- messages: [{ role: "user", content: prompt }],
615
- });
616
- return response.choices[0]?.message?.content ?? "";
617
- }
618
- });
616
+ // npm install openai
617
+ setLLMClient(
618
+ createOpenAIAdapter(process.env.OPENAI_API_KEY, {
619
+ model: "gpt-4-turbo-preview", // or "gpt-3.5-turbo" for lower cost
620
+ temperature: 0,
621
+ maxTokens: 4096,
622
+ })
623
+ );
619
624
  ```
620
625
 
621
- **Anthropic:**
626
+ **Anthropic (Claude)**
627
+
622
628
  ```javascript
623
- import Anthropic from "@anthropic-ai/sdk";
629
+ import { createAnthropicAdapter } from "evalsense/metrics";
624
630
 
625
- const anthropic = new Anthropic({ apiKey: process.env.ANTHROPIC_API_KEY });
626
- setLLMClient({
627
- async complete(prompt) {
628
- const message = await anthropic.messages.create({
629
- model: "claude-3-5-sonnet-20241022",
630
- max_tokens: 4096,
631
- messages: [{ role: "user", content: prompt }],
632
- });
633
- return message.content[0].text;
634
- }
635
- });
631
+ // npm install @anthropic-ai/sdk
632
+ setLLMClient(
633
+ createAnthropicAdapter(process.env.ANTHROPIC_API_KEY, {
634
+ model: "claude-3-5-sonnet-20241022", // or "claude-3-haiku-20240307" for speed
635
+ maxTokens: 4096,
636
+ })
637
+ );
638
+ ```
639
+
640
+ **OpenRouter (100+ models from one API)**
641
+
642
+ ```javascript
643
+ import { createOpenRouterAdapter } from "evalsense/metrics";
644
+
645
+ // No SDK needed - uses fetch
646
+ setLLMClient(
647
+ createOpenRouterAdapter(process.env.OPENROUTER_API_KEY, {
648
+ model: "anthropic/claude-3.5-sonnet", // or "openai/gpt-3.5-turbo", etc.
649
+ temperature: 0,
650
+ appName: "my-eval-system",
651
+ })
652
+ );
636
653
  ```
637
654
 
638
- **Local (Ollama):**
655
+ **Custom Adapter (for any provider)**
656
+
639
657
  ```javascript
640
658
  setLLMClient({
641
659
  async complete(prompt) {
642
- const response = await fetch("http://localhost:11434/api/generate", {
643
- method: "POST",
644
- body: JSON.stringify({ model: "llama2", prompt }),
645
- });
646
- return (await response.json()).response;
647
- }
660
+ // Implement for your LLM provider
661
+ const response = await yourLLM.generate(prompt);
662
+ return response.text;
663
+ },
648
664
  });
649
665
  ```
650
666
 
@@ -660,6 +676,7 @@ setLLMClient({
660
676
  evalsense is built on the principle that **metrics are predictions, not facts**.
661
677
 
662
678
  Instead of treating LLM-as-judge metrics (relevance, hallucination, etc.) as ground truth, evalsense:
679
+
663
680
  - Treats them as **weak labels** from a model
664
681
  - Validates them statistically against human references when available
665
682
  - Computes confusion matrices to reveal bias and systematic errors
@@ -146,10 +146,7 @@ function getSupport(cm, label) {
146
146
  }
147
147
  function formatConfusionMatrix(cm) {
148
148
  const maxLabelLen = Math.max(...cm.labels.map((l) => l.length), 8);
149
- const colWidth = Math.max(
150
- ...cm.matrix.flat().map((n) => String(n).length),
151
- maxLabelLen
152
- );
149
+ const colWidth = Math.max(...cm.matrix.flat().map((n) => String(n).length), maxLabelLen);
153
150
  const header = " ".repeat(maxLabelLen + 2) + cm.labels.map((l) => l.padStart(colWidth)).join(" ");
154
151
  const rows = cm.labels.map((label, i) => {
155
152
  const rowData = cm.matrix[i].map((n) => String(n).padStart(colWidth)).join(" ");
@@ -261,7 +258,7 @@ var ConsoleReporter = class {
261
258
  */
262
259
  printHeader(fileCount) {
263
260
  this.log("");
264
- this.log(this.color("bold", `EvalSense v0.1.0`));
261
+ this.log(this.color("bold", `EvalSense v0.3.1`));
265
262
  this.log(this.color("dim", `Running ${fileCount} eval file(s)...`));
266
263
  this.log("");
267
264
  }
@@ -296,14 +293,23 @@ var ConsoleReporter = class {
296
293
  for (const fm of test.fieldMetrics) {
297
294
  this.printFieldMetrics(fm);
298
295
  }
299
- if (test.error && test.status === "failed") {
300
- this.log(this.color("red", ` ${test.error.message}`));
301
- } else if (test.error && test.status === "error") {
302
- this.log(this.color("red", ` Error: ${test.error.message}`));
296
+ for (const fm of test.fieldMetrics) {
297
+ if (fm.metrics.confusionMatrix && Object.keys(fm.metrics.confusionMatrix).length > 0) {
298
+ this.printConfusionMatrix(fm);
299
+ }
300
+ }
301
+ if (test.error) {
302
+ const prefix = test.status === "error" ? "Error" : "Assertion Failed";
303
+ this.log(this.color("red", ` ${prefix}: ${test.error.message}`));
304
+ this.log("");
303
305
  }
304
306
  for (const assertion of test.assertions) {
305
307
  if (!assertion.passed) {
306
- this.log(this.color("red", ` ${assertion.message}`));
308
+ this.log(this.color("red", ` \u2717 ${assertion.message}`));
309
+ if (assertion.expected !== void 0 && assertion.actual !== void 0) {
310
+ this.log(this.color("dim", ` Expected: ${this.formatValue(assertion.expected)}`));
311
+ this.log(this.color("dim", ` Actual: ${this.formatValue(assertion.actual)}`));
312
+ }
307
313
  }
308
314
  }
309
315
  }
@@ -419,6 +425,24 @@ var ConsoleReporter = class {
419
425
  }
420
426
  return `${colors[colorName]}${text}${colors.reset}`;
421
427
  }
428
+ /**
429
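+ * Formats a value for display: numbers in [0, 1] as a percentage, other numbers to 4 decimal places, strings in double quotes, arrays comma-joined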
430
+ */
431
+ formatValue(value) {
432
+ if (typeof value === "number") {
433
+ if (value >= 0 && value <= 1) {
434
+ return `${(value * 100).toFixed(1)}%`;
435
+ }
436
+ return value.toFixed(4);
437
+ }
438
+ if (typeof value === "string") {
439
+ return `"${value}"`;
440
+ }
441
+ if (Array.isArray(value)) {
442
+ return `[${value.join(", ")}]`;
443
+ }
444
+ return String(value);
445
+ }
422
446
  /**
423
447
  * Logs a line
424
448
  */
@@ -426,23 +450,10 @@ var ConsoleReporter = class {
426
450
  console.log(message);
427
451
  }
428
452
  };
429
- var DEFAULT_PATTERNS = [
430
- "**/*.eval.js",
431
- "**/*.eval.ts",
432
- "**/*.eval.mjs"
433
- ];
434
- var DEFAULT_IGNORE = [
435
- "**/node_modules/**",
436
- "**/dist/**",
437
- "**/build/**",
438
- "**/.git/**"
439
- ];
453
+ var DEFAULT_PATTERNS = ["**/*.eval.js", "**/*.eval.ts", "**/*.eval.mjs"];
454
+ var DEFAULT_IGNORE = ["**/node_modules/**", "**/dist/**", "**/build/**", "**/.git/**"];
440
455
  async function discoverEvalFiles(options = {}) {
441
- const {
442
- patterns = DEFAULT_PATTERNS,
443
- ignore = DEFAULT_IGNORE,
444
- cwd = process.cwd()
445
- } = options;
456
+ const { patterns = DEFAULT_PATTERNS, ignore = DEFAULT_IGNORE, cwd = process.cwd() } = options;
446
457
  const files = [];
447
458
  for (const pattern of patterns) {
448
459
  const matches = await glob.glob(pattern, {
@@ -775,5 +786,5 @@ exports.parseReport = parseReport;
775
786
  exports.recordAssertion = recordAssertion;
776
787
  exports.recordFieldMetrics = recordFieldMetrics;
777
788
  exports.setCurrentSuite = setCurrentSuite;
778
- //# sourceMappingURL=chunk-HDJID3GC.cjs.map
779
- //# sourceMappingURL=chunk-HDJID3GC.cjs.map
789
+ //# sourceMappingURL=chunk-BE7CB3AM.cjs.map
790
+ //# sourceMappingURL=chunk-BE7CB3AM.cjs.map