evalsense 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +678 -0
- package/bin/evalsense.js +3 -0
- package/dist/chunk-5P7LNNO6.js +747 -0
- package/dist/chunk-5P7LNNO6.js.map +1 -0
- package/dist/chunk-BRPM6AB6.js +925 -0
- package/dist/chunk-BRPM6AB6.js.map +1 -0
- package/dist/chunk-HDJID3GC.cjs +779 -0
- package/dist/chunk-HDJID3GC.cjs.map +1 -0
- package/dist/chunk-Y23VHTD3.cjs +942 -0
- package/dist/chunk-Y23VHTD3.cjs.map +1 -0
- package/dist/cli.cjs +65 -0
- package/dist/cli.cjs.map +1 -0
- package/dist/cli.d.cts +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +63 -0
- package/dist/cli.js.map +1 -0
- package/dist/index.cjs +1126 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +604 -0
- package/dist/index.d.ts +604 -0
- package/dist/index.js +1043 -0
- package/dist/index.js.map +1 -0
- package/dist/metrics/index.cjs +275 -0
- package/dist/metrics/index.cjs.map +1 -0
- package/dist/metrics/index.d.cts +299 -0
- package/dist/metrics/index.d.ts +299 -0
- package/dist/metrics/index.js +191 -0
- package/dist/metrics/index.js.map +1 -0
- package/dist/metrics/opinionated/index.cjs +24 -0
- package/dist/metrics/opinionated/index.cjs.map +1 -0
- package/dist/metrics/opinionated/index.d.cts +163 -0
- package/dist/metrics/opinionated/index.d.ts +163 -0
- package/dist/metrics/opinionated/index.js +3 -0
- package/dist/metrics/opinionated/index.js.map +1 -0
- package/dist/types-C71p0wzM.d.cts +265 -0
- package/dist/types-C71p0wzM.d.ts +265 -0
- package/package.json +91 -0
package/README.md
ADDED
@@ -0,0 +1,678 @@
# evalsense

> JS-native LLM evaluation framework with a Jest-like API and statistical assertions

[npm](https://www.npmjs.com/package/evalsense)
[MIT License](https://opensource.org/licenses/MIT)

**evalsense** brings classical ML-style statistical evaluation to LLM systems in JavaScript. Instead of evaluating individual test cases, evalsense evaluates entire datasets and computes confusion matrices, precision/recall, F1 scores, and other statistical metrics.

> **New in v0.2.0:** LLM-powered metrics for hallucination, relevance, faithfulness, and toxicity detection. [See migration guide](./docs/migration-v0.2.md).

## Why evalsense?

Most LLM evaluation tools stop at producing scores (accuracy, relevance, hallucination). evalsense goes further by:

- ✅ Computing **confusion matrices** to reveal systematic failure patterns
- ✅ Analyzing **false positives vs false negatives** across datasets
- ✅ Treating **metrics as predictions, not truth** (and validating them statistically)
- ✅ Providing a **Jest-like API** that fits naturally into JS/Node workflows
- ✅ Supporting **deterministic CI/CD** integration with specific exit codes

## Features

- 📊 **Dataset-level evaluation** - evaluate distributions, not single examples
- 🎯 **Statistical rigor** - confusion matrices, precision/recall, F1, regression metrics
- 🧪 **Jest-like API** - familiar `describe()` and test patterns
- 🤖 **LLM-powered metrics** - hallucination, relevance, faithfulness, toxicity with explainable reasoning
- ⚡ **Dual evaluation modes** - choose between accuracy (per-row) or cost efficiency (batch)
- 🔄 **CI-friendly** - deterministic execution, machine-readable reports
- 🚀 **JS-native** - first-class TypeScript support, works with any Node.js LLM library
- 🔌 **Composable** - evaluate outputs from your existing LLM code

## Installation

```bash
npm install --save-dev evalsense
```

Or with yarn:

```bash
yarn add -D evalsense
```

## Quick Start

Create a file named `sentiment.eval.js`:

```javascript
import { describe, evalTest, expectStats, loadDataset, runModel } from "evalsense";

// Your model function - can be any JS function
function classifySentiment(record) {
  const text = record.text.toLowerCase();
  const hasPositive = /love|amazing|great|fantastic|perfect/.test(text);
  const hasNegative = /terrible|worst|disappointed|waste/.test(text);

  return {
    id: record.id,
    sentiment: hasPositive && !hasNegative ? "positive" : "negative"
  };
}

describe("Sentiment classifier", () => {
  evalTest("accuracy above 80%", async () => {
    // 1. Load dataset with ground truth
    const dataset = loadDataset("./sentiment.json");

    // 2. Run your model on the dataset
    const result = await runModel(dataset, classifySentiment);

    // 3. Assert on statistical properties
    expectStats(result)
      .field("sentiment")
      .toHaveAccuracyAbove(0.8)
      .toHaveRecallAbove("positive", 0.7)
      .toHavePrecisionAbove("positive", 0.7)
      .toHaveConfusionMatrix();
  });
});
```

Create `sentiment.json`:

```json
[
  { "id": "1", "text": "I love this product!", "sentiment": "positive" },
  { "id": "2", "text": "Terrible experience.", "sentiment": "negative" },
  { "id": "3", "text": "Great quality!", "sentiment": "positive" }
]
```

Run the evaluation:

```bash
npx evalsense run sentiment.eval.js
```

## Usage

### Basic Classification Example

```javascript
import { describe, evalTest, expectStats, loadDataset, runModel } from "evalsense";

describe("Spam classifier", () => {
  evalTest("high precision and recall", async () => {
    const dataset = loadDataset("./emails.json");

    const result = await runModel(dataset, (record) => ({
      id: record.id,
      isSpam: classifyEmail(record.text)
    }));

    expectStats(result)
      .field("isSpam")
      .toHaveAccuracyAbove(0.9)
      .toHavePrecisionAbove(true, 0.85) // Precision for spam=true
      .toHaveRecallAbove(true, 0.85)    // Recall for spam=true
      .toHaveConfusionMatrix();
  });
});
```

### Continuous Scores with Binarization

```javascript
import { describe, evalTest, expectStats, loadDataset, runModel } from "evalsense";

describe("Hallucination detector", () => {
  evalTest("detect hallucinations with 70% recall", async () => {
    const dataset = loadDataset("./outputs.json");

    // Your model returns a continuous score
    const result = await runModel(dataset, (record) => ({
      id: record.id,
      hallucinated: computeHallucinationScore(record.output) // 0.0 to 1.0
    }));

    // Binarize the score at threshold 0.3
    expectStats(result)
      .field("hallucinated")
      .binarize(0.3) // >= 0.3 means hallucinated
      .toHaveRecallAbove(true, 0.7)
      .toHavePrecisionAbove(true, 0.6)
      .toHaveConfusionMatrix();
  });
});
```

### Multi-class Classification

```javascript
import { describe, evalTest, expectStats, loadDataset, runModel } from "evalsense";

describe("Intent classifier", () => {
  evalTest("balanced performance across intents", async () => {
    const dataset = loadDataset("./intents.json");

    const result = await runModel(dataset, (record) => ({
      id: record.id,
      intent: classifyIntent(record.query)
    }));

    expectStats(result)
      .field("intent")
      .toHaveAccuracyAbove(0.85)
      .toHaveRecallAbove("purchase", 0.8)
      .toHaveRecallAbove("support", 0.8)
      .toHaveRecallAbove("general", 0.7)
      .toHaveConfusionMatrix();
  });
});
```

### Parallel Model Execution

For LLM calls or slow operations, use parallel execution:

```javascript
import { describe, evalTest, expectStats, loadDataset, runModelParallel } from "evalsense";

describe("LLM classifier", () => {
  evalTest("classification accuracy", async () => {
    const dataset = loadDataset("./data.json");

    // Run with concurrency=5
    const result = await runModelParallel(
      dataset,
      async (record) => {
        const response = await callLLM(record.text);
        return { id: record.id, category: response.category };
      },
      5 // concurrency limit
    );

    expectStats(result)
      .field("category")
      .toHaveAccuracyAbove(0.9);
  });
});
```

### Test Lifecycle Hooks

```javascript
import { describe, evalTest, beforeAll, afterAll, beforeEach, afterEach } from "evalsense";

describe("Model evaluation", () => {
  let model;

  beforeAll(async () => {
    // Load model once before all tests
    model = await loadModel();
  });

  afterAll(async () => {
    // Cleanup after all tests
    await model.dispose();
  });

  beforeEach(() => {
    // Reset state before each test
    model.reset();
  });

  afterEach(() => {
    // Cleanup after each test
    console.log("Test completed");
  });

  evalTest("test 1", async () => {
    // ...
  });

  evalTest("test 2", async () => {
    // ...
  });
});
```

## CLI Usage

### Run Evaluations

```bash
# Run all eval files in current directory
npx evalsense run

# Run specific file or directory
npx evalsense run tests/eval/

# Filter tests by name
npx evalsense run --filter "accuracy"

# Output JSON report
npx evalsense run --output report.json

# Use different reporters
npx evalsense run --reporter console  # default
npx evalsense run --reporter json
npx evalsense run --reporter both

# Bail on first failure
npx evalsense run --bail

# Set timeout (in milliseconds)
npx evalsense run --timeout 60000
```

### List Eval Files

```bash
# List all discovered eval files
npx evalsense list

# List files in specific directory
npx evalsense list tests/
```

## API Reference

### Core API

#### `describe(name, fn)`
Groups related evaluation tests (like Jest's `describe`).

```javascript
describe("My model", () => {
  // eval tests go here
});
```

#### `evalTest(name, fn)` / `test(name, fn)` / `it(name, fn)`
Defines an evaluation test.

```javascript
evalTest("should have 90% accuracy", async () => {
  // test implementation
});
```

### Dataset Functions

#### `loadDataset(path)`
Loads a dataset from a JSON file. Records must have an `id` or `_id` field.

```javascript
const dataset = loadDataset("./data.json");
```

#### `runModel(dataset, modelFn)`
Runs a model function on each record sequentially.

```javascript
const result = await runModel(dataset, (record) => ({
  id: record.id,
  prediction: classify(record.text)
}));
```

#### `runModelParallel(dataset, modelFn, concurrency)`
Runs a model function with parallel execution.

```javascript
const result = await runModelParallel(dataset, modelFn, 10); // concurrency=10
```

### Assertions

#### `expectStats(result)`
Creates a statistical assertion chain from model results.

```javascript
expectStats(result)
  .field("prediction")
  .toHaveAccuracyAbove(0.8);
```

#### `expectStats(predictions, groundTruth)`
Two-argument form for judge validation. Aligns predictions with ground truth by the `id` field.

```javascript
// Validate judge outputs against human labels
expectStats(judgeOutputs, humanLabels)
  .field("label")
  .toHaveAccuracyAbove(0.85);
```

**When to use:**
- Validating LLM judges against human labels
- Evaluating metric quality
- Testing automated detection systems

### Field Selection

#### `.field(fieldName)`
Selects a field for evaluation.

```javascript
expectStats(result).field("sentiment")
```

#### `.binarize(threshold)`
Converts continuous scores to binary (values >= threshold are treated as true).

```javascript
expectStats(result)
  .field("score")
  .binarize(0.5) // score >= 0.5 is true
  .toHaveAccuracyAbove(0.8);
```

### Available Assertions

#### Classification Metrics

```javascript
// Accuracy
.toHaveAccuracyAbove(threshold)
.toHaveAccuracyBelow(threshold)
.toHaveAccuracyBetween(min, max)

// Precision (per class)
.toHavePrecisionAbove(className, threshold)
.toHavePrecisionBelow(className, threshold)

// Recall (per class)
.toHaveRecallAbove(className, threshold)
.toHaveRecallBelow(className, threshold)

// F1 Score
.toHaveF1Above(threshold)            // Overall F1
.toHaveF1Above(className, threshold) // Per-class F1

// Confusion Matrix
.toHaveConfusionMatrix() // Prints confusion matrix
```

#### Distribution Assertions (Pattern 1)

Distribution assertions validate output distributions **without requiring ground truth**. Use these to monitor that model outputs stay within expected ranges.

```javascript
// Assert that at least 80% of confidence scores are above 0.7
expectStats(predictions)
  .field("confidence")
  .toHavePercentageAbove(0.7, 0.8);

// Assert that at least 90% of toxicity scores are below 0.3
expectStats(predictions)
  .field("toxicity")
  .toHavePercentageBelow(0.3, 0.9);

// Chain multiple distribution assertions
expectStats(predictions)
  .field("score")
  .toHavePercentageAbove(0.5, 0.6)  // At least 60% above 0.5
  .toHavePercentageBelow(0.9, 0.8); // At least 80% below 0.9
```

**Use cases:**
- Monitor confidence score distributions
- Validate schema compliance rates
- Check output range constraints
- Ensure score distributions remain stable over time

See [Distribution Assertions Example](./examples/distribution-assertions.eval.js) for complete examples.

### Judge Validation (Pattern 1b)

Validate judge outputs against human-labeled ground truth using the **two-argument expectStats API**:

```javascript
// Judge outputs (predictions from your judge/metric)
const judgeOutputs = [
  { id: "1", hallucinated: true },
  { id: "2", hallucinated: false },
  { id: "3", hallucinated: true }
];

// Human labels (ground truth)
const humanLabels = [
  { id: "1", hallucinated: true },
  { id: "2", hallucinated: false },
  { id: "3", hallucinated: false }
];

// Validate judge performance
expectStats(judgeOutputs, humanLabels)
  .field("hallucinated")
  .toHaveRecallAbove(true, 0.9)    // Don't miss hallucinations
  .toHavePrecisionAbove(true, 0.7) // Some false positives OK
  .toHaveConfusionMatrix();
```

**Use cases:**
- Evaluate LLM-as-judge accuracy
- Validate heuristic metrics against human labels
- Test automated detection systems (refusal, policy compliance)
- Calibrate metric thresholds

**Two-argument expectStats:**
```javascript
expectStats(actual, expected)
  .field("fieldName")
  .toHaveAccuracyAbove(0.8);
```

The first argument is your predictions (judge outputs), the second is ground truth (human labels). Both must have matching `id` fields for alignment.

See [Judge Validation Example](./examples/judge-validation.eval.js) for complete examples.

For comprehensive guidance on evaluating agent systems, see [Agent Judges Design Patterns](./docs/agent-judges.md).

## Dataset Format

Datasets must be JSON arrays where each record has an `id` or `_id` field:

```json
[
  {
    "id": "1",
    "text": "input text",
    "label": "expected_output"
  },
  {
    "id": "2",
    "text": "another input",
    "label": "another_output"
  }
]
```

**Requirements:**
- Each record MUST have `id` or `_id` for alignment
- Ground truth fields (e.g., `label`, `sentiment`, `category`) are compared against model outputs
- Model functions must return predictions with a matching `id` (see the sketch below)

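As a minimal sketch of that `id`-based pairing (the `predictLabel` helper is hypothetical; only `runModel` and the record shape come from the sections above):

```javascript
// Dataset record in data.json: { "id": "1", "text": "input text", "label": "expected_output" }
// The model function echoes the record's id so evalsense can pair its
// prediction with the ground-truth "label" field of that same record.
const result = await runModel(dataset, (record) => ({
  id: record.id,                    // must match the dataset record's id
  label: predictLabel(record.text)  // hypothetical model call; compared against record.label
}));
```
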
## Exit Codes

evalsense returns specific exit codes for CI integration (see the CI sketch after this list):

- `0` - Success (all tests passed)
- `1` - Assertion failure (statistical thresholds not met)
- `2` - Integrity failure (dataset alignment issues)
- `3` - Execution error (test threw exception)
- `4` - Configuration error (invalid CLI options)

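A minimal CI wrapper sketch built on those exit codes; only the `run`, `--reporter`, and `--output` options shown earlier are assumed, and the file name and log messages are illustrative:

```javascript
// ci-evals.mjs - run evalsense and translate its exit code into CI log output.
import { spawnSync } from "node:child_process";

const { status } = spawnSync(
  "npx",
  ["evalsense", "run", "tests/", "--reporter", "json", "--output", "report.json"],
  { stdio: "inherit" }
);

if (status === 1) {
  console.error("Statistical thresholds not met - see report.json");
} else if (status !== 0) {
  console.error(`evalsense did not finish cleanly (exit code ${status})`);
}

// Propagate the original code so the CI job fails for the same reason.
process.exit(status ?? 1);
```
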
## Writing Eval Files

Eval files use the `.eval.js` or `.eval.ts` extension and are discovered automatically:

```
project/
├── tests/
│   ├── classifier.eval.js
│   └── hallucination.eval.js
├── data/
│   └── dataset.json
└── package.json
```

Run with:
```bash
npx evalsense run tests/
```

## Examples

See the [`examples/`](./examples/) directory for complete examples:

- [`classification.eval.js`](./examples/basic/classification.eval.js) - Binary sentiment classification
- [`hallucination.eval.js`](./examples/basic/hallucination.eval.js) - Continuous score binarization
- [`distribution-assertions.eval.js`](./examples/distribution-assertions.eval.js) - Distribution monitoring without ground truth
- [`judge-validation.eval.js`](./examples/judge-validation.eval.js) - Validating judges against human labels

## Field Types

evalsense automatically determines evaluation metrics based on field values (see the sketch after this list):

- **Boolean** (`true`/`false`) → Binary classification metrics
- **Categorical** (strings) → Multi-class classification metrics
- **Numeric** (numbers) → Regression metrics (MAE, MSE, RMSE, R²)
- **Numeric + threshold** → Binarized classification metrics

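A rough illustration of those rules, using only assertions documented above (the field names and thresholds are illustrative):

```javascript
// Boolean field -> binary classification metrics
expectStats(result).field("isSpam").toHaveAccuracyAbove(0.9);

// Categorical string field -> multi-class classification metrics
expectStats(result).field("intent").toHaveRecallAbove("support", 0.8);

// Numeric field -> regression metrics (MAE, MSE, RMSE, R²) by default;
// chaining .binarize() switches it to classification metrics at a threshold
expectStats(result)
  .field("confidence")
  .binarize(0.7)
  .toHavePrecisionAbove(true, 0.8);
```
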
## LLM-Based Metrics (v0.2.0+)

evalsense includes LLM-powered metrics for hallucination detection, relevance assessment, faithfulness verification, and toxicity detection.

### Quick Setup

```javascript
import { setLLMClient } from "evalsense/metrics";
import { hallucination, relevance, faithfulness, toxicity } from "evalsense/metrics/opinionated";

// 1. Configure your LLM client (one-time setup)
setLLMClient({
  async complete(prompt) {
    // Call your LLM API (OpenAI, Anthropic, local model, etc.)
    const response = await yourLLM.generate(prompt);
    return response.text;
  }
});

// 2. Use metrics in evaluations
const results = await hallucination({
  outputs: [{ id: "1", output: "Paris has 50 million people." }],
  context: ["Paris has approximately 2.1 million residents."]
});

console.log(results[0].score);     // 0.9 (high hallucination)
console.log(results[0].reasoning); // "Output claims 50M, context says 2.1M"
```

### Available Metrics

- **`hallucination()`** - Detects claims not supported by context
- **`relevance()`** - Measures query-response alignment
- **`faithfulness()`** - Verifies outputs don't contradict sources
- **`toxicity()`** - Identifies harmful or inappropriate content

### Evaluation Modes

Choose between accuracy and cost:

```javascript
// Per-row: Higher accuracy, higher cost (N API calls)
await hallucination({
  outputs,
  context,
  evaluationMode: "per-row" // default
});

// Batch: Lower cost, single API call
await hallucination({
  outputs,
  context,
  evaluationMode: "batch"
});
```

### Provider Examples

**OpenAI:**
```javascript
import OpenAI from "openai";

const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });
setLLMClient({
  async complete(prompt) {
    const response = await openai.chat.completions.create({
      model: "gpt-4-turbo-preview",
      messages: [{ role: "user", content: prompt }],
    });
    return response.choices[0]?.message?.content ?? "";
  }
});
```

**Anthropic:**
```javascript
import Anthropic from "@anthropic-ai/sdk";

const anthropic = new Anthropic({ apiKey: process.env.ANTHROPIC_API_KEY });
setLLMClient({
  async complete(prompt) {
    const message = await anthropic.messages.create({
      model: "claude-3-5-sonnet-20241022",
      max_tokens: 4096,
      messages: [{ role: "user", content: prompt }],
    });
    return message.content[0].text;
  }
});
```

**Local (Ollama):**
```javascript
setLLMClient({
  async complete(prompt) {
    const response = await fetch("http://localhost:11434/api/generate", {
      method: "POST",
      // stream: false so Ollama returns a single JSON object instead of a stream
      body: JSON.stringify({ model: "llama2", prompt, stream: false }),
    });
    return (await response.json()).response;
  }
});
```

### Learn More

- [LLM Metrics Guide](./docs/llm-metrics.md) - Complete usage guide
- [LLM Adapters Guide](./docs/llm-adapters.md) - Implement adapters for different providers
- [Migration Guide](./docs/migration-v0.2.md) - Upgrade from v0.1.x
- [Examples](./examples/) - Working code examples

## Philosophy

evalsense is built on the principle that **metrics are predictions, not facts**.

Instead of treating LLM-as-judge metrics (relevance, hallucination, etc.) as ground truth, evalsense:
- Treats them as **weak labels** from a model
- Validates them statistically against human references when available
- Computes confusion matrices to reveal bias and systematic errors
- Focuses on dataset-level distributions, not individual examples

## Contributing

Contributions are welcome! Please see [CLAUDE.md](./CLAUDE.md) for development guidelines.

## License

MIT © Mohit Joshi

---

**Made with ❤️ for the JS/Node.js AI community**

package/bin/evalsense.js
ADDED