evalsense 0.4.0 → 0.4.2
This diff shows the changes between publicly available versions of the package as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- package/README.md +95 -789
- package/dist/{chunk-TDGWDK2L.js → chunk-IZAC4S4T.js} +2 -2
- package/dist/{chunk-TDGWDK2L.js.map → chunk-IZAC4S4T.js.map} +1 -1
- package/dist/{chunk-4BKZPVY4.cjs → chunk-JX7KEFAD.cjs} +3 -3
- package/dist/{chunk-4BKZPVY4.cjs.map → chunk-JX7KEFAD.cjs.map} +1 -1
- package/dist/{chunk-NCCQRZ2Y.cjs → chunk-UH6L7A5Y.cjs} +2 -2
- package/dist/{chunk-NCCQRZ2Y.cjs.map → chunk-UH6L7A5Y.cjs.map} +1 -1
- package/dist/{chunk-IUVDDMJ3.js → chunk-ZPQYICAW.js} +3 -3
- package/dist/{chunk-IUVDDMJ3.js.map → chunk-ZPQYICAW.js.map} +1 -1
- package/dist/cli.cjs +104 -11
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +94 -1
- package/dist/cli.js.map +1 -1
- package/dist/{index-CoMpaW-K.d.ts → index-7Qog3wxS.d.ts} +2 -1
- package/dist/{index-CATqAHNK.d.cts → index-ezghUO7Q.d.cts} +2 -1
- package/dist/index.cjs +61 -61
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +2 -2
- package/dist/index.js.map +1 -1
- package/dist/metrics/index.cjs +36 -36
- package/dist/metrics/index.d.cts +2 -2
- package/dist/metrics/index.d.ts +2 -2
- package/dist/metrics/index.js +2 -2
- package/dist/metrics/opinionated/index.cjs +5 -5
- package/dist/metrics/opinionated/index.d.cts +1 -1
- package/dist/metrics/opinionated/index.d.ts +1 -1
- package/dist/metrics/opinionated/index.js +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -1,855 +1,161 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
> JS-native LLM evaluation framework with Jest-like API and statistical assertions
|
|
1
|
+
[evalsense website](https://www.evalsense.com)
|
|
4
2
|
|
|
5
3
|
[npm package](https://www.npmjs.com/package/evalsense)
|
|
4
|
+
[CI status](https://github.com/mohitjoshi14/evalsense/actions/workflows/ci.yml)
|
|
6
5
|
[License: Apache-2.0](https://opensource.org/licenses/Apache-2.0)
|
|
7
6
|
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
**evalsense is like Jest for testing code that uses LLMs.**
|
|
11
|
-
|
|
12
|
-
It helps engineers answer one simple question:
|
|
13
|
-
|
|
14
|
-
> **“Is my LLM-powered code good enough to ship?”**
|
|
15
|
-
|
|
16
|
-
Instead of checking a few example responses, evalsense runs your code across many inputs, measures overall quality, and gives you a clear **pass / fail** result — locally or in CI.
|
|
17
|
-
|
|
18
|
-
evalsense is built for **engineers deploying LLM-enabled features**, not for training or benchmarking models.
|
|
19
|
-
|
|
20
|
-
## What problem does evalsense solve?
|
|
21
|
-
|
|
22
|
-
Most LLM evaluation tools focus on individual outputs:
|
|
23
|
-
|
|
24
|
-
> _“How good is this one response?”_
|
|
25
|
-
|
|
26
|
-
That’s useful, but it doesn’t tell you whether your system is reliable.
|
|
7
|
+
> **Jest for LLM Evaluation.** Pass/fail quality gates for your LLM-powered code.
|
|
27
8
|
|
|
28
|
-
evalsense
|
|
9
|
+
evalsense runs your code across many inputs, measures quality statistically, and gives you a clear **pass / fail** result — locally or in CI.
|
|
29
10
|
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
- run your code many times
|
|
35
|
-
- measure results across all runs
|
|
36
|
-
- fail fast if quality drops
|
|
37
|
-
|
|
38
|
-
## How evalsense works (in plain terms)
|
|
39
|
-
|
|
40
|
-
At a high level, evalsense:
|
|
41
|
-
|
|
42
|
-
1. Runs your code
|
|
43
|
-
(this can be a function, module, API call, or a fixed dataset)
|
|
44
|
-
2. Collects the results
|
|
45
|
-
3. Scores them using:
|
|
46
|
-
- standard metrics (accuracy, precision, recall, F1)
|
|
47
|
-
- LLM-as-judge checks (e.g. relevance, hallucination, correctness)
|
|
11
|
+
```bash
|
|
12
|
+
npm install --save-dev evalsense
|
|
13
|
+
```
|
|
48
14
|
|
|
49
|
-
|
|
50
|
-
5. Applies rules you define
|
|
51
|
-
6. Passes or fails the test
|
|
15
|
+
## Quick Start
|
|
52
16
|
|
|
53
|
-
|
|
17
|
+
Create `quality.eval.js`:
|
|
54
18
|
|
|
55
|
-
|
|
19
|
+
```javascript
|
|
20
|
+
import { describe, evalTest, expectStats } from "evalsense";
|
|
21
|
+
import { readFileSync } from "fs";
|
|
56
22
|
|
|
57
|
-
```ts
|
|
58
23
|
describe("test answer quality", async () => {
|
|
59
24
|
evalTest("toxicity detection", async () => {
|
|
60
25
|
const answers = await generateAnswersDataset(testQuestions);
|
|
61
26
|
const toxicityScore = await toxicity(answers);
|
|
62
27
|
|
|
63
|
-
expectStats(toxicityScore)
|
|
64
|
-
|
|
65
|
-
.percentageBelow(0.5).toBeAtLeast(0.5)
|
|
66
|
-
};
|
|
28
|
+
expectStats(toxicityScore).field("score").percentageBelow(0.5).toBeAtLeast(0.5);
|
|
29
|
+
});
|
|
67
30
|
|
|
68
31
|
evalTest("correctness score", async () => {
|
|
69
32
|
const answers = await generateAnswersDataset(testQuestions);
|
|
70
|
-
const groundTruth =
|
|
33
|
+
const groundTruth = JSON.parse(readFileSync("truth-dataset.json", "utf-8"));
|
|
71
34
|
|
|
72
35
|
expectStats(answers, groundTruth)
|
|
73
36
|
.field("label")
|
|
74
37
|
.accuracy.toBeAtLeast(0.9)
|
|
75
|
-
.precision("positive")
|
|
76
|
-
.
|
|
77
|
-
.
|
|
78
|
-
|
|
79
|
-
});
|
|
80
|
-
```
|
|
81
|
-
|
|
82
|
-
Running the test:
|
|
83
|
-
|
|
84
|
-
```markdown
|
|
85
|
-
**test answer quality**
|
|
86
|
-
|
|
87
|
-
✓ toxicity detection (1ms)
|
|
88
|
-
✓ 50.0% of 'score' values are below or equal to 0.5 (expected >= 50.0%)
|
|
89
|
-
Expected: 50.0%
|
|
90
|
-
Actual: 50.0%
|
|
91
|
-
|
|
92
|
-
✓ correctness score (1ms)
|
|
93
|
-
Field: label | Accuracy: 100.0% | F1: 100.0%
|
|
94
|
-
negative: P=100.0% R=100.0% F1=100.0% (n=5)
|
|
95
|
-
positive: P=100.0% R=100.0% F1=100.0% (n=5)
|
|
96
|
-
|
|
97
|
-
Confusion Matrix: label
|
|
98
|
-
|
|
99
|
-
Predicted → correct incorrect
|
|
100
|
-
Actual ↓
|
|
101
|
-
correct 5 0
|
|
102
|
-
incorrect 0 5
|
|
103
|
-
|
|
104
|
-
✓ Accuracy 100.0% >= 90.0%
|
|
105
|
-
Expected: 90.0%
|
|
106
|
-
Actual: 100.0%
|
|
107
|
-
✓ Precision for 'positive' 100.0% >= 70.0%
|
|
108
|
-
Expected: 70.0%
|
|
109
|
-
Actual: 100.0%
|
|
110
|
-
✓ Recall for 'positive' 100.0% >= 70.0%
|
|
111
|
-
Expected: 70.0%
|
|
112
|
-
Actual: 100.0%
|
|
113
|
-
✓ Confusion matrix recorded for field "label"
|
|
114
|
-
```
|
|
115
|
-
|
|
116
|
-
If the quality drops, the test fails — just like a normal test.
|
|
117
|
-
|
|
118
|
-
## Two common ways to use evalsense
|
|
119
|
-
|
|
120
|
-
### 1. When you **don’t have ground truth**
|
|
121
|
-
|
|
122
|
-
Use this when there are no labels.
|
|
123
|
-
|
|
124
|
-
Example:
|
|
125
|
-
|
|
126
|
-
- Run your LLM-powered function
|
|
127
|
-
- Score outputs using an LLM-as-judge (relevance, hallucination, etc.)
|
|
128
|
-
- Define what “acceptable” means
|
|
129
|
-
- Fail if quality degrades
|
|
130
|
-
|
|
131
|
-
**Example rule:**
|
|
132
|
-
|
|
133
|
-
> “Average relevance score must be at least 0.75”
|
|
134
|
-
|
|
135
|
-
### 2. When you **do have ground truth**
|
|
136
|
-
|
|
137
|
-
Use this when correct answers are known.
|
|
138
|
-
|
|
139
|
-
Example:
|
|
140
|
-
|
|
141
|
-
- Run your prediction code
|
|
142
|
-
- Compare outputs with ground truth
|
|
143
|
-
- Compute accuracy, precision, recall, F1
|
|
144
|
-
- Optionally add LLM-as-judge checks
|
|
145
|
-
- Fail if metrics fall below thresholds
|
|
146
|
-
|
|
147
|
-
**Example rule:**
|
|
148
|
-
|
|
149
|
-
> “F1 score must be ≥ 0.85 and false positives ≤ 5%”
|
|
150
|
-
|
|
151
|
-
## What evalsense is _not_
|
|
152
|
-
|
|
153
|
-
evalsense is **not**:
|
|
154
|
-
|
|
155
|
-
- A tool for scoring single responses in isolation
|
|
156
|
-
- A dashboard or experiment-tracking platform
|
|
157
|
-
- A system for analyzing agent step-by-step traces
|
|
158
|
-
- A model benchmarking or training framework
|
|
159
|
-
|
|
160
|
-
If you mainly want scores, charts, or leaderboards, other tools may be a better fit.
|
|
161
|
-
|
|
162
|
-
## Who should use evalsense
|
|
163
|
-
|
|
164
|
-
evalsense is a good fit if you:
|
|
165
|
-
|
|
166
|
-
- are **shipping LLM-powered features**
|
|
167
|
-
- want **clear pass/fail quality gates**
|
|
168
|
-
- run checks in **CI/CD**
|
|
169
|
-
- care about **regressions** (“did this get worse?”)
|
|
170
|
-
- already think in terms of tests
|
|
171
|
-
- work in **JavaScript / TypeScript**
|
|
172
|
-
|
|
173
|
-
## Who should _not_ use evalsense
|
|
174
|
-
|
|
175
|
-
evalsense may not be right for you if you:
|
|
176
|
-
|
|
177
|
-
- only care about individual output scores
|
|
178
|
-
- want visual dashboards or experiment UIs
|
|
179
|
-
- need deep agent trace inspection
|
|
180
|
-
- are training or benchmarking foundation models
|
|
181
|
-
|
|
182
|
-
## In one sentence
|
|
183
|
-
|
|
184
|
-
**evalsense lets you test the quality of LLM-powered code the same way you test everything else — with clear pass/fail results.**
|
|
185
|
-
|
|
186
|
-
## Installation
|
|
187
|
-
|
|
188
|
-
```bash
|
|
189
|
-
npm install --save-dev evalsense
|
|
190
|
-
```
|
|
191
|
-
|
|
192
|
-
Or with yarn:
|
|
193
|
-
|
|
194
|
-
```bash
|
|
195
|
-
yarn add -D evalsense
|
|
196
|
-
```
|
|
197
|
-
|
|
198
|
-
## Quick Start
|
|
199
|
-
|
|
200
|
-
Create a file named `sentiment.eval.js`:
|
|
201
|
-
|
|
202
|
-
```javascript
|
|
203
|
-
import { describe, evalTest, expectStats } from "evalsense";
|
|
204
|
-
import { readFileSync } from "fs";
|
|
205
|
-
|
|
206
|
-
// Your model function - can be any JS function
|
|
207
|
-
function classifySentiment(text) {
|
|
208
|
-
const lower = text.toLowerCase();
|
|
209
|
-
const hasPositive = /love|amazing|great|fantastic|perfect/.test(lower);
|
|
210
|
-
const hasNegative = /terrible|worst|disappointed|waste/.test(lower);
|
|
211
|
-
return hasPositive && !hasNegative ? "positive" : "negative";
|
|
212
|
-
}
|
|
213
|
-
|
|
214
|
-
describe("Sentiment classifier", () => {
|
|
215
|
-
evalTest("accuracy above 80%", async () => {
|
|
216
|
-
// 1. Load ground truth data
|
|
217
|
-
const groundTruth = JSON.parse(readFileSync("./sentiment.json", "utf-8"));
|
|
218
|
-
|
|
219
|
-
// 2. Run your model and collect predictions
|
|
220
|
-
const predictions = groundTruth.map((record) => ({
|
|
221
|
-
id: record.id,
|
|
222
|
-
sentiment: classifySentiment(record.text),
|
|
223
|
-
}));
|
|
224
|
-
|
|
225
|
-
// 3. Assert on statistical properties
|
|
226
|
-
expectStats(predictions, groundTruth)
|
|
227
|
-
.field("sentiment")
|
|
228
|
-
.accuracy.toBeAtLeast(0.8)
|
|
229
|
-
.recall("positive").toBeAtLeast(0.7)
|
|
230
|
-
.precision("positive").toBeAtLeast(0.7)
|
|
38
|
+
.precision("positive")
|
|
39
|
+
.toBeAtLeast(0.7)
|
|
40
|
+
.recall("positive")
|
|
41
|
+
.toBeAtLeast(0.7)
|
|
231
42
|
.displayConfusionMatrix();
|
|
232
43
|
});
|
|
233
44
|
});
|
|
234
45
|
```
|
|
235
46
|
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
```json
|
|
239
|
-
[
|
|
240
|
-
{ "id": "1", "text": "I love this product!", "sentiment": "positive" },
|
|
241
|
-
{ "id": "2", "text": "Terrible experience.", "sentiment": "negative" },
|
|
242
|
-
{ "id": "3", "text": "Great quality!", "sentiment": "positive" }
|
|
243
|
-
]
|
|
244
|
-
```
|
|
245
|
-
|
|
246
|
-
Run the evaluation:
|
|
247
|
-
|
|
248
|
-
```bash
|
|
249
|
-
npx evalsense run sentiment.eval.js
|
|
250
|
-
```
|
|
251
|
-
|
|
252
|
-
## Usage
|
|
253
|
-
|
|
254
|
-
### Basic Classification Example
|
|
255
|
-
|
|
256
|
-
```javascript
|
|
257
|
-
import { describe, evalTest, expectStats } from "evalsense";
|
|
258
|
-
import { readFileSync } from "fs";
|
|
259
|
-
|
|
260
|
-
describe("Spam classifier", () => {
|
|
261
|
-
evalTest("high precision and recall", async () => {
|
|
262
|
-
const groundTruth = JSON.parse(readFileSync("./emails.json", "utf-8"));
|
|
263
|
-
|
|
264
|
-
const predictions = groundTruth.map((record) => ({
|
|
265
|
-
id: record.id,
|
|
266
|
-
isSpam: classifyEmail(record.text),
|
|
267
|
-
}));
|
|
268
|
-
|
|
269
|
-
expectStats(predictions, groundTruth)
|
|
270
|
-
.field("isSpam")
|
|
271
|
-
.accuracy.toBeAtLeast(0.9)
|
|
272
|
-
.precision(true).toBeAtLeast(0.85) // Precision for spam=true
|
|
273
|
-
.recall(true).toBeAtLeast(0.85) // Recall for spam=true
|
|
274
|
-
.displayConfusionMatrix();
|
|
275
|
-
});
|
|
276
|
-
});
|
|
277
|
-
```
|
|
278
|
-
|
|
279
|
-
### Continuous Scores with Binarization
|
|
280
|
-
|
|
281
|
-
```javascript
|
|
282
|
-
import { describe, evalTest, expectStats } from "evalsense";
|
|
283
|
-
import { readFileSync } from "fs";
|
|
284
|
-
|
|
285
|
-
describe("Hallucination detector", () => {
|
|
286
|
-
evalTest("detect hallucinations with 70% recall", async () => {
|
|
287
|
-
const groundTruth = JSON.parse(readFileSync("./outputs.json", "utf-8"));
|
|
288
|
-
|
|
289
|
-
// Your model returns a continuous score (0.0 to 1.0)
|
|
290
|
-
const predictions = groundTruth.map((record) => ({
|
|
291
|
-
id: record.id,
|
|
292
|
-
hallucinated: computeHallucinationScore(record.output),
|
|
293
|
-
}));
|
|
294
|
-
|
|
295
|
-
// Binarize the score at threshold 0.3
|
|
296
|
-
expectStats(predictions, groundTruth)
|
|
297
|
-
.field("hallucinated")
|
|
298
|
-
.binarize(0.3) // >= 0.3 means hallucinated
|
|
299
|
-
.recall(true).toBeAtLeast(0.7)
|
|
300
|
-
.precision(true).toBeAtLeast(0.6)
|
|
301
|
-
.displayConfusionMatrix();
|
|
302
|
-
});
|
|
303
|
-
});
|
|
304
|
-
```
|
|
305
|
-
|
|
306
|
-
### Multi-class Classification
|
|
307
|
-
|
|
308
|
-
```javascript
|
|
309
|
-
import { describe, evalTest, expectStats } from "evalsense";
|
|
310
|
-
import { readFileSync } from "fs";
|
|
311
|
-
|
|
312
|
-
describe("Intent classifier", () => {
|
|
313
|
-
evalTest("balanced performance across intents", async () => {
|
|
314
|
-
const groundTruth = JSON.parse(readFileSync("./intents.json", "utf-8"));
|
|
315
|
-
|
|
316
|
-
const predictions = groundTruth.map((record) => ({
|
|
317
|
-
id: record.id,
|
|
318
|
-
intent: classifyIntent(record.query),
|
|
319
|
-
}));
|
|
320
|
-
|
|
321
|
-
expectStats(predictions, groundTruth)
|
|
322
|
-
.field("intent")
|
|
323
|
-
.accuracy.toBeAtLeast(0.85)
|
|
324
|
-
.recall("purchase").toBeAtLeast(0.8)
|
|
325
|
-
.recall("support").toBeAtLeast(0.8)
|
|
326
|
-
.recall("general").toBeAtLeast(0.7)
|
|
327
|
-
.displayConfusionMatrix();
|
|
328
|
-
});
|
|
329
|
-
});
|
|
330
|
-
```
|
|
331
|
-
|
|
332
|
-
### Parallel Model Execution with LLMs
|
|
333
|
-
|
|
334
|
-
For LLM calls or slow operations, use `Promise.all` with chunking for concurrency control:
|
|
335
|
-
|
|
336
|
-
```javascript
|
|
337
|
-
import { describe, evalTest, expectStats } from "evalsense";
|
|
338
|
-
import { readFileSync } from "fs";
|
|
339
|
-
|
|
340
|
-
// Helper for parallel execution with concurrency limit
|
|
341
|
-
async function mapConcurrent(items, fn, concurrency = 5) {
|
|
342
|
-
const results = [];
|
|
343
|
-
for (let i = 0; i < items.length; i += concurrency) {
|
|
344
|
-
const chunk = items.slice(i, i + concurrency);
|
|
345
|
-
results.push(...(await Promise.all(chunk.map(fn))));
|
|
346
|
-
}
|
|
347
|
-
return results;
|
|
348
|
-
}
|
|
349
|
-
|
|
350
|
-
describe("LLM classifier", () => {
|
|
351
|
-
evalTest("classification accuracy", async () => {
|
|
352
|
-
const groundTruth = JSON.parse(readFileSync("./data.json", "utf-8"));
|
|
353
|
-
|
|
354
|
-
// Run with concurrency=5
|
|
355
|
-
const predictions = await mapConcurrent(
|
|
356
|
-
groundTruth,
|
|
357
|
-
async (record) => {
|
|
358
|
-
const response = await callLLM(record.text);
|
|
359
|
-
return { id: record.id, category: response.category };
|
|
360
|
-
},
|
|
361
|
-
5
|
|
362
|
-
);
|
|
363
|
-
|
|
364
|
-
expectStats(predictions, groundTruth).field("category").accuracy.toBeAtLeast(0.9);
|
|
365
|
-
});
|
|
366
|
-
});
|
|
367
|
-
```
|
|
368
|
-
|
|
369
|
-
### Test Lifecycle Hooks
|
|
370
|
-
|
|
371
|
-
```javascript
|
|
372
|
-
import { describe, evalTest, beforeAll, afterAll, beforeEach, afterEach } from "evalsense";
|
|
373
|
-
|
|
374
|
-
describe("Model evaluation", () => {
|
|
375
|
-
let model;
|
|
376
|
-
|
|
377
|
-
beforeAll(async () => {
|
|
378
|
-
// Load model once before all tests
|
|
379
|
-
model = await loadModel();
|
|
380
|
-
});
|
|
381
|
-
|
|
382
|
-
afterAll(async () => {
|
|
383
|
-
// Cleanup after all tests
|
|
384
|
-
await model.dispose();
|
|
385
|
-
});
|
|
386
|
-
|
|
387
|
-
beforeEach(() => {
|
|
388
|
-
// Reset state before each test
|
|
389
|
-
model.reset();
|
|
390
|
-
});
|
|
391
|
-
|
|
392
|
-
afterEach(() => {
|
|
393
|
-
// Cleanup after each test
|
|
394
|
-
console.log("Test completed");
|
|
395
|
-
});
|
|
396
|
-
|
|
397
|
-
evalTest("test 1", async () => {
|
|
398
|
-
// ...
|
|
399
|
-
});
|
|
400
|
-
|
|
401
|
-
evalTest("test 2", async () => {
|
|
402
|
-
// ...
|
|
403
|
-
});
|
|
404
|
-
});
|
|
405
|
-
```
|
|
406
|
-
|
|
407
|
-
## CLI Usage
|
|
408
|
-
|
|
409
|
-
### Run Evaluations
|
|
47
|
+
Run it:
|
|
410
48
|
|
|
411
49
|
```bash
|
|
412
|
-
|
|
413
|
-
npx evalsense run
|
|
414
|
-
|
|
415
|
-
# Run specific file or directory
|
|
416
|
-
npx evalsense run tests/eval/
|
|
417
|
-
|
|
418
|
-
# Filter tests by name
|
|
419
|
-
npx evalsense run --filter "accuracy"
|
|
420
|
-
|
|
421
|
-
# Output JSON report
|
|
422
|
-
npx evalsense run --output report.json
|
|
423
|
-
|
|
424
|
-
# Use different reporters
|
|
425
|
-
npx evalsense run --reporter console # default
|
|
426
|
-
npx evalsense run --reporter json
|
|
427
|
-
npx evalsense run --reporter both
|
|
428
|
-
|
|
429
|
-
# Bail on first failure
|
|
430
|
-
npx evalsense run --bail
|
|
431
|
-
|
|
432
|
-
# Set timeout (in milliseconds)
|
|
433
|
-
npx evalsense run --timeout 60000
|
|
50
|
+
npx evalsense run quality.eval.js
|
|
434
51
|
```
|
|
435
52
|
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
```bash
|
|
439
|
-
# List all discovered eval files
|
|
440
|
-
npx evalsense list
|
|
53
|
+
Output:
|
|
441
54
|
|
|
442
|
-
# List files in specific directory
|
|
443
|
-
npx evalsense list tests/
|
|
444
55
|
```
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
56
|
+
test answer quality
|
|
57
|
+
✓ toxicity detection (1ms)
|
|
58
|
+
✓ 50.0% of 'score' values are below
|
|
59
|
+
or equal to 0.5 (expected >= 50.0%)
|
|
60
|
+
Expected: 50.0%
|
|
61
|
+
Actual: 50.0%
|
|
62
|
+
✓ correctness score (1ms)
|
|
63
|
+
Field: label | Accuracy: 100.0% | F1: 100.0%
|
|
64
|
+
negative: P=100.0% R=100.0% F1=100.0% (n=5)
|
|
65
|
+
positive: P=100.0% R=100.0% F1=100.0% (n=5)
|
|
66
|
+
Confusion Matrix: label
|
|
67
|
+
Predicted → correct incorrect
|
|
68
|
+
Actual ↓
|
|
69
|
+
correct 5 0
|
|
70
|
+
incorrect 0 5
|
|
71
|
+
✓ Accuracy 100.0% >= 90.0%
|
|
72
|
+
✓ Precision for 'positive' 100.0% >= 70.0%
|
|
73
|
+
✓ Recall for 'positive' 100.0% >= 70.0%
|
|
74
|
+
✓ Confusion matrix recorded for field "label"
|
|
75
|
+
All tests passed.
|
|
458
76
|
```
|
|
459
77
|
|
|
460
|
-
|
|
78
|
+
## Key Features
|
|
461
79
|
|
|
462
|
-
|
|
80
|
+
- **Jest-like API** — `describe`, `evalTest`, `expectStats` feel familiar
|
|
81
|
+
- **Statistical assertions** — accuracy, precision, recall, F1, MAE, RMSE, R²
|
|
82
|
+
- **Confusion matrices** — built-in display with `.displayConfusionMatrix()`
|
|
83
|
+
- **Distribution monitoring** — `percentageAbove` / `percentageBelow` without ground truth
|
|
84
|
+
- **LLM-as-judge** — built-in hallucination, relevance, faithfulness, toxicity metrics
|
|
85
|
+
- **CI/CD ready** — structured exit codes, JSON reporter, bail mode
|
|
86
|
+
- **Zero config** — works with any JS data loading and model execution
|
|
463
87
|
|
|
464
|
-
|
|
465
|
-
evalTest("should have 90% accuracy", async () => {
|
|
466
|
-
// test implementation
|
|
467
|
-
});
|
|
468
|
-
```
|
|
469
|
-
|
|
470
|
-
### Dataset Loading
|
|
471
|
-
|
|
472
|
-
evalsense doesn't dictate how you load data or run your model. Use standard Node.js tools:
|
|
88
|
+
## LLM-Based Metrics
|
|
473
89
|
|
|
474
90
|
```javascript
|
|
475
|
-
import {
|
|
476
|
-
|
|
477
|
-
// Load ground truth
|
|
478
|
-
const groundTruth = JSON.parse(readFileSync("./data.json", "utf-8"));
|
|
91
|
+
import { setLLMClient, createAnthropicAdapter } from "evalsense/metrics";
|
|
92
|
+
import { hallucination, relevance } from "evalsense/metrics/opinionated";
|
|
479
93
|
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
// Or use async operations
|
|
484
|
-
const predictions = await Promise.all(
|
|
485
|
-
groundTruth.map(async (item) => {
|
|
486
|
-
const result = await callLLM(item.text);
|
|
487
|
-
return { id: item.id, prediction: result };
|
|
94
|
+
setLLMClient(
|
|
95
|
+
createAnthropicAdapter(process.env.ANTHROPIC_API_KEY, {
|
|
96
|
+
model: "claude-haiku-4-5-20251001",
|
|
488
97
|
})
|
|
489
98
|
);
|
|
490
|
-
```
|
|
491
|
-
|
|
492
|
-
### Assertions
|
|
493
|
-
|
|
494
|
-
#### `expectStats(predictions, groundTruth)`
|
|
495
|
-
|
|
496
|
-
Creates a statistical assertion chain from predictions and ground truth. Aligns by `id` field.
|
|
497
99
|
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
.
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
```
|
|
505
|
-
|
|
506
|
-
**One-argument form (distribution assertions only):**
|
|
507
|
-
|
|
508
|
-
```javascript
|
|
509
|
-
// For distribution monitoring without ground truth
|
|
510
|
-
expectStats(predictions).field("confidence").percentageAbove(0.7).toBeAtLeast(0.8);
|
|
511
|
-
```
|
|
512
|
-
|
|
513
|
-
**Common use cases:**
|
|
514
|
-
|
|
515
|
-
- Classification evaluation with ground truth
|
|
516
|
-
- Regression evaluation (MAE, RMSE, R²)
|
|
517
|
-
- Validating LLM judges against human labels
|
|
518
|
-
- Distribution monitoring without ground truth
|
|
519
|
-
|
|
520
|
-
### Field Selection
|
|
521
|
-
|
|
522
|
-
#### `.field(fieldName)`
|
|
523
|
-
|
|
524
|
-
Selects a field for evaluation.
|
|
525
|
-
|
|
526
|
-
```javascript
|
|
527
|
-
expectStats(result).field("sentiment");
|
|
528
|
-
```
|
|
529
|
-
|
|
530
|
-
#### `.binarize(threshold)`
|
|
531
|
-
|
|
532
|
-
Converts continuous scores to binary (>=threshold is true).
|
|
533
|
-
|
|
534
|
-
```javascript
|
|
535
|
-
expectStats(result)
|
|
536
|
-
.field("score")
|
|
537
|
-
.binarize(0.5) // score >= 0.5 is true
|
|
538
|
-
.accuracy.toBeAtLeast(0.8);
|
|
539
|
-
```
|
|
540
|
-
|
|
541
|
-
### Available Assertions
|
|
542
|
-
|
|
543
|
-
#### Classification Metrics
|
|
544
|
-
|
|
545
|
-
```javascript
|
|
546
|
-
// Accuracy (macro average for multi-class)
|
|
547
|
-
.accuracy.toBeAtLeast(threshold)
|
|
548
|
-
.accuracy.toBeAbove(threshold)
|
|
549
|
-
.accuracy.toBeAtMost(threshold)
|
|
550
|
-
.accuracy.toBeBelow(threshold)
|
|
551
|
-
|
|
552
|
-
// Precision (per class or macro average)
|
|
553
|
-
.precision("className").toBeAtLeast(threshold)
|
|
554
|
-
.precision().toBeAtLeast(threshold) // macro average
|
|
555
|
-
|
|
556
|
-
// Recall (per class or macro average)
|
|
557
|
-
.recall("className").toBeAtLeast(threshold)
|
|
558
|
-
.recall().toBeAtLeast(threshold) // macro average
|
|
559
|
-
|
|
560
|
-
// F1 Score (macro average)
|
|
561
|
-
.f1.toBeAtLeast(threshold)
|
|
562
|
-
.f1.toBeAbove(threshold)
|
|
563
|
-
|
|
564
|
-
// Regression Metrics
|
|
565
|
-
.mae.toBeAtMost(threshold) // Mean Absolute Error
|
|
566
|
-
.rmse.toBeAtMost(threshold) // Root Mean Squared Error
|
|
567
|
-
.r2.toBeAtLeast(threshold) // R² coefficient
|
|
568
|
-
|
|
569
|
-
// Confusion Matrix
|
|
570
|
-
.displayConfusionMatrix() // Displays confusion matrix (not an assertion)
|
|
571
|
-
```
|
|
572
|
-
|
|
573
|
-
#### Available Matchers
|
|
574
|
-
|
|
575
|
-
All metrics return a matcher object with these comparison methods:
|
|
576
|
-
|
|
577
|
-
```javascript
|
|
578
|
-
.toBeAtLeast(x) // >= x
|
|
579
|
-
.toBeAbove(x) // > x
|
|
580
|
-
.toBeAtMost(x) // <= x
|
|
581
|
-
.toBeBelow(x) // < x
|
|
582
|
-
.toEqual(x, tolerance?) // === x (with optional tolerance for floats)
|
|
583
|
-
```
|
|
584
|
-
|
|
585
|
-
#### Distribution Assertions
|
|
586
|
-
|
|
587
|
-
Distribution assertions validate output distributions **without requiring ground truth**. Use these to monitor that model outputs stay within expected ranges.
|
|
588
|
-
|
|
589
|
-
```javascript
|
|
590
|
-
// Assert that at least 80% of confidence scores are above 0.7
|
|
591
|
-
expectStats(predictions).field("confidence").percentageAbove(0.7).toBeAtLeast(0.8);
|
|
592
|
-
|
|
593
|
-
// Assert that at least 90% of toxicity scores are below 0.3
|
|
594
|
-
expectStats(predictions).field("toxicity").percentageBelow(0.3).toBeAtLeast(0.9);
|
|
595
|
-
|
|
596
|
-
// Chain multiple distribution assertions
|
|
597
|
-
expectStats(predictions)
|
|
598
|
-
.field("score")
|
|
599
|
-
.percentageAbove(0.5).toBeAtLeast(0.6) // At least 60% above 0.5
|
|
600
|
-
.percentageBelow(0.9).toBeAtLeast(0.8); // At least 80% below 0.9
|
|
601
|
-
```
|
|
602
|
-
|
|
603
|
-
**Use cases:**
|
|
604
|
-
|
|
605
|
-
- Monitor confidence score distributions
|
|
606
|
-
- Validate schema compliance rates
|
|
607
|
-
- Check output range constraints
|
|
608
|
-
- Ensure score distributions remain stable over time
|
|
609
|
-
|
|
610
|
-
See [Distribution Assertions Example](./examples/distribution-assertions.eval.js) for complete examples.
|
|
611
|
-
|
|
612
|
-
### Judge Validation
|
|
613
|
-
|
|
614
|
-
Validate judge outputs against human-labeled ground truth using the **two-argument expectStats API**:
|
|
615
|
-
|
|
616
|
-
```javascript
|
|
617
|
-
// Judge outputs (predictions from your judge/metric)
|
|
618
|
-
const judgeOutputs = [
|
|
619
|
-
{ id: "1", hallucinated: true },
|
|
620
|
-
{ id: "2", hallucinated: false },
|
|
621
|
-
{ id: "3", hallucinated: true },
|
|
622
|
-
];
|
|
623
|
-
|
|
624
|
-
// Human labels (ground truth)
|
|
625
|
-
const humanLabels = [
|
|
626
|
-
{ id: "1", hallucinated: true },
|
|
627
|
-
{ id: "2", hallucinated: false },
|
|
628
|
-
{ id: "3", hallucinated: false },
|
|
629
|
-
];
|
|
630
|
-
|
|
631
|
-
// Validate judge performance
|
|
632
|
-
expectStats(judgeOutputs, humanLabels)
|
|
633
|
-
.field("hallucinated")
|
|
634
|
-
.recall(true).toBeAtLeast(0.9) // Don't miss hallucinations
|
|
635
|
-
.precision(true).toBeAtLeast(0.7) // Some false positives OK
|
|
636
|
-
.displayConfusionMatrix();
|
|
100
|
+
const scores = await hallucination({
|
|
101
|
+
outputs: [{ id: "1", output: "Paris has 50 million people." }],
|
|
102
|
+
context: ["Paris has approximately 2.1 million residents."],
|
|
103
|
+
});
|
|
104
|
+
// scores[0].score → 0.9 (high hallucination)
|
|
105
|
+
// scores[0].reasoning → "Output claims 50M, context says 2.1M"
|
|
637
106
|
```
|
|
638
107
|
|
|
639
|
-
|
|
108
|
+
Built-in providers: OpenAI, Anthropic, OpenRouter, or bring your own adapter.
|
|
109
|
+
See [LLM Metrics Guide](./docs/llm-metrics.md) and [Adapters Guide](./docs/llm-adapters.md).
|
|
640
110
|
|
|
641
|
-
|
|
642
|
-
- Validate heuristic metrics against human labels
|
|
643
|
-
- Test automated detection systems (refusal, policy compliance)
|
|
644
|
-
- Calibrate metric thresholds
|
|
111
|
+
## Using with Claude Code (Vibe Check)
|
|
645
112
|
|
|
646
|
-
|
|
113
|
+
evalsense includes an example [Claude Code skill](./skill.md) that acts as an automated LLM quality gate. To set it up in your project:
|
|
647
114
|
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
115
|
+
1. Install evalsense as a dev dependency
|
|
116
|
+
2. Copy [`skill.md`](./skill.md) into your project at `.claude/skills/llm-quality-gate/SKILL.md`
|
|
117
|
+
3. After building any LLM feature, run `/llm-quality-gate` in Claude Code
|
|
651
118
|
|
|
652
|
-
|
|
119
|
+
Claude will automatically create a `.eval.js` file with a real dataset and meaningful thresholds, run `npx evalsense run`, and give you a **ship / no-ship** decision.
|
|
653
120
|
|
|
654
|
-
|
|
121
|
+
## Documentation
|
|
655
122
|
|
|
656
|
-
|
|
123
|
+
| Guide | Description |
|
|
124
|
+
| -------------------------------------------------- | ------------------------------------------------ |
|
|
125
|
+
| [API Reference](./docs/api-reference.md) | Full API — all assertions, matchers, metrics |
|
|
126
|
+
| [CLI Reference](./docs/cli.md) | All CLI flags, exit codes, CI integration |
|
|
127
|
+
| [LLM Metrics](./docs/llm-metrics.md) | Hallucination, relevance, faithfulness, toxicity |
|
|
128
|
+
| [LLM Adapters](./docs/llm-adapters.md) | OpenAI, Anthropic, OpenRouter, custom adapters |
|
|
129
|
+
| [Custom Metrics](./docs/custom-metrics-guide.md) | Pattern and keyword metrics |
|
|
130
|
+
| [Agent Judges](./docs/agent-judges.md) | Design patterns for evaluating agent systems |
|
|
131
|
+
| [Regression Metrics](./docs/regression-metrics.md) | MAE, RMSE, R² usage |
|
|
132
|
+
| [Examples](./examples/) | Working code examples |
|
|
657
133
|
|
|
658
134
|
## Dataset Format
|
|
659
135
|
|
|
660
|
-
|
|
136
|
+
Records must have an `id` or `_id` field:
|
|
661
137
|
|
|
662
138
|
```json
|
|
663
139
|
[
|
|
664
|
-
{
|
|
665
|
-
|
|
666
|
-
"text": "input text",
|
|
667
|
-
"label": "expected_output"
|
|
668
|
-
},
|
|
669
|
-
{
|
|
670
|
-
"id": "2",
|
|
671
|
-
"text": "another input",
|
|
672
|
-
"label": "another_output"
|
|
673
|
-
}
|
|
140
|
+
{ "id": "1", "text": "sample input", "label": "positive" },
|
|
141
|
+
{ "id": "2", "text": "another input", "label": "negative" }
|
|
674
142
|
]
|
|
675
143
|
```
|
|
676
144
|
|
|
677
|
-
**Requirements:**
|
|
678
|
-
|
|
679
|
-
- Each record MUST have `id` or `_id` for alignment
|
|
680
|
-
- Ground truth fields (e.g., `label`, `sentiment`, `category`) are compared against model outputs
|
|
681
|
-
- Model functions must return predictions with matching `id`
|
|
682
|
-
|
|
683
145
|
## Exit Codes
|
|
684
146
|
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
|
|
147
|
+
| Code | Meaning |
|
|
148
|
+
| ---- | ------------------------- |
|
|
149
|
+
| `0` | All tests passed |
|
|
150
|
+
| `1` | Assertion failure |
|
|
151
|
+
| `2` | Dataset integrity failure |
|
|
152
|
+
| `3` | Execution error |
|
|
153
|
+
| `4` | Configuration error |
|
|
692
154
|
|
|
693
|
-
##
|
|
694
|
-
|
|
695
|
-
Eval files use the `.eval.js` or `.eval.ts` extension and are discovered automatically:
|
|
696
|
-
|
|
697
|
-
```
|
|
698
|
-
project/
|
|
699
|
-
├── tests/
|
|
700
|
-
│ ├── classifier.eval.js
|
|
701
|
-
│ └── hallucination.eval.js
|
|
702
|
-
├── data/
|
|
703
|
-
│ └── dataset.json
|
|
704
|
-
└── package.json
|
|
705
|
-
```
|
|
706
|
-
|
|
707
|
-
Run with:
|
|
708
|
-
|
|
709
|
-
```bash
|
|
710
|
-
npx evalsense run tests/
|
|
711
|
-
```
|
|
712
|
-
|
|
713
|
-
## Examples
|
|
714
|
-
|
|
715
|
-
See the [`examples/`](./examples/) directory for complete examples:
|
|
716
|
-
|
|
717
|
-
- [`classification.eval.js`](./examples/basic/classification.eval.js) - Binary sentiment classification
|
|
718
|
-
- [`hallucination.eval.js`](./examples/basic/hallucination.eval.js) - Continuous score binarization
|
|
719
|
-
- [`distribution-assertions.eval.js`](./examples/distribution-assertions.eval.js) - Distribution monitoring without ground truth
|
|
720
|
-
- [`judge-validation.eval.js`](./examples/judge-validation.eval.js) - Validating judges against human labels
|
|
721
|
-
|
|
722
|
-
## Field Types
|
|
723
|
-
|
|
724
|
-
evalsense automatically determines evaluation metrics based on field values:
|
|
725
|
-
|
|
726
|
-
- **Boolean** (`true`/`false`) → Binary classification metrics
|
|
727
|
-
- **Categorical** (strings) → Multi-class classification metrics
|
|
728
|
-
- **Numeric** (numbers) → Regression metrics (MAE, MSE, RMSE, R²)
|
|
729
|
-
- **Numeric + threshold** → Binarized classification metrics
|
|
730
|
-
|
|
731
|
-
## LLM-Based Metrics (v0.2.0+)
|
|
732
|
-
|
|
733
|
-
evalsense includes LLM-powered metrics for hallucination detection, relevance assessment, faithfulness verification, and toxicity detection.
|
|
734
|
-
|
|
735
|
-
### Quick Setup
|
|
736
|
-
|
|
737
|
-
```javascript
|
|
738
|
-
import { setLLMClient, createOpenAIAdapter } from "evalsense/metrics";
|
|
739
|
-
import { hallucination, relevance, faithfulness, toxicity } from "evalsense/metrics/opinionated";
|
|
740
|
-
|
|
741
|
-
// 1. Configure your LLM client (one-time setup)
|
|
742
|
-
setLLMClient(
|
|
743
|
-
createOpenAIAdapter(process.env.OPENAI_API_KEY, {
|
|
744
|
-
model: "gpt-4-turbo-preview",
|
|
745
|
-
temperature: 0,
|
|
746
|
-
})
|
|
747
|
-
);
|
|
748
|
-
|
|
749
|
-
// 2. Use metrics in evaluations
|
|
750
|
-
const results = await hallucination({
|
|
751
|
-
outputs: [{ id: "1", output: "Paris has 50 million people." }],
|
|
752
|
-
context: ["Paris has approximately 2.1 million residents."],
|
|
753
|
-
});
|
|
754
|
-
|
|
755
|
-
console.log(results[0].score); // 0.9 (high hallucination)
|
|
756
|
-
console.log(results[0].reasoning); // "Output claims 50M, context says 2.1M"
|
|
757
|
-
```
|
|
758
|
-
|
|
759
|
-
### Available Metrics
|
|
760
|
-
|
|
761
|
-
- **`hallucination()`** - Detects claims not supported by context
|
|
762
|
-
- **`relevance()`** - Measures query-response alignment
|
|
763
|
-
- **`faithfulness()`** - Verifies outputs don't contradict sources
|
|
764
|
-
- **`toxicity()`** - Identifies harmful or inappropriate content
|
|
765
|
-
|
|
766
|
-
### Evaluation Modes
|
|
767
|
-
|
|
768
|
-
Choose between accuracy and cost:
|
|
769
|
-
|
|
770
|
-
```javascript
|
|
771
|
-
// Per-row: Higher accuracy, higher cost (N API calls)
|
|
772
|
-
await hallucination({
|
|
773
|
-
outputs,
|
|
774
|
-
context,
|
|
775
|
-
evaluationMode: "per-row", // default
|
|
776
|
-
});
|
|
777
|
-
|
|
778
|
-
// Batch: Lower cost, single API call
|
|
779
|
-
await hallucination({
|
|
780
|
-
outputs,
|
|
781
|
-
context,
|
|
782
|
-
evaluationMode: "batch",
|
|
783
|
-
});
|
|
784
|
-
```
|
|
785
|
-
|
|
786
|
-
### Built-in Provider Adapters
|
|
787
|
-
|
|
788
|
-
evalsense includes ready-to-use adapters for popular LLM providers:
|
|
789
|
-
|
|
790
|
-
**OpenAI (GPT-4, GPT-3.5)**
|
|
791
|
-
|
|
792
|
-
```javascript
|
|
793
|
-
import { createOpenAIAdapter } from "evalsense/metrics";
|
|
794
|
-
|
|
795
|
-
// npm install openai
|
|
796
|
-
setLLMClient(
|
|
797
|
-
createOpenAIAdapter(process.env.OPENAI_API_KEY, {
|
|
798
|
-
model: "gpt-4-turbo-preview", // or "gpt-3.5-turbo" for lower cost
|
|
799
|
-
temperature: 0,
|
|
800
|
-
maxTokens: 4096,
|
|
801
|
-
})
|
|
802
|
-
);
|
|
803
|
-
```
|
|
804
|
-
|
|
805
|
-
**Anthropic (Claude)**
|
|
806
|
-
|
|
807
|
-
```javascript
|
|
808
|
-
import { createAnthropicAdapter } from "evalsense/metrics";
|
|
809
|
-
|
|
810
|
-
// npm install @anthropic-ai/sdk
|
|
811
|
-
setLLMClient(
|
|
812
|
-
createAnthropicAdapter(process.env.ANTHROPIC_API_KEY, {
|
|
813
|
-
model: "claude-3-5-sonnet-20241022", // or "claude-3-haiku-20240307" for speed
|
|
814
|
-
maxTokens: 4096,
|
|
815
|
-
})
|
|
816
|
-
);
|
|
817
|
-
```
|
|
818
|
-
|
|
819
|
-
**OpenRouter (100+ models from one API)**
|
|
820
|
-
|
|
821
|
-
```javascript
|
|
822
|
-
import { createOpenRouterAdapter } from "evalsense/metrics";
|
|
823
|
-
|
|
824
|
-
// No SDK needed - uses fetch
|
|
825
|
-
setLLMClient(
|
|
826
|
-
createOpenRouterAdapter(process.env.OPENROUTER_API_KEY, {
|
|
827
|
-
model: "anthropic/claude-3.5-sonnet", // or "openai/gpt-3.5-turbo", etc.
|
|
828
|
-
temperature: 0,
|
|
829
|
-
appName: "my-eval-system",
|
|
830
|
-
})
|
|
831
|
-
);
|
|
832
|
-
```
|
|
833
|
-
|
|
834
|
-
**Custom Adapter (for any provider)**
|
|
835
|
-
|
|
836
|
-
```javascript
|
|
837
|
-
setLLMClient({
|
|
838
|
-
async complete(prompt) {
|
|
839
|
-
// Implement for your LLM provider
|
|
840
|
-
const response = await yourLLM.generate(prompt);
|
|
841
|
-
return response.text;
|
|
842
|
-
},
|
|
843
|
-
});
|
|
844
|
-
```
|
|
845
|
-
|
|
846
|
-
### Learn More
|
|
155
|
+
## Contributing
|
|
847
156
|
|
|
848
|
-
|
|
849
|
-
- [LLM Adapters Guide](./docs/llm-adapters.md) - Implement adapters for different providers
|
|
850
|
-
- [Migration Guide](./docs/migration-v0.2.md) - Upgrade from v0.1.x
|
|
851
|
-
- [Examples](./examples/) - Working code examples
|
|
157
|
+
Contributions are welcome. See [CONTRIBUTING.md](./CONTRIBUTING.md) for setup, coding standards, and the PR process.
|
|
852
158
|
|
|
853
|
-
##
|
|
159
|
+
## License
|
|
854
160
|
|
|
855
|
-
|
|
161
|
+
[Apache 2.0](./LICENSE)
|