evalsense 0.2.1 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +99 -82
- package/dist/{chunk-HDJID3GC.cjs → chunk-BE7CB3AM.cjs} +39 -28
- package/dist/chunk-BE7CB3AM.cjs.map +1 -0
- package/dist/chunk-DGUM43GV.js +10 -0
- package/dist/chunk-DGUM43GV.js.map +1 -0
- package/dist/chunk-JEQ2X3Z6.cjs +12 -0
- package/dist/chunk-JEQ2X3Z6.cjs.map +1 -0
- package/dist/{chunk-5P7LNNO6.js → chunk-K6QPJ2NO.js} +39 -28
- package/dist/chunk-K6QPJ2NO.js.map +1 -0
- package/dist/{chunk-Y23VHTD3.cjs → chunk-RZFLCWTW.cjs} +2 -2
- package/dist/chunk-RZFLCWTW.cjs.map +1 -0
- package/dist/{chunk-BRPM6AB6.js → chunk-Z3U6AUWX.js} +2 -2
- package/dist/chunk-Z3U6AUWX.js.map +1 -0
- package/dist/cli.cjs +39 -36
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +37 -34
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +320 -104
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +93 -7
- package/dist/index.d.ts +93 -7
- package/dist/index.js +242 -26
- package/dist/index.js.map +1 -1
- package/dist/metrics/index.cjs +257 -17
- package/dist/metrics/index.cjs.map +1 -1
- package/dist/metrics/index.d.cts +252 -1
- package/dist/metrics/index.d.ts +252 -1
- package/dist/metrics/index.js +240 -2
- package/dist/metrics/index.js.map +1 -1
- package/dist/metrics/opinionated/index.cjs +6 -5
- package/dist/metrics/opinionated/index.js +2 -1
- package/package.json +4 -3
- package/dist/chunk-5P7LNNO6.js.map +0 -1
- package/dist/chunk-BRPM6AB6.js.map +0 -1
- package/dist/chunk-HDJID3GC.cjs.map +0 -1
- package/dist/chunk-Y23VHTD3.cjs.map +0 -1
package/README.md
CHANGED
|
@@ -3,10 +3,12 @@
|
|
|
3
3
|
> JS-native LLM evaluation framework with Jest-like API and statistical assertions
|
|
4
4
|
|
|
5
5
|
[](https://www.npmjs.com/package/evalsense)
|
|
6
|
-
[](https://opensource.org/licenses/Apache-2.0)
|
|
7
7
|
|
|
8
8
|
**evalsense** brings classical ML-style statistical evaluation to LLM systems in JavaScript. Instead of evaluating individual test cases, evalsense evaluates entire datasets and computes confusion matrices, precision/recall, F1 scores, and other statistical metrics.
|
|
9
9
|
|
|
10
|
+
> **New in v0.3.0:** Regression assertions (MAE, RMSE, R²) and flexible ID matching for custom identifier fields! [See migration guide](./docs/migration-v0.3.0.md).
|
|
11
|
+
> **New in v0.2.x:** Built-in adapters for OpenAI, Anthropic, and OpenRouter - no boilerplate needed!
|
|
10
12
|
> **New in v0.2.0:** LLM-powered metrics for hallucination, relevance, faithfulness, and toxicity detection. [See migration guide](./docs/migration-v0.2.md).
|
|
11
13
|
|
|
12
14
|
## Why evalsense?
|
|
@@ -57,7 +59,7 @@ function classifySentiment(record) {
|
|
|
57
59
|
|
|
58
60
|
return {
|
|
59
61
|
id: record.id,
|
|
60
|
-
sentiment: hasPositive && !hasNegative ? "positive" : "negative"
|
|
62
|
+
sentiment: hasPositive && !hasNegative ? "positive" : "negative",
|
|
61
63
|
};
|
|
62
64
|
}
|
|
63
65
|
|
|
@@ -109,14 +111,14 @@ describe("Spam classifier", () => {
|
|
|
109
111
|
|
|
110
112
|
const result = await runModel(dataset, (record) => ({
|
|
111
113
|
id: record.id,
|
|
112
|
-
isSpam: classifyEmail(record.text)
|
|
114
|
+
isSpam: classifyEmail(record.text),
|
|
113
115
|
}));
|
|
114
116
|
|
|
115
117
|
expectStats(result)
|
|
116
118
|
.field("isSpam")
|
|
117
119
|
.toHaveAccuracyAbove(0.9)
|
|
118
|
-
.toHavePrecisionAbove(true, 0.85)
|
|
119
|
-
.toHaveRecallAbove(true, 0.85)
|
|
120
|
+
.toHavePrecisionAbove(true, 0.85) // Precision for spam=true
|
|
121
|
+
.toHaveRecallAbove(true, 0.85) // Recall for spam=true
|
|
120
122
|
.toHaveConfusionMatrix();
|
|
121
123
|
});
|
|
122
124
|
});
|
|
@@ -134,13 +136,13 @@ describe("Hallucination detector", () => {
|
|
|
134
136
|
// Your model returns a continuous score
|
|
135
137
|
const result = await runModel(dataset, (record) => ({
|
|
136
138
|
id: record.id,
|
|
137
|
-
hallucinated: computeHallucinationScore(record.output)
|
|
139
|
+
hallucinated: computeHallucinationScore(record.output), // 0.0 to 1.0
|
|
138
140
|
}));
|
|
139
141
|
|
|
140
142
|
// Binarize the score at threshold 0.3
|
|
141
143
|
expectStats(result)
|
|
142
144
|
.field("hallucinated")
|
|
143
|
-
.binarize(0.3)
|
|
145
|
+
.binarize(0.3) // >= 0.3 means hallucinated
|
|
144
146
|
.toHaveRecallAbove(true, 0.7)
|
|
145
147
|
.toHavePrecisionAbove(true, 0.6)
|
|
146
148
|
.toHaveConfusionMatrix();
|
|
@@ -159,7 +161,7 @@ describe("Intent classifier", () => {
|
|
|
159
161
|
|
|
160
162
|
const result = await runModel(dataset, (record) => ({
|
|
161
163
|
id: record.id,
|
|
162
|
-
intent: classifyIntent(record.query)
|
|
164
|
+
intent: classifyIntent(record.query),
|
|
163
165
|
}));
|
|
164
166
|
|
|
165
167
|
expectStats(result)
|
|
@@ -191,12 +193,10 @@ describe("LLM classifier", () => {
|
|
|
191
193
|
const response = await callLLM(record.text);
|
|
192
194
|
return { id: record.id, category: response.category };
|
|
193
195
|
},
|
|
194
|
-
5
|
|
196
|
+
5 // concurrency limit
|
|
195
197
|
);
|
|
196
198
|
|
|
197
|
-
expectStats(result)
|
|
198
|
-
.field("category")
|
|
199
|
-
.toHaveAccuracyAbove(0.9);
|
|
199
|
+
expectStats(result).field("category").toHaveAccuracyAbove(0.9);
|
|
200
200
|
});
|
|
201
201
|
});
|
|
202
202
|
```
|
|
@@ -283,6 +283,7 @@ npx evalsense list tests/
|
|
|
283
283
|
### Core API
|
|
284
284
|
|
|
285
285
|
#### `describe(name, fn)`
|
|
286
|
+
|
|
286
287
|
Groups related evaluation tests (like Jest's describe).
|
|
287
288
|
|
|
288
289
|
```javascript
|
|
@@ -292,6 +293,7 @@ describe("My model", () => {
|
|
|
292
293
|
```
|
|
293
294
|
|
|
294
295
|
#### `evalTest(name, fn)` / `test(name, fn)` / `it(name, fn)`
|
|
296
|
+
|
|
295
297
|
Defines an evaluation test.
|
|
296
298
|
|
|
297
299
|
```javascript
|
|
@@ -303,6 +305,7 @@ evalTest("should have 90% accuracy", async () => {
|
|
|
303
305
|
### Dataset Functions
|
|
304
306
|
|
|
305
307
|
#### `loadDataset(path)`
|
|
308
|
+
|
|
306
309
|
Loads a dataset from a JSON file. Records must have an `id` or `_id` field.
|
|
307
310
|
|
|
308
311
|
```javascript
|
|
@@ -310,44 +313,45 @@ const dataset = loadDataset("./data.json");
|
|
|
310
313
|
```
|
|
311
314
|
|
|
312
315
|
#### `runModel(dataset, modelFn)`
|
|
316
|
+
|
|
313
317
|
Runs a model function on each record sequentially.
|
|
314
318
|
|
|
315
319
|
```javascript
|
|
316
320
|
const result = await runModel(dataset, (record) => ({
|
|
317
321
|
id: record.id,
|
|
318
|
-
prediction: classify(record.text)
|
|
322
|
+
prediction: classify(record.text),
|
|
319
323
|
}));
|
|
320
324
|
```
|
|
321
325
|
|
|
322
326
|
#### `runModelParallel(dataset, modelFn, concurrency)`
|
|
327
|
+
|
|
323
328
|
Runs a model function with parallel execution.
|
|
324
329
|
|
|
325
330
|
```javascript
|
|
326
|
-
const result = await runModelParallel(dataset, modelFn, 10);
|
|
331
|
+
const result = await runModelParallel(dataset, modelFn, 10); // concurrency=10
|
|
327
332
|
```
|
|
328
333
|
|
|
329
334
|
### Assertions
|
|
330
335
|
|
|
331
336
|
#### `expectStats(result)`
|
|
337
|
+
|
|
332
338
|
Creates a statistical assertion chain from model results.
|
|
333
339
|
|
|
334
340
|
```javascript
|
|
335
|
-
expectStats(result)
|
|
336
|
-
.field("prediction")
|
|
337
|
-
.toHaveAccuracyAbove(0.8);
|
|
341
|
+
expectStats(result).field("prediction").toHaveAccuracyAbove(0.8);
|
|
338
342
|
```
|
|
339
343
|
|
|
340
344
|
#### `expectStats(predictions, groundTruth)`
|
|
345
|
+
|
|
341
346
|
Two-argument form for judge validation. Aligns predictions with ground truth by `id` field.
|
|
342
347
|
|
|
343
348
|
```javascript
|
|
344
349
|
// Validate judge outputs against human labels
|
|
345
|
-
expectStats(judgeOutputs, humanLabels)
|
|
346
|
-
.field("label")
|
|
347
|
-
.toHaveAccuracyAbove(0.85);
|
|
350
|
+
expectStats(judgeOutputs, humanLabels).field("label").toHaveAccuracyAbove(0.85);
|
|
348
351
|
```
|
|
349
352
|
|
|
350
353
|
**When to use:**
|
|
354
|
+
|
|
351
355
|
- Validating LLM judges against human labels
|
|
352
356
|
- Evaluating metric quality
|
|
353
357
|
- Testing automated detection systems
|
|
@@ -355,19 +359,21 @@ expectStats(judgeOutputs, humanLabels)
|
|
|
355
359
|
### Field Selection
|
|
356
360
|
|
|
357
361
|
#### `.field(fieldName)`
|
|
362
|
+
|
|
358
363
|
Selects a field for evaluation.
|
|
359
364
|
|
|
360
365
|
```javascript
|
|
361
|
-
expectStats(result).field("sentiment")
|
|
366
|
+
expectStats(result).field("sentiment");
|
|
362
367
|
```
|
|
363
368
|
|
|
364
369
|
#### `.binarize(threshold)`
|
|
370
|
+
|
|
365
371
|
Converts continuous scores to binary (>=threshold is true).
|
|
366
372
|
|
|
367
373
|
```javascript
|
|
368
374
|
expectStats(result)
|
|
369
375
|
.field("score")
|
|
370
|
-
.binarize(0.5)
|
|
376
|
+
.binarize(0.5) // score >= 0.5 is true
|
|
371
377
|
.toHaveAccuracyAbove(0.8);
|
|
372
378
|
```
|
|
373
379
|
|
|
@@ -403,23 +409,20 @@ Distribution assertions validate output distributions **without requiring ground
|
|
|
403
409
|
|
|
404
410
|
```javascript
|
|
405
411
|
// Assert that at least 80% of confidence scores are above 0.7
|
|
406
|
-
expectStats(predictions)
|
|
407
|
-
.field("confidence")
|
|
408
|
-
.toHavePercentageAbove(0.7, 0.8);
|
|
412
|
+
expectStats(predictions).field("confidence").toHavePercentageAbove(0.7, 0.8);
|
|
409
413
|
|
|
410
414
|
// Assert that at least 90% of toxicity scores are below 0.3
|
|
411
|
-
expectStats(predictions)
|
|
412
|
-
.field("toxicity")
|
|
413
|
-
.toHavePercentageBelow(0.3, 0.9);
|
|
415
|
+
expectStats(predictions).field("toxicity").toHavePercentageBelow(0.3, 0.9);
|
|
414
416
|
|
|
415
417
|
// Chain multiple distribution assertions
|
|
416
418
|
expectStats(predictions)
|
|
417
419
|
.field("score")
|
|
418
|
-
.toHavePercentageAbove(0.5, 0.6)
|
|
420
|
+
.toHavePercentageAbove(0.5, 0.6) // At least 60% above 0.5
|
|
419
421
|
.toHavePercentageBelow(0.9, 0.8); // At least 80% below 0.9
|
|
420
422
|
```
|
|
421
423
|
|
|
422
424
|
**Use cases:**
|
|
425
|
+
|
|
423
426
|
- Monitor confidence score distributions
|
|
424
427
|
- Validate schema compliance rates
|
|
425
428
|
- Check output range constraints
|
|
@@ -436,35 +439,35 @@ Validate judge outputs against human-labeled ground truth using the **two-argume
|
|
|
436
439
|
const judgeOutputs = [
|
|
437
440
|
{ id: "1", hallucinated: true },
|
|
438
441
|
{ id: "2", hallucinated: false },
|
|
439
|
-
{ id: "3", hallucinated: true }
|
|
442
|
+
{ id: "3", hallucinated: true },
|
|
440
443
|
];
|
|
441
444
|
|
|
442
445
|
// Human labels (ground truth)
|
|
443
446
|
const humanLabels = [
|
|
444
447
|
{ id: "1", hallucinated: true },
|
|
445
448
|
{ id: "2", hallucinated: false },
|
|
446
|
-
{ id: "3", hallucinated: false }
|
|
449
|
+
{ id: "3", hallucinated: false },
|
|
447
450
|
];
|
|
448
451
|
|
|
449
452
|
// Validate judge performance
|
|
450
453
|
expectStats(judgeOutputs, humanLabels)
|
|
451
454
|
.field("hallucinated")
|
|
452
|
-
.toHaveRecallAbove(true, 0.9)
|
|
453
|
-
.toHavePrecisionAbove(true, 0.7)
|
|
455
|
+
.toHaveRecallAbove(true, 0.9) // Don't miss hallucinations
|
|
456
|
+
.toHavePrecisionAbove(true, 0.7) // Some false positives OK
|
|
454
457
|
.toHaveConfusionMatrix();
|
|
455
458
|
```
|
|
456
459
|
|
|
457
460
|
**Use cases:**
|
|
461
|
+
|
|
458
462
|
- Evaluate LLM-as-judge accuracy
|
|
459
463
|
- Validate heuristic metrics against human labels
|
|
460
464
|
- Test automated detection systems (refusal, policy compliance)
|
|
461
465
|
- Calibrate metric thresholds
|
|
462
466
|
|
|
463
467
|
**Two-argument expectStats:**
|
|
468
|
+
|
|
464
469
|
```javascript
|
|
465
|
-
expectStats(actual, expected)
|
|
466
|
-
.field("fieldName")
|
|
467
|
-
.toHaveAccuracyAbove(0.8);
|
|
470
|
+
expectStats(actual, expected).field("fieldName").toHaveAccuracyAbove(0.8);
|
|
468
471
|
```
|
|
469
472
|
|
|
470
473
|
The first argument is your predictions (judge outputs), the second is ground truth (human labels). Both must have matching `id` fields for alignment.
|
|
@@ -493,6 +496,7 @@ Datasets must be JSON arrays where each record has an `id` or `_id` field:
|
|
|
493
496
|
```
|
|
494
497
|
|
|
495
498
|
**Requirements:**
|
|
499
|
+
|
|
496
500
|
- Each record MUST have `id` or `_id` for alignment
|
|
497
501
|
- Ground truth fields (e.g., `label`, `sentiment`, `category`) are compared against model outputs
|
|
498
502
|
- Model functions must return predictions with matching `id`
|
|
@@ -522,6 +526,7 @@ project/
|
|
|
522
526
|
```
|
|
523
527
|
|
|
524
528
|
Run with:
|
|
529
|
+
|
|
525
530
|
```bash
|
|
526
531
|
npx evalsense run tests/
|
|
527
532
|
```
|
|
@@ -551,26 +556,25 @@ evalsense includes LLM-powered metrics for hallucination detection, relevance as
|
|
|
551
556
|
### Quick Setup
|
|
552
557
|
|
|
553
558
|
```javascript
|
|
554
|
-
import { setLLMClient } from "evalsense/metrics";
|
|
559
|
+
import { setLLMClient, createOpenAIAdapter } from "evalsense/metrics";
|
|
555
560
|
import { hallucination, relevance, faithfulness, toxicity } from "evalsense/metrics/opinionated";
|
|
556
561
|
|
|
557
562
|
// 1. Configure your LLM client (one-time setup)
|
|
558
|
-
setLLMClient(
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
});
|
|
563
|
+
setLLMClient(
|
|
564
|
+
createOpenAIAdapter(process.env.OPENAI_API_KEY, {
|
|
565
|
+
model: "gpt-4-turbo-preview",
|
|
566
|
+
temperature: 0,
|
|
567
|
+
})
|
|
568
|
+
);
|
|
565
569
|
|
|
566
570
|
// 2. Use metrics in evaluations
|
|
567
571
|
const results = await hallucination({
|
|
568
572
|
outputs: [{ id: "1", output: "Paris has 50 million people." }],
|
|
569
|
-
context: ["Paris has approximately 2.1 million residents."]
|
|
573
|
+
context: ["Paris has approximately 2.1 million residents."],
|
|
570
574
|
});
|
|
571
575
|
|
|
572
|
-
console.log(results[0].score);
|
|
573
|
-
console.log(results[0].reasoning);
|
|
576
|
+
console.log(results[0].score); // 0.9 (high hallucination)
|
|
577
|
+
console.log(results[0].reasoning); // "Output claims 50M, context says 2.1M"
|
|
574
578
|
```
|
|
575
579
|
|
|
576
580
|
### Available Metrics
|
|
@@ -589,62 +593,74 @@ Choose between accuracy and cost:
|
|
|
589
593
|
await hallucination({
|
|
590
594
|
outputs,
|
|
591
595
|
context,
|
|
592
|
-
evaluationMode: "per-row"
|
|
596
|
+
evaluationMode: "per-row", // default
|
|
593
597
|
});
|
|
594
598
|
|
|
595
599
|
// Batch: Lower cost, single API call
|
|
596
600
|
await hallucination({
|
|
597
601
|
outputs,
|
|
598
602
|
context,
|
|
599
|
-
evaluationMode: "batch"
|
|
603
|
+
evaluationMode: "batch",
|
|
600
604
|
});
|
|
601
605
|
```
|
|
602
606
|
|
|
603
|
-
### Provider
|
|
607
|
+
### Built-in Provider Adapters
|
|
608
|
+
|
|
609
|
+
evalsense includes ready-to-use adapters for popular LLM providers:
|
|
610
|
+
|
|
611
|
+
**OpenAI (GPT-4, GPT-3.5)**
|
|
604
612
|
|
|
605
|
-
**OpenAI:**
|
|
606
613
|
```javascript
|
|
607
|
-
import
|
|
614
|
+
import { createOpenAIAdapter } from "evalsense/metrics";
|
|
608
615
|
|
|
609
|
-
|
|
610
|
-
setLLMClient(
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
}
|
|
618
|
-
});
|
|
616
|
+
// npm install openai
|
|
617
|
+
setLLMClient(
|
|
618
|
+
createOpenAIAdapter(process.env.OPENAI_API_KEY, {
|
|
619
|
+
model: "gpt-4-turbo-preview", // or "gpt-3.5-turbo" for lower cost
|
|
620
|
+
temperature: 0,
|
|
621
|
+
maxTokens: 4096,
|
|
622
|
+
})
|
|
623
|
+
);
|
|
619
624
|
```
|
|
620
625
|
|
|
621
|
-
**Anthropic
|
|
626
|
+
**Anthropic (Claude)**
|
|
627
|
+
|
|
622
628
|
```javascript
|
|
623
|
-
import
|
|
629
|
+
import { createAnthropicAdapter } from "evalsense/metrics";
|
|
624
630
|
|
|
625
|
-
|
|
626
|
-
setLLMClient(
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
631
|
+
// npm install @anthropic-ai/sdk
|
|
632
|
+
setLLMClient(
|
|
633
|
+
createAnthropicAdapter(process.env.ANTHROPIC_API_KEY, {
|
|
634
|
+
model: "claude-3-5-sonnet-20241022", // or "claude-3-haiku-20240307" for speed
|
|
635
|
+
maxTokens: 4096,
|
|
636
|
+
})
|
|
637
|
+
);
|
|
638
|
+
```
|
|
639
|
+
|
|
640
|
+
**OpenRouter (100+ models from one API)**
|
|
641
|
+
|
|
642
|
+
```javascript
|
|
643
|
+
import { createOpenRouterAdapter } from "evalsense/metrics";
|
|
644
|
+
|
|
645
|
+
// No SDK needed - uses fetch
|
|
646
|
+
setLLMClient(
|
|
647
|
+
createOpenRouterAdapter(process.env.OPENROUTER_API_KEY, {
|
|
648
|
+
model: "anthropic/claude-3.5-sonnet", // or "openai/gpt-3.5-turbo", etc.
|
|
649
|
+
temperature: 0,
|
|
650
|
+
appName: "my-eval-system",
|
|
651
|
+
})
|
|
652
|
+
);
|
|
636
653
|
```
|
|
637
654
|
|
|
638
|
-
**
|
|
655
|
+
**Custom Adapter (for any provider)**
|
|
656
|
+
|
|
639
657
|
```javascript
|
|
640
658
|
setLLMClient({
|
|
641
659
|
async complete(prompt) {
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
return (await response.json()).response;
|
|
647
|
-
}
|
|
660
|
+
// Implement for your LLM provider
|
|
661
|
+
const response = await yourLLM.generate(prompt);
|
|
662
|
+
return response.text;
|
|
663
|
+
},
|
|
648
664
|
});
|
|
649
665
|
```
|
|
650
666
|
|
|
@@ -660,6 +676,7 @@ setLLMClient({
|
|
|
660
676
|
evalsense is built on the principle that **metrics are predictions, not facts**.
|
|
661
677
|
|
|
662
678
|
Instead of treating LLM-as-judge metrics (relevance, hallucination, etc.) as ground truth, evalsense:
|
|
679
|
+
|
|
663
680
|
- Treats them as **weak labels** from a model
|
|
664
681
|
- Validates them statistically against human references when available
|
|
665
682
|
- Computes confusion matrices to reveal bias and systematic errors
|
|
package/dist/{chunk-HDJID3GC.cjs → chunk-BE7CB3AM.cjs}
CHANGED
@@ -146,10 +146,7 @@ function getSupport(cm, label) {
|
|
|
146
146
|
}
|
|
147
147
|
function formatConfusionMatrix(cm) {
|
|
148
148
|
const maxLabelLen = Math.max(...cm.labels.map((l) => l.length), 8);
|
|
149
|
-
const colWidth = Math.max(
|
|
150
|
-
...cm.matrix.flat().map((n) => String(n).length),
|
|
151
|
-
maxLabelLen
|
|
152
|
-
);
|
|
149
|
+
const colWidth = Math.max(...cm.matrix.flat().map((n) => String(n).length), maxLabelLen);
|
|
153
150
|
const header = " ".repeat(maxLabelLen + 2) + cm.labels.map((l) => l.padStart(colWidth)).join(" ");
|
|
154
151
|
const rows = cm.labels.map((label, i) => {
|
|
155
152
|
const rowData = cm.matrix[i].map((n) => String(n).padStart(colWidth)).join(" ");
|
|
@@ -261,7 +258,7 @@ var ConsoleReporter = class {
|
|
|
261
258
|
*/
|
|
262
259
|
printHeader(fileCount) {
|
|
263
260
|
this.log("");
|
|
264
|
-
this.log(this.color("bold", `EvalSense v0.1
|
|
261
|
+
this.log(this.color("bold", `EvalSense v0.3.1`));
|
|
265
262
|
this.log(this.color("dim", `Running ${fileCount} eval file(s)...`));
|
|
266
263
|
this.log("");
|
|
267
264
|
}
|
|
@@ -296,14 +293,23 @@ var ConsoleReporter = class {
|
|
|
296
293
|
for (const fm of test.fieldMetrics) {
|
|
297
294
|
this.printFieldMetrics(fm);
|
|
298
295
|
}
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
296
|
+
for (const fm of test.fieldMetrics) {
|
|
297
|
+
if (fm.metrics.confusionMatrix && Object.keys(fm.metrics.confusionMatrix).length > 0) {
|
|
298
|
+
this.printConfusionMatrix(fm);
|
|
299
|
+
}
|
|
300
|
+
}
|
|
301
|
+
if (test.error) {
|
|
302
|
+
const prefix = test.status === "error" ? "Error" : "Assertion Failed";
|
|
303
|
+
this.log(this.color("red", ` ${prefix}: ${test.error.message}`));
|
|
304
|
+
this.log("");
|
|
303
305
|
}
|
|
304
306
|
for (const assertion of test.assertions) {
|
|
305
307
|
if (!assertion.passed) {
|
|
306
|
-
this.log(this.color("red", ` ${assertion.message}`));
|
|
308
|
+
this.log(this.color("red", ` \u2717 ${assertion.message}`));
|
|
309
|
+
if (assertion.expected !== void 0 && assertion.actual !== void 0) {
|
|
310
|
+
this.log(this.color("dim", ` Expected: ${this.formatValue(assertion.expected)}`));
|
|
311
|
+
this.log(this.color("dim", ` Actual: ${this.formatValue(assertion.actual)}`));
|
|
312
|
+
}
|
|
307
313
|
}
|
|
308
314
|
}
|
|
309
315
|
}
|
|
@@ -419,6 +425,24 @@ var ConsoleReporter = class {
|
|
|
419
425
|
}
|
|
420
426
|
return `${colors[colorName]}${text}${colors.reset}`;
|
|
421
427
|
}
|
|
428
|
+
/**
|
|
429
|
+
* Formats a value for display
|
|
430
|
+
*/
|
|
431
|
+
formatValue(value) {
|
|
432
|
+
if (typeof value === "number") {
|
|
433
|
+
if (value >= 0 && value <= 1) {
|
|
434
|
+
return `${(value * 100).toFixed(1)}%`;
|
|
435
|
+
}
|
|
436
|
+
return value.toFixed(4);
|
|
437
|
+
}
|
|
438
|
+
if (typeof value === "string") {
|
|
439
|
+
return `"${value}"`;
|
|
440
|
+
}
|
|
441
|
+
if (Array.isArray(value)) {
|
|
442
|
+
return `[${value.join(", ")}]`;
|
|
443
|
+
}
|
|
444
|
+
return String(value);
|
|
445
|
+
}
|
|
422
446
|
/**
|
|
423
447
|
* Logs a line
|
|
424
448
|
*/
|
|
@@ -426,23 +450,10 @@ var ConsoleReporter = class {
|
|
|
426
450
|
console.log(message);
|
|
427
451
|
}
|
|
428
452
|
};
|
|
429
|
-
var DEFAULT_PATTERNS = [
|
|
430
|
-
|
|
431
|
-
"**/*.eval.ts",
|
|
432
|
-
"**/*.eval.mjs"
|
|
433
|
-
];
|
|
434
|
-
var DEFAULT_IGNORE = [
|
|
435
|
-
"**/node_modules/**",
|
|
436
|
-
"**/dist/**",
|
|
437
|
-
"**/build/**",
|
|
438
|
-
"**/.git/**"
|
|
439
|
-
];
|
|
453
|
+
var DEFAULT_PATTERNS = ["**/*.eval.js", "**/*.eval.ts", "**/*.eval.mjs"];
|
|
454
|
+
var DEFAULT_IGNORE = ["**/node_modules/**", "**/dist/**", "**/build/**", "**/.git/**"];
|
|
440
455
|
async function discoverEvalFiles(options = {}) {
|
|
441
|
-
const {
|
|
442
|
-
patterns = DEFAULT_PATTERNS,
|
|
443
|
-
ignore = DEFAULT_IGNORE,
|
|
444
|
-
cwd = process.cwd()
|
|
445
|
-
} = options;
|
|
456
|
+
const { patterns = DEFAULT_PATTERNS, ignore = DEFAULT_IGNORE, cwd = process.cwd() } = options;
|
|
446
457
|
const files = [];
|
|
447
458
|
for (const pattern of patterns) {
|
|
448
459
|
const matches = await glob.glob(pattern, {
|
|
@@ -775,5 +786,5 @@ exports.parseReport = parseReport;
|
|
|
775
786
|
exports.recordAssertion = recordAssertion;
|
|
776
787
|
exports.recordFieldMetrics = recordFieldMetrics;
|
|
777
788
|
exports.setCurrentSuite = setCurrentSuite;
|
|
778
|
-
//# sourceMappingURL=chunk-HDJID3GC.cjs.map
|
|
779
|
-
//# sourceMappingURL=chunk-HDJID3GC.cjs.map
|
|
789
|
+
//# sourceMappingURL=chunk-BE7CB3AM.cjs.map
|
|
790
|
+
//# sourceMappingURL=chunk-BE7CB3AM.cjs.map
|