evalsense 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,604 @@
1
+ import { T as TestFn, D as Dataset, P as Prediction, A as AlignedRecord, I as IntegrityResult, C as ClassificationMetrics, c as AssertionResult, d as ConfusionMatrix, E as EvalReport, F as FieldMetricResult } from './types-C71p0wzM.cjs';
2
+ export { e as CLIOptions, f as ClassMetrics, g as DatasetMetadata, h as EvalTest, i as ExitCode, j as ExitCodes, a as MetricConfig, M as MetricFn, b as MetricOutput, R as RegressionMetrics, S as Suite, k as SuiteResult, l as TestContext, m as TestResult } from './types-C71p0wzM.cjs';
3
+
4
+ /**
5
+ * describe() implementation - Jest-like test suite grouping
6
+ */
7
+
8
+ /**
9
+ * Creates a test suite that groups related eval tests
10
+ *
11
+ * @example
12
+ * ```ts
13
+ * describe("Sentiment classifier", () => {
14
+ * evalTest("accuracy above 80%", async () => {
15
+ * // test implementation
16
+ * });
17
+ * });
18
+ * ```
19
+ */
20
+ declare function describe(name: string, fn: () => void): void;
21
+ /**
22
+ * Lifecycle hook - runs once before all tests in the suite
23
+ */
24
+ declare function beforeAll(fn: TestFn): void;
25
+ /**
26
+ * Lifecycle hook - runs once after all tests in the suite
27
+ */
28
+ declare function afterAll(fn: TestFn): void;
29
+ /**
30
+ * Lifecycle hook - runs before each test in the suite
31
+ */
32
+ declare function beforeEach(fn: TestFn): void;
33
+ /**
34
+ * Lifecycle hook - runs after each test in the suite
35
+ */
36
+ declare function afterEach(fn: TestFn): void;
37
+
38
+ /**
39
+ * evalTest() implementation - defines an individual evaluation test
40
+ */
41
+
42
+ /**
43
+ * Defines an individual evaluation test within a describe() block
44
+ *
45
+ * @example
46
+ * ```ts
47
+ * evalTest("accuracy above 80%", async () => {
48
+ * const dataset = loadDataset("./data.json");
49
+ * const predictions = await runModel(dataset, classify);
50
+ *
51
+ * expectStats(predictions)
52
+ * .field("sentiment")
53
+ * .toHaveAccuracyAbove(0.8);
54
+ * });
55
+ * ```
56
+ */
57
+ declare function evalTest(name: string, fn: TestFn): void;
58
+ declare namespace evalTest {
59
+ var skip: typeof evalTestSkip;
60
+ var only: typeof evalTestOnly;
61
+ }
62
+ /**
63
+ * Aliases for evalTest - some users may prefer "test" or "it"
64
+ */
65
+ declare const test: typeof evalTest;
66
+ declare const it: typeof evalTest;
67
+ /**
68
+ * Skipped test - registers but doesn't run
69
+ */
70
+ declare function evalTestSkip(name: string, _fn: TestFn): void;
71
+ /**
72
+ * Focused test - only runs this test (TODO: implement filtering)
73
+ */
74
+ declare function evalTestOnly(name: string, fn: TestFn): void;
75
+
76
+ /**
77
+ * Dataset loading functionality
78
+ */
79
+
80
+ /**
81
+ * Loads a dataset from a JSON or NDJSON file
82
+ *
83
+ * @param path - Path to the dataset file (relative to cwd or absolute)
84
+ * @returns Dataset with records and metadata
85
+ *
86
+ * @example
87
+ * ```ts
88
+ * const dataset = loadDataset("./fixtures/sentiment.json");
89
+ * // dataset.records = [{ id: "1", text: "...", sentiment: "positive" }, ...]
90
+ * ```
91
+ */
92
+ declare function loadDataset<T extends Record<string, unknown> = Record<string, unknown>>(path: string): Dataset<T>;
93
+ /**
94
+ * Creates a dataset from an array of records (for testing/programmatic use)
95
+ */
96
+ declare function createDataset<T extends Record<string, unknown>>(records: T[], source?: string): Dataset<T>;
97
+
98
+ /**
99
+ * runModel() - executes a model function against a dataset
100
+ */
101
+
102
+ /**
103
+ * Model function signature - takes a record and returns a prediction
104
+ */
105
+ type ModelFn<T> = (record: T) => Prediction | Promise<Prediction>;
106
+ /**
107
+ * Result of running a model on a dataset (the value that runModel / runModelParallel resolve to)
108
+ */
109
+ interface ModelRunResult {
110
+ predictions: Prediction[];
111
+ aligned: AlignedRecord[];
112
+ duration: number; // NOTE(review): unit is not stated in this declaration - confirm (ms?) and document
113
+ }
114
+ /**
115
+ * Runs a model function against each record in a dataset
116
+ *
117
+ * @param dataset - The dataset to process
118
+ * @param modelFn - Function that processes each record and returns a prediction (directly or as a Promise)
119
+ * @returns Promise resolving to a ModelRunResult: `predictions`, `aligned` (actual vs expected) records, and `duration`
120
+ *
121
+ * @example
122
+ * ```ts
123
+ * const result = await runModel(dataset, (record) => ({
124
+ * id: record.id,
125
+ * sentiment: classify(record.text)
126
+ * }));
127
+ * ```
128
+ */
129
+ declare function runModel<T extends Record<string, unknown>>(dataset: Dataset<T>, modelFn: ModelFn<T>): Promise<ModelRunResult>;
130
+ /**
131
+ * Runs model in parallel with concurrency limit
132
+ */
133
+ declare function runModelParallel<T extends Record<string, unknown>>(dataset: Dataset<T>, modelFn: ModelFn<T>, concurrency?: number): Promise<ModelRunResult>;
134
+
135
+ /**
136
+ * Dataset alignment utilities
137
+ * Aligns predictions with ground truth by ID
138
+ */
139
+
140
+ /**
141
+ * Options for alignment
142
+ */
143
+ interface AlignOptions {
144
+ /** Whether to throw on missing IDs (default: false) */
145
+ strict?: boolean;
146
+ /** Field to use as ID (default: "id") */
147
+ idField?: string;
148
+ }
149
+ /**
150
+ * Aligns predictions with expected values by ID
151
+ *
152
+ * @param predictions - Model predictions with IDs
153
+ * @param expected - Ground truth records with IDs
154
+ * @param options - Alignment options
155
+ * @returns Array of aligned records
156
+ */
157
+ declare function alignByKey(predictions: Prediction[], expected: Array<Record<string, unknown>>, options?: AlignOptions): AlignedRecord[];
158
+ /**
159
+ * Extracts field values from aligned records for statistical analysis
160
+ *
161
+ * @param aligned - Aligned records
162
+ * @param field - Field name to extract
163
+ * @returns Object with `actual`, `expected`, and `ids` arrays
164
+ */
165
+ declare function extractFieldValues(aligned: AlignedRecord[], field: string): {
166
+ actual: unknown[];
167
+ expected: unknown[];
168
+ ids: string[];
169
+ };
170
+ /**
171
+ * Filters aligned records to only those with values in both actual and expected
172
+ */
173
+ declare function filterComplete(aligned: AlignedRecord[], field: string): AlignedRecord[];
174
+
175
+ /**
176
+ * Dataset integrity checks
177
+ */
178
+
179
+ /**
180
+ * Options for integrity checks
181
+ */
182
+ interface IntegrityOptions {
183
+ /** Required fields that must be present in each record */
184
+ requiredFields?: string[];
185
+ /** Whether to throw on integrity failures (default: false) */
186
+ throwOnFailure?: boolean;
187
+ }
188
+ /**
189
+ * Checks dataset integrity - validates IDs and required fields
190
+ *
191
+ * @param dataset - Dataset to check
192
+ * @param options - Integrity check options
193
+ * @returns Integrity result with details
194
+ */
195
+ declare function checkIntegrity<T extends Record<string, unknown>>(dataset: Dataset<T>, options?: IntegrityOptions): IntegrityResult;
196
+ /**
197
+ * Validates predictions against a list of expected IDs; returns a `valid` flag plus `missing` and `extra` string lists (presumably IDs - confirm)
198
+ */
199
+ declare function validatePredictions(predictions: Prediction[], expectedIds: string[]): {
200
+ valid: boolean;
201
+ missing: string[];
202
+ extra: string[];
203
+ };
204
+
205
+ /**
206
+ * Selector for binarized fields (continuous → binary threshold); this is the type returned by FieldSelector.binarize()
207
+ */
208
+ declare class BinarizeSelector {
209
+ private fieldName;
210
+ private threshold;
211
+ private binaryActual;
212
+ private binaryExpected;
213
+ private assertions;
214
+ constructor(aligned: AlignedRecord[], fieldName: string, threshold: number);
215
+ /**
216
+ * Asserts that accuracy is above a threshold (a metric threshold, separate from the constructor's binarization `threshold`)
217
+ */
218
+ toHaveAccuracyAbove(threshold: number): this;
219
+ /**
220
+ * Asserts that precision is above a threshold
221
+ * @param classOrThreshold - Either the class (true/false) or threshold
222
+ * @param threshold - Threshold when class is specified
223
+ */
224
+ toHavePrecisionAbove(classOrThreshold: boolean | number, threshold?: number): this;
225
+ /**
226
+ * Asserts that recall is above a threshold
227
+ * @param classOrThreshold - Either the class (true/false) or threshold
228
+ * @param threshold - Threshold when class is specified
229
+ */
230
+ toHaveRecallAbove(classOrThreshold: boolean | number, threshold?: number): this;
231
+ /**
232
+ * Asserts that F1 score is above a threshold. Same call signature as toHavePrecisionAbove / toHaveRecallAbove: `classOrThreshold` (boolean | number), then optional `threshold`
233
+ */
234
+ toHaveF1Above(classOrThreshold: boolean | number, threshold?: number): this;
235
+ /**
236
+ * Includes the confusion matrix in the report
237
+ */
238
+ toHaveConfusionMatrix(): this;
239
+ /**
240
+ * Gets computed metrics
241
+ */
242
+ getMetrics(): ClassificationMetrics;
243
+ /**
244
+ * Gets all assertions made
245
+ */
246
+ getAssertions(): AssertionResult[];
247
+ }
248
+
249
+ /**
250
+ * Field selector for building assertions on a specific field
251
+ */
252
+ declare class FieldSelector {
253
+ private aligned;
254
+ private fieldName;
255
+ private actualValues;
256
+ private expectedValues;
257
+ private assertions;
258
+ constructor(aligned: AlignedRecord[], fieldName: string);
259
+ /**
260
+ * Transforms continuous scores to binary classification using a threshold; returns a BinarizeSelector
261
+ */
262
+ binarize(threshold: number): BinarizeSelector;
263
+ /**
264
+ * Validates that ground truth exists for classification metrics.
265
+ * Throws a clear error if expected values are missing.
266
+ */
267
+ private validateGroundTruth;
268
+ /**
269
+ * Asserts that accuracy is above a threshold
270
+ */
271
+ toHaveAccuracyAbove(threshold: number): this;
272
+ /**
273
+ * Asserts that precision is above a threshold
274
+ * @param classOrThreshold - Either the class name or threshold (if class is omitted, uses macro average)
275
+ * @param threshold - Threshold when class is specified
276
+ */
277
+ toHavePrecisionAbove(classOrThreshold: string | number, threshold?: number): this;
278
+ /**
279
+ * Asserts that recall is above a threshold
280
+ * @param classOrThreshold - Either the class name or threshold (if class is omitted, uses macro average)
281
+ * @param threshold - Threshold when class is specified
282
+ */
283
+ toHaveRecallAbove(classOrThreshold: string | number, threshold?: number): this;
284
+ /**
285
+ * Asserts that F1 score is above a threshold
286
+ * @param classOrThreshold - Either the class name or threshold (if class is omitted, uses macro average)
287
+ * @param threshold - Threshold when class is specified
288
+ */
289
+ toHaveF1Above(classOrThreshold: string | number, threshold?: number): this;
290
+ /**
291
+ * Includes the confusion matrix in the report
292
+ */
293
+ toHaveConfusionMatrix(): this;
294
+ /**
295
+ * Asserts that a percentage of values are below or equal to a threshold.
296
+ * This is a distributional assertion that only looks at actual values (no ground truth required).
297
+ *
298
+ * @param valueThreshold - The value threshold to compare against
299
+ * @param percentageThreshold - The minimum fraction (0-1, e.g. 0.9 for 90%) of values that should be <= valueThreshold
300
+ * @returns this for method chaining
301
+ *
302
+ * @example
303
+ * // Assert that at least 90% of confidence scores are at or below 0.5
304
+ * expectStats(predictions)
305
+ * .field("confidence")
306
+ * .toHavePercentageBelow(0.5, 0.9)
307
+ */
308
+ toHavePercentageBelow(valueThreshold: number, percentageThreshold: number): this;
309
+ /**
310
+ * Asserts that a percentage of values are above a threshold.
311
+ * This is a distributional assertion that only looks at actual values (no ground truth required).
312
+ *
313
+ * @param valueThreshold - The value threshold to compare against
314
+ * @param percentageThreshold - The minimum fraction (0-1, e.g. 0.8 for 80%) of values that should be > valueThreshold
315
+ * @returns this for method chaining
316
+ *
317
+ * @example
318
+ * // Assert that at least 80% of quality scores are above 0.7
319
+ * expectStats(predictions)
320
+ * .field("quality")
321
+ * .toHavePercentageAbove(0.7, 0.8)
322
+ */
323
+ toHavePercentageAbove(valueThreshold: number, percentageThreshold: number): this;
324
+ /**
325
+ * Gets the computed metrics for this field
326
+ */
327
+ getMetrics(): ClassificationMetrics;
328
+ /**
329
+ * Gets all assertions made on this field
330
+ */
331
+ getAssertions(): AssertionResult[];
332
+ }
333
+
334
+ /**
335
+ * expectStats() - fluent assertion API for statistical evaluation
336
+ */
337
+
338
+ /**
339
+ * Input types that expectStats() accepts
340
+ */
341
+ type StatsInput = ModelRunResult | Prediction[] | AlignedRecord[];
342
+ /**
343
+ * Entry point for statistical assertions.
344
+ *
345
+ * Supports two usage patterns:
346
+ * 1. Single argument: predictions without ground truth (for distribution assertions)
347
+ * 2. Two arguments: predictions with ground truth (for classification metrics)
348
+ *
349
+ * @param inputOrActual - Either StatsInput (one-arg overload, declared as `input`) or Prediction[] (two-arg overload, declared as `actual`)
350
+ * @param expected - Ground truth data (optional, only for two-arg usage)
351
+ * @returns ExpectStats instance for chaining assertions
352
+ *
353
+ * @example
354
+ * // Pattern 1: Distribution assertions (no ground truth)
355
+ * expectStats(predictions)
356
+ * .field("confidence")
357
+ * .toHavePercentageBelow(0.5, 0.9);
358
+ *
359
+ * @example
360
+ * // Pattern 2: Judge validation (with ground truth). NOTE(review): FieldSelector types this argument as `string | number`, so the `true` literals below do not match the declared signature - confirm intended usage
361
+ * expectStats(judgeOutputs, humanLabels)
362
+ * .field("hallucinated")
363
+ * .toHaveRecallAbove(true, 0.85)
364
+ * .toHavePrecisionAbove(true, 0.8);
365
+ */
366
+ declare function expectStats(input: StatsInput): ExpectStats;
367
+ declare function expectStats(actual: Prediction[], expected: Array<Record<string, unknown>>): ExpectStats;
368
+ /**
369
+ * Main stats expectation class
370
+ */
371
+ declare class ExpectStats {
372
+ private aligned;
373
+ constructor(aligned: AlignedRecord[]);
374
+ /**
375
+ * Selects a field to evaluate
376
+ */
377
+ field(fieldName: string): FieldSelector;
378
+ /**
379
+ * Gets the raw aligned records (for advanced use)
380
+ */
381
+ getAligned(): AlignedRecord[];
382
+ /**
383
+ * Gets the count of records
384
+ */
385
+ count(): number;
386
+ }
387
+
388
+ /**
389
+ * Confusion matrix computation
390
+ */
391
+
392
+ /**
393
+ * Builds a confusion matrix from actual (predicted) and expected (ground truth) values
394
+ *
395
+ * @param actual - Actual/predicted values from the model
396
+ * @param expected - Expected/ground truth values
397
+ * @returns ConfusionMatrix with matrix, labels, and total
398
+ *
399
+ * @example
400
+ * ```ts
401
+ * const matrix = buildConfusionMatrix(
402
+ * ["positive", "negative", "positive"],
403
+ * ["positive", "positive", "positive"]
404
+ * );
405
+ * // matrix.matrix[i][j] = count of expected[i] predicted as actual[j]
406
+ * ```
407
+ */
408
+ declare function buildConfusionMatrix(actual: unknown[], expected: unknown[]): ConfusionMatrix;
409
+ /**
410
+ * Formats a confusion matrix as a string table
411
+ */
412
+ declare function formatConfusionMatrix(cm: ConfusionMatrix): string;
413
+
414
+ /**
415
+ * Classification metrics computation
416
+ */
417
+
418
+ /**
419
+ * Computes all classification metrics from actual and expected values
420
+ */
421
+ declare function computeClassificationMetrics(actual: unknown[], expected: unknown[]): ClassificationMetrics;
422
+ /**
423
+ * Computes precision for a specific class
424
+ */
425
+ declare function computePrecision(actual: unknown[], expected: unknown[], targetClass: string): number;
426
+ /**
427
+ * Computes recall for a specific class
428
+ */
429
+ declare function computeRecall(actual: unknown[], expected: unknown[], targetClass: string): number;
430
+ /**
431
+ * Computes F1 score for a specific class
432
+ */
433
+ declare function computeF1(actual: unknown[], expected: unknown[], targetClass: string): number;
434
+ /**
435
+ * Computes overall accuracy
436
+ */
437
+ declare function computeAccuracy(actual: unknown[], expected: unknown[]): number;
438
+
439
+ /**
440
+ * JSON Reporter - deterministic JSON output
441
+ */
442
+
443
+ /**
444
+ * JSON Reporter for machine-readable output
445
+ */
446
+ declare class JsonReporter {
447
+ /**
448
+ * Formats a report as deterministic JSON
449
+ */
450
+ format(report: EvalReport): string;
451
+ /**
452
+ * Writes report to a file
453
+ */
454
+ writeToFile(report: EvalReport, path: string): void;
455
+ /**
456
+ * Converts report to a JSON-serializable format
457
+ */
458
+ private toSerializable;
459
+ }
460
+ /**
461
+ * Parses a JSON report back into an EvalReport
462
+ */
463
+ declare function parseReport(json: string): EvalReport;
464
+
465
+ /**
466
+ * Console Reporter - human-readable output
467
+ */
468
+
469
+ /**
470
+ * Console reporter for human-readable output
471
+ */
472
+ declare class ConsoleReporter {
473
+ private useColors;
474
+ constructor(useColors?: boolean);
475
+ /**
476
+ * Prints the run header
477
+ */
478
+ printHeader(fileCount: number): void;
479
+ /**
480
+ * Prints the full report
481
+ */
482
+ printReport(report: EvalReport): void;
483
+ /**
484
+ * Prints a suite's results
485
+ */
486
+ private printSuite;
487
+ /**
488
+ * Prints a single test result
489
+ */
490
+ private printTest;
491
+ /**
492
+ * Prints field metrics summary
493
+ */
494
+ private printFieldMetrics;
495
+ /**
496
+ * Prints the summary
497
+ */
498
+ private printSummary;
499
+ /**
500
+ * Prints a confusion matrix
501
+ */
502
+ printConfusionMatrix(fm: FieldMetricResult): void;
503
+ /**
504
+ * Formats a percentage
505
+ */
506
+ private pct;
507
+ /**
508
+ * Formats duration
509
+ */
510
+ private formatDuration;
511
+ /**
512
+ * Gets status symbol
513
+ */
514
+ private getStatusSymbol;
515
+ /**
516
+ * Gets status color
517
+ */
518
+ private getStatusColor;
519
+ /**
520
+ * Applies color if enabled
521
+ */
522
+ private color;
523
+ /**
524
+ * Logs a line
525
+ */
526
+ private log;
527
+ }
528
+
529
+ /**
530
+ * Options for file discovery
531
+ */
532
+ interface DiscoveryOptions {
533
+ /** Patterns to match (default: *.eval.{js,ts,mjs}) */
534
+ patterns?: string[];
535
+ /** Patterns to ignore */
536
+ ignore?: string[];
537
+ /** Base directory to search from */
538
+ cwd?: string;
539
+ /** Filter pattern for test names */
540
+ filter?: string;
541
+ }
542
+ /**
543
+ * Discovers eval files matching the patterns
544
+ *
545
+ * @param options - Discovery options
546
+ * @returns Array of absolute file paths
547
+ */
548
+ declare function discoverEvalFiles(options?: DiscoveryOptions): Promise<string[]>;
549
+
550
+ /**
551
+ * Test executor - runs discovered eval files
552
+ */
553
+
554
+ /**
555
+ * Options for test execution
556
+ */
557
+ interface ExecutorOptions {
558
+ /** Stop on first failure */
559
+ bail?: boolean;
560
+ /** Test timeout in ms */
561
+ timeout?: number;
562
+ /** Filter pattern for test names */
563
+ filter?: string;
564
+ }
565
+ /**
566
+ * Executes all eval files and returns results
567
+ */
568
+ declare function executeEvalFiles(files: string[], options?: ExecutorOptions): Promise<EvalReport>;
569
+ /**
570
+ * Determines exit code from report
571
+ */
572
+ declare function getExitCode(report: EvalReport): number;
573
+
574
+ /**
575
+ * Custom error classes for EvalSense
576
+ */
577
+ declare class EvalSenseError extends Error {
578
+ constructor(message: string);
579
+ }
580
+ declare class AssertionError extends EvalSenseError {
581
+ readonly expected: unknown;
582
+ readonly actual: unknown;
583
+ readonly field?: string;
584
+ constructor(message: string, expected?: unknown, actual?: unknown, field?: string);
585
+ }
586
+ declare class DatasetError extends EvalSenseError {
587
+ readonly source?: string;
588
+ constructor(message: string, source?: string);
589
+ }
590
+ declare class IntegrityError extends EvalSenseError {
591
+ readonly missingIds?: string[];
592
+ readonly duplicateIds?: string[];
593
+ constructor(message: string, missingIds?: string[], duplicateIds?: string[]);
594
+ }
595
+ declare class ConfigurationError extends EvalSenseError {
596
+ constructor(message: string);
597
+ }
598
+ declare class TestExecutionError extends EvalSenseError {
599
+ readonly testName: string;
600
+ readonly originalError?: Error;
601
+ constructor(message: string, testName: string, originalError?: Error);
602
+ }
603
+
604
+ export { AlignedRecord, AssertionError, AssertionResult, ClassificationMetrics, ConfigurationError, ConfusionMatrix, ConsoleReporter, Dataset, DatasetError, EvalReport, EvalSenseError, FieldMetricResult, IntegrityError, IntegrityResult, JsonReporter, Prediction, TestExecutionError, TestFn, afterAll, afterEach, alignByKey, beforeAll, beforeEach, buildConfusionMatrix, checkIntegrity, computeAccuracy, computeClassificationMetrics, computeF1, computePrecision, computeRecall, createDataset, describe, discoverEvalFiles, evalTest, executeEvalFiles, expectStats, extractFieldValues, filterComplete, formatConfusionMatrix, getExitCode, it, loadDataset, parseReport, runModel, runModelParallel, test, validatePredictions };