evalsense 0.3.2 → 0.4.0
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/README.md +235 -98
- package/dist/{chunk-BFGA2NUB.cjs → chunk-4BKZPVY4.cjs} +13 -6
- package/dist/chunk-4BKZPVY4.cjs.map +1 -0
- package/dist/{chunk-IYLSY7NX.js → chunk-IUVDDMJ3.js} +13 -6
- package/dist/chunk-IUVDDMJ3.js.map +1 -0
- package/dist/chunk-NCCQRZ2Y.cjs +1141 -0
- package/dist/chunk-NCCQRZ2Y.cjs.map +1 -0
- package/dist/chunk-TDGWDK2L.js +1108 -0
- package/dist/chunk-TDGWDK2L.js.map +1 -0
- package/dist/cli.cjs +11 -11
- package/dist/cli.js +1 -1
- package/dist/index-CATqAHNK.d.cts +416 -0
- package/dist/index-CoMpaW-K.d.ts +416 -0
- package/dist/index.cjs +507 -580
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +210 -161
- package/dist/index.d.ts +210 -161
- package/dist/index.js +455 -524
- package/dist/index.js.map +1 -1
- package/dist/metrics/index.cjs +103 -342
- package/dist/metrics/index.cjs.map +1 -1
- package/dist/metrics/index.d.cts +260 -31
- package/dist/metrics/index.d.ts +260 -31
- package/dist/metrics/index.js +24 -312
- package/dist/metrics/index.js.map +1 -1
- package/dist/metrics/opinionated/index.cjs +5 -5
- package/dist/metrics/opinionated/index.d.cts +2 -163
- package/dist/metrics/opinionated/index.d.ts +2 -163
- package/dist/metrics/opinionated/index.js +1 -1
- package/dist/{types-C71p0wzM.d.cts → types-D0hzfyKm.d.cts} +1 -13
- package/dist/{types-C71p0wzM.d.ts → types-D0hzfyKm.d.ts} +1 -13
- package/package.json +1 -1
- package/dist/chunk-BFGA2NUB.cjs.map +0 -1
- package/dist/chunk-IYLSY7NX.js.map +0 -1
- package/dist/chunk-RZFLCWTW.cjs +0 -942
- package/dist/chunk-RZFLCWTW.cjs.map +0 -1
- package/dist/chunk-Z3U6AUWX.js +0 -925
- package/dist/chunk-Z3U6AUWX.js.map +0 -1
package/dist/index.d.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { T as TestFn,
|
|
2
|
-
export {
|
|
1
|
+
import { T as TestFn, P as Prediction, A as AlignedRecord, I as IntegrityResult, b as AssertionResult, C as ClassificationMetrics, c as ConfusionMatrix, E as EvalReport, F as FieldMetricResult } from './types-D0hzfyKm.js';
|
|
2
|
+
export { d as CLIOptions, e as ClassMetrics, f as EvalTest, g as ExitCode, h as ExitCodes, i as MetricConfig, M as MetricFn, a as MetricOutput, R as RegressionMetrics, S as Suite, j as SuiteResult, k as TestContext, l as TestResult } from './types-D0hzfyKm.js';
|
|
3
3
|
|
|
4
4
|
/**
|
|
5
5
|
* describe() implementation - Jest-like test suite grouping
|
|
@@ -73,65 +73,6 @@ declare function evalTestSkip(name: string, _fn: TestFn): void;
|
|
|
73
73
|
*/
|
|
74
74
|
declare function evalTestOnly(name: string, fn: TestFn): void;
|
|
75
75
|
|
|
76
|
-
/**
|
|
77
|
-
* Dataset loading functionality
|
|
78
|
-
*/
|
|
79
|
-
|
|
80
|
-
/**
|
|
81
|
-
* Loads a dataset from a JSON or NDJSON file
|
|
82
|
-
*
|
|
83
|
-
* @param path - Path to the dataset file (relative to cwd or absolute)
|
|
84
|
-
* @returns Dataset with records and metadata
|
|
85
|
-
*
|
|
86
|
-
* @example
|
|
87
|
-
* ```ts
|
|
88
|
-
* const dataset = loadDataset("./fixtures/sentiment.json");
|
|
89
|
-
* // dataset.records = [{ id: "1", text: "...", sentiment: "positive" }, ...]
|
|
90
|
-
* ```
|
|
91
|
-
*/
|
|
92
|
-
declare function loadDataset<T extends Record<string, unknown> = Record<string, unknown>>(path: string): Dataset<T>;
|
|
93
|
-
/**
|
|
94
|
-
* Creates a dataset from an array of records (for testing/programmatic use)
|
|
95
|
-
*/
|
|
96
|
-
declare function createDataset<T extends Record<string, unknown>>(records: T[], source?: string): Dataset<T>;
|
|
97
|
-
|
|
98
|
-
/**
|
|
99
|
-
* runModel() - executes a model function against a dataset
|
|
100
|
-
*/
|
|
101
|
-
|
|
102
|
-
/**
|
|
103
|
-
* Model function signature - takes a record and returns a prediction
|
|
104
|
-
*/
|
|
105
|
-
type ModelFn<T> = (record: T) => Prediction | Promise<Prediction>;
|
|
106
|
-
/**
|
|
107
|
-
* Result of running a model on a dataset
|
|
108
|
-
*/
|
|
109
|
-
interface ModelRunResult {
|
|
110
|
-
predictions: Prediction[];
|
|
111
|
-
aligned: AlignedRecord[];
|
|
112
|
-
duration: number;
|
|
113
|
-
}
|
|
114
|
-
/**
|
|
115
|
-
* Runs a model function against each record in a dataset
|
|
116
|
-
*
|
|
117
|
-
* @param dataset - The dataset to process
|
|
118
|
-
* @param modelFn - Function that processes each record and returns a prediction
|
|
119
|
-
* @returns Aligned predictions with actual vs expected values
|
|
120
|
-
*
|
|
121
|
-
* @example
|
|
122
|
-
* ```ts
|
|
123
|
-
* const result = await runModel(dataset, (record) => ({
|
|
124
|
-
* id: record.id,
|
|
125
|
-
* sentiment: classify(record.text)
|
|
126
|
-
* }));
|
|
127
|
-
* ```
|
|
128
|
-
*/
|
|
129
|
-
declare function runModel<T extends Record<string, unknown>>(dataset: Dataset<T>, modelFn: ModelFn<T>): Promise<ModelRunResult>;
|
|
130
|
-
/**
|
|
131
|
-
* Runs model in parallel with concurrency limit
|
|
132
|
-
*/
|
|
133
|
-
declare function runModelParallel<T extends Record<string, unknown>>(dataset: Dataset<T>, modelFn: ModelFn<T>, concurrency?: number): Promise<ModelRunResult>;
|
|
134
|
-
|
|
135
76
|
/**
|
|
136
77
|
* Dataset alignment utilities
|
|
137
78
|
* Aligns predictions with ground truth by ID
|
|
@@ -192,11 +133,11 @@ interface IntegrityOptions {
|
|
|
192
133
|
/**
|
|
193
134
|
* Checks dataset integrity - validates IDs and required fields
|
|
194
135
|
*
|
|
195
|
-
* @param
|
|
136
|
+
* @param records - Array of records to check
|
|
196
137
|
* @param options - Integrity check options
|
|
197
138
|
* @returns Integrity result with details
|
|
198
139
|
*/
|
|
199
|
-
declare function checkIntegrity<T extends Record<string, unknown>>(
|
|
140
|
+
declare function checkIntegrity<T extends Record<string, unknown>>(records: T[], options?: IntegrityOptions): IntegrityResult;
|
|
200
141
|
/**
|
|
201
142
|
* Validates predictions against a dataset
|
|
202
143
|
*/
|
|
@@ -206,6 +147,51 @@ declare function validatePredictions(predictions: Prediction[], expectedIds: str
|
|
|
206
147
|
extra: string[];
|
|
207
148
|
};
|
|
208
149
|
|
|
150
|
+
/**
|
|
151
|
+
* MetricMatcher - provides Jest-like assertion methods for metrics
|
|
152
|
+
*/
|
|
153
|
+
|
|
154
|
+
interface MetricMatcherContext<TParent> {
|
|
155
|
+
parent: TParent;
|
|
156
|
+
metricName: string;
|
|
157
|
+
metricValue: number;
|
|
158
|
+
fieldName: string;
|
|
159
|
+
targetClass?: string;
|
|
160
|
+
assertions: AssertionResult[];
|
|
161
|
+
formatValue?: (value: number) => string;
|
|
162
|
+
}
|
|
163
|
+
/**
|
|
164
|
+
* Matcher class for individual metric assertions
|
|
165
|
+
* Returns the parent selector to enable fluent chaining
|
|
166
|
+
*/
|
|
167
|
+
declare class MetricMatcher<TParent> {
|
|
168
|
+
private context;
|
|
169
|
+
constructor(context: MetricMatcherContext<TParent>);
|
|
170
|
+
private formatMetricValue;
|
|
171
|
+
private createAssertion;
|
|
172
|
+
private recordAndReturn;
|
|
173
|
+
/**
|
|
174
|
+
* Assert that the metric is greater than or equal to the threshold (>=)
|
|
175
|
+
*/
|
|
176
|
+
toBeAtLeast(threshold: number): TParent;
|
|
177
|
+
/**
|
|
178
|
+
* Assert that the metric is strictly greater than the threshold (>)
|
|
179
|
+
*/
|
|
180
|
+
toBeAbove(threshold: number): TParent;
|
|
181
|
+
/**
|
|
182
|
+
* Assert that the metric is less than or equal to the threshold (<=)
|
|
183
|
+
*/
|
|
184
|
+
toBeAtMost(threshold: number): TParent;
|
|
185
|
+
/**
|
|
186
|
+
* Assert that the metric is strictly less than the threshold (<)
|
|
187
|
+
*/
|
|
188
|
+
toBeBelow(threshold: number): TParent;
|
|
189
|
+
/**
|
|
190
|
+
* Assert that the metric equals the expected value (with optional tolerance for floats)
|
|
191
|
+
*/
|
|
192
|
+
toEqual(expected: number, tolerance?: number): TParent;
|
|
193
|
+
}
|
|
194
|
+
|
|
209
195
|
/**
|
|
210
196
|
* Selector for binarized fields (continuous → binary threshold)
|
|
211
197
|
*/
|
|
@@ -217,29 +203,48 @@ declare class BinarizeSelector {
|
|
|
217
203
|
private assertions;
|
|
218
204
|
constructor(aligned: AlignedRecord[], fieldName: string, threshold: number);
|
|
219
205
|
/**
|
|
220
|
-
*
|
|
206
|
+
* Access accuracy metric for assertions
|
|
207
|
+
* @example
|
|
208
|
+
* expectStats(predictions, groundTruth)
|
|
209
|
+
* .field("score")
|
|
210
|
+
* .binarize(0.5)
|
|
211
|
+
* .accuracy.toBeAtLeast(0.8)
|
|
221
212
|
*/
|
|
222
|
-
|
|
213
|
+
get accuracy(): MetricMatcher<this>;
|
|
223
214
|
/**
|
|
224
|
-
*
|
|
225
|
-
* @
|
|
226
|
-
*
|
|
215
|
+
* Access F1 score metric for assertions (macro average)
|
|
216
|
+
* @example
|
|
217
|
+
* expectStats(predictions, groundTruth)
|
|
218
|
+
* .field("score")
|
|
219
|
+
* .binarize(0.5)
|
|
220
|
+
* .f1.toBeAtLeast(0.75)
|
|
227
221
|
*/
|
|
228
|
-
|
|
222
|
+
get f1(): MetricMatcher<this>;
|
|
229
223
|
/**
|
|
230
|
-
*
|
|
231
|
-
* @param
|
|
232
|
-
* @
|
|
224
|
+
* Access precision metric for assertions
|
|
225
|
+
* @param targetClass - Optional boolean class (true/false). If omitted, uses macro average
|
|
226
|
+
* @example
|
|
227
|
+
* expectStats(predictions, groundTruth)
|
|
228
|
+
* .field("score")
|
|
229
|
+
* .binarize(0.5)
|
|
230
|
+
* .precision(true).toBeAtLeast(0.7)
|
|
233
231
|
*/
|
|
234
|
-
|
|
232
|
+
precision(targetClass?: boolean): MetricMatcher<this>;
|
|
235
233
|
/**
|
|
236
|
-
*
|
|
234
|
+
* Access recall metric for assertions
|
|
235
|
+
* @param targetClass - Optional boolean class (true/false). If omitted, uses macro average
|
|
236
|
+
* @example
|
|
237
|
+
* expectStats(predictions, groundTruth)
|
|
238
|
+
* .field("score")
|
|
239
|
+
* .binarize(0.5)
|
|
240
|
+
* .recall(true).toBeAtLeast(0.7)
|
|
237
241
|
*/
|
|
238
|
-
|
|
242
|
+
recall(targetClass?: boolean): MetricMatcher<this>;
|
|
239
243
|
/**
|
|
240
|
-
*
|
|
244
|
+
* Displays the confusion matrix in the report
|
|
245
|
+
* This is not an assertion - it always passes and just records the matrix for display
|
|
241
246
|
*/
|
|
242
|
-
|
|
247
|
+
displayConfusionMatrix(): this;
|
|
243
248
|
/**
|
|
244
249
|
* Gets computed metrics
|
|
245
250
|
*/
|
|
@@ -250,6 +255,47 @@ declare class BinarizeSelector {
|
|
|
250
255
|
getAssertions(): AssertionResult[];
|
|
251
256
|
}
|
|
252
257
|
|
|
258
|
+
/**
|
|
259
|
+
* PercentageMatcher - provides assertion methods for percentage-based distribution checks
|
|
260
|
+
*/
|
|
261
|
+
|
|
262
|
+
type PercentageDirection = "above" | "below";
|
|
263
|
+
interface PercentageMatcherContext<TParent> {
|
|
264
|
+
parent: TParent;
|
|
265
|
+
fieldName: string;
|
|
266
|
+
valueThreshold: number;
|
|
267
|
+
direction: PercentageDirection;
|
|
268
|
+
actualPercentage: number;
|
|
269
|
+
assertions: AssertionResult[];
|
|
270
|
+
}
|
|
271
|
+
/**
|
|
272
|
+
* Matcher class for percentage-based distribution assertions
|
|
273
|
+
* Returns the parent selector to enable fluent chaining
|
|
274
|
+
*/
|
|
275
|
+
declare class PercentageMatcher<TParent> {
|
|
276
|
+
private context;
|
|
277
|
+
constructor(context: PercentageMatcherContext<TParent>);
|
|
278
|
+
private formatPercentage;
|
|
279
|
+
private createAssertion;
|
|
280
|
+
private recordAndReturn;
|
|
281
|
+
/**
|
|
282
|
+
* Assert that the percentage is greater than or equal to the threshold (>=)
|
|
283
|
+
*/
|
|
284
|
+
toBeAtLeast(percentageThreshold: number): TParent;
|
|
285
|
+
/**
|
|
286
|
+
* Assert that the percentage is strictly greater than the threshold (>)
|
|
287
|
+
*/
|
|
288
|
+
toBeAbove(percentageThreshold: number): TParent;
|
|
289
|
+
/**
|
|
290
|
+
* Assert that the percentage is less than or equal to the threshold (<=)
|
|
291
|
+
*/
|
|
292
|
+
toBeAtMost(percentageThreshold: number): TParent;
|
|
293
|
+
/**
|
|
294
|
+
* Assert that the percentage is strictly less than the threshold (<)
|
|
295
|
+
*/
|
|
296
|
+
toBeBelow(percentageThreshold: number): TParent;
|
|
297
|
+
}
|
|
298
|
+
|
|
253
299
|
/**
|
|
254
300
|
* Field selector for building assertions on a specific field
|
|
255
301
|
*/
|
|
@@ -270,109 +316,98 @@ declare class FieldSelector {
|
|
|
270
316
|
*/
|
|
271
317
|
private validateGroundTruth;
|
|
272
318
|
/**
|
|
273
|
-
*
|
|
274
|
-
|
|
275
|
-
toHaveAccuracyAbove(threshold: number): this;
|
|
276
|
-
/**
|
|
277
|
-
* Asserts that precision is above a threshold
|
|
278
|
-
* @param classOrThreshold - Either the class name or threshold (if class is omitted, uses macro average)
|
|
279
|
-
* @param threshold - Threshold when class is specified
|
|
280
|
-
*/
|
|
281
|
-
toHavePrecisionAbove(classOrThreshold: string | number, threshold?: number): this;
|
|
282
|
-
/**
|
|
283
|
-
* Asserts that recall is above a threshold
|
|
284
|
-
* @param classOrThreshold - Either the class name or threshold (if class is omitted, uses macro average)
|
|
285
|
-
* @param threshold - Threshold when class is specified
|
|
319
|
+
* Validates that ground truth exists and both arrays contain numeric values.
|
|
320
|
+
* Returns the filtered numeric arrays for regression metrics.
|
|
286
321
|
*/
|
|
287
|
-
|
|
322
|
+
private validateRegressionInputs;
|
|
288
323
|
/**
|
|
289
|
-
*
|
|
290
|
-
* @
|
|
291
|
-
*
|
|
324
|
+
* Access accuracy metric for assertions
|
|
325
|
+
* @example
|
|
326
|
+
* expectStats(predictions, groundTruth)
|
|
327
|
+
* .field("sentiment")
|
|
328
|
+
* .accuracy.toBeAtLeast(0.8)
|
|
292
329
|
*/
|
|
293
|
-
|
|
330
|
+
get accuracy(): MetricMatcher<this>;
|
|
294
331
|
/**
|
|
295
|
-
*
|
|
332
|
+
* Access F1 score metric for assertions (macro average)
|
|
333
|
+
* @example
|
|
334
|
+
* expectStats(predictions, groundTruth)
|
|
335
|
+
* .field("sentiment")
|
|
336
|
+
* .f1.toBeAtLeast(0.75)
|
|
296
337
|
*/
|
|
297
|
-
|
|
338
|
+
get f1(): MetricMatcher<this>;
|
|
298
339
|
/**
|
|
299
|
-
*
|
|
300
|
-
*
|
|
301
|
-
*
|
|
302
|
-
* @param valueThreshold - The value threshold to compare against
|
|
303
|
-
* @param percentageThreshold - The minimum percentage (0-1) of values that should be <= valueThreshold
|
|
304
|
-
* @returns this for method chaining
|
|
305
|
-
*
|
|
340
|
+
* Access precision metric for assertions
|
|
341
|
+
* @param targetClass - Optional class name. If omitted, uses macro average
|
|
306
342
|
* @example
|
|
307
|
-
*
|
|
308
|
-
*
|
|
309
|
-
* .
|
|
310
|
-
* .toHavePercentageBelow(0.5, 0.9)
|
|
343
|
+
* expectStats(predictions, groundTruth)
|
|
344
|
+
* .field("sentiment")
|
|
345
|
+
* .precision("positive").toBeAtLeast(0.7)
|
|
311
346
|
*/
|
|
312
|
-
|
|
347
|
+
precision(targetClass?: string): MetricMatcher<this>;
|
|
313
348
|
/**
|
|
314
|
-
*
|
|
315
|
-
*
|
|
316
|
-
*
|
|
317
|
-
* @param valueThreshold - The value threshold to compare against
|
|
318
|
-
* @param percentageThreshold - The minimum percentage (0-1) of values that should be > valueThreshold
|
|
319
|
-
* @returns this for method chaining
|
|
320
|
-
*
|
|
349
|
+
* Access recall metric for assertions
|
|
350
|
+
* @param targetClass - Optional class name. If omitted, uses macro average
|
|
321
351
|
* @example
|
|
322
|
-
*
|
|
323
|
-
*
|
|
324
|
-
* .
|
|
325
|
-
* .toHavePercentageAbove(0.7, 0.8)
|
|
352
|
+
* expectStats(predictions, groundTruth)
|
|
353
|
+
* .field("sentiment")
|
|
354
|
+
* .recall("positive").toBeAtLeast(0.7)
|
|
326
355
|
*/
|
|
327
|
-
|
|
356
|
+
recall(targetClass?: string): MetricMatcher<this>;
|
|
328
357
|
/**
|
|
329
|
-
*
|
|
330
|
-
*
|
|
358
|
+
* Access Mean Absolute Error metric for assertions
|
|
359
|
+
* @example
|
|
360
|
+
* expectStats(predictions, groundTruth)
|
|
361
|
+
* .field("score")
|
|
362
|
+
* .mae.toBeAtMost(0.1)
|
|
331
363
|
*/
|
|
332
|
-
|
|
364
|
+
get mae(): MetricMatcher<this>;
|
|
333
365
|
/**
|
|
334
|
-
*
|
|
335
|
-
* Requires numeric values in both actual and expected.
|
|
336
|
-
*
|
|
337
|
-
* @param threshold - Maximum allowed MAE
|
|
338
|
-
* @returns this for method chaining
|
|
339
|
-
*
|
|
366
|
+
* Access Root Mean Squared Error metric for assertions
|
|
340
367
|
* @example
|
|
341
368
|
* expectStats(predictions, groundTruth)
|
|
342
369
|
* .field("score")
|
|
343
|
-
* .
|
|
370
|
+
* .rmse.toBeAtMost(0.15)
|
|
344
371
|
*/
|
|
345
|
-
|
|
372
|
+
get rmse(): MetricMatcher<this>;
|
|
346
373
|
/**
|
|
347
|
-
*
|
|
348
|
-
* Requires numeric values in both actual and expected.
|
|
349
|
-
*
|
|
350
|
-
* @param threshold - Maximum allowed RMSE
|
|
351
|
-
* @returns this for method chaining
|
|
352
|
-
*
|
|
374
|
+
* Access R-squared (coefficient of determination) metric for assertions
|
|
353
375
|
* @example
|
|
354
376
|
* expectStats(predictions, groundTruth)
|
|
355
377
|
* .field("score")
|
|
356
|
-
* .
|
|
357
|
-
*/
|
|
358
|
-
|
|
359
|
-
/**
|
|
360
|
-
*
|
|
361
|
-
*
|
|
362
|
-
*
|
|
363
|
-
*
|
|
364
|
-
*
|
|
365
|
-
*
|
|
366
|
-
|
|
367
|
-
|
|
378
|
+
* .r2.toBeAtLeast(0.8)
|
|
379
|
+
*/
|
|
380
|
+
get r2(): MetricMatcher<this>;
|
|
381
|
+
/**
|
|
382
|
+
* Assert on the percentage of values below or equal to a threshold
|
|
383
|
+
* @param valueThreshold - The value threshold to compare against
|
|
384
|
+
* @example
|
|
385
|
+
* expectStats(predictions)
|
|
386
|
+
* .field("confidence")
|
|
387
|
+
* .percentageBelow(0.5).toBeAtLeast(0.9)
|
|
388
|
+
*/
|
|
389
|
+
percentageBelow(valueThreshold: number): PercentageMatcher<this>;
|
|
390
|
+
/**
|
|
391
|
+
* Assert on the percentage of values above a threshold
|
|
392
|
+
* @param valueThreshold - The value threshold to compare against
|
|
393
|
+
* @example
|
|
394
|
+
* expectStats(predictions)
|
|
395
|
+
* .field("quality")
|
|
396
|
+
* .percentageAbove(0.7).toBeAtLeast(0.8)
|
|
397
|
+
*/
|
|
398
|
+
percentageAbove(valueThreshold: number): PercentageMatcher<this>;
|
|
399
|
+
/**
|
|
400
|
+
* Displays the confusion matrix in the report
|
|
401
|
+
* This is not an assertion - it always passes and just records the matrix for display
|
|
368
402
|
* @example
|
|
369
403
|
* expectStats(predictions, groundTruth)
|
|
370
|
-
* .field("
|
|
371
|
-
* .
|
|
404
|
+
* .field("sentiment")
|
|
405
|
+
* .accuracy.toBeAtLeast(0.8)
|
|
406
|
+
* .displayConfusionMatrix()
|
|
372
407
|
*/
|
|
373
|
-
|
|
408
|
+
displayConfusionMatrix(): this;
|
|
374
409
|
/**
|
|
375
|
-
* Gets the computed metrics for this field
|
|
410
|
+
* Gets the computed classification metrics for this field
|
|
376
411
|
*/
|
|
377
412
|
getMetrics(): ClassificationMetrics;
|
|
378
413
|
/**
|
|
@@ -385,10 +420,16 @@ declare class FieldSelector {
|
|
|
385
420
|
* expectStats() - fluent assertion API for statistical evaluation
|
|
386
421
|
*/
|
|
387
422
|
|
|
423
|
+
/**
|
|
424
|
+
* Object with aligned records (e.g., from custom model execution)
|
|
425
|
+
*/
|
|
426
|
+
interface AlignedRecordsInput {
|
|
427
|
+
aligned: AlignedRecord[];
|
|
428
|
+
}
|
|
388
429
|
/**
|
|
389
430
|
* Input types that expectStats() accepts
|
|
390
431
|
*/
|
|
391
|
-
type StatsInput =
|
|
432
|
+
type StatsInput = AlignedRecordsInput | Prediction[] | AlignedRecord[];
|
|
392
433
|
/**
|
|
393
434
|
* Options for expectStats when using two-argument form
|
|
394
435
|
*/
|
|
@@ -429,20 +470,20 @@ interface ExpectStatsOptions {
|
|
|
429
470
|
* // Pattern 1: Distribution assertions (no ground truth)
|
|
430
471
|
* expectStats(predictions)
|
|
431
472
|
* .field("confidence")
|
|
432
|
-
* .
|
|
473
|
+
* .percentageBelow(0.5).toBeAtLeast(0.9);
|
|
433
474
|
*
|
|
434
475
|
* @example
|
|
435
476
|
* // Pattern 2: Classification with ground truth
|
|
436
477
|
* expectStats(judgeOutputs, humanLabels)
|
|
437
478
|
* .field("hallucinated")
|
|
438
|
-
* .
|
|
439
|
-
* .
|
|
479
|
+
* .recall(true).toBeAtLeast(0.85)
|
|
480
|
+
* .precision(true).toBeAtLeast(0.8);
|
|
440
481
|
*
|
|
441
482
|
* @example
|
|
442
483
|
* // Pattern 3: Custom ID field
|
|
443
484
|
* expectStats(predictions, groundTruth, { idField: 'uuid' })
|
|
444
485
|
* .field("score")
|
|
445
|
-
* .
|
|
486
|
+
* .accuracy.toBeAtLeast(0.8);
|
|
446
487
|
*/
|
|
447
488
|
declare function expectStats(input: StatsInput): ExpectStats;
|
|
448
489
|
declare function expectStats(actual: Prediction[], expected: Array<Record<string, unknown>>): ExpectStats;
|
|
@@ -489,7 +530,15 @@ declare class ExpectStats {
|
|
|
489
530
|
*/
|
|
490
531
|
declare function buildConfusionMatrix(actual: unknown[], expected: unknown[]): ConfusionMatrix;
|
|
491
532
|
/**
|
|
492
|
-
* Formats a confusion matrix as a string table
|
|
533
|
+
* Formats a confusion matrix as a string table with axis labels
|
|
534
|
+
*
|
|
535
|
+
* Output format:
|
|
536
|
+
* ```
|
|
537
|
+
* Predicted → negative positive
|
|
538
|
+
* Actual ↓
|
|
539
|
+
* negative 5 1
|
|
540
|
+
* positive 2 7
|
|
541
|
+
* ```
|
|
493
542
|
*/
|
|
494
543
|
declare function formatConfusionMatrix(cm: ConfusionMatrix): string;
|
|
495
544
|
|
|
@@ -687,4 +736,4 @@ declare class TestExecutionError extends EvalSenseError {
|
|
|
687
736
|
constructor(message: string, testName: string, originalError?: Error);
|
|
688
737
|
}
|
|
689
738
|
|
|
690
|
-
export { AlignedRecord, AssertionError, AssertionResult, ClassificationMetrics, ConfigurationError, ConfusionMatrix, ConsoleReporter,
|
|
739
|
+
export { AlignedRecord, type AlignedRecordsInput, AssertionError, AssertionResult, ClassificationMetrics, ConfigurationError, ConfusionMatrix, ConsoleReporter, DatasetError, EvalReport, EvalSenseError, type ExpectStatsOptions, FieldMetricResult, IntegrityError, IntegrityResult, JsonReporter, Prediction, type StatsInput, TestExecutionError, TestFn, afterAll, afterEach, alignByKey, beforeAll, beforeEach, buildConfusionMatrix, checkIntegrity, computeAccuracy, computeClassificationMetrics, computeF1, computePrecision, computeRecall, describe, discoverEvalFiles, evalTest, executeEvalFiles, expectStats, extractFieldValues, filterComplete, formatConfusionMatrix, getExitCode, it, parseReport, test, validatePredictions };
|