@cogitator-ai/evals 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118) hide show
  1. package/README.md +456 -0
  2. package/dist/assertions/custom.d.ts +11 -0
  3. package/dist/assertions/custom.d.ts.map +1 -0
  4. package/dist/assertions/custom.js +13 -0
  5. package/dist/assertions/custom.js.map +1 -0
  6. package/dist/assertions/index.d.ts +27 -0
  7. package/dist/assertions/index.d.ts.map +1 -0
  8. package/dist/assertions/index.js +4 -0
  9. package/dist/assertions/index.js.map +1 -0
  10. package/dist/assertions/regression.d.ts +5 -0
  11. package/dist/assertions/regression.d.ts.map +1 -0
  12. package/dist/assertions/regression.js +58 -0
  13. package/dist/assertions/regression.js.map +1 -0
  14. package/dist/assertions/threshold.d.ts +3 -0
  15. package/dist/assertions/threshold.d.ts.map +1 -0
  16. package/dist/assertions/threshold.js +45 -0
  17. package/dist/assertions/threshold.js.map +1 -0
  18. package/dist/datasets/csv-loader.d.ts +3 -0
  19. package/dist/datasets/csv-loader.d.ts.map +1 -0
  20. package/dist/datasets/csv-loader.js +43 -0
  21. package/dist/datasets/csv-loader.js.map +1 -0
  22. package/dist/datasets/dataset.d.ts +15 -0
  23. package/dist/datasets/dataset.d.ts.map +1 -0
  24. package/dist/datasets/dataset.js +62 -0
  25. package/dist/datasets/dataset.js.map +1 -0
  26. package/dist/datasets/index.d.ts +4 -0
  27. package/dist/datasets/index.d.ts.map +1 -0
  28. package/dist/datasets/index.js +4 -0
  29. package/dist/datasets/index.js.map +1 -0
  30. package/dist/datasets/jsonl-loader.d.ts +3 -0
  31. package/dist/datasets/jsonl-loader.d.ts.map +1 -0
  32. package/dist/datasets/jsonl-loader.js +27 -0
  33. package/dist/datasets/jsonl-loader.js.map +1 -0
  34. package/dist/eval-builder.d.ts +30 -0
  35. package/dist/eval-builder.d.ts.map +1 -0
  36. package/dist/eval-builder.js +82 -0
  37. package/dist/eval-builder.js.map +1 -0
  38. package/dist/eval-comparison.d.ts +43 -0
  39. package/dist/eval-comparison.d.ts.map +1 -0
  40. package/dist/eval-comparison.js +125 -0
  41. package/dist/eval-comparison.js.map +1 -0
  42. package/dist/eval-suite.d.ts +63 -0
  43. package/dist/eval-suite.d.ts.map +1 -0
  44. package/dist/eval-suite.js +230 -0
  45. package/dist/eval-suite.js.map +1 -0
  46. package/dist/index.d.ts +31 -0
  47. package/dist/index.d.ts.map +1 -0
  48. package/dist/index.js +20 -0
  49. package/dist/index.js.map +1 -0
  50. package/dist/metrics/custom.d.ts +18 -0
  51. package/dist/metrics/custom.d.ts.map +1 -0
  52. package/dist/metrics/custom.js +28 -0
  53. package/dist/metrics/custom.js.map +1 -0
  54. package/dist/metrics/deterministic.d.ts +11 -0
  55. package/dist/metrics/deterministic.d.ts.map +1 -0
  56. package/dist/metrics/deterministic.js +74 -0
  57. package/dist/metrics/deterministic.js.map +1 -0
  58. package/dist/metrics/index.d.ts +8 -0
  59. package/dist/metrics/index.d.ts.map +1 -0
  60. package/dist/metrics/index.js +5 -0
  61. package/dist/metrics/index.js.map +1 -0
  62. package/dist/metrics/llm-judge.d.ts +27 -0
  63. package/dist/metrics/llm-judge.d.ts.map +1 -0
  64. package/dist/metrics/llm-judge.js +77 -0
  65. package/dist/metrics/llm-judge.js.map +1 -0
  66. package/dist/metrics/statistical.d.ts +5 -0
  67. package/dist/metrics/statistical.d.ts.map +1 -0
  68. package/dist/metrics/statistical.js +85 -0
  69. package/dist/metrics/statistical.js.map +1 -0
  70. package/dist/metrics/types.d.ts +31 -0
  71. package/dist/metrics/types.d.ts.map +1 -0
  72. package/dist/metrics/types.js +2 -0
  73. package/dist/metrics/types.js.map +1 -0
  74. package/dist/reporters/ci.d.ts +3 -0
  75. package/dist/reporters/ci.d.ts.map +1 -0
  76. package/dist/reporters/ci.js +21 -0
  77. package/dist/reporters/ci.js.map +1 -0
  78. package/dist/reporters/console.d.ts +3 -0
  79. package/dist/reporters/console.d.ts.map +1 -0
  80. package/dist/reporters/console.js +46 -0
  81. package/dist/reporters/console.js.map +1 -0
  82. package/dist/reporters/csv.d.ts +5 -0
  83. package/dist/reporters/csv.d.ts.map +1 -0
  84. package/dist/reporters/csv.js +31 -0
  85. package/dist/reporters/csv.js.map +1 -0
  86. package/dist/reporters/index.d.ts +50 -0
  87. package/dist/reporters/index.d.ts.map +1 -0
  88. package/dist/reporters/index.js +28 -0
  89. package/dist/reporters/index.js.map +1 -0
  90. package/dist/reporters/json.d.ts +5 -0
  91. package/dist/reporters/json.d.ts.map +1 -0
  92. package/dist/reporters/json.js +5 -0
  93. package/dist/reporters/json.js.map +1 -0
  94. package/dist/schema.d.ts +29 -0
  95. package/dist/schema.d.ts.map +1 -0
  96. package/dist/schema.js +23 -0
  97. package/dist/schema.js.map +1 -0
  98. package/dist/stats/index.d.ts +6 -0
  99. package/dist/stats/index.d.ts.map +1 -0
  100. package/dist/stats/index.js +4 -0
  101. package/dist/stats/index.js.map +1 -0
  102. package/dist/stats/mcnemar.d.ts +7 -0
  103. package/dist/stats/mcnemar.d.ts.map +1 -0
  104. package/dist/stats/mcnemar.js +34 -0
  105. package/dist/stats/mcnemar.js.map +1 -0
  106. package/dist/stats/percentiles.d.ts +15 -0
  107. package/dist/stats/percentiles.d.ts.map +1 -0
  108. package/dist/stats/percentiles.js +54 -0
  109. package/dist/stats/percentiles.js.map +1 -0
  110. package/dist/stats/t-test.d.ts +9 -0
  111. package/dist/stats/t-test.d.ts.map +1 -0
  112. package/dist/stats/t-test.js +129 -0
  113. package/dist/stats/t-test.js.map +1 -0
  114. package/dist/tools.d.ts +16 -0
  115. package/dist/tools.d.ts.map +1 -0
  116. package/dist/tools.js +58 -0
  117. package/dist/tools.js.map +1 -0
  118. package/package.json +57 -0
package/README.md ADDED
@@ -0,0 +1,456 @@
1
+ # @cogitator-ai/evals
2
+
3
+ Evaluation framework for Cogitator AI agents. Run eval suites, compare models with A/B tests, enforce quality thresholds, and track regressions — all with built-in statistical significance testing.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ pnpm add @cogitator-ai/evals
9
+
10
+ # Optional dependencies
11
+ pnpm add papaparse # CSV dataset loading
12
+ ```
13
+
14
+ ## Features
15
+
16
+ - **EvalSuite** — Run datasets against agents or plain functions with configurable concurrency, timeouts, and retries
17
+ - **4 Deterministic Metrics** — exactMatch, contains, regex, jsonSchema (Zod)
18
+ - **5 LLM-as-Judge Metrics** — faithfulness, relevance, coherence, helpfulness, custom llmMetric
19
+ - **3 Statistical Metrics** — latency, cost, tokenUsage with full percentile breakdowns
20
+ - **Custom Metrics** — `metric()` factory for anything domain-specific
21
+ - **Assertions** — threshold, noRegression, custom assertion with auto-detection of lower-is-better metrics
22
+ - **A/B Testing** — EvalComparison with paired t-test and McNemar's test for statistical significance
23
+ - **4 Reporters** — console (colored table), JSON, CSV, CI (exit code on failure)
24
+ - **Builder API** — Fluent `EvalBuilder` for composable eval pipelines
25
+ - **Baseline Workflow** — Save baselines, compare against them, catch regressions in CI
26
+ - **Zod Validation** — Type-safe configuration with runtime checks
27
+
28
+ ---
29
+
30
+ ## Quick Start
31
+
32
+ ```typescript
33
+ import { EvalSuite, Dataset, exactMatch, contains, threshold, latency } from '@cogitator-ai/evals';
34
+
35
+ const dataset = Dataset.from([
36
+ { input: 'What is 2+2?', expected: '4' },
37
+ { input: 'Capital of France?', expected: 'Paris' },
38
+ { input: 'Largest planet?', expected: 'Jupiter' },
39
+ ]);
40
+
41
+ const suite = new EvalSuite({
42
+ dataset,
43
+ target: {
44
+ fn: async (input) => {
45
+ // replace with your agent or LLM call
46
+ return `The answer is ${input}`;
47
+ },
48
+ },
49
+ metrics: [exactMatch(), contains()],
50
+ statisticalMetrics: [latency()],
51
+ assertions: [threshold('exactMatch', 0.8)],
52
+ concurrency: 5,
53
+ timeout: 30_000,
54
+ });
55
+
56
+ const result = await suite.run();
57
+
58
+ result.report('console');
59
+ result.saveBaseline('./baseline.json');
60
+ ```
61
+
62
+ ---
63
+
64
+ ## Datasets
65
+
66
+ Datasets are immutable collections of eval cases. Each case has an `input`, optional `expected`, optional `context`, and optional `metadata`.
67
+
68
+ ### From inline data
69
+
70
+ ```typescript
71
+ import { Dataset } from '@cogitator-ai/evals';
72
+
73
+ const dataset = Dataset.from([
74
+ { input: 'Translate hello to French', expected: 'Bonjour' },
75
+ { input: 'Summarize this article', context: { article: '...' } },
76
+ ]);
77
+ ```
78
+
79
+ ### From JSONL
80
+
81
+ ```typescript
82
+ const dataset = await Dataset.fromJsonl('./evals/qa.jsonl');
83
+ ```
84
+
85
+ Each line must be a JSON object with at least an `input` field:
86
+
87
+ ```jsonl
88
+ {"input": "What is TypeScript?", "expected": "A typed superset of JavaScript"}
89
+ {"input": "What is Zod?", "expected": "A TypeScript-first schema validation library"}
90
+ ```
91
+
92
+ ### From CSV
93
+
94
+ Requires `papaparse` as an optional dependency.
95
+
96
+ ```typescript
97
+ const dataset = await Dataset.fromCsv('./evals/qa.csv');
98
+ ```
99
+
100
+ CSV must have an `input` column. Optional columns: `expected`, `metadata.*`, `context.*`.
101
+
102
+ ### Transformations
103
+
104
+ ```typescript
105
+ const filtered = dataset.filter((c) => c.expected !== undefined);
106
+ const sampled = dataset.sample(50);
107
+ const shuffled = dataset.shuffle();
108
+ ```
109
+
110
+ All transformations return new `Dataset` instances — the original is never mutated.
111
+
112
+ ---
113
+
114
+ ## Metrics
115
+
116
+ ### Deterministic
117
+
118
+ Binary (0 or 1) metrics that compare output against expected values.
119
+
120
+ | Metric | Description | Requires `expected` |
121
+ | ------------ | ------------------------------------------ | ------------------- |
122
+ | `exactMatch` | Exact string match (case optional) | Yes |
123
+ | `contains` | Output contains expected substring | Yes |
124
+ | `regex` | Output matches a regex pattern | No |
125
+ | `jsonSchema` | Output is valid JSON matching a Zod schema | No |
126
+
127
+ ```typescript
128
+ import { exactMatch, contains, regex, jsonSchema } from '@cogitator-ai/evals';
129
+ import { z } from 'zod';
130
+
131
+ const metrics = [
132
+ exactMatch({ caseSensitive: true }),
133
+ contains(),
134
+ regex(/\d{4}-\d{2}-\d{2}/),
135
+ jsonSchema(z.object({ answer: z.string(), confidence: z.number() })),
136
+ ];
137
+ ```
138
+
139
+ ### LLM-as-Judge
140
+
141
+ Metrics scored by an LLM judge (0.0 to 1.0). Require a `judge` config on the suite.
142
+
143
+ | Metric | Evaluates |
144
+ | -------------- | --------------------------------------- |
145
+ | `faithfulness` | Factual accuracy relative to input |
146
+ | `relevance` | How on-topic the response is |
147
+ | `coherence` | Logical structure and readability |
148
+ | `helpfulness` | Practical usefulness to the user |
149
+ | `llmMetric` | Custom prompt — you define the criteria |
150
+
151
+ ```typescript
152
+ import { faithfulness, relevance, llmMetric } from '@cogitator-ai/evals';
153
+
154
+ const suite = new EvalSuite({
155
+ dataset,
156
+ target: { fn: myFunction },
157
+ metrics: [
158
+ faithfulness(),
159
+ relevance(),
160
+ llmMetric({
161
+ name: 'technicalAccuracy',
162
+ prompt: 'Rate how technically accurate the response is for a software engineering audience.',
163
+ }),
164
+ ],
165
+ judge: { model: 'gpt-4o', temperature: 0 },
166
+ });
167
+ ```
168
+
169
+ ### Statistical
170
+
171
+ Aggregate metrics computed across all results. These report percentile breakdowns (p50, p95, p99) rather than per-case scores.
172
+
173
+ ```typescript
174
+ import { latency, cost, tokenUsage } from '@cogitator-ai/evals';
175
+
176
+ const suite = new EvalSuite({
177
+ dataset,
178
+ target: { agent, cogitator },
179
+ metrics: [exactMatch()],
180
+ statisticalMetrics: [latency(), cost(), tokenUsage()],
181
+ });
182
+ ```
183
+
184
+ ### Custom
185
+
186
+ Build domain-specific metrics with the `metric()` factory.
187
+
188
+ ```typescript
189
+ import { metric } from '@cogitator-ai/evals';
190
+
191
+ const wordCount = metric({
192
+ name: 'wordCount',
193
+ evaluate: ({ output }) => {
194
+ const count = output.split(/\s+/).length;
195
+ return { score: Math.min(count / 100, 1), details: `${count} words` };
196
+ },
197
+ });
198
+
199
+ const suite = new EvalSuite({
200
+ dataset,
201
+ target: { fn: myFunction },
202
+ metrics: [wordCount],
203
+ });
204
+ ```
205
+
206
+ Scores are automatically clamped to [0, 1].
207
+
208
+ ---
209
+
210
+ ## Assertions
211
+
212
+ Assertions check aggregated metrics after a suite run and produce pass/fail results.
213
+
214
+ ### threshold
215
+
216
+ Enforces a minimum (or maximum for latency/cost) value on a metric's mean.
217
+
218
+ ```typescript
219
+ import { threshold } from '@cogitator-ai/evals';
220
+
221
+ const assertions = [
222
+ threshold('exactMatch', 0.9),
223
+ threshold('latency', 5000),
224
+ threshold('relevance', 0.7),
225
+ ];
226
+ ```
227
+
228
+ Latency and cost metrics are automatically detected as lower-is-better.
229
+
230
+ ### noRegression
231
+
232
+ Compares current results against a saved baseline file.
233
+
234
+ ```typescript
235
+ import { noRegression } from '@cogitator-ai/evals';
236
+
237
+ const assertions = [noRegression('./baseline.json', { tolerance: 0.05 })];
238
+ ```
239
+
240
+ ### Custom assertion
241
+
242
+ ```typescript
243
+ import { assertion } from '@cogitator-ai/evals';
244
+
245
+ const assertions = [
246
+ assertion({
247
+ name: 'totalCostBudget',
248
+ check: (_aggregated, stats) => stats.cost < 1.0,
249
+ message: 'Total eval cost exceeded $1.00 budget',
250
+ }),
251
+ ];
252
+ ```
253
+
254
+ ---
255
+
256
+ ## A/B Testing
257
+
258
+ `EvalComparison` runs two targets on the same dataset and determines a winner using statistical significance tests (paired t-test for continuous metrics, McNemar's test for binary metrics).
259
+
260
+ ```typescript
261
+ import { EvalComparison, Dataset, exactMatch, contains } from '@cogitator-ai/evals';
262
+
263
+ const dataset = Dataset.from([
264
+ { input: 'What is 2+2?', expected: '4' },
265
+ { input: 'Capital of Japan?', expected: 'Tokyo' },
266
+ { input: 'Boiling point of water?', expected: '100°C' },
267
+ ]);
268
+
269
+ const comparison = new EvalComparison({
270
+ dataset,
271
+ targets: {
272
+ baseline: { fn: async (input) => baselineModel(input) },
273
+ challenger: { fn: async (input) => challengerModel(input) },
274
+ },
275
+ metrics: [exactMatch(), contains()],
276
+ concurrency: 5,
277
+ });
278
+
279
+ const result = await comparison.run();
280
+
281
+ console.log(`Winner: ${result.summary.winner}`);
282
+ for (const [name, mc] of Object.entries(result.summary.metrics)) {
283
+ console.log(
284
+ ` ${name}: baseline=${mc.baseline.toFixed(3)} challenger=${mc.challenger.toFixed(3)} p=${mc.pValue.toFixed(4)} ${mc.significant ? '*' : ''}`
285
+ );
286
+ }
287
+ ```
288
+
289
+ Access full suite results via `result.baseline` and `result.challenger`.
290
+
291
+ ---
292
+
293
+ ## Reporters
294
+
295
+ Call `result.report()` after a suite run to output results.
296
+
297
+ | Reporter | Output |
298
+ | --------- | ----------------------------------------------- |
299
+ | `console` | Colored table with metrics, assertions, summary |
300
+ | `json` | Writes `eval-report.json` (configurable path) |
301
+ | `csv` | Writes `eval-report.csv` (configurable path) |
302
+ | `ci` | Compact output, `process.exit(1)` on failure |
303
+
304
+ ```typescript
305
+ const result = await suite.run();
306
+
307
+ result.report('console');
308
+ result.report('json', { path: './reports/eval.json' });
309
+ result.report(['console', 'json', 'csv']);
310
+ result.report('ci');
311
+ ```
312
+
313
+ ---
314
+
315
+ ## Builder API
316
+
317
+ `EvalBuilder` provides a fluent interface for constructing eval suites.
318
+
319
+ ```typescript
320
+ import {
321
+ EvalBuilder,
322
+ Dataset,
323
+ exactMatch,
324
+ contains,
325
+ faithfulness,
326
+ latency,
327
+ threshold,
328
+ noRegression,
329
+ } from '@cogitator-ai/evals';
330
+
331
+ const suite = new EvalBuilder()
332
+ .withDataset(await Dataset.fromJsonl('./evals/qa.jsonl'))
333
+ .withTarget({ fn: async (input) => myModel(input) })
334
+ .withMetrics([exactMatch(), contains(), faithfulness()])
335
+ .withStatisticalMetrics([latency()])
336
+ .withJudge({ model: 'gpt-4o', temperature: 0 })
337
+ .withAssertions([threshold('exactMatch', 0.85), noRegression('./baseline.json')])
338
+ .withConcurrency(10)
339
+ .withTimeout(60_000)
340
+ .withRetries(2)
341
+ .onProgress(({ completed, total }) => {
342
+ console.log(`${completed}/${total}`);
343
+ })
344
+ .build();
345
+
346
+ const result = await suite.run();
347
+ result.report('console');
348
+ ```
349
+
350
+ ---
351
+
352
+ ## Baseline Workflow
353
+
354
+ Save a baseline after a successful run, then use `noRegression` to guard against regressions in CI.
355
+
356
+ ```typescript
357
+ const result = await suite.run();
358
+
359
+ result.saveBaseline('./baseline.json');
360
+ ```
361
+
362
+ The baseline file is a simple JSON map of metric names to mean scores:
363
+
364
+ ```json
365
+ {
366
+ "exactMatch": 0.92,
367
+ "contains": 0.97,
368
+ "latency": 1234
369
+ }
370
+ ```
371
+
372
+ In subsequent runs, use `noRegression` to compare:
373
+
374
+ ```typescript
375
+ const suite = new EvalSuite({
376
+ dataset,
377
+ target: { fn: myFunction },
378
+ metrics: [exactMatch(), contains()],
379
+ assertions: [noRegression('./baseline.json', { tolerance: 0.05 })],
380
+ });
381
+
382
+ const result = await suite.run();
383
+ result.report('ci');
384
+ ```
385
+
386
+ ---
387
+
388
+ ## API Reference
389
+
390
+ ### Core
391
+
392
+ | Export | Description |
393
+ | ---------------- | ------------------------------------------------------------------- |
394
+ | `EvalSuite` | Main evaluation runner |
395
+ | `EvalComparison` | A/B testing runner with statistical significance |
396
+ | `EvalBuilder` | Fluent builder for EvalSuite |
397
+ | `Dataset` | Immutable dataset with from/fromJsonl/fromCsv/filter/sample/shuffle |
398
+ | `loadJsonl` | Low-level JSONL file loader |
399
+ | `loadCsv` | Low-level CSV file loader |
400
+
401
+ ### Metrics
402
+
403
+ | Export | Type | Description |
404
+ | -------------- | ------------- | ----------------------------------- |
405
+ | `exactMatch` | Deterministic | Exact string match |
406
+ | `contains` | Deterministic | Substring match |
407
+ | `regex` | Deterministic | Regex pattern match |
408
+ | `jsonSchema` | Deterministic | Zod schema validation |
409
+ | `faithfulness` | LLM Judge | Factual accuracy |
410
+ | `relevance` | LLM Judge | Topical relevance |
411
+ | `coherence` | LLM Judge | Logical structure |
412
+ | `helpfulness` | LLM Judge | Practical usefulness |
413
+ | `llmMetric` | LLM Judge | Custom judge prompt |
414
+ | `latency` | Statistical | Response time percentiles |
415
+ | `cost` | Statistical | Token cost aggregation |
416
+ | `tokenUsage` | Statistical | Input/output token counts |
417
+ | `metric` | Custom | Factory for domain-specific metrics |
418
+
419
+ ### Assertions
420
+
421
+ | Export | Description |
422
+ | -------------- | ---------------------------------------------- |
423
+ | `threshold` | Enforce min/max on metric mean |
424
+ | `noRegression` | Compare against saved baseline |
425
+ | `assertion` | Custom assertion with arbitrary check function |
426
+
427
+ ### Reporters
428
+
429
+ | Export | Description |
430
+ | -------- | -------------------------------------- |
431
+ | `report` | Dispatch to one or more reporter types |
432
+
433
+ ### Statistics
434
+
435
+ | Export | Description |
436
+ | -------------- | --------------------------------------------------------- |
437
+ | `pairedTTest` | Paired t-test for continuous metric comparison |
438
+ | `mcnemarsTest` | McNemar's test for binary metric comparison |
439
+ | `mean` | Arithmetic mean |
440
+ | `median` | Median value |
441
+ | `stdDev` | Sample standard deviation |
442
+ | `percentile` | Arbitrary percentile |
443
+ | `aggregate` | Full stats: mean, median, min, max, stdDev, p50, p95, p99 |
444
+
445
+ ### Agent Tools
446
+
447
+ | Export | Description |
448
+ | ------------------- | ------------------------------------ |
449
+ | `createRunEvalTool` | Creates a `run_eval` tool for agents |
450
+ | `evalTools` | Returns all eval tools as an array |
451
+
452
+ ---
453
+
454
+ ## License
455
+
456
+ MIT
@@ -0,0 +1,11 @@
import type { AssertionFn, AggregatedMetric } from './index';
/**
 * Create a user-defined assertion.
 *
 * @param opts.name - Assertion name, used in the result and in the default
 *   pass/fail messages.
 * @param opts.check - Receives the aggregated metrics and run stats
 *   (total cases, duration, cost) and returns whether the assertion passed.
 * @param opts.message - Optional message used when the assertion fails;
 *   a generic failure message is produced when omitted.
 */
export declare function assertion(opts: {
    name: string;
    check: (aggregated: Record<string, AggregatedMetric>, stats: {
        total: number;
        duration: number;
        cost: number;
    }) => boolean;
    message?: string;
}): AssertionFn;
//# sourceMappingURL=custom.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"custom.d.ts","sourceRoot":"","sources":["../../src/assertions/custom.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,WAAW,EAAE,gBAAgB,EAAE,MAAM,SAAS,CAAC;AAE7D,wBAAgB,SAAS,CAAC,IAAI,EAAE;IAC9B,IAAI,EAAE,MAAM,CAAC;IACb,KAAK,EAAE,CACL,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,gBAAgB,CAAC,EAC5C,KAAK,EAAE;QAAE,KAAK,EAAE,MAAM,CAAC;QAAC,QAAQ,EAAE,MAAM,CAAC;QAAC,IAAI,EAAE,MAAM,CAAA;KAAE,KACrD,OAAO,CAAC;IACb,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB,GAAG,WAAW,CAYd"}
@@ -0,0 +1,13 @@
/**
 * Build a custom assertion from a user-supplied check function.
 *
 * The returned AssertionFn receives the aggregated metrics and run stats,
 * invokes `opts.check`, and wraps the boolean outcome in a standard
 * AssertionResult. On failure the caller-provided `opts.message` is used
 * when present; otherwise a generic failure message is produced.
 */
export function assertion(opts) {
    return (aggregated, stats) => {
        const ok = opts.check(aggregated, stats);
        const message = ok
            ? `Custom assertion '${opts.name}' passed`
            : (opts.message ?? `Custom assertion '${opts.name}' failed`);
        return { name: opts.name, passed: ok, message };
    };
}
//# sourceMappingURL=custom.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"custom.js","sourceRoot":"","sources":["../../src/assertions/custom.ts"],"names":[],"mappings":"AAEA,MAAM,UAAU,SAAS,CAAC,IAOzB;IACC,OAAO,CAAC,UAAU,EAAE,KAAK,EAAE,EAAE;QAC3B,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,UAAU,EAAE,KAAK,CAAC,CAAC;QAE7C,OAAO;YACL,IAAI,EAAE,IAAI,CAAC,IAAI;YACf,MAAM;YACN,OAAO,EAAE,MAAM;gBACb,CAAC,CAAC,qBAAqB,IAAI,CAAC,IAAI,UAAU;gBAC1C,CAAC,CAAC,CAAC,IAAI,CAAC,OAAO,IAAI,qBAAqB,IAAI,CAAC,IAAI,UAAU,CAAC;SAC/D,CAAC;IACJ,CAAC,CAAC;AACJ,CAAC"}
@@ -0,0 +1,27 @@
/** Per-metric summary statistics aggregated over all eval cases in a run. */
export interface AggregatedMetric {
    name: string;
    mean: number;
    median: number;
    min: number;
    max: number;
    stdDev: number;
    p50: number;
    p95: number;
    p99: number;
}
/** Outcome of a single assertion, with optional numeric context on failure. */
export interface AssertionResult {
    name: string;
    passed: boolean;
    message: string;
    actual?: number;
    expected?: number;
}
/**
 * An assertion function: takes the aggregated metrics plus run-level stats
 * (total cases, duration, cost) and produces a pass/fail result.
 */
export type AssertionFn = (aggregated: Record<string, AggregatedMetric>, stats: {
    total: number;
    duration: number;
    cost: number;
}) => AssertionResult;
export { threshold } from './threshold';
export { noRegression } from './regression';
export { assertion } from './custom';
//# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/assertions/index.ts"],"names":[],"mappings":"AAAA,MAAM,WAAW,gBAAgB;IAC/B,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,EAAE,MAAM,CAAC;IACf,GAAG,EAAE,MAAM,CAAC;IACZ,GAAG,EAAE,MAAM,CAAC;IACZ,MAAM,EAAE,MAAM,CAAC;IACf,GAAG,EAAE,MAAM,CAAC;IACZ,GAAG,EAAE,MAAM,CAAC;IACZ,GAAG,EAAE,MAAM,CAAC;CACb;AAED,MAAM,WAAW,eAAe;IAC9B,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,EAAE,OAAO,CAAC;IAChB,OAAO,EAAE,MAAM,CAAC;IAChB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,MAAM,WAAW,GAAG,CACxB,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,gBAAgB,CAAC,EAC5C,KAAK,EAAE;IAAE,KAAK,EAAE,MAAM,CAAC;IAAC,QAAQ,EAAE,MAAM,CAAC;IAAC,IAAI,EAAE,MAAM,CAAA;CAAE,KACrD,eAAe,CAAC;AAErB,OAAO,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AACxC,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAC5C,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC"}
@@ -0,0 +1,4 @@
// Barrel re-exports for the assertions module: metric threshold checks,
// baseline regression guards, and user-defined custom assertions.
export { threshold } from './threshold';
export { noRegression } from './regression';
export { assertion } from './custom';
//# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/assertions/index.ts"],"names":[],"mappings":"AAyBA,OAAO,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AACxC,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAC5C,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC"}
@@ -0,0 +1,5 @@
import type { AssertionFn } from './index';
/**
 * Build an assertion that compares the current run's aggregated metric means
 * against a saved baseline JSON file (a map of metric name -> mean value).
 *
 * @param baselinePath - Path to the baseline JSON file.
 * @param opts.tolerance - Allowed relative drift from the baseline before a
 *   regression is reported (default 0.05, i.e. 5%).
 */
export declare function noRegression(baselinePath: string, opts?: {
    tolerance?: number;
}): AssertionFn;
//# sourceMappingURL=regression.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"regression.d.ts","sourceRoot":"","sources":["../../src/assertions/regression.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,SAAS,CAAC;AAM3C,wBAAgB,YAAY,CAAC,YAAY,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE;IAAE,SAAS,CAAC,EAAE,MAAM,CAAA;CAAE,GAAG,WAAW,CAsD7F"}
@@ -0,0 +1,58 @@
import * as fs from 'node:fs';
/**
 * Detect metrics where a LOWER value is better (latency/cost style).
 * Mirrors the detection used by the `threshold` assertion — including
 * dotted paths like 'latency.p95' and '*Duration'/'*Latency' suffixes —
 * so the two assertions agree on direction.
 */
function isLowerBetter(name) {
    const base = name.split('.')[0];
    return (base.startsWith('latency') ||
        base.startsWith('cost') ||
        base.endsWith('Duration') ||
        base.endsWith('Latency'));
}
/**
 * Assert that no aggregated metric regressed relative to a saved baseline.
 *
 * The baseline file is a JSON map of metric name -> mean value. For
 * higher-is-better metrics a regression is a mean below
 * baseline * (1 - tolerance); for lower-is-better metrics it is a mean
 * above baseline * (1 + tolerance). Fails fast on the first regression.
 *
 * @param baselinePath - Path to the baseline JSON file.
 * @param opts.tolerance - Allowed relative drift (default 0.05 = 5%).
 */
export function noRegression(baselinePath, opts) {
    return (aggregated, _stats) => {
        let baseline;
        try {
            baseline = JSON.parse(fs.readFileSync(baselinePath, 'utf-8'));
        }
        catch {
            // Unreadable or malformed baseline is a hard failure, not a skip:
            // a CI guard that silently passes is worse than one that errors.
            return {
                name: 'noRegression',
                passed: false,
                message: `Failed to read baseline file: ${baselinePath}`,
            };
        }
        const tolerance = opts?.tolerance ?? 0.05;
        for (const [metric, baselineValue] of Object.entries(baseline)) {
            // Skip metrics absent from this run. Also skip non-numeric baseline
            // entries explicitly: previously they produced NaN comparisons that
            // could never trip, so this preserves behavior while making the
            // intent visible.
            const agg = aggregated[metric];
            if (!agg || typeof baselineValue !== 'number' || !Number.isFinite(baselineValue))
                continue;
            const actual = agg.mean;
            const lowerBetter = isLowerBetter(metric);
            const limit = lowerBetter
                ? baselineValue * (1 + tolerance)
                : baselineValue * (1 - tolerance);
            const regressed = lowerBetter ? actual > limit : actual < limit;
            if (regressed) {
                const cmp = lowerBetter ? '>' : '<';
                return {
                    name: 'noRegression',
                    passed: false,
                    message: `Regression in '${metric}': ${actual} ${cmp} ${limit} (baseline ${baselineValue}, tolerance ${tolerance * 100}%)`,
                    actual,
                    expected: limit,
                };
            }
        }
        return {
            name: 'noRegression',
            passed: true,
            message: 'All metrics within tolerance of baseline',
        };
    };
}
//# sourceMappingURL=regression.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"regression.js","sourceRoot":"","sources":["../../src/assertions/regression.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAG9B,SAAS,aAAa,CAAC,IAAY;IACjC,OAAO,IAAI,CAAC,UAAU,CAAC,SAAS,CAAC,IAAI,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC;AAC/D,CAAC;AAED,MAAM,UAAU,YAAY,CAAC,YAAoB,EAAE,IAA6B;IAC9E,OAAO,CAAC,UAAU,EAAE,MAAM,EAAE,EAAE;QAC5B,IAAI,QAAgC,CAAC;QACrC,IAAI,CAAC;YACH,MAAM,GAAG,GAAG,EAAE,CAAC,YAAY,CAAC,YAAY,EAAE,OAAO,CAAC,CAAC;YACnD,QAAQ,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;QAC7B,CAAC;QAAC,MAAM,CAAC;YACP,OAAO;gBACL,IAAI,EAAE,cAAc;gBACpB,MAAM,EAAE,KAAK;gBACb,OAAO,EAAE,iCAAiC,YAAY,EAAE;aACzD,CAAC;QACJ,CAAC;QAED,MAAM,SAAS,GAAG,IAAI,EAAE,SAAS,IAAI,IAAI,CAAC;QAE1C,KAAK,MAAM,CAAC,MAAM,EAAE,aAAa,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,QAAQ,CAAC,EAAE,CAAC;YAC/D,MAAM,GAAG,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC;YAC/B,IAAI,CAAC,GAAG;gBAAE,SAAS;YAEnB,MAAM,MAAM,GAAG,GAAG,CAAC,IAAI,CAAC;YACxB,MAAM,WAAW,GAAG,aAAa,CAAC,MAAM,CAAC,CAAC;YAE1C,IAAI,WAAW,EAAE,CAAC;gBAChB,MAAM,KAAK,GAAG,aAAa,GAAG,CAAC,CAAC,GAAG,SAAS,CAAC,CAAC;gBAC9C,IAAI,MAAM,GAAG,KAAK,EAAE,CAAC;oBACnB,OAAO;wBACL,IAAI,EAAE,cAAc;wBACpB,MAAM,EAAE,KAAK;wBACb,OAAO,EAAE,kBAAkB,MAAM,MAAM,MAAM,MAAM,KAAK,cAAc,aAAa,eAAe,SAAS,GAAG,GAAG,IAAI;wBACrH,MAAM;wBACN,QAAQ,EAAE,KAAK;qBAChB,CAAC;gBACJ,CAAC;YACH,CAAC;iBAAM,CAAC;gBACN,MAAM,KAAK,GAAG,aAAa,GAAG,CAAC,CAAC,GAAG,SAAS,CAAC,CAAC;gBAC9C,IAAI,MAAM,GAAG,KAAK,EAAE,CAAC;oBACnB,OAAO;wBACL,IAAI,EAAE,cAAc;wBACpB,MAAM,EAAE,KAAK;wBACb,OAAO,EAAE,kBAAkB,MAAM,MAAM,MAAM,MAAM,KAAK,cAAc,aAAa,eAAe,SAAS,GAAG,GAAG,IAAI;wBACrH,MAAM;wBACN,QAAQ,EAAE,KAAK;qBAChB,CAAC;gBACJ,CAAC;YACH,CAAC;QACH,CAAC;QAED,OAAO;YACL,IAAI,EAAE,cAAc;YACpB,MAAM,EAAE,IAAI;YACZ,OAAO,EAAE,0CAA0C;SACpD,CAAC;IACJ,CAAC,CAAC;AACJ,CAAC"}
@@ -0,0 +1,3 @@
import type { AssertionFn } from './index';
/**
 * Build an assertion that a metric's aggregated value clears `value`.
 * `metricName` may be a bare name (resolves to the mean) or dotted
 * (e.g. 'latency.p95'). Lower-is-better metrics (latency/cost) must be
 * <= value; all others must be >= value.
 */
export declare function threshold(metricName: string, value: number): AssertionFn;
//# sourceMappingURL=threshold.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"threshold.d.ts","sourceRoot":"","sources":["../../src/assertions/threshold.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,WAAW,EAAoB,MAAM,SAAS,CAAC;AA6B7D,wBAAgB,SAAS,CAAC,UAAU,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,GAAG,WAAW,CA0BxE"}
@@ -0,0 +1,45 @@
/**
 * True for metric names where a SMALLER value is better: latency/cost
 * prefixes, or '*Duration'/'*Latency' suffixed names. Only the segment
 * before the first '.' is inspected, so 'latency.p95' still counts.
 */
function isLowerBetter(name) {
    const [base] = name.split('.');
    if (base.startsWith('latency') || base.startsWith('cost'))
        return true;
    return base.endsWith('Duration') || base.endsWith('Latency');
}
/**
 * Look up a metric value by path. A bare name resolves to the metric's
 * mean; 'name.field' resolves to a numeric field (e.g. 'p95') on it.
 * Returns { value, found } so callers can distinguish 0 from "missing".
 */
function resolve(aggregated, path) {
    const [head, field] = path.split('.');
    const metric = aggregated[head];
    if (!metric)
        return { value: 0, found: false };
    if (field === undefined)
        return { value: metric.mean, found: true };
    const candidate = metric[field];
    return typeof candidate === 'number'
        ? { value: candidate, found: true }
        : { value: 0, found: false };
}
/**
 * Build an assertion that a metric's aggregated value clears `value`.
 * Direction is auto-detected: lower-is-better metrics must be <= value,
 * all others >= value. Missing metrics fail with an explanatory message.
 */
export function threshold(metricName, value) {
    return (aggregated, _stats) => {
        const { value: actual, found } = resolve(aggregated, metricName);
        if (!found) {
            return {
                name: `threshold(${metricName})`,
                passed: false,
                message: `Metric '${metricName}' not found in aggregated results`,
            };
        }
        const lowerBetter = isLowerBetter(metricName);
        const passed = lowerBetter ? actual <= value : actual >= value;
        const direction = lowerBetter ? '<=' : '>=';
        const message = passed
            ? `${metricName} = ${actual} ${direction} ${value}`
            : `${metricName} = ${actual}, expected ${direction} ${value}`;
        return {
            name: `threshold(${metricName})`,
            passed,
            message,
            actual,
            expected: value,
        };
    };
}
//# sourceMappingURL=threshold.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"threshold.js","sourceRoot":"","sources":["../../src/assertions/threshold.ts"],"names":[],"mappings":"AAEA,SAAS,aAAa,CAAC,IAAY;IACjC,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;IAChC,OAAO,CACL,IAAI,CAAC,UAAU,CAAC,SAAS,CAAC;QAC1B,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC;QACvB,IAAI,CAAC,QAAQ,CAAC,UAAU,CAAC;QACzB,IAAI,CAAC,QAAQ,CAAC,SAAS,CAAC,CACzB,CAAC;AACJ,CAAC;AAED,SAAS,OAAO,CACd,UAA4C,EAC5C,IAAY;IAEZ,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;IAC9B,MAAM,MAAM,GAAG,UAAU,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;IACpC,IAAI,CAAC,MAAM;QAAE,OAAO,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,KAAK,EAAE,CAAC;IAE/C,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,KAAK,EAAE,MAAM,CAAC,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC;IAEnE,MAAM,KAAK,GAAG,KAAK,CAAC,CAAC,CAA2B,CAAC;IACjD,MAAM,GAAG,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC;IAC1B,IAAI,OAAO,GAAG,KAAK,QAAQ;QAAE,OAAO,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,KAAK,EAAE,CAAC;IAE/D,OAAO,EAAE,KAAK,EAAE,GAAG,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC;AACrC,CAAC;AAED,MAAM,UAAU,SAAS,CAAC,UAAkB,EAAE,KAAa;IACzD,OAAO,CAAC,UAAU,EAAE,MAAM,EAAE,EAAE;QAC5B,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,KAAK,EAAE,GAAG,OAAO,CAAC,UAAU,EAAE,UAAU,CAAC,CAAC;QAEjE,IAAI,CAAC,KAAK,EAAE,CAAC;YACX,OAAO;gBACL,IAAI,EAAE,aAAa,UAAU,GAAG;gBAChC,MAAM,EAAE,KAAK;gBACb,OAAO,EAAE,WAAW,UAAU,mCAAmC;aAClE,CAAC;QACJ,CAAC;QAED,MAAM,WAAW,GAAG,aAAa,CAAC,UAAU,CAAC,CAAC;QAC9C,MAAM,MAAM,GAAG,WAAW,CAAC,CAAC,CAAC,MAAM,IAAI,KAAK,CAAC,CAAC,CAAC,MAAM,IAAI,KAAK,CAAC;QAC/D,MAAM,SAAS,GAAG,WAAW,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC;QAE5C,OAAO;YACL,IAAI,EAAE,aAAa,UAAU,GAAG;YAChC,MAAM;YACN,OAAO,EAAE,MAAM;gBACb,CAAC,CAAC,GAAG,UAAU,MAAM,MAAM,IAAI,SAAS,IAAI,KAAK,EAAE;gBACnD,CAAC,CAAC,GAAG,UAAU,MAAM,MAAM,cAAc,SAAS,IAAI,KAAK,EAAE;YAC/D,MAAM;YACN,QAAQ,EAAE,KAAK;SAChB,CAAC;IACJ,CAAC,CAAC;AACJ,CAAC"}
@@ -0,0 +1,3 @@
import type { EvalCase } from '../schema';
/**
 * Load eval cases from a CSV file. The CSV must have an `input` column;
 * `expected`, `metadata.*`, and `context.*` columns are optional.
 * NOTE(review): per the package README this requires the optional
 * `papaparse` dependency — confirm against the implementation.
 */
export declare function loadCsv(path: string): Promise<EvalCase[]>;
//# sourceMappingURL=csv-loader.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"csv-loader.d.ts","sourceRoot":"","sources":["../../src/datasets/csv-loader.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,WAAW,CAAC;AAE1C,wBAAsB,OAAO,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,QAAQ,EAAE,CAAC,CA6C/D"}