@cogitator-ai/evals 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118) hide show
  1. package/README.md +456 -0
  2. package/dist/assertions/custom.d.ts +11 -0
  3. package/dist/assertions/custom.d.ts.map +1 -0
  4. package/dist/assertions/custom.js +13 -0
  5. package/dist/assertions/custom.js.map +1 -0
  6. package/dist/assertions/index.d.ts +27 -0
  7. package/dist/assertions/index.d.ts.map +1 -0
  8. package/dist/assertions/index.js +4 -0
  9. package/dist/assertions/index.js.map +1 -0
  10. package/dist/assertions/regression.d.ts +5 -0
  11. package/dist/assertions/regression.d.ts.map +1 -0
  12. package/dist/assertions/regression.js +58 -0
  13. package/dist/assertions/regression.js.map +1 -0
  14. package/dist/assertions/threshold.d.ts +3 -0
  15. package/dist/assertions/threshold.d.ts.map +1 -0
  16. package/dist/assertions/threshold.js +45 -0
  17. package/dist/assertions/threshold.js.map +1 -0
  18. package/dist/datasets/csv-loader.d.ts +3 -0
  19. package/dist/datasets/csv-loader.d.ts.map +1 -0
  20. package/dist/datasets/csv-loader.js +43 -0
  21. package/dist/datasets/csv-loader.js.map +1 -0
  22. package/dist/datasets/dataset.d.ts +15 -0
  23. package/dist/datasets/dataset.d.ts.map +1 -0
  24. package/dist/datasets/dataset.js +62 -0
  25. package/dist/datasets/dataset.js.map +1 -0
  26. package/dist/datasets/index.d.ts +4 -0
  27. package/dist/datasets/index.d.ts.map +1 -0
  28. package/dist/datasets/index.js +4 -0
  29. package/dist/datasets/index.js.map +1 -0
  30. package/dist/datasets/jsonl-loader.d.ts +3 -0
  31. package/dist/datasets/jsonl-loader.d.ts.map +1 -0
  32. package/dist/datasets/jsonl-loader.js +27 -0
  33. package/dist/datasets/jsonl-loader.js.map +1 -0
  34. package/dist/eval-builder.d.ts +30 -0
  35. package/dist/eval-builder.d.ts.map +1 -0
  36. package/dist/eval-builder.js +82 -0
  37. package/dist/eval-builder.js.map +1 -0
  38. package/dist/eval-comparison.d.ts +43 -0
  39. package/dist/eval-comparison.d.ts.map +1 -0
  40. package/dist/eval-comparison.js +125 -0
  41. package/dist/eval-comparison.js.map +1 -0
  42. package/dist/eval-suite.d.ts +63 -0
  43. package/dist/eval-suite.d.ts.map +1 -0
  44. package/dist/eval-suite.js +230 -0
  45. package/dist/eval-suite.js.map +1 -0
  46. package/dist/index.d.ts +31 -0
  47. package/dist/index.d.ts.map +1 -0
  48. package/dist/index.js +20 -0
  49. package/dist/index.js.map +1 -0
  50. package/dist/metrics/custom.d.ts +18 -0
  51. package/dist/metrics/custom.d.ts.map +1 -0
  52. package/dist/metrics/custom.js +28 -0
  53. package/dist/metrics/custom.js.map +1 -0
  54. package/dist/metrics/deterministic.d.ts +11 -0
  55. package/dist/metrics/deterministic.d.ts.map +1 -0
  56. package/dist/metrics/deterministic.js +74 -0
  57. package/dist/metrics/deterministic.js.map +1 -0
  58. package/dist/metrics/index.d.ts +8 -0
  59. package/dist/metrics/index.d.ts.map +1 -0
  60. package/dist/metrics/index.js +5 -0
  61. package/dist/metrics/index.js.map +1 -0
  62. package/dist/metrics/llm-judge.d.ts +27 -0
  63. package/dist/metrics/llm-judge.d.ts.map +1 -0
  64. package/dist/metrics/llm-judge.js +77 -0
  65. package/dist/metrics/llm-judge.js.map +1 -0
  66. package/dist/metrics/statistical.d.ts +5 -0
  67. package/dist/metrics/statistical.d.ts.map +1 -0
  68. package/dist/metrics/statistical.js +85 -0
  69. package/dist/metrics/statistical.js.map +1 -0
  70. package/dist/metrics/types.d.ts +31 -0
  71. package/dist/metrics/types.d.ts.map +1 -0
  72. package/dist/metrics/types.js +2 -0
  73. package/dist/metrics/types.js.map +1 -0
  74. package/dist/reporters/ci.d.ts +3 -0
  75. package/dist/reporters/ci.d.ts.map +1 -0
  76. package/dist/reporters/ci.js +21 -0
  77. package/dist/reporters/ci.js.map +1 -0
  78. package/dist/reporters/console.d.ts +3 -0
  79. package/dist/reporters/console.d.ts.map +1 -0
  80. package/dist/reporters/console.js +46 -0
  81. package/dist/reporters/console.js.map +1 -0
  82. package/dist/reporters/csv.d.ts +5 -0
  83. package/dist/reporters/csv.d.ts.map +1 -0
  84. package/dist/reporters/csv.js +31 -0
  85. package/dist/reporters/csv.js.map +1 -0
  86. package/dist/reporters/index.d.ts +50 -0
  87. package/dist/reporters/index.d.ts.map +1 -0
  88. package/dist/reporters/index.js +28 -0
  89. package/dist/reporters/index.js.map +1 -0
  90. package/dist/reporters/json.d.ts +5 -0
  91. package/dist/reporters/json.d.ts.map +1 -0
  92. package/dist/reporters/json.js +5 -0
  93. package/dist/reporters/json.js.map +1 -0
  94. package/dist/schema.d.ts +29 -0
  95. package/dist/schema.d.ts.map +1 -0
  96. package/dist/schema.js +23 -0
  97. package/dist/schema.js.map +1 -0
  98. package/dist/stats/index.d.ts +6 -0
  99. package/dist/stats/index.d.ts.map +1 -0
  100. package/dist/stats/index.js +4 -0
  101. package/dist/stats/index.js.map +1 -0
  102. package/dist/stats/mcnemar.d.ts +7 -0
  103. package/dist/stats/mcnemar.d.ts.map +1 -0
  104. package/dist/stats/mcnemar.js +34 -0
  105. package/dist/stats/mcnemar.js.map +1 -0
  106. package/dist/stats/percentiles.d.ts +15 -0
  107. package/dist/stats/percentiles.d.ts.map +1 -0
  108. package/dist/stats/percentiles.js +54 -0
  109. package/dist/stats/percentiles.js.map +1 -0
  110. package/dist/stats/t-test.d.ts +9 -0
  111. package/dist/stats/t-test.d.ts.map +1 -0
  112. package/dist/stats/t-test.js +129 -0
  113. package/dist/stats/t-test.js.map +1 -0
  114. package/dist/tools.d.ts +16 -0
  115. package/dist/tools.d.ts.map +1 -0
  116. package/dist/tools.js +58 -0
  117. package/dist/tools.js.map +1 -0
  118. package/package.json +57 -0
@@ -0,0 +1,43 @@
1
+ import { readFileSync } from 'node:fs';
2
+ import { EvalCaseSchema } from '../schema';
3
/**
 * Load evaluation cases from a CSV file.
 *
 * The header row must contain an `input` column. A non-empty `expected`
 * column is copied onto the case. Columns named `metadata.<key>` and
 * `context.<key>` are gathered into the case's `metadata` / `context`
 * objects; all other columns are ignored. Every assembled row is validated
 * with `EvalCaseSchema.parse`.
 *
 * @param {string} path - CSV file to read (synchronously, as UTF-8).
 * @returns {Promise<object[]>} One validated case per non-empty CSV row.
 * @throws {Error} When `papaparse` cannot be imported (the underlying failure
 *   is attached as `cause`) or when the header has no `input` column. File
 *   system and schema-validation errors propagate unchanged.
 */
export async function loadCsv(path) {
  let papaModule;
  try {
    papaModule = await import('papaparse');
  } catch (err) {
    // Keep the real reason (missing package, broken install, ...) reachable.
    throw new Error(
      'papaparse is required for CSV loading. Install it with: npm install papaparse',
      { cause: err },
    );
  }
  // papaparse ships as CommonJS/UMD: depending on the loader, `parse` is either
  // a named export of the namespace or only reachable through `default`.
  const Papa = typeof papaModule.parse === 'function' ? papaModule : papaModule.default;

  const content = readFileSync(path, 'utf-8');
  const { data, meta } = Papa.parse(content, {
    header: true,
    skipEmptyLines: true,
  });
  if (!meta.fields?.includes('input')) {
    throw new Error('CSV must have an "input" column');
  }

  return data.map((row) => {
    const evalCase = { input: row.input };
    if (row.expected !== undefined && row.expected !== '') {
      evalCase.expected = row.expected;
    }

    const metadata = {};
    const context = {};
    for (const [key, value] of Object.entries(row)) {
      if (key === 'input' || key === 'expected') {
        continue;
      }
      if (key.startsWith('metadata.')) {
        metadata[key.slice('metadata.'.length)] = value;
      } else if (key.startsWith('context.')) {
        context[key.slice('context.'.length)] = value;
      }
    }

    // Only attach the containers when at least one prefixed column was present.
    if (Object.keys(metadata).length > 0) {
      evalCase.metadata = metadata;
    }
    if (Object.keys(context).length > 0) {
      evalCase.context = context;
    }
    return EvalCaseSchema.parse(evalCase);
  });
}
43
+ //# sourceMappingURL=csv-loader.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"csv-loader.js","sourceRoot":"","sources":["../../src/datasets/csv-loader.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,SAAS,CAAC;AACvC,OAAO,EAAE,cAAc,EAAE,MAAM,WAAW,CAAC;AAG3C,MAAM,CAAC,KAAK,UAAU,OAAO,CAAC,IAAY;IACxC,IAAI,IAAgC,CAAC;IACrC,IAAI,CAAC;QACH,IAAI,GAAG,MAAM,MAAM,CAAC,WAAW,CAAC,CAAC;IACnC,CAAC;IAAC,MAAM,CAAC;QACP,MAAM,IAAI,KAAK,CACb,+EAA+E,CAChF,CAAC;IACJ,CAAC;IAED,MAAM,OAAO,GAAG,YAAY,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;IAC5C,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,GAAG,IAAI,CAAC,KAAK,CAAyB,OAAO,EAAE;QACjE,MAAM,EAAE,IAAI;QACZ,cAAc,EAAE,IAAI;KACrB,CAAC,CAAC;IAEH,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,QAAQ,CAAC,OAAO,CAAC,EAAE,CAAC;QACpC,MAAM,IAAI,KAAK,CAAC,iCAAiC,CAAC,CAAC;IACrD,CAAC;IAED,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE;QACtB,MAAM,QAAQ,GAA4B,EAAE,KAAK,EAAE,GAAG,CAAC,KAAK,EAAE,CAAC;QAE/D,IAAI,GAAG,CAAC,QAAQ,KAAK,SAAS,IAAI,GAAG,CAAC,QAAQ,KAAK,EAAE,EAAE,CAAC;YACtD,QAAQ,CAAC,QAAQ,GAAG,GAAG,CAAC,QAAQ,CAAC;QACnC,CAAC;QAED,MAAM,QAAQ,GAA4B,EAAE,CAAC;QAC7C,MAAM,OAAO,GAA4B,EAAE,CAAC;QAE5C,KAAK,MAAM,CAAC,GAAG,EAAE,KAAK,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC;YAC/C,IAAI,GAAG,KAAK,OAAO,IAAI,GAAG,KAAK,UAAU;gBAAE,SAAS;YAEpD,IAAI,GAAG,CAAC,UAAU,CAAC,WAAW,CAAC,EAAE,CAAC;gBAChC,QAAQ,CAAC,GAAG,CAAC,KAAK,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC,GAAG,KAAK,CAAC;YAClD,CAAC;iBAAM,IAAI,GAAG,CAAC,UAAU,CAAC,UAAU,CAAC,EAAE,CAAC;gBACtC,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC,GAAG,KAAK,CAAC;YAChD,CAAC;QACH,CAAC;QAED,IAAI,MAAM,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,MAAM,GAAG,CAAC;YAAE,QAAQ,CAAC,QAAQ,GAAG,QAAQ,CAAC;QACnE,IAAI,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,MAAM,GAAG,CAAC;YAAE,QAAQ,CAAC,OAAO,GAAG,OAAO,CAAC;QAEhE,OAAO,cAAc,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;IACxC,CAAC,CAAC,CAAC;AACL,CAAC"}
@@ -0,0 +1,15 @@
1
+ import type { EvalCase } from '../schema';
2
/**
 * Read-only collection of evaluation cases.
 *
 * The constructor is private; instances come from `from`, `fromJsonl` or
 * `fromCsv`. `filter`, `sample` and `shuffle` each return a new dataset.
 */
export declare class Dataset {
    private readonly _cases;
    private constructor();
    /** Wrap an in-memory list of cases. */
    static from(cases: Array<EvalCase>): Dataset;
    /** Load cases from the JSONL file at `path`. */
    static fromJsonl(path: string): Promise<Dataset>;
    /** Load cases from the CSV file at `path`. */
    static fromCsv(path: string): Promise<Dataset>;
    /** Number of cases held. */
    get length(): number;
    /** The underlying case list. */
    get cases(): ReadonlyArray<EvalCase>;
    [Symbol.iterator](): Iterator<EvalCase>;
    /** New dataset containing the cases for which `fn` returns true. */
    filter(fn: (c: EvalCase) => boolean): Dataset;
    /** New dataset with a random selection of at most `n` cases. */
    sample(n: number): Dataset;
    /** New dataset with the same cases in random order. */
    shuffle(): Dataset;
}
15
+ //# sourceMappingURL=dataset.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"dataset.d.ts","sourceRoot":"","sources":["../../src/datasets/dataset.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,WAAW,CAAC;AAI1C,qBAAa,OAAO;IAClB,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAsB;IAE7C,OAAO;IAIP,MAAM,CAAC,IAAI,CAAC,KAAK,EAAE,QAAQ,EAAE,GAAG,OAAO;WAI1B,SAAS,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,OAAO,CAAC;WAKzC,OAAO,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,OAAO,CAAC;IAKpD,IAAI,MAAM,IAAI,MAAM,CAEnB;IAED,IAAI,KAAK,IAAI,SAAS,QAAQ,EAAE,CAE/B;IAED,CAAC,MAAM,CAAC,QAAQ,CAAC,IAAI,QAAQ,CAAC,QAAQ,CAAC;IAavC,MAAM,CAAC,EAAE,EAAE,CAAC,CAAC,EAAE,QAAQ,KAAK,OAAO,GAAG,OAAO;IAI7C,MAAM,CAAC,CAAC,EAAE,MAAM,GAAG,OAAO;IAa1B,OAAO,IAAI,OAAO;CAQnB"}
@@ -0,0 +1,62 @@
1
+ import { EvalCaseSchema } from '../schema';
2
+ import { loadJsonl } from './jsonl-loader';
3
+ import { loadCsv } from './csv-loader';
4
/**
 * Read-only collection of evaluation cases.
 *
 * Every case handed to the constructor is passed through
 * `EvalCaseSchema.parse`, and the resulting array is frozen. Derived datasets
 * (`filter`, `sample`, `shuffle`) are new instances; the receiver is untouched.
 */
export class Dataset {
  _cases;

  constructor(cases) {
    const validated = cases.map((entry) => EvalCaseSchema.parse(entry));
    this._cases = Object.freeze(validated);
  }

  /** Build a dataset from an in-memory list of cases. */
  static from(cases) {
    return new Dataset(cases);
  }

  /** Build a dataset from the cases `loadJsonl` reads at `path`. */
  static async fromJsonl(path) {
    return Dataset.from(await loadJsonl(path));
  }

  /** Build a dataset from the cases `loadCsv` reads at `path`. */
  static async fromCsv(path) {
    return Dataset.from(await loadCsv(path));
  }

  get length() {
    return this._cases.length;
  }

  get cases() {
    return this._cases;
  }

  /** Yields the cases in stored order. */
  [Symbol.iterator]() {
    const items = this._cases;
    let cursor = 0;
    return {
      next: () =>
        cursor < items.length
          ? { value: items[cursor++], done: false }
          : { value: undefined, done: true },
    };
  }

  /** New dataset holding only the cases accepted by `fn`. */
  filter(fn) {
    const kept = this._cases.filter(fn);
    return new Dataset(kept);
  }

  /**
   * New dataset with at most `n` cases picked at random (Fisher-Yates over the
   * index list, then a prefix is taken). Empty when `n <= 0` or there are no cases.
   */
  sample(n) {
    const total = this._cases.length;
    if (n <= 0 || total === 0) {
      return new Dataset([]);
    }
    const order = [...Array(total).keys()];
    for (let hi = total - 1; hi > 0; hi--) {
      const pick = Math.floor(Math.random() * (hi + 1));
      const held = order[hi];
      order[hi] = order[pick];
      order[pick] = held;
    }
    const chosen = order.slice(0, Math.min(n, total));
    return new Dataset(chosen.map((idx) => this._cases[idx]));
  }

  /** New dataset with the same cases in random (Fisher-Yates) order. */
  shuffle() {
    const deck = Array.from(this._cases);
    for (let hi = deck.length - 1; hi > 0; hi--) {
      const pick = Math.floor(Math.random() * (hi + 1));
      const held = deck[hi];
      deck[hi] = deck[pick];
      deck[pick] = held;
    }
    return new Dataset(deck);
  }
}
62
+ //# sourceMappingURL=dataset.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"dataset.js","sourceRoot":"","sources":["../../src/datasets/dataset.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,cAAc,EAAE,MAAM,WAAW,CAAC;AAE3C,OAAO,EAAE,SAAS,EAAE,MAAM,gBAAgB,CAAC;AAC3C,OAAO,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC;AAEvC,MAAM,OAAO,OAAO;IACD,MAAM,CAAsB;IAE7C,YAAoB,KAAiB;QACnC,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,cAAc,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IACzE,CAAC;IAED,MAAM,CAAC,IAAI,CAAC,KAAiB;QAC3B,OAAO,IAAI,OAAO,CAAC,KAAK,CAAC,CAAC;IAC5B,CAAC;IAED,MAAM,CAAC,KAAK,CAAC,SAAS,CAAC,IAAY;QACjC,MAAM,KAAK,GAAG,MAAM,SAAS,CAAC,IAAI,CAAC,CAAC;QACpC,OAAO,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;IAC7B,CAAC;IAED,MAAM,CAAC,KAAK,CAAC,OAAO,CAAC,IAAY;QAC/B,MAAM,KAAK,GAAG,MAAM,OAAO,CAAC,IAAI,CAAC,CAAC;QAClC,OAAO,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;IAC7B,CAAC;IAED,IAAI,MAAM;QACR,OAAO,IAAI,CAAC,MAAM,CAAC,MAAM,CAAC;IAC5B,CAAC;IAED,IAAI,KAAK;QACP,OAAO,IAAI,CAAC,MAAM,CAAC;IACrB,CAAC;IAED,CAAC,MAAM,CAAC,QAAQ,CAAC;QACf,IAAI,KAAK,GAAG,CAAC,CAAC;QACd,MAAM,KAAK,GAAG,IAAI,CAAC,MAAM,CAAC;QAC1B,OAAO;YACL,IAAI;gBACF,IAAI,KAAK,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC;oBACzB,OAAO,EAAE,KAAK,EAAE,KAAK,CAAC,KAAK,EAAE,CAAC,EAAE,IAAI,EAAE,KAAK,EAAE,CAAC;gBAChD,CAAC;gBACD,OAAO,EAAE,KAAK,EAAE,SAAkB,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;YACnD,CAAC;SACF,CAAC;IACJ,CAAC;IAED,MAAM,CAAC,EAA4B;QACjC,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,MAAM,CAAC,EAAE,CAAe,CAAC,CAAC;IAC3D,CAAC;IAED,MAAM,CAAC,CAAS;QACd,IAAI,CAAC,IAAI,CAAC,IAAI,IAAI,CAAC,MAAM,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACvC,OAAO,IAAI,OAAO,CAAC,EAAE,CAAC,CAAC;QACzB,CAAC;QACD,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC;QAC9C,MAAM,OAAO,GAAG,KAAK,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,IAAI,CAAC,MAAM,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC;QACxE,KAAK,IAAI,CAAC,GAAG,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YAC5C,MAAM,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;YAC9C,CAAC,OAAO,CAAC,CA
AC,CAAC,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC;QACtD,CAAC;QACD,OAAO,IAAI,OAAO,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,CAAe,CAAC,CAAC;IACvF,CAAC;IAED,OAAO;QACL,MAAM,GAAG,GAAG,CAAC,GAAG,IAAI,CAAC,MAAM,CAAe,CAAC;QAC3C,KAAK,IAAI,CAAC,GAAG,GAAG,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YACxC,MAAM,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;YAC9C,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;QACtC,CAAC;QACD,OAAO,IAAI,OAAO,CAAC,GAAG,CAAC,CAAC;IAC1B,CAAC;CACF"}
@@ -0,0 +1,4 @@
1
+ export { Dataset } from './dataset';
2
+ export { loadJsonl } from './jsonl-loader';
3
+ export { loadCsv } from './csv-loader';
4
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/datasets/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACpC,OAAO,EAAE,SAAS,EAAE,MAAM,gBAAgB,CAAC;AAC3C,OAAO,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC"}
@@ -0,0 +1,4 @@
1
+ export { Dataset } from './dataset';
2
+ export { loadJsonl } from './jsonl-loader';
3
+ export { loadCsv } from './csv-loader';
4
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/datasets/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACpC,OAAO,EAAE,SAAS,EAAE,MAAM,gBAAgB,CAAC;AAC3C,OAAO,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC"}
@@ -0,0 +1,3 @@
1
+ import type { EvalCase } from '../schema';
2
/**
 * Read evaluation cases from a JSON Lines file.
 *
 * @param path File to read.
 * @returns The cases found in the file.
 */
export declare function loadJsonl(path: string): Promise<Array<EvalCase>>;
3
+ //# sourceMappingURL=jsonl-loader.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"jsonl-loader.d.ts","sourceRoot":"","sources":["../../src/datasets/jsonl-loader.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,WAAW,CAAC;AAE1C,wBAAsB,SAAS,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,QAAQ,EAAE,CAAC,CAyBjE"}
@@ -0,0 +1,27 @@
1
+ import { createReadStream } from 'node:fs';
2
+ import { createInterface } from 'node:readline';
3
+ import { EvalCaseSchema } from '../schema';
4
/**
 * Load evaluation cases from a JSON Lines file.
 *
 * The file is streamed line by line. Lines that are empty after trimming are
 * skipped (they still count towards the reported line number). Each remaining
 * line must be valid JSON and is validated with `EvalCaseSchema.parse`.
 *
 * @param {string} path - JSONL file to read as UTF-8.
 * @returns {Promise<object[]>} Validated cases in file order.
 * @throws {Error} `Invalid JSON at line N: ...` when a line cannot be parsed
 *   (the parser's error is attached as `cause`). Schema-validation and file
 *   system errors propagate unchanged.
 */
export async function loadJsonl(path) {
  const cases = [];
  let lineNumber = 0;
  const stream = createReadStream(path, { encoding: 'utf-8' });
  const rl = createInterface({
    input: stream,
    crlfDelay: Infinity,
  });

  try {
    for await (const line of rl) {
      lineNumber++;
      const trimmed = line.trim();
      if (!trimmed) {
        continue;
      }
      let parsed;
      try {
        parsed = JSON.parse(trimmed);
      } catch (err) {
        throw new Error(`Invalid JSON at line ${lineNumber}: ${trimmed}`, { cause: err });
      }
      cases.push(EvalCaseSchema.parse(parsed));
    }
  } finally {
    // Closing the readline interface does not destroy its input stream, so an
    // early exit (bad line, validation failure) would otherwise leave the file
    // descriptor open.
    rl.close();
    stream.destroy();
  }
  return cases;
}
27
+ //# sourceMappingURL=jsonl-loader.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"jsonl-loader.js","sourceRoot":"","sources":["../../src/datasets/jsonl-loader.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,gBAAgB,EAAE,MAAM,SAAS,CAAC;AAC3C,OAAO,EAAE,eAAe,EAAE,MAAM,eAAe,CAAC;AAChD,OAAO,EAAE,cAAc,EAAE,MAAM,WAAW,CAAC;AAG3C,MAAM,CAAC,KAAK,UAAU,SAAS,CAAC,IAAY;IAC1C,MAAM,KAAK,GAAe,EAAE,CAAC;IAC7B,IAAI,UAAU,GAAG,CAAC,CAAC;IAEnB,MAAM,EAAE,GAAG,eAAe,CAAC;QACzB,KAAK,EAAE,gBAAgB,CAAC,IAAI,EAAE,EAAE,QAAQ,EAAE,OAAO,EAAE,CAAC;QACpD,SAAS,EAAE,QAAQ;KACpB,CAAC,CAAC;IAEH,IAAI,KAAK,EAAE,MAAM,IAAI,IAAI,EAAE,EAAE,CAAC;QAC5B,UAAU,EAAE,CAAC;QACb,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC;QAC5B,IAAI,CAAC,OAAO;YAAE,SAAS;QAEvB,IAAI,MAAe,CAAC;QACpB,IAAI,CAAC;YACH,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;QAC/B,CAAC;QAAC,MAAM,CAAC;YACP,MAAM,IAAI,KAAK,CAAC,wBAAwB,UAAU,KAAK,OAAO,EAAE,CAAC,CAAC;QACpE,CAAC;QAED,KAAK,CAAC,IAAI,CAAC,cAAc,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC;IAC3C,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC"}
@@ -0,0 +1,30 @@
1
+ import { EvalSuite } from './eval-suite';
2
+ import type { EvalTarget, EvalSuiteOptions } from './eval-suite';
3
+ import { Dataset } from './datasets';
4
+ import type { MetricFn, StatisticalMetricFn } from './metrics/types';
5
+ import type { JudgeConfig } from './schema';
6
+ import type { AssertionFn } from './assertions';
7
/**
 * Fluent builder for an `EvalSuite`.
 *
 * Each `with*` / `onProgress` call stores one option and returns the builder
 * so calls can be chained; `build()` produces the suite.
 */
export declare class EvalBuilder {
    private _dataset?;
    private _target?;
    private _metrics;
    private _statisticalMetrics;
    private _judge?;
    private _assertions;
    private _concurrency?;
    private _timeout?;
    private _retries?;
    private _onProgress?;
    /** Dataset to evaluate (required before `build()`). */
    withDataset(dataset: Dataset): this;
    /** Target to evaluate (required before `build()`). */
    withTarget(target: EvalTarget): this;
    /** Replace the list of per-case metrics. */
    withMetrics(metrics: Array<MetricFn>): this;
    /** Replace the list of statistical metrics. */
    withStatisticalMetrics(metrics: Array<StatisticalMetricFn>): this;
    /** Judge configuration; required when any metric needs a judge. */
    withJudge(config: JudgeConfig): this;
    /** Replace the list of assertions. */
    withAssertions(assertions: Array<AssertionFn>): this;
    withConcurrency(n: number): this;
    withTimeout(ms: number): this;
    withRetries(n: number): this;
    /** Progress callback forwarded to the suite. */
    onProgress(fn: EvalSuiteOptions['onProgress']): this;
    /** Create the suite; throws when dataset, target or a required judge is missing. */
    build(): EvalSuite;
}
30
+ //# sourceMappingURL=eval-builder.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"eval-builder.d.ts","sourceRoot":"","sources":["../src/eval-builder.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;AACzC,OAAO,KAAK,EAAE,UAAU,EAAE,gBAAgB,EAAE,MAAM,cAAc,CAAC;AACjE,OAAO,EAAE,OAAO,EAAE,MAAM,YAAY,CAAC;AACrC,OAAO,KAAK,EAAE,QAAQ,EAAE,mBAAmB,EAAE,MAAM,iBAAiB,CAAC;AACrE,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,UAAU,CAAC;AAC5C,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,cAAc,CAAC;AAOhD,qBAAa,WAAW;IACtB,OAAO,CAAC,QAAQ,CAAC,CAAU;IAC3B,OAAO,CAAC,OAAO,CAAC,CAAa;IAC7B,OAAO,CAAC,QAAQ,CAAkB;IAClC,OAAO,CAAC,mBAAmB,CAA6B;IACxD,OAAO,CAAC,MAAM,CAAC,CAAc;IAC7B,OAAO,CAAC,WAAW,CAAqB;IACxC,OAAO,CAAC,YAAY,CAAC,CAAS;IAC9B,OAAO,CAAC,QAAQ,CAAC,CAAS;IAC1B,OAAO,CAAC,QAAQ,CAAC,CAAS;IAC1B,OAAO,CAAC,WAAW,CAAC,CAAiC;IAErD,WAAW,CAAC,OAAO,EAAE,OAAO,GAAG,IAAI;IAKnC,UAAU,CAAC,MAAM,EAAE,UAAU,GAAG,IAAI;IAKpC,WAAW,CAAC,OAAO,EAAE,QAAQ,EAAE,GAAG,IAAI;IAKtC,sBAAsB,CAAC,OAAO,EAAE,mBAAmB,EAAE,GAAG,IAAI;IAK5D,SAAS,CAAC,MAAM,EAAE,WAAW,GAAG,IAAI;IAKpC,cAAc,CAAC,UAAU,EAAE,WAAW,EAAE,GAAG,IAAI;IAK/C,eAAe,CAAC,CAAC,EAAE,MAAM,GAAG,IAAI;IAKhC,WAAW,CAAC,EAAE,EAAE,MAAM,GAAG,IAAI;IAK7B,WAAW,CAAC,CAAC,EAAE,MAAM,GAAG,IAAI;IAK5B,UAAU,CAAC,EAAE,EAAE,gBAAgB,CAAC,YAAY,CAAC,GAAG,IAAI;IAKpD,KAAK,IAAI,SAAS;CA8BnB"}
@@ -0,0 +1,82 @@
1
+ import { EvalSuite } from './eval-suite';
2
/**
 * Tell whether a metric declares that it needs a judge.
 *
 * @param {object|Function} m - Metric to inspect.
 * @returns {boolean} True only when `m.requiresJudge` is exactly `true`.
 */
function isLLMMetric(m) {
  if (!Reflect.has(m, 'requiresJudge')) {
    return false;
  }
  return m.requiresJudge === true;
}
5
/**
 * Fluent builder that collects the options for an `EvalSuite`.
 *
 * Setters store their argument as-is and return the builder for chaining.
 * `build()` checks the required pieces and constructs the suite.
 */
export class EvalBuilder {
  _dataset;
  _target;
  _metrics = [];
  _statisticalMetrics = [];
  _judge;
  _assertions = [];
  _concurrency;
  _timeout;
  _retries;
  _onProgress;

  /** Dataset the suite will run over. */
  withDataset(dataset) {
    this._dataset = dataset;
    return this;
  }

  /** Target the suite will evaluate. */
  withTarget(target) {
    this._target = target;
    return this;
  }

  /** Replaces the per-case metric list. */
  withMetrics(metrics) {
    this._metrics = metrics;
    return this;
  }

  /** Replaces the statistical metric list. */
  withStatisticalMetrics(metrics) {
    this._statisticalMetrics = metrics;
    return this;
  }

  /** Judge configuration, needed by metrics flagged with `requiresJudge`. */
  withJudge(config) {
    this._judge = config;
    return this;
  }

  /** Replaces the assertion list. */
  withAssertions(assertions) {
    this._assertions = assertions;
    return this;
  }

  withConcurrency(n) {
    this._concurrency = n;
    return this;
  }

  withTimeout(ms) {
    this._timeout = ms;
    return this;
  }

  withRetries(n) {
    this._retries = n;
    return this;
  }

  /** Progress callback handed to the suite. */
  onProgress(fn) {
    this._onProgress = fn;
    return this;
  }

  /**
   * Validate the collected options and create the suite.
   *
   * @returns {EvalSuite} Suite constructed from the collected options.
   * @throws {Error} When the dataset or target is missing, or a metric
   *   requires a judge and none was configured.
   */
  build() {
    const dataset = this._dataset;
    if (!dataset) {
      throw new Error('Dataset is required. Use .withDataset()');
    }
    const target = this._target;
    if (!target) {
      throw new Error('Target is required. Use .withTarget()');
    }
    const needsJudge = this._metrics.some(isLLMMetric);
    if (needsJudge && !this._judge) {
      throw new Error('Judge config required for LLM metrics. Use .withJudge()');
    }

    // Empty lists are passed on as `undefined` rather than `[]`.
    const orUndefined = (list) => (list.length > 0 ? list : undefined);
    return new EvalSuite({
      dataset,
      target,
      metrics: orUndefined(this._metrics),
      statisticalMetrics: orUndefined(this._statisticalMetrics),
      judge: this._judge,
      assertions: orUndefined(this._assertions),
      concurrency: this._concurrency,
      timeout: this._timeout,
      retries: this._retries,
      onProgress: this._onProgress,
    });
  }
}
82
+ //# sourceMappingURL=eval-builder.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"eval-builder.js","sourceRoot":"","sources":["../src/eval-builder.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;AAQzC,SAAS,WAAW,CAAC,CAAW;IAC9B,OAAO,eAAe,IAAI,CAAC,IAAK,CAAiB,CAAC,aAAa,KAAK,IAAI,CAAC;AAC3E,CAAC;AAED,MAAM,OAAO,WAAW;IACd,QAAQ,CAAW;IACnB,OAAO,CAAc;IACrB,QAAQ,GAAe,EAAE,CAAC;IAC1B,mBAAmB,GAA0B,EAAE,CAAC;IAChD,MAAM,CAAe;IACrB,WAAW,GAAkB,EAAE,CAAC;IAChC,YAAY,CAAU;IACtB,QAAQ,CAAU;IAClB,QAAQ,CAAU;IAClB,WAAW,CAAkC;IAErD,WAAW,CAAC,OAAgB;QAC1B,IAAI,CAAC,QAAQ,GAAG,OAAO,CAAC;QACxB,OAAO,IAAI,CAAC;IACd,CAAC;IAED,UAAU,CAAC,MAAkB;QAC3B,IAAI,CAAC,OAAO,GAAG,MAAM,CAAC;QACtB,OAAO,IAAI,CAAC;IACd,CAAC;IAED,WAAW,CAAC,OAAmB;QAC7B,IAAI,CAAC,QAAQ,GAAG,OAAO,CAAC;QACxB,OAAO,IAAI,CAAC;IACd,CAAC;IAED,sBAAsB,CAAC,OAA8B;QACnD,IAAI,CAAC,mBAAmB,GAAG,OAAO,CAAC;QACnC,OAAO,IAAI,CAAC;IACd,CAAC;IAED,SAAS,CAAC,MAAmB;QAC3B,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC;QACrB,OAAO,IAAI,CAAC;IACd,CAAC;IAED,cAAc,CAAC,UAAyB;QACtC,IAAI,CAAC,WAAW,GAAG,UAAU,CAAC;QAC9B,OAAO,IAAI,CAAC;IACd,CAAC;IAED,eAAe,CAAC,CAAS;QACvB,IAAI,CAAC,YAAY,GAAG,CAAC,CAAC;QACtB,OAAO,IAAI,CAAC;IACd,CAAC;IAED,WAAW,CAAC,EAAU;QACpB,IAAI,CAAC,QAAQ,GAAG,EAAE,CAAC;QACnB,OAAO,IAAI,CAAC;IACd,CAAC;IAED,WAAW,CAAC,CAAS;QACnB,IAAI,CAAC,QAAQ,GAAG,CAAC,CAAC;QAClB,OAAO,IAAI,CAAC;IACd,CAAC;IAED,UAAU,CAAC,EAAkC;QAC3C,IAAI,CAAC,WAAW,GAAG,EAAE,CAAC;QACtB,OAAO,IAAI,CAAC;IACd,CAAC;IAED,KAAK;QACH,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,CAAC;YACnB,MAAM,IAAI,KAAK,CAAC,yCAAyC,CAAC,CAAC;QAC7D,CAAC;QAED,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,CAAC;YAClB,MAAM,IAAI,KAAK,CAAC,uCAAuC,CAAC,CAAC;QAC3D,CAAC;QAED,MAAM,aAAa,GAAG,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;QACtD,IAAI,aAAa,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC;YAClC,MAAM,IAAI,KAAK,CAAC,yDAAyD,CAAC,CAAC;QAC7E,CAAC;QAED,MAAM,IAAI,GAAqB;YAC7B,OAAO,EAAE,IAAI,CAAC,QAAQ;YACtB,MAAM,EAAE,IAAI,CAAC,OAAO;YACpB,OAAO,EAAE,IAAI,CAAC,QAAQ,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,CAAC,SAAS;YAC7D,kBAAkB,EAChB,IAAI,CAAC,mBAAmB,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,mBAAmB,CAAC,CAAC
,CAAC,SAAS;YAC5E,KAAK,EAAE,IAAI,CAAC,MAAM;YAClB,UAAU,EAAE,IAAI,CAAC,WAAW,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC,CAAC,SAAS;YACtE,WAAW,EAAE,IAAI,CAAC,YAAY;YAC9B,OAAO,EAAE,IAAI,CAAC,QAAQ;YACtB,OAAO,EAAE,IAAI,CAAC,QAAQ;YACtB,UAAU,EAAE,IAAI,CAAC,WAAW;SAC7B,CAAC;QAEF,OAAO,IAAI,SAAS,CAAC,IAAI,CAAC,CAAC;IAC7B,CAAC;CACF"}
@@ -0,0 +1,43 @@
1
+ import type { EvalTarget, EvalSuiteResult } from './eval-suite';
2
+ import type { MetricFn, StatisticalMetricFn } from './metrics/types';
3
+ import type { JudgeConfig } from './schema';
4
+ import { Dataset } from './datasets';
5
/**
 * Options for an `EvalComparison`: one dataset evaluated against two targets
 * with otherwise identical suite settings.
 */
export interface EvalComparisonOptions {
    dataset: Dataset;
    /** The two targets being compared. */
    targets: {
        baseline: EvalTarget;
        challenger: EvalTarget;
    };
    metrics?: Array<MetricFn>;
    statisticalMetrics?: Array<StatisticalMetricFn>;
    judge?: JudgeConfig;
    concurrency?: number;
    timeout?: number;
    retries?: number;
    /** Progress callback; `target` names the run that reported the progress. */
    onProgress?: (progress: {
        target: string;
        completed: number;
        total: number;
    }) => void;
}
23
/** Outcome of comparing one metric between the baseline and challenger runs. */
export interface MetricComparison {
    /** Mean score of the baseline run. */
    baseline: number;
    /** Mean score of the challenger run. */
    challenger: number;
    /** p-value reported by the statistical test used for this metric. */
    pValue: number;
    /** Whether the test judged the difference significant. */
    significant: boolean;
    /** Side with the higher mean when significant; `'tie'` otherwise. */
    winner: 'baseline' | 'challenger' | 'tie';
}
30
/** Result of `EvalComparison.run()`. */
export interface ComparisonResult {
    summary: {
        /** Side that won more individual metrics, or `'tie'`. */
        winner: 'baseline' | 'challenger' | 'tie';
        /** Per-metric comparison, keyed by metric name. */
        metrics: Record<string, MetricComparison>;
    };
    /** Full suite result of the baseline run. */
    baseline: EvalSuiteResult;
    /** Full suite result of the challenger run. */
    challenger: EvalSuiteResult;
}
38
/** Runs the same evaluation against a baseline and a challenger target and compares the scores. */
export declare class EvalComparison {
    private readonly opts;
    constructor(opts: EvalComparisonOptions);
    /** Execute both runs and return the per-metric and overall comparison. */
    run(): Promise<ComparisonResult>;
}
43
+ //# sourceMappingURL=eval-comparison.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"eval-comparison.d.ts","sourceRoot":"","sources":["../src/eval-comparison.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,UAAU,EAAE,eAAe,EAAE,MAAM,cAAc,CAAC;AAChE,OAAO,KAAK,EAAE,QAAQ,EAAe,mBAAmB,EAAE,MAAM,iBAAiB,CAAC;AAClF,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,UAAU,CAAC;AAC5C,OAAO,EAAE,OAAO,EAAE,MAAM,YAAY,CAAC;AAKrC,MAAM,WAAW,qBAAqB;IACpC,OAAO,EAAE,OAAO,CAAC;IACjB,OAAO,EAAE;QACP,QAAQ,EAAE,UAAU,CAAC;QACrB,UAAU,EAAE,UAAU,CAAC;KACxB,CAAC;IACF,OAAO,CAAC,EAAE,QAAQ,EAAE,CAAC;IACrB,kBAAkB,CAAC,EAAE,mBAAmB,EAAE,CAAC;IAC3C,KAAK,CAAC,EAAE,WAAW,CAAC;IACpB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,UAAU,CAAC,EAAE,CAAC,QAAQ,EAAE;QAAE,MAAM,EAAE,MAAM,CAAC;QAAC,SAAS,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAA;KAAE,KAAK,IAAI,CAAC;CACvF;AAED,MAAM,WAAW,gBAAgB;IAC/B,QAAQ,EAAE,MAAM,CAAC;IACjB,UAAU,EAAE,MAAM,CAAC;IACnB,MAAM,EAAE,MAAM,CAAC;IACf,WAAW,EAAE,OAAO,CAAC;IACrB,MAAM,EAAE,UAAU,GAAG,YAAY,GAAG,KAAK,CAAC;CAC3C;AAED,MAAM,WAAW,gBAAgB;IAC/B,OAAO,EAAE;QACP,MAAM,EAAE,UAAU,GAAG,YAAY,GAAG,KAAK,CAAC;QAC1C,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE,gBAAgB,CAAC,CAAC;KAC3C,CAAC;IACF,QAAQ,EAAE,eAAe,CAAC;IAC1B,UAAU,EAAE,eAAe,CAAC;CAC7B;AAkED,qBAAa,cAAc;IACzB,OAAO,CAAC,QAAQ,CAAC,IAAI,CAAwB;gBAEjC,IAAI,EAAE,qBAAqB;IAIjC,GAAG,IAAI,OAAO,CAAC,gBAAgB,CAAC;CAoEvC"}
@@ -0,0 +1,125 @@
1
+ import { EvalSuite } from './eval-suite';
2
+ import { pairedTTest } from './stats/t-test';
3
+ import { mcnemarsTest } from './stats/mcnemar';
4
+ import { mean } from './stats/percentiles';
5
/**
 * Check that a score list consists solely of the numbers 0 and 1.
 *
 * @param {number[]} scores - Scores to inspect.
 * @returns {boolean} True when no element differs from 0 and 1 (also for an empty list).
 */
function isBinary(scores) {
  const isOther = (value) => value !== 0 && value !== 1;
  return !scores.some(isOther);
}
8
/**
 * Compare paired per-case scores of one metric.
 *
 * With fewer than two baseline scores no test is run and the result is a tie
 * with `pValue` 1. When both lists contain only 0/1 values the discordant
 * pairs are counted and passed to `mcnemarsTest`; otherwise `pairedTTest` is
 * used. A winner is named only when the test reports significance.
 *
 * @param {number[]} baselineScores - Baseline scores, one per case.
 * @param {number[]} challengerScores - Challenger scores, index-aligned with the baseline.
 * @returns {{baseline: number, challenger: number, pValue: number, significant: boolean, winner: string}}
 */
function compareMetric(baselineScores, challengerScores) {
  const baseline = mean(baselineScores);
  const challenger = mean(challengerScores);
  const pairCount = baselineScores.length;

  if (pairCount < 2) {
    return { baseline, challenger, pValue: 1, significant: false, winner: 'tie' };
  }

  let outcome;
  if (isBinary(baselineScores) && isBinary(challengerScores)) {
    // Only the pairs on which the two runs disagree feed the test.
    let baselineOnly = 0;
    let challengerOnly = 0;
    for (const [idx, b] of baselineScores.entries()) {
      const c = challengerScores[idx];
      if (b === 1 && c === 0) {
        baselineOnly += 1;
      } else if (b === 0 && c === 1) {
        challengerOnly += 1;
      }
    }
    outcome = mcnemarsTest(baselineOnly, challengerOnly);
  } else {
    outcome = pairedTTest(baselineScores, challengerScores);
  }

  const { pValue, significant } = outcome;
  const leader = challenger > baseline ? 'challenger' : 'baseline';
  const winner = significant ? leader : 'tie';
  return { baseline, challenger, pValue, significant, winner };
}
48
/**
 * Pick the overall winner from per-metric comparisons by counting how many
 * metrics each side won; metric-level ties count for neither side.
 *
 * @param {Record<string, {winner: string}>} metrics - Comparisons keyed by metric name.
 * @returns {'baseline'|'challenger'|'tie'} Side with more metric wins, or `'tie'` when equal.
 */
function determineOverallWinner(metrics) {
  const wins = { baseline: 0, challenger: 0 };
  for (const { winner } of Object.values(metrics)) {
    if (winner === 'baseline' || winner === 'challenger') {
      wins[winner] += 1;
    }
  }
  if (wins.challenger > wins.baseline) {
    return 'challenger';
  }
  return wins.baseline > wins.challenger ? 'baseline' : 'tie';
}
63
/**
 * Evaluates a baseline and a challenger target on the same dataset with the
 * same suite settings and compares their scores metric by metric.
 */
export class EvalComparison {
  opts;

  /**
   * @param {object} opts - Comparison options: `dataset`, `targets.baseline`,
   *   `targets.challenger` and the optional suite settings shared by both runs.
   */
  constructor(opts) {
    this.opts = opts;
  }

  /**
   * Run both suites concurrently and compare every metric that shows up in
   * either run.
   *
   * @returns {Promise<object>} `{ summary: { winner, metrics }, baseline, challenger }`
   *   where `baseline` / `challenger` are the raw suite results.
   */
  async run() {
    const {
      dataset,
      targets,
      metrics,
      statisticalMetrics,
      judge,
      concurrency,
      timeout,
      retries,
      onProgress,
    } = this.opts;

    const sharedConfig = {
      dataset,
      metrics,
      statisticalMetrics,
      judge,
      concurrency,
      timeout,
      retries,
    };

    // Tag each suite's progress events with the side that emitted them.
    const progressFor = (target) =>
      onProgress
        ? (p) => onProgress({ target, completed: p.completed, total: p.total })
        : undefined;

    const baselineSuite = new EvalSuite({
      ...sharedConfig,
      target: targets.baseline,
      onProgress: progressFor('baseline'),
    });
    const challengerSuite = new EvalSuite({
      ...sharedConfig,
      target: targets.challenger,
      onProgress: progressFor('challenger'),
    });

    const [baselineResult, challengerResult] = await Promise.all([
      baselineSuite.run(),
      challengerSuite.run(),
    ]);

    // Gather names from BOTH runs: a metric the baseline never scored would
    // otherwise vanish from the comparison even though per-case gaps are
    // already treated as 0 by extractScores.
    const metricNames = new Set();
    for (const suiteResult of [baselineResult, challengerResult]) {
      for (const caseResult of suiteResult.results) {
        for (const score of caseResult.scores) {
          metricNames.add(score.name);
        }
      }
    }

    const metricComparisons = {};
    for (const name of metricNames) {
      const baselineScores = extractScores(baselineResult, name);
      const challengerScores = extractScores(challengerResult, name);
      metricComparisons[name] = compareMetric(baselineScores, challengerScores);
    }

    return {
      summary: {
        winner: determineOverallWinner(metricComparisons),
        metrics: metricComparisons,
      },
      baseline: baselineResult,
      challenger: challengerResult,
    };
  }
}
119
/**
 * Collect one metric's score for every case of a suite result.
 *
 * @param {{results: Array<{scores: Array<{name: string, score: number}>}>}} result - Suite result.
 * @param {string} metricName - Metric to look up in each case.
 * @returns {number[]} One value per case, in case order; 0 where the case has
 *   no entry for the metric or the entry's score is null/undefined.
 */
function extractScores(result, metricName) {
  const scoreOf = ({ scores }) => {
    const match = scores.find(({ name }) => name === metricName);
    if (match === undefined) {
      return 0;
    }
    return match.score ?? 0;
  };
  return result.results.map((caseResult) => scoreOf(caseResult));
}
125
+ //# sourceMappingURL=eval-comparison.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"eval-comparison.js","sourceRoot":"","sources":["../src/eval-comparison.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;AAKzC,OAAO,EAAE,WAAW,EAAE,MAAM,gBAAgB,CAAC;AAC7C,OAAO,EAAE,YAAY,EAAE,MAAM,iBAAiB,CAAC;AAC/C,OAAO,EAAE,IAAI,EAAE,MAAM,qBAAqB,CAAC;AAkC3C,SAAS,QAAQ,CAAC,MAAgB;IAChC,OAAO,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC;AACjD,CAAC;AAED,SAAS,aAAa,CAAC,cAAwB,EAAE,gBAA0B;IACzE,MAAM,YAAY,GAAG,IAAI,CAAC,cAAc,CAAC,CAAC;IAC1C,MAAM,cAAc,GAAG,IAAI,CAAC,gBAAgB,CAAC,CAAC;IAC9C,MAAM,CAAC,GAAG,cAAc,CAAC,MAAM,CAAC;IAEhC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC;QACV,OAAO;YACL,QAAQ,EAAE,YAAY;YACtB,UAAU,EAAE,cAAc;YAC1B,MAAM,EAAE,CAAC;YACT,WAAW,EAAE,KAAK;YAClB,MAAM,EAAE,KAAK;SACd,CAAC;IACJ,CAAC;IAED,MAAM,MAAM,GAAG,QAAQ,CAAC,cAAc,CAAC,IAAI,QAAQ,CAAC,gBAAgB,CAAC,CAAC;IAEtE,IAAI,MAAc,CAAC;IACnB,IAAI,WAAoB,CAAC;IAEzB,IAAI,MAAM,EAAE,CAAC;QACX,IAAI,mBAAmB,GAAG,CAAC,CAAC;QAC5B,IAAI,mBAAmB,GAAG,CAAC,CAAC;QAC5B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YAC3B,IAAI,cAAc,CAAC,CAAC,CAAC,KAAK,CAAC,IAAI,gBAAgB,CAAC,CAAC,CAAC,KAAK,CAAC;gBAAE,mBAAmB,EAAE,CAAC;YAChF,IAAI,cAAc,CAAC,CAAC,CAAC,KAAK,CAAC,IAAI,gBAAgB,CAAC,CAAC,CAAC,KAAK,CAAC;gBAAE,mBAAmB,EAAE,CAAC;QAClF,CAAC;QACD,MAAM,MAAM,GAAG,YAAY,CAAC,mBAAmB,EAAE,mBAAmB,CAAC,CAAC;QACtE,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC;QACvB,WAAW,GAAG,MAAM,CAAC,WAAW,CAAC;IACnC,CAAC;SAAM,CAAC;QACN,MAAM,MAAM,GAAG,WAAW,CAAC,cAAc,EAAE,gBAAgB,CAAC,CAAC;QAC7D,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC;QACvB,WAAW,GAAG,MAAM,CAAC,WAAW,CAAC;IACnC,CAAC;IAED,IAAI,MAAM,GAAsC,KAAK,CAAC;IACtD,IAAI,WAAW,EAAE,CAAC;QAChB,MAAM,GAAG,cAAc,GAAG,YAAY,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,UAAU,CAAC;IACrE,CAAC;IAED,OAAO,EAAE,QAAQ,EAAE,YAAY,EAAE,UAAU,EAAE,cAAc,EAAE,MAAM,EAAE,WAAW,EAAE,MAAM,EAAE,CAAC;AAC7F,CAAC;AAED,SAAS,sBAAsB,CAC7B,OAAyC;IAEzC,IAAI,YAAY,GAAG,CAAC,CAAC;IACrB,IAAI,cAAc,GAAG,CAAC,CAAC;IAEvB,KAAK,MAAM,EAAE,IAAI,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,EAAE,CAAC;QACxC,IAAI,EAAE,CAAC,MAAM,KAAK,UAAU;YA
AE,YAAY,EAAE,CAAC;QAC7C,IAAI,EAAE,CAAC,MAAM,KAAK,YAAY;YAAE,cAAc,EAAE,CAAC;IACnD,CAAC;IAED,IAAI,cAAc,GAAG,YAAY;QAAE,OAAO,YAAY,CAAC;IACvD,IAAI,YAAY,GAAG,cAAc;QAAE,OAAO,UAAU,CAAC;IACrD,OAAO,KAAK,CAAC;AACf,CAAC;AAED,MAAM,OAAO,cAAc;IACR,IAAI,CAAwB;IAE7C,YAAY,IAA2B;QACrC,IAAI,CAAC,IAAI,GAAG,IAAI,CAAC;IACnB,CAAC;IAED,KAAK,CAAC,GAAG;QACP,MAAM,EACJ,OAAO,EACP,OAAO,EACP,OAAO,EACP,kBAAkB,EAClB,KAAK,EACL,WAAW,EACX,OAAO,EACP,OAAO,EACP,UAAU,GACX,GAAG,IAAI,CAAC,IAAI,CAAC;QAEd,MAAM,YAAY,GAAG;YACnB,OAAO;YACP,OAAO;YACP,kBAAkB;YAClB,KAAK;YACL,WAAW;YACX,OAAO;YACP,OAAO;SACR,CAAC;QAEF,MAAM,aAAa,GAAG,IAAI,SAAS,CAAC;YAClC,GAAG,YAAY;YACf,MAAM,EAAE,OAAO,CAAC,QAAQ;YACxB,UAAU,EAAE,UAAU;gBACpB,CAAC,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,UAAU,CAAC,EAAE,MAAM,EAAE,UAAU,EAAE,SAAS,EAAE,CAAC,CAAC,SAAS,EAAE,KAAK,EAAE,CAAC,CAAC,KAAK,EAAE,CAAC;gBACnF,CAAC,CAAC,SAAS;SACd,CAAC,CAAC;QAEH,MAAM,eAAe,GAAG,IAAI,SAAS,CAAC;YACpC,GAAG,YAAY;YACf,MAAM,EAAE,OAAO,CAAC,UAAU;YAC1B,UAAU,EAAE,UAAU;gBACpB,CAAC,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,UAAU,CAAC,EAAE,MAAM,EAAE,YAAY,EAAE,SAAS,EAAE,CAAC,CAAC,SAAS,EAAE,KAAK,EAAE,CAAC,CAAC,KAAK,EAAE,CAAC;gBACrF,CAAC,CAAC,SAAS;SACd,CAAC,CAAC;QAEH,MAAM,CAAC,cAAc,EAAE,gBAAgB,CAAC,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC;YAC3D,aAAa,CAAC,GAAG,EAAE;YACnB,eAAe,CAAC,GAAG,EAAE;SACtB,CAAC,CAAC;QAEH,MAAM,WAAW,GAAG,IAAI,GAAG,EAAU,CAAC;QACtC,KAAK,MAAM,CAAC,IAAI,cAAc,CAAC,OAAO,EAAE,CAAC;YACvC,KAAK,MAAM,CAAC,IAAI,CAAC,CAAC,MAAM,EAAE,CAAC;gBACzB,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;YAC1B,CAAC;QACH,CAAC;QAED,MAAM,iBAAiB,GAAqC,EAAE,CAAC;QAE/D,KAAK,MAAM,IAAI,IAAI,WAAW,EAAE,CAAC;YAC/B,MAAM,cAAc,GAAG,aAAa,CAAC,cAAc,EAAE,IAAI,CAAC,CAAC;YAC3D,MAAM,gBAAgB,GAAG,aAAa,CAAC,gBAAgB,EAAE,IAAI,CAAC,CAAC;YAC/D,iBAAiB,CAAC,IAAI,CAAC,GAAG,aAAa,CAAC,cAAc,EAAE,gBAAgB,CAAC,CAAC;QAC5E,CAAC;QAED,OAAO;YACL,OAAO,EAAE;gBACP,MAAM,EAAE,sBAAsB,CAAC,iBAAiB,CAAC;gBACjD,OAAO,EAAE,iBAAiB;aAC3B;YACD,QAAQ,EAAE,cAAc;YACxB,UAAU,EAAE,gBAAgB;SAC7B,CAAC;IACJ,CAAC;CACF;AAED,SAAS,aAAa,CAAC,MAAuB,EAAE,UAAkB;IAChE,OAAO,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,E
AAE,EAAE;QAC9B,MAAM,KAAK,GAAG,CAAC,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAc,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,UAAU,CAAC,CAAC;QACvE,OAAO,KAAK,EAAE,KAAK,IAAI,CAAC,CAAC;IAC3B,CAAC,CAAC,CAAC;AACL,CAAC"}
@@ -0,0 +1,63 @@
1
+ import { Dataset } from './datasets';
2
+ import type { EvalCase } from './schema';
3
+ import type { JudgeConfig } from './schema';
4
+ import type { MetricFn, MetricScore, EvalCaseResult, StatisticalMetricFn } from './metrics/types';
5
+ import type { AssertionFn, AssertionResult, AggregatedMetric } from './assertions';
6
+ import type { ReporterType, ReporterOptions } from './reporters';
7
+ export interface EvalTarget {
8
+ agent?: unknown;
9
+ cogitator?: unknown;
10
+ fn?: (input: string) => Promise<string>;
11
+ }
12
+ export interface EvalProgress {
13
+ completed: number;
14
+ total: number;
15
+ currentCase?: EvalCase;
16
+ }
17
+ export interface EvalSuiteOptions {
18
+ dataset: Dataset;
19
+ target: EvalTarget;
20
+ metrics?: MetricFn[];
21
+ statisticalMetrics?: StatisticalMetricFn[];
22
+ judge?: JudgeConfig;
23
+ assertions?: AssertionFn[];
24
+ concurrency?: number;
25
+ timeout?: number;
26
+ retries?: number;
27
+ onProgress?: (progress: EvalProgress) => void;
28
+ }
29
+ export interface EvalSuiteResult {
30
+ results: Array<EvalCaseResult & {
31
+ scores: MetricScore[];
32
+ }>;
33
+ aggregated: Record<string, AggregatedMetric>;
34
+ assertions: AssertionResult[];
35
+ stats: {
36
+ total: number;
37
+ duration: number;
38
+ cost: number;
39
+ };
40
+ report: (type: ReporterType | ReporterType[], options?: ReporterOptions) => void;
41
+ saveBaseline: (path: string) => void;
42
+ }
43
+ export declare class EvalSuite {
44
+ private readonly dataset;
45
+ private readonly target;
46
+ private readonly boundMetrics;
47
+ private readonly statisticalMetrics;
48
+ private readonly assertionFns;
49
+ private readonly concurrency;
50
+ private readonly timeout;
51
+ private readonly retries;
52
+ private readonly onProgress?;
53
+ constructor(opts: EvalSuiteOptions);
54
+ private validateTarget;
55
+ run(): Promise<EvalSuiteResult>;
56
+ private executeCase;
57
+ private executeCaseAttempt;
58
+ private executeFnTarget;
59
+ private executeAgentTarget;
60
+ private evaluateCaseMetrics;
61
+ private aggregateScores;
62
+ }
63
+ //# sourceMappingURL=eval-suite.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"eval-suite.d.ts","sourceRoot":"","sources":["../src/eval-suite.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,OAAO,EAAE,MAAM,YAAY,CAAC;AACrC,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,UAAU,CAAC;AAEzC,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,UAAU,CAAC;AAC5C,OAAO,KAAK,EAAE,QAAQ,EAAE,WAAW,EAAE,cAAc,EAAE,mBAAmB,EAAE,MAAM,iBAAiB,CAAC;AAGlG,OAAO,KAAK,EAAE,WAAW,EAAE,eAAe,EAAE,gBAAgB,EAAE,MAAM,cAAc,CAAC;AAGnF,OAAO,KAAK,EAAE,YAAY,EAAE,eAAe,EAAE,MAAM,aAAa,CAAC;AAEjE,MAAM,WAAW,UAAU;IACzB,KAAK,CAAC,EAAE,OAAO,CAAC;IAChB,SAAS,CAAC,EAAE,OAAO,CAAC;IACpB,EAAE,CAAC,EAAE,CAAC,KAAK,EAAE,MAAM,KAAK,OAAO,CAAC,MAAM,CAAC,CAAC;CACzC;AAED,MAAM,WAAW,YAAY;IAC3B,SAAS,EAAE,MAAM,CAAC;IAClB,KAAK,EAAE,MAAM,CAAC;IACd,WAAW,CAAC,EAAE,QAAQ,CAAC;CACxB;AAED,MAAM,WAAW,gBAAgB;IAC/B,OAAO,EAAE,OAAO,CAAC;IACjB,MAAM,EAAE,UAAU,CAAC;IACnB,OAAO,CAAC,EAAE,QAAQ,EAAE,CAAC;IACrB,kBAAkB,CAAC,EAAE,mBAAmB,EAAE,CAAC;IAC3C,KAAK,CAAC,EAAE,WAAW,CAAC;IACpB,UAAU,CAAC,EAAE,WAAW,EAAE,CAAC;IAC3B,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,UAAU,CAAC,EAAE,CAAC,QAAQ,EAAE,YAAY,KAAK,IAAI,CAAC;CAC/C;AAED,MAAM,WAAW,eAAe;IAC9B,OAAO,EAAE,KAAK,CAAC,cAAc,GAAG;QAAE,MAAM,EAAE,WAAW,EAAE,CAAA;KAAE,CAAC,CAAC;IAC3D,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,gBAAgB,CAAC,CAAC;IAC7C,UAAU,EAAE,eAAe,EAAE,CAAC;IAC9B,KAAK,EAAE;QAAE,KAAK,EAAE,MAAM,CAAC;QAAC,QAAQ,EAAE,MAAM,CAAC;QAAC,IAAI,EAAE,MAAM,CAAA;KAAE,CAAC;IACzD,MAAM,EAAE,CAAC,IAAI,EAAE,YAAY,GAAG,YAAY,EAAE,EAAE,OAAO,CAAC,EAAE,eAAe,KAAK,IAAI,CAAC;IACjF,YAAY,EAAE,CAAC,IAAI,EAAE,MAAM,KAAK,IAAI,CAAC;CACtC;AAMD,qBAAa,SAAS;IACpB,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAU;IAClC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAa;IACpC,OAAO,CAAC,QAAQ,CAAC,YAAY,CAAa;IAC1C,OAAO,CAAC,QAAQ,CAAC,kBAAkB,CAAwB;IAC3D,OAAO,CAAC,QAAQ,CAAC,YAAY,CAAgB;IAC7C,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAS;IACrC,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAS;IACjC,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAS;IACjC,OAAO,CAAC,QAAQ,CAAC,UAAU,CAAC,CAAmC;gBAEnD,IAAI,EAAE,gBAAgB;IAoClC,OAAO,CAAC,cAAc;IAuBhB,GAAG,IAAI,OAAO,CAAC,eAAe,CAAC;YA6FvB,WAAW;YAgBX,kBAAkB;YAyBlB,eAAe;YAMf,k
BAAkB;YA+BlB,mBAAmB;IAKjC,OAAO,CAAC,eAAe;CAuBxB"}