@cogitator-ai/evals 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +456 -0
- package/dist/assertions/custom.d.ts +11 -0
- package/dist/assertions/custom.d.ts.map +1 -0
- package/dist/assertions/custom.js +13 -0
- package/dist/assertions/custom.js.map +1 -0
- package/dist/assertions/index.d.ts +27 -0
- package/dist/assertions/index.d.ts.map +1 -0
- package/dist/assertions/index.js +4 -0
- package/dist/assertions/index.js.map +1 -0
- package/dist/assertions/regression.d.ts +5 -0
- package/dist/assertions/regression.d.ts.map +1 -0
- package/dist/assertions/regression.js +58 -0
- package/dist/assertions/regression.js.map +1 -0
- package/dist/assertions/threshold.d.ts +3 -0
- package/dist/assertions/threshold.d.ts.map +1 -0
- package/dist/assertions/threshold.js +45 -0
- package/dist/assertions/threshold.js.map +1 -0
- package/dist/datasets/csv-loader.d.ts +3 -0
- package/dist/datasets/csv-loader.d.ts.map +1 -0
- package/dist/datasets/csv-loader.js +43 -0
- package/dist/datasets/csv-loader.js.map +1 -0
- package/dist/datasets/dataset.d.ts +15 -0
- package/dist/datasets/dataset.d.ts.map +1 -0
- package/dist/datasets/dataset.js +62 -0
- package/dist/datasets/dataset.js.map +1 -0
- package/dist/datasets/index.d.ts +4 -0
- package/dist/datasets/index.d.ts.map +1 -0
- package/dist/datasets/index.js +4 -0
- package/dist/datasets/index.js.map +1 -0
- package/dist/datasets/jsonl-loader.d.ts +3 -0
- package/dist/datasets/jsonl-loader.d.ts.map +1 -0
- package/dist/datasets/jsonl-loader.js +27 -0
- package/dist/datasets/jsonl-loader.js.map +1 -0
- package/dist/eval-builder.d.ts +30 -0
- package/dist/eval-builder.d.ts.map +1 -0
- package/dist/eval-builder.js +82 -0
- package/dist/eval-builder.js.map +1 -0
- package/dist/eval-comparison.d.ts +43 -0
- package/dist/eval-comparison.d.ts.map +1 -0
- package/dist/eval-comparison.js +125 -0
- package/dist/eval-comparison.js.map +1 -0
- package/dist/eval-suite.d.ts +63 -0
- package/dist/eval-suite.d.ts.map +1 -0
- package/dist/eval-suite.js +230 -0
- package/dist/eval-suite.js.map +1 -0
- package/dist/index.d.ts +31 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +20 -0
- package/dist/index.js.map +1 -0
- package/dist/metrics/custom.d.ts +18 -0
- package/dist/metrics/custom.d.ts.map +1 -0
- package/dist/metrics/custom.js +28 -0
- package/dist/metrics/custom.js.map +1 -0
- package/dist/metrics/deterministic.d.ts +11 -0
- package/dist/metrics/deterministic.d.ts.map +1 -0
- package/dist/metrics/deterministic.js +74 -0
- package/dist/metrics/deterministic.js.map +1 -0
- package/dist/metrics/index.d.ts +8 -0
- package/dist/metrics/index.d.ts.map +1 -0
- package/dist/metrics/index.js +5 -0
- package/dist/metrics/index.js.map +1 -0
- package/dist/metrics/llm-judge.d.ts +27 -0
- package/dist/metrics/llm-judge.d.ts.map +1 -0
- package/dist/metrics/llm-judge.js +77 -0
- package/dist/metrics/llm-judge.js.map +1 -0
- package/dist/metrics/statistical.d.ts +5 -0
- package/dist/metrics/statistical.d.ts.map +1 -0
- package/dist/metrics/statistical.js +85 -0
- package/dist/metrics/statistical.js.map +1 -0
- package/dist/metrics/types.d.ts +31 -0
- package/dist/metrics/types.d.ts.map +1 -0
- package/dist/metrics/types.js +2 -0
- package/dist/metrics/types.js.map +1 -0
- package/dist/reporters/ci.d.ts +3 -0
- package/dist/reporters/ci.d.ts.map +1 -0
- package/dist/reporters/ci.js +21 -0
- package/dist/reporters/ci.js.map +1 -0
- package/dist/reporters/console.d.ts +3 -0
- package/dist/reporters/console.d.ts.map +1 -0
- package/dist/reporters/console.js +46 -0
- package/dist/reporters/console.js.map +1 -0
- package/dist/reporters/csv.d.ts +5 -0
- package/dist/reporters/csv.d.ts.map +1 -0
- package/dist/reporters/csv.js +31 -0
- package/dist/reporters/csv.js.map +1 -0
- package/dist/reporters/index.d.ts +50 -0
- package/dist/reporters/index.d.ts.map +1 -0
- package/dist/reporters/index.js +28 -0
- package/dist/reporters/index.js.map +1 -0
- package/dist/reporters/json.d.ts +5 -0
- package/dist/reporters/json.d.ts.map +1 -0
- package/dist/reporters/json.js +5 -0
- package/dist/reporters/json.js.map +1 -0
- package/dist/schema.d.ts +29 -0
- package/dist/schema.d.ts.map +1 -0
- package/dist/schema.js +23 -0
- package/dist/schema.js.map +1 -0
- package/dist/stats/index.d.ts +6 -0
- package/dist/stats/index.d.ts.map +1 -0
- package/dist/stats/index.js +4 -0
- package/dist/stats/index.js.map +1 -0
- package/dist/stats/mcnemar.d.ts +7 -0
- package/dist/stats/mcnemar.d.ts.map +1 -0
- package/dist/stats/mcnemar.js +34 -0
- package/dist/stats/mcnemar.js.map +1 -0
- package/dist/stats/percentiles.d.ts +15 -0
- package/dist/stats/percentiles.d.ts.map +1 -0
- package/dist/stats/percentiles.js +54 -0
- package/dist/stats/percentiles.js.map +1 -0
- package/dist/stats/t-test.d.ts +9 -0
- package/dist/stats/t-test.d.ts.map +1 -0
- package/dist/stats/t-test.js +129 -0
- package/dist/stats/t-test.js.map +1 -0
- package/dist/tools.d.ts +16 -0
- package/dist/tools.d.ts.map +1 -0
- package/dist/tools.js +58 -0
- package/dist/tools.js.map +1 -0
- package/package.json +57 -0
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import { readFileSync } from 'node:fs';
|
|
2
|
+
import { EvalCaseSchema } from '../schema';
|
|
3
|
+
/**
 * Load evaluation cases from a CSV file.
 *
 * The CSV must have a header row that contains an `input` column. A
 * non-empty `expected` cell is copied onto the case. Columns named
 * `metadata.<key>` / `context.<key>` are gathered into the case's `metadata`
 * / `context` objects; all other columns are ignored. Every resulting object
 * is validated with `EvalCaseSchema.parse`, whose errors propagate unchanged.
 *
 * @param {string} path - Path of the CSV file (read synchronously as UTF-8).
 * @returns {Promise<Array>} One validated eval case per non-empty CSV row.
 * @throws {Error} When `papaparse` cannot be imported (the underlying failure
 *   is attached as `cause`) or when the CSV has no `input` column.
 */
export async function loadCsv(path) {
  let Papa;
  try {
    Papa = await import('papaparse');
  } catch (cause) {
    // Keep the real failure reachable: the module may exist but fail to load.
    throw new Error('papaparse is required for CSV loading. Install it with: npm install papaparse', { cause });
  }
  // papaparse is a CommonJS/UMD package; depending on the loader, `parse` is
  // exposed either directly on the namespace or only on its `default` export.
  const papa = typeof Papa.parse === 'function' ? Papa : Papa.default;

  const content = readFileSync(path, 'utf-8');
  const { data, meta } = papa.parse(content, {
    header: true,
    skipEmptyLines: true,
  });
  if (!meta.fields?.includes('input')) {
    throw new Error('CSV must have an "input" column');
  }

  return data.map((row) => {
    const evalCase = { input: row.input };
    // An empty cell means "no expectation", not an expected empty string.
    if (row.expected !== undefined && row.expected !== '') {
      evalCase.expected = row.expected;
    }

    const metadata = {};
    const context = {};
    for (const [key, value] of Object.entries(row)) {
      if (key === 'input' || key === 'expected') {
        continue;
      }
      if (key.startsWith('metadata.')) {
        metadata[key.slice('metadata.'.length)] = value;
      } else if (key.startsWith('context.')) {
        context[key.slice('context.'.length)] = value;
      }
    }

    // Only attach the optional objects when at least one column fed them.
    if (Object.keys(metadata).length > 0) {
      evalCase.metadata = metadata;
    }
    if (Object.keys(context).length > 0) {
      evalCase.context = context;
    }
    return EvalCaseSchema.parse(evalCase);
  });
}
|
|
43
|
+
//# sourceMappingURL=csv-loader.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"csv-loader.js","sourceRoot":"","sources":["../../src/datasets/csv-loader.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,SAAS,CAAC;AACvC,OAAO,EAAE,cAAc,EAAE,MAAM,WAAW,CAAC;AAG3C,MAAM,CAAC,KAAK,UAAU,OAAO,CAAC,IAAY;IACxC,IAAI,IAAgC,CAAC;IACrC,IAAI,CAAC;QACH,IAAI,GAAG,MAAM,MAAM,CAAC,WAAW,CAAC,CAAC;IACnC,CAAC;IAAC,MAAM,CAAC;QACP,MAAM,IAAI,KAAK,CACb,+EAA+E,CAChF,CAAC;IACJ,CAAC;IAED,MAAM,OAAO,GAAG,YAAY,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;IAC5C,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,GAAG,IAAI,CAAC,KAAK,CAAyB,OAAO,EAAE;QACjE,MAAM,EAAE,IAAI;QACZ,cAAc,EAAE,IAAI;KACrB,CAAC,CAAC;IAEH,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,QAAQ,CAAC,OAAO,CAAC,EAAE,CAAC;QACpC,MAAM,IAAI,KAAK,CAAC,iCAAiC,CAAC,CAAC;IACrD,CAAC;IAED,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE;QACtB,MAAM,QAAQ,GAA4B,EAAE,KAAK,EAAE,GAAG,CAAC,KAAK,EAAE,CAAC;QAE/D,IAAI,GAAG,CAAC,QAAQ,KAAK,SAAS,IAAI,GAAG,CAAC,QAAQ,KAAK,EAAE,EAAE,CAAC;YACtD,QAAQ,CAAC,QAAQ,GAAG,GAAG,CAAC,QAAQ,CAAC;QACnC,CAAC;QAED,MAAM,QAAQ,GAA4B,EAAE,CAAC;QAC7C,MAAM,OAAO,GAA4B,EAAE,CAAC;QAE5C,KAAK,MAAM,CAAC,GAAG,EAAE,KAAK,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC;YAC/C,IAAI,GAAG,KAAK,OAAO,IAAI,GAAG,KAAK,UAAU;gBAAE,SAAS;YAEpD,IAAI,GAAG,CAAC,UAAU,CAAC,WAAW,CAAC,EAAE,CAAC;gBAChC,QAAQ,CAAC,GAAG,CAAC,KAAK,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC,GAAG,KAAK,CAAC;YAClD,CAAC;iBAAM,IAAI,GAAG,CAAC,UAAU,CAAC,UAAU,CAAC,EAAE,CAAC;gBACtC,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC,GAAG,KAAK,CAAC;YAChD,CAAC;QACH,CAAC;QAED,IAAI,MAAM,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,MAAM,GAAG,CAAC;YAAE,QAAQ,CAAC,QAAQ,GAAG,QAAQ,CAAC;QACnE,IAAI,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,MAAM,GAAG,CAAC;YAAE,QAAQ,CAAC,OAAO,GAAG,OAAO,CAAC;QAEhE,OAAO,cAAc,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;IACxC,CAAC,CAAC,CAAC;AACL,CAAC"}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import type { EvalCase } from '../schema';
|
|
2
|
+
/**
 * Immutable, iterable collection of evaluation cases.
 * Instances are created through the static factories, not `new`.
 */
export declare class Dataset {
    private readonly _cases;
    private constructor();
    /** Build a dataset from cases already held in memory. */
    static from(cases: EvalCase[]): Dataset;
    /** Build a dataset from a JSON Lines file. */
    static fromJsonl(path: string): Promise<Dataset>;
    /** Build a dataset from a CSV file. */
    static fromCsv(path: string): Promise<Dataset>;
    /** Number of cases in the dataset. */
    get length(): number;
    /** Read-only view of the cases. */
    get cases(): readonly EvalCase[];
    /** Iterate over the cases in order. */
    [Symbol.iterator](): Iterator<EvalCase>;
    /** New dataset holding only the cases for which `fn` returns true. */
    filter(fn: (c: EvalCase) => boolean): Dataset;
    /** New dataset holding at most `n` randomly chosen cases. */
    sample(n: number): Dataset;
    /** New dataset holding the same cases in random order. */
    shuffle(): Dataset;
}
|
|
15
|
+
//# sourceMappingURL=dataset.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"dataset.d.ts","sourceRoot":"","sources":["../../src/datasets/dataset.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,WAAW,CAAC;AAI1C,qBAAa,OAAO;IAClB,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAsB;IAE7C,OAAO;IAIP,MAAM,CAAC,IAAI,CAAC,KAAK,EAAE,QAAQ,EAAE,GAAG,OAAO;WAI1B,SAAS,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,OAAO,CAAC;WAKzC,OAAO,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,OAAO,CAAC;IAKpD,IAAI,MAAM,IAAI,MAAM,CAEnB;IAED,IAAI,KAAK,IAAI,SAAS,QAAQ,EAAE,CAE/B;IAED,CAAC,MAAM,CAAC,QAAQ,CAAC,IAAI,QAAQ,CAAC,QAAQ,CAAC;IAavC,MAAM,CAAC,EAAE,EAAE,CAAC,CAAC,EAAE,QAAQ,KAAK,OAAO,GAAG,OAAO;IAI7C,MAAM,CAAC,CAAC,EAAE,MAAM,GAAG,OAAO;IAa1B,OAAO,IAAI,OAAO;CAQnB"}
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
import { EvalCaseSchema } from '../schema';
|
|
2
|
+
import { loadJsonl } from './jsonl-loader';
|
|
3
|
+
import { loadCsv } from './csv-loader';
|
|
4
|
+
/**
 * Fisher–Yates shuffle, performed in place from the last slot down to the
 * second, drawing one `Math.random()` value per slot.
 *
 * @param {Array} items - Array to reorder; it is mutated.
 * @returns {Array} The same array instance.
 */
function shuffleInPlace(items) {
  for (let last = items.length - 1; last > 0; last--) {
    const pick = Math.floor(Math.random() * (last + 1));
    [items[last], items[pick]] = [items[pick], items[last]];
  }
  return items;
}

/**
 * Frozen, iterable collection of eval cases. Every case is passed through
 * `EvalCaseSchema.parse` when a dataset is constructed, including the
 * datasets returned by `filter`, `sample` and `shuffle`.
 */
export class Dataset {
  _cases;

  /** @param {Array} cases - Raw cases; each one is validated, then the list is frozen. */
  constructor(cases) {
    const validated = cases.map((c) => EvalCaseSchema.parse(c));
    this._cases = Object.freeze(validated);
  }

  /** Create a dataset from in-memory cases. */
  static from(cases) {
    return new Dataset(cases);
  }

  /** Create a dataset from the JSON Lines file at `path`. */
  static async fromJsonl(path) {
    return Dataset.from(await loadJsonl(path));
  }

  /** Create a dataset from the CSV file at `path`. */
  static async fromCsv(path) {
    return Dataset.from(await loadCsv(path));
  }

  /** Number of cases. */
  get length() {
    return this._cases.length;
  }

  /** The frozen array of cases. */
  get cases() {
    return this._cases;
  }

  /** Yields the cases in order. */
  [Symbol.iterator]() {
    const cases = this._cases;
    let cursor = 0;
    return {
      next: () =>
        cursor < cases.length
          ? { value: cases[cursor++], done: false }
          : { value: undefined, done: true },
    };
  }

  /** New dataset with the cases for which `fn` returns a truthy value. */
  filter(fn) {
    const kept = this._cases.filter(fn);
    return new Dataset(kept);
  }

  /**
   * New dataset with at most `n` cases picked at random without replacement.
   * Returns an empty dataset when `n <= 0` or there are no cases.
   */
  sample(n) {
    const total = this._cases.length;
    if (n <= 0 || total === 0) {
      return new Dataset([]);
    }
    // Shuffle the index list, then keep the leading slice.
    const order = shuffleInPlace(Array.from({ length: total }, (_, i) => i));
    const chosen = order.slice(0, Math.min(n, total));
    return new Dataset(chosen.map((i) => this._cases[i]));
  }

  /** New dataset with the same cases in random order; this one is untouched. */
  shuffle() {
    return new Dataset(shuffleInPlace([...this._cases]));
  }
}
|
|
62
|
+
//# sourceMappingURL=dataset.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"dataset.js","sourceRoot":"","sources":["../../src/datasets/dataset.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,cAAc,EAAE,MAAM,WAAW,CAAC;AAE3C,OAAO,EAAE,SAAS,EAAE,MAAM,gBAAgB,CAAC;AAC3C,OAAO,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC;AAEvC,MAAM,OAAO,OAAO;IACD,MAAM,CAAsB;IAE7C,YAAoB,KAAiB;QACnC,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,cAAc,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IACzE,CAAC;IAED,MAAM,CAAC,IAAI,CAAC,KAAiB;QAC3B,OAAO,IAAI,OAAO,CAAC,KAAK,CAAC,CAAC;IAC5B,CAAC;IAED,MAAM,CAAC,KAAK,CAAC,SAAS,CAAC,IAAY;QACjC,MAAM,KAAK,GAAG,MAAM,SAAS,CAAC,IAAI,CAAC,CAAC;QACpC,OAAO,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;IAC7B,CAAC;IAED,MAAM,CAAC,KAAK,CAAC,OAAO,CAAC,IAAY;QAC/B,MAAM,KAAK,GAAG,MAAM,OAAO,CAAC,IAAI,CAAC,CAAC;QAClC,OAAO,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;IAC7B,CAAC;IAED,IAAI,MAAM;QACR,OAAO,IAAI,CAAC,MAAM,CAAC,MAAM,CAAC;IAC5B,CAAC;IAED,IAAI,KAAK;QACP,OAAO,IAAI,CAAC,MAAM,CAAC;IACrB,CAAC;IAED,CAAC,MAAM,CAAC,QAAQ,CAAC;QACf,IAAI,KAAK,GAAG,CAAC,CAAC;QACd,MAAM,KAAK,GAAG,IAAI,CAAC,MAAM,CAAC;QAC1B,OAAO;YACL,IAAI;gBACF,IAAI,KAAK,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC;oBACzB,OAAO,EAAE,KAAK,EAAE,KAAK,CAAC,KAAK,EAAE,CAAC,EAAE,IAAI,EAAE,KAAK,EAAE,CAAC;gBAChD,CAAC;gBACD,OAAO,EAAE,KAAK,EAAE,SAAkB,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;YACnD,CAAC;SACF,CAAC;IACJ,CAAC;IAED,MAAM,CAAC,EAA4B;QACjC,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,MAAM,CAAC,EAAE,CAAe,CAAC,CAAC;IAC3D,CAAC;IAED,MAAM,CAAC,CAAS;QACd,IAAI,CAAC,IAAI,CAAC,IAAI,IAAI,CAAC,MAAM,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACvC,OAAO,IAAI,OAAO,CAAC,EAAE,CAAC,CAAC;QACzB,CAAC;QACD,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC;QAC9C,MAAM,OAAO,GAAG,KAAK,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,IAAI,CAAC,MAAM,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC;QACxE,KAAK,IAAI,CAAC,GAAG,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YAC5C,MAAM,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;YAC9C,CAAC,OAAO,CAAC,CAAC
,CAAC,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC;QACtD,CAAC;QACD,OAAO,IAAI,OAAO,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,CAAe,CAAC,CAAC;IACvF,CAAC;IAED,OAAO;QACL,MAAM,GAAG,GAAG,CAAC,GAAG,IAAI,CAAC,MAAM,CAAe,CAAC;QAC3C,KAAK,IAAI,CAAC,GAAG,GAAG,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YACxC,MAAM,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;YAC9C,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;QACtC,CAAC;QACD,OAAO,IAAI,OAAO,CAAC,GAAG,CAAC,CAAC;IAC1B,CAAC;CACF"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/datasets/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACpC,OAAO,EAAE,SAAS,EAAE,MAAM,gBAAgB,CAAC;AAC3C,OAAO,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/datasets/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACpC,OAAO,EAAE,SAAS,EAAE,MAAM,gBAAgB,CAAC;AAC3C,OAAO,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"jsonl-loader.d.ts","sourceRoot":"","sources":["../../src/datasets/jsonl-loader.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,WAAW,CAAC;AAE1C,wBAAsB,SAAS,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,QAAQ,EAAE,CAAC,CAyBjE"}
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
import { createReadStream } from 'node:fs';
|
|
2
|
+
import { createInterface } from 'node:readline';
|
|
3
|
+
import { EvalCaseSchema } from '../schema';
|
|
4
|
+
/**
 * Load evaluation cases from a JSON Lines file.
 *
 * The file is streamed line by line. Lines that are empty after trimming are
 * skipped (they still count towards the reported line number). Every other
 * line must be valid JSON and is validated with `EvalCaseSchema.parse`, whose
 * errors propagate unchanged.
 *
 * @param {string} path - Path of the `.jsonl` file (decoded as UTF-8).
 * @returns {Promise<Array>} The validated cases in file order.
 * @throws {Error} `Invalid JSON at line N: …` when a line is not valid JSON;
 *   the original `SyntaxError` is attached as `cause`.
 */
export async function loadJsonl(path) {
  const cases = [];
  let lineNumber = 0;
  const input = createReadStream(path, { encoding: 'utf-8' });
  const rl = createInterface({
    input,
    crlfDelay: Infinity,
  });
  try {
    for await (const line of rl) {
      lineNumber++;
      const trimmed = line.trim();
      if (!trimmed) {
        continue;
      }
      let parsed;
      try {
        parsed = JSON.parse(trimmed);
      } catch (cause) {
        throw new Error(`Invalid JSON at line ${lineNumber}: ${trimmed}`, { cause });
      }
      cases.push(EvalCaseSchema.parse(parsed));
    }
  } finally {
    // Closing the readline interface does not destroy its input, so an early
    // exit (bad JSON, schema failure) would leave the file descriptor open.
    rl.close();
    input.destroy();
  }
  return cases;
}
|
|
27
|
+
//# sourceMappingURL=jsonl-loader.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"jsonl-loader.js","sourceRoot":"","sources":["../../src/datasets/jsonl-loader.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,gBAAgB,EAAE,MAAM,SAAS,CAAC;AAC3C,OAAO,EAAE,eAAe,EAAE,MAAM,eAAe,CAAC;AAChD,OAAO,EAAE,cAAc,EAAE,MAAM,WAAW,CAAC;AAG3C,MAAM,CAAC,KAAK,UAAU,SAAS,CAAC,IAAY;IAC1C,MAAM,KAAK,GAAe,EAAE,CAAC;IAC7B,IAAI,UAAU,GAAG,CAAC,CAAC;IAEnB,MAAM,EAAE,GAAG,eAAe,CAAC;QACzB,KAAK,EAAE,gBAAgB,CAAC,IAAI,EAAE,EAAE,QAAQ,EAAE,OAAO,EAAE,CAAC;QACpD,SAAS,EAAE,QAAQ;KACpB,CAAC,CAAC;IAEH,IAAI,KAAK,EAAE,MAAM,IAAI,IAAI,EAAE,EAAE,CAAC;QAC5B,UAAU,EAAE,CAAC;QACb,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC;QAC5B,IAAI,CAAC,OAAO;YAAE,SAAS;QAEvB,IAAI,MAAe,CAAC;QACpB,IAAI,CAAC;YACH,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;QAC/B,CAAC;QAAC,MAAM,CAAC;YACP,MAAM,IAAI,KAAK,CAAC,wBAAwB,UAAU,KAAK,OAAO,EAAE,CAAC,CAAC;QACpE,CAAC;QAED,KAAK,CAAC,IAAI,CAAC,cAAc,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC;IAC3C,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC"}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import { EvalSuite } from './eval-suite';
|
|
2
|
+
import type { EvalTarget, EvalSuiteOptions } from './eval-suite';
|
|
3
|
+
import { Dataset } from './datasets';
|
|
4
|
+
import type { MetricFn, StatisticalMetricFn } from './metrics/types';
|
|
5
|
+
import type { JudgeConfig } from './schema';
|
|
6
|
+
import type { AssertionFn } from './assertions';
|
|
7
|
+
/**
 * Fluent builder for an `EvalSuite`. Every `with*` / `onProgress` call stores
 * the given value (replacing any earlier one) and returns the builder.
 */
export declare class EvalBuilder {
    private _dataset?;
    private _target?;
    private _metrics;
    private _statisticalMetrics;
    private _judge?;
    private _assertions;
    private _concurrency?;
    private _timeout?;
    private _retries?;
    private _onProgress?;
    /** Set the dataset to evaluate (required before `build`). */
    withDataset(dataset: Dataset): this;
    /** Set the target under evaluation (required before `build`). */
    withTarget(target: EvalTarget): this;
    /** Set the per-case metrics. */
    withMetrics(metrics: MetricFn[]): this;
    /** Set the statistical metrics. */
    withStatisticalMetrics(metrics: StatisticalMetricFn[]): this;
    /** Set the judge configuration (required when any metric needs a judge). */
    withJudge(config: JudgeConfig): this;
    /** Set the assertions. */
    withAssertions(assertions: AssertionFn[]): this;
    /** Set the concurrency option passed to the suite. */
    withConcurrency(n: number): this;
    /** Set the timeout option passed to the suite. */
    withTimeout(ms: number): this;
    /** Set the retries option passed to the suite. */
    withRetries(n: number): this;
    /** Set the progress callback passed to the suite. */
    onProgress(fn: EvalSuiteOptions['onProgress']): this;
    /** Validate the configuration and create the `EvalSuite`. */
    build(): EvalSuite;
}
|
|
30
|
+
//# sourceMappingURL=eval-builder.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"eval-builder.d.ts","sourceRoot":"","sources":["../src/eval-builder.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;AACzC,OAAO,KAAK,EAAE,UAAU,EAAE,gBAAgB,EAAE,MAAM,cAAc,CAAC;AACjE,OAAO,EAAE,OAAO,EAAE,MAAM,YAAY,CAAC;AACrC,OAAO,KAAK,EAAE,QAAQ,EAAE,mBAAmB,EAAE,MAAM,iBAAiB,CAAC;AACrE,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,UAAU,CAAC;AAC5C,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,cAAc,CAAC;AAOhD,qBAAa,WAAW;IACtB,OAAO,CAAC,QAAQ,CAAC,CAAU;IAC3B,OAAO,CAAC,OAAO,CAAC,CAAa;IAC7B,OAAO,CAAC,QAAQ,CAAkB;IAClC,OAAO,CAAC,mBAAmB,CAA6B;IACxD,OAAO,CAAC,MAAM,CAAC,CAAc;IAC7B,OAAO,CAAC,WAAW,CAAqB;IACxC,OAAO,CAAC,YAAY,CAAC,CAAS;IAC9B,OAAO,CAAC,QAAQ,CAAC,CAAS;IAC1B,OAAO,CAAC,QAAQ,CAAC,CAAS;IAC1B,OAAO,CAAC,WAAW,CAAC,CAAiC;IAErD,WAAW,CAAC,OAAO,EAAE,OAAO,GAAG,IAAI;IAKnC,UAAU,CAAC,MAAM,EAAE,UAAU,GAAG,IAAI;IAKpC,WAAW,CAAC,OAAO,EAAE,QAAQ,EAAE,GAAG,IAAI;IAKtC,sBAAsB,CAAC,OAAO,EAAE,mBAAmB,EAAE,GAAG,IAAI;IAK5D,SAAS,CAAC,MAAM,EAAE,WAAW,GAAG,IAAI;IAKpC,cAAc,CAAC,UAAU,EAAE,WAAW,EAAE,GAAG,IAAI;IAK/C,eAAe,CAAC,CAAC,EAAE,MAAM,GAAG,IAAI;IAKhC,WAAW,CAAC,EAAE,EAAE,MAAM,GAAG,IAAI;IAK7B,WAAW,CAAC,CAAC,EAAE,MAAM,GAAG,IAAI;IAK5B,UAAU,CAAC,EAAE,EAAE,gBAAgB,CAAC,YAAY,CAAC,GAAG,IAAI;IAKpD,KAAK,IAAI,SAAS;CA8BnB"}
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
import { EvalSuite } from './eval-suite';
|
|
2
|
+
/**
 * Tell whether a metric declares that it needs an LLM judge.
 *
 * @param {object|Function} m - Metric to inspect.
 * @returns {boolean} True only when `m` has a `requiresJudge` property that is exactly `true`.
 */
function isLLMMetric(m) {
  if (!('requiresJudge' in m)) {
    return false;
  }
  return m.requiresJudge === true;
}
|
|
5
|
+
/**
 * Fluent builder that collects the options for an `EvalSuite`.
 * Each setter overwrites the previously stored value and returns `this`.
 */
export class EvalBuilder {
  _dataset;
  _target;
  _metrics = [];
  _statisticalMetrics = [];
  _judge;
  _assertions = [];
  _concurrency;
  _timeout;
  _retries;
  _onProgress;

  /** Store the dataset to evaluate. */
  withDataset(dataset) {
    this._dataset = dataset;
    return this;
  }

  /** Store the target under evaluation. */
  withTarget(target) {
    this._target = target;
    return this;
  }

  /** Store the per-case metrics. */
  withMetrics(metrics) {
    this._metrics = metrics;
    return this;
  }

  /** Store the statistical metrics. */
  withStatisticalMetrics(metrics) {
    this._statisticalMetrics = metrics;
    return this;
  }

  /** Store the judge configuration. */
  withJudge(config) {
    this._judge = config;
    return this;
  }

  /** Store the assertions. */
  withAssertions(assertions) {
    this._assertions = assertions;
    return this;
  }

  /** Store the concurrency option. */
  withConcurrency(n) {
    this._concurrency = n;
    return this;
  }

  /** Store the timeout option. */
  withTimeout(ms) {
    this._timeout = ms;
    return this;
  }

  /** Store the retries option. */
  withRetries(n) {
    this._retries = n;
    return this;
  }

  /** Store the progress callback. */
  onProgress(fn) {
    this._onProgress = fn;
    return this;
  }

  /**
   * Check the collected options and construct the suite.
   *
   * @returns {EvalSuite} Suite created from the stored options; empty
   *   metric/assertion lists are passed as `undefined`.
   * @throws {Error} When the dataset or target is missing, or when a metric
   *   requires a judge and none was configured.
   */
  build() {
    const { _dataset: dataset, _target: target, _judge: judge } = this;
    if (!dataset) {
      throw new Error('Dataset is required. Use .withDataset()');
    }
    if (!target) {
      throw new Error('Target is required. Use .withTarget()');
    }
    if (this._metrics.some(isLLMMetric) && !judge) {
      throw new Error('Judge config required for LLM metrics. Use .withJudge()');
    }

    // Empty lists are handed over as "not configured".
    const orUndefined = (list) => (list.length > 0 ? list : undefined);
    return new EvalSuite({
      dataset,
      target,
      metrics: orUndefined(this._metrics),
      statisticalMetrics: orUndefined(this._statisticalMetrics),
      judge,
      assertions: orUndefined(this._assertions),
      concurrency: this._concurrency,
      timeout: this._timeout,
      retries: this._retries,
      onProgress: this._onProgress,
    });
  }
}
|
|
82
|
+
//# sourceMappingURL=eval-builder.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"eval-builder.js","sourceRoot":"","sources":["../src/eval-builder.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;AAQzC,SAAS,WAAW,CAAC,CAAW;IAC9B,OAAO,eAAe,IAAI,CAAC,IAAK,CAAiB,CAAC,aAAa,KAAK,IAAI,CAAC;AAC3E,CAAC;AAED,MAAM,OAAO,WAAW;IACd,QAAQ,CAAW;IACnB,OAAO,CAAc;IACrB,QAAQ,GAAe,EAAE,CAAC;IAC1B,mBAAmB,GAA0B,EAAE,CAAC;IAChD,MAAM,CAAe;IACrB,WAAW,GAAkB,EAAE,CAAC;IAChC,YAAY,CAAU;IACtB,QAAQ,CAAU;IAClB,QAAQ,CAAU;IAClB,WAAW,CAAkC;IAErD,WAAW,CAAC,OAAgB;QAC1B,IAAI,CAAC,QAAQ,GAAG,OAAO,CAAC;QACxB,OAAO,IAAI,CAAC;IACd,CAAC;IAED,UAAU,CAAC,MAAkB;QAC3B,IAAI,CAAC,OAAO,GAAG,MAAM,CAAC;QACtB,OAAO,IAAI,CAAC;IACd,CAAC;IAED,WAAW,CAAC,OAAmB;QAC7B,IAAI,CAAC,QAAQ,GAAG,OAAO,CAAC;QACxB,OAAO,IAAI,CAAC;IACd,CAAC;IAED,sBAAsB,CAAC,OAA8B;QACnD,IAAI,CAAC,mBAAmB,GAAG,OAAO,CAAC;QACnC,OAAO,IAAI,CAAC;IACd,CAAC;IAED,SAAS,CAAC,MAAmB;QAC3B,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC;QACrB,OAAO,IAAI,CAAC;IACd,CAAC;IAED,cAAc,CAAC,UAAyB;QACtC,IAAI,CAAC,WAAW,GAAG,UAAU,CAAC;QAC9B,OAAO,IAAI,CAAC;IACd,CAAC;IAED,eAAe,CAAC,CAAS;QACvB,IAAI,CAAC,YAAY,GAAG,CAAC,CAAC;QACtB,OAAO,IAAI,CAAC;IACd,CAAC;IAED,WAAW,CAAC,EAAU;QACpB,IAAI,CAAC,QAAQ,GAAG,EAAE,CAAC;QACnB,OAAO,IAAI,CAAC;IACd,CAAC;IAED,WAAW,CAAC,CAAS;QACnB,IAAI,CAAC,QAAQ,GAAG,CAAC,CAAC;QAClB,OAAO,IAAI,CAAC;IACd,CAAC;IAED,UAAU,CAAC,EAAkC;QAC3C,IAAI,CAAC,WAAW,GAAG,EAAE,CAAC;QACtB,OAAO,IAAI,CAAC;IACd,CAAC;IAED,KAAK;QACH,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,CAAC;YACnB,MAAM,IAAI,KAAK,CAAC,yCAAyC,CAAC,CAAC;QAC7D,CAAC;QAED,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,CAAC;YAClB,MAAM,IAAI,KAAK,CAAC,uCAAuC,CAAC,CAAC;QAC3D,CAAC;QAED,MAAM,aAAa,GAAG,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;QACtD,IAAI,aAAa,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC;YAClC,MAAM,IAAI,KAAK,CAAC,yDAAyD,CAAC,CAAC;QAC7E,CAAC;QAED,MAAM,IAAI,GAAqB;YAC7B,OAAO,EAAE,IAAI,CAAC,QAAQ;YACtB,MAAM,EAAE,IAAI,CAAC,OAAO;YACpB,OAAO,EAAE,IAAI,CAAC,QAAQ,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,CAAC,SAAS;YAC7D,kBAAkB,EAChB,IAAI,CAAC,mBAAmB,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,mBAAmB,CAAC,CAAC,C
AAC,SAAS;YAC5E,KAAK,EAAE,IAAI,CAAC,MAAM;YAClB,UAAU,EAAE,IAAI,CAAC,WAAW,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC,CAAC,SAAS;YACtE,WAAW,EAAE,IAAI,CAAC,YAAY;YAC9B,OAAO,EAAE,IAAI,CAAC,QAAQ;YACtB,OAAO,EAAE,IAAI,CAAC,QAAQ;YACtB,UAAU,EAAE,IAAI,CAAC,WAAW;SAC7B,CAAC;QAEF,OAAO,IAAI,SAAS,CAAC,IAAI,CAAC,CAAC;IAC7B,CAAC;CACF"}
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import type { EvalTarget, EvalSuiteResult } from './eval-suite';
|
|
2
|
+
import type { MetricFn, StatisticalMetricFn } from './metrics/types';
|
|
3
|
+
import type { JudgeConfig } from './schema';
|
|
4
|
+
import { Dataset } from './datasets';
|
|
5
|
+
/** Options for comparing a baseline target against a challenger target. */
export interface EvalComparisonOptions {
    /** Dataset both targets are evaluated on. */
    dataset: Dataset;
    /** The two targets being compared. */
    targets: {
        baseline: EvalTarget;
        challenger: EvalTarget;
    };
    /** Per-case metrics, shared by both suites. */
    metrics?: MetricFn[];
    /** Statistical metrics, shared by both suites. */
    statisticalMetrics?: StatisticalMetricFn[];
    /** Judge configuration, shared by both suites. */
    judge?: JudgeConfig;
    /** Concurrency option forwarded to each suite. */
    concurrency?: number;
    /** Timeout option forwarded to each suite. */
    timeout?: number;
    /** Retries option forwarded to each suite. */
    retries?: number;
    /** Progress callback; `target` is `'baseline'` or `'challenger'`. */
    onProgress?: (progress: {
        target: string;
        completed: number;
        total: number;
    }) => void;
}
|
|
23
|
+
/** Outcome of comparing one metric between baseline and challenger. */
export interface MetricComparison {
    /** Mean baseline score for the metric. */
    baseline: number;
    /** Mean challenger score for the metric. */
    challenger: number;
    /** p-value of the statistical test applied to the paired scores. */
    pValue: number;
    /** Whether the test reported a significant difference. */
    significant: boolean;
    /** Side with the higher mean when significant, otherwise `'tie'`. */
    winner: 'baseline' | 'challenger' | 'tie';
}
|
|
30
|
+
/** Full result of an `EvalComparison` run. */
export interface ComparisonResult {
    summary: {
        /** Side that won more metrics, or `'tie'` when the counts are equal. */
        winner: 'baseline' | 'challenger' | 'tie';
        /** Per-metric comparison, keyed by metric name. */
        metrics: Record<string, MetricComparison>;
    };
    /** Suite result of the baseline target. */
    baseline: EvalSuiteResult;
    /** Suite result of the challenger target. */
    challenger: EvalSuiteResult;
}
|
|
38
|
+
/** Runs the same evaluation against two targets and compares their scores. */
export declare class EvalComparison {
    private readonly opts;
    /** @param opts Comparison options; stored as given. */
    constructor(opts: EvalComparisonOptions);
    /** Evaluate both targets and return the per-metric and overall comparison. */
    run(): Promise<ComparisonResult>;
}
|
|
43
|
+
//# sourceMappingURL=eval-comparison.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"eval-comparison.d.ts","sourceRoot":"","sources":["../src/eval-comparison.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,UAAU,EAAE,eAAe,EAAE,MAAM,cAAc,CAAC;AAChE,OAAO,KAAK,EAAE,QAAQ,EAAe,mBAAmB,EAAE,MAAM,iBAAiB,CAAC;AAClF,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,UAAU,CAAC;AAC5C,OAAO,EAAE,OAAO,EAAE,MAAM,YAAY,CAAC;AAKrC,MAAM,WAAW,qBAAqB;IACpC,OAAO,EAAE,OAAO,CAAC;IACjB,OAAO,EAAE;QACP,QAAQ,EAAE,UAAU,CAAC;QACrB,UAAU,EAAE,UAAU,CAAC;KACxB,CAAC;IACF,OAAO,CAAC,EAAE,QAAQ,EAAE,CAAC;IACrB,kBAAkB,CAAC,EAAE,mBAAmB,EAAE,CAAC;IAC3C,KAAK,CAAC,EAAE,WAAW,CAAC;IACpB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,UAAU,CAAC,EAAE,CAAC,QAAQ,EAAE;QAAE,MAAM,EAAE,MAAM,CAAC;QAAC,SAAS,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAA;KAAE,KAAK,IAAI,CAAC;CACvF;AAED,MAAM,WAAW,gBAAgB;IAC/B,QAAQ,EAAE,MAAM,CAAC;IACjB,UAAU,EAAE,MAAM,CAAC;IACnB,MAAM,EAAE,MAAM,CAAC;IACf,WAAW,EAAE,OAAO,CAAC;IACrB,MAAM,EAAE,UAAU,GAAG,YAAY,GAAG,KAAK,CAAC;CAC3C;AAED,MAAM,WAAW,gBAAgB;IAC/B,OAAO,EAAE;QACP,MAAM,EAAE,UAAU,GAAG,YAAY,GAAG,KAAK,CAAC;QAC1C,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE,gBAAgB,CAAC,CAAC;KAC3C,CAAC;IACF,QAAQ,EAAE,eAAe,CAAC;IAC1B,UAAU,EAAE,eAAe,CAAC;CAC7B;AAkED,qBAAa,cAAc;IACzB,OAAO,CAAC,QAAQ,CAAC,IAAI,CAAwB;gBAEjC,IAAI,EAAE,qBAAqB;IAIjC,GAAG,IAAI,OAAO,CAAC,gBAAgB,CAAC;CAoEvC"}
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
import { EvalSuite } from './eval-suite';
|
|
2
|
+
import { pairedTTest } from './stats/t-test';
|
|
3
|
+
import { mcnemarsTest } from './stats/mcnemar';
|
|
4
|
+
import { mean } from './stats/percentiles';
|
|
5
|
+
/**
 * Check whether every score is exactly 0 or exactly 1.
 *
 * @param {number[]} scores - Scores to inspect.
 * @returns {boolean} True when no score differs from both 0 and 1 (true for an empty list).
 */
function isBinary(scores) {
  const hasOtherValue = scores.some((value) => value !== 0 && value !== 1);
  return !hasOtherValue;
}
|
|
8
|
+
/**
 * Compare the paired scores of one metric.
 *
 * With fewer than two baseline scores the result is a non-significant tie
 * with `pValue` 1. Otherwise McNemar's test is applied to the discordant
 * pair counts when both lists contain only 0/1 values, and a paired t-test
 * is applied in every other case.
 *
 * @param {number[]} baselineScores - Baseline scores, index-aligned with the challenger's.
 * @param {number[]} challengerScores - Challenger scores.
 * @returns {{baseline: number, challenger: number, pValue: number, significant: boolean, winner: string}}
 *   Means of both sides, the test outcome, and the winner: `'tie'` unless
 *   significant, else `'challenger'` when its mean is strictly higher,
 *   otherwise `'baseline'`.
 */
function compareMetric(baselineScores, challengerScores) {
  const baselineMean = mean(baselineScores);
  const challengerMean = mean(challengerScores);
  const pairCount = baselineScores.length;

  // Too few pairs for either test.
  if (pairCount < 2) {
    return {
      baseline: baselineMean,
      challenger: challengerMean,
      pValue: 1,
      significant: false,
      winner: 'tie',
    };
  }

  let outcome;
  if (isBinary(baselineScores) && isBinary(challengerScores)) {
    // Count the discordant pairs that McNemar's test is based on.
    let onlyBaselineCorrect = 0;
    let onlyChallengerCorrect = 0;
    for (let i = 0; i < pairCount; i++) {
      const b = baselineScores[i];
      const c = challengerScores[i];
      if (b === 1 && c === 0) {
        onlyBaselineCorrect += 1;
      } else if (b === 0 && c === 1) {
        onlyChallengerCorrect += 1;
      }
    }
    outcome = mcnemarsTest(onlyBaselineCorrect, onlyChallengerCorrect);
  } else {
    outcome = pairedTTest(baselineScores, challengerScores);
  }

  const { pValue, significant } = outcome;
  const leader = challengerMean > baselineMean ? 'challenger' : 'baseline';
  const winner = significant ? leader : 'tie';
  return { baseline: baselineMean, challenger: challengerMean, pValue, significant, winner };
}
|
|
48
|
+
/**
 * Pick the overall winner by counting per-metric winners.
 *
 * @param {Object<string, {winner: string}>} metrics - Per-metric comparisons keyed by name.
 * @returns {'baseline'|'challenger'|'tie'} The side that won more metrics;
 *   `'tie'` when both won the same number (including none).
 */
function determineOverallWinner(metrics) {
  const wins = { baseline: 0, challenger: 0 };
  for (const { winner } of Object.values(metrics)) {
    if (winner === 'baseline' || winner === 'challenger') {
      wins[winner] += 1;
    }
  }
  if (wins.challenger > wins.baseline) {
    return 'challenger';
  }
  if (wins.baseline > wins.challenger) {
    return 'baseline';
  }
  return 'tie';
}
|
|
63
|
+
/**
 * Evaluates a baseline and a challenger target on the same dataset with the
 * same settings, then compares their scores metric by metric.
 */
export class EvalComparison {
  opts;

  /** @param {object} opts - Comparison options; stored as given. */
  constructor(opts) {
    this.opts = opts;
  }

  /**
   * Run both suites concurrently and compare the results.
   *
   * @returns {Promise<object>} `{ summary: { winner, metrics }, baseline, challenger }`
   *   where `metrics` maps each metric name to its comparison and
   *   `baseline` / `challenger` are the raw suite results.
   */
  async run() {
    const {
      dataset,
      targets,
      metrics,
      statisticalMetrics,
      judge,
      concurrency,
      timeout,
      retries,
      onProgress,
    } = this.opts;
    const sharedConfig = {
      dataset,
      metrics,
      statisticalMetrics,
      judge,
      concurrency,
      timeout,
      retries,
    };
    // Tag each suite's progress events with the side they belong to.
    const progressFor = (target) =>
      onProgress
        ? (p) => onProgress({ target, completed: p.completed, total: p.total })
        : undefined;

    const baselineSuite = new EvalSuite({
      ...sharedConfig,
      target: targets.baseline,
      onProgress: progressFor('baseline'),
    });
    const challengerSuite = new EvalSuite({
      ...sharedConfig,
      target: targets.challenger,
      onProgress: progressFor('challenger'),
    });
    const [baselineResult, challengerResult] = await Promise.all([
      baselineSuite.run(),
      challengerSuite.run(),
    ]);

    // Collect names from BOTH results (baseline first, so existing ordering is
    // kept): a metric that was only scored on the challenger side would
    // otherwise be dropped from the comparison without notice.
    const metricNames = new Set();
    for (const suiteResult of [baselineResult, challengerResult]) {
      for (const r of suiteResult.results) {
        for (const s of r.scores) {
          metricNames.add(s.name);
        }
      }
    }

    const metricComparisons = {};
    for (const name of metricNames) {
      metricComparisons[name] = compareMetric(
        extractScores(baselineResult, name),
        extractScores(challengerResult, name),
      );
    }
    return {
      summary: {
        winner: determineOverallWinner(metricComparisons),
        metrics: metricComparisons,
      },
      baseline: baselineResult,
      challenger: challengerResult,
    };
  }
}
|
|
119
|
+
/**
 * Collects the score of one metric for every case in a suite result.
 *
 * @param {{results: Array<{scores: Array<{name: string, score: number}>}>}} result
 *   Suite result whose `results` entries each carry a `scores` list.
 * @param {string} metricName - Name of the metric to look up in each case.
 * @returns {number[]} One value per case, in case order; 0 where the case has
 *   no entry for the metric or the entry's `score` is null/undefined.
 */
function extractScores(result, metricName) {
    const scoreForCase = (caseResult) => {
        const match = caseResult.scores.find(({ name }) => name === metricName);
        if (match === undefined || match === null) {
            return 0;
        }
        return match.score ?? 0;
    };
    return result.results.map((caseResult) => scoreForCase(caseResult));
}
|
|
125
|
+
//# sourceMappingURL=eval-comparison.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"eval-comparison.js","sourceRoot":"","sources":["../src/eval-comparison.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;AAKzC,OAAO,EAAE,WAAW,EAAE,MAAM,gBAAgB,CAAC;AAC7C,OAAO,EAAE,YAAY,EAAE,MAAM,iBAAiB,CAAC;AAC/C,OAAO,EAAE,IAAI,EAAE,MAAM,qBAAqB,CAAC;AAkC3C,SAAS,QAAQ,CAAC,MAAgB;IAChC,OAAO,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC;AACjD,CAAC;AAED,SAAS,aAAa,CAAC,cAAwB,EAAE,gBAA0B;IACzE,MAAM,YAAY,GAAG,IAAI,CAAC,cAAc,CAAC,CAAC;IAC1C,MAAM,cAAc,GAAG,IAAI,CAAC,gBAAgB,CAAC,CAAC;IAC9C,MAAM,CAAC,GAAG,cAAc,CAAC,MAAM,CAAC;IAEhC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC;QACV,OAAO;YACL,QAAQ,EAAE,YAAY;YACtB,UAAU,EAAE,cAAc;YAC1B,MAAM,EAAE,CAAC;YACT,WAAW,EAAE,KAAK;YAClB,MAAM,EAAE,KAAK;SACd,CAAC;IACJ,CAAC;IAED,MAAM,MAAM,GAAG,QAAQ,CAAC,cAAc,CAAC,IAAI,QAAQ,CAAC,gBAAgB,CAAC,CAAC;IAEtE,IAAI,MAAc,CAAC;IACnB,IAAI,WAAoB,CAAC;IAEzB,IAAI,MAAM,EAAE,CAAC;QACX,IAAI,mBAAmB,GAAG,CAAC,CAAC;QAC5B,IAAI,mBAAmB,GAAG,CAAC,CAAC;QAC5B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YAC3B,IAAI,cAAc,CAAC,CAAC,CAAC,KAAK,CAAC,IAAI,gBAAgB,CAAC,CAAC,CAAC,KAAK,CAAC;gBAAE,mBAAmB,EAAE,CAAC;YAChF,IAAI,cAAc,CAAC,CAAC,CAAC,KAAK,CAAC,IAAI,gBAAgB,CAAC,CAAC,CAAC,KAAK,CAAC;gBAAE,mBAAmB,EAAE,CAAC;QAClF,CAAC;QACD,MAAM,MAAM,GAAG,YAAY,CAAC,mBAAmB,EAAE,mBAAmB,CAAC,CAAC;QACtE,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC;QACvB,WAAW,GAAG,MAAM,CAAC,WAAW,CAAC;IACnC,CAAC;SAAM,CAAC;QACN,MAAM,MAAM,GAAG,WAAW,CAAC,cAAc,EAAE,gBAAgB,CAAC,CAAC;QAC7D,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC;QACvB,WAAW,GAAG,MAAM,CAAC,WAAW,CAAC;IACnC,CAAC;IAED,IAAI,MAAM,GAAsC,KAAK,CAAC;IACtD,IAAI,WAAW,EAAE,CAAC;QAChB,MAAM,GAAG,cAAc,GAAG,YAAY,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,UAAU,CAAC;IACrE,CAAC;IAED,OAAO,EAAE,QAAQ,EAAE,YAAY,EAAE,UAAU,EAAE,cAAc,EAAE,MAAM,EAAE,WAAW,EAAE,MAAM,EAAE,CAAC;AAC7F,CAAC;AAED,SAAS,sBAAsB,CAC7B,OAAyC;IAEzC,IAAI,YAAY,GAAG,CAAC,CAAC;IACrB,IAAI,cAAc,GAAG,CAAC,CAAC;IAEvB,KAAK,MAAM,EAAE,IAAI,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,EAAE,CAAC;QACxC,IAAI,EAAE,CAAC,MAAM,KAAK,UAAU;YAAE
,YAAY,EAAE,CAAC;QAC7C,IAAI,EAAE,CAAC,MAAM,KAAK,YAAY;YAAE,cAAc,EAAE,CAAC;IACnD,CAAC;IAED,IAAI,cAAc,GAAG,YAAY;QAAE,OAAO,YAAY,CAAC;IACvD,IAAI,YAAY,GAAG,cAAc;QAAE,OAAO,UAAU,CAAC;IACrD,OAAO,KAAK,CAAC;AACf,CAAC;AAED,MAAM,OAAO,cAAc;IACR,IAAI,CAAwB;IAE7C,YAAY,IAA2B;QACrC,IAAI,CAAC,IAAI,GAAG,IAAI,CAAC;IACnB,CAAC;IAED,KAAK,CAAC,GAAG;QACP,MAAM,EACJ,OAAO,EACP,OAAO,EACP,OAAO,EACP,kBAAkB,EAClB,KAAK,EACL,WAAW,EACX,OAAO,EACP,OAAO,EACP,UAAU,GACX,GAAG,IAAI,CAAC,IAAI,CAAC;QAEd,MAAM,YAAY,GAAG;YACnB,OAAO;YACP,OAAO;YACP,kBAAkB;YAClB,KAAK;YACL,WAAW;YACX,OAAO;YACP,OAAO;SACR,CAAC;QAEF,MAAM,aAAa,GAAG,IAAI,SAAS,CAAC;YAClC,GAAG,YAAY;YACf,MAAM,EAAE,OAAO,CAAC,QAAQ;YACxB,UAAU,EAAE,UAAU;gBACpB,CAAC,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,UAAU,CAAC,EAAE,MAAM,EAAE,UAAU,EAAE,SAAS,EAAE,CAAC,CAAC,SAAS,EAAE,KAAK,EAAE,CAAC,CAAC,KAAK,EAAE,CAAC;gBACnF,CAAC,CAAC,SAAS;SACd,CAAC,CAAC;QAEH,MAAM,eAAe,GAAG,IAAI,SAAS,CAAC;YACpC,GAAG,YAAY;YACf,MAAM,EAAE,OAAO,CAAC,UAAU;YAC1B,UAAU,EAAE,UAAU;gBACpB,CAAC,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,UAAU,CAAC,EAAE,MAAM,EAAE,YAAY,EAAE,SAAS,EAAE,CAAC,CAAC,SAAS,EAAE,KAAK,EAAE,CAAC,CAAC,KAAK,EAAE,CAAC;gBACrF,CAAC,CAAC,SAAS;SACd,CAAC,CAAC;QAEH,MAAM,CAAC,cAAc,EAAE,gBAAgB,CAAC,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC;YAC3D,aAAa,CAAC,GAAG,EAAE;YACnB,eAAe,CAAC,GAAG,EAAE;SACtB,CAAC,CAAC;QAEH,MAAM,WAAW,GAAG,IAAI,GAAG,EAAU,CAAC;QACtC,KAAK,MAAM,CAAC,IAAI,cAAc,CAAC,OAAO,EAAE,CAAC;YACvC,KAAK,MAAM,CAAC,IAAI,CAAC,CAAC,MAAM,EAAE,CAAC;gBACzB,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;YAC1B,CAAC;QACH,CAAC;QAED,MAAM,iBAAiB,GAAqC,EAAE,CAAC;QAE/D,KAAK,MAAM,IAAI,IAAI,WAAW,EAAE,CAAC;YAC/B,MAAM,cAAc,GAAG,aAAa,CAAC,cAAc,EAAE,IAAI,CAAC,CAAC;YAC3D,MAAM,gBAAgB,GAAG,aAAa,CAAC,gBAAgB,EAAE,IAAI,CAAC,CAAC;YAC/D,iBAAiB,CAAC,IAAI,CAAC,GAAG,aAAa,CAAC,cAAc,EAAE,gBAAgB,CAAC,CAAC;QAC5E,CAAC;QAED,OAAO;YACL,OAAO,EAAE;gBACP,MAAM,EAAE,sBAAsB,CAAC,iBAAiB,CAAC;gBACjD,OAAO,EAAE,iBAAiB;aAC3B;YACD,QAAQ,EAAE,cAAc;YACxB,UAAU,EAAE,gBAAgB;SAC7B,CAAC;IACJ,CAAC;CACF;AAED,SAAS,aAAa,CAAC,MAAuB,EAAE,UAAkB;IAChE,OAAO,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAA
E,EAAE;QAC9B,MAAM,KAAK,GAAG,CAAC,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAc,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,UAAU,CAAC,CAAC;QACvE,OAAO,KAAK,EAAE,KAAK,IAAI,CAAC,CAAC;IAC3B,CAAC,CAAC,CAAC;AACL,CAAC"}
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
import { Dataset } from './datasets';
|
|
2
|
+
import type { EvalCase } from './schema';
|
|
3
|
+
import type { JudgeConfig } from './schema';
|
|
4
|
+
import type { MetricFn, MetricScore, EvalCaseResult, StatisticalMetricFn } from './metrics/types';
|
|
5
|
+
import type { AssertionFn, AssertionResult, AggregatedMetric } from './assertions';
|
|
6
|
+
import type { ReporterType, ReporterOptions } from './reporters';
|
|
7
|
+
/**
 * The thing an evaluation is run against.
 *
 * All three members are optional at the type level. `agent` and `cogitator`
 * are typed as `unknown`; `fn` is an async function that maps an input string
 * to an output string.
 *
 * NOTE(review): which combinations of these members are accepted is not
 * expressed in this type — presumably checked at runtime; confirm against the
 * suite implementation.
 */
export interface EvalTarget {
|
|
8
|
+
agent?: unknown;
|
|
9
|
+
cogitator?: unknown;
|
|
10
|
+
fn?: (input: string) => Promise<string>;
|
|
11
|
+
}
|
|
12
|
+
/**
 * Payload handed to an `onProgress` callback.
 *
 * Carries a `completed` count, a `total` count and, optionally, the
 * `currentCase` the progress event refers to.
 */
export interface EvalProgress {
|
|
13
|
+
completed: number;
|
|
14
|
+
total: number;
|
|
15
|
+
currentCase?: EvalCase;
|
|
16
|
+
}
|
|
17
|
+
/**
 * Options accepted by the `EvalSuite` constructor.
 *
 * `dataset` and `target` are required; every other member is optional.
 * `onProgress`, when provided, is called with an `EvalProgress` value.
 *
 * NOTE(review): defaults for the optional members and the units of `timeout`
 * are not visible in this declaration — confirm against the implementation.
 */
export interface EvalSuiteOptions {
|
|
18
|
+
dataset: Dataset;
|
|
19
|
+
target: EvalTarget;
|
|
20
|
+
metrics?: MetricFn[];
|
|
21
|
+
statisticalMetrics?: StatisticalMetricFn[];
|
|
22
|
+
judge?: JudgeConfig;
|
|
23
|
+
assertions?: AssertionFn[];
|
|
24
|
+
concurrency?: number;
|
|
25
|
+
timeout?: number;
|
|
26
|
+
retries?: number;
|
|
27
|
+
onProgress?: (progress: EvalProgress) => void;
|
|
28
|
+
}
|
|
29
|
+
/**
 * Value that `EvalSuite.run()` resolves to.
 *
 * - `results`: one entry per case, each an `EvalCaseResult` extended with a
 *   `scores` list of `MetricScore` values.
 * - `aggregated`: `AggregatedMetric` values keyed by string.
 * - `assertions`: list of `AssertionResult` values.
 * - `stats`: numeric `total`, `duration` and `cost`.
 * - `report`: function taking one reporter type or an array of them, plus
 *   optional reporter options; returns nothing.
 * - `saveBaseline`: function taking a path string; returns nothing.
 *
 * NOTE(review): the units of `stats.duration` and `stats.cost` are not
 * visible in this declaration — confirm against the implementation.
 */
export interface EvalSuiteResult {
|
|
30
|
+
results: Array<EvalCaseResult & {
|
|
31
|
+
scores: MetricScore[];
|
|
32
|
+
}>;
|
|
33
|
+
aggregated: Record<string, AggregatedMetric>;
|
|
34
|
+
assertions: AssertionResult[];
|
|
35
|
+
stats: {
|
|
36
|
+
total: number;
|
|
37
|
+
duration: number;
|
|
38
|
+
cost: number;
|
|
39
|
+
};
|
|
40
|
+
report: (type: ReporterType | ReporterType[], options?: ReporterOptions) => void;
|
|
41
|
+
saveBaseline: (path: string) => void;
|
|
42
|
+
}
|
|
43
|
+
/**
 * Type declaration of the evaluation suite class.
 *
 * The public surface is the constructor, which takes `EvalSuiteOptions`, and
 * `run()`, which returns a promise of an `EvalSuiteResult`. All remaining
 * members are private and are declared here without types, as is usual for
 * private members in emitted declaration files.
 */
export declare class EvalSuite {
|
|
44
|
+
private readonly dataset;
|
|
45
|
+
private readonly target;
|
|
46
|
+
private readonly boundMetrics;
|
|
47
|
+
private readonly statisticalMetrics;
|
|
48
|
+
private readonly assertionFns;
|
|
49
|
+
private readonly concurrency;
|
|
50
|
+
private readonly timeout;
|
|
51
|
+
private readonly retries;
|
|
52
|
+
private readonly onProgress?;
|
|
53
|
+
constructor(opts: EvalSuiteOptions);
|
|
54
|
+
private validateTarget;
|
|
55
|
+
run(): Promise<EvalSuiteResult>;
|
|
56
|
+
private executeCase;
|
|
57
|
+
private executeCaseAttempt;
|
|
58
|
+
private executeFnTarget;
|
|
59
|
+
private executeAgentTarget;
|
|
60
|
+
private evaluateCaseMetrics;
|
|
61
|
+
private aggregateScores;
|
|
62
|
+
}
|
|
63
|
+
//# sourceMappingURL=eval-suite.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"eval-suite.d.ts","sourceRoot":"","sources":["../src/eval-suite.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,OAAO,EAAE,MAAM,YAAY,CAAC;AACrC,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,UAAU,CAAC;AAEzC,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,UAAU,CAAC;AAC5C,OAAO,KAAK,EAAE,QAAQ,EAAE,WAAW,EAAE,cAAc,EAAE,mBAAmB,EAAE,MAAM,iBAAiB,CAAC;AAGlG,OAAO,KAAK,EAAE,WAAW,EAAE,eAAe,EAAE,gBAAgB,EAAE,MAAM,cAAc,CAAC;AAGnF,OAAO,KAAK,EAAE,YAAY,EAAE,eAAe,EAAE,MAAM,aAAa,CAAC;AAEjE,MAAM,WAAW,UAAU;IACzB,KAAK,CAAC,EAAE,OAAO,CAAC;IAChB,SAAS,CAAC,EAAE,OAAO,CAAC;IACpB,EAAE,CAAC,EAAE,CAAC,KAAK,EAAE,MAAM,KAAK,OAAO,CAAC,MAAM,CAAC,CAAC;CACzC;AAED,MAAM,WAAW,YAAY;IAC3B,SAAS,EAAE,MAAM,CAAC;IAClB,KAAK,EAAE,MAAM,CAAC;IACd,WAAW,CAAC,EAAE,QAAQ,CAAC;CACxB;AAED,MAAM,WAAW,gBAAgB;IAC/B,OAAO,EAAE,OAAO,CAAC;IACjB,MAAM,EAAE,UAAU,CAAC;IACnB,OAAO,CAAC,EAAE,QAAQ,EAAE,CAAC;IACrB,kBAAkB,CAAC,EAAE,mBAAmB,EAAE,CAAC;IAC3C,KAAK,CAAC,EAAE,WAAW,CAAC;IACpB,UAAU,CAAC,EAAE,WAAW,EAAE,CAAC;IAC3B,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,UAAU,CAAC,EAAE,CAAC,QAAQ,EAAE,YAAY,KAAK,IAAI,CAAC;CAC/C;AAED,MAAM,WAAW,eAAe;IAC9B,OAAO,EAAE,KAAK,CAAC,cAAc,GAAG;QAAE,MAAM,EAAE,WAAW,EAAE,CAAA;KAAE,CAAC,CAAC;IAC3D,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,gBAAgB,CAAC,CAAC;IAC7C,UAAU,EAAE,eAAe,EAAE,CAAC;IAC9B,KAAK,EAAE;QAAE,KAAK,EAAE,MAAM,CAAC;QAAC,QAAQ,EAAE,MAAM,CAAC;QAAC,IAAI,EAAE,MAAM,CAAA;KAAE,CAAC;IACzD,MAAM,EAAE,CAAC,IAAI,EAAE,YAAY,GAAG,YAAY,EAAE,EAAE,OAAO,CAAC,EAAE,eAAe,KAAK,IAAI,CAAC;IACjF,YAAY,EAAE,CAAC,IAAI,EAAE,MAAM,KAAK,IAAI,CAAC;CACtC;AAMD,qBAAa,SAAS;IACpB,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAU;IAClC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAa;IACpC,OAAO,CAAC,QAAQ,CAAC,YAAY,CAAa;IAC1C,OAAO,CAAC,QAAQ,CAAC,kBAAkB,CAAwB;IAC3D,OAAO,CAAC,QAAQ,CAAC,YAAY,CAAgB;IAC7C,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAS;IACrC,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAS;IACjC,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAS;IACjC,OAAO,CAAC,QAAQ,CAAC,UAAU,CAAC,CAAmC;gBAEnD,IAAI,EAAE,gBAAgB;IAoClC,OAAO,CAAC,cAAc;IAuBhB,GAAG,IAAI,OAAO,CAAC,eAAe,CAAC;YA6FvB,WAAW;YAgBX,kBAAkB;YAyBlB,eAAe;YAMf,kBA
AkB;YA+BlB,mBAAmB;IAKjC,OAAO,CAAC,eAAe;CAuBxB"}
|