@cogitator-ai/evals 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +456 -0
- package/dist/assertions/custom.d.ts +11 -0
- package/dist/assertions/custom.d.ts.map +1 -0
- package/dist/assertions/custom.js +13 -0
- package/dist/assertions/custom.js.map +1 -0
- package/dist/assertions/index.d.ts +27 -0
- package/dist/assertions/index.d.ts.map +1 -0
- package/dist/assertions/index.js +4 -0
- package/dist/assertions/index.js.map +1 -0
- package/dist/assertions/regression.d.ts +5 -0
- package/dist/assertions/regression.d.ts.map +1 -0
- package/dist/assertions/regression.js +58 -0
- package/dist/assertions/regression.js.map +1 -0
- package/dist/assertions/threshold.d.ts +3 -0
- package/dist/assertions/threshold.d.ts.map +1 -0
- package/dist/assertions/threshold.js +45 -0
- package/dist/assertions/threshold.js.map +1 -0
- package/dist/datasets/csv-loader.d.ts +3 -0
- package/dist/datasets/csv-loader.d.ts.map +1 -0
- package/dist/datasets/csv-loader.js +43 -0
- package/dist/datasets/csv-loader.js.map +1 -0
- package/dist/datasets/dataset.d.ts +15 -0
- package/dist/datasets/dataset.d.ts.map +1 -0
- package/dist/datasets/dataset.js +62 -0
- package/dist/datasets/dataset.js.map +1 -0
- package/dist/datasets/index.d.ts +4 -0
- package/dist/datasets/index.d.ts.map +1 -0
- package/dist/datasets/index.js +4 -0
- package/dist/datasets/index.js.map +1 -0
- package/dist/datasets/jsonl-loader.d.ts +3 -0
- package/dist/datasets/jsonl-loader.d.ts.map +1 -0
- package/dist/datasets/jsonl-loader.js +27 -0
- package/dist/datasets/jsonl-loader.js.map +1 -0
- package/dist/eval-builder.d.ts +30 -0
- package/dist/eval-builder.d.ts.map +1 -0
- package/dist/eval-builder.js +82 -0
- package/dist/eval-builder.js.map +1 -0
- package/dist/eval-comparison.d.ts +43 -0
- package/dist/eval-comparison.d.ts.map +1 -0
- package/dist/eval-comparison.js +125 -0
- package/dist/eval-comparison.js.map +1 -0
- package/dist/eval-suite.d.ts +63 -0
- package/dist/eval-suite.d.ts.map +1 -0
- package/dist/eval-suite.js +230 -0
- package/dist/eval-suite.js.map +1 -0
- package/dist/index.d.ts +31 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +20 -0
- package/dist/index.js.map +1 -0
- package/dist/metrics/custom.d.ts +18 -0
- package/dist/metrics/custom.d.ts.map +1 -0
- package/dist/metrics/custom.js +28 -0
- package/dist/metrics/custom.js.map +1 -0
- package/dist/metrics/deterministic.d.ts +11 -0
- package/dist/metrics/deterministic.d.ts.map +1 -0
- package/dist/metrics/deterministic.js +74 -0
- package/dist/metrics/deterministic.js.map +1 -0
- package/dist/metrics/index.d.ts +8 -0
- package/dist/metrics/index.d.ts.map +1 -0
- package/dist/metrics/index.js +5 -0
- package/dist/metrics/index.js.map +1 -0
- package/dist/metrics/llm-judge.d.ts +27 -0
- package/dist/metrics/llm-judge.d.ts.map +1 -0
- package/dist/metrics/llm-judge.js +77 -0
- package/dist/metrics/llm-judge.js.map +1 -0
- package/dist/metrics/statistical.d.ts +5 -0
- package/dist/metrics/statistical.d.ts.map +1 -0
- package/dist/metrics/statistical.js +85 -0
- package/dist/metrics/statistical.js.map +1 -0
- package/dist/metrics/types.d.ts +31 -0
- package/dist/metrics/types.d.ts.map +1 -0
- package/dist/metrics/types.js +2 -0
- package/dist/metrics/types.js.map +1 -0
- package/dist/reporters/ci.d.ts +3 -0
- package/dist/reporters/ci.d.ts.map +1 -0
- package/dist/reporters/ci.js +21 -0
- package/dist/reporters/ci.js.map +1 -0
- package/dist/reporters/console.d.ts +3 -0
- package/dist/reporters/console.d.ts.map +1 -0
- package/dist/reporters/console.js +46 -0
- package/dist/reporters/console.js.map +1 -0
- package/dist/reporters/csv.d.ts +5 -0
- package/dist/reporters/csv.d.ts.map +1 -0
- package/dist/reporters/csv.js +31 -0
- package/dist/reporters/csv.js.map +1 -0
- package/dist/reporters/index.d.ts +50 -0
- package/dist/reporters/index.d.ts.map +1 -0
- package/dist/reporters/index.js +28 -0
- package/dist/reporters/index.js.map +1 -0
- package/dist/reporters/json.d.ts +5 -0
- package/dist/reporters/json.d.ts.map +1 -0
- package/dist/reporters/json.js +5 -0
- package/dist/reporters/json.js.map +1 -0
- package/dist/schema.d.ts +29 -0
- package/dist/schema.d.ts.map +1 -0
- package/dist/schema.js +23 -0
- package/dist/schema.js.map +1 -0
- package/dist/stats/index.d.ts +6 -0
- package/dist/stats/index.d.ts.map +1 -0
- package/dist/stats/index.js +4 -0
- package/dist/stats/index.js.map +1 -0
- package/dist/stats/mcnemar.d.ts +7 -0
- package/dist/stats/mcnemar.d.ts.map +1 -0
- package/dist/stats/mcnemar.js +34 -0
- package/dist/stats/mcnemar.js.map +1 -0
- package/dist/stats/percentiles.d.ts +15 -0
- package/dist/stats/percentiles.d.ts.map +1 -0
- package/dist/stats/percentiles.js +54 -0
- package/dist/stats/percentiles.js.map +1 -0
- package/dist/stats/t-test.d.ts +9 -0
- package/dist/stats/t-test.d.ts.map +1 -0
- package/dist/stats/t-test.js +129 -0
- package/dist/stats/t-test.js.map +1 -0
- package/dist/tools.d.ts +16 -0
- package/dist/tools.d.ts.map +1 -0
- package/dist/tools.js +58 -0
- package/dist/tools.js.map +1 -0
- package/package.json +57 -0
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
import { mean, stdDev } from './percentiles';
|
|
2
|
+
/**
 * Natural logarithm of the absolute value of the gamma function, ln|Γ(z)|,
 * evaluated with a Lanczos series (g = 7, nine coefficients).
 *
 * For z < 0.5 the reflection identity Γ(z)·Γ(1 − z) = π / sin(πz) is used.
 * The absolute value is taken before the logarithm so that negative arguments
 * for which Γ(z) is negative (for example z = −0.5) produce ln|Γ(z)| rather
 * than NaN.
 *
 * @param {number} z - Real argument.
 * @returns {number} ln|Γ(z)|.
 */
function lnGamma(z) {
    const g = 7;
    const coefficients = [
        0.99999999999980993, 676.5203681218851, -1259.1392167224028, 771.32342877765313,
        -176.61502916214059, 12.507343278686905, -0.13857109526572012, 9.9843695780195716e-6,
        1.5056327351493116e-7,
    ];
    if (z < 0.5) {
        // Reflection: 1 - z is >= 0.5, so the recursion is at most one level deep.
        return Math.log(Math.abs(Math.PI / Math.sin(Math.PI * z))) - lnGamma(1 - z);
    }
    // The Lanczos series is written in terms of z - 1; keep the parameter untouched.
    const shifted = z - 1;
    let series = coefficients[0];
    for (let i = 1; i < coefficients.length; i++) {
        series += coefficients[i] / (shifted + i);
    }
    const t = shifted + g + 0.5;
    return 0.5 * Math.log(2 * Math.PI) + (shifted + 0.5) * Math.log(t) - t + Math.log(series);
}
|
|
20
|
+
/**
 * Evaluates the continued fraction that appears in the incomplete beta
 * function, using a Lentz-style forward recurrence.
 *
 * Each pass of the loop applies one even and one odd term of the fraction.
 * Intermediate numerators/denominators whose magnitude drops below 1e-30 are
 * replaced by 1e-30 to avoid dividing by (near) zero. Iteration stops when the
 * per-pass correction factor is within 1e-14 of 1, or after 200 passes, in
 * which case the value reached so far is returned.
 *
 * @param {number} x - Point at which the fraction is evaluated.
 * @param {number} a - First shape parameter.
 * @param {number} b - Second shape parameter.
 * @returns {number} Value of the continued fraction.
 */
function betaCf(x, a, b) {
    const TINY = 1e-30;
    const TOLERANCE = 1e-14;
    const MAX_PASSES = 200;
    // Keeps a value away from zero; NaN passes through unchanged.
    const awayFromZero = (value) => (Math.abs(value) < TINY ? TINY : value);

    const sumAB = a + b;
    const aPlusOne = a + 1;
    const aMinusOne = a - 1;

    let numer = 1.0;
    let denom = 1.0 / awayFromZero(1.0 - (sumAB * x) / aPlusOne);
    let fraction = denom;

    for (let pass = 1; pass <= MAX_PASSES; pass += 1) {
        const twoPass = 2 * pass;

        // Even-indexed term of the fraction.
        const evenTerm = (pass * (b - pass) * x) / ((aMinusOne + twoPass) * (a + twoPass));
        denom = awayFromZero(1.0 + evenTerm * denom);
        numer = awayFromZero(1.0 + evenTerm / numer);
        denom = 1.0 / denom;
        fraction *= denom * numer;

        // Odd-indexed term of the fraction.
        const oddTerm = (-(a + pass) * (sumAB + pass) * x) / ((a + twoPass) * (aPlusOne + twoPass));
        denom = awayFromZero(1.0 + oddTerm * denom);
        numer = awayFromZero(1.0 + oddTerm / numer);
        denom = 1.0 / denom;
        const correction = denom * numer;
        fraction *= correction;

        if (Math.abs(correction - 1.0) < TOLERANCE) {
            break;
        }
    }
    return fraction;
}
|
|
56
|
+
/**
 * Regularized incomplete beta function I_x(a, b).
 *
 * Values of x at or below 0 map to 0 and values at or above 1 map to 1.
 * When x lies above (a + 1) / (a + b + 2) the symmetry
 * I_x(a, b) = 1 − I_{1−x}(b, a) is applied before the continued fraction is
 * evaluated.
 *
 * @param {number} x - Upper integration limit.
 * @param {number} a - First shape parameter.
 * @param {number} b - Second shape parameter.
 * @returns {number} I_x(a, b).
 */
function regularizedIncompleteBeta(x, a, b) {
    if (x <= 0) {
        return 0;
    }
    if (x >= 1) {
        return 1;
    }

    const switchPoint = (a + 1) / (a + b + 2);
    if (x > switchPoint) {
        // After the swap the mirrored argument falls below its own switch point,
        // so this recursion does not repeat.
        return 1.0 - regularizedIncompleteBeta(1.0 - x, b, a);
    }

    // ln B(a, b) via log-gamma, then x^a (1 - x)^b / B(a, b) in log space.
    const logBeta = lnGamma(a) + lnGamma(b) - lnGamma(a + b);
    const prefactor = Math.exp(a * Math.log(x) + b * Math.log(1 - x) - logBeta);
    return (prefactor * betaCf(x, a, b)) / a;
}
|
|
68
|
+
/**
 * Cumulative distribution function of Student's t distribution.
 *
 * Uses the identity P(|T| > |t|) = I_{df / (df + t²)}(df / 2, 1 / 2) and
 * mirrors the result around 0.5 according to the sign of `t`.
 *
 * @param {number} t - Point at which the CDF is evaluated.
 * @param {number} df - Degrees of freedom.
 * @returns {number} P(T <= t).
 */
function tDistCdf(t, df) {
    const twoSidedTail = regularizedIncompleteBeta(df / (df + t * t), df / 2, 0.5);
    const centralMass = 1 - twoSidedTail;
    return 0.5 * (1 + Math.sign(t) * centralMass);
}
|
|
73
|
+
/**
 * Quantile (inverse CDF) of Student's t distribution, found by bisection on
 * `tDistCdf`.
 *
 * The search starts from the bracket [−100, 100]. If the requested quantile
 * lies outside that bracket (heavy tails, e.g. df = 1 with p close to 0 or 1)
 * the bracket is doubled outwards, a bounded number of times, until it
 * contains the quantile; previously such quantiles were silently clamped to
 * roughly ±100. Quantiles inside the initial bracket are computed exactly as
 * before.
 *
 * @param {number} p - Probability.
 * @param {number} df - Degrees of freedom.
 * @returns {number} t such that P(T <= t) is approximately p; −Infinity for
 *   p <= 0 and Infinity for p >= 1.
 */
function tDistQuantile(p, df) {
    if (p <= 0) {
        return -Infinity;
    }
    if (p >= 1) {
        return Infinity;
    }

    const BASE_ITERATIONS = 100;
    const MAX_DOUBLINGS = 60;

    let lo = -100;
    let hi = 100;
    let doublings = 0;

    // Widen to the right while the upper bound is still below the quantile.
    while (doublings < MAX_DOUBLINGS && tDistCdf(hi, df) < p) {
        lo = hi;
        hi *= 2;
        doublings += 1;
    }
    // Widen to the left while the lower bound is already at or above the quantile.
    while (doublings < MAX_DOUBLINGS && tDistCdf(lo, df) >= p) {
        hi = lo;
        lo *= 2;
        doublings += 1;
    }

    // One extra halving per doubling keeps the final bracket at least as tight
    // as the original 100-step search on [-100, 100].
    const iterations = BASE_ITERATIONS + doublings;
    for (let i = 0; i < iterations; i++) {
        const mid = (lo + hi) / 2;
        if (tDistCdf(mid, df) < p) {
            lo = mid;
        }
        else {
            hi = mid;
        }
    }
    return (lo + hi) / 2;
}
|
|
91
|
+
/**
 * Two-sided paired t-test on the element-wise differences `samplesA[i] − samplesB[i]`.
 *
 * @param {number[]} samplesA - First set of measurements.
 * @param {number[]} samplesB - Second set of measurements, paired by index with `samplesA`.
 * @returns {{
 *   tStatistic: number,
 *   degreesOfFreedom: number,
 *   pValue: number,
 *   significant: boolean,
 *   confidenceInterval: [number, number]
 * }} Test result. `significant` is `pValue < 0.05`; `confidenceInterval` is the
 *   interval for the mean difference built from the 0.975 t-quantile.
 * @throws {Error} If the arrays differ in length or contain fewer than 2 pairs.
 */
export function pairedTTest(samplesA, samplesB) {
    if (samplesA.length !== samplesB.length) {
        throw new Error('Sample arrays must have equal length');
    }
    if (samplesA.length < 2) {
        throw new Error('Need at least 2 paired samples');
    }

    const n = samplesA.length;
    const diffs = Array.from({ length: n }, (_, i) => samplesA[i] - samplesB[i]);

    const meanD = mean(diffs);
    // NOTE(review): `stdDev` comes from ./percentiles; presumably the sample
    // standard deviation (n - 1 denominator) — confirm against that module.
    const sdD = stdDev(diffs);
    const df = n - 1;

    if (sdD === 0) {
        // Every difference is identical, so the statistic is degenerate: 0 when
        // there is no difference at all, otherwise infinite with the SIGN of the
        // mean difference (a negative shift must not be reported as +Infinity).
        const hasDifference = meanD !== 0;
        let tStatistic = 0;
        if (hasDifference) {
            tStatistic = meanD > 0 ? Infinity : -Infinity;
        }
        return {
            tStatistic,
            degreesOfFreedom: df,
            pValue: hasDifference ? 0 : 1,
            significant: hasDifference,
            confidenceInterval: [meanD, meanD],
        };
    }

    const se = sdD / Math.sqrt(n);
    const tStat = meanD / se;

    // Two-sided p-value: twice the upper-tail mass beyond |t|.
    const pValue = 2 * (1 - tDistCdf(Math.abs(tStat), df));

    const tCritical = tDistQuantile(0.975, df);
    const halfWidth = tCritical * se;

    return {
        tStatistic: tStat,
        degreesOfFreedom: df,
        pValue,
        significant: pValue < 0.05,
        confidenceInterval: [meanD - halfWidth, meanD + halfWidth],
    };
}
|
|
129
|
+
//# sourceMappingURL=t-test.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"t-test.js","sourceRoot":"","sources":["../../src/stats/t-test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,IAAI,EAAE,MAAM,EAAE,MAAM,eAAe,CAAC;AAU7C,SAAS,OAAO,CAAC,CAAS;IACxB,MAAM,CAAC,GAAG,CAAC,CAAC;IACZ,MAAM,YAAY,GAAG;QACnB,mBAAmB,EAAE,iBAAiB,EAAE,CAAC,kBAAkB,EAAE,kBAAkB;QAC/E,CAAC,kBAAkB,EAAE,kBAAkB,EAAE,CAAC,mBAAmB,EAAE,qBAAqB;QACpF,qBAAqB;KACtB,CAAC;IAEF,IAAI,CAAC,GAAG,GAAG,EAAE,CAAC;QACZ,OAAO,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC,GAAG,OAAO,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IACpE,CAAC;IAED,CAAC,IAAI,CAAC,CAAC;IACP,IAAI,CAAC,GAAG,YAAY,CAAC,CAAC,CAAC,CAAC;IACxB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QAC/B,CAAC,IAAI,YAAY,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IACjC,CAAC;IAED,MAAM,CAAC,GAAG,CAAC,GAAG,CAAC,GAAG,GAAG,CAAC;IACtB,OAAO,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC,EAAE,CAAC,GAAG,CAAC,CAAC,GAAG,GAAG,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;AACjF,CAAC;AAED,SAAS,MAAM,CAAC,CAAS,EAAE,CAAS,EAAE,CAAS;IAC7C,MAAM,GAAG,GAAG,CAAC,GAAG,CAAC,CAAC;IAClB,MAAM,GAAG,GAAG,CAAC,GAAG,CAAC,CAAC;IAClB,MAAM,GAAG,GAAG,CAAC,GAAG,CAAC,CAAC;IAElB,IAAI,CAAC,GAAG,GAAG,CAAC;IACZ,IAAI,CAAC,GAAG,GAAG,GAAG,CAAC,GAAG,GAAG,CAAC,CAAC,GAAG,GAAG,CAAC;IAC9B,IAAI,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,KAAK;QAAE,CAAC,GAAG,KAAK,CAAC;IACnC,CAAC,GAAG,GAAG,GAAG,CAAC,CAAC;IACZ,IAAI,CAAC,GAAG,CAAC,CAAC;IAEV,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC;QAC9B,MAAM,EAAE,GAAG,CAAC,GAAG,CAAC,CAAC;QAEjB,IAAI,EAAE,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC;QACrD,CAAC,GAAG,GAAG,GAAG,EAAE,GAAG,CAAC,CAAC;QACjB,IAAI,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,KAAK;YAAE,CAAC,GAAG,KAAK,CAAC;QACnC,CAAC,GAAG,GAAG,GAAG,EAAE,GAAG,CAAC,CAAC;QACjB,IAAI,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,KAAK;YAAE,CAAC,GAAG,KAAK,CAAC;QACnC,CAAC,GAAG,GAAG,GAAG,C
AAC,CAAC;QACZ,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAEX,EAAE,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,GAAG,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,GAAG,GAAG,EAAE,CAAC,CAAC,CAAC;QAC1D,CAAC,GAAG,GAAG,GAAG,EAAE,GAAG,CAAC,CAAC;QACjB,IAAI,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,KAAK;YAAE,CAAC,GAAG,KAAK,CAAC;QACnC,CAAC,GAAG,GAAG,GAAG,EAAE,GAAG,CAAC,CAAC;QACjB,IAAI,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,KAAK;YAAE,CAAC,GAAG,KAAK,CAAC;QACnC,CAAC,GAAG,GAAG,GAAG,CAAC,CAAC;QACZ,MAAM,GAAG,GAAG,CAAC,GAAG,CAAC,CAAC;QAClB,CAAC,IAAI,GAAG,CAAC;QAET,IAAI,IAAI,CAAC,GAAG,CAAC,GAAG,GAAG,GAAG,CAAC,GAAG,KAAK;YAAE,MAAM;IACzC,CAAC;IAED,OAAO,CAAC,CAAC;AACX,CAAC;AAED,SAAS,yBAAyB,CAAC,CAAS,EAAE,CAAS,EAAE,CAAS;IAChE,IAAI,CAAC,IAAI,CAAC;QAAE,OAAO,CAAC,CAAC;IACrB,IAAI,CAAC,IAAI,CAAC;QAAE,OAAO,CAAC,CAAC;IAErB,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC;QAC9B,OAAO,GAAG,GAAG,yBAAyB,CAAC,GAAG,GAAG,CAAC,EAAE,CAAC,EAAE,CAAC,CAAC,CAAC;IACxD,CAAC;IAED,MAAM,MAAM,GAAG,OAAO,CAAC,CAAC,CAAC,GAAG,OAAO,CAAC,CAAC,CAAC,GAAG,OAAO,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IACxD,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,MAAM,CAAC,CAAC;IAEvE,OAAO,CAAC,KAAK,GAAG,MAAM,CAAC,CAAC,EAAE,CAAC,EAAE,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC;AACvC,CAAC;AAED,SAAS,QAAQ,CAAC,CAAS,EAAE,EAAU;IACrC,MAAM,CAAC,GAAG,EAAE,GAAG,CAAC,EAAE,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC;IAC5B,MAAM,KAAK,GAAG,yBAAyB,CAAC,CAAC,EAAE,EAAE,GAAG,CAAC,EAAE,GAAG,CAAC,CAAC;IACxD,OAAO,GAAG,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,KAAK,CAAC,CAAC,CAAC;AAChD,CAAC;AAED,SAAS,aAAa,CAAC,CAAS,EAAE,EAAU;IAC1C,IAAI,CAAC,IAAI,CAAC;QAAE,OAAO,CAAC,QAAQ,CAAC;IAC7B,IAAI,CAAC,IAAI,CAAC;QAAE,OAAO,QAAQ,CAAC;IAE5B,IAAI,EAAE,GAAG,CAAC,GAAG,CAAC;IACd,IAAI,EAAE,GAAG,GAAG,CAAC;IACb,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC;QAC7B,MAAM,GAAG,GAAG,CAAC,EAAE,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC;QAC1B,IAAI,Q
AAQ,CAAC,GAAG,EAAE,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC;YAC1B,EAAE,GAAG,GAAG,CAAC;QACX,CAAC;aAAM,CAAC;YACN,EAAE,GAAG,GAAG,CAAC;QACX,CAAC;IACH,CAAC;IACD,OAAO,CAAC,EAAE,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC;AACvB,CAAC;AAED,MAAM,UAAU,WAAW,CAAC,QAAkB,EAAE,QAAkB;IAChE,IAAI,QAAQ,CAAC,MAAM,KAAK,QAAQ,CAAC,MAAM,EAAE,CAAC;QACxC,MAAM,IAAI,KAAK,CAAC,sCAAsC,CAAC,CAAC;IAC1D,CAAC;IACD,IAAI,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACxB,MAAM,IAAI,KAAK,CAAC,gCAAgC,CAAC,CAAC;IACpD,CAAC;IAED,MAAM,CAAC,GAAG,QAAQ,CAAC,MAAM,CAAC;IAC1B,MAAM,KAAK,GAAG,IAAI,KAAK,CAAS,CAAC,CAAC,CAAC;IACnC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QAC3B,KAAK,CAAC,CAAC,CAAC,GAAG,QAAQ,CAAC,CAAC,CAAC,GAAG,QAAQ,CAAC,CAAC,CAAC,CAAC;IACvC,CAAC;IAED,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC;IAC1B,MAAM,GAAG,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC;IAC1B,MAAM,EAAE,GAAG,CAAC,GAAG,CAAC,CAAC;IAEjB,IAAI,GAAG,KAAK,CAAC,EAAE,CAAC;QACd,OAAO;YACL,UAAU,EAAE,KAAK,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,QAAQ;YACtC,gBAAgB,EAAE,EAAE;YACpB,MAAM,EAAE,KAAK,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;YAC3B,WAAW,EAAE,KAAK,KAAK,CAAC;YACxB,kBAAkB,EAAE,CAAC,KAAK,EAAE,KAAK,CAAC;SACnC,CAAC;IACJ,CAAC;IAED,MAAM,EAAE,GAAG,GAAG,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAC9B,MAAM,KAAK,GAAG,KAAK,GAAG,EAAE,CAAC;IAEzB,MAAM,MAAM,GAAG,QAAQ,CAAC,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,EAAE,CAAC,CAAC;IAC7C,MAAM,MAAM,GAAG,CAAC,GAAG,CAAC,CAAC,GAAG,MAAM,CAAC,CAAC;IAEhC,MAAM,SAAS,GAAG,aAAa,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;IAC3C,MAAM,EAAE,GAAqB,CAAC,KAAK,GAAG,SAAS,GAAG,EAAE,EAAE,KAAK,GAAG,SAAS,GAAG,EAAE,CAAC,CAAC;IAE9E,OAAO;QACL,UAAU,EAAE,KAAK;QACjB,gBAAgB,EAAE,EAAE;QACpB,MAAM;QACN,WAAW,EAAE,MAAM,GAAG,IAAI;QAC1B,kBAAkB,EAAE,EAAE;KACvB,CAAC;AACJ,CAAC"}
|
package/dist/tools.d.ts
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import { z } from 'zod';
|
|
2
|
+
import type { EvalSuite } from './eval-suite';
|
|
3
|
+
/**
 * Descriptor of a tool exposed by this package.
 *
 * - `name`: tool identifier.
 * - `description`: human-readable description of the tool.
 * - `parameters`: Zod schema describing the `TParams` accepted by `execute`.
 * - `execute`: async callback that receives `TParams` and resolves to an
 *   untyped (`unknown`) result.
 */
export interface EvalTool<TParams = unknown> {
|
|
4
|
+
name: string;
|
|
5
|
+
description: string;
|
|
6
|
+
parameters: z.ZodType<TParams>;
|
|
7
|
+
execute: (params: TParams) => Promise<unknown>;
|
|
8
|
+
}
|
|
9
|
+
/**
 * Zod object schema for the `run_eval` tool parameters: a single optional
 * numeric `maxCases` field. `RunEvalParams` below is the type inferred from it.
 */
declare const RunEvalParamsSchema: z.ZodObject<{
|
|
10
|
+
maxCases: z.ZodOptional<z.ZodNumber>;
|
|
11
|
+
}, z.core.$strip>;
|
|
12
|
+
type RunEvalParams = z.infer<typeof RunEvalParamsSchema>;
|
|
13
|
+
/** Builds the `run_eval` tool descriptor bound to the given suite. */
export declare function createRunEvalTool(suite: EvalSuite): EvalTool<RunEvalParams>;
|
|
14
|
+
/** Returns the tools available for a suite as a one-element tuple. */
export declare function evalTools(suite: EvalSuite): [EvalTool<RunEvalParams>];
|
|
15
|
+
export {};
|
|
16
|
+
//# sourceMappingURL=tools.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"tools.d.ts","sourceRoot":"","sources":["../src/tools.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AACxB,OAAO,KAAK,EAAE,SAAS,EAAmB,MAAM,cAAc,CAAC;AAE/D,MAAM,WAAW,QAAQ,CAAC,OAAO,GAAG,OAAO;IACzC,IAAI,EAAE,MAAM,CAAC;IACb,WAAW,EAAE,MAAM,CAAC;IACpB,UAAU,EAAE,CAAC,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;IAC/B,OAAO,EAAE,CAAC,MAAM,EAAE,OAAO,KAAK,OAAO,CAAC,OAAO,CAAC,CAAC;CAChD;AAED,QAAA,MAAM,mBAAmB;;iBAEvB,CAAC;AAEH,KAAK,aAAa,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,mBAAmB,CAAC,CAAC;AAwCzD,wBAAgB,iBAAiB,CAAC,KAAK,EAAE,SAAS,GAAG,QAAQ,CAAC,aAAa,CAAC,CAc3E;AAED,wBAAgB,SAAS,CAAC,KAAK,EAAE,SAAS,GAAG,CAAC,QAAQ,CAAC,aAAa,CAAC,CAAC,CAErE"}
|
package/dist/tools.js
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
import { z } from 'zod';
|
|
2
|
+
// Optional cap on how many cases are summarised: a positive integer when present.
const maxCasesSchema = z.number().int().positive().optional();

// Parameter schema for the `run_eval` tool.
const RunEvalParamsSchema = z.object({ maxCases: maxCasesSchema });
|
|
5
|
+
/**
 * Condenses a suite run into a flat summary object.
 *
 * Without an effective cap, the summary reflects the whole run: suite-level
 * duration and cost from `result.stats`, per-metric means from
 * `result.aggregated`, and `assertionsPassed` computed from
 * `result.assertions`.
 *
 * With an effective cap (`maxCases` positive and smaller than the number of
 * results) only the first `maxCases` results are summarised: duration and cost
 * are summed per case, metric means are recomputed from the per-case scores,
 * and `assertionsPassed` is reported as `true` (suite assertions are not
 * re-evaluated for a subset).
 *
 * A cap that is at least as large as the result set excludes nothing, so it is
 * treated as "no cap"; otherwise a large `maxCases` would mask failed
 * assertions and replace the suite-level statistics.
 *
 * @param {object} result - Suite result with `results`, `stats`, `aggregated` and `assertions`.
 * @param {number} [maxCases] - Optional maximum number of cases to summarise.
 * @returns {{success: true, total: number, duration: number, cost: number,
 *   metrics: Object<string, number>, assertionsPassed: boolean}} Summary.
 */
function buildSummary(result, maxCases) {
    const allCases = result.results;
    const isCapped = maxCases > 0 && maxCases < allCases.length;
    const metrics = {};

    if (!isCapped) {
        for (const [name, agg] of Object.entries(result.aggregated)) {
            metrics[name] = agg.mean;
        }
        return {
            success: true,
            total: allCases.length,
            duration: result.stats.duration,
            cost: result.stats.cost,
            metrics,
            assertionsPassed: result.assertions.every((a) => a.passed),
        };
    }

    const capped = allCases.slice(0, maxCases);
    const duration = capped.reduce((sum, r) => sum + r.duration, 0);
    // Cases without usage information contribute no cost.
    const cost = capped.reduce((sum, r) => sum + (r.usage?.cost ?? 0), 0);

    // Group the per-case scores by metric name, then average each group.
    const scoresByMetric = new Map();
    for (const r of capped) {
        for (const s of r.scores) {
            const bucket = scoresByMetric.get(s.name);
            if (bucket) {
                bucket.push(s.score);
            }
            else {
                scoresByMetric.set(s.name, [s.score]);
            }
        }
    }
    for (const [name, values] of scoresByMetric) {
        metrics[name] = values.reduce((a, b) => a + b, 0) / values.length;
    }

    return {
        success: true,
        total: capped.length,
        duration,
        cost,
        metrics,
        assertionsPassed: true,
    };
}
|
|
39
|
+
/**
 * Creates the `run_eval` tool for a suite.
 *
 * The tool's `execute` runs the suite and returns the summary produced by
 * `buildSummary`. Any error thrown while running or summarising is caught and
 * returned as `{ success: false, error }` instead of being rethrown.
 *
 * @param {object} suite - Suite exposing an async `run()` method.
 * @returns {{name: string, description: string, parameters: object, execute: Function}} Tool descriptor.
 */
export function createRunEvalTool(suite) {
    const execute = async ({ maxCases }) => {
        try {
            return buildSummary(await suite.run(), maxCases);
        }
        catch (err) {
            // Non-Error throwables are stringified so `error` is always a string.
            const error = err instanceof Error ? err.message : String(err);
            return { success: false, error };
        }
    };

    return {
        name: 'run_eval',
        description: 'Run an evaluation suite against the configured dataset and target',
        parameters: RunEvalParamsSchema,
        execute,
    };
}
|
|
55
|
+
/**
 * Returns every tool available for a suite; currently only `run_eval`.
 *
 * @param {object} suite - Suite the tools operate on.
 * @returns {Array} One-element array holding the `run_eval` tool.
 */
export function evalTools(suite) {
    const runEvalTool = createRunEvalTool(suite);
    return [runEvalTool];
}
|
|
58
|
+
//# sourceMappingURL=tools.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"tools.js","sourceRoot":"","sources":["../src/tools.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAUxB,MAAM,mBAAmB,GAAG,CAAC,CAAC,MAAM,CAAC;IACnC,QAAQ,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,QAAQ,EAAE,CAAC,QAAQ,EAAE;CACjD,CAAC,CAAC;AAIH,SAAS,YAAY,CAAC,MAAuB,EAAE,QAAiB;IAC9D,MAAM,MAAM,GAAG,QAAQ,CAAC,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,QAAQ,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC;IAE7E,MAAM,KAAK,GAAG,MAAM,CAAC,MAAM,CAAC;IAC5B,MAAM,QAAQ,GAAG,QAAQ;QACvB,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC;QAChD,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,QAAQ,CAAC;IAC1B,MAAM,IAAI,GAAG,QAAQ;QACnB,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,KAAK,EAAE,IAAI,IAAI,CAAC,CAAC,EAAE,CAAC,CAAC;QAC1D,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC;IAEtB,MAAM,OAAO,GAA2B,EAAE,CAAC;IAC3C,IAAI,QAAQ,IAAI,KAAK,GAAG,CAAC,EAAE,CAAC;QAC1B,MAAM,cAAc,GAAG,IAAI,GAAG,EAAoB,CAAC;QACnD,KAAK,MAAM,CAAC,IAAI,MAAM,EAAE,CAAC;YACvB,KAAK,MAAM,CAAC,IAAI,CAAC,CAAC,MAAM,EAAE,CAAC;gBACzB,IAAI,GAAG,GAAG,cAAc,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;gBACrC,IAAI,CAAC,GAAG,EAAE,CAAC;oBACT,GAAG,GAAG,EAAE,CAAC;oBACT,cAAc,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;gBAClC,CAAC;gBACD,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC;YACpB,CAAC;QACH,CAAC;QACD,KAAK,MAAM,CAAC,IAAI,EAAE,MAAM,CAAC,IAAI,cAAc,EAAE,CAAC;YAC5C,OAAO,CAAC,IAAI,CAAC,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,GAAG,MAAM,CAAC,MAAM,CAAC;QACpE,CAAC;IACH,CAAC;SAAM,CAAC;QACN,KAAK,MAAM,CAAC,IAAI,EAAE,GAAG,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,UAAU,CAAC,EAAE,CAAC;YAC5D,OAAO,CAAC,IAAI,CAAC,GAAG,GAAG,CAAC,IAAI,CAAC;QAC3B,CAAC;IACH,CAAC;IAED,MAAM,gBAAgB,GAAG,QAAQ,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,UAAU,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;IAEpF,OAAO,EAAE,OAAO,EAAE,IAAa,EAAE,KAAK,EAAE,QAAQ,EAAE,IAAI,EAAE,OAAO,EAAE,gBAAgB,EAAE,CAAC;AACt
F,CAAC;AAED,MAAM,UAAU,iBAAiB,CAAC,KAAgB;IAChD,OAAO;QACL,IAAI,EAAE,UAAU;QAChB,WAAW,EAAE,mEAAmE;QAChF,UAAU,EAAE,mBAAmB;QAC/B,OAAO,EAAE,KAAK,EAAE,EAAE,QAAQ,EAAE,EAAE,EAAE;YAC9B,IAAI,CAAC;gBACH,MAAM,MAAM,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE,CAAC;gBACjC,OAAO,YAAY,CAAC,MAAM,EAAE,QAAQ,CAAC,CAAC;YACxC,CAAC;YAAC,OAAO,GAAG,EAAE,CAAC;gBACb,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,KAAK,EAAE,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC;YACrF,CAAC;QACH,CAAC;KACF,CAAC;AACJ,CAAC;AAED,MAAM,UAAU,SAAS,CAAC,KAAgB;IACxC,OAAO,CAAC,iBAAiB,CAAC,KAAK,CAAC,CAAC,CAAC;AACpC,CAAC"}
|
package/package.json
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@cogitator-ai/evals",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Evaluation framework for Cogitator AI agents",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "./dist/index.js",
|
|
7
|
+
"types": "./dist/index.d.ts",
|
|
8
|
+
"exports": {
|
|
9
|
+
".": {
|
|
10
|
+
"types": "./dist/index.d.ts",
|
|
11
|
+
"import": "./dist/index.js"
|
|
12
|
+
}
|
|
13
|
+
},
|
|
14
|
+
"files": [
|
|
15
|
+
"dist"
|
|
16
|
+
],
|
|
17
|
+
"scripts": {
|
|
18
|
+
"build": "tsc",
|
|
19
|
+
"dev": "tsc --watch",
|
|
20
|
+
"clean": "rm -rf dist",
|
|
21
|
+
"typecheck": "tsc --noEmit",
|
|
22
|
+
"test": "vitest run",
|
|
23
|
+
"test:watch": "vitest"
|
|
24
|
+
},
|
|
25
|
+
"dependencies": {
|
|
26
|
+
"@cogitator-ai/types": "workspace:*",
|
|
27
|
+
"nanoid": "^5.0.4",
|
|
28
|
+
"zod": "^4.3.6"
|
|
29
|
+
},
|
|
30
|
+
"peerDependencies": {
|
|
31
|
+
"@cogitator-ai/core": "workspace:*"
|
|
32
|
+
},
|
|
33
|
+
"peerDependenciesMeta": {
|
|
34
|
+
"@cogitator-ai/core": {
|
|
35
|
+
"optional": true
|
|
36
|
+
}
|
|
37
|
+
},
|
|
38
|
+
"optionalDependencies": {
|
|
39
|
+
"papaparse": "^5.5.0"
|
|
40
|
+
},
|
|
41
|
+
"devDependencies": {
|
|
42
|
+
"@cogitator-ai/core": "workspace:*",
|
|
43
|
+
"@types/papaparse": "^5.3.15",
|
|
44
|
+
"typescript": "^5.7.2",
|
|
45
|
+
"vitest": "^4.0.18"
|
|
46
|
+
},
|
|
47
|
+
"repository": {
|
|
48
|
+
"type": "git",
|
|
49
|
+
"url": "https://github.com/cogitator-ai/Cogitator-AI.git",
|
|
50
|
+
"directory": "packages/evals"
|
|
51
|
+
},
|
|
52
|
+
"publishConfig": {
|
|
53
|
+
"access": "public",
|
|
54
|
+
"registry": "https://npm.pkg.github.com"
|
|
55
|
+
},
|
|
56
|
+
"license": "MIT"
|
|
57
|
+
}
|