@cogitator-ai/evals 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118) hide show
  1. package/README.md +456 -0
  2. package/dist/assertions/custom.d.ts +11 -0
  3. package/dist/assertions/custom.d.ts.map +1 -0
  4. package/dist/assertions/custom.js +13 -0
  5. package/dist/assertions/custom.js.map +1 -0
  6. package/dist/assertions/index.d.ts +27 -0
  7. package/dist/assertions/index.d.ts.map +1 -0
  8. package/dist/assertions/index.js +4 -0
  9. package/dist/assertions/index.js.map +1 -0
  10. package/dist/assertions/regression.d.ts +5 -0
  11. package/dist/assertions/regression.d.ts.map +1 -0
  12. package/dist/assertions/regression.js +58 -0
  13. package/dist/assertions/regression.js.map +1 -0
  14. package/dist/assertions/threshold.d.ts +3 -0
  15. package/dist/assertions/threshold.d.ts.map +1 -0
  16. package/dist/assertions/threshold.js +45 -0
  17. package/dist/assertions/threshold.js.map +1 -0
  18. package/dist/datasets/csv-loader.d.ts +3 -0
  19. package/dist/datasets/csv-loader.d.ts.map +1 -0
  20. package/dist/datasets/csv-loader.js +43 -0
  21. package/dist/datasets/csv-loader.js.map +1 -0
  22. package/dist/datasets/dataset.d.ts +15 -0
  23. package/dist/datasets/dataset.d.ts.map +1 -0
  24. package/dist/datasets/dataset.js +62 -0
  25. package/dist/datasets/dataset.js.map +1 -0
  26. package/dist/datasets/index.d.ts +4 -0
  27. package/dist/datasets/index.d.ts.map +1 -0
  28. package/dist/datasets/index.js +4 -0
  29. package/dist/datasets/index.js.map +1 -0
  30. package/dist/datasets/jsonl-loader.d.ts +3 -0
  31. package/dist/datasets/jsonl-loader.d.ts.map +1 -0
  32. package/dist/datasets/jsonl-loader.js +27 -0
  33. package/dist/datasets/jsonl-loader.js.map +1 -0
  34. package/dist/eval-builder.d.ts +30 -0
  35. package/dist/eval-builder.d.ts.map +1 -0
  36. package/dist/eval-builder.js +82 -0
  37. package/dist/eval-builder.js.map +1 -0
  38. package/dist/eval-comparison.d.ts +43 -0
  39. package/dist/eval-comparison.d.ts.map +1 -0
  40. package/dist/eval-comparison.js +125 -0
  41. package/dist/eval-comparison.js.map +1 -0
  42. package/dist/eval-suite.d.ts +63 -0
  43. package/dist/eval-suite.d.ts.map +1 -0
  44. package/dist/eval-suite.js +230 -0
  45. package/dist/eval-suite.js.map +1 -0
  46. package/dist/index.d.ts +31 -0
  47. package/dist/index.d.ts.map +1 -0
  48. package/dist/index.js +20 -0
  49. package/dist/index.js.map +1 -0
  50. package/dist/metrics/custom.d.ts +18 -0
  51. package/dist/metrics/custom.d.ts.map +1 -0
  52. package/dist/metrics/custom.js +28 -0
  53. package/dist/metrics/custom.js.map +1 -0
  54. package/dist/metrics/deterministic.d.ts +11 -0
  55. package/dist/metrics/deterministic.d.ts.map +1 -0
  56. package/dist/metrics/deterministic.js +74 -0
  57. package/dist/metrics/deterministic.js.map +1 -0
  58. package/dist/metrics/index.d.ts +8 -0
  59. package/dist/metrics/index.d.ts.map +1 -0
  60. package/dist/metrics/index.js +5 -0
  61. package/dist/metrics/index.js.map +1 -0
  62. package/dist/metrics/llm-judge.d.ts +27 -0
  63. package/dist/metrics/llm-judge.d.ts.map +1 -0
  64. package/dist/metrics/llm-judge.js +77 -0
  65. package/dist/metrics/llm-judge.js.map +1 -0
  66. package/dist/metrics/statistical.d.ts +5 -0
  67. package/dist/metrics/statistical.d.ts.map +1 -0
  68. package/dist/metrics/statistical.js +85 -0
  69. package/dist/metrics/statistical.js.map +1 -0
  70. package/dist/metrics/types.d.ts +31 -0
  71. package/dist/metrics/types.d.ts.map +1 -0
  72. package/dist/metrics/types.js +2 -0
  73. package/dist/metrics/types.js.map +1 -0
  74. package/dist/reporters/ci.d.ts +3 -0
  75. package/dist/reporters/ci.d.ts.map +1 -0
  76. package/dist/reporters/ci.js +21 -0
  77. package/dist/reporters/ci.js.map +1 -0
  78. package/dist/reporters/console.d.ts +3 -0
  79. package/dist/reporters/console.d.ts.map +1 -0
  80. package/dist/reporters/console.js +46 -0
  81. package/dist/reporters/console.js.map +1 -0
  82. package/dist/reporters/csv.d.ts +5 -0
  83. package/dist/reporters/csv.d.ts.map +1 -0
  84. package/dist/reporters/csv.js +31 -0
  85. package/dist/reporters/csv.js.map +1 -0
  86. package/dist/reporters/index.d.ts +50 -0
  87. package/dist/reporters/index.d.ts.map +1 -0
  88. package/dist/reporters/index.js +28 -0
  89. package/dist/reporters/index.js.map +1 -0
  90. package/dist/reporters/json.d.ts +5 -0
  91. package/dist/reporters/json.d.ts.map +1 -0
  92. package/dist/reporters/json.js +5 -0
  93. package/dist/reporters/json.js.map +1 -0
  94. package/dist/schema.d.ts +29 -0
  95. package/dist/schema.d.ts.map +1 -0
  96. package/dist/schema.js +23 -0
  97. package/dist/schema.js.map +1 -0
  98. package/dist/stats/index.d.ts +6 -0
  99. package/dist/stats/index.d.ts.map +1 -0
  100. package/dist/stats/index.js +4 -0
  101. package/dist/stats/index.js.map +1 -0
  102. package/dist/stats/mcnemar.d.ts +7 -0
  103. package/dist/stats/mcnemar.d.ts.map +1 -0
  104. package/dist/stats/mcnemar.js +34 -0
  105. package/dist/stats/mcnemar.js.map +1 -0
  106. package/dist/stats/percentiles.d.ts +15 -0
  107. package/dist/stats/percentiles.d.ts.map +1 -0
  108. package/dist/stats/percentiles.js +54 -0
  109. package/dist/stats/percentiles.js.map +1 -0
  110. package/dist/stats/t-test.d.ts +9 -0
  111. package/dist/stats/t-test.d.ts.map +1 -0
  112. package/dist/stats/t-test.js +129 -0
  113. package/dist/stats/t-test.js.map +1 -0
  114. package/dist/tools.d.ts +16 -0
  115. package/dist/tools.d.ts.map +1 -0
  116. package/dist/tools.js +58 -0
  117. package/dist/tools.js.map +1 -0
  118. package/package.json +57 -0
@@ -0,0 +1,129 @@
1
+ import { mean, stdDev } from './percentiles';
2
/**
 * Natural logarithm of the gamma function, computed with a Lanczos series
 * (g = 7, nine coefficients).
 *
 * Arguments below 0.5 are handled through the reflection identity
 * Γ(z)·Γ(1 − z) = π / sin(πz); the series is evaluated directly otherwise.
 *
 * @param {number} z - Point at which ln Γ(z) is evaluated.
 * @returns {number} The value of ln Γ(z).
 */
function lnGamma(z) {
  const LANCZOS_G = 7;
  const LANCZOS_COEFFS = [
    0.99999999999980993, 676.5203681218851, -1259.1392167224028, 771.32342877765313,
    -176.61502916214059, 12.507343278686905, -0.13857109526572012, 9.9843695780195716e-6,
    1.5056327351493116e-7,
  ];

  if (z < 0.5) {
    const reflection = Math.log(Math.PI / Math.sin(Math.PI * z));
    return reflection - lnGamma(1 - z);
  }

  // The series is written in terms of z - 1.
  const shifted = z - 1;
  let series = LANCZOS_COEFFS[0];
  LANCZOS_COEFFS.slice(1).forEach((coeff, idx) => {
    series += coeff / (shifted + (idx + 1));
  });

  const base = shifted + LANCZOS_G + 0.5;
  return 0.5 * Math.log(2 * Math.PI) + (shifted + 0.5) * Math.log(base) - base + Math.log(series);
}
20
/**
 * Evaluates the continued fraction used by regularizedIncompleteBeta, which
 * multiplies the result by its prefactor and divides by `a`.
 *
 * The fraction is built up iteratively from a running numerator ratio and a
 * running denominator ratio (the modified Lentz scheme). Any intermediate
 * value whose magnitude drops below 1e-30 is replaced by 1e-30 so that the
 * following division cannot blow up.
 *
 * Iteration ends when the latest multiplicative update is within 1e-14 of 1,
 * or after 200 steps; running out of steps is not reported to the caller.
 *
 * @param {number} x - Evaluation point.
 * @param {number} a - First shape parameter.
 * @param {number} b - Second shape parameter.
 * @returns {number} Value of the continued fraction.
 */
function betaCf(x, a, b) {
  const TINY = 1e-30;
  const TOLERANCE = 1e-14;
  const MAX_STEPS = 200;
  const awayFromZero = (value) => (Math.abs(value) < TINY ? TINY : value);

  const sumAB = a + b;
  const aPlusOne = a + 1;
  const aMinusOne = a - 1;

  let numerRatio = 1.0;
  let denomRatio = 1.0 / awayFromZero(1.0 - (sumAB * x) / aPlusOne);
  let fraction = denomRatio;

  for (let step = 1; step <= MAX_STEPS; step += 1) {
    const twoStep = 2 * step;

    // Even-indexed term of the recurrence.
    const evenTerm = (step * (b - step) * x) / ((aMinusOne + twoStep) * (a + twoStep));
    denomRatio = 1.0 / awayFromZero(1.0 + evenTerm * denomRatio);
    numerRatio = awayFromZero(1.0 + evenTerm / numerRatio);
    fraction *= denomRatio * numerRatio;

    // Odd-indexed term of the recurrence.
    const oddTerm = (-(a + step) * (sumAB + step) * x) / ((a + twoStep) * (aPlusOne + twoStep));
    denomRatio = 1.0 / awayFromZero(1.0 + oddTerm * denomRatio);
    numerRatio = awayFromZero(1.0 + oddTerm / numerRatio);
    const update = denomRatio * numerRatio;
    fraction *= update;

    if (Math.abs(update - 1.0) < TOLERANCE) {
      break;
    }
  }

  return fraction;
}
56
/**
 * Regularized incomplete beta function I_x(a, b).
 *
 * Returns 0 for x <= 0 and 1 for x >= 1. When x lies above
 * (a + 1) / (a + b + 2) the value is obtained from the mirrored problem,
 * 1 - I_{1-x}(b, a); otherwise it is the exponentiated log-prefactor times
 * the continued fraction from betaCf, divided by a.
 *
 * @param {number} x - Upper integration limit.
 * @param {number} a - First shape parameter.
 * @param {number} b - Second shape parameter.
 * @returns {number} I_x(a, b).
 */
function regularizedIncompleteBeta(x, a, b) {
  if (x <= 0) {
    return 0;
  }
  if (x >= 1) {
    return 1;
  }

  const switchPoint = (a + 1) / (a + b + 2);
  if (x > switchPoint) {
    const mirrored = regularizedIncompleteBeta(1.0 - x, b, a);
    return 1.0 - mirrored;
  }

  // Work in log space so the prefactor is exponentiated only once.
  const logBeta = lnGamma(a) + lnGamma(b) - lnGamma(a + b);
  const logPrefactor = a * Math.log(x) + b * Math.log(1 - x) - logBeta;
  const prefactor = Math.exp(logPrefactor);
  return (prefactor * betaCf(x, a, b)) / a;
}
68
/**
 * Cumulative distribution function of Student's t distribution.
 *
 * Uses I_x(df / 2, 1 / 2) with x = df / (df + t²), then mirrors the result
 * around 0.5 according to the sign of t (so t = 0 yields exactly 0.5).
 *
 * @param {number} t - Point at which the CDF is evaluated.
 * @param {number} df - Degrees of freedom.
 * @returns {number} P(T <= t).
 */
function tDistCdf(t, df) {
  const betaArg = df / (df + t * t);
  const twoSidedTail = regularizedIncompleteBeta(betaArg, df / 2, 0.5);
  const signedCentralMass = Math.sign(t) * (1 - twoSidedTail);
  return 0.5 * (1 + signedCentralMass);
}
73
/**
 * Quantile (inverse CDF) of Student's t distribution, found by bisection.
 *
 * The search runs for 100 halvings of the fixed bracket [-100, 100], so a
 * quantile whose true value lies outside that bracket comes back pinned to
 * the nearest bracket end.
 *
 * @param {number} p - Target cumulative probability.
 * @param {number} df - Degrees of freedom.
 * @returns {number} t such that tDistCdf(t, df) is approximately p;
 *   -Infinity when p <= 0 and Infinity when p >= 1.
 */
function tDistQuantile(p, df) {
  if (p <= 0) {
    return -Infinity;
  }
  if (p >= 1) {
    return Infinity;
  }

  let lower = -100;
  let upper = 100;
  let halvingsLeft = 100;
  while (halvingsLeft > 0) {
    const midpoint = (lower + upper) / 2;
    const midpointIsBelowTarget = tDistCdf(midpoint, df) < p;
    [lower, upper] = midpointIsBelowTarget ? [midpoint, upper] : [lower, midpoint];
    halvingsLeft -= 1;
  }
  return (lower + upper) / 2;
}
91
/**
 * Two-sided paired t-test on the element-wise differences `samplesA[i] - samplesB[i]`.
 *
 * The mean and standard deviation of the differences come from the `mean`
 * and `stdDev` helpers imported from './percentiles' (presumably the sample
 * standard deviation — confirm in that module).
 *
 * When every difference is identical the standard deviation is 0 and the
 * usual statistic is undefined. In that case the test reports:
 *   - no difference at all: t = 0, p = 1, not significant;
 *   - a constant non-zero difference: t = ±Infinity with the sign of the mean
 *     difference, p = 0, significant.
 * The sign matters to callers that read the direction of the effect from
 * `tStatistic`; it matches the sign `meanD / se` has in the regular branch.
 *
 * @param {number[]} samplesA - Measurements for condition A.
 * @param {number[]} samplesB - Measurements for condition B, paired by index with A.
 * @returns {{tStatistic: number, degreesOfFreedom: number, pValue: number,
 *   significant: boolean, confidenceInterval: [number, number]}}
 *   `significant` is `pValue < 0.05`; `confidenceInterval` is the 95% interval
 *   for the mean difference.
 * @throws {Error} If the arrays differ in length or hold fewer than 2 pairs.
 */
export function pairedTTest(samplesA, samplesB) {
  if (samplesA.length !== samplesB.length) {
    throw new Error('Sample arrays must have equal length');
  }
  if (samplesA.length < 2) {
    throw new Error('Need at least 2 paired samples');
  }

  const n = samplesA.length;
  const diffs = Array.from({ length: n }, (_, i) => samplesA[i] - samplesB[i]);
  const meanD = mean(diffs);
  const sdD = stdDev(diffs);
  const df = n - 1;

  if (sdD === 0) {
    const noDifference = meanD === 0;
    // A constant negative shift must report -Infinity, not +Infinity.
    const unboundedT = meanD < 0 ? -Infinity : Infinity;
    return {
      tStatistic: noDifference ? 0 : unboundedT,
      degreesOfFreedom: df,
      pValue: noDifference ? 1 : 0,
      significant: !noDifference,
      confidenceInterval: [meanD, meanD],
    };
  }

  const se = sdD / Math.sqrt(n);
  const tStat = meanD / se;

  // Two-sided p-value: twice the upper-tail mass beyond |t|.
  const cdfVal = tDistCdf(Math.abs(tStat), df);
  const pValue = 2 * (1 - cdfVal);

  // 95% confidence interval uses the 97.5th percentile of the t distribution.
  const tCritical = tDistQuantile(0.975, df);
  const ci = [meanD - tCritical * se, meanD + tCritical * se];

  return {
    tStatistic: tStat,
    degreesOfFreedom: df,
    pValue,
    significant: pValue < 0.05,
    confidenceInterval: ci,
  };
}
129
+ //# sourceMappingURL=t-test.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"t-test.js","sourceRoot":"","sources":["../../src/stats/t-test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,IAAI,EAAE,MAAM,EAAE,MAAM,eAAe,CAAC;AAU7C,SAAS,OAAO,CAAC,CAAS;IACxB,MAAM,CAAC,GAAG,CAAC,CAAC;IACZ,MAAM,YAAY,GAAG;QACnB,mBAAmB,EAAE,iBAAiB,EAAE,CAAC,kBAAkB,EAAE,kBAAkB;QAC/E,CAAC,kBAAkB,EAAE,kBAAkB,EAAE,CAAC,mBAAmB,EAAE,qBAAqB;QACpF,qBAAqB;KACtB,CAAC;IAEF,IAAI,CAAC,GAAG,GAAG,EAAE,CAAC;QACZ,OAAO,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC,GAAG,OAAO,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IACpE,CAAC;IAED,CAAC,IAAI,CAAC,CAAC;IACP,IAAI,CAAC,GAAG,YAAY,CAAC,CAAC,CAAC,CAAC;IACxB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QAC/B,CAAC,IAAI,YAAY,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IACjC,CAAC;IAED,MAAM,CAAC,GAAG,CAAC,GAAG,CAAC,GAAG,GAAG,CAAC;IACtB,OAAO,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC,EAAE,CAAC,GAAG,CAAC,CAAC,GAAG,GAAG,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;AACjF,CAAC;AAED,SAAS,MAAM,CAAC,CAAS,EAAE,CAAS,EAAE,CAAS;IAC7C,MAAM,GAAG,GAAG,CAAC,GAAG,CAAC,CAAC;IAClB,MAAM,GAAG,GAAG,CAAC,GAAG,CAAC,CAAC;IAClB,MAAM,GAAG,GAAG,CAAC,GAAG,CAAC,CAAC;IAElB,IAAI,CAAC,GAAG,GAAG,CAAC;IACZ,IAAI,CAAC,GAAG,GAAG,GAAG,CAAC,GAAG,GAAG,CAAC,CAAC,GAAG,GAAG,CAAC;IAC9B,IAAI,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,KAAK;QAAE,CAAC,GAAG,KAAK,CAAC;IACnC,CAAC,GAAG,GAAG,GAAG,CAAC,CAAC;IACZ,IAAI,CAAC,GAAG,CAAC,CAAC;IAEV,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC;QAC9B,MAAM,EAAE,GAAG,CAAC,GAAG,CAAC,CAAC;QAEjB,IAAI,EAAE,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC;QACrD,CAAC,GAAG,GAAG,GAAG,EAAE,GAAG,CAAC,CAAC;QACjB,IAAI,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,KAAK;YAAE,CAAC,GAAG,KAAK,CAAC;QACnC,CAAC,GAAG,GAAG,GAAG,EAAE,GAAG,CAAC,CAAC;QACjB,IAAI,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,KAAK;YAAE,CAAC,GAAG,KAAK,CAAC;QACnC,CAAC,GAAG,GAAG,GAAG
,CAAC,CAAC;QACZ,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAEX,EAAE,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,GAAG,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,GAAG,GAAG,EAAE,CAAC,CAAC,CAAC;QAC1D,CAAC,GAAG,GAAG,GAAG,EAAE,GAAG,CAAC,CAAC;QACjB,IAAI,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,KAAK;YAAE,CAAC,GAAG,KAAK,CAAC;QACnC,CAAC,GAAG,GAAG,GAAG,EAAE,GAAG,CAAC,CAAC;QACjB,IAAI,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,KAAK;YAAE,CAAC,GAAG,KAAK,CAAC;QACnC,CAAC,GAAG,GAAG,GAAG,CAAC,CAAC;QACZ,MAAM,GAAG,GAAG,CAAC,GAAG,CAAC,CAAC;QAClB,CAAC,IAAI,GAAG,CAAC;QAET,IAAI,IAAI,CAAC,GAAG,CAAC,GAAG,GAAG,GAAG,CAAC,GAAG,KAAK;YAAE,MAAM;IACzC,CAAC;IAED,OAAO,CAAC,CAAC;AACX,CAAC;AAED,SAAS,yBAAyB,CAAC,CAAS,EAAE,CAAS,EAAE,CAAS;IAChE,IAAI,CAAC,IAAI,CAAC;QAAE,OAAO,CAAC,CAAC;IACrB,IAAI,CAAC,IAAI,CAAC;QAAE,OAAO,CAAC,CAAC;IAErB,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC;QAC9B,OAAO,GAAG,GAAG,yBAAyB,CAAC,GAAG,GAAG,CAAC,EAAE,CAAC,EAAE,CAAC,CAAC,CAAC;IACxD,CAAC;IAED,MAAM,MAAM,GAAG,OAAO,CAAC,CAAC,CAAC,GAAG,OAAO,CAAC,CAAC,CAAC,GAAG,OAAO,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IACxD,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,MAAM,CAAC,CAAC;IAEvE,OAAO,CAAC,KAAK,GAAG,MAAM,CAAC,CAAC,EAAE,CAAC,EAAE,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC;AACvC,CAAC;AAED,SAAS,QAAQ,CAAC,CAAS,EAAE,EAAU;IACrC,MAAM,CAAC,GAAG,EAAE,GAAG,CAAC,EAAE,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC;IAC5B,MAAM,KAAK,GAAG,yBAAyB,CAAC,CAAC,EAAE,EAAE,GAAG,CAAC,EAAE,GAAG,CAAC,CAAC;IACxD,OAAO,GAAG,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,KAAK,CAAC,CAAC,CAAC;AAChD,CAAC;AAED,SAAS,aAAa,CAAC,CAAS,EAAE,EAAU;IAC1C,IAAI,CAAC,IAAI,CAAC;QAAE,OAAO,CAAC,QAAQ,CAAC;IAC7B,IAAI,CAAC,IAAI,CAAC;QAAE,OAAO,QAAQ,CAAC;IAE5B,IAAI,EAAE,GAAG,CAAC,GAAG,CAAC;IACd,IAAI,EAAE,GAAG,GAAG,CAAC;IACb,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC;QAC7B,MAAM,GAAG,GAAG,CAAC,EAAE,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC;QAC1B,IAAI
,QAAQ,CAAC,GAAG,EAAE,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC;YAC1B,EAAE,GAAG,GAAG,CAAC;QACX,CAAC;aAAM,CAAC;YACN,EAAE,GAAG,GAAG,CAAC;QACX,CAAC;IACH,CAAC;IACD,OAAO,CAAC,EAAE,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC;AACvB,CAAC;AAED,MAAM,UAAU,WAAW,CAAC,QAAkB,EAAE,QAAkB;IAChE,IAAI,QAAQ,CAAC,MAAM,KAAK,QAAQ,CAAC,MAAM,EAAE,CAAC;QACxC,MAAM,IAAI,KAAK,CAAC,sCAAsC,CAAC,CAAC;IAC1D,CAAC;IACD,IAAI,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACxB,MAAM,IAAI,KAAK,CAAC,gCAAgC,CAAC,CAAC;IACpD,CAAC;IAED,MAAM,CAAC,GAAG,QAAQ,CAAC,MAAM,CAAC;IAC1B,MAAM,KAAK,GAAG,IAAI,KAAK,CAAS,CAAC,CAAC,CAAC;IACnC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QAC3B,KAAK,CAAC,CAAC,CAAC,GAAG,QAAQ,CAAC,CAAC,CAAC,GAAG,QAAQ,CAAC,CAAC,CAAC,CAAC;IACvC,CAAC;IAED,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC;IAC1B,MAAM,GAAG,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC;IAC1B,MAAM,EAAE,GAAG,CAAC,GAAG,CAAC,CAAC;IAEjB,IAAI,GAAG,KAAK,CAAC,EAAE,CAAC;QACd,OAAO;YACL,UAAU,EAAE,KAAK,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,QAAQ;YACtC,gBAAgB,EAAE,EAAE;YACpB,MAAM,EAAE,KAAK,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;YAC3B,WAAW,EAAE,KAAK,KAAK,CAAC;YACxB,kBAAkB,EAAE,CAAC,KAAK,EAAE,KAAK,CAAC;SACnC,CAAC;IACJ,CAAC;IAED,MAAM,EAAE,GAAG,GAAG,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAC9B,MAAM,KAAK,GAAG,KAAK,GAAG,EAAE,CAAC;IAEzB,MAAM,MAAM,GAAG,QAAQ,CAAC,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,EAAE,CAAC,CAAC;IAC7C,MAAM,MAAM,GAAG,CAAC,GAAG,CAAC,CAAC,GAAG,MAAM,CAAC,CAAC;IAEhC,MAAM,SAAS,GAAG,aAAa,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;IAC3C,MAAM,EAAE,GAAqB,CAAC,KAAK,GAAG,SAAS,GAAG,EAAE,EAAE,KAAK,GAAG,SAAS,GAAG,EAAE,CAAC,CAAC;IAE9E,OAAO;QACL,UAAU,EAAE,KAAK;QACjB,gBAAgB,EAAE,EAAE;QACpB,MAAM;QACN,WAAW,EAAE,MAAM,GAAG,IAAI;QAC1B,kBAAkB,EAAE,EAAE;KACvB,CAAC;AACJ,CAAC"}
@@ -0,0 +1,16 @@
1
+ import { z } from 'zod';
2
+ import type { EvalSuite } from './eval-suite';
3
+ export interface EvalTool<TParams = unknown> { // Shape of a tool object; TParams is the argument type passed to execute (defaults to unknown).
4
+ name: string; // Name of the tool.
5
+ description: string; // Free-text description of the tool.
6
+ parameters: z.ZodType<TParams>; // Zod schema typed to the same TParams that execute accepts.
7
+ execute: (params: TParams) => Promise<unknown>; // Async entry point; the resolved value is typed only as unknown.
8
+ }
9
+ declare const RunEvalParamsSchema: z.ZodObject<{ // Declared without `export`: object schema with a single optional field.
10
+ maxCases: z.ZodOptional<z.ZodNumber>; // Optional number.
11
+ }, z.core.$strip>;
12
+ type RunEvalParams = z.infer<typeof RunEvalParamsSchema>; // Parameter type inferred from RunEvalParamsSchema.
13
+ export declare function createRunEvalTool(suite: EvalSuite): EvalTool<RunEvalParams>; // Takes an EvalSuite and returns one tool whose execute accepts RunEvalParams.
14
+ export declare function evalTools(suite: EvalSuite): [EvalTool<RunEvalParams>]; // Takes an EvalSuite and returns a one-element tuple of that tool type.
15
+ export {};
16
+ //# sourceMappingURL=tools.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"tools.d.ts","sourceRoot":"","sources":["../src/tools.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AACxB,OAAO,KAAK,EAAE,SAAS,EAAmB,MAAM,cAAc,CAAC;AAE/D,MAAM,WAAW,QAAQ,CAAC,OAAO,GAAG,OAAO;IACzC,IAAI,EAAE,MAAM,CAAC;IACb,WAAW,EAAE,MAAM,CAAC;IACpB,UAAU,EAAE,CAAC,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;IAC/B,OAAO,EAAE,CAAC,MAAM,EAAE,OAAO,KAAK,OAAO,CAAC,OAAO,CAAC,CAAC;CAChD;AAED,QAAA,MAAM,mBAAmB;;iBAEvB,CAAC;AAEH,KAAK,aAAa,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,mBAAmB,CAAC,CAAC;AAwCzD,wBAAgB,iBAAiB,CAAC,KAAK,EAAE,SAAS,GAAG,QAAQ,CAAC,aAAa,CAAC,CAc3E;AAED,wBAAgB,SAAS,CAAC,KAAK,EAAE,SAAS,GAAG,CAAC,QAAQ,CAAC,aAAa,CAAC,CAAC,CAErE"}
package/dist/tools.js ADDED
@@ -0,0 +1,58 @@
1
+ import { z } from 'zod';
2
// Arguments accepted by the run_eval tool: `maxCases`, when given, must be a positive integer.
const maxCasesSchema = z.number().int().positive().optional();
const RunEvalParamsSchema = z.object({ maxCases: maxCasesSchema });
5
/**
 * Condenses a finished evaluation run into a flat summary object.
 *
 * Without `maxCases` the summary is read straight from the run: `stats` for
 * duration and cost, `aggregated[*].mean` for metrics, and `assertions` for
 * the pass flag. With a truthy `maxCases` only the first `maxCases` case
 * results are considered: duration and cost are re-summed from those cases,
 * each metric becomes the plain average of its scores in those cases, and
 * `assertionsPassed` is reported as `true` without consulting
 * `result.assertions`. If the capped slice is empty, metrics fall back to
 * `result.aggregated`.
 *
 * @param {object} result - Run result exposing `results`, `stats`, `aggregated` and `assertions`.
 * @param {number} [maxCases] - Optional cap on how many case results are summarized.
 * @returns {{success: true, total: number, duration: number, cost: number,
 *   metrics: Object<string, number>, assertionsPassed: boolean}}
 */
function buildSummary(result, maxCases) {
  const isCapped = Boolean(maxCases);
  const cases = isCapped ? result.results.slice(0, maxCases) : result.results;
  const total = cases.length;

  let duration;
  let cost;
  if (isCapped) {
    duration = 0;
    cost = 0;
    for (const entry of cases) {
      duration += entry.duration;
      // Cases without usage data contribute nothing to the cost.
      cost += entry.usage?.cost ?? 0;
    }
  } else {
    duration = result.stats.duration;
    cost = result.stats.cost;
  }

  const metrics = {};
  if (isCapped && total > 0) {
    // Running sum and count per metric name, in order of first appearance.
    const tallies = new Map();
    for (const entry of cases) {
      for (const { name, score } of entry.scores) {
        const tally = tallies.get(name) ?? { sum: 0, count: 0 };
        tally.sum += score;
        tally.count += 1;
        tallies.set(name, tally);
      }
    }
    tallies.forEach(({ sum, count }, name) => {
      metrics[name] = sum / count;
    });
  } else {
    Object.keys(result.aggregated).forEach((name) => {
      metrics[name] = result.aggregated[name].mean;
    });
  }

  const assertionsPassed = isCapped || !result.assertions.some((a) => !a.passed);

  return { success: true, total, duration, cost, metrics, assertionsPassed };
}
39
/**
 * Wraps an evaluation suite as a `run_eval` tool object.
 *
 * The tool's `execute` runs the suite via `suite.run()` and passes the
 * result, together with the caller's `maxCases`, to buildSummary. Any error
 * raised along the way is caught and returned as
 * `{ success: false, error: <message> }` rather than rethrown.
 *
 * @param {object} suite - Object exposing an async `run()` method.
 * @returns {{name: string, description: string, parameters: object,
 *   execute: function({maxCases?: number}): Promise<object>}} Tool descriptor.
 */
export function createRunEvalTool(suite) {
  async function execute({ maxCases }) {
    try {
      const outcome = await suite.run();
      return buildSummary(outcome, maxCases);
    } catch (err) {
      // Non-Error throwables are stringified so `error` is always a string.
      const error = err instanceof Error ? err.message : String(err);
      return { success: false, error };
    }
  }

  return {
    name: 'run_eval',
    description: 'Run an evaluation suite against the configured dataset and target',
    parameters: RunEvalParamsSchema,
    execute,
  };
}
55
/**
 * Returns the tools available for a suite as an array; currently that is
 * the single tool produced by createRunEvalTool.
 *
 * @param {object} suite - Suite forwarded unchanged to createRunEvalTool.
 * @returns {Array<object>} One-element array holding the run_eval tool.
 */
export function evalTools(suite) {
  const runEvalTool = createRunEvalTool(suite);
  return [runEvalTool];
}
58
+ //# sourceMappingURL=tools.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"tools.js","sourceRoot":"","sources":["../src/tools.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAUxB,MAAM,mBAAmB,GAAG,CAAC,CAAC,MAAM,CAAC;IACnC,QAAQ,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,QAAQ,EAAE,CAAC,QAAQ,EAAE;CACjD,CAAC,CAAC;AAIH,SAAS,YAAY,CAAC,MAAuB,EAAE,QAAiB;IAC9D,MAAM,MAAM,GAAG,QAAQ,CAAC,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,QAAQ,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC;IAE7E,MAAM,KAAK,GAAG,MAAM,CAAC,MAAM,CAAC;IAC5B,MAAM,QAAQ,GAAG,QAAQ;QACvB,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC;QAChD,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,QAAQ,CAAC;IAC1B,MAAM,IAAI,GAAG,QAAQ;QACnB,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,KAAK,EAAE,IAAI,IAAI,CAAC,CAAC,EAAE,CAAC,CAAC;QAC1D,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC;IAEtB,MAAM,OAAO,GAA2B,EAAE,CAAC;IAC3C,IAAI,QAAQ,IAAI,KAAK,GAAG,CAAC,EAAE,CAAC;QAC1B,MAAM,cAAc,GAAG,IAAI,GAAG,EAAoB,CAAC;QACnD,KAAK,MAAM,CAAC,IAAI,MAAM,EAAE,CAAC;YACvB,KAAK,MAAM,CAAC,IAAI,CAAC,CAAC,MAAM,EAAE,CAAC;gBACzB,IAAI,GAAG,GAAG,cAAc,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;gBACrC,IAAI,CAAC,GAAG,EAAE,CAAC;oBACT,GAAG,GAAG,EAAE,CAAC;oBACT,cAAc,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;gBAClC,CAAC;gBACD,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC;YACpB,CAAC;QACH,CAAC;QACD,KAAK,MAAM,CAAC,IAAI,EAAE,MAAM,CAAC,IAAI,cAAc,EAAE,CAAC;YAC5C,OAAO,CAAC,IAAI,CAAC,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,GAAG,MAAM,CAAC,MAAM,CAAC;QACpE,CAAC;IACH,CAAC;SAAM,CAAC;QACN,KAAK,MAAM,CAAC,IAAI,EAAE,GAAG,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,UAAU,CAAC,EAAE,CAAC;YAC5D,OAAO,CAAC,IAAI,CAAC,GAAG,GAAG,CAAC,IAAI,CAAC;QAC3B,CAAC;IACH,CAAC;IAED,MAAM,gBAAgB,GAAG,QAAQ,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,UAAU,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;IAEpF,OAAO,EAAE,OAAO,EAAE,IAAa,EAAE,KAAK,EAAE,QAAQ,EAAE,IAAI,EAAE,OAAO,EAAE,gBAAgB,EAAE,CAAC;AA
CtF,CAAC;AAED,MAAM,UAAU,iBAAiB,CAAC,KAAgB;IAChD,OAAO;QACL,IAAI,EAAE,UAAU;QAChB,WAAW,EAAE,mEAAmE;QAChF,UAAU,EAAE,mBAAmB;QAC/B,OAAO,EAAE,KAAK,EAAE,EAAE,QAAQ,EAAE,EAAE,EAAE;YAC9B,IAAI,CAAC;gBACH,MAAM,MAAM,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE,CAAC;gBACjC,OAAO,YAAY,CAAC,MAAM,EAAE,QAAQ,CAAC,CAAC;YACxC,CAAC;YAAC,OAAO,GAAG,EAAE,CAAC;gBACb,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,KAAK,EAAE,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC;YACrF,CAAC;QACH,CAAC;KACF,CAAC;AACJ,CAAC;AAED,MAAM,UAAU,SAAS,CAAC,KAAgB;IACxC,OAAO,CAAC,iBAAiB,CAAC,KAAK,CAAC,CAAC,CAAC;AACpC,CAAC"}
package/package.json ADDED
@@ -0,0 +1,57 @@
1
+ {
2
+ "name": "@cogitator-ai/evals",
3
+ "version": "0.1.0",
4
+ "description": "Evaluation framework for Cogitator AI agents",
5
+ "type": "module",
6
+ "main": "./dist/index.js",
7
+ "types": "./dist/index.d.ts",
8
+ "exports": {
9
+ ".": {
10
+ "types": "./dist/index.d.ts",
11
+ "import": "./dist/index.js"
12
+ }
13
+ },
14
+ "files": [
15
+ "dist"
16
+ ],
17
+ "scripts": {
18
+ "build": "tsc",
19
+ "dev": "tsc --watch",
20
+ "clean": "rm -rf dist",
21
+ "typecheck": "tsc --noEmit",
22
+ "test": "vitest run",
23
+ "test:watch": "vitest"
24
+ },
25
+ "dependencies": {
26
+ "@cogitator-ai/types": "workspace:*",
27
+ "nanoid": "^5.0.4",
28
+ "zod": "^4.3.6"
29
+ },
30
+ "peerDependencies": {
31
+ "@cogitator-ai/core": "workspace:*"
32
+ },
33
+ "peerDependenciesMeta": {
34
+ "@cogitator-ai/core": {
35
+ "optional": true
36
+ }
37
+ },
38
+ "optionalDependencies": {
39
+ "papaparse": "^5.5.0"
40
+ },
41
+ "devDependencies": {
42
+ "@cogitator-ai/core": "workspace:*",
43
+ "@types/papaparse": "^5.3.15",
44
+ "typescript": "^5.7.2",
45
+ "vitest": "^4.0.18"
46
+ },
47
+ "repository": {
48
+ "type": "git",
49
+ "url": "https://github.com/cogitator-ai/Cogitator-AI.git",
50
+ "directory": "packages/evals"
51
+ },
52
+ "publishConfig": {
53
+ "access": "public",
54
+ "registry": "https://npm.pkg.github.com"
55
+ },
56
+ "license": "MIT"
57
+ }