model-test-bench 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (288) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +389 -0
  3. package/dist/bin/mtb.d.ts +3 -0
  4. package/dist/bin/mtb.d.ts.map +1 -0
  5. package/dist/bin/mtb.js +148 -0
  6. package/dist/bin/mtb.js.map +1 -0
  7. package/dist/server/index.d.ts +13 -0
  8. package/dist/server/index.d.ts.map +1 -0
  9. package/dist/server/index.js +72 -0
  10. package/dist/server/index.js.map +1 -0
  11. package/dist/server/interfaces/evaluator.d.ts +15 -0
  12. package/dist/server/interfaces/evaluator.d.ts.map +1 -0
  13. package/dist/server/interfaces/evaluator.js +2 -0
  14. package/dist/server/interfaces/evaluator.js.map +1 -0
  15. package/dist/server/interfaces/logger.d.ts +9 -0
  16. package/dist/server/interfaces/logger.d.ts.map +1 -0
  17. package/dist/server/interfaces/logger.js +2 -0
  18. package/dist/server/interfaces/logger.js.map +1 -0
  19. package/dist/server/interfaces/runner.d.ts +9 -0
  20. package/dist/server/interfaces/runner.d.ts.map +1 -0
  21. package/dist/server/interfaces/runner.js +2 -0
  22. package/dist/server/interfaces/runner.js.map +1 -0
  23. package/dist/server/interfaces/storage.d.ts +36 -0
  24. package/dist/server/interfaces/storage.d.ts.map +1 -0
  25. package/dist/server/interfaces/storage.js +2 -0
  26. package/dist/server/interfaces/storage.js.map +1 -0
  27. package/dist/server/routes/eval-queue.d.ts +23 -0
  28. package/dist/server/routes/eval-queue.d.ts.map +1 -0
  29. package/dist/server/routes/eval-queue.js +45 -0
  30. package/dist/server/routes/eval-queue.js.map +1 -0
  31. package/dist/server/routes/evaluations.d.ts +8 -0
  32. package/dist/server/routes/evaluations.d.ts.map +1 -0
  33. package/dist/server/routes/evaluations.js +221 -0
  34. package/dist/server/routes/evaluations.js.map +1 -0
  35. package/dist/server/routes/providers.d.ts +5 -0
  36. package/dist/server/routes/providers.d.ts.map +1 -0
  37. package/dist/server/routes/providers.js +179 -0
  38. package/dist/server/routes/providers.js.map +1 -0
  39. package/dist/server/routes/run-queue.d.ts +17 -0
  40. package/dist/server/routes/run-queue.d.ts.map +1 -0
  41. package/dist/server/routes/run-queue.js +34 -0
  42. package/dist/server/routes/run-queue.js.map +1 -0
  43. package/dist/server/routes/run-sse.d.ts +18 -0
  44. package/dist/server/routes/run-sse.d.ts.map +1 -0
  45. package/dist/server/routes/run-sse.js +57 -0
  46. package/dist/server/routes/run-sse.js.map +1 -0
  47. package/dist/server/routes/runs.d.ts +9 -0
  48. package/dist/server/routes/runs.d.ts.map +1 -0
  49. package/dist/server/routes/runs.js +380 -0
  50. package/dist/server/routes/runs.js.map +1 -0
  51. package/dist/server/routes/scenarios.d.ts +5 -0
  52. package/dist/server/routes/scenarios.d.ts.map +1 -0
  53. package/dist/server/routes/scenarios.js +181 -0
  54. package/dist/server/routes/scenarios.js.map +1 -0
  55. package/dist/server/services/eval-helpers.d.ts +22 -0
  56. package/dist/server/services/eval-helpers.d.ts.map +1 -0
  57. package/dist/server/services/eval-helpers.js +75 -0
  58. package/dist/server/services/eval-helpers.js.map +1 -0
  59. package/dist/server/services/eval-parsers-debate-impl.d.ts +11 -0
  60. package/dist/server/services/eval-parsers-debate-impl.d.ts.map +1 -0
  61. package/dist/server/services/eval-parsers-debate-impl.js +133 -0
  62. package/dist/server/services/eval-parsers-debate-impl.js.map +1 -0
  63. package/dist/server/services/eval-parsers.d.ts +24 -0
  64. package/dist/server/services/eval-parsers.d.ts.map +1 -0
  65. package/dist/server/services/eval-parsers.js +153 -0
  66. package/dist/server/services/eval-parsers.js.map +1 -0
  67. package/dist/server/services/eval-prompts.d.ts +9 -0
  68. package/dist/server/services/eval-prompts.d.ts.map +1 -0
  69. package/dist/server/services/eval-prompts.js +164 -0
  70. package/dist/server/services/eval-prompts.js.map +1 -0
  71. package/dist/server/services/evaluator.d.ts +10 -0
  72. package/dist/server/services/evaluator.d.ts.map +1 -0
  73. package/dist/server/services/evaluator.js +143 -0
  74. package/dist/server/services/evaluator.js.map +1 -0
  75. package/dist/server/services/fs-adapter.d.ts +20 -0
  76. package/dist/server/services/fs-adapter.d.ts.map +1 -0
  77. package/dist/server/services/fs-adapter.js +13 -0
  78. package/dist/server/services/fs-adapter.js.map +1 -0
  79. package/dist/server/services/instruction-parser.d.ts +26 -0
  80. package/dist/server/services/instruction-parser.d.ts.map +1 -0
  81. package/dist/server/services/instruction-parser.js +121 -0
  82. package/dist/server/services/instruction-parser.js.map +1 -0
  83. package/dist/server/services/log-rotator.d.ts +20 -0
  84. package/dist/server/services/log-rotator.d.ts.map +1 -0
  85. package/dist/server/services/log-rotator.js +60 -0
  86. package/dist/server/services/log-rotator.js.map +1 -0
  87. package/dist/server/services/logger.d.ts +15 -0
  88. package/dist/server/services/logger.d.ts.map +1 -0
  89. package/dist/server/services/logger.js +69 -0
  90. package/dist/server/services/logger.js.map +1 -0
  91. package/dist/server/services/model-factory.d.ts +10 -0
  92. package/dist/server/services/model-factory.d.ts.map +1 -0
  93. package/dist/server/services/model-factory.js +33 -0
  94. package/dist/server/services/model-factory.js.map +1 -0
  95. package/dist/server/services/runner.d.ts +9 -0
  96. package/dist/server/services/runner.d.ts.map +1 -0
  97. package/dist/server/services/runner.js +99 -0
  98. package/dist/server/services/runner.js.map +1 -0
  99. package/dist/server/services/seeder.d.ts +5 -0
  100. package/dist/server/services/seeder.d.ts.map +1 -0
  101. package/dist/server/services/seeder.js +79 -0
  102. package/dist/server/services/seeder.js.map +1 -0
  103. package/dist/server/services/storage-test-helpers.d.ts +15 -0
  104. package/dist/server/services/storage-test-helpers.d.ts.map +1 -0
  105. package/dist/server/services/storage-test-helpers.js +151 -0
  106. package/dist/server/services/storage-test-helpers.js.map +1 -0
  107. package/dist/server/services/storage.d.ts +35 -0
  108. package/dist/server/services/storage.d.ts.map +1 -0
  109. package/dist/server/services/storage.js +219 -0
  110. package/dist/server/services/storage.js.map +1 -0
  111. package/dist/server/services/tools.d.ts +6 -0
  112. package/dist/server/services/tools.d.ts.map +1 -0
  113. package/dist/server/services/tools.js +94 -0
  114. package/dist/server/services/tools.js.map +1 -0
  115. package/dist/server/services/transcript-formatter.d.ts +18 -0
  116. package/dist/server/services/transcript-formatter.d.ts.map +1 -0
  117. package/dist/server/services/transcript-formatter.js +227 -0
  118. package/dist/server/services/transcript-formatter.js.map +1 -0
  119. package/dist/server/services/update-checker.d.ts +3 -0
  120. package/dist/server/services/update-checker.d.ts.map +1 -0
  121. package/dist/server/services/update-checker.js +34 -0
  122. package/dist/server/services/update-checker.js.map +1 -0
  123. package/dist/server/types/evaluation.d.ts +94 -0
  124. package/dist/server/types/evaluation.d.ts.map +1 -0
  125. package/dist/server/types/evaluation.js +5 -0
  126. package/dist/server/types/evaluation.js.map +1 -0
  127. package/dist/server/types/index.d.ts +5 -0
  128. package/dist/server/types/index.d.ts.map +1 -0
  129. package/dist/server/types/index.js +5 -0
  130. package/dist/server/types/index.js.map +1 -0
  131. package/dist/server/types/provider.d.ts +23 -0
  132. package/dist/server/types/provider.d.ts.map +1 -0
  133. package/dist/server/types/provider.js +5 -0
  134. package/dist/server/types/provider.js.map +1 -0
  135. package/dist/server/types/run.d.ts +31 -0
  136. package/dist/server/types/run.d.ts.map +1 -0
  137. package/dist/server/types/run.js +5 -0
  138. package/dist/server/types/run.js.map +1 -0
  139. package/dist/server/types/scenario.d.ts +19 -0
  140. package/dist/server/types/scenario.d.ts.map +1 -0
  141. package/dist/server/types/scenario.js +5 -0
  142. package/dist/server/types/scenario.js.map +1 -0
  143. package/dist/src/server/index.d.ts +13 -0
  144. package/dist/src/server/index.d.ts.map +1 -0
  145. package/dist/src/server/index.js +72 -0
  146. package/dist/src/server/index.js.map +1 -0
  147. package/dist/src/server/interfaces/evaluator.d.ts +15 -0
  148. package/dist/src/server/interfaces/evaluator.d.ts.map +1 -0
  149. package/dist/src/server/interfaces/evaluator.js +2 -0
  150. package/dist/src/server/interfaces/evaluator.js.map +1 -0
  151. package/dist/src/server/interfaces/logger.d.ts +9 -0
  152. package/dist/src/server/interfaces/logger.d.ts.map +1 -0
  153. package/dist/src/server/interfaces/logger.js +2 -0
  154. package/dist/src/server/interfaces/logger.js.map +1 -0
  155. package/dist/src/server/interfaces/runner.d.ts +9 -0
  156. package/dist/src/server/interfaces/runner.d.ts.map +1 -0
  157. package/dist/src/server/interfaces/runner.js +2 -0
  158. package/dist/src/server/interfaces/runner.js.map +1 -0
  159. package/dist/src/server/interfaces/storage.d.ts +36 -0
  160. package/dist/src/server/interfaces/storage.d.ts.map +1 -0
  161. package/dist/src/server/interfaces/storage.js +2 -0
  162. package/dist/src/server/interfaces/storage.js.map +1 -0
  163. package/dist/src/server/routes/eval-queue.d.ts +23 -0
  164. package/dist/src/server/routes/eval-queue.d.ts.map +1 -0
  165. package/dist/src/server/routes/eval-queue.js +45 -0
  166. package/dist/src/server/routes/eval-queue.js.map +1 -0
  167. package/dist/src/server/routes/evaluations.d.ts +8 -0
  168. package/dist/src/server/routes/evaluations.d.ts.map +1 -0
  169. package/dist/src/server/routes/evaluations.js +221 -0
  170. package/dist/src/server/routes/evaluations.js.map +1 -0
  171. package/dist/src/server/routes/providers.d.ts +5 -0
  172. package/dist/src/server/routes/providers.d.ts.map +1 -0
  173. package/dist/src/server/routes/providers.js +179 -0
  174. package/dist/src/server/routes/providers.js.map +1 -0
  175. package/dist/src/server/routes/run-queue.d.ts +17 -0
  176. package/dist/src/server/routes/run-queue.d.ts.map +1 -0
  177. package/dist/src/server/routes/run-queue.js +34 -0
  178. package/dist/src/server/routes/run-queue.js.map +1 -0
  179. package/dist/src/server/routes/run-sse.d.ts +18 -0
  180. package/dist/src/server/routes/run-sse.d.ts.map +1 -0
  181. package/dist/src/server/routes/run-sse.js +57 -0
  182. package/dist/src/server/routes/run-sse.js.map +1 -0
  183. package/dist/src/server/routes/runs.d.ts +9 -0
  184. package/dist/src/server/routes/runs.d.ts.map +1 -0
  185. package/dist/src/server/routes/runs.js +380 -0
  186. package/dist/src/server/routes/runs.js.map +1 -0
  187. package/dist/src/server/routes/scenarios.d.ts +5 -0
  188. package/dist/src/server/routes/scenarios.d.ts.map +1 -0
  189. package/dist/src/server/routes/scenarios.js +181 -0
  190. package/dist/src/server/routes/scenarios.js.map +1 -0
  191. package/dist/src/server/services/eval-helpers.d.ts +22 -0
  192. package/dist/src/server/services/eval-helpers.d.ts.map +1 -0
  193. package/dist/src/server/services/eval-helpers.js +75 -0
  194. package/dist/src/server/services/eval-helpers.js.map +1 -0
  195. package/dist/src/server/services/eval-parsers-debate-impl.d.ts +11 -0
  196. package/dist/src/server/services/eval-parsers-debate-impl.d.ts.map +1 -0
  197. package/dist/src/server/services/eval-parsers-debate-impl.js +133 -0
  198. package/dist/src/server/services/eval-parsers-debate-impl.js.map +1 -0
  199. package/dist/src/server/services/eval-parsers.d.ts +24 -0
  200. package/dist/src/server/services/eval-parsers.d.ts.map +1 -0
  201. package/dist/src/server/services/eval-parsers.js +153 -0
  202. package/dist/src/server/services/eval-parsers.js.map +1 -0
  203. package/dist/src/server/services/eval-prompts.d.ts +9 -0
  204. package/dist/src/server/services/eval-prompts.d.ts.map +1 -0
  205. package/dist/src/server/services/eval-prompts.js +164 -0
  206. package/dist/src/server/services/eval-prompts.js.map +1 -0
  207. package/dist/src/server/services/evaluator.d.ts +10 -0
  208. package/dist/src/server/services/evaluator.d.ts.map +1 -0
  209. package/dist/src/server/services/evaluator.js +143 -0
  210. package/dist/src/server/services/evaluator.js.map +1 -0
  211. package/dist/src/server/services/fs-adapter.d.ts +20 -0
  212. package/dist/src/server/services/fs-adapter.d.ts.map +1 -0
  213. package/dist/src/server/services/fs-adapter.js +13 -0
  214. package/dist/src/server/services/fs-adapter.js.map +1 -0
  215. package/dist/src/server/services/instruction-parser.d.ts +26 -0
  216. package/dist/src/server/services/instruction-parser.d.ts.map +1 -0
  217. package/dist/src/server/services/instruction-parser.js +121 -0
  218. package/dist/src/server/services/instruction-parser.js.map +1 -0
  219. package/dist/src/server/services/log-rotator.d.ts +20 -0
  220. package/dist/src/server/services/log-rotator.d.ts.map +1 -0
  221. package/dist/src/server/services/log-rotator.js +60 -0
  222. package/dist/src/server/services/log-rotator.js.map +1 -0
  223. package/dist/src/server/services/logger.d.ts +15 -0
  224. package/dist/src/server/services/logger.d.ts.map +1 -0
  225. package/dist/src/server/services/logger.js +69 -0
  226. package/dist/src/server/services/logger.js.map +1 -0
  227. package/dist/src/server/services/model-factory.d.ts +10 -0
  228. package/dist/src/server/services/model-factory.d.ts.map +1 -0
  229. package/dist/src/server/services/model-factory.js +33 -0
  230. package/dist/src/server/services/model-factory.js.map +1 -0
  231. package/dist/src/server/services/runner.d.ts +9 -0
  232. package/dist/src/server/services/runner.d.ts.map +1 -0
  233. package/dist/src/server/services/runner.js +99 -0
  234. package/dist/src/server/services/runner.js.map +1 -0
  235. package/dist/src/server/services/seeder.d.ts +5 -0
  236. package/dist/src/server/services/seeder.d.ts.map +1 -0
  237. package/dist/src/server/services/seeder.js +79 -0
  238. package/dist/src/server/services/seeder.js.map +1 -0
  239. package/dist/src/server/services/storage.d.ts +35 -0
  240. package/dist/src/server/services/storage.d.ts.map +1 -0
  241. package/dist/src/server/services/storage.js +219 -0
  242. package/dist/src/server/services/storage.js.map +1 -0
  243. package/dist/src/server/services/tools.d.ts +6 -0
  244. package/dist/src/server/services/tools.d.ts.map +1 -0
  245. package/dist/src/server/services/tools.js +94 -0
  246. package/dist/src/server/services/tools.js.map +1 -0
  247. package/dist/src/server/services/transcript-formatter.d.ts +18 -0
  248. package/dist/src/server/services/transcript-formatter.d.ts.map +1 -0
  249. package/dist/src/server/services/transcript-formatter.js +227 -0
  250. package/dist/src/server/services/transcript-formatter.js.map +1 -0
  251. package/dist/src/server/services/update-checker.d.ts +3 -0
  252. package/dist/src/server/services/update-checker.d.ts.map +1 -0
  253. package/dist/src/server/services/update-checker.js +34 -0
  254. package/dist/src/server/services/update-checker.js.map +1 -0
  255. package/dist/src/server/types/evaluation.d.ts +94 -0
  256. package/dist/src/server/types/evaluation.d.ts.map +1 -0
  257. package/dist/src/server/types/evaluation.js +5 -0
  258. package/dist/src/server/types/evaluation.js.map +1 -0
  259. package/dist/src/server/types/index.d.ts +5 -0
  260. package/dist/src/server/types/index.d.ts.map +1 -0
  261. package/dist/src/server/types/index.js +5 -0
  262. package/dist/src/server/types/index.js.map +1 -0
  263. package/dist/src/server/types/provider.d.ts +23 -0
  264. package/dist/src/server/types/provider.d.ts.map +1 -0
  265. package/dist/src/server/types/provider.js +5 -0
  266. package/dist/src/server/types/provider.js.map +1 -0
  267. package/dist/src/server/types/run.d.ts +31 -0
  268. package/dist/src/server/types/run.d.ts.map +1 -0
  269. package/dist/src/server/types/run.js +5 -0
  270. package/dist/src/server/types/run.js.map +1 -0
  271. package/dist/src/server/types/scenario.d.ts +19 -0
  272. package/dist/src/server/types/scenario.d.ts.map +1 -0
  273. package/dist/src/server/types/scenario.js +5 -0
  274. package/dist/src/server/types/scenario.js.map +1 -0
  275. package/dist/web/assets/index-AJu1Yn5F.js +70 -0
  276. package/dist/web/assets/index-C_ioEISr.css +1 -0
  277. package/dist/web/index.html +15 -0
  278. package/docs/schemas/provider-api.example.json +12 -0
  279. package/docs/schemas/provider-openai.example.json +11 -0
  280. package/docs/schemas/scenario-baseline.example.json +24 -0
  281. package/docs/schemas/scenario-carwash-baseline.example.json +22 -0
  282. package/docs/schemas/scenario-carwash-with-system-prompt.example.json +24 -0
  283. package/docs/schemas/scenario-golden-rules-baseline.example.json +24 -0
  284. package/docs/schemas/scenario-golden-rules-with-system-prompt.example.json +28 -0
  285. package/docs/schemas/scenario-negative-analysis-baseline.example.json +23 -0
  286. package/docs/schemas/scenario-negative-analysis-with-system-prompt.example.json +25 -0
  287. package/docs/schemas/scenario-with-system-prompt.example.json +25 -0
  288. package/package.json +97 -0
@@ -0,0 +1,75 @@
1
+ // ---------------------------------------------------------------------------
2
+ // Evaluation Helpers — result aggregation extracted from the orchestrator
3
+ // ---------------------------------------------------------------------------
4
+ import { toInstructionCompliance } from './eval-parsers.js';
5
/**
 * Report whether the evaluators' scores have converged.
 *
 * Fewer than two accumulators converge trivially. Otherwise every evaluator
 * must have contributed at least one score, and for each dimension scored by
 * two or more evaluators the highest and lowest values may differ by at most
 * one point.
 */
export function checkConsensus(accumulators) {
    if (accumulators.length < 2) {
        return true;
    }
    const scoreMaps = accumulators.map((acc) => acc.scoreResult.scores ?? {});
    // A single evaluator without any parsed score blocks consensus outright.
    const everyoneScored = scoreMaps.every((scores) => Object.keys(scores).length !== 0);
    if (!everyoneScored) {
        return false;
    }
    const dimensionNames = new Set(scoreMaps.flatMap((scores) => Object.keys(scores)));
    return [...dimensionNames].every((name) => {
        const present = [];
        for (const scores of scoreMaps) {
            const value = scores[name];
            if (value !== undefined) {
                present.push(value);
            }
        }
        // A dimension rated by a single evaluator cannot disagree with anything.
        if (present.length < 2) {
            return true;
        }
        const spread = Math.max(...present) - Math.min(...present);
        // Written as a negated "greater than" so a NaN spread is not treated as disagreement.
        return !(spread > 1);
    });
}
25
/**
 * Combine every evaluator's answer-closeness rating into one comparison.
 *
 * Only defined, strictly positive closeness values are averaged; when none are
 * available the similarity is 0. The answer counts as a match once the average
 * reaches 0.7, and the explanation is the first non-empty evaluator summary.
 */
export function buildAnswerComparison(accumulators) {
    let total = 0;
    let counted = 0;
    for (const { scoreResult } of accumulators) {
        const closeness = scoreResult.overallCloseness;
        if (closeness !== undefined && closeness > 0) {
            total += closeness;
            counted += 1;
        }
    }
    const similarity = counted > 0 ? total / counted : 0;
    const firstSummary = accumulators
        .map(({ scoreResult }) => scoreResult.summary)
        .find((text) => Boolean(text));
    return {
        matches: similarity >= 0.7,
        explanation: firstSummary ?? 'No explanation available',
        similarity,
    };
}
42
/**
 * Build one result entry per critical requirement of the scenario.
 *
 * A requirement is reported as unmet when any evaluator lists it in
 * `scoreResult.missedCritical`. Strings are compared after trimming,
 * collapsing runs of whitespace and lower-casing, so a miss is still
 * recognised when the evaluator echoed the requirement with different casing
 * or spacing (an exact comparison would silently report it as met).
 *
 * @param accumulators Evaluator accumulators; each must expose `scoreResult`.
 * @param scenario Scenario whose `criticalRequirements` are checked. A missing
 *   list is treated as empty.
 * @returns `{ requirement, met, evidence }` entries in the scenario's order.
 */
export function buildCriticalResults(accumulators, scenario) {
    const normalize = (text) => String(text).trim().replace(/\s+/g, ' ').toLowerCase();
    const missed = new Set();
    for (const acc of accumulators) {
        for (const entry of acc.scoreResult.missedCritical ?? []) {
            // Non-string entries cannot name a requirement; skip them.
            if (typeof entry === 'string') {
                missed.add(normalize(entry));
            }
        }
    }
    return (scenario.criticalRequirements ?? []).map((req) => {
        const flagged = missed.has(normalize(req));
        return {
            requirement: req,
            met: !flagged,
            evidence: flagged ? 'Flagged as missed by evaluator' : 'Not flagged',
        };
    });
}
51
/**
 * Fold every evaluator's compliance findings into a single report.
 *
 * The followed / violated / not-applicable lists are unioned with duplicates
 * removed, and `overallCompliance` is the mean of the values that evaluators
 * actually supplied (0 when none did). The merged data is passed through
 * `toInstructionCompliance` and its result is returned.
 */
export function mergeCompliance(accumulators) {
    const followedItems = new Set();
    const violatedItems = new Set();
    const skippedItems = new Set();
    const ratings = [];
    // Adds every entry of an optional list to the given set.
    const collect = (bucket, items) => (items ?? []).forEach((item) => bucket.add(item));
    for (const { complianceResult: report } of accumulators) {
        collect(followedItems, report.followed);
        collect(violatedItems, report.violated);
        collect(skippedItems, report.notApplicable);
        if (report.overallCompliance !== undefined) {
            ratings.push(report.overallCompliance);
        }
    }
    const meanCompliance = ratings.length === 0
        ? 0
        : ratings.reduce((sum, rating) => sum + rating, 0) / ratings.length;
    return toInstructionCompliance({
        followed: Array.from(followedItems),
        violated: Array.from(violatedItems),
        notApplicable: Array.from(skippedItems),
        overallCompliance: meanCompliance,
    });
}
75
+ //# sourceMappingURL=eval-helpers.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"eval-helpers.js","sourceRoot":"","sources":["../../../src/server/services/eval-helpers.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,0EAA0E;AAC1E,8EAA8E;AAQ9E,OAAO,EAAE,uBAAuB,EAAE,MAAM,mBAAmB,CAAC;AAe5D,oFAAoF;AACpF,MAAM,UAAU,cAAc,CAAC,YAA6C;IAC1E,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC;QAAE,OAAO,IAAI,CAAC;IACzC,2EAA2E;IAC3E,kEAAkE;IAClE,MAAM,SAAS,GAAG,YAAY,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,WAAW,CAAC,MAAM,IAAI,EAAE,CAAC,CAAC;IACtE,IAAI,SAAS,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,MAAM,KAAK,CAAC,CAAC;QAAE,OAAO,KAAK,CAAC;IAErE,MAAM,UAAU,GAAG,IAAI,GAAG,CAAC,SAAS,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IACrE,KAAK,MAAM,GAAG,IAAI,UAAU,EAAE,CAAC;QAC7B,MAAM,IAAI,GAAG,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAe,EAAE,CAAC,CAAC,KAAK,SAAS,CAAC,CAAC;QACtF,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC;YAAE,SAAS;QAC9B,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,IAAI,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,IAAI,CAAC,CAAC;QACpD,IAAI,KAAK,GAAG,CAAC;YAAE,OAAO,KAAK,CAAC;IAC9B,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC;AAED,wEAAwE;AACxE,MAAM,UAAU,qBAAqB,CACnC,YAA6C;IAE7C,MAAM,eAAe,GAAG,YAAY;SACjC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,WAAW,CAAC,gBAAgB,CAAC;SAC1C,MAAM,CAAC,CAAC,CAAC,EAAe,EAAE,CAAC,CAAC,KAAK,SAAS,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC;IACxD,MAAM,YAAY,GAAG,eAAe,CAAC,MAAM,GAAG,CAAC;QAC7C,CAAC,CAAC,eAAe,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,GAAG,eAAe,CAAC,MAAM;QACrE,CAAC,CAAC,CAAC,CAAC;IACN,MAAM,SAAS,GAAG,YAAY;SAC3B,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,WAAW,CAAC,OAAO,CAAC;SACjC,MAAM,CAAC,CAAC,CAAC,EAAe,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAEnC,OAAO;QACL,OAAO,EAAE,YAAY,IAAI,GAAG;QAC5B,WAAW,EAAE,SAAS,CAAC,CAAC,CAAC,IAAI,0BAA0B;QACvD,UAAU,EAAE,YAAY;KACzB,CAAC;AACJ,CAAC;AAED,+EAA+E;AAC/E,MAAM,UAAU,oBAAoB,CAClC,YAA6C,EAC7C,QAAkB;IAElB,MAAM,SAAS,GAAG,IAAI,GAAG,CACvB,YAAY,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE
,CAAC,CAAC,CAAC,WAAW,CAAC,cAAc,IAAI,EAAE,CAAC,CAChE,CAAC;IACF,OAAO,QAAQ,CAAC,oBAAoB,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,CAAC;QACjD,WAAW,EAAE,GAAG;QAChB,GAAG,EAAE,CAAC,SAAS,CAAC,GAAG,CAAC,GAAG,CAAC;QACxB,QAAQ,EAAE,SAAS,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,gCAAgC,CAAC,CAAC,CAAC,aAAa;KAChF,CAAC,CAAC,CAAC;AACN,CAAC;AAED,yEAAyE;AACzE,MAAM,UAAU,eAAe,CAC7B,YAA6C;IAE7C,MAAM,QAAQ,GAAG,IAAI,GAAG,EAAU,CAAC;IACnC,MAAM,QAAQ,GAAG,IAAI,GAAG,EAAU,CAAC;IACnC,MAAM,aAAa,GAAG,IAAI,GAAG,EAAU,CAAC;IACxC,IAAI,aAAa,GAAG,CAAC,CAAC;IACtB,IAAI,eAAe,GAAG,CAAC,CAAC;IAExB,KAAK,MAAM,GAAG,IAAI,YAAY,EAAE,CAAC;QAC/B,MAAM,CAAC,GAAG,GAAG,CAAC,gBAAgB,CAAC;QAC/B,CAAC,CAAC,CAAC,QAAQ,IAAI,EAAE,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;QACnD,CAAC,CAAC,CAAC,QAAQ,IAAI,EAAE,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;QACnD,CAAC,CAAC,CAAC,aAAa,IAAI,EAAE,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,aAAa,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;QAC7D,IAAI,CAAC,CAAC,iBAAiB,KAAK,SAAS,EAAE,CAAC;YACtC,aAAa,IAAI,CAAC,CAAC,iBAAiB,CAAC;YACrC,eAAe,EAAE,CAAC;QACpB,CAAC;IACH,CAAC;IAED,OAAO,uBAAuB,CAAC;QAC7B,QAAQ,EAAE,CAAC,GAAG,QAAQ,CAAC;QACvB,QAAQ,EAAE,CAAC,GAAG,QAAQ,CAAC;QACvB,aAAa,EAAE,CAAC,GAAG,aAAa,CAAC;QACjC,iBAAiB,EAAE,eAAe,GAAG,CAAC,CAAC,CAAC,CAAC,aAAa,GAAG,eAAe,CAAC,CAAC,CAAC,CAAC;KAC7E,CAAC,CAAC;AACL,CAAC"}
@@ -0,0 +1,11 @@
1
+ import type { EvaluationSynthesis } from '../types/evaluation.js';
2
/** The three verdict labels a debate response can carry. */
+ export type Verdict = 'AGREE' | 'DISAGREE' | 'PARTIAL';
3
/** Fields extracted from a debate response; `parseDebateResponse` returns a `Partial` of this shape. */
+ export interface DebateParseResult {
4
/** Verdict label taken from the response. */
+ readonly verdict: Verdict;
5
/** Numeric scores keyed by dimension name. */
+ readonly updatedScores: Readonly<Record<string, number>>;
6
/** Critique strings taken from the response. */
+ readonly critiques: readonly string[];
7
/** Free-text reasoning taken from the response. */
+ readonly reasoning: string;
8
+ }
9
/** Parses a synthesis response (JSON when recoverable, otherwise a plain-text fallback) into whichever `EvaluationSynthesis` fields could be extracted. */
+ export declare function parseSynthesisResponse(response: string): Partial<EvaluationSynthesis>;
9
/** Parses a debate response (JSON when recoverable, otherwise a plain-text fallback) into whichever `DebateParseResult` fields could be extracted. */
+ export declare function parseDebateResponse(response: string): Partial<DebateParseResult>;
11
+ //# sourceMappingURL=eval-parsers-debate-impl.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"eval-parsers-debate-impl.d.ts","sourceRoot":"","sources":["../../../src/server/services/eval-parsers-debate-impl.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,mBAAmB,EAAE,MAAM,wBAAwB,CAAC;AAMlE,MAAM,MAAM,OAAO,GAAG,OAAO,GAAG,UAAU,GAAG,SAAS,CAAC;AAEvD,MAAM,WAAW,iBAAiB;IAChC,QAAQ,CAAC,OAAO,EAAE,OAAO,CAAC;IAC1B,QAAQ,CAAC,aAAa,EAAE,QAAQ,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,CAAC;IACzD,QAAQ,CAAC,SAAS,EAAE,SAAS,MAAM,EAAE,CAAC;IACtC,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;CAC5B;AAMD,wBAAgB,sBAAsB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,mBAAmB,CAAC,CAWrF;AAMD,wBAAgB,mBAAmB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,iBAAiB,CAAC,CAahF"}
@@ -0,0 +1,133 @@
1
+ // ---------------------------------------------------------------------------
2
+ // Debate & Synthesis parsers — extracted from eval-parsers.ts
3
+ // ---------------------------------------------------------------------------
4
+ // ---------------------------------------------------------------------------
5
+ // Synthesis response
6
+ // ---------------------------------------------------------------------------
7
/**
 * Parse an evaluator's synthesis response.
 *
 * When JSON can be recovered from the response, each field is sanitised on its
 * own: scores via `validScores`, the weighted total via `clampScore`,
 * confidence via `clamp01` and dissenting notes via `toStringArray`.
 * Otherwise the plain-text fallback parser is used.
 */
export function parseSynthesisResponse(response) {
    const payload = tryParseJson(response);
    if (!payload) {
        return parseSynthesisFromText(response);
    }
    const { dimensionScores, weightedTotal, confidence, dissenting } = payload;
    return {
        dimensionScores: validScores(dimensionScores),
        weightedTotal: clampScore(weightedTotal),
        confidence: clamp01(confidence),
        dissenting: toStringArray(dissenting),
    };
}
19
+ // ---------------------------------------------------------------------------
20
+ // Debate verdict parsing
21
+ // ---------------------------------------------------------------------------
22
/**
 * Parse an evaluator's debate response.
 *
 * When JSON can be recovered from the response, the verdict, updated scores,
 * critiques and reasoning are each sanitised (`reasoning` is kept only when it
 * is a string). Otherwise the verdict and scores are extracted from text
 * patterns by the fallback parser.
 */
export function parseDebateResponse(response) {
    const payload = tryParseJson(response);
    if (!payload) {
        return parseDebateFromText(response);
    }
    const { verdict, updatedScores, critiques, reasoning } = payload;
    let reasoningText;
    if (typeof reasoning === 'string') {
        reasoningText = reasoning;
    }
    return {
        verdict: parseVerdict(verdict),
        updatedScores: validScores(updatedScores),
        critiques: toStringArray(critiques),
        reasoning: reasoningText,
    };
}
35
+ // ---------------------------------------------------------------------------
36
+ // Internal: Text fallback parsers
37
+ // ---------------------------------------------------------------------------
38
/**
 * Text fallback for debate responses that contain no parseable JSON.
 *
 * Looks for a `VERDICT: AGREE|DISAGREE|PARTIAL` marker (case-insensitive,
 * defaulting to 'PARTIAL' when absent) and for `Name: 7` / `Name: 7.5/10`
 * style score entries. Values above 10 are ignored.
 *
 * A score entry must sit on one line: neither the dimension name nor the gap
 * between the colon and the number may cross a line break. Allowing that made
 * names swallow preceding lines (e.g. "AGREE\nAccuracy") and turned headings
 * followed by a numbered list ("Critiques:\n1. ...") into bogus scores.
 *
 * @param text Raw evaluator response.
 * @returns `{ verdict, updatedScores }`; `updatedScores` is `undefined` when
 *   no score entry was found.
 */
function parseDebateFromText(text) {
    const verdictMatch = text.match(/VERDICT\s*:\s*(AGREE|DISAGREE|PARTIAL)/i);
    // The capture group can only be one of the three labels, so upper-casing
    // is all the normalisation it needs.
    const verdict = verdictMatch ? verdictMatch[1].toUpperCase() : 'PARTIAL';
    const scores = {};
    // Name and value are restricted to spaces/tabs so a match never spans lines.
    const scorePattern = /(\w[\w \t]*?):[ \t]*(\d+(?:\.\d+)?)\s*(?:\/\s*10)?/g;
    for (const [, rawName, rawValue] of text.matchAll(scorePattern)) {
        const dim = rawName.trim();
        // The verdict marker is handled above and is never a score dimension.
        if (/^verdict$/i.test(dim)) {
            continue;
        }
        const val = parseFloat(rawValue);
        if (!isNaN(val) && val <= 10) {
            scores[dim] = val;
        }
    }
    return {
        verdict,
        updatedScores: Object.keys(scores).length > 0 ? scores : undefined,
    };
}
60
/**
 * Text fallback for synthesis responses: pulls the number following
 * "weighted total", "weighted average" or "weighted score" (case-insensitive,
 * optional colon) and clamps it with `clampScore`. `weightedTotal` is
 * `undefined` when no such number is present.
 */
function parseSynthesisFromText(text) {
    const hit = text.match(/weighted\s*(?:total|average|score)\s*:?\s*(\d+(?:\.\d+)?)/i);
    if (!hit) {
        return { weightedTotal: undefined };
    }
    return { weightedTotal: clampScore(parseFloat(hit[1])) };
}
66
+ // ---------------------------------------------------------------------------
67
+ // Internal: JSON parsing
68
+ // ---------------------------------------------------------------------------
69
/**
 * Best-effort extraction of a JSON object from a model response.
 *
 * Three candidates are tried in order: the whole text, the contents of the
 * first ``` / ```json fenced block, and the span from the first `{` to the
 * last `}`. The first candidate that parses to a plain object is returned.
 *
 * Parsed values that are not plain objects (numbers, strings, booleans,
 * `null`, arrays) are rejected and the next candidate is tried. Callers read
 * named fields off the result, so returning e.g. `42` would pass their
 * truthiness check and bypass the text fallback with an all-default result.
 *
 * @param text Raw response text.
 * @returns The parsed object, or `undefined` when no candidate yields one.
 */
function tryParseJson(text) {
    // Evaluated lazily and in order, so later patterns only run when needed.
    const extractors = [
        (raw) => raw,
        (raw) => raw.match(/```(?:json)?\s*([\s\S]*?)```/)?.[1],
        (raw) => raw.match(/\{[\s\S]*\}/)?.[0],
    ];
    for (const extract of extractors) {
        const candidate = extract(text);
        if (candidate === undefined) {
            continue;
        }
        try {
            const value = JSON.parse(candidate);
            if (value !== null && typeof value === 'object' && !Array.isArray(value)) {
                return value;
            }
        }
        catch {
            // Not valid JSON; fall through to the next candidate.
        }
    }
    return undefined;
}
96
+ // ---------------------------------------------------------------------------
97
+ // Internal: Utilities
98
+ // ---------------------------------------------------------------------------
99
/**
 * Sanitise an untrusted score map.
 *
 * Keeps only entries whose value is a non-NaN number and clamps each kept
 * value with `clampScore`. Anything that is not a plain key/value object
 * yields an empty map. Arrays are rejected too: they satisfy
 * `typeof === 'object'`, and accepting them would invent dimensions named
 * "0", "1", ...
 *
 * @param scores Candidate score map, typically a field of parsed JSON.
 * @returns A fresh object mapping dimension name to clamped score.
 */
function validScores(scores) {
    if (!scores || typeof scores !== 'object' || Array.isArray(scores)) {
        return {};
    }
    const result = {};
    for (const [name, value] of Object.entries(scores)) {
        if (typeof value === 'number' && !isNaN(value)) {
            result[name] = clampScore(value);
        }
    }
    return result;
}
109
/**
 * Clamp a value into the range [0, 1].
 *
 * Accepts numbers and numeric strings. Every other input (`undefined`, `null`,
 * NaN, booleans, arrays, objects, non-numeric or empty strings) yields 0.
 * The type is checked explicitly because relying on implicit coercion turned
 * `true` into 1 and `[0.5]` into 0.5.
 *
 * @param val Candidate value, typically a field of parsed JSON.
 * @returns A number between 0 and 1 inclusive.
 */
function clamp01(val) {
    const num = typeof val === 'string' && val.trim() !== '' ? Number(val) : val;
    if (typeof num !== 'number' || isNaN(num)) {
        return 0;
    }
    return Math.max(0, Math.min(1, num));
}
114
/**
 * Clamp a value into the score range [0, 10].
 *
 * Accepts numbers and numeric strings. Every other input (`undefined`, `null`,
 * NaN, booleans, arrays, objects, non-numeric or empty strings) yields 0.
 * The type is checked explicitly because relying on implicit coercion turned
 * `true` into 1 and `[5]` into 5.
 *
 * @param val Candidate value, typically a field of parsed JSON.
 * @returns A number between 0 and 10 inclusive.
 */
function clampScore(val) {
    const num = typeof val === 'string' && val.trim() !== '' ? Number(val) : val;
    if (typeof num !== 'number' || isNaN(num)) {
        return 0;
    }
    return Math.max(0, Math.min(10, num));
}
119
/**
 * Return the string elements of `arr` in their original order.
 * Anything that is not an array yields an empty list.
 */
function toStringArray(arr) {
    const strings = [];
    if (Array.isArray(arr)) {
        for (const item of arr) {
            if (typeof item === 'string') {
                strings.push(item);
            }
        }
    }
    return strings;
}
124
/**
 * Normalise a verdict value to 'AGREE', 'DISAGREE' or 'PARTIAL'.
 * Matching is case-insensitive; non-strings and unrecognised labels fall back
 * to 'PARTIAL'.
 */
function parseVerdict(val) {
    const fallback = 'PARTIAL';
    if (typeof val !== 'string') {
        return fallback;
    }
    const label = val.toUpperCase();
    return ['AGREE', 'DISAGREE', 'PARTIAL'].includes(label) ? label : fallback;
}
133
+ //# sourceMappingURL=eval-parsers-debate-impl.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"eval-parsers-debate-impl.js","sourceRoot":"","sources":["../../../src/server/services/eval-parsers-debate-impl.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,8DAA8D;AAC9D,8EAA8E;AAiB9E,8EAA8E;AAC9E,qBAAqB;AACrB,8EAA8E;AAE9E,MAAM,UAAU,sBAAsB,CAAC,QAAgB;IACrD,MAAM,MAAM,GAAG,YAAY,CAAuB,QAAQ,CAAC,CAAC;IAC5D,IAAI,MAAM,EAAE,CAAC;QACX,OAAO;YACL,eAAe,EAAE,WAAW,CAAC,MAAM,CAAC,eAAe,CAAC;YACpD,aAAa,EAAE,UAAU,CAAC,MAAM,CAAC,aAAa,CAAC;YAC/C,UAAU,EAAE,OAAO,CAAC,MAAM,CAAC,UAAU,CAAC;YACtC,UAAU,EAAE,aAAa,CAAC,MAAM,CAAC,UAAU,CAAC;SAC7C,CAAC;IACJ,CAAC;IACD,OAAO,sBAAsB,CAAC,QAAQ,CAAC,CAAC;AAC1C,CAAC;AAED,8EAA8E;AAC9E,yBAAyB;AACzB,8EAA8E;AAE9E,MAAM,UAAU,mBAAmB,CAAC,QAAgB;IAClD,MAAM,MAAM,GAAG,YAAY,CAAoB,QAAQ,CAAC,CAAC;IACzD,IAAI,MAAM,EAAE,CAAC;QACX,OAAO;YACL,OAAO,EAAE,YAAY,CAAC,MAAM,CAAC,OAAO,CAAC;YACrC,aAAa,EAAE,WAAW,CAAC,MAAM,CAAC,aAAa,CAAC;YAChD,SAAS,EAAE,aAAa,CAAC,MAAM,CAAC,SAAS,CAAC;YAC1C,SAAS,EAAE,OAAO,MAAM,CAAC,SAAS,KAAK,QAAQ,CAAC,CAAC,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,CAAC,SAAS;SAC/E,CAAC;IACJ,CAAC;IAED,0DAA0D;IAC1D,OAAO,mBAAmB,CAAC,QAAQ,CAAC,CAAC;AACvC,CAAC;AAED,8EAA8E;AAC9E,kCAAkC;AAClC,8EAA8E;AAE9E,SAAS,mBAAmB,CAAC,IAAY;IACvC,kEAAkE;IAClE,MAAM,YAAY,GAAG,IAAI,CAAC,KAAK,CAAC,yCAAyC,CAAC,CAAC;IAC3E,MAAM,OAAO,GAAG,YAAY,CAAC,CAAC,CAAC,YAAY,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,SAAoB,CAAC;IAEpF,sEAAsE;IACtE,MAAM,MAAM,GAA2B,EAAE,CAAC;IAC1C,MAAM,YAAY,GAAG,iDAAiD,CAAC;IACvE,IAAI,KAA6B,CAAC;IAClC,OAAO,CAAC,KAAK,GAAG,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QAClD,MAAM,GAAG,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;QAC5B,0CAA0C;QAC1C,IAAI,YAAY,CAAC,IAAI,CAAC,GAAG,CAAC;YAAE,SAAS;QACrC,MAAM,GAAG,GAAG,UAAU,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;QACjC,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,GAAG,IAAI,EAAE;YAAE,MAAM,CAAC,GAAG,CAAC,GAAG,GAAG,CAAC;IAClD,CAAC;IAED,OAAO;QACL,OAAO;QACP,aAAa,EAAE,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,SAAS;KACnE,CAAC;AACJ,CAAC;AAED,SAAS,sBAAsB,CAAC,IAAY;IAC1C,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK
,CAC3B,4DAA4D,CAC7D,CAAC;IACF,OAAO;QACL,aAAa,EAAE,UAAU,CAAC,CAAC,CAAC,UAAU,CAAC,UAAU,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS;KAC9E,CAAC;AACJ,CAAC;AAED,8EAA8E;AAC9E,yBAAyB;AACzB,8EAA8E;AAE9E,SAAS,YAAY,CAAI,IAAY;IACnC,IAAI,CAAC;QACH,OAAO,IAAI,CAAC,KAAK,CAAC,IAAI,CAAM,CAAC;IAC/B,CAAC;IAAC,MAAM,CAAC;QACP,eAAe;IACjB,CAAC;IACD,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,8BAA8B,CAAC,CAAC;IAC7D,IAAI,SAAS,EAAE,CAAC;QACd,IAAI,CAAC;YACH,OAAO,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC,CAAC,CAAM,CAAC;QACvC,CAAC;QAAC,MAAM,CAAC;YACP,eAAe;QACjB,CAAC;IACH,CAAC;IACD,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,aAAa,CAAC,CAAC;IAC7C,IAAI,UAAU,EAAE,CAAC;QACf,IAAI,CAAC;YACH,OAAO,IAAI,CAAC,KAAK,CAAC,UAAU,CAAC,CAAC,CAAC,CAAM,CAAC;QACxC,CAAC;QAAC,MAAM,CAAC;YACP,eAAe;QACjB,CAAC;IACH,CAAC;IACD,OAAO,SAAS,CAAC;AACnB,CAAC;AAoBD,8EAA8E;AAC9E,sBAAsB;AACtB,8EAA8E;AAE9E,SAAS,WAAW,CAClB,MAA0C;IAE1C,IAAI,CAAC,MAAM,IAAI,OAAO,MAAM,KAAK,QAAQ;QAAE,OAAO,EAAE,CAAC;IACrD,MAAM,MAAM,GAA2B,EAAE,CAAC;IAC1C,KAAK,MAAM,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,EAAE,CAAC;QAC5C,IAAI,OAAO,CAAC,KAAK,QAAQ,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC;YAAE,MAAM,CAAC,CAAC,CAAC,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC;IACpE,CAAC;IACD,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,SAAS,OAAO,CAAC,GAAuB;IACtC,IAAI,GAAG,KAAK,SAAS,IAAI,KAAK,CAAC,GAAG,CAAC;QAAE,OAAO,CAAC,CAAC;IAC9C,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC;AACvC,CAAC;AAED,SAAS,UAAU,CAAC,GAAuB;IACzC,IAAI,GAAG,KAAK,SAAS,IAAI,KAAK,CAAC,GAAG,CAAC;QAAE,OAAO,CAAC,CAAC;IAC9C,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,EAAE,EAAE,GAAG,CAAC,CAAC,CAAC;AACxC,CAAC;AAED,SAAS,aAAa,CAAC,GAAY;IACjC,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC;QAAE,OAAO,EAAE,CAAC;IACnC,OAAO,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,EAAe,EAAE,CAAC,OAAO,CAAC,KAAK,QAAQ,CAAC,CAAC;AAC/D,CAAC;AAED,SAAS,YAAY,CAAC,GAAY;IAChC,IAAI,OAAO,GAAG,KAAK,QAAQ,EAAE,CAAC;QAC5B,MAAM,KAAK,GAAG,GAAG,CAAC,WAAW,EAAE,CAAC;QAChC,IAAI,KAAK,KAAK,OAAO,IAAI,KAAK,KAAK,UAAU,IAAI,KAAK,KAAK,SAAS,EAAE,CAAC;YACrE,OAAO,KAAK,CAAC;QACf
,CAAC;IACH,CAAC;IACD,OAAO,SAAS,CAAC;AACnB,CAAC"}
@@ -0,0 +1,24 @@
1
+ import type { IndividualEvaluation, InstructionCompliance } from '../types/evaluation.js';
2
+ export { parseSynthesisResponse, parseDebateResponse } from './eval-parsers-debate-impl.js';
3
+ export type { Verdict, DebateParseResult } from './eval-parsers-debate-impl.js';
4
+ export interface ScoreParseResult {
5
+ readonly scores: Readonly<Record<string, number>>;
6
+ readonly overallCloseness: number;
7
+ readonly missedCritical: readonly string[];
8
+ readonly strengths: readonly string[];
9
+ readonly weaknesses: readonly string[];
10
+ readonly summary: string;
11
+ }
12
+ export declare function parseScoreResponse(response: string): Partial<ScoreParseResult>;
13
+ export interface ComplianceParseResult {
14
+ readonly followed: readonly string[];
15
+ readonly violated: readonly string[];
16
+ readonly notApplicable: readonly string[];
17
+ readonly overallCompliance: number;
18
+ }
19
+ export declare function parseComplianceResponse(response: string): Partial<ComplianceParseResult>;
20
+ /** Convert parsed compliance into our InstructionCompliance type. */
21
+ export declare function toInstructionCompliance(result: Partial<ComplianceParseResult>): InstructionCompliance;
22
+ /** Convert score parse result into IndividualEvaluation entries. */
23
+ export declare function toIndividualEvaluations(scores: Readonly<Record<string, number>>, role: string, reasoningMap: Readonly<Record<string, string>>): IndividualEvaluation[];
24
+ //# sourceMappingURL=eval-parsers.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"eval-parsers.d.ts","sourceRoot":"","sources":["../../../src/server/services/eval-parsers.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EACV,oBAAoB,EACpB,qBAAqB,EACtB,MAAM,wBAAwB,CAAC;AAGhC,OAAO,EAAE,sBAAsB,EAAE,mBAAmB,EAAE,MAAM,+BAA+B,CAAC;AAC5F,YAAY,EAAE,OAAO,EAAE,iBAAiB,EAAE,MAAM,+BAA+B,CAAC;AAMhF,MAAM,WAAW,gBAAgB;IAC/B,QAAQ,CAAC,MAAM,EAAE,QAAQ,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,CAAC;IAClD,QAAQ,CAAC,gBAAgB,EAAE,MAAM,CAAC;IAClC,QAAQ,CAAC,cAAc,EAAE,SAAS,MAAM,EAAE,CAAC;IAC3C,QAAQ,CAAC,SAAS,EAAE,SAAS,MAAM,EAAE,CAAC;IACtC,QAAQ,CAAC,UAAU,EAAE,SAAS,MAAM,EAAE,CAAC;IACvC,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;CAC1B;AAED,wBAAgB,kBAAkB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,gBAAgB,CAAC,CAa9E;AAMD,MAAM,WAAW,qBAAqB;IACpC,QAAQ,CAAC,QAAQ,EAAE,SAAS,MAAM,EAAE,CAAC;IACrC,QAAQ,CAAC,QAAQ,EAAE,SAAS,MAAM,EAAE,CAAC;IACrC,QAAQ,CAAC,aAAa,EAAE,SAAS,MAAM,EAAE,CAAC;IAC1C,QAAQ,CAAC,iBAAiB,EAAE,MAAM,CAAC;CACpC;AAED,wBAAgB,uBAAuB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,qBAAqB,CAAC,CAMxF;AAED,qEAAqE;AACrE,wBAAgB,uBAAuB,CACrC,MAAM,EAAE,OAAO,CAAC,qBAAqB,CAAC,GACrC,qBAAqB,CAOvB;AAMD,oEAAoE;AACpE,wBAAgB,uBAAuB,CACrC,MAAM,EAAE,QAAQ,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,EACxC,IAAI,EAAE,MAAM,EACZ,YAAY,EAAE,QAAQ,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,GAC7C,oBAAoB,EAAE,CAOxB"}
@@ -0,0 +1,153 @@
1
+ // ---------------------------------------------------------------------------
2
+ // Evaluation Response Parsers — extract structured data from LLM responses
3
+ // ---------------------------------------------------------------------------
4
+ // Re-export debate/synthesis parsers so existing imports continue to work
5
+ export { parseSynthesisResponse, parseDebateResponse } from './eval-parsers-debate-impl.js';
6
/**
 * Parse an evaluator's scoring response.
 *
 * When a JSON payload can be recovered from the response, each field is
 * sanitised individually (scores filtered and clamped, closeness clamped,
 * list fields reduced to their string members, summary kept only when it is
 * a string). Otherwise the response is scraped as free text.
 *
 * @param response Raw text returned by the evaluator model.
 * @returns The fields that could be recovered; any of them may be missing.
 */
export function parseScoreResponse(response) {
    const payload = tryParseJson(response);
    // Nothing JSON-like could be recovered: scrape "name: value" pairs instead.
    if (!payload) {
        return parseScoreFromText(response);
    }
    const { scores, overallCloseness, missedCritical, strengths, weaknesses, summary } = payload;
    return {
        scores: validScores(scores),
        overallCloseness: clamp01(overallCloseness),
        missedCritical: toStringArray(missedCritical),
        strengths: toStringArray(strengths),
        weaknesses: toStringArray(weaknesses),
        summary: typeof summary === 'string' ? summary : undefined,
    };
}
20
/**
 * Parse an evaluator's instruction-compliance response.
 *
 * A recovered JSON payload is bucketed by per-instruction status; when no
 * JSON can be recovered the response is scanned as free text instead.
 *
 * @param response Raw text returned by the evaluator model.
 * @returns The compliance fields that could be recovered.
 */
export function parseComplianceResponse(response) {
    const payload = tryParseJson(response);
    return payload
        ? categorizeComplianceResults(payload)
        : parseComplianceFromText(response);
}
27
/**
 * Turn a possibly-partial compliance parse result into a fully populated
 * InstructionCompliance value.
 *
 * Missing (null/undefined) list fields become empty arrays and a missing
 * overall compliance becomes 0; values that are present pass through as-is.
 *
 * @param result Partial compliance parse result.
 * @returns An object with all four compliance fields set.
 */
export function toInstructionCompliance(result) {
    const { followed, violated, notApplicable, overallCompliance } = result;
    return {
        followed: followed ?? [],
        violated: violated ?? [],
        notApplicable: notApplicable ?? [],
        overallCompliance: overallCompliance ?? 0,
    };
}
36
+ // ---------------------------------------------------------------------------
37
+ // Helpers
38
+ // ---------------------------------------------------------------------------
39
/**
 * Expand a dimension -> score map into one evaluation entry per dimension.
 *
 * @param scores Map of dimension name to score; each score is passed through clampScore.
 * @param role Evaluator role recorded on every produced entry.
 * @param reasoningMap Map of dimension name to reasoning text; dimensions
 *   without an entry get an empty string.
 * @returns One entry per key of `scores`, in the map's own iteration order.
 */
export function toIndividualEvaluations(scores, role, reasoningMap) {
    const evaluations = [];
    for (const [dimension, score] of Object.entries(scores)) {
        evaluations.push({
            evaluatorRole: role,
            dimension,
            score: clampScore(score),
            reasoning: reasoningMap[dimension] ?? '',
        });
    }
    return evaluations;
}
48
+ // ---------------------------------------------------------------------------
49
+ // Internal: JSON parsing
50
+ // ---------------------------------------------------------------------------
51
/**
 * Best-effort extraction of a JSON object from an LLM response.
 *
 * Candidates are tried in priority order:
 *   1. the whole text,
 *   2. the contents of the first ``` / ```json fenced block,
 *   3. the span from the first "{" to the last "}".
 *
 * Only plain JSON objects are accepted. A candidate that parses to a
 * primitive, null, or an array is skipped so that later candidates (and the
 * callers' text fallbacks) still get a chance; callers read named fields off
 * the result, which is meaningless for anything but an object.
 *
 * @param text Raw response text.
 * @returns The first object recovered, or undefined when none is found.
 */
function tryParseJson(text) {
    const candidates = [text];
    const fenced = text.match(/```(?:json)?\s*([\s\S]*?)```/);
    if (fenced) {
        candidates.push(fenced[1]);
    }
    const braced = text.match(/\{[\s\S]*\}/);
    if (braced) {
        candidates.push(braced[0]);
    }
    for (const candidate of candidates) {
        try {
            const value = JSON.parse(candidate);
            if (value !== null && typeof value === 'object' && !Array.isArray(value)) {
                return value;
            }
        }
        catch {
            // Not valid JSON: fall through to the next candidate.
        }
    }
    return undefined;
}
78
+ // ---------------------------------------------------------------------------
79
+ // Internal: Text fallback parsers
80
+ // ---------------------------------------------------------------------------
81
/**
 * Fallback score extraction for responses that contain no usable JSON.
 *
 * Scans for "<label>: <number>" pairs (optionally followed by "/10") and
 * keeps those whose value is at most 10. A label may contain word characters,
 * spaces and tabs but must sit on a single line, so a heading on the
 * previous line is not glued onto the dimension name.
 *
 * @param text Raw response text.
 * @returns `{ scores }` where `scores` maps label to value, or is undefined
 *   when no pair was found.
 */
function parseScoreFromText(text) {
    const scores = {};
    // Label: a word char followed by word chars / horizontal whitespace only.
    const scorePattern = /(\w[\w \t]*?):\s*(\d+(?:\.\d+)?)\s*(?:\/\s*10)?/g;
    for (const match of text.matchAll(scorePattern)) {
        const dim = match[1].trim();
        const val = parseFloat(match[2]);
        // Values above 10 are not on the 0-10 scale (e.g. percentages, clock times).
        if (!isNaN(val) && val <= 10) {
            scores[dim] = val;
        }
    }
    return { scores: Object.keys(scores).length > 0 ? scores : undefined };
}
93
/**
 * Fallback compliance extraction for responses that contain no usable JSON.
 *
 * This is a keyword heuristic: it records a single placeholder entry under
 * `followed` and/or `violated` depending on which words appear in the text.
 * "compliant" only counts towards `followed` when it is not the tail of
 * "non-compliant"; otherwise a purely negative verdict would be reported as
 * both followed and violated.
 *
 * @param text Raw response text.
 * @returns Compliance fields with placeholder entries; `notApplicable` is
 *   always empty and `overallCompliance` is always undefined.
 */
function parseComplianceFromText(text) {
    const placeholder = '(extracted from text)';
    const mentionsFollowed = /followed|(?<!non-)compliant/i.test(text);
    const mentionsViolated = /violated|non-compliant/i.test(text);
    return {
        followed: mentionsFollowed ? [placeholder] : [],
        violated: mentionsViolated ? [placeholder] : [],
        notApplicable: [],
        overallCompliance: undefined,
    };
}
102
+ // ---------------------------------------------------------------------------
103
+ // Internal: Utilities
104
+ // ---------------------------------------------------------------------------
105
/**
 * Keep only the numeric, non-NaN entries of a score map and clamp each one
 * with clampScore.
 *
 * @param scores Candidate score map; anything falsy or non-object yields `{}`.
 * @returns A new object holding the accepted entries.
 */
function validScores(scores) {
    const cleaned = {};
    if (scores && typeof scores === 'object') {
        Object.entries(scores).forEach(([name, value]) => {
            if (typeof value !== 'number' || isNaN(value)) {
                return;
            }
            cleaned[name] = clampScore(value);
        });
    }
    return cleaned;
}
115
/**
 * Clamp a value into the closed range [0, 1].
 *
 * @param val Value to clamp; undefined or NaN-like input yields 0.
 * @returns A number between 0 and 1 inclusive.
 */
function clamp01(val) {
    const usable = val !== undefined && !isNaN(val);
    if (!usable) {
        return 0;
    }
    const capped = Math.min(1, val);
    return Math.max(0, capped);
}
120
/**
 * Clamp a value into the closed range [0, 10].
 *
 * @param val Value to clamp; undefined or NaN-like input yields 0.
 * @returns A number between 0 and 10 inclusive.
 */
function clampScore(val) {
    const usable = val !== undefined && !isNaN(val);
    if (!usable) {
        return 0;
    }
    const capped = Math.min(10, val);
    return Math.max(0, capped);
}
125
/**
 * Reduce an arbitrary value to the list of strings it contains.
 *
 * @param arr Candidate array; any non-array input yields an empty list.
 * @returns A new array holding only the string elements, in original order.
 */
function toStringArray(arr) {
    const strings = [];
    if (Array.isArray(arr)) {
        for (const item of arr) {
            if (typeof item === 'string') {
                strings.push(item);
            }
        }
    }
    return strings;
}
130
/**
 * Bucket per-instruction compliance results by status.
 *
 * Each entry of `parsed.results` is filed under `followed`, `violated`, or
 * (for any other status) `notApplicable`. The payload comes from model
 * output, so entries are read defensively:
 *   - a null/undefined or otherwise malformed entry is treated like an empty one,
 *   - a non-string `instruction` is reported as '(unknown)',
 *   - a non-string `status` is treated as unrecognised rather than throwing,
 *   - status comparison ignores case and surrounding whitespace.
 *
 * @param parsed Parsed JSON payload; `results` is ignored unless it is an array.
 * @returns The three buckets plus `overallCompliance` clamped by clamp01.
 */
function categorizeComplianceResults(parsed) {
    const followed = [];
    const violated = [];
    const notApplicable = [];
    const entries = Array.isArray(parsed.results) ? parsed.results : [];
    for (const entry of entries) {
        const rawInstruction = entry?.instruction;
        const rawStatus = entry?.status;
        const text = typeof rawInstruction === 'string' ? rawInstruction : '(unknown)';
        const status = typeof rawStatus === 'string' ? rawStatus.trim().toLowerCase() : '';
        if (status === 'followed') {
            followed.push(text);
        }
        else if (status === 'violated') {
            violated.push(text);
        }
        else {
            // Covers "not_applicable" as well as missing or unrecognised statuses.
            notApplicable.push(text);
        }
    }
    return {
        followed,
        violated,
        notApplicable,
        overallCompliance: clamp01(parsed.overallCompliance),
    };
}
153
+ //# sourceMappingURL=eval-parsers.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"eval-parsers.js","sourceRoot":"","sources":["../../../src/server/services/eval-parsers.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,2EAA2E;AAC3E,8EAA8E;AAO9E,0EAA0E;AAC1E,OAAO,EAAE,sBAAsB,EAAE,mBAAmB,EAAE,MAAM,+BAA+B,CAAC;AAgB5F,MAAM,UAAU,kBAAkB,CAAC,QAAgB;IACjD,MAAM,MAAM,GAAG,YAAY,CAAmB,QAAQ,CAAC,CAAC;IACxD,IAAI,MAAM,EAAE,CAAC;QACX,OAAO;YACL,MAAM,EAAE,WAAW,CAAC,MAAM,CAAC,MAAM,CAAC;YAClC,gBAAgB,EAAE,OAAO,CAAC,MAAM,CAAC,gBAAgB,CAAC;YAClD,cAAc,EAAE,aAAa,CAAC,MAAM,CAAC,cAAc,CAAC;YACpD,SAAS,EAAE,aAAa,CAAC,MAAM,CAAC,SAAS,CAAC;YAC1C,UAAU,EAAE,aAAa,CAAC,MAAM,CAAC,UAAU,CAAC;YAC5C,OAAO,EAAE,OAAO,MAAM,CAAC,OAAO,KAAK,QAAQ,CAAC,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,SAAS;SACzE,CAAC;IACJ,CAAC;IACD,OAAO,kBAAkB,CAAC,QAAQ,CAAC,CAAC;AACtC,CAAC;AAaD,MAAM,UAAU,uBAAuB,CAAC,QAAgB;IACtD,MAAM,MAAM,GAAG,YAAY,CAAwB,QAAQ,CAAC,CAAC;IAC7D,IAAI,MAAM,EAAE,CAAC;QACX,OAAO,2BAA2B,CAAC,MAAM,CAAC,CAAC;IAC7C,CAAC;IACD,OAAO,uBAAuB,CAAC,QAAQ,CAAC,CAAC;AAC3C,CAAC;AAED,qEAAqE;AACrE,MAAM,UAAU,uBAAuB,CACrC,MAAsC;IAEtC,OAAO;QACL,QAAQ,EAAE,MAAM,CAAC,QAAQ,IAAI,EAAE;QAC/B,QAAQ,EAAE,MAAM,CAAC,QAAQ,IAAI,EAAE;QAC/B,aAAa,EAAE,MAAM,CAAC,aAAa,IAAI,EAAE;QACzC,iBAAiB,EAAE,MAAM,CAAC,iBAAiB,IAAI,CAAC;KACjD,CAAC;AACJ,CAAC;AAED,8EAA8E;AAC9E,UAAU;AACV,8EAA8E;AAE9E,oEAAoE;AACpE,MAAM,UAAU,uBAAuB,CACrC,MAAwC,EACxC,IAAY,EACZ,YAA8C;IAE9C,OAAO,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,SAAS,EAAE,KAAK,CAAC,EAAE,EAAE,CAAC,CAAC;QACzD,aAAa,EAAE,IAAI;QACnB,SAAS;QACT,KAAK,EAAE,UAAU,CAAC,KAAK,CAAC;QACxB,SAAS,EAAE,YAAY,CAAC,SAAS,CAAC,IAAI,EAAE;KACzC,CAAC,CAAC,CAAC;AACN,CAAC;AAED,8EAA8E;AAC9E,yBAAyB;AACzB,8EAA8E;AAE9E,SAAS,YAAY,CAAI,IAAY;IACnC,IAAI,CAAC;QACH,OAAO,IAAI,CAAC,KAAK,CAAC,IAAI,CAAM,CAAC;IAC/B,CAAC;IAAC,MAAM,CAAC;QACP,eAAe;IACjB,CAAC;IACD,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,8BAA8B,CAAC,CAAC;IAC7D,IAAI,SAAS,EAAE,CAAC;QACd,IAAI,CAAC;YACH,OAAO,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC,CAAC,CAAM,CAAC;QACvC,CAAC;QAAC,MAAM,CAAC;YACP,eAAe;QACjB,CAAC;IACH,CAAC;IACD,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,aAAa,CAA
C,CAAC;IAC7C,IAAI,UAAU,EAAE,CAAC;QACf,IAAI,CAAC;YACH,OAAO,IAAI,CAAC,KAAK,CAAC,UAAU,CAAC,CAAC,CAAC,CAAM,CAAC;QACxC,CAAC;QAAC,MAAM,CAAC;YACP,eAAe;QACjB,CAAC;IACH,CAAC;IACD,OAAO,SAAS,CAAC;AACnB,CAAC;AAED,8EAA8E;AAC9E,kCAAkC;AAClC,8EAA8E;AAE9E,SAAS,kBAAkB,CAAC,IAAY;IACtC,MAAM,MAAM,GAA2B,EAAE,CAAC;IAC1C,MAAM,YAAY,GAAG,iDAAiD,CAAC;IACvE,IAAI,KAA6B,CAAC;IAClC,OAAO,CAAC,KAAK,GAAG,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QAClD,MAAM,GAAG,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;QAC5B,MAAM,GAAG,GAAG,UAAU,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;QACjC,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,GAAG,IAAI,EAAE;YAAE,MAAM,CAAC,GAAG,CAAC,GAAG,GAAG,CAAC;IAClD,CAAC;IACD,OAAO,EAAE,MAAM,EAAE,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,SAAS,EAAE,CAAC;AACzE,CAAC;AAED,SAAS,uBAAuB,CAAC,IAAY;IAC3C,MAAM,QAAQ,GAAa,EAAE,CAAC;IAC9B,MAAM,QAAQ,GAAa,EAAE,CAAC;IAC9B,IAAI,qBAAqB,CAAC,IAAI,CAAC,IAAI,CAAC;QAAE,QAAQ,CAAC,IAAI,CAAC,uBAAuB,CAAC,CAAC;IAC7E,IAAI,yBAAyB,CAAC,IAAI,CAAC,IAAI,CAAC;QAAE,QAAQ,CAAC,IAAI,CAAC,uBAAuB,CAAC,CAAC;IACjF,OAAO,EAAE,QAAQ,EAAE,QAAQ,EAAE,aAAa,EAAE,EAAE,EAAE,iBAAiB,EAAE,SAAS,EAAE,CAAC;AACjF,CAAC;AAoBD,8EAA8E;AAC9E,sBAAsB;AACtB,8EAA8E;AAE9E,SAAS,WAAW,CAClB,MAA0C;IAE1C,IAAI,CAAC,MAAM,IAAI,OAAO,MAAM,KAAK,QAAQ;QAAE,OAAO,EAAE,CAAC;IACrD,MAAM,MAAM,GAA2B,EAAE,CAAC;IAC1C,KAAK,MAAM,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,EAAE,CAAC;QAC5C,IAAI,OAAO,CAAC,KAAK,QAAQ,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC;YAAE,MAAM,CAAC,CAAC,CAAC,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC;IACpE,CAAC;IACD,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,SAAS,OAAO,CAAC,GAAuB;IACtC,IAAI,GAAG,KAAK,SAAS,IAAI,KAAK,CAAC,GAAG,CAAC;QAAE,OAAO,CAAC,CAAC;IAC9C,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC;AACvC,CAAC;AAED,SAAS,UAAU,CAAC,GAAuB;IACzC,IAAI,GAAG,KAAK,SAAS,IAAI,KAAK,CAAC,GAAG,CAAC;QAAE,OAAO,CAAC,CAAC;IAC9C,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,EAAE,EAAE,GAAG,CAAC,CAAC,CAAC;AACxC,CAAC;AAED,SAAS,aAAa,CAAC,GAAY;IACjC,IAAI,CAAC,KAAK
,CAAC,OAAO,CAAC,GAAG,CAAC;QAAE,OAAO,EAAE,CAAC;IACnC,OAAO,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,EAAe,EAAE,CAAC,OAAO,CAAC,KAAK,QAAQ,CAAC,CAAC;AAC/D,CAAC;AAED,SAAS,2BAA2B,CAClC,MAA6B;IAE7B,MAAM,QAAQ,GAAa,EAAE,CAAC;IAC9B,MAAM,QAAQ,GAAa,EAAE,CAAC;IAC9B,MAAM,aAAa,GAAa,EAAE,CAAC;IAEnC,IAAI,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,OAAO,CAAC,EAAE,CAAC;QAClC,KAAK,MAAM,CAAC,IAAI,MAAM,CAAC,OAAO,EAAE,CAAC;YAC/B,MAAM,IAAI,GAAG,CAAC,CAAC,WAAW,IAAI,WAAW,CAAC;YAC1C,MAAM,MAAM,GAAG,CAAC,CAAC,CAAC,MAAM,IAAI,EAAE,CAAC,CAAC,WAAW,EAAE,CAAC;YAC9C,IAAI,MAAM,KAAK,UAAU;gBAAE,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;iBAC1C,IAAI,MAAM,KAAK,UAAU;gBAAE,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;;gBAC/C,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAChC,CAAC;IACH,CAAC;IAED,OAAO;QACL,QAAQ;QACR,QAAQ;QACR,aAAa;QACb,iBAAiB,EAAE,OAAO,CAAC,MAAM,CAAC,iBAAiB,CAAC;KACrD,CAAC;AACJ,CAAC"}
@@ -0,0 +1,9 @@
1
+ import type { Scenario, Provider } from '../types/index.js';
2
+ import type { IndividualEvaluation } from '../types/evaluation.js';
3
+ import type { TranscriptSummary } from './transcript-formatter.js';
4
+ import type { InstructionBlock } from './instruction-parser.js';
5
+ export declare function buildScorePrompt(transcript: string, scenario: Scenario, summary: TranscriptSummary): string;
6
+ export declare function buildCompliancePrompt(transcript: string, scenario: Scenario, instructions: readonly InstructionBlock[]): string;
7
+ export declare function buildDebatePrompt(myPreviousAssessment: string, otherAssessments: readonly string[], roundNumber: number): string;
8
+ export declare function buildSynthesisPrompt(allEvaluations: readonly IndividualEvaluation[], scenario: Scenario, provider: Provider): string;
9
+ //# sourceMappingURL=eval-prompts.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"eval-prompts.d.ts","sourceRoot":"","sources":["../../../src/server/services/eval-prompts.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,QAAQ,EAAoB,QAAQ,EAAE,MAAM,mBAAmB,CAAC;AAC9E,OAAO,KAAK,EAAE,oBAAoB,EAAE,MAAM,wBAAwB,CAAC;AACnE,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,2BAA2B,CAAC;AACnE,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,yBAAyB,CAAC;AAMhE,wBAAgB,gBAAgB,CAC9B,UAAU,EAAE,MAAM,EAClB,QAAQ,EAAE,QAAQ,EAClB,OAAO,EAAE,iBAAiB,GACzB,MAAM,CA+CR;AAMD,wBAAgB,qBAAqB,CACnC,UAAU,EAAE,MAAM,EAClB,QAAQ,EAAE,QAAQ,EAClB,YAAY,EAAE,SAAS,gBAAgB,EAAE,GACxC,MAAM,CA4BR;AAMD,wBAAgB,iBAAiB,CAC/B,oBAAoB,EAAE,MAAM,EAC5B,gBAAgB,EAAE,SAAS,MAAM,EAAE,EACnC,WAAW,EAAE,MAAM,GAClB,MAAM,CAyBR;AAMD,wBAAgB,oBAAoB,CAClC,cAAc,EAAE,SAAS,oBAAoB,EAAE,EAC/C,QAAQ,EAAE,QAAQ,EAClB,QAAQ,EAAE,QAAQ,GACjB,MAAM,CAkCR"}
@@ -0,0 +1,164 @@
1
+ // ---------------------------------------------------------------------------
2
+ // Evaluation Prompt Builders
3
+ // ---------------------------------------------------------------------------
4
+ // ---------------------------------------------------------------------------
5
+ // Query 1: Score + answer comparison prompt
6
+ // ---------------------------------------------------------------------------
7
/**
 * Build the prompt for the scoring / answer-comparison query.
 *
 * The prompt embeds, in order: the scenario prompt, the expected answer, the
 * numbered critical requirements, the grading guidelines (or a default
 * sentence when that field is falsy), the formatted scoring dimensions, a
 * behavior summary taken from `summary` (tool call sequence, files read and
 * modified, number of command failures, whether clarifying questions were
 * asked) and the full transcript. It ends by asking for a JSON object with
 * `scores`, `overallCloseness`, `missedCritical`, `strengths`, `weaknesses`
 * and `summary`.
 *
 * @param transcript Transcript text inserted under "Full Transcript".
 * @param scenario Scenario supplying prompt, expectedAnswer, criticalRequirements,
 *   gradingGuidelines and scoringDimensions.
 * @param summary Transcript summary supplying toolCallSequence, filesRead,
 *   filesModified, commandFailures and askedClarifyingQuestions.
 * @returns The complete prompt string.
 */
+ export function buildScorePrompt(transcript, scenario, summary) {
8
+ const dimensions = formatDimensions(scenario.scoringDimensions);
9
+ const toolSequence = summary.toolCallSequence.length > 0
10
+ ? `Tool call sequence: ${summary.toolCallSequence.join(' → ')}`
11
+ : 'No tool calls recorded.';
12
+ return `You are an expert evaluator assessing an AI agent's performance on a task.
13
+
14
+ ## Task Description
15
+ ${scenario.prompt}
16
+
17
+ ## Expected Answer
18
+ ${scenario.expectedAnswer}
19
+
20
+ ## Critical Requirements
21
+ ${formatCriticalRequirements(scenario.criticalRequirements)}
22
+
23
+ ## Grading Guidelines
24
+ ${scenario.gradingGuidelines || 'No specific grading guidelines provided.'}
25
+
26
+ ## Scoring Dimensions
27
+ ${dimensions}
28
+
29
+ ## Agent Behavior Summary
30
+ ${toolSequence}
31
+ Files read: ${summary.filesRead.length > 0 ? summary.filesRead.join(', ') : 'none'}
32
+ Files modified: ${summary.filesModified.length > 0 ? summary.filesModified.join(', ') : 'none'}
33
+ Command failures: ${summary.commandFailures.length}
34
+ Asked clarifying questions: ${summary.askedClarifyingQuestions ? 'yes' : 'no'}
35
+
36
+ ## Full Transcript
37
+ ${transcript}
38
+
39
+ ## Instructions
40
+ Evaluate the agent's output. For each scoring dimension, provide a score from 0-10.
41
+ Also assess how closely the agent's final answer matches the expected answer (0.0-1.0).
42
+ Identify any critical requirements that were missed, as well as strengths and weaknesses.
43
+
44
+ Respond with valid JSON matching this structure:
45
+ {
46
+ "scores": { "<dimension_name>": <0-10>, ... },
47
+ "overallCloseness": <0.0-1.0>,
48
+ "missedCritical": ["<requirement that was not met>", ...],
49
+ "strengths": ["<strength>", ...],
50
+ "weaknesses": ["<weakness>", ...],
51
+ "summary": "<brief overall assessment>"
52
+ }`;
53
+ }
54
+ // ---------------------------------------------------------------------------
55
+ // Query 2: Instruction compliance prompt
56
+ // ---------------------------------------------------------------------------
57
/**
 * Build the prompt for the instruction-compliance query.
 *
 * Instructions are rendered as a numbered list of "[source] text" lines; when
 * that list is empty a "No instructions configured." sentence is used
 * instead. The prompt then includes the transcript and asks for a JSON object
 * with a per-instruction `results` array (status "followed", "violated" or
 * "not_applicable") and an `overallCompliance` value.
 *
 * @param transcript Transcript text inserted under "Agent Transcript".
 * @param scenario Accepted for signature parity; not referenced in this body.
 * @param instructions Instruction blocks; `source` and `text` are read from each.
 * @returns The complete prompt string.
 */
+ export function buildCompliancePrompt(transcript, scenario, instructions) {
58
+ const instructionList = instructions
59
+ .map((b, i) => `${i + 1}. [${b.source}] ${b.text}`)
60
+ .join('\n');
61
+ return `You are an expert evaluator checking whether an AI agent followed its configured instructions.
62
+
63
+ ## Instructions to Check
64
+ ${instructionList || 'No instructions configured.'}
65
+
66
+ ## Agent Transcript
67
+ ${transcript}
68
+
69
+ ## Instructions
70
+ For each instruction listed above, determine if the agent:
71
+ - "followed" it (clear evidence of compliance)
72
+ - "violated" it (clear evidence of non-compliance)
73
+ - "not_applicable" (instruction was not relevant to this task)
74
+
75
+ Also rate overall compliance from 0.0 to 1.0.
76
+
77
+ Respond with valid JSON:
78
+ {
79
+ "results": [
80
+ { "instruction": "<instruction text>", "status": "followed|violated|not_applicable", "evidence": "<brief evidence>" }
81
+ ],
82
+ "overallCompliance": <0.0-1.0>
83
+ }`;
84
+ }
85
+ // ---------------------------------------------------------------------------
86
+ // Multi-round debate prompt
87
+ // ---------------------------------------------------------------------------
88
/**
 * Build the prompt for one round of the multi-evaluator debate.
 *
 * The other evaluators' assessments are rendered as "### Other Evaluator N"
 * sections separated by blank lines. The prompt shows the caller's previous
 * assessment, then the others', and asks for a JSON object with `verdict`
 * (AGREE, DISAGREE or PARTIAL), `updatedScores`, `critiques` and `reasoning`.
 *
 * @param myPreviousAssessment Text inserted under "Your Previous Assessment".
 * @param otherAssessments Assessment texts from the other evaluators.
 * @param roundNumber Round number interpolated into the opening sentence.
 * @returns The complete prompt string.
 */
+ export function buildDebatePrompt(myPreviousAssessment, otherAssessments, roundNumber) {
89
+ const othersFormatted = otherAssessments
90
+ .map((a, i) => `### Other Evaluator ${i + 1}\n${a}`)
91
+ .join('\n\n');
92
+ return `You are participating in round ${roundNumber} of a multi-evaluator review.
93
+
94
+ ## Your Previous Assessment
95
+ ${myPreviousAssessment}
96
+
97
+ ## Other Evaluators' Assessments
98
+ ${othersFormatted}
99
+
100
+ ## Instructions
101
+ Review the other evaluators' assessments and compare them with your own.
102
+ Determine if you agree, partially agree, or disagree with the emerging consensus.
103
+ Provide your updated scores if you have changed your mind, and explain why.
104
+
105
+ Respond with valid JSON:
106
+ {
107
+ "verdict": "AGREE|DISAGREE|PARTIAL",
108
+ "updatedScores": { "<dimension_name>": <0-10>, ... },
109
+ "critiques": ["<specific point of disagreement>", ...],
110
+ "reasoning": "<why you agree/disagree>"
111
+ }`;
112
+ }
113
+ // ---------------------------------------------------------------------------
114
+ // Synthesis prompt (final aggregation)
115
+ // ---------------------------------------------------------------------------
116
/**
 * Build the prompt for the final synthesis (aggregation) query.
 *
 * Every individual evaluation is rendered as one
 * "[role] dimension: score/10 — reasoning" line. The prompt includes the
 * scenario name and prompt, the formatted scoring dimensions, the provider's
 * name and model, and the evaluation lines, then asks for a JSON object with
 * `dimensionScores`, `weightedTotal`, `confidence` and `dissenting`.
 *
 * @param allEvaluations Evaluations; evaluatorRole, dimension, score and
 *   reasoning are read from each.
 * @param scenario Scenario supplying name, prompt and scoringDimensions.
 * @param provider Provider supplying name and model.
 * @returns The complete prompt string.
 */
+ export function buildSynthesisPrompt(allEvaluations, scenario, provider) {
117
+ const evalSummaries = allEvaluations
118
+ .map((e) => `[${e.evaluatorRole}] ${e.dimension}: ${e.score}/10 — ${e.reasoning}`)
119
+ .join('\n');
120
+ const dimensions = formatDimensions(scenario.scoringDimensions);
121
+ return `You are the final synthesizer for a multi-evaluator assessment.
122
+
123
+ ## Scenario
124
+ ${scenario.name}: ${scenario.prompt}
125
+
126
+ ## Scoring Dimensions & Weights
127
+ ${dimensions}
128
+
129
+ ## Provider
130
+ Name: ${provider.name}
131
+ Model: ${provider.model}
132
+
133
+ ## All Individual Evaluations
134
+ ${evalSummaries}
135
+
136
+ ## Instructions
137
+ Synthesize all evaluations into final scores. Weight each dimension according to the scoring
138
+ dimensions defined above. Identify areas of evaluator consensus and disagreement.
139
+ Provide a confidence level (0.0-1.0) based on evaluator agreement.
140
+
141
+ Respond with valid JSON:
142
+ {
143
+ "dimensionScores": { "<dimension_name>": <0-10>, ... },
144
+ "weightedTotal": <weighted average 0-10>,
145
+ "confidence": <0.0-1.0>,
146
+ "dissenting": ["<areas where evaluators disagreed>", ...]
147
+ }`;
148
+ }
149
+ // ---------------------------------------------------------------------------
150
+ // Internal helpers
151
+ // ---------------------------------------------------------------------------
152
/**
 * Render scoring dimensions as a bulleted list, one
 * "- name (weight: w): description" line per dimension.
 *
 * @param dims Scoring dimensions; name, weight and description are read from each.
 * @returns The newline-joined list, or a fixed fallback sentence when `dims` is empty.
 */
function formatDimensions(dims) {
    if (dims.length === 0) {
        return 'No dimensions defined. Use general quality assessment.';
    }
    const describe = ({ name, weight, description }) => `- ${name} (weight: ${weight}): ${description}`;
    const bullets = dims.map((dim) => describe(dim));
    return bullets.join('\n');
}
159
/**
 * Render critical requirements as a 1-based numbered list.
 *
 * @param reqs Requirement strings.
 * @returns The newline-joined "N. requirement" lines, or 'None specified.'
 *   when `reqs` is empty.
 */
function formatCriticalRequirements(reqs) {
    if (reqs.length === 0) {
        return 'None specified.';
    }
    const numbered = reqs.map((requirement, position) => {
        const ordinal = position + 1;
        return `${ordinal}. ${requirement}`;
    });
    return numbered.join('\n');
}
164
+ //# sourceMappingURL=eval-prompts.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"eval-prompts.js","sourceRoot":"","sources":["../../../src/server/services/eval-prompts.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,6BAA6B;AAC7B,8EAA8E;AAO9E,8EAA8E;AAC9E,4CAA4C;AAC5C,8EAA8E;AAE9E,MAAM,UAAU,gBAAgB,CAC9B,UAAkB,EAClB,QAAkB,EAClB,OAA0B;IAE1B,MAAM,UAAU,GAAG,gBAAgB,CAAC,QAAQ,CAAC,iBAAiB,CAAC,CAAC;IAChE,MAAM,YAAY,GAAG,OAAO,CAAC,gBAAgB,CAAC,MAAM,GAAG,CAAC;QACtD,CAAC,CAAC,uBAAuB,OAAO,CAAC,gBAAgB,CAAC,IAAI,CAAC,KAAK,CAAC,EAAE;QAC/D,CAAC,CAAC,yBAAyB,CAAC;IAE9B,OAAO;;;EAGP,QAAQ,CAAC,MAAM;;;EAGf,QAAQ,CAAC,cAAc;;;EAGvB,0BAA0B,CAAC,QAAQ,CAAC,oBAAoB,CAAC;;;EAGzD,QAAQ,CAAC,iBAAiB,IAAI,0CAA0C;;;EAGxE,UAAU;;;EAGV,YAAY;cACA,OAAO,CAAC,SAAS,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,MAAM;kBAChE,OAAO,CAAC,aAAa,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,MAAM;oBAC1E,OAAO,CAAC,eAAe,CAAC,MAAM;8BACpB,OAAO,CAAC,wBAAwB,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI;;;EAG3E,UAAU;;;;;;;;;;;;;;;EAeV,CAAC;AACH,CAAC;AAED,8EAA8E;AAC9E,yCAAyC;AACzC,8EAA8E;AAE9E,MAAM,UAAU,qBAAqB,CACnC,UAAkB,EAClB,QAAkB,EAClB,YAAyC;IAEzC,MAAM,eAAe,GAAG,YAAY;SACjC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,MAAM,KAAK,CAAC,CAAC,IAAI,EAAE,CAAC;SAClD,IAAI,CAAC,IAAI,CAAC,CAAC;IAEd,OAAO;;;EAGP,eAAe,IAAI,6BAA6B;;;EAGhD,UAAU;;;;;;;;;;;;;;;;EAgBV,CAAC;AACH,CAAC;AAED,8EAA8E;AAC9E,4BAA4B;AAC5B,8EAA8E;AAE9E,MAAM,UAAU,iBAAiB,CAC/B,oBAA4B,EAC5B,gBAAmC,EACnC,WAAmB;IAEnB,MAAM,eAAe,GAAG,gBAAgB;SACrC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,uBAAuB,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC;SACnD,IAAI,CAAC,MAAM,CAAC,CAAC;IAEhB,OAAO,kCAAkC,WAAW;;;EAGpD,oBAAoB;;;EAGpB,eAAe;;;;;;;;;;;;;EAaf,CAAC;AACH,CAAC;AAED,8EAA8E;AAC9E,uCAAuC;AACvC,8EAA8E;AAE9E,MAAM,UAAU,oBAAoB,CAClC,cAA+C,EAC/C,QAAkB,EAClB,QAAkB;IAElB,MAAM,aAAa,GAAG,cAAc;SACjC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,IAAI,CAAC,CAAC,aAAa,KAAK,CAAC,CAAC,SAAS,KAAK,CAAC,CAAC,KAAK,SAAS,CAAC,CAAC,SAAS,EAAE,CAAC;SACjF,IAAI,CAAC,IAAI,CAAC,CAAC
;IAEd,MAAM,UAAU,GAAG,gBAAgB,CAAC,QAAQ,CAAC,iBAAiB,CAAC,CAAC;IAEhE,OAAO;;;EAGP,QAAQ,CAAC,IAAI,KAAK,QAAQ,CAAC,MAAM;;;EAGjC,UAAU;;;QAGJ,QAAQ,CAAC,IAAI;SACZ,QAAQ,CAAC,KAAK;;;EAGrB,aAAa;;;;;;;;;;;;;EAab,CAAC;AACH,CAAC;AAED,8EAA8E;AAC9E,mBAAmB;AACnB,8EAA8E;AAE9E,SAAS,gBAAgB,CAAC,IAAiC;IACzD,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,wDAAwD,CAAC;IACvF,OAAO,IAAI;SACR,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,KAAK,CAAC,CAAC,IAAI,aAAa,CAAC,CAAC,MAAM,MAAM,CAAC,CAAC,WAAW,EAAE,CAAC;SACjE,IAAI,CAAC,IAAI,CAAC,CAAC;AAChB,CAAC;AAED,SAAS,0BAA0B,CAAC,IAAuB;IACzD,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,iBAAiB,CAAC;IAChD,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AACzD,CAAC"}
@@ -0,0 +1,10 @@
1
+ import type { IEvaluator, EvaluationCallbacks } from '../interfaces/evaluator.js';
2
+ import type { Run, Scenario, Provider, Evaluation, EvaluationRequest } from '../types/index.js';
3
+ export declare class EvaluationOrchestrator implements IEvaluator {
4
+ evaluateRun(run: Run, scenario: Scenario, provider: Provider, request: EvaluationRequest, callbacks: EvaluationCallbacks): Promise<Evaluation>;
5
+ private runRound1;
6
+ private runDebateRound;
7
+ private runSynthesis;
8
+ private runQuery;
9
+ }
10
+ //# sourceMappingURL=evaluator.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"evaluator.d.ts","sourceRoot":"","sources":["../../../src/server/services/evaluator.ts"],"names":[],"mappings":"AAKA,OAAO,KAAK,EAAE,UAAU,EAAE,mBAAmB,EAAmB,MAAM,4BAA4B,CAAC;AACnG,OAAO,KAAK,EACV,GAAG,EACH,QAAQ,EACR,QAAQ,EACR,UAAU,EACV,iBAAiB,EAMlB,MAAM,mBAAmB,CAAC;AA6B3B,qBAAa,sBAAuB,YAAW,UAAU;IACjD,WAAW,CACf,GAAG,EAAE,GAAG,EACR,QAAQ,EAAE,QAAQ,EAClB,QAAQ,EAAE,QAAQ,EAClB,OAAO,EAAE,iBAAiB,EAC1B,SAAS,EAAE,mBAAmB,GAC7B,OAAO,CAAC,UAAU,CAAC;YA0ER,SAAS;YA6BT,cAAc;YAyBd,YAAY;YAiBZ,QAAQ;CA6BvB"}