@sanity/ailf 2.0.0 → 2.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (444):
  1. package/canonical/grader-references/README.md +2 -2
  2. package/canonical/reference-solutions/content-lake/mutations.ts +160 -0
  3. package/canonical/reference-solutions/content-lake/realtime.ts +187 -0
  4. package/canonical/reference-solutions/image-handling/asset-pipeline.tsx +166 -0
  5. package/canonical/reference-solutions/portable-text/custom-blocks.ts +204 -0
  6. package/canonical/reference-solutions/portable-text/rendering.tsx +163 -0
  7. package/config/features.ts +1 -1
  8. package/config/models.ts +28 -23
  9. package/config/sources.ts +1 -1
  10. package/config/thresholds.ts +1 -1
  11. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +10 -0
  12. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +185 -0
  13. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +6 -0
  14. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +42 -0
  15. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.d.ts +14 -0
  16. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.js +25 -0
  17. package/dist/_vendor/ailf-core/config-helpers.d.ts +6 -0
  18. package/dist/_vendor/ailf-core/config-helpers.js +29 -0
  19. package/dist/_vendor/ailf-core/examples/index.d.ts +164 -94
  20. package/dist/_vendor/ailf-core/examples/index.js +208 -114
  21. package/dist/_vendor/ailf-core/index.d.ts +1 -0
  22. package/dist/_vendor/ailf-core/index.js +1 -0
  23. package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +94 -0
  24. package/dist/_vendor/ailf-core/ports/artifact-collector.js +13 -0
  25. package/dist/_vendor/ailf-core/ports/capture-comparator.d.ts +138 -0
  26. package/dist/_vendor/ailf-core/ports/capture-comparator.js +10 -0
  27. package/dist/_vendor/ailf-core/ports/context.d.ts +20 -1
  28. package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +6 -0
  29. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
  30. package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +11 -0
  31. package/dist/_vendor/ailf-core/ports/task-source.d.ts +3 -3
  32. package/dist/_vendor/ailf-core/ports/task-source.js +3 -3
  33. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +6 -1
  34. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +14 -2
  35. package/dist/_vendor/ailf-core/services/config-helpers.d.ts +16 -1
  36. package/dist/_vendor/ailf-core/services/config-helpers.js +21 -0
  37. package/dist/_vendor/ailf-core/services/index.d.ts +1 -1
  38. package/dist/_vendor/ailf-core/services/index.js +1 -1
  39. package/dist/_vendor/ailf-core/services/scoring.js +9 -0
  40. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +12 -1
  41. package/dist/_vendor/ailf-core/types/generalized-task.js +1 -1
  42. package/dist/_vendor/ailf-core/types/index.d.ts +47 -4
  43. package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +27 -0
  44. package/dist/_vendor/ailf-shared/eval-modes.d.ts +15 -0
  45. package/dist/_vendor/ailf-shared/eval-modes.js +18 -0
  46. package/dist/_vendor/ailf-tasks/cli.d.ts +8 -0
  47. package/dist/_vendor/ailf-tasks/cli.js +61 -0
  48. package/dist/_vendor/ailf-tasks/index.d.ts +13 -0
  49. package/dist/_vendor/ailf-tasks/index.js +16 -0
  50. package/dist/_vendor/ailf-tasks/parser.d.ts +27 -0
  51. package/dist/_vendor/ailf-tasks/parser.js +73 -0
  52. package/dist/_vendor/ailf-tasks/schemas.d.ts +198 -0
  53. package/dist/_vendor/ailf-tasks/schemas.js +180 -0
  54. package/dist/_vendor/ailf-tasks/validation.d.ts +47 -0
  55. package/dist/_vendor/ailf-tasks/validation.js +162 -0
  56. package/dist/adapters/api-client/remediation.js +2 -2
  57. package/dist/adapters/config-sources/file-config-adapter.js +6 -1
  58. package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +8 -2
  59. package/dist/adapters/index.d.ts +0 -1
  60. package/dist/adapters/index.js +0 -1
  61. package/dist/adapters/task-sources/composite-task-source.d.ts +1 -1
  62. package/dist/adapters/task-sources/composite-task-source.js +1 -1
  63. package/dist/adapters/task-sources/content-lake-task-source.d.ts +4 -6
  64. package/dist/adapters/task-sources/content-lake-task-source.js +4 -6
  65. package/dist/adapters/task-sources/index.d.ts +1 -2
  66. package/dist/adapters/task-sources/index.js +1 -2
  67. package/dist/adapters/task-sources/repo-schemas.d.ts +1 -1
  68. package/dist/adapters/task-sources/repo-schemas.js +2 -2
  69. package/dist/adapters/task-sources/repo-task-source.js +1 -1
  70. package/dist/adapters/task-sources/repo-trigger.d.ts +1 -1
  71. package/dist/adapters/task-sources/repo-trigger.js +1 -1
  72. package/dist/adapters/task-sources/task-file-loader.d.ts +9 -6
  73. package/dist/adapters/task-sources/task-file-loader.js +20 -6
  74. package/dist/agent-observer/test-imports.d.ts +7 -0
  75. package/dist/agent-observer/test-imports.js +185 -0
  76. package/dist/artifact-capture/comparator.d.ts +22 -0
  77. package/dist/artifact-capture/comparator.js +493 -0
  78. package/dist/artifact-capture/filesystem-collector.d.ts +42 -0
  79. package/dist/artifact-capture/filesystem-collector.js +237 -0
  80. package/dist/artifact-capture/redact-artifact.d.ts +20 -0
  81. package/dist/artifact-capture/redact-artifact.js +115 -0
  82. package/dist/assertions/source-isolation.d.ts +1 -1
  83. package/dist/assertions/source-isolation.js +1 -1
  84. package/dist/cli.js +4 -0
  85. package/dist/commands/calculate-scores.js +1 -0
  86. package/dist/commands/capture-compare.d.ts +15 -0
  87. package/dist/commands/capture-compare.js +253 -0
  88. package/dist/commands/capture-list.d.ts +12 -0
  89. package/dist/commands/capture-list.js +147 -0
  90. package/dist/commands/capture.d.ts +9 -0
  91. package/dist/commands/capture.js +16 -0
  92. package/dist/commands/chronic-failures.d.ts +8 -0
  93. package/dist/commands/chronic-failures.js +33 -0
  94. package/dist/commands/explain-handler.d.ts +1 -1
  95. package/dist/commands/explain-handler.js +37 -8
  96. package/dist/commands/fetch-docs.js +1 -0
  97. package/dist/commands/generate-configs.d.ts +3 -3
  98. package/dist/commands/generate-configs.js +20 -8
  99. package/dist/commands/init.d.ts +2 -3
  100. package/dist/commands/init.js +56 -170
  101. package/dist/commands/pipeline-action.d.ts +7 -1
  102. package/dist/commands/pipeline-action.js +43 -19
  103. package/dist/commands/pipeline.d.ts +6 -1
  104. package/dist/commands/pipeline.js +7 -2
  105. package/dist/commands/pr-comment.js +1 -0
  106. package/dist/commands/publish.js +1 -0
  107. package/dist/commands/shared/help.js +2 -2
  108. package/dist/commands/update-quality-scores.d.ts +5 -0
  109. package/dist/commands/update-quality-scores.js +20 -0
  110. package/dist/composition-root.d.ts +2 -3
  111. package/dist/composition-root.js +27 -14
  112. package/dist/config/features.ts +23 -0
  113. package/dist/config/models.ts +100 -0
  114. package/dist/config/prompts.ts +16 -0
  115. package/dist/config/rubrics.ts +225 -0
  116. package/dist/config/schedules.ts +47 -0
  117. package/dist/config/sinks.ts +37 -0
  118. package/dist/config/sources.ts +21 -0
  119. package/dist/config/thresholds.ts +61 -0
  120. package/dist/lib/agent-behavior-report.d.ts +8 -0
  121. package/dist/lib/agent-behavior-report.js +185 -0
  122. package/dist/lib/baseline.d.ts +19 -0
  123. package/dist/lib/baseline.js +153 -0
  124. package/dist/lib/calculate-scores.d.ts +23 -0
  125. package/dist/lib/calculate-scores.js +42 -0
  126. package/dist/lib/compare.d.ts +18 -0
  127. package/dist/lib/compare.js +170 -0
  128. package/dist/lib/coverage-audit.d.ts +4 -0
  129. package/dist/lib/coverage-audit.js +42 -0
  130. package/dist/lib/discovery-report.d.ts +13 -0
  131. package/dist/lib/discovery-report.js +57 -0
  132. package/dist/lib/fetch-docs.d.ts +30 -0
  133. package/dist/lib/fetch-docs.js +171 -0
  134. package/dist/lib/generate-configs.d.ts +25 -0
  135. package/dist/lib/generate-configs.js +42 -0
  136. package/dist/lib/grader-api.d.ts +21 -0
  137. package/dist/lib/grader-api.js +34 -0
  138. package/dist/lib/grader-compare.d.ts +19 -0
  139. package/dist/lib/grader-compare.js +91 -0
  140. package/dist/lib/grader-consistency.d.ts +27 -0
  141. package/dist/lib/grader-consistency.js +79 -0
  142. package/dist/lib/grader-sensitivity.d.ts +19 -0
  143. package/dist/lib/grader-sensitivity.js +75 -0
  144. package/dist/lib/grader-validate.d.ts +19 -0
  145. package/dist/lib/grader-validate.js +78 -0
  146. package/dist/lib/measure-retrieval.d.ts +14 -0
  147. package/dist/lib/measure-retrieval.js +71 -0
  148. package/dist/lib/pr-comment.d.ts +16 -0
  149. package/dist/lib/pr-comment.js +28 -0
  150. package/dist/lib/readiness-report.d.ts +13 -0
  151. package/dist/lib/readiness-report.js +108 -0
  152. package/dist/lib/webhook-server.d.ts +11 -0
  153. package/dist/lib/webhook-server.js +24 -0
  154. package/dist/lib/weekly-digest.d.ts +24 -0
  155. package/dist/lib/weekly-digest.js +148 -0
  156. package/dist/orchestration/build-app-context.js +13 -0
  157. package/dist/orchestration/cache-context.d.ts +23 -0
  158. package/dist/orchestration/cache-context.js +43 -0
  159. package/dist/orchestration/env-bridge.d.ts +21 -0
  160. package/dist/orchestration/env-bridge.js +66 -0
  161. package/dist/orchestration/load-pipeline-tasks.d.ts +34 -0
  162. package/dist/orchestration/load-pipeline-tasks.js +52 -0
  163. package/dist/orchestration/pipeline-orchestrator.js +75 -5
  164. package/dist/orchestration/step-runner.js +5 -1
  165. package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -0
  166. package/dist/orchestration/steps/calculate-scores-step.js +13 -0
  167. package/dist/orchestration/steps/callback-step.js +10 -1
  168. package/dist/orchestration/steps/compare-step.js +6 -3
  169. package/dist/orchestration/steps/discovery-report-step.js +6 -2
  170. package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
  171. package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
  172. package/dist/orchestration/steps/fetch-docs-step.d.ts +1 -0
  173. package/dist/orchestration/steps/fetch-docs-step.js +30 -16
  174. package/dist/orchestration/steps/gap-analysis-step.js +13 -2
  175. package/dist/orchestration/steps/generate-configs-step.d.ts +1 -0
  176. package/dist/orchestration/steps/generate-configs-step.js +50 -15
  177. package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +1 -1
  178. package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
  179. package/dist/orchestration/steps/publish-report-step.js +19 -0
  180. package/dist/orchestration/steps/readiness-step.js +8 -3
  181. package/dist/orchestration/steps/report-step.js +17 -4
  182. package/dist/orchestration/steps/run-eval-step.d.ts +1 -0
  183. package/dist/orchestration/steps/run-eval-step.js +52 -32
  184. package/dist/pipeline/agent-behavior-report.js +6 -0
  185. package/dist/pipeline/attribution.d.ts +1 -1
  186. package/dist/pipeline/attribution.js +1 -1
  187. package/dist/pipeline/cache.js +29 -15
  188. package/dist/pipeline/calculate-scores.d.ts +2 -0
  189. package/dist/pipeline/calculate-scores.js +70 -33
  190. package/dist/pipeline/checks.d.ts +8 -3
  191. package/dist/pipeline/checks.js +23 -3
  192. package/dist/pipeline/chronic-failures.d.ts +55 -0
  193. package/dist/pipeline/chronic-failures.js +110 -0
  194. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +33 -0
  195. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +2 -3
  196. package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +9 -0
  197. package/dist/pipeline/compiler/__tests__/task-bridge.test.js +339 -0
  198. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.d.ts +10 -0
  199. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.js +509 -0
  200. package/dist/pipeline/compiler/assertion-mapper.d.ts +1 -1
  201. package/dist/pipeline/compiler/assertion-mapper.js +1 -1
  202. package/dist/pipeline/compiler/compiler-to-yaml.d.ts +2 -7
  203. package/dist/pipeline/compiler/compiler-to-yaml.js +2 -7
  204. package/dist/pipeline/compiler/config-loader.d.ts +14 -0
  205. package/dist/pipeline/compiler/config-loader.js +42 -2
  206. package/dist/pipeline/compiler/fixture-resolver.d.ts +1 -1
  207. package/dist/pipeline/compiler/fixture-resolver.js +1 -1
  208. package/dist/pipeline/compiler/ignore-fields.d.ts +1 -1
  209. package/dist/pipeline/compiler/ignore-fields.js +1 -1
  210. package/dist/pipeline/compiler/index.d.ts +2 -5
  211. package/dist/pipeline/compiler/index.js +2 -5
  212. package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
  213. package/dist/pipeline/compiler/literacy-bridge.js +1 -1
  214. package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +1 -1
  215. package/dist/pipeline/compiler/mode-bases/agent-harness.js +1 -1
  216. package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +1 -1
  217. package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +1 -1
  218. package/dist/pipeline/compiler/mode-bases/literacy.d.ts +13 -2
  219. package/dist/pipeline/compiler/mode-bases/literacy.js +55 -1
  220. package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +1 -1
  221. package/dist/pipeline/compiler/mode-bases/mcp-server.js +1 -1
  222. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +1 -1
  223. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +1 -1
  224. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +70 -0
  225. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +485 -0
  226. package/dist/pipeline/compiler/mode-handlers/index.d.ts +2 -2
  227. package/dist/pipeline/compiler/mode-handlers/index.js +2 -2
  228. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +1 -1
  229. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +1 -1
  230. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +76 -0
  231. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +245 -0
  232. package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +1 -1
  233. package/dist/pipeline/compiler/mode-handlers/literacy/index.js +1 -1
  234. package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +89 -0
  235. package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +379 -0
  236. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +50 -0
  237. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +334 -0
  238. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +1 -1
  239. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +1 -1
  240. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +1 -1
  241. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +1 -1
  242. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +4 -0
  243. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +69 -0
  244. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +307 -0
  245. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +22 -5
  246. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +6 -0
  247. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +10 -5
  248. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +314 -7
  249. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +10 -0
  250. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +65 -0
  251. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +368 -0
  252. package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +1 -1
  253. package/dist/pipeline/compiler/presets/sanity-literacy.js +1 -1
  254. package/dist/pipeline/compiler/promptfoo-compiler.d.ts +1 -4
  255. package/dist/pipeline/compiler/promptfoo-compiler.js +3 -12
  256. package/dist/pipeline/compiler/provider-assembler.js +13 -7
  257. package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +1 -1
  258. package/dist/pipeline/compiler/sandbox/docker-sandbox.js +1 -1
  259. package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +1 -1
  260. package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +1 -1
  261. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +1 -1
  262. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +1 -1
  263. package/dist/pipeline/compiler/sandbox/index.d.ts +1 -1
  264. package/dist/pipeline/compiler/sandbox/index.js +1 -1
  265. package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +1 -1
  266. package/dist/pipeline/compiler/sandbox/sandbox-selector.js +1 -1
  267. package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +1 -1
  268. package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +1 -1
  269. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +1 -1
  270. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +1 -1
  271. package/dist/pipeline/compiler/scoring-bridge.d.ts +1 -1
  272. package/dist/pipeline/compiler/scoring-bridge.js +1 -1
  273. package/dist/pipeline/compiler/task-bridge.d.ts +41 -0
  274. package/dist/pipeline/compiler/task-bridge.js +92 -0
  275. package/dist/pipeline/compiler/task-graph-builder.d.ts +1 -4
  276. package/dist/pipeline/compiler/task-graph-builder.js +1 -4
  277. package/dist/pipeline/compiler/telemetry/index.d.ts +1 -1
  278. package/dist/pipeline/compiler/telemetry/index.js +1 -1
  279. package/dist/pipeline/compiler/variable-resolver.d.ts +1 -1
  280. package/dist/pipeline/compiler/variable-resolver.js +1 -1
  281. package/dist/pipeline/coverage-audit.d.ts +1 -1
  282. package/dist/pipeline/coverage-audit.js +1 -1
  283. package/dist/pipeline/degradations.d.ts +1 -1
  284. package/dist/pipeline/degradations.js +1 -1
  285. package/dist/pipeline/failure-modes.d.ts +1 -1
  286. package/dist/pipeline/failure-modes.js +13 -1
  287. package/dist/pipeline/gap-analysis.d.ts +1 -1
  288. package/dist/pipeline/gap-analysis.js +3 -1
  289. package/dist/pipeline/generate-configs.d.ts +2 -2
  290. package/dist/pipeline/generate-configs.js +15 -8
  291. package/dist/pipeline/grader-compare-runner.d.ts +1 -1
  292. package/dist/pipeline/grader-compare-runner.js +7 -1
  293. package/dist/pipeline/grader-comparison.d.ts +1 -1
  294. package/dist/pipeline/grader-comparison.js +1 -1
  295. package/dist/pipeline/grader-consistency-runner.d.ts +1 -1
  296. package/dist/pipeline/grader-consistency-runner.js +7 -1
  297. package/dist/pipeline/grader-consistency.d.ts +1 -1
  298. package/dist/pipeline/grader-consistency.js +1 -1
  299. package/dist/pipeline/grader-sensitivity-runner.d.ts +1 -1
  300. package/dist/pipeline/grader-sensitivity-runner.js +1 -1
  301. package/dist/pipeline/grader-sensitivity.d.ts +1 -1
  302. package/dist/pipeline/grader-sensitivity.js +1 -1
  303. package/dist/pipeline/grader-validate-runner.d.ts +1 -1
  304. package/dist/pipeline/grader-validate-runner.js +2 -2
  305. package/dist/pipeline/grader-validation.d.ts +1 -1
  306. package/dist/pipeline/grader-validation.js +1 -1
  307. package/dist/pipeline/map-request-to-config.js +15 -2
  308. package/dist/pipeline/mirror-repo-tasks.d.ts +1 -1
  309. package/dist/pipeline/mirror-repo-tasks.js +1 -1
  310. package/dist/pipeline/plan-format.d.ts +1 -1
  311. package/dist/pipeline/plan-format.js +1 -1
  312. package/dist/pipeline/plan.d.ts +1 -1
  313. package/dist/pipeline/plan.js +67 -29
  314. package/dist/pipeline/probe.d.ts +1 -1
  315. package/dist/pipeline/probe.js +1 -1
  316. package/dist/pipeline/readiness-report.d.ts +2 -2
  317. package/dist/pipeline/readiness-report.js +2 -2
  318. package/dist/pipeline/release-classification.d.ts +1 -1
  319. package/dist/pipeline/release-classification.js +1 -1
  320. package/dist/pipeline/release-report.d.ts +1 -1
  321. package/dist/pipeline/release-report.js +1 -1
  322. package/dist/pipeline/repo-eval-comment.d.ts +1 -1
  323. package/dist/pipeline/repo-eval-comment.js +1 -1
  324. package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
  325. package/dist/pipeline/repo-threshold-evaluator.js +1 -1
  326. package/dist/pipeline/resolve-mappings.d.ts +6 -6
  327. package/dist/pipeline/resolve-mappings.js +44 -44
  328. package/dist/pipeline/retrieval-metrics.d.ts +3 -3
  329. package/dist/pipeline/retrieval-metrics.js +28 -20
  330. package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
  331. package/dist/pipeline/steps/calculate-scores-step.js +89 -0
  332. package/dist/pipeline/steps/compare-step.d.ts +18 -0
  333. package/dist/pipeline/steps/compare-step.js +90 -0
  334. package/dist/pipeline/steps/eval-step.d.ts +53 -0
  335. package/dist/pipeline/steps/eval-step.js +347 -0
  336. package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
  337. package/dist/pipeline/steps/fetch-docs-step.js +84 -0
  338. package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
  339. package/dist/pipeline/steps/generate-configs-step.js +98 -0
  340. package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
  341. package/dist/pipeline/steps/grader-consistency-step.js +74 -0
  342. package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
  343. package/dist/pipeline/steps/publish-report-step.js +243 -0
  344. package/dist/pipeline/steps/report-step.d.ts +13 -0
  345. package/dist/pipeline/steps/report-step.js +56 -0
  346. package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
  347. package/dist/pipeline/steps/update-scores-step.js +42 -0
  348. package/dist/pipeline/targeted-loo.d.ts +1 -1
  349. package/dist/pipeline/targeted-loo.js +1 -1
  350. package/dist/pipeline/thresholds.d.ts +1 -1
  351. package/dist/pipeline/thresholds.js +1 -1
  352. package/dist/pipeline/validate.js +13 -0
  353. package/dist/report-store.d.ts +17 -0
  354. package/dist/report-store.js +24 -0
  355. package/dist/scripts/agent-behavior-report.d.ts +19 -0
  356. package/dist/scripts/agent-behavior-report.js +315 -0
  357. package/dist/scripts/baseline.d.ts +43 -0
  358. package/dist/scripts/baseline.js +267 -0
  359. package/dist/scripts/calculate-scores.d.ts +166 -0
  360. package/dist/scripts/calculate-scores.js +1296 -0
  361. package/dist/scripts/compare.d.ts +22 -0
  362. package/dist/scripts/compare.js +334 -0
  363. package/dist/scripts/coverage-audit.d.ts +44 -0
  364. package/dist/scripts/coverage-audit.js +209 -0
  365. package/dist/scripts/debug-eval.d.ts +19 -0
  366. package/dist/scripts/debug-eval.js +73 -0
  367. package/dist/scripts/discovery-report.d.ts +58 -0
  368. package/dist/scripts/discovery-report.js +250 -0
  369. package/dist/scripts/fetch-docs.d.ts +35 -0
  370. package/dist/scripts/fetch-docs.js +472 -0
  371. package/dist/scripts/generate-configs.d.ts +66 -0
  372. package/dist/scripts/generate-configs.js +459 -0
  373. package/dist/scripts/grader-api.d.ts +27 -0
  374. package/dist/scripts/grader-api.js +206 -0
  375. package/dist/scripts/grader-compare.d.ts +22 -0
  376. package/dist/scripts/grader-compare.js +368 -0
  377. package/dist/scripts/grader-consistency.d.ts +20 -0
  378. package/dist/scripts/grader-consistency.js +313 -0
  379. package/dist/scripts/grader-sensitivity.d.ts +22 -0
  380. package/dist/scripts/grader-sensitivity.js +354 -0
  381. package/dist/scripts/grader-validate.d.ts +19 -0
  382. package/dist/scripts/grader-validate.js +267 -0
  383. package/dist/scripts/measure-retrieval.d.ts +10 -0
  384. package/dist/scripts/measure-retrieval.js +145 -0
  385. package/dist/scripts/migrate-task-mode.d.ts +1 -1
  386. package/dist/scripts/migrate-task-mode.js +1 -1
  387. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +1 -1
  388. package/dist/scripts/migrate-tasks-to-content-lake.js +1 -1
  389. package/dist/scripts/pipeline.d.ts +76 -0
  390. package/dist/scripts/pipeline.js +1031 -0
  391. package/dist/scripts/pr-comment.d.ts +10 -0
  392. package/dist/scripts/pr-comment.js +510 -0
  393. package/dist/scripts/readiness-report.d.ts +88 -0
  394. package/dist/scripts/readiness-report.js +342 -0
  395. package/dist/scripts/update-quality-scores.d.ts +15 -0
  396. package/dist/scripts/update-quality-scores.js +184 -0
  397. package/dist/scripts/validate-task-sources.d.ts +1 -1
  398. package/dist/scripts/validate-task-sources.js +1 -1
  399. package/dist/scripts/validate.d.ts +13 -0
  400. package/dist/scripts/validate.js +79 -0
  401. package/dist/scripts/webhook-server.d.ts +26 -0
  402. package/dist/scripts/webhook-server.js +147 -0
  403. package/dist/scripts/weekly-digest.d.ts +24 -0
  404. package/dist/scripts/weekly-digest.js +144 -0
  405. package/dist/sinks/format-slack.d.ts +64 -0
  406. package/dist/sinks/format-slack.js +306 -0
  407. package/dist/sinks/slack-sink.d.ts +27 -0
  408. package/dist/sinks/slack-sink.js +78 -0
  409. package/dist/sinks/types.d.ts +1 -1
  410. package/dist/sinks/types.js +1 -1
  411. package/dist/sinks/webhook-sink.d.ts +19 -0
  412. package/dist/sinks/webhook-sink.js +50 -0
  413. package/dist/tasks/knowledge-probe/define-type-api.task.ts +66 -0
  414. package/dist/tasks/knowledge-probe/groq-projections.task.ts +62 -0
  415. package/dist/tasks/literacy/content-lake.task.ts +181 -0
  416. package/dist/tasks/literacy/frameworks.task.ts +129 -0
  417. package/dist/tasks/literacy/functions.task.ts +70 -0
  418. package/dist/tasks/literacy/groq.task.ts +259 -0
  419. package/dist/tasks/literacy/image-handling.task.ts +95 -0
  420. package/dist/tasks/literacy/nextjs-live.task.ts +76 -0
  421. package/dist/tasks/literacy/portable-text.task.ts +169 -0
  422. package/dist/tasks/literacy/studio-setup.task.ts +134 -0
  423. package/dist/tasks/literacy/visual-editing.task.ts +147 -0
  424. package/package.json +25 -25
  425. package/tasks/.expanded.agentic.yaml +280 -0
  426. package/tasks/.expanded.yaml +565 -0
  427. package/tasks/knowledge-probe/define-type-api.task.ts +11 -0
  428. package/tasks/knowledge-probe/groq-projections.task.ts +3 -0
  429. package/tasks/literacy/content-lake.task.ts +181 -0
  430. package/tasks/literacy/frameworks.task.ts +1 -0
  431. package/tasks/literacy/functions.task.ts +1 -0
  432. package/tasks/literacy/groq.task.ts +1 -0
  433. package/tasks/literacy/image-handling.task.ts +95 -0
  434. package/tasks/literacy/nextjs-live.task.ts +2 -1
  435. package/tasks/literacy/portable-text.task.ts +169 -0
  436. package/tasks/literacy/studio-setup.task.ts +5 -2
  437. package/tasks/literacy/visual-editing.task.ts +1 -0
  438. package/LICENSE +0 -21
  439. package/tasks/frameworks.yaml +0 -98
  440. package/tasks/functions.yaml +0 -51
  441. package/tasks/groq.yaml +0 -216
  442. package/tasks/nextjs-live.yaml +0 -62
  443. package/tasks/studio-setup.yaml +0 -111
  444. package/tasks/visual-editing.yaml +0 -120
@@ -0,0 +1,22 @@
1
+ /**
2
+ * grader-compare.ts
3
+ *
4
+ * CLI for inter-grader comparison (Phase 3 of grader reliability).
5
+ *
6
+ * Re-runs grading assertions on existing eval results using candidate grader
7
+ * models, then compares the resulting scores against the baseline grader.
8
+ *
9
+ * Usage:
10
+ * pnpm grader-compare # compare vs configured candidates
11
+ * pnpm grader-compare --candidate openai:gpt-5.5-preview
12
+ * pnpm grader-compare --candidate openai:gpt-5.5-preview --candidate anthropic:claude-4-opus
13
+ * pnpm grader-compare --results eval-results.json
14
+ * pnpm grader-compare --format json # machine-readable output
15
+ *
16
+ * Reads: results/latest/eval-results.json (model responses to re-grade)
17
+ * Reads: config/models.yaml (baseline grader + optional candidate list)
18
+ * Writes: results/latest/grader-comparison.json
19
+ *
20
+ * @see docs/exec-plans/completed/grader-reliability.md — Phase 3
21
+ */
22
+ export {};
@@ -0,0 +1,368 @@
1
+ /**
2
+ * grader-compare.ts
3
+ *
4
+ * CLI for inter-grader comparison (Phase 3 of grader reliability).
5
+ *
6
+ * Re-runs grading assertions on existing eval results using candidate grader
7
+ * models, then compares the resulting scores against the baseline grader.
8
+ *
9
+ * Usage:
10
+ * pnpm grader-compare # compare vs configured candidates
11
+ * pnpm grader-compare --candidate openai:gpt-5.5-preview
12
+ * pnpm grader-compare --candidate openai:gpt-5.5-preview --candidate anthropic:claude-4-opus
13
+ * pnpm grader-compare --results eval-results.json
14
+ * pnpm grader-compare --format json # machine-readable output
15
+ *
16
+ * Reads: results/latest/eval-results.json (model responses to re-grade)
17
+ * Reads: config/models.yaml (baseline grader + optional candidate list)
18
+ * Writes: results/latest/grader-comparison.json
19
+ *
20
+ * @see docs/exec-plans/completed/grader-reliability.md — Phase 3
21
+ */
22
+ import { existsSync, readFileSync, writeFileSync } from "fs";
23
+ import { dirname, join, resolve } from "path";
24
+ import { fileURLToPath } from "url";
25
+ import { load } from "js-yaml";
26
+ import { compareGraders, } from "../pipeline/grader-comparison.js";
27
+ import { classifyCorrelation } from "../pipeline/grader-validation.js";
28
+ import { gradeOnce } from "./grader-api.js";
29
+ const __dirname = dirname(fileURLToPath(import.meta.url));
30
+ const ROOT = resolve(__dirname, "..", "..");
31
+ // ---------------------------------------------------------------------------
32
+ // CLI argument parsing
33
+ // ---------------------------------------------------------------------------
34
/** Raw command-line arguments, without the node binary and script path. */
const args = process.argv.slice(2);
/**
 * Collect the values of every occurrence of a repeatable `--name <value>` option.
 *
 * A token that itself starts with `--` is never accepted as a value, so
 * `--candidate --format json` does not produce "--format" as a candidate.
 *
 * @param name Option name without the leading dashes.
 * @returns The values in command-line order (empty when the option is absent).
 */
function getAllOptions(name) {
    const flag = `--${name}`;
    const values = [];
    args.forEach((token, index) => {
        if (token !== flag)
            return;
        const value = args[index + 1];
        // A missing value (flag is the last token) or another flag is not a value.
        if (value !== undefined && !value.startsWith("--")) {
            values.push(value);
        }
    });
    return values;
}
/**
 * Report whether a boolean switch is present on the command line.
 *
 * `--name` always matches. A single-letter name additionally matches the
 * short form `-n`, so `getFlag("h")` recognises `-h`.
 *
 * @param name Switch name without the leading dashes.
 * @returns true when the switch was passed.
 */
function getFlag(name) {
    if (args.includes(`--${name}`))
        return true;
    return name.length === 1 && args.includes(`-${name}`);
}
/**
 * Return the value of the first `--name <value>` occurrence that carries a
 * usable value, or undefined when there is none.
 *
 * @param name Option name without the leading dashes.
 */
function getOption(name) {
    return getAllOptions(name)[0];
}
52
// Settings resolved once from the command line; defaults apply when an option is absent.
const candidateArgs = getAllOptions("candidate");
const resultsPath = getOption("results") ?? "results/latest/eval-results.json";
const format = getOption("format") ?? "table";
const outputPath = getOption("output");
const showHelp = ["help", "h"].some((name) => getFlag(name));
// Print the usage screen and stop before any work is done.
if (showHelp) {
    const helpLines = [
        "",
        "Usage: pnpm grader-compare [options]",
        "",
        "Compare multiple grader models on the same evaluation responses.",
        "",
        "Options:",
        "--candidate <model> Candidate grader model ID (repeatable)",
        "e.g., --candidate openai:gpt-5.5-preview",
        "--results <path> Path to eval results (default: results/latest/eval-results.json)",
        "--format <fmt> Output format: table (default) or json",
        "--output <path> Write JSON report to file",
        "--help, -h Show this help",
        "",
        "If no --candidate flags are provided, reads grader-candidates from config/models.yaml.",
        "",
        "Examples:",
        "pnpm grader-compare --candidate openai:gpt-5.5-preview",
        "pnpm grader-compare --candidate openai:gpt-5.5-preview --candidate anthropic:claude-4-opus",
        "pnpm grader-compare --format json",
        "",
    ];
    console.log(helpLines.join("\n"));
    process.exit(0);
}
80
+ // ---------------------------------------------------------------------------
81
+ // Dimension classification
82
+ // ---------------------------------------------------------------------------
83
+ // DimensionName imported from pipeline/types.ts
84
/**
 * Patterns that map a metric name or rubric text onto a scoring dimension.
 *
 * The words may be joined directly ("taskCompletion"), by "_" or "-", or by
 * whitespace ("task completion"), so that free-form rubric prose is matched
 * as well as structured metric names.
 */
const DIMENSION_PATTERNS = [
    { dimension: "taskCompletion", pattern: /task[\s_-]*completion/i },
    { dimension: "codeCorrectness", pattern: /code[\s_-]*correct/i },
    { dimension: "docCoverage", pattern: /doc(?:umentation)?[\s_-]*coverage/i },
];
/**
 * Classify a component result into a scoring dimension.
 *
 * The assertion's metric name is checked first; only when it matches no
 * dimension is the rubric text (the assertion value, if it is a string) checked.
 *
 * @param comp Component result carrying an optional `assertion` with `metric` and `value`.
 * @returns "taskCompletion", "codeCorrectness", "docCoverage", or null when nothing matches.
 */
function classifyDimension(comp) {
    const assertion = comp.assertion;
    const metric = assertion?.metric ?? "";
    const rubric = typeof assertion?.value === "string" ? assertion.value : "";
    // Metric takes precedence over rubric text, across all dimensions.
    for (const text of [metric, rubric]) {
        const match = DIMENSION_PATTERNS.find(({ pattern }) => pattern.test(text));
        if (match)
            return match.dimension;
    }
    return null;
}
105
+ // ---------------------------------------------------------------------------
106
+ // Judgment extraction (same pattern as grader-consistency.ts)
107
+ // ---------------------------------------------------------------------------
108
/**
 * Derive the feature-area slug from a test description.
 *
 * Expected shape: "[gold] Area Name — Task Description" (the "[gold]" or
 * "[baseline]" tag is optional). The text before the first em dash is
 * trimmed, lower-cased, and its whitespace runs replaced by "-".
 *
 * @returns The area slug, or "unknown" when the description has no em dash.
 */
function detectFeatureArea(description) {
    const withoutTag = description.replace(/^\[(?:gold|baseline)\]\s*/i, "");
    const separatorAt = withoutTag.indexOf("—");
    if (separatorAt === -1) {
        return "unknown";
    }
    const areaName = withoutTag.slice(0, separatorAt).trim();
    return areaName.toLowerCase().replace(/\s+/g, "-");
}
118
/**
 * Derive a task ID slug from a test description.
 *
 * The optional leading "[gold]" / "[baseline]" tag is removed, the rest is
 * lower-cased, whitespace runs become "-", every character outside
 * [a-z0-9-] is dropped, and the result is cut to at most 60 characters.
 */
function detectTaskId(description) {
    const withoutTag = description.replace(/^\[(?:gold|baseline)\]\s*/i, "");
    const hyphenated = withoutTag.toLowerCase().replace(/\s+/g, "-");
    const slug = hyphenated.replace(/[^a-z0-9-]/g, "");
    return slug.slice(0, 60);
}
128
+ // ---------------------------------------------------------------------------
129
+ // Collect gradable judgments from eval results
130
+ // ---------------------------------------------------------------------------
131
/**
 * Collect the rubric judgments that can be re-graded from an eval results object.
 *
 * Only results whose description contains "[gold]" (case-insensitive) are
 * used. Within them, a component yields a judgment when its assertion type
 * is "llm-rubric", classifyDimension recognises a dimension for it, and its
 * assertion value is a non-empty string (the rubric text).
 *
 * @param evalResults Parsed eval results; entries are read from `results.results`.
 * @returns One judgment per qualifying component, in encounter order.
 */
function extractJudgments(evalResults) {
    const collected = [];
    for (const entry of evalResults.results?.results ?? []) {
        const label = entry.testCase?.description ?? entry.description ?? "";
        // Baseline (without-docs) tests are not re-graded.
        if (label.toLowerCase().includes("[gold]")) {
            const area = detectFeatureArea(label);
            const taskId = detectTaskId(label);
            const providerId = entry.provider?.id;
            for (const component of entry.gradingResult?.componentResults ?? []) {
                if (component.assertion?.type === "llm-rubric") {
                    const dimension = classifyDimension(component);
                    const rubricValue = component.assertion.value;
                    const rubricText = typeof rubricValue === "string" ? rubricValue : "";
                    if (dimension && rubricText) {
                        // A missing or non-numeric score is recorded as 0.
                        const originalScore = typeof component.score === "number" ? component.score : 0;
                        collected.push({
                            area,
                            dimension,
                            originalScore,
                            providerId,
                            responseText: entry.response?.output ?? "",
                            rubricText,
                            taskId,
                        });
                    }
                }
            }
        }
    }
    return collected;
}
165
+ // ---------------------------------------------------------------------------
166
+ // Formatted output
167
+ // ---------------------------------------------------------------------------
168
/**
 * Print a human-readable comparison report to stdout.
 *
 * For every entry of `result.pairwise` it prints the overall correlation,
 * bias and mean absolute difference, followed by a per-dimension table.
 * The table header and separator are generated from the same column widths
 * the rows are padded to, so the columns line up. A recommendations section
 * is printed only when `result.recommendations` is non-empty.
 *
 * @param result Comparison result; reads `baselineGrader`, `candidateGraders`,
 *   `pairwise` and `recommendations`.
 */
function formatComparisonReport(result) {
    const rule = "-".repeat(80);
    // Content width of each column; every cell gets one space of padding per side.
    const columns = [
        { title: "Dimension", width: 16 },
        { title: "Correlation", width: 11 },
        { title: "Quality", width: 9 },
        { title: "Bias", width: 6 },
        { title: "MAD", width: 5 },
        { title: "Count", width: 5 },
    ];
    const header = `| ${columns.map((c) => c.title.padEnd(c.width)).join(" | ")} |`;
    const separator = `|${columns.map((c) => "-".repeat(c.width + 2)).join("|")}|`;
    console.log(rule);
    console.log("COMPARISON RESULTS");
    console.log(rule);
    console.log();
    console.log(` Baseline grader: ${result.baselineGrader}`);
    console.log(` Candidates: ${result.candidateGraders.join(", ")}`);
    console.log();
    for (const pair of result.pairwise) {
        console.log(rule);
        console.log(` ${pair.graderA} vs ${pair.graderB}`);
        console.log(rule);
        console.log();
        const sign = pair.bias > 0 ? "+" : "";
        let biasNote = "no systematic bias";
        if (pair.bias > 0) {
            biasNote = "candidate grades higher";
        }
        else if (pair.bias < 0) {
            biasNote = "candidate grades lower";
        }
        console.log(` Overall:`);
        console.log(` Correlation: r=${pair.correlation} (${classifyCorrelation(pair.correlation)})`);
        console.log(` Bias: ${sign}${pair.bias} (${biasNote})`);
        console.log(` Mean Abs Diff: ${pair.meanAbsDiff} points`);
        console.log();
        // Per-dimension table
        console.log(header);
        console.log(separator);
        const dims = [
            { data: pair.perDimension.taskCompletion, name: "Task Completion" },
            { data: pair.perDimension.codeCorrectness, name: "Code Correctness" },
            { data: pair.perDimension.docCoverage, name: "Doc Coverage" },
        ];
        for (const { data, name } of dims) {
            const quality = classifyCorrelation(data.correlation);
            const biasStr = data.bias > 0 ? `+${data.bias}` : `${data.bias}`;
            console.log(`| ${name.padEnd(16)} | r=${String(data.correlation).padStart(9)} | ${quality.padEnd(9)} | ${biasStr.padStart(6)} | ${String(data.meanAbsDiff).padStart(5)} | ${String(data.count).padStart(5)} |`);
        }
        console.log();
    }
    // Recommendations
    if (result.recommendations.length > 0) {
        console.log(rule);
        console.log("RECOMMENDATIONS");
        console.log(rule);
        console.log();
        for (const rec of result.recommendations) {
            let icon = "❌";
            if (rec.recommendation === "comparable") {
                icon = "✅";
            }
            else if (rec.recommendation === "divergent") {
                icon = "⚠️";
            }
            console.log(` ${icon} ${rec.modelId}: ${rec.recommendation}`);
            console.log(` ${rec.reason}`);
        }
        console.log();
    }
}
221
+ // ---------------------------------------------------------------------------
222
+ // Config loading
223
+ // ---------------------------------------------------------------------------
224
/**
 * Read config/models.yaml and resolve the baseline grader and candidate graders.
 *
 * Exits the process with status 1 when the file does not exist. The baseline
 * grader falls back to "openai:gpt-5" / "GPT-5 (grader)" for missing fields.
 * Candidates passed via --candidate replace the "grader-candidates" list from
 * the config entirely; a candidate without a label is labelled with the part
 * of its ID after the last ":".
 *
 * @returns `{ baselineGrader, candidates }`, each grader being `{ id, label }`.
 */
function loadConfig() {
    const modelsPath = join(ROOT, "config", "models.yaml");
    if (!existsSync(modelsPath)) {
        console.error("❌ config/models.yaml not found");
        process.exit(1);
    }
    const data = load(readFileSync(modelsPath, "utf-8"));
    const baselineGrader = {
        id: data?.grader?.id ?? "openai:gpt-5",
        label: data?.grader?.label ?? "GPT-5 (grader)",
    };
    // "provider:model" -> "model"
    const shortName = (modelId) => modelId.split(":").pop() ?? modelId;
    // CLI candidates override config candidates
    const candidates = candidateArgs.length > 0
        ? candidateArgs.map((id) => ({ id, label: shortName(id) }))
        : (data?.["grader-candidates"] ?? []).map((entry) => ({
            id: entry.id,
            label: entry.label ?? shortName(entry.id),
        }));
    return { baselineGrader, candidates };
}
253
+ // ---------------------------------------------------------------------------
254
+ // Main execution
255
+ // ---------------------------------------------------------------------------
256
/**
 * Run the inter-grader comparison end to end.
 *
 * Steps: load the grader config, read the eval results file, extract the
 * gold-test judgments, re-grade every judgment with each candidate grader
 * (sequentially, via gradeOnce), compare against the baseline scores and
 * emit the report.
 *
 * Output rules:
 *  - `--format json` without `--output` prints the JSON report to stdout.
 *  - any other format prints the formatted table report.
 *  - `--output <path>` writes the JSON report to that path for every format.
 *  - the JSON report is also written to results/latest/grader-comparison.json;
 *    failure there is reported as a warning and does not abort the run.
 *
 * Exits with status 1 when there are no candidates, the eval results file is
 * missing, or no gold-test judgments are found.
 */
async function main() {
    console.log("=".repeat(80));
    console.log(" INTER-GRADER COMPARISON");
    console.log("=".repeat(80));
    console.log();
    // Load config
    const { baselineGrader, candidates } = loadConfig();
    if (candidates.length === 0) {
        console.error("❌ No candidate graders specified. Use --candidate <model> or add grader-candidates to config/models.yaml.");
        process.exit(1);
    }
    console.log(` Baseline grader: ${baselineGrader.id} (${baselineGrader.label})`);
    for (const c of candidates) {
        console.log(` Candidate: ${c.id} (${c.label})`);
    }
    console.log();
    // Load eval results (path is resolved against the package root)
    const evalResultsPath = resolve(ROOT, resultsPath);
    if (!existsSync(evalResultsPath)) {
        console.error(`❌ Eval results not found: ${evalResultsPath}`);
        console.error(" Run the evaluation pipeline first: pnpm pipeline");
        process.exit(1);
    }
    const evalResults = JSON.parse(readFileSync(evalResultsPath, "utf-8"));
    // Extract judgments
    const judgments = extractJudgments(evalResults);
    console.log(` Judgments found: ${judgments.length}`);
    if (judgments.length === 0) {
        console.error("❌ No gold-test judgments found in eval results.");
        process.exit(1);
    }
    // Build baseline scores from original eval results (original score scaled by 100 and rounded)
    const baselineScoreSet = {
        label: baselineGrader.label,
        modelId: baselineGrader.id,
        scores: judgments.map((j) => ({
            area: j.area,
            dimension: j.dimension,
            score: Math.round(j.originalScore * 100),
            taskId: j.taskId,
        })),
    };
    // Grade with each candidate
    const candidateScoreSets = [];
    for (const candidate of candidates) {
        console.log();
        console.log(` Grading with ${candidate.id}...`);
        const candidateScores = [];
        let completed = 0;
        let failed = 0;
        for (const j of judgments) {
            const score = await gradeOnce(candidate.id, j.responseText, j.rubricText);
            completed++;
            if (score !== null) {
                candidateScores.push({
                    area: j.area,
                    dimension: j.dimension,
                    score,
                    taskId: j.taskId,
                });
            }
            else {
                // A null score means this judgment could not be graded; it is left out.
                failed++;
            }
            if (completed % 10 === 0 || completed === judgments.length) {
                process.stdout.write(`\r Progress: ${completed}/${judgments.length}${failed > 0 ? ` (${failed} failed)` : ""}`);
            }
        }
        console.log();
        candidateScoreSets.push({
            label: candidate.label,
            modelId: candidate.id,
            scores: candidateScores,
        });
    }
    console.log();
    // Run comparison
    const comparison = compareGraders(baselineScoreSet, candidateScoreSets);
    const json = JSON.stringify(comparison, null, 2);
    // Output
    if (format !== "json") {
        formatComparisonReport(comparison);
    }
    else if (!outputPath) {
        console.log(json);
    }
    // --output is honoured for every format, not only for --format json.
    if (outputPath) {
        writeFileSync(outputPath, json);
        console.log(` ✅ Report written to ${outputPath}`);
    }
    // Write to results/latest/
    const resultFilePath = join(ROOT, "results", "latest", "grader-comparison.json");
    try {
        writeFileSync(resultFilePath, json);
        console.log(` 📄 Report saved: ${resultFilePath}`);
    }
    catch (err) {
        // Best effort: results/latest/ may not exist yet. Say so instead of failing silently.
        const reason = err instanceof Error ? err.message : String(err);
        console.error(` ⚠️ Could not save report to ${resultFilePath}: ${reason}`);
    }
}
361
// Run main() only when this file is the script being executed (as .ts or compiled .js),
// not when it is imported by another module.
if (["grader-compare.ts", "grader-compare.js"].some((suffix) => process.argv[1]?.endsWith(suffix))) {
    main().catch((err) => {
        console.error("❌ Fatal error:", err);
        process.exit(1);
    });
}
@@ -0,0 +1,20 @@
1
+ /**
2
+ * grader-consistency.ts
3
+ *
4
+ * CLI script for measuring grader consistency (Phase 1 of grader reliability).
5
+ *
6
+ * Reads existing eval results, re-runs ONLY the grading assertions N additional
7
+ * times with the configured grader model, and analyzes score variance.
8
+ *
9
+ * This does NOT re-run the models under test — it only re-grades the same
10
+ * responses. Cost is low: ~$0.005 per grading call × N replications.
11
+ *
12
+ * Usage:
13
+ * pnpm grader-consistency # 5 replications (default)
14
+ * pnpm grader-consistency --replications 3 # custom count
15
+ * pnpm grader-consistency --results <path> # custom results file
16
+ *
17
+ * Reads: results/latest/eval-results.json (default)
18
+ * Writes: results/latest/grader-consistency.json
19
+ */
20
+ import "dotenv/config";