@sanity/ailf 0.4.1 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (290) hide show
  1. package/config/features.ts +23 -0
  2. package/config/models.ts +83 -0
  3. package/config/prompts.ts +16 -0
  4. package/config/rubrics.ts +225 -0
  5. package/config/schedules.ts +47 -0
  6. package/config/sinks.ts +37 -0
  7. package/config/sources.ts +21 -0
  8. package/config/thresholds.ts +61 -0
  9. package/dist/_vendor/ailf-core/config-helpers.d.ts +174 -0
  10. package/dist/_vendor/ailf-core/config-helpers.js +150 -0
  11. package/dist/_vendor/ailf-core/env-helper.d.ts +35 -0
  12. package/dist/_vendor/ailf-core/env-helper.js +45 -0
  13. package/dist/_vendor/ailf-core/examples/index.d.ts +10 -10
  14. package/dist/_vendor/ailf-core/examples/index.js +10 -10
  15. package/dist/_vendor/ailf-core/index.d.ts +3 -0
  16. package/dist/_vendor/ailf-core/index.js +5 -0
  17. package/dist/_vendor/ailf-core/ports/context.d.ts +15 -2
  18. package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +2 -2
  19. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -1
  20. package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +129 -0
  21. package/dist/_vendor/ailf-core/ports/mode-handler.js +19 -0
  22. package/dist/_vendor/ailf-core/ports/task-source.d.ts +16 -122
  23. package/dist/_vendor/ailf-core/ports/task-source.js +7 -7
  24. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +7 -2
  25. package/dist/_vendor/ailf-core/schemas/eval-config.js +7 -2
  26. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +8 -3
  27. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +6 -1
  28. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +32 -31
  29. package/dist/_vendor/ailf-core/schemas/pipeline.js +52 -12
  30. package/dist/_vendor/ailf-core/schemas/schedules.d.ts +14 -4
  31. package/dist/_vendor/ailf-core/schemas/schedules.js +6 -2
  32. package/dist/_vendor/ailf-core/schemas/sinks.d.ts +1 -1
  33. package/dist/_vendor/ailf-core/services/comparison-formatters.js +57 -19
  34. package/dist/_vendor/ailf-core/services/index.d.ts +2 -1
  35. package/dist/_vendor/ailf-core/services/index.js +2 -1
  36. package/dist/_vendor/ailf-core/services/scoring-engine.d.ts +153 -0
  37. package/dist/_vendor/ailf-core/services/scoring-engine.js +237 -0
  38. package/dist/_vendor/ailf-core/services/scoring.d.ts +15 -2
  39. package/dist/_vendor/ailf-core/services/scoring.js +25 -15
  40. package/dist/_vendor/ailf-core/types/branded-ids.d.ts +137 -0
  41. package/dist/_vendor/ailf-core/types/branded-ids.js +136 -0
  42. package/dist/_vendor/ailf-core/types/eval-mode-config.d.ts +150 -0
  43. package/dist/_vendor/ailf-core/types/eval-mode-config.js +24 -0
  44. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +319 -0
  45. package/dist/_vendor/ailf-core/types/generalized-task.js +13 -0
  46. package/dist/_vendor/ailf-core/types/index.d.ts +45 -81
  47. package/dist/_vendor/ailf-core/types/index.js +8 -1
  48. package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +202 -0
  49. package/dist/_vendor/ailf-core/types/plugin-registry.js +132 -0
  50. package/dist/_vendor/ailf-core/types/storage-schema.d.ts +199 -0
  51. package/dist/_vendor/ailf-core/types/storage-schema.js +39 -0
  52. package/dist/_vendor/ailf-core/types/task-graph.d.ts +86 -0
  53. package/dist/_vendor/ailf-core/types/task-graph.js +20 -0
  54. package/dist/_vendor/ailf-core/types/trace.d.ts +118 -0
  55. package/dist/_vendor/ailf-core/types/trace.js +18 -0
  56. package/dist/_vendor/ailf-core/types/variable-envelope.d.ts +80 -0
  57. package/dist/_vendor/ailf-core/types/variable-envelope.js +16 -0
  58. package/dist/_vendor/ailf-shared/dimension-names.d.ts +5 -18
  59. package/dist/_vendor/ailf-shared/dimension-names.js +6 -24
  60. package/dist/_vendor/ailf-shared/eval-modes.d.ts +38 -6
  61. package/dist/_vendor/ailf-shared/eval-modes.js +26 -2
  62. package/dist/_vendor/ailf-shared/index.d.ts +0 -1
  63. package/dist/_vendor/ailf-shared/index.js +0 -1
  64. package/dist/adapters/api-client/build-request.js +14 -13
  65. package/dist/adapters/config-sources/file-config-adapter.d.ts +20 -11
  66. package/dist/adapters/config-sources/file-config-adapter.js +38 -12
  67. package/dist/adapters/config-sources/index.d.ts +2 -0
  68. package/dist/adapters/config-sources/index.js +1 -0
  69. package/dist/adapters/config-sources/ts-config-loader.d.ts +59 -0
  70. package/dist/adapters/config-sources/ts-config-loader.js +133 -0
  71. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +3 -2
  72. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +7 -2
  73. package/dist/adapters/task-sources/composite-task-source.d.ts +3 -3
  74. package/dist/adapters/task-sources/composite-task-source.js +1 -1
  75. package/dist/adapters/task-sources/content-lake-task-source.d.ts +7 -6
  76. package/dist/adapters/task-sources/content-lake-task-source.js +22 -23
  77. package/dist/adapters/task-sources/index.d.ts +1 -0
  78. package/dist/adapters/task-sources/index.js +1 -0
  79. package/dist/adapters/task-sources/repo-task-source.d.ts +4 -4
  80. package/dist/adapters/task-sources/repo-task-source.js +69 -16
  81. package/dist/adapters/task-sources/task-file-loader.d.ts +64 -0
  82. package/dist/adapters/task-sources/task-file-loader.js +83 -0
  83. package/dist/adapters/task-sources/yaml-task-source.d.ts +6 -6
  84. package/dist/adapters/task-sources/yaml-task-source.js +19 -16
  85. package/dist/cli.js +0 -2
  86. package/dist/commands/baseline.js +4 -1
  87. package/dist/commands/calculate-scores.js +1 -1
  88. package/dist/commands/coverage-audit.js +7 -1
  89. package/dist/commands/explain-handler.js +25 -23
  90. package/dist/commands/fetch-docs.js +3 -2
  91. package/dist/commands/generate-configs.js +1 -1
  92. package/dist/commands/interactive.js +11 -7
  93. package/dist/commands/pipeline-action.d.ts +2 -0
  94. package/dist/commands/pipeline-action.js +16 -6
  95. package/dist/commands/pipeline.d.ts +1 -0
  96. package/dist/commands/pipeline.js +4 -2
  97. package/dist/commands/pr-comment.js +1 -1
  98. package/dist/commands/publish.js +2 -2
  99. package/dist/commands/readiness-report.js +13 -6
  100. package/dist/composition-root.d.ts +1 -1
  101. package/dist/composition-root.js +67 -4
  102. package/dist/orchestration/build-app-context.js +1 -0
  103. package/dist/orchestration/build-step-sequence.js +24 -6
  104. package/dist/orchestration/steps/calculate-scores-step.js +24 -11
  105. package/dist/orchestration/steps/fetch-docs-step.js +6 -4
  106. package/dist/orchestration/steps/gap-analysis-step.js +8 -7
  107. package/dist/orchestration/steps/generate-configs-step.d.ts +16 -3
  108. package/dist/orchestration/steps/generate-configs-step.js +245 -51
  109. package/dist/orchestration/steps/grader-consistency-step.js +7 -4
  110. package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
  111. package/dist/orchestration/steps/readiness-step.js +5 -6
  112. package/dist/orchestration/steps/run-eval-step.d.ts +1 -2
  113. package/dist/orchestration/steps/run-eval-step.js +8 -7
  114. package/dist/pipeline/cache.d.ts +1 -1
  115. package/dist/pipeline/cache.js +36 -8
  116. package/dist/pipeline/calculate-scores.d.ts +5 -7
  117. package/dist/pipeline/calculate-scores.js +74 -153
  118. package/dist/pipeline/checks.js +2 -2
  119. package/dist/pipeline/compare.js +8 -8
  120. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.d.ts +10 -0
  121. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +288 -0
  122. package/dist/pipeline/compiler/__tests__/assertion-mapper.test.d.ts +9 -0
  123. package/dist/pipeline/compiler/__tests__/assertion-mapper.test.js +145 -0
  124. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.d.ts +10 -0
  125. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +314 -0
  126. package/dist/pipeline/compiler/__tests__/literacy-handler.test.d.ts +10 -0
  127. package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +486 -0
  128. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.d.ts +10 -0
  129. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +355 -0
  130. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.d.ts +9 -0
  131. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +333 -0
  132. package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.d.ts +12 -0
  133. package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.js +210 -0
  134. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.d.ts +7 -0
  135. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +471 -0
  136. package/dist/pipeline/compiler/__tests__/scoring-bridge.test.d.ts +10 -0
  137. package/dist/pipeline/compiler/__tests__/scoring-bridge.test.js +184 -0
  138. package/dist/pipeline/compiler/__tests__/task-graph-builder.test.d.ts +8 -0
  139. package/dist/pipeline/compiler/__tests__/task-graph-builder.test.js +301 -0
  140. package/dist/pipeline/compiler/__tests__/telemetry.test.d.ts +9 -0
  141. package/dist/pipeline/compiler/__tests__/telemetry.test.js +503 -0
  142. package/dist/pipeline/compiler/assertion-mapper.d.ts +58 -0
  143. package/dist/pipeline/compiler/assertion-mapper.js +175 -0
  144. package/dist/pipeline/compiler/compiler-to-yaml.d.ts +51 -0
  145. package/dist/pipeline/compiler/compiler-to-yaml.js +222 -0
  146. package/dist/pipeline/compiler/config-loader.d.ts +56 -0
  147. package/dist/pipeline/compiler/config-loader.js +111 -0
  148. package/dist/pipeline/compiler/fixture-resolver.d.ts +41 -0
  149. package/dist/pipeline/compiler/fixture-resolver.js +113 -0
  150. package/dist/pipeline/compiler/hash.d.ts +11 -0
  151. package/dist/pipeline/compiler/hash.js +18 -0
  152. package/dist/pipeline/compiler/ignore-fields.d.ts +53 -0
  153. package/dist/pipeline/compiler/ignore-fields.js +113 -0
  154. package/dist/pipeline/compiler/index.d.ts +29 -0
  155. package/dist/pipeline/compiler/index.js +45 -0
  156. package/dist/pipeline/compiler/literacy-bridge.d.ts +102 -0
  157. package/dist/pipeline/compiler/literacy-bridge.js +172 -0
  158. package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.d.ts +14 -0
  159. package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.js +152 -0
  160. package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.d.ts +32 -0
  161. package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.js +176 -0
  162. package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.d.ts +49 -0
  163. package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.js +259 -0
  164. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +70 -0
  165. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +485 -0
  166. package/dist/pipeline/compiler/mode-handlers/index.d.ts +16 -0
  167. package/dist/pipeline/compiler/mode-handlers/index.js +21 -0
  168. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +76 -0
  169. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +245 -0
  170. package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +89 -0
  171. package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +379 -0
  172. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +50 -0
  173. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +277 -0
  174. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +67 -0
  175. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +309 -0
  176. package/dist/pipeline/compiler/presets/index.d.ts +9 -0
  177. package/dist/pipeline/compiler/presets/index.js +8 -0
  178. package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +45 -0
  179. package/dist/pipeline/compiler/presets/sanity-literacy.js +354 -0
  180. package/dist/pipeline/compiler/promptfoo-compiler.d.ts +96 -0
  181. package/dist/pipeline/compiler/promptfoo-compiler.js +230 -0
  182. package/dist/pipeline/compiler/provider-assembler.d.ts +39 -0
  183. package/dist/pipeline/compiler/provider-assembler.js +137 -0
  184. package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +21 -0
  185. package/dist/pipeline/compiler/sandbox/docker-sandbox.js +136 -0
  186. package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +69 -0
  187. package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +189 -0
  188. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +20 -0
  189. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +114 -0
  190. package/dist/pipeline/compiler/sandbox/index.d.ts +10 -0
  191. package/dist/pipeline/compiler/sandbox/index.js +11 -0
  192. package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +35 -0
  193. package/dist/pipeline/compiler/sandbox/sandbox-selector.js +86 -0
  194. package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +81 -0
  195. package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +15 -0
  196. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +20 -0
  197. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +74 -0
  198. package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -0
  199. package/dist/pipeline/compiler/scoring-bridge.js +114 -0
  200. package/dist/pipeline/compiler/task-graph-builder.d.ts +54 -0
  201. package/dist/pipeline/compiler/task-graph-builder.js +291 -0
  202. package/dist/pipeline/compiler/telemetry/cost-tracker.d.ts +90 -0
  203. package/dist/pipeline/compiler/telemetry/cost-tracker.js +146 -0
  204. package/dist/pipeline/compiler/telemetry/index.d.ts +14 -0
  205. package/dist/pipeline/compiler/telemetry/index.js +19 -0
  206. package/dist/pipeline/compiler/telemetry/redactor.d.ts +58 -0
  207. package/dist/pipeline/compiler/telemetry/redactor.js +222 -0
  208. package/dist/pipeline/compiler/telemetry/tool-classifier.d.ts +32 -0
  209. package/dist/pipeline/compiler/telemetry/tool-classifier.js +120 -0
  210. package/dist/pipeline/compiler/telemetry/trace-collector.d.ts +75 -0
  211. package/dist/pipeline/compiler/telemetry/trace-collector.js +297 -0
  212. package/dist/pipeline/compiler/telemetry/trace-store.d.ts +78 -0
  213. package/dist/pipeline/compiler/telemetry/trace-store.js +85 -0
  214. package/dist/pipeline/compiler/variable-resolver.d.ts +46 -0
  215. package/dist/pipeline/compiler/variable-resolver.js +115 -0
  216. package/dist/pipeline/coverage-audit.d.ts +15 -5
  217. package/dist/pipeline/coverage-audit.js +41 -22
  218. package/dist/pipeline/eval-constants.d.ts +16 -6
  219. package/dist/pipeline/eval-constants.js +25 -4
  220. package/dist/pipeline/eval-fingerprint.d.ts +2 -2
  221. package/dist/pipeline/eval-fingerprint.js +8 -9
  222. package/dist/pipeline/expand-tasks.d.ts +23 -14
  223. package/dist/pipeline/expand-tasks.js +37 -31
  224. package/dist/pipeline/gap-analysis.d.ts +1 -1
  225. package/dist/pipeline/gap-analysis.js +2 -2
  226. package/dist/pipeline/generate-configs.d.ts +22 -4
  227. package/dist/pipeline/generate-configs.js +53 -24
  228. package/dist/pipeline/grader-api.d.ts +3 -3
  229. package/dist/pipeline/grader-api.js +5 -12
  230. package/dist/pipeline/grader-compare-runner.js +20 -27
  231. package/dist/pipeline/grader-comparison.d.ts +4 -8
  232. package/dist/pipeline/grader-comparison.js +11 -17
  233. package/dist/pipeline/grader-consistency-runner.d.ts +2 -3
  234. package/dist/pipeline/grader-consistency-runner.js +18 -21
  235. package/dist/pipeline/grader-consistency.d.ts +6 -10
  236. package/dist/pipeline/grader-consistency.js +13 -32
  237. package/dist/pipeline/grader-sensitivity-runner.js +7 -5
  238. package/dist/pipeline/grader-sensitivity.d.ts +2 -6
  239. package/dist/pipeline/grader-sensitivity.js +10 -10
  240. package/dist/pipeline/grader-validate-runner.js +7 -5
  241. package/dist/pipeline/grader-validation.d.ts +2 -6
  242. package/dist/pipeline/grader-validation.js +14 -22
  243. package/dist/pipeline/map-request-to-config.js +6 -1
  244. package/dist/pipeline/mirror-repo-tasks.d.ts +6 -6
  245. package/dist/pipeline/mirror-repo-tasks.js +16 -15
  246. package/dist/pipeline/normalize-mode.d.ts +49 -0
  247. package/dist/pipeline/normalize-mode.js +64 -0
  248. package/dist/pipeline/plan.d.ts +5 -2
  249. package/dist/pipeline/plan.js +134 -78
  250. package/dist/pipeline/pr-comment.js +2 -0
  251. package/dist/pipeline/profile-resolution.d.ts +47 -0
  252. package/dist/pipeline/profile-resolution.js +91 -0
  253. package/dist/pipeline/provenance.d.ts +2 -2
  254. package/dist/pipeline/provenance.js +12 -17
  255. package/dist/pipeline/release-report.js +4 -4
  256. package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
  257. package/dist/pipeline/repo-threshold-evaluator.js +1 -1
  258. package/dist/pipeline/rubric-loader.d.ts +20 -0
  259. package/dist/pipeline/rubric-loader.js +37 -0
  260. package/dist/pipeline/validate.d.ts +4 -4
  261. package/dist/pipeline/validate.js +64 -53
  262. package/dist/schedules/loader.js +18 -8
  263. package/dist/scripts/migrate-task-mode.d.ts +24 -0
  264. package/dist/scripts/migrate-task-mode.js +85 -0
  265. package/dist/scripts/migrate-tasks-to-content-lake.js +11 -10
  266. package/dist/scripts/validate-task-sources.d.ts +1 -1
  267. package/dist/scripts/validate-task-sources.js +15 -15
  268. package/dist/sinks/loader.js +5 -7
  269. package/dist/sources.d.ts +7 -7
  270. package/dist/sources.js +22 -24
  271. package/dist/webhook/dispatch.js +2 -1
  272. package/package.json +6 -3
  273. package/tasks/knowledge-probe/define-type-api.task.ts +55 -0
  274. package/tasks/knowledge-probe/groq-projections.task.ts +59 -0
  275. package/tasks/literacy/frameworks.task.ts +128 -0
  276. package/tasks/literacy/functions.task.ts +69 -0
  277. package/tasks/literacy/groq.task.ts +258 -0
  278. package/tasks/literacy/nextjs-live.task.ts +75 -0
  279. package/tasks/literacy/studio-setup.task.ts +131 -0
  280. package/tasks/literacy/visual-editing.task.ts +146 -0
  281. package/config/features.yaml +0 -116
  282. package/config/models.yaml +0 -116
  283. package/config/prompts.yaml +0 -75
  284. package/config/rubrics.yaml +0 -62
  285. package/config/schedules.yaml +0 -43
  286. package/config/sinks.yaml +0 -54
  287. package/config/sources.yaml +0 -51
  288. package/config/thresholds.yaml +0 -49
  289. package/dist/agent-observer/test-imports.d.ts +0 -7
  290. package/dist/agent-observer/test-imports.js +0 -185
@@ -9,12 +9,23 @@
9
9
  * Ports & Adapters migration (Phase 0c). The original file is now a
10
10
  * re-export barrel that preserves backward compatibility.
11
11
  */
12
- import type { ConcreteEvalMode as _ConcreteEvalMode, DocumentRef as _DocumentRef, EvalMode as _EvalMode } from "../../ailf-shared/index.d.ts";
12
+ import type { DocumentRef as _DocumentRef, EvalMode as _EvalMode } from "../../ailf-shared/index.d.ts";
13
13
  export type { ActualScoreEntry, ComponentResult, TestResult, UrlMetadata, } from "./scoring-input.js";
14
14
  export type { DocumentRef } from "../../ailf-shared/index.d.ts";
15
+ export type { StoredBaseline, StoredReport, StoredRun, StoredTaskResult, StoredTrace, SchemaVersioned, } from "./storage-schema.js";
16
+ export { CURRENT_SCHEMA_VERSION, isSchemaVersioned, migrateDocument, } from "./storage-schema.js";
17
+ export type { AssertionRegistration, FixtureResolverRegistration, ModeRegistration, PluginManifest, PluginRegistry, PresetDefinition, ReportSinkRegistration, RubricTemplateRegistration, } from "./plugin-registry.js";
18
+ export { InMemoryPluginRegistry } from "./plugin-registry.js";
19
+ export type { AgentHarnessConfig, AgentHarnessModeConfig, CustomModeConfig, EvalModeConfig, EvalModeType, KnowledgeBaseRef, KnowledgeProbeModeConfig, LiteracyModeConfig, MCPServerConfig, MCPServerModeConfig, ProbeStrategy, SandboxConfig, ToolDef, } from "./eval-mode-config.js";
20
+ export { evalModeType } from "./eval-mode-config.js";
21
+ export type { DependencyEdge, ResolvedFixture, TaskGraph, TaskNode, } from "./task-graph.js";
22
+ export type { VariableDeclaration, VariableEnvelope, VariableProvenance, VariableSource, } from "./variable-envelope.js";
23
+ export type { EvalTrace, ToolCallCategory, ToolCallRecord, TraceEvent, TraceSpan, TraceTokenUsage, } from "./trace.js";
24
+ export type { ArtifactId, Brand, Err, FixtureId, IdValidationError, NewReportId, Ok, ProviderId, PromptId, Result, ResultId, RubricId, RunFingerprint, RunId, SuiteId, TaskId, TaskSlug, TraceId, } from "./branded-ids.js";
25
+ export { err, fixtureId, ok, providerId, resultId, runId, suiteId, taskId, traceId, } from "./branded-ids.js";
26
+ export type { AgentHarnessTaskDefinition, CustomTaskDefinition, GeneralizedAssertionDefinition, GeneralizedDocRef, GeneralizedTaskDefinition, GeneralizedTemplatedAssertion, GeneralizedValueAssertion, IdDocRef, KnowledgeProbeTaskDefinition, LiteracyTaskDefinition, MCPServerTaskDefinition, PathDocRef, PerspectiveDocRef, RubricRef, SlugDocRef, TaskCommonFields, TaskDifficulty, TaskOptions, TaskProviderConfig, TaskStatus, } from "./generalized-task.js";
15
27
  type DocumentRef = _DocumentRef;
16
28
  type EvalMode = _EvalMode;
17
- type ConcreteEvalMode = _ConcreteEvalMode;
18
29
  /** Aggregated retrieval metrics for a feature area */
19
30
  export interface AreaRetrievalMetrics {
20
31
  area: string;
@@ -76,8 +87,7 @@ export interface DebugOptions {
76
87
  /** Random sample of N tests */
77
88
  sample?: number;
78
89
  }
79
- export type { DimensionName, EvalMode, ConcreteEvalMode, } from "../../ailf-shared/index.d.ts";
80
- export { FULL_MODE_SUBMODES } from "../../ailf-shared/index.d.ts";
90
+ export type { EvalMode } from "../../ailf-shared/index.d.ts";
81
91
  /** A classified failure mode with confidence level */
82
92
  export interface FailureMode {
83
93
  /** How confident we are in this classification */
@@ -263,21 +273,11 @@ export interface GraderReliability {
263
273
  totalJudgments: number;
264
274
  /** Recommended noise threshold for comparisons (2× max dimension σ) */
265
275
  recommendedThreshold: number;
266
- /** Per-dimension consistency */
267
- perDimension: {
268
- taskCompletion: {
269
- avgStdDev: number;
270
- maxStdDev: number;
271
- };
272
- codeCorrectness: {
273
- avgStdDev: number;
274
- maxStdDev: number;
275
- };
276
- docCoverage: {
277
- avgStdDev: number;
278
- maxStdDev: number;
279
- };
280
- };
276
+ /** Per-dimension consistency (keyed by dimension name) */
277
+ perDimension: Record<string, {
278
+ avgStdDev: number;
279
+ maxStdDev: number;
280
+ }>;
281
281
  };
282
282
  /** Grader model used for this evaluation */
283
283
  graderModel: string;
@@ -289,21 +289,11 @@ export interface GraderReliability {
289
289
  avgSeparation: number;
290
290
  /** Total paired comparisons analyzed */
291
291
  totalPairs: number;
292
- /** Per-dimension sensitivity */
293
- perDimension: {
294
- taskCompletion: {
295
- concordanceRate: number;
296
- avgSeparation: number;
297
- };
298
- codeCorrectness: {
299
- concordanceRate: number;
300
- avgSeparation: number;
301
- };
302
- docCoverage: {
303
- concordanceRate: number;
304
- avgSeparation: number;
305
- };
306
- };
292
+ /** Per-dimension sensitivity (keyed by dimension name) */
293
+ perDimension: Record<string, {
294
+ concordanceRate: number;
295
+ avgSeparation: number;
296
+ }>;
307
297
  };
308
298
  /** Criterion validity (from human reference grades) — Phase 2 */
309
299
  validity?: {
@@ -313,12 +303,8 @@ export interface GraderReliability {
313
303
  correlation: number;
314
304
  /** Systematic bias (positive = grader scores higher than humans) */
315
305
  bias: number;
316
- /** Per-dimension correlation with human grades */
317
- perDimension: {
318
- taskCompletion: number;
319
- codeCorrectness: number;
320
- docCoverage: number;
321
- };
306
+ /** Per-dimension correlation with human grades (keyed by dimension name) */
307
+ perDimension: Record<string, number>;
322
308
  /** Number of human-graded reference samples */
323
309
  sampleSize: number;
324
310
  /** Whether the grader passes the MAE threshold */
@@ -420,7 +406,7 @@ export interface PipelineOptions {
420
406
  }
421
407
  /** A Promptfoo share URL tagged with the evaluation mode that produced it. */
422
408
  export interface PromptfooUrlEntry {
423
- mode: ConcreteEvalMode;
409
+ mode: string;
424
410
  url: string;
425
411
  }
426
412
  /**
@@ -694,14 +680,18 @@ export interface ScoreSummary {
694
680
  */
695
681
  documentManifest?: DocumentRef[];
696
682
  /**
697
- * Which evaluation modes contributed data to this summary.
683
+ * Which evaluation variant contributed data to this summary.
684
+ *
685
+ * For literacy mode this is a variant name:
698
686
  * - `'full'`: both baseline and agentic data present
699
687
  * - `'baseline'`: floor + ceiling only (no agentic data)
700
688
  * - `'agentic'`: actual only (no floor/ceiling data)
701
689
  * - `'observed'`: observed mode data
690
+ *
691
+ * For non-literacy modes this is the canonical mode name.
702
692
  * Absent in legacy summaries — treat as `'baseline'` for backward compat.
703
693
  */
704
- evaluationMode?: EvalMode;
694
+ evaluationMode?: string;
705
695
  /** Failure mode analysis (Phase 3a) — diagnostic breakdown of why scores are low */
706
696
  failureModes?: FailureModeReport;
707
697
  /**
@@ -870,24 +860,12 @@ export interface AreaDelta {
870
860
  costDelta?: number;
871
861
  /** Overall score delta (experiment − baseline) */
872
862
  delta: number;
873
- /** Per-dimension deltas */
874
- dimensions: {
875
- taskCompletion: {
876
- baseline: number;
877
- experiment: number;
878
- delta: number;
879
- };
880
- codeCorrectness: {
881
- baseline: number;
882
- experiment: number;
883
- delta: number;
884
- };
885
- docCoverage: {
886
- baseline: number;
887
- experiment: number;
888
- delta: number;
889
- };
890
- };
863
+ /** Per-dimension deltas (keyed by dimension name, e.g. 'taskCompletion') */
864
+ dimensions: Record<string, {
865
+ baseline: number;
866
+ experiment: number;
867
+ delta: number;
868
+ }>;
891
869
  /** Doc Lift delta */
892
870
  docLiftDelta: number;
893
871
  /** Experiment total score */
@@ -958,11 +936,7 @@ export interface ComparisonReport {
958
936
  /** Per-area total score deltas */
959
937
  perArea: Record<string, number>;
960
938
  /** Per-dimension average deltas (across all areas) */
961
- perDimension: {
962
- taskCompletion: number;
963
- codeCorrectness: number;
964
- docCoverage: number;
965
- };
939
+ perDimension: Record<string, number>;
966
940
  /** Doc Lift average delta */
967
941
  docLift: number;
968
942
  /** Cost delta (if both runs have cost data) */
@@ -1018,21 +992,11 @@ export interface ConfidenceAnnotation {
1018
992
  * Matches the shape produced by pipeline/grader-consistency.ts.
1019
993
  */
1020
994
  export interface GraderConsistencyData {
1021
- /** Per-dimension consistency metrics */
1022
- perDimension: {
1023
- codeCorrectness: {
1024
- avgStdDev: number;
1025
- maxStdDev: number;
1026
- };
1027
- docCoverage: {
1028
- avgStdDev: number;
1029
- maxStdDev: number;
1030
- };
1031
- taskCompletion: {
1032
- avgStdDev: number;
1033
- maxStdDev: number;
1034
- };
1035
- };
995
+ /** Per-dimension consistency metrics (keyed by dimension name) */
996
+ perDimension: Record<string, {
997
+ avgStdDev: number;
998
+ maxStdDev: number;
999
+ }>;
1036
1000
  /** Recommended noise threshold for comparisons (2× max dimension σ) */
1037
1001
  recommendedThreshold: number;
1038
1002
  }
@@ -9,7 +9,14 @@
9
9
  * Ports & Adapters migration (Phase 0c). The original file is now a
10
10
  * re-export barrel that preserves backward compatibility.
11
11
  */
12
- export { FULL_MODE_SUBMODES } from "../../ailf-shared/index.js";
12
+ export { CURRENT_SCHEMA_VERSION, isSchemaVersioned, migrateDocument, } from "./storage-schema.js";
13
+ export { InMemoryPluginRegistry } from "./plugin-registry.js";
14
+ // Note: DocSourceConfig is NOT re-exported here — it conflicts with the
15
+ // existing DocSourceConfig in ports/doc-fetcher.ts. The eval-mode-config
16
+ // version is used internally by LiteracyModeConfig. If consumers need
17
+ // the mode-specific version, they import from "./eval-mode-config.js".
18
+ export { evalModeType } from "./eval-mode-config.js";
19
+ export { err, fixtureId, ok, providerId, resultId, runId, suiteId, taskId, traceId, } from "./branded-ids.js";
13
20
  // ---------------------------------------------------------------------------
14
21
  // Comparison (Approach 2: structured comparison output)
15
22
  // ---------------------------------------------------------------------------
@@ -0,0 +1,202 @@
1
+ /**
2
+ * Plugin registry — typed extension points for AILF evaluation capabilities.
3
+ *
4
+ * Twelve extension points: evaluation modes, providers, assertions,
5
+ * rubric templates, fixture resolvers, report sinks, dashboard renderers,
6
+ * prompt templates, scoring profiles, doc fetcher factory, source defs,
7
+ * and feature defs.
8
+ *
9
+ * Presets bundle multiple extensions into a single installable unit.
10
+ *
11
+ * @see docs/design-docs/architecture-overhaul/extensibility-plugins.md
12
+ */
13
+ import type { PromptTemplate } from "../ports/mode-handler.js";
14
+ import type { DocFetcher } from "../ports/doc-fetcher.js";
15
+ import type { SourceEntry } from "../config-helpers.js";
16
+ import type { FeatureRegistry } from "../schemas/pipeline.js";
17
+ /** A registered evaluation mode handler */
18
+ export interface ModeRegistration {
19
+ /** Unique mode identifier (e.g., "api-contract") */
20
+ id: string;
21
+ /** Human-readable label */
22
+ label: string;
23
+ /** Valid provider pattern regexes */
24
+ validProviderPatterns: string[];
25
+ /** Rubric template IDs available for this mode */
26
+ rubricTemplateIds: string[];
27
+ /** Compile function module path (loaded at runtime) */
28
+ handlerModule: string;
29
+ }
30
+ /** A registered assertion type */
31
+ export interface AssertionRegistration {
32
+ /** Assertion type name (e.g., "api-contract-match") */
33
+ type: string;
34
+ /** Human-readable label */
35
+ label: string;
36
+ /** Which modes this assertion is compatible with */
37
+ compatibleModes: string[];
38
+ /** Assertion handler module path */
39
+ handlerModule: string;
40
+ }
41
+ /** A registered rubric template */
42
+ export interface RubricTemplateRegistration {
43
+ /** Template ID (e.g., "api-accuracy") */
44
+ id: string;
45
+ /** Scoring dimension this template contributes to */
46
+ dimension: string;
47
+ /** Scale header text */
48
+ header: string;
49
+ /** Scale entries */
50
+ scale: string[];
51
+ /** Criteria label */
52
+ criteriaLabel?: string;
53
+ }
54
+ /** A registered fixture resolver */
55
+ export interface FixtureResolverRegistration {
56
+ /** URI scheme this resolver handles (e.g., "graphql://") */
57
+ scheme: string;
58
+ /** Resolver module path */
59
+ handlerModule: string;
60
+ }
61
+ /** A registered report sink */
62
+ export interface ReportSinkRegistration {
63
+ /** Sink identifier (e.g., "bigquery", "slack") */
64
+ id: string;
65
+ /** Sink module path */
66
+ handlerModule: string;
67
+ }
68
+ /** Plugin manifest describing a single plugin */
69
+ export interface PluginManifest {
70
+ /** Plugin name (npm package style) */
71
+ name: string;
72
+ /** Semver version */
73
+ version: string;
74
+ /** Human-readable description */
75
+ description?: string;
76
+ /** Plugin API version this plugin targets */
77
+ pluginApiVersion: number;
78
+ /** Minimum AILF version required */
79
+ minAILFVersion?: string;
80
+ /** Dependencies on other plugins */
81
+ requires?: string[];
82
+ }
83
+ /** A preset bundles multiple extensions into an installable unit */
84
+ export interface PresetDefinition {
85
+ /** Preset name */
86
+ name: string;
87
+ /** Plugin manifest */
88
+ manifest: PluginManifest;
89
+ /** Evaluation modes to register */
90
+ modes?: ModeRegistration[];
91
+ /** Assertion types to register */
92
+ assertions?: AssertionRegistration[];
93
+ /** Rubric templates to register */
94
+ rubricTemplates?: RubricTemplateRegistration[];
95
+ /** Fixture resolvers to register */
96
+ fixtureResolvers?: FixtureResolverRegistration[];
97
+ /** Report sinks to register */
98
+ reportSinks?: ReportSinkRegistration[];
99
+ /** Prompt templates keyed by template name (e.g. "with-docs", "agentic") */
100
+ promptTemplates?: Record<string, PromptTemplate>;
101
+ /** Scoring profiles mapping profile name to dimension-weight pairs */
102
+ scoringProfiles?: Record<string, Record<string, number>>;
103
+ /** Factory function that creates a DocFetcher instance */
104
+ docFetcher?: () => DocFetcher;
105
+ /** Documentation source definitions (production, branch, local, etc.) */
106
+ sourceDefs?: SourceEntry[];
107
+ /** Product feature registry for coverage tracking */
108
+ featureDefs?: FeatureRegistry;
109
+ }
110
+ /**
111
+ * PluginRegistry — central registry for all AILF extensions.
112
+ *
113
+ * Plugins register their capabilities here. The pipeline queries the
114
+ * registry to discover available modes, assertions, templates, etc.
115
+ */
116
+ export interface PluginRegistry {
117
+ /** Register a complete preset (bundles multiple extensions) */
118
+ registerPreset(preset: PresetDefinition): void;
119
+ /** Register a single evaluation mode */
120
+ registerMode(mode: ModeRegistration): void;
121
+ /** Register a single assertion type */
122
+ registerAssertion(assertion: AssertionRegistration): void;
123
+ /** Register a rubric template */
124
+ registerRubricTemplate(template: RubricTemplateRegistration): void;
125
+ /** Register a fixture resolver */
126
+ registerFixtureResolver(resolver: FixtureResolverRegistration): void;
127
+ /** Register a report sink */
128
+ registerReportSink(sink: ReportSinkRegistration): void;
129
+ /** Get all registered modes */
130
+ getModes(): ModeRegistration[];
131
+ /** Get a mode by ID */
132
+ getMode(id: string): ModeRegistration | undefined;
133
+ /** Get all registered assertion types */
134
+ getAssertions(): AssertionRegistration[];
135
+ /** Get all registered rubric templates */
136
+ getRubricTemplates(): RubricTemplateRegistration[];
137
+ /** Get all registered fixture resolvers */
138
+ getFixtureResolvers(): FixtureResolverRegistration[];
139
+ /** Get all registered report sinks */
140
+ getReportSinks(): ReportSinkRegistration[];
141
+ /** Register prompt templates (merged with existing) */
142
+ registerPromptTemplates(templates: Record<string, PromptTemplate>): void;
143
+ /** Get all registered prompt templates */
144
+ getPromptTemplates(): Record<string, PromptTemplate>;
145
+ /** Register scoring profiles (merged with existing) */
146
+ registerScoringProfiles(profiles: Record<string, Record<string, number>>): void;
147
+ /** Get all registered scoring profiles */
148
+ getScoringProfiles(): Record<string, Record<string, number>>;
149
+ /** Register a doc fetcher factory (last-write-wins) */
150
+ registerDocFetcherFactory(factory: () => DocFetcher): void;
151
+ /** Get the registered doc fetcher factory, if any */
152
+ getDocFetcherFactory(): (() => DocFetcher) | undefined;
153
+ /** Register source definitions (concatenated with existing) */
154
+ registerSourceDefs(sources: SourceEntry[]): void;
155
+ /** Get all registered source definitions */
156
+ getSourceDefs(): SourceEntry[];
157
+ /** Register a feature registry (last-write-wins) */
158
+ registerFeatureDefs(features: FeatureRegistry): void;
159
+ /** Get the registered feature registry, if any */
160
+ getFeatureDefs(): FeatureRegistry | undefined;
161
+ /** Get all registered presets */
162
+ getPresets(): PresetDefinition[];
163
+ }
164
+ /**
165
+ * In-memory plugin registry implementation.
166
+ */
167
+ export declare class InMemoryPluginRegistry implements PluginRegistry {
168
+ private readonly modes;
169
+ private readonly assertions_;
170
+ private readonly rubricTemplates_;
171
+ private readonly fixtureResolvers_;
172
+ private readonly reportSinks_;
173
+ private readonly presets_;
174
+ private promptTemplates_;
175
+ private scoringProfiles_;
176
+ private docFetcherFactory_;
177
+ private sourceDefs_;
178
+ private featureDefs_;
179
+ registerPreset(preset: PresetDefinition): void;
180
+ registerMode(mode: ModeRegistration): void;
181
+ registerAssertion(assertion: AssertionRegistration): void;
182
+ registerRubricTemplate(template: RubricTemplateRegistration): void;
183
+ registerFixtureResolver(resolver: FixtureResolverRegistration): void;
184
+ registerReportSink(sink: ReportSinkRegistration): void;
185
+ getModes(): ModeRegistration[];
186
+ getMode(id: string): ModeRegistration | undefined;
187
+ getAssertions(): AssertionRegistration[];
188
+ getRubricTemplates(): RubricTemplateRegistration[];
189
+ getFixtureResolvers(): FixtureResolverRegistration[];
190
+ getReportSinks(): ReportSinkRegistration[];
191
+ getPresets(): PresetDefinition[];
192
+ registerPromptTemplates(templates: Record<string, PromptTemplate>): void;
193
+ getPromptTemplates(): Record<string, PromptTemplate>;
194
+ registerScoringProfiles(profiles: Record<string, Record<string, number>>): void;
195
+ getScoringProfiles(): Record<string, Record<string, number>>;
196
+ registerDocFetcherFactory(factory: () => DocFetcher): void;
197
+ getDocFetcherFactory(): (() => DocFetcher) | undefined;
198
+ registerSourceDefs(sources: SourceEntry[]): void;
199
+ getSourceDefs(): SourceEntry[];
200
+ registerFeatureDefs(features: FeatureRegistry): void;
201
+ getFeatureDefs(): FeatureRegistry | undefined;
202
+ }
@@ -0,0 +1,132 @@
1
+ /**
2
+ * Plugin registry — typed extension points for AILF evaluation capabilities.
3
+ *
4
+ * Twelve extension points: evaluation modes, providers, assertions,
5
+ * rubric templates, fixture resolvers, report sinks, dashboard renderers,
6
+ * prompt templates, scoring profiles, doc fetcher factory, source defs,
7
+ * and feature defs.
8
+ *
9
+ * Presets bundle multiple extensions into a single installable unit.
10
+ *
11
+ * @see docs/design-docs/architecture-overhaul/extensibility-plugins.md
12
+ */
13
+ /**
14
+ * In-memory plugin registry implementation.
15
+ */
16
+ export class InMemoryPluginRegistry {
17
+ modes = new Map();
18
+ assertions_ = new Map();
19
+ rubricTemplates_ = new Map();
20
+ fixtureResolvers_ = new Map();
21
+ reportSinks_ = new Map();
22
+ presets_ = new Map();
23
+ promptTemplates_ = {};
24
+ scoringProfiles_ = {};
25
+ docFetcherFactory_;
26
+ sourceDefs_ = [];
27
+ featureDefs_;
28
+ registerPreset(preset) {
29
+ this.presets_.set(preset.name, preset);
30
+ if (preset.modes) {
31
+ for (const mode of preset.modes)
32
+ this.registerMode(mode);
33
+ }
34
+ if (preset.assertions) {
35
+ for (const a of preset.assertions)
36
+ this.registerAssertion(a);
37
+ }
38
+ if (preset.rubricTemplates) {
39
+ for (const t of preset.rubricTemplates)
40
+ this.registerRubricTemplate(t);
41
+ }
42
+ if (preset.fixtureResolvers) {
43
+ for (const r of preset.fixtureResolvers)
44
+ this.registerFixtureResolver(r);
45
+ }
46
+ if (preset.reportSinks) {
47
+ for (const s of preset.reportSinks)
48
+ this.registerReportSink(s);
49
+ }
50
+ if (preset.promptTemplates) {
51
+ this.registerPromptTemplates(preset.promptTemplates);
52
+ }
53
+ if (preset.scoringProfiles) {
54
+ this.registerScoringProfiles(preset.scoringProfiles);
55
+ }
56
+ if (preset.docFetcher) {
57
+ this.registerDocFetcherFactory(preset.docFetcher);
58
+ }
59
+ if (preset.sourceDefs) {
60
+ this.registerSourceDefs(preset.sourceDefs);
61
+ }
62
+ if (preset.featureDefs) {
63
+ this.registerFeatureDefs(preset.featureDefs);
64
+ }
65
+ }
66
+ registerMode(mode) {
67
+ this.modes.set(mode.id, mode);
68
+ }
69
+ registerAssertion(assertion) {
70
+ this.assertions_.set(assertion.type, assertion);
71
+ }
72
+ registerRubricTemplate(template) {
73
+ this.rubricTemplates_.set(template.id, template);
74
+ }
75
+ registerFixtureResolver(resolver) {
76
+ this.fixtureResolvers_.set(resolver.scheme, resolver);
77
+ }
78
+ registerReportSink(sink) {
79
+ this.reportSinks_.set(sink.id, sink);
80
+ }
81
+ getModes() {
82
+ return [...this.modes.values()];
83
+ }
84
+ getMode(id) {
85
+ return this.modes.get(id);
86
+ }
87
+ getAssertions() {
88
+ return [...this.assertions_.values()];
89
+ }
90
+ getRubricTemplates() {
91
+ return [...this.rubricTemplates_.values()];
92
+ }
93
+ getFixtureResolvers() {
94
+ return [...this.fixtureResolvers_.values()];
95
+ }
96
+ getReportSinks() {
97
+ return [...this.reportSinks_.values()];
98
+ }
99
+ getPresets() {
100
+ return [...this.presets_.values()];
101
+ }
102
+ registerPromptTemplates(templates) {
103
+ Object.assign(this.promptTemplates_, templates);
104
+ }
105
+ getPromptTemplates() {
106
+ return this.promptTemplates_;
107
+ }
108
+ registerScoringProfiles(profiles) {
109
+ Object.assign(this.scoringProfiles_, profiles);
110
+ }
111
+ getScoringProfiles() {
112
+ return this.scoringProfiles_;
113
+ }
114
+ registerDocFetcherFactory(factory) {
115
+ this.docFetcherFactory_ = factory;
116
+ }
117
+ getDocFetcherFactory() {
118
+ return this.docFetcherFactory_;
119
+ }
120
+ registerSourceDefs(sources) {
121
+ this.sourceDefs_ = [...this.sourceDefs_, ...sources];
122
+ }
123
+ getSourceDefs() {
124
+ return this.sourceDefs_;
125
+ }
126
+ registerFeatureDefs(features) {
127
+ this.featureDefs_ = features;
128
+ }
129
+ getFeatureDefs() {
130
+ return this.featureDefs_;
131
+ }
132
+ }