@sanity/ailf 0.4.1 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (290) hide show
  1. package/config/features.ts +23 -0
  2. package/config/models.ts +83 -0
  3. package/config/prompts.ts +16 -0
  4. package/config/rubrics.ts +225 -0
  5. package/config/schedules.ts +47 -0
  6. package/config/sinks.ts +37 -0
  7. package/config/sources.ts +21 -0
  8. package/config/thresholds.ts +61 -0
  9. package/dist/_vendor/ailf-core/config-helpers.d.ts +174 -0
  10. package/dist/_vendor/ailf-core/config-helpers.js +150 -0
  11. package/dist/_vendor/ailf-core/env-helper.d.ts +35 -0
  12. package/dist/_vendor/ailf-core/env-helper.js +45 -0
  13. package/dist/_vendor/ailf-core/examples/index.d.ts +10 -10
  14. package/dist/_vendor/ailf-core/examples/index.js +10 -10
  15. package/dist/_vendor/ailf-core/index.d.ts +3 -0
  16. package/dist/_vendor/ailf-core/index.js +5 -0
  17. package/dist/_vendor/ailf-core/ports/context.d.ts +15 -2
  18. package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +2 -2
  19. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -1
  20. package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +129 -0
  21. package/dist/_vendor/ailf-core/ports/mode-handler.js +19 -0
  22. package/dist/_vendor/ailf-core/ports/task-source.d.ts +16 -122
  23. package/dist/_vendor/ailf-core/ports/task-source.js +7 -7
  24. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +7 -2
  25. package/dist/_vendor/ailf-core/schemas/eval-config.js +7 -2
  26. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +8 -3
  27. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +6 -1
  28. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +32 -31
  29. package/dist/_vendor/ailf-core/schemas/pipeline.js +52 -12
  30. package/dist/_vendor/ailf-core/schemas/schedules.d.ts +14 -4
  31. package/dist/_vendor/ailf-core/schemas/schedules.js +6 -2
  32. package/dist/_vendor/ailf-core/schemas/sinks.d.ts +1 -1
  33. package/dist/_vendor/ailf-core/services/comparison-formatters.js +57 -19
  34. package/dist/_vendor/ailf-core/services/index.d.ts +2 -1
  35. package/dist/_vendor/ailf-core/services/index.js +2 -1
  36. package/dist/_vendor/ailf-core/services/scoring-engine.d.ts +153 -0
  37. package/dist/_vendor/ailf-core/services/scoring-engine.js +237 -0
  38. package/dist/_vendor/ailf-core/services/scoring.d.ts +15 -2
  39. package/dist/_vendor/ailf-core/services/scoring.js +25 -15
  40. package/dist/_vendor/ailf-core/types/branded-ids.d.ts +137 -0
  41. package/dist/_vendor/ailf-core/types/branded-ids.js +136 -0
  42. package/dist/_vendor/ailf-core/types/eval-mode-config.d.ts +150 -0
  43. package/dist/_vendor/ailf-core/types/eval-mode-config.js +24 -0
  44. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +319 -0
  45. package/dist/_vendor/ailf-core/types/generalized-task.js +13 -0
  46. package/dist/_vendor/ailf-core/types/index.d.ts +45 -81
  47. package/dist/_vendor/ailf-core/types/index.js +8 -1
  48. package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +202 -0
  49. package/dist/_vendor/ailf-core/types/plugin-registry.js +132 -0
  50. package/dist/_vendor/ailf-core/types/storage-schema.d.ts +199 -0
  51. package/dist/_vendor/ailf-core/types/storage-schema.js +39 -0
  52. package/dist/_vendor/ailf-core/types/task-graph.d.ts +86 -0
  53. package/dist/_vendor/ailf-core/types/task-graph.js +20 -0
  54. package/dist/_vendor/ailf-core/types/trace.d.ts +118 -0
  55. package/dist/_vendor/ailf-core/types/trace.js +18 -0
  56. package/dist/_vendor/ailf-core/types/variable-envelope.d.ts +80 -0
  57. package/dist/_vendor/ailf-core/types/variable-envelope.js +16 -0
  58. package/dist/_vendor/ailf-shared/dimension-names.d.ts +5 -18
  59. package/dist/_vendor/ailf-shared/dimension-names.js +6 -24
  60. package/dist/_vendor/ailf-shared/eval-modes.d.ts +38 -6
  61. package/dist/_vendor/ailf-shared/eval-modes.js +26 -2
  62. package/dist/_vendor/ailf-shared/index.d.ts +0 -1
  63. package/dist/_vendor/ailf-shared/index.js +0 -1
  64. package/dist/adapters/api-client/build-request.js +14 -13
  65. package/dist/adapters/config-sources/file-config-adapter.d.ts +20 -11
  66. package/dist/adapters/config-sources/file-config-adapter.js +38 -12
  67. package/dist/adapters/config-sources/index.d.ts +2 -0
  68. package/dist/adapters/config-sources/index.js +1 -0
  69. package/dist/adapters/config-sources/ts-config-loader.d.ts +59 -0
  70. package/dist/adapters/config-sources/ts-config-loader.js +133 -0
  71. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +3 -2
  72. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +7 -2
  73. package/dist/adapters/task-sources/composite-task-source.d.ts +3 -3
  74. package/dist/adapters/task-sources/composite-task-source.js +1 -1
  75. package/dist/adapters/task-sources/content-lake-task-source.d.ts +7 -6
  76. package/dist/adapters/task-sources/content-lake-task-source.js +22 -23
  77. package/dist/adapters/task-sources/index.d.ts +1 -0
  78. package/dist/adapters/task-sources/index.js +1 -0
  79. package/dist/adapters/task-sources/repo-task-source.d.ts +4 -4
  80. package/dist/adapters/task-sources/repo-task-source.js +69 -16
  81. package/dist/adapters/task-sources/task-file-loader.d.ts +64 -0
  82. package/dist/adapters/task-sources/task-file-loader.js +83 -0
  83. package/dist/adapters/task-sources/yaml-task-source.d.ts +6 -6
  84. package/dist/adapters/task-sources/yaml-task-source.js +19 -16
  85. package/dist/cli.js +0 -2
  86. package/dist/commands/baseline.js +4 -1
  87. package/dist/commands/calculate-scores.js +1 -1
  88. package/dist/commands/coverage-audit.js +7 -1
  89. package/dist/commands/explain-handler.js +25 -23
  90. package/dist/commands/fetch-docs.js +3 -2
  91. package/dist/commands/generate-configs.js +1 -1
  92. package/dist/commands/interactive.js +11 -7
  93. package/dist/commands/pipeline-action.d.ts +2 -0
  94. package/dist/commands/pipeline-action.js +16 -6
  95. package/dist/commands/pipeline.d.ts +1 -0
  96. package/dist/commands/pipeline.js +4 -2
  97. package/dist/commands/pr-comment.js +1 -1
  98. package/dist/commands/publish.js +2 -2
  99. package/dist/commands/readiness-report.js +13 -6
  100. package/dist/composition-root.d.ts +1 -1
  101. package/dist/composition-root.js +67 -4
  102. package/dist/orchestration/build-app-context.js +1 -0
  103. package/dist/orchestration/build-step-sequence.js +24 -6
  104. package/dist/orchestration/steps/calculate-scores-step.js +24 -11
  105. package/dist/orchestration/steps/fetch-docs-step.js +6 -4
  106. package/dist/orchestration/steps/gap-analysis-step.js +8 -7
  107. package/dist/orchestration/steps/generate-configs-step.d.ts +16 -3
  108. package/dist/orchestration/steps/generate-configs-step.js +245 -51
  109. package/dist/orchestration/steps/grader-consistency-step.js +7 -4
  110. package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
  111. package/dist/orchestration/steps/readiness-step.js +5 -6
  112. package/dist/orchestration/steps/run-eval-step.d.ts +1 -2
  113. package/dist/orchestration/steps/run-eval-step.js +8 -7
  114. package/dist/pipeline/cache.d.ts +1 -1
  115. package/dist/pipeline/cache.js +36 -8
  116. package/dist/pipeline/calculate-scores.d.ts +5 -7
  117. package/dist/pipeline/calculate-scores.js +74 -153
  118. package/dist/pipeline/checks.js +2 -2
  119. package/dist/pipeline/compare.js +8 -8
  120. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.d.ts +10 -0
  121. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +288 -0
  122. package/dist/pipeline/compiler/__tests__/assertion-mapper.test.d.ts +9 -0
  123. package/dist/pipeline/compiler/__tests__/assertion-mapper.test.js +145 -0
  124. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.d.ts +10 -0
  125. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +314 -0
  126. package/dist/pipeline/compiler/__tests__/literacy-handler.test.d.ts +10 -0
  127. package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +486 -0
  128. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.d.ts +10 -0
  129. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +355 -0
  130. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.d.ts +9 -0
  131. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +333 -0
  132. package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.d.ts +12 -0
  133. package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.js +210 -0
  134. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.d.ts +7 -0
  135. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +471 -0
  136. package/dist/pipeline/compiler/__tests__/scoring-bridge.test.d.ts +10 -0
  137. package/dist/pipeline/compiler/__tests__/scoring-bridge.test.js +184 -0
  138. package/dist/pipeline/compiler/__tests__/task-graph-builder.test.d.ts +8 -0
  139. package/dist/pipeline/compiler/__tests__/task-graph-builder.test.js +301 -0
  140. package/dist/pipeline/compiler/__tests__/telemetry.test.d.ts +9 -0
  141. package/dist/pipeline/compiler/__tests__/telemetry.test.js +503 -0
  142. package/dist/pipeline/compiler/assertion-mapper.d.ts +58 -0
  143. package/dist/pipeline/compiler/assertion-mapper.js +175 -0
  144. package/dist/pipeline/compiler/compiler-to-yaml.d.ts +51 -0
  145. package/dist/pipeline/compiler/compiler-to-yaml.js +222 -0
  146. package/dist/pipeline/compiler/config-loader.d.ts +56 -0
  147. package/dist/pipeline/compiler/config-loader.js +111 -0
  148. package/dist/pipeline/compiler/fixture-resolver.d.ts +41 -0
  149. package/dist/pipeline/compiler/fixture-resolver.js +113 -0
  150. package/dist/pipeline/compiler/hash.d.ts +11 -0
  151. package/dist/pipeline/compiler/hash.js +18 -0
  152. package/dist/pipeline/compiler/ignore-fields.d.ts +53 -0
  153. package/dist/pipeline/compiler/ignore-fields.js +113 -0
  154. package/dist/pipeline/compiler/index.d.ts +29 -0
  155. package/dist/pipeline/compiler/index.js +45 -0
  156. package/dist/pipeline/compiler/literacy-bridge.d.ts +102 -0
  157. package/dist/pipeline/compiler/literacy-bridge.js +172 -0
  158. package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.d.ts +14 -0
  159. package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.js +152 -0
  160. package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.d.ts +32 -0
  161. package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.js +176 -0
  162. package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.d.ts +49 -0
  163. package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.js +259 -0
  164. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +70 -0
  165. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +485 -0
  166. package/dist/pipeline/compiler/mode-handlers/index.d.ts +16 -0
  167. package/dist/pipeline/compiler/mode-handlers/index.js +21 -0
  168. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +76 -0
  169. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +245 -0
  170. package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +89 -0
  171. package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +379 -0
  172. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +50 -0
  173. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +277 -0
  174. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +67 -0
  175. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +309 -0
  176. package/dist/pipeline/compiler/presets/index.d.ts +9 -0
  177. package/dist/pipeline/compiler/presets/index.js +8 -0
  178. package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +45 -0
  179. package/dist/pipeline/compiler/presets/sanity-literacy.js +354 -0
  180. package/dist/pipeline/compiler/promptfoo-compiler.d.ts +96 -0
  181. package/dist/pipeline/compiler/promptfoo-compiler.js +230 -0
  182. package/dist/pipeline/compiler/provider-assembler.d.ts +39 -0
  183. package/dist/pipeline/compiler/provider-assembler.js +137 -0
  184. package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +21 -0
  185. package/dist/pipeline/compiler/sandbox/docker-sandbox.js +136 -0
  186. package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +69 -0
  187. package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +189 -0
  188. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +20 -0
  189. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +114 -0
  190. package/dist/pipeline/compiler/sandbox/index.d.ts +10 -0
  191. package/dist/pipeline/compiler/sandbox/index.js +11 -0
  192. package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +35 -0
  193. package/dist/pipeline/compiler/sandbox/sandbox-selector.js +86 -0
  194. package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +81 -0
  195. package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +15 -0
  196. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +20 -0
  197. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +74 -0
  198. package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -0
  199. package/dist/pipeline/compiler/scoring-bridge.js +114 -0
  200. package/dist/pipeline/compiler/task-graph-builder.d.ts +54 -0
  201. package/dist/pipeline/compiler/task-graph-builder.js +291 -0
  202. package/dist/pipeline/compiler/telemetry/cost-tracker.d.ts +90 -0
  203. package/dist/pipeline/compiler/telemetry/cost-tracker.js +146 -0
  204. package/dist/pipeline/compiler/telemetry/index.d.ts +14 -0
  205. package/dist/pipeline/compiler/telemetry/index.js +19 -0
  206. package/dist/pipeline/compiler/telemetry/redactor.d.ts +58 -0
  207. package/dist/pipeline/compiler/telemetry/redactor.js +222 -0
  208. package/dist/pipeline/compiler/telemetry/tool-classifier.d.ts +32 -0
  209. package/dist/pipeline/compiler/telemetry/tool-classifier.js +120 -0
  210. package/dist/pipeline/compiler/telemetry/trace-collector.d.ts +75 -0
  211. package/dist/pipeline/compiler/telemetry/trace-collector.js +297 -0
  212. package/dist/pipeline/compiler/telemetry/trace-store.d.ts +78 -0
  213. package/dist/pipeline/compiler/telemetry/trace-store.js +85 -0
  214. package/dist/pipeline/compiler/variable-resolver.d.ts +46 -0
  215. package/dist/pipeline/compiler/variable-resolver.js +115 -0
  216. package/dist/pipeline/coverage-audit.d.ts +15 -5
  217. package/dist/pipeline/coverage-audit.js +41 -22
  218. package/dist/pipeline/eval-constants.d.ts +16 -6
  219. package/dist/pipeline/eval-constants.js +25 -4
  220. package/dist/pipeline/eval-fingerprint.d.ts +2 -2
  221. package/dist/pipeline/eval-fingerprint.js +8 -9
  222. package/dist/pipeline/expand-tasks.d.ts +23 -14
  223. package/dist/pipeline/expand-tasks.js +37 -31
  224. package/dist/pipeline/gap-analysis.d.ts +1 -1
  225. package/dist/pipeline/gap-analysis.js +2 -2
  226. package/dist/pipeline/generate-configs.d.ts +22 -4
  227. package/dist/pipeline/generate-configs.js +53 -24
  228. package/dist/pipeline/grader-api.d.ts +3 -3
  229. package/dist/pipeline/grader-api.js +5 -12
  230. package/dist/pipeline/grader-compare-runner.js +20 -27
  231. package/dist/pipeline/grader-comparison.d.ts +4 -8
  232. package/dist/pipeline/grader-comparison.js +11 -17
  233. package/dist/pipeline/grader-consistency-runner.d.ts +2 -3
  234. package/dist/pipeline/grader-consistency-runner.js +18 -21
  235. package/dist/pipeline/grader-consistency.d.ts +6 -10
  236. package/dist/pipeline/grader-consistency.js +13 -32
  237. package/dist/pipeline/grader-sensitivity-runner.js +7 -5
  238. package/dist/pipeline/grader-sensitivity.d.ts +2 -6
  239. package/dist/pipeline/grader-sensitivity.js +10 -10
  240. package/dist/pipeline/grader-validate-runner.js +7 -5
  241. package/dist/pipeline/grader-validation.d.ts +2 -6
  242. package/dist/pipeline/grader-validation.js +14 -22
  243. package/dist/pipeline/map-request-to-config.js +6 -1
  244. package/dist/pipeline/mirror-repo-tasks.d.ts +6 -6
  245. package/dist/pipeline/mirror-repo-tasks.js +16 -15
  246. package/dist/pipeline/normalize-mode.d.ts +49 -0
  247. package/dist/pipeline/normalize-mode.js +64 -0
  248. package/dist/pipeline/plan.d.ts +5 -2
  249. package/dist/pipeline/plan.js +134 -78
  250. package/dist/pipeline/pr-comment.js +2 -0
  251. package/dist/pipeline/profile-resolution.d.ts +47 -0
  252. package/dist/pipeline/profile-resolution.js +91 -0
  253. package/dist/pipeline/provenance.d.ts +2 -2
  254. package/dist/pipeline/provenance.js +12 -17
  255. package/dist/pipeline/release-report.js +4 -4
  256. package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
  257. package/dist/pipeline/repo-threshold-evaluator.js +1 -1
  258. package/dist/pipeline/rubric-loader.d.ts +20 -0
  259. package/dist/pipeline/rubric-loader.js +37 -0
  260. package/dist/pipeline/validate.d.ts +4 -4
  261. package/dist/pipeline/validate.js +64 -53
  262. package/dist/schedules/loader.js +18 -8
  263. package/dist/scripts/migrate-task-mode.d.ts +24 -0
  264. package/dist/scripts/migrate-task-mode.js +85 -0
  265. package/dist/scripts/migrate-tasks-to-content-lake.js +11 -10
  266. package/dist/scripts/validate-task-sources.d.ts +1 -1
  267. package/dist/scripts/validate-task-sources.js +15 -15
  268. package/dist/sinks/loader.js +5 -7
  269. package/dist/sources.d.ts +7 -7
  270. package/dist/sources.js +22 -24
  271. package/dist/webhook/dispatch.js +2 -1
  272. package/package.json +6 -3
  273. package/tasks/knowledge-probe/define-type-api.task.ts +55 -0
  274. package/tasks/knowledge-probe/groq-projections.task.ts +59 -0
  275. package/tasks/literacy/frameworks.task.ts +128 -0
  276. package/tasks/literacy/functions.task.ts +69 -0
  277. package/tasks/literacy/groq.task.ts +258 -0
  278. package/tasks/literacy/nextjs-live.task.ts +75 -0
  279. package/tasks/literacy/studio-setup.task.ts +131 -0
  280. package/tasks/literacy/visual-editing.task.ts +146 -0
  281. package/config/features.yaml +0 -116
  282. package/config/models.yaml +0 -116
  283. package/config/prompts.yaml +0 -75
  284. package/config/rubrics.yaml +0 -62
  285. package/config/schedules.yaml +0 -43
  286. package/config/sinks.yaml +0 -54
  287. package/config/sources.yaml +0 -51
  288. package/config/thresholds.yaml +0 -49
  289. package/dist/agent-observer/test-imports.d.ts +0 -7
  290. package/dist/agent-observer/test-imports.js +0 -185
@@ -0,0 +1,199 @@
1
+ /**
2
+ * Storage schema types for Content Lake documents.
3
+ *
4
+ * All AILF document types include `schemaVersion` for forward-compatible
5
+ * migration. Full trace data goes to blob storage (GCP/local); only
6
+ * sanitized summaries are stored in Content Lake.
7
+ *
8
+ * SECURITY: The `next` dataset is publicly accessible. Never store
9
+ * sensitive data (API keys, full tool call arguments, raw outputs with
10
+ * PII) in Content Lake. Use blob storage + traceDataUri references.
11
+ *
12
+ * @see docs/design-docs/architecture-overhaul/storage-schema.md
13
+ */
14
+ import type { VariableEnvelope } from "./variable-envelope.js";
15
+ /** Current schema version for all AILF document types */
16
+ export declare const CURRENT_SCHEMA_VERSION = 1;
17
+ /** Schema-versioned document base */
18
+ export interface SchemaVersioned {
19
+ /** Schema version for forward-compatible migration */
20
+ schemaVersion: number;
21
+ }
22
+ /** An evaluation run — one execution of the pipeline */
23
+ export interface StoredRun extends SchemaVersioned {
24
+ _type: "ailf.run";
25
+ _id: string;
26
+ /** ISO timestamp when the run started */
27
+ startedAt: string;
28
+ /** ISO timestamp when the run completed */
29
+ completedAt?: string;
30
+ /** Run status */
31
+ status: "completed" | "failed" | "running";
32
+ /** Evaluation mode used */
33
+ mode: string;
34
+ /** Models evaluated */
35
+ models: {
36
+ id: string;
37
+ label: string;
38
+ }[];
39
+ /** Grader model used */
40
+ graderModel: string;
41
+ /** Number of tasks evaluated */
42
+ taskCount: number;
43
+ /** Run-level aggregate score (0-100) */
44
+ overallScore?: number;
45
+ /** Per-area scores */
46
+ areaScores?: {
47
+ areaId: string;
48
+ score: number;
49
+ }[];
50
+ /** Pipeline configuration fingerprint (for cache invalidation) */
51
+ configFingerprint?: string;
52
+ /** Git context at run time */
53
+ git?: {
54
+ branch: string;
55
+ sha: string;
56
+ repo: string;
57
+ };
58
+ /** Run metadata */
59
+ metadata?: Record<string, unknown>;
60
+ }
61
+ /** Result for one task × one model */
62
+ export interface StoredTaskResult extends SchemaVersioned {
63
+ _type: "ailf.taskResult";
64
+ _id: string;
65
+ /** Reference to parent run */
66
+ runRef: {
67
+ _type: "reference";
68
+ _ref: string;
69
+ };
70
+ /** Reference to task definition */
71
+ taskRef: {
72
+ _type: "reference";
73
+ _ref: string;
74
+ };
75
+ /** Task ID (denormalized for querying) */
76
+ taskId: string;
77
+ /** Model that produced this result */
78
+ modelId: string;
79
+ /** Overall task score (0-100) */
80
+ score: number;
81
+ /** Per-dimension scores */
82
+ dimensions: {
83
+ dimensionId: string;
84
+ score: number;
85
+ weight: number;
86
+ assertionCount: number;
87
+ passCount: number;
88
+ }[];
89
+ /** Whether task passed its quality threshold */
90
+ passesThreshold: boolean;
91
+ /** Input variable envelope (provenance tracking) */
92
+ inputEnvelope?: VariableEnvelope;
93
+ /** Output variable envelope */
94
+ outputEnvelope?: VariableEnvelope;
95
+ /** URI to full trace payload in blob storage */
96
+ traceDataUri?: string;
97
+ /** Sanitized trace summary (safe for public dataset) */
98
+ traceSummary?: {
99
+ toolCallCount: number;
100
+ toolCallCategories: Record<string, number>;
101
+ totalTokens: number;
102
+ costEstimate: number;
103
+ durationMs: number;
104
+ };
105
+ /** Evaluation mode */
106
+ mode: string;
107
+ /** Mode-specific metadata */
108
+ modeMetadata?: Record<string, unknown>;
109
+ }
110
+ /** Aggregated report for a run */
111
+ export interface StoredReport extends SchemaVersioned {
112
+ _type: "ailf.report";
113
+ _id: string;
114
+ /** Reference to parent run */
115
+ runRef: {
116
+ _type: "reference";
117
+ _ref: string;
118
+ };
119
+ /** ISO timestamp */
120
+ createdAt: string;
121
+ /** Report tag/label */
122
+ tag?: string;
123
+ /** Overall score */
124
+ overallScore: number;
125
+ /** Per-area breakdown */
126
+ areas: {
127
+ areaId: string;
128
+ score: number;
129
+ taskCount: number;
130
+ passingCount: number;
131
+ }[];
132
+ /** Per-model breakdown */
133
+ models: {
134
+ modelId: string;
135
+ score: number;
136
+ }[];
137
+ /** Human-readable markdown summary */
138
+ summary?: string;
139
+ }
140
+ /** Baseline snapshot for comparison */
141
+ export interface StoredBaseline extends SchemaVersioned {
142
+ _type: "ailf.baseline";
143
+ _id: string;
144
+ /** ISO timestamp when baseline was saved */
145
+ savedAt: string;
146
+ /** Label for this baseline */
147
+ label?: string;
148
+ /** Reference to the run this baseline was taken from */
149
+ runRef: {
150
+ _type: "reference";
151
+ _ref: string;
152
+ };
153
+ /** Per-task scores at baseline time */
154
+ taskScores: {
155
+ taskId: string;
156
+ score: number;
157
+ dimensions: Record<string, number>;
158
+ }[];
159
+ /** Per-area scores at baseline time */
160
+ areaScores: {
161
+ areaId: string;
162
+ score: number;
163
+ }[];
164
+ }
165
+ /** Trace summary document in Content Lake */
166
+ export interface StoredTrace extends SchemaVersioned {
167
+ _type: "ailf.trace";
168
+ _id: string;
169
+ /** Reference to parent run */
170
+ runRef: {
171
+ _type: "reference";
172
+ _ref: string;
173
+ };
174
+ /** Reference to task result */
175
+ taskResultRef: {
176
+ _type: "reference";
177
+ _ref: string;
178
+ };
179
+ /** URI to full trace payload in blob storage */
180
+ traceDataUri: string;
181
+ /** Task ID (denormalized) */
182
+ taskId: string;
183
+ /** Model ID */
184
+ modelId: string;
185
+ /** Tool call count */
186
+ toolCallCount: number;
187
+ /** Tool call category breakdown */
188
+ toolCallCategories: Record<string, number>;
189
+ /** Total tokens consumed */
190
+ totalTokens: number;
191
+ /** Cost estimate in USD */
192
+ costEstimate: number;
193
+ /** Duration in milliseconds */
194
+ durationMs: number;
195
+ }
196
+ /** Check if a document is schema-versioned */
197
+ export declare function isSchemaVersioned(doc: unknown): doc is SchemaVersioned;
198
+ /** Migrate a document to the current schema version (no-op for v1) */
199
+ export declare function migrateDocument<T extends SchemaVersioned>(doc: T): T;
@@ -0,0 +1,39 @@
1
+ /**
2
+ * Storage schema types for Content Lake documents.
3
+ *
4
+ * All AILF document types include `schemaVersion` for forward-compatible
5
+ * migration. Full trace data goes to blob storage (GCP/local); only
6
+ * sanitized summaries are stored in Content Lake.
7
+ *
8
+ * SECURITY: The `next` dataset is publicly accessible. Never store
9
+ * sensitive data (API keys, full tool call arguments, raw outputs with
10
+ * PII) in Content Lake. Use blob storage + traceDataUri references.
11
+ *
12
+ * @see docs/design-docs/architecture-overhaul/storage-schema.md
13
+ */
14
+ // ---------------------------------------------------------------------------
15
+ // Schema version
16
+ // ---------------------------------------------------------------------------
17
+ /** Current schema version for all AILF document types */
18
+ export const CURRENT_SCHEMA_VERSION = 1;
19
+ // ---------------------------------------------------------------------------
20
+ // Type guards
21
+ // ---------------------------------------------------------------------------
22
/**
 * Type guard for schema-versioned documents.
 *
 * @param doc - Any value.
 * @returns `true` only when `doc` is a non-null object that exposes a
 *   `schemaVersion` property (own or inherited) whose value has type
 *   `number`; `false` otherwise.
 */
export function isSchemaVersioned(doc) {
    // Primitives, functions, undefined and null can never carry a version.
    if (typeof doc !== "object" || doc === null) {
        return false;
    }
    // The property must be present before its type is inspected.
    if (!("schemaVersion" in doc)) {
        return false;
    }
    return typeof doc.schemaVersion === "number";
}
29
+ /** Migrate a document to the current schema version (no-op for v1) */
30
+ export function migrateDocument(doc) {
31
+ if (doc.schemaVersion > CURRENT_SCHEMA_VERSION) {
32
+ throw new Error(`Document schema version ${doc.schemaVersion} is newer than supported version ${CURRENT_SCHEMA_VERSION}. ` +
33
+ `Please upgrade @sanity/ailf-core.`);
34
+ }
35
+ if (doc.schemaVersion === CURRENT_SCHEMA_VERSION)
36
+ return doc;
37
+ // When version 2 is introduced, add migration transforms here.
38
+ return { ...doc, schemaVersion: CURRENT_SCHEMA_VERSION };
39
+ }
@@ -0,0 +1,86 @@
1
+ /**
2
+ * TaskGraph — Intermediate Representation for the evaluation compiler.
3
+ *
4
+ * The TaskGraph is a directed acyclic graph (DAG) of tasks with dependency
5
+ * edges, shared fixtures, and compilation targets. It sits between the
6
+ * user-authored task definitions and the generated Promptfoo configs.
7
+ *
8
+ * Compilation pipeline:
9
+ * GeneralizedTaskDefinitions → parse → TaskGraph → optimize → PromptfooConfig[]
10
+ * ↘ validate ↗
11
+ *
12
+ * These types are introduced in Phase 0 but not consumed until Phase 2
13
+ * (the compiler). They establish the IR vocabulary early so all design
14
+ * discussions and subsequent phases use the same type language.
15
+ *
16
+ * Design decision: DAG-only (no cycles). See domain-model.md.
17
+ *
18
+ * @see docs/design-docs/architecture-overhaul/domain-model.md (canonical)
19
+ */
20
+ import type { VariableEnvelope } from "./variable-envelope.js";
21
+ /** A resolved fixture ready for injection into evaluations */
22
+ export interface ResolvedFixture {
23
+ /** Fixture identifier */
24
+ id: string;
25
+ /** Human-readable name */
26
+ name: string;
27
+ /** Fixture type */
28
+ type: "static" | "fetched" | "generated";
29
+ /** Resolved content (null if generation failed or content is deferred) */
30
+ content: string | null;
31
+ /** Content hash for cache invalidation */
32
+ contentHash?: string;
33
+ /** Time-to-live in seconds (for fetched/generated fixtures) */
34
+ ttlSeconds?: number;
35
+ }
36
+ /**
37
+ * A single node in the task graph — one task with its resolved context.
38
+ */
39
+ export interface TaskNode {
40
+ /** The task identifier */
41
+ taskId: string;
42
+ /** The evaluation mode this task belongs to (from GeneralizedTaskDefinition.mode) */
43
+ mode?: string;
44
+ /** Resolved prompt template (after variable interpolation) */
45
+ resolvedPrompt: string;
46
+ /** All variables available to this task, with provenance tracking */
47
+ resolvedVariables: VariableEnvelope;
48
+ /** Task IDs this node depends on (must complete before this node runs) */
49
+ dependsOn: string[];
50
+ /** Execution priority (lower = higher priority). Used for topological tie-breaking. */
51
+ priority: number;
52
+ }
53
+ /**
54
+ * A directed edge between two tasks in the graph.
55
+ *
56
+ * Edge types:
57
+ * - `data` — the downstream task consumes output from the upstream task
58
+ * - `ordering` — pure execution ordering constraint (no data flow)
59
+ * - `fixture` — both tasks share a fixture; the fixture must be resolved first
60
+ */
61
+ export interface DependencyEdge {
62
+ /** Source task (upstream — must complete first) */
63
+ from: string;
64
+ /** Target task (downstream — depends on `from`) */
65
+ to: string;
66
+ /** Relationship type */
67
+ type: "data" | "ordering" | "fixture";
68
+ }
69
+ /**
70
+ * The complete task graph IR — the compilation unit for the eval pipeline.
71
+ *
72
+ * Invariants:
73
+ * - The graph is a DAG (no cycles). The compiler rejects cyclic graphs.
74
+ * - Every task ID used as `from` or `to` in `edges` must exist as a key in `nodes`.
75
+ * - Every `fixtureId` referenced by a node must exist in `fixtures`.
76
+ */
77
+ export interface TaskGraph {
78
+ /** All task nodes, keyed by task ID */
79
+ nodes: Map<string, TaskNode>;
80
+ /** Dependency edges between tasks */
81
+ edges: DependencyEdge[];
82
+ /** Resolved fixtures shared across tasks, keyed by fixture ID */
83
+ fixtures: Map<string, ResolvedFixture>;
84
+ /** The backend this graph compiles to */
85
+ compilationTarget: "promptfoo" | "custom";
86
+ }
@@ -0,0 +1,20 @@
1
+ /**
2
+ * TaskGraph — Intermediate Representation for the evaluation compiler.
3
+ *
4
+ * The TaskGraph is a directed acyclic graph (DAG) of tasks with dependency
5
+ * edges, shared fixtures, and compilation targets. It sits between the
6
+ * user-authored task definitions and the generated Promptfoo configs.
7
+ *
8
+ * Compilation pipeline:
9
+ * GeneralizedTaskDefinitions → parse → TaskGraph → optimize → PromptfooConfig[]
10
+ * ↘ validate ↗
11
+ *
12
+ * These types are introduced in Phase 0 but not consumed until Phase 2
13
+ * (the compiler). They establish the IR vocabulary early so all design
14
+ * discussions and subsequent phases use the same type language.
15
+ *
16
+ * Design decision: DAG-only (no cycles). See domain-model.md.
17
+ *
18
+ * @see docs/design-docs/architecture-overhaul/domain-model.md (canonical)
19
+ */
20
+ export {};
@@ -0,0 +1,118 @@
1
+ /**
2
+ * EvalTrace — Observability types for evaluation execution.
3
+ *
4
+ * Captures the full request/response lifecycle for a single test case,
5
+ * including tool calls, URLs visited, search terms, token usage, and
6
+ * a chronological event log for replay and debugging.
7
+ *
8
+ * Canonical field names:
9
+ * - `operation` on TraceSpan (not `name` — storage adapts on serialization)
10
+ *
11
+ * Design decision: Start with simple per-result traces (flat span list
12
+ * per EvalResult). No depth limits for agent harness spans. OTel
13
+ * integration deferred.
14
+ *
15
+ * @see docs/design-docs/architecture-overhaul/observability-telemetry.md (canonical)
16
+ * @see docs/design-docs/architecture-overhaul/domain-model.md (Trace entity)
17
+ */
18
+ /**
19
+ * Normalized tool call category for cross-model comparison.
20
+ *
21
+ * Raw tool names from providers are noisy and inconsistent (`WebSearch`
22
+ * vs `web_search` vs `Browser.search`). AILF normalizes every tool call
23
+ * into one of six categories.
24
+ */
25
+ export type ToolCallCategory = "search" | "read" | "write" | "execute" | "navigate" | "communicate";
26
+ /** A single tool call made during evaluation execution */
27
+ export interface ToolCallRecord {
28
+ /** Tool name as reported by the provider (e.g., "WebSearch", "Bash") */
29
+ name: string;
30
+ /** Serialized input arguments */
31
+ input: Record<string, unknown>;
32
+ /** Serialized output (truncated to maxOutputBytes) */
33
+ output: unknown;
34
+ /** Wall-clock duration of the tool call in milliseconds */
35
+ durationMs: number;
36
+ /** Error message if the tool call failed */
37
+ error?: string;
38
+ /** Classification category */
39
+ category: ToolCallCategory;
40
+ }
41
+ /** A single span in the trace — one logical operation */
42
+ export interface TraceSpan {
43
+ /** Unique identifier for this span */
44
+ spanId: string;
45
+ /** Parent span ID (null for root spans) */
46
+ parentSpanId: string | null;
47
+ /** Operation name (canonical field — storage may use `name`) */
48
+ operation: string;
49
+ /** Start time relative to trace start, in milliseconds */
50
+ startMs: number;
51
+ /** End time relative to trace start, in milliseconds */
52
+ endMs: number;
53
+ /** Arbitrary key-value attributes for this span */
54
+ attributes: Record<string, unknown>;
55
+ }
56
+ /** Timestamped event in the trace log */
57
+ export interface TraceEvent {
58
+ /** ISO 8601 timestamp */
59
+ timestamp: string;
60
+ /** Event type */
61
+ type: "tool_call_start" | "tool_call_end" | "llm_request" | "llm_response" | "error" | "checkpoint" | "custom";
62
+ /** Event-specific payload */
63
+ data: Record<string, unknown>;
64
+ }
65
+ /** Token usage breakdown for a single evaluation step */
66
+ export interface TraceTokenUsage {
67
+ /** Tokens in the prompt/input */
68
+ promptTokens: number;
69
+ /** Tokens in the completion/output */
70
+ completionTokens: number;
71
+ /** Total tokens (prompt + completion) */
72
+ totalTokens: number;
73
+ /** Tokens consumed by tool call inputs/outputs */
74
+ toolTokens?: number;
75
+ }
76
+ /**
77
+ * The complete trace for a single test case execution.
78
+ *
79
+ * Captures everything that happened during one task × one provider
80
+ * evaluation: tool calls, URLs visited, search terms, token usage,
81
+ * cost, and a chronological event log.
82
+ */
83
+ export interface EvalTrace {
84
+ /** Unique identifier for this trace */
85
+ traceId: string;
86
+ /** Parent run this trace belongs to */
87
+ runId: string;
88
+ /** Task definition that produced this test case */
89
+ taskId: string;
90
+ /** Test case index within the task */
91
+ testCaseIndex: number;
92
+ /** Model under evaluation */
93
+ modelId: string;
94
+ /** Flat list of spans; nesting is expressed via each span's `parentSpanId` */
95
+ spans: TraceSpan[];
96
+ /** Ordered list of tool calls made during execution */
97
+ toolCalls: ToolCallRecord[];
98
+ /** URLs fetched via WebFetch, WebSearch, or browser navigation */
99
+ urlsVisited: string[];
100
+ /** Search queries issued to WebSearch or semantic search */
101
+ searchTerms: string[];
102
+ /** Files read during execution (sandbox paths) */
103
+ filesRead: string[];
104
+ /** Files created or modified during execution */
105
+ filesWritten: string[];
106
+ /** Aggregate token usage across all LLM calls in this trace */
107
+ tokensUsed: TraceTokenUsage;
108
+ /** Estimated cost in USD */
109
+ costEstimate: number;
110
+ /** Total wall-clock execution time in milliseconds */
111
+ durationMs: number;
112
+ /** Chronological event log for replay / debugging */
113
+ events: TraceEvent[];
114
+ /** ISO 8601 start timestamp */
115
+ startedAt: string;
116
+ /** ISO 8601 end timestamp */
117
+ completedAt: string;
118
+ }
@@ -0,0 +1,18 @@
1
+ /**
2
+ * EvalTrace — Observability types for evaluation execution.
3
+ *
4
+ * Captures the full request/response lifecycle for a single test case,
5
+ * including tool calls, URLs visited, search terms, token usage, and
6
+ * a chronological event log for replay and debugging.
7
+ *
8
+ * Canonical field names:
9
+ * - `operation` on TraceSpan (not `name` — storage adapts on serialization)
10
+ *
11
+ * Design decision: Start with simple per-result traces (flat span list
12
+ * per EvalResult). No depth limits for agent harness spans. OTel
13
+ * integration deferred.
14
+ *
15
+ * @see docs/design-docs/architecture-overhaul/observability-telemetry.md (canonical)
16
+ * @see docs/design-docs/architecture-overhaul/domain-model.md (Trace entity)
17
+ */
18
+ export {};
@@ -0,0 +1,80 @@
1
+ /**
2
+ * VariableEnvelope — Structured variable tracking with provenance.
3
+ *
4
+ * Replaces ad-hoc `Record<string, string>` variable bags with a
5
+ * structured, auditable format. Every variable carries metadata about
6
+ * where it came from, when it was resolved, and its content hash for
7
+ * cache invalidation.
8
+ *
9
+ * Canonical field names: `values` and `provenance`.
10
+ * The storage layer (Content Lake) may adapt these to different names
11
+ * on serialization (e.g., `vars`/`meta`), but the domain model always
12
+ * uses the canonical names defined here.
13
+ *
14
+ * @see docs/design-docs/architecture-overhaul/domain-model.md (canonical)
15
+ */
16
+ /** Where a variable's value originated */
17
+ export type VariableSource = {
18
+ type: "fixture";
19
+ fixtureId: string;
20
+ } | {
21
+ type: "inline";
22
+ definedIn: "task" | "suite";
23
+ } | {
24
+ type: "environment";
25
+ envVar: string;
26
+ } | {
27
+ type: "derived";
28
+ expression: string;
29
+ inputs: string[];
30
+ } | {
31
+ type: "fetched";
32
+ url: string;
33
+ } | {
34
+ type: "previous-result";
35
+ resultId: string;
36
+ field: string;
37
+ };
38
+ /** Provenance metadata for a single variable */
39
+ export interface VariableProvenance {
40
+ /** Where this variable's value came from */
41
+ source: VariableSource;
42
+ /** When the variable was resolved (ISO 8601 timestamp) */
43
+ resolvedAt: string;
44
+ /** Time-to-live in seconds (for fetched/cached variables) */
45
+ ttl?: number;
46
+ /** Content-addressable hash for cache invalidation */
47
+ hash: string;
48
+ }
49
+ /** Schema declaration for a variable — enables compile-time template validation */
50
+ export interface VariableDeclaration {
51
+ /** Variable name (matches the key in `values`) */
52
+ name: string;
53
+ /** Expected type of the variable's value */
54
+ type: "string" | "number" | "boolean" | "json" | "markdown" | "unknown";
55
+ /** Whether the variable must be present for the task to run */
56
+ required: boolean;
57
+ /** Human-readable description */
58
+ description?: string;
59
+ /** Default value used when the variable is not provided */
60
+ default?: unknown;
61
+ }
62
+ /**
63
+ * A structured container for all variables flowing through an evaluation step.
64
+ *
65
+ * The envelope enables:
66
+ * - **Cache invalidation** — if a variable's content hash changes, downstream
67
+ * results are invalidated.
68
+ * - **Audit trails** — every variable in a result can be traced back to its
69
+ * origin.
70
+ * - **Type checking** — declarations allow compile-time validation of prompt
71
+ * templates against available variables.
72
+ */
73
+ export interface VariableEnvelope {
74
+ /** The resolved key-value pairs available to prompt templates */
75
+ values: Record<string, unknown>;
76
+ /** Provenance metadata for each variable (keyed by variable name) */
77
+ provenance: Record<string, VariableProvenance>;
78
+ /** Schema declarations for type-safe access */
79
+ declarations: VariableDeclaration[];
80
+ }
@@ -0,0 +1,16 @@
1
+ /**
2
+ * VariableEnvelope — Structured variable tracking with provenance.
3
+ *
4
+ * Replaces ad-hoc `Record<string, string>` variable bags with a
5
+ * structured, auditable format. Every variable carries metadata about
6
+ * where it came from, when it was resolved, and its content hash for
7
+ * cache invalidation.
8
+ *
9
+ * Canonical field names: `values` and `provenance`.
10
+ * The storage layer (Content Lake) may adapt these to different names
11
+ * on serialization (e.g., `vars`/`meta`), but the domain model always
12
+ * uses the canonical names defined here.
13
+ *
14
+ * @see docs/design-docs/architecture-overhaul/domain-model.md (canonical)
15
+ */
16
+ export {};
@@ -1,21 +1,8 @@
1
1
  /**
2
- * Grading dimension types and labels.
2
+ * @deprecated This file is intentionally empty.
3
3
  *
4
- * Shared between eval (scoring engine) and studio (display components).
5
- * Adding a dimension here ensures both packages show it correctly.
4
+ * Dimension names are now dynamic, derived at runtime from the scoring
5
+ * profile's dimension-weight map in packages/eval/config/rubrics.ts.
6
+ * Use `extractDimensions()` from @sanity/ailf-core to get dimension
7
+ * names for a given profile.
6
8
  */
7
- /** The three grading dimensions */
8
- export type DimensionName = "codeCorrectness" | "docCoverage" | "taskCompletion";
9
- /**
10
- * Human-readable display labels for dimensions.
11
- *
12
- * Note: The Sanity Content Lake stores dimension names in kebab-case
13
- * (e.g., "task-completion") while TypeScript uses camelCase. Both
14
- * mappings are provided for cross-system compatibility.
15
- */
16
- export declare const DIMENSION_LABELS: Record<DimensionName, string>;
17
- /**
18
- * Kebab-case dimension labels as stored in Sanity Content Lake.
19
- * Used by the studio package when rendering judgment data from GROQ.
20
- */
21
- export declare const DIMENSION_LABELS_KEBAB: Record<string, string>;
@@ -1,27 +1,9 @@
1
+ "use strict";
1
2
  /**
2
- * Grading dimension types and labels.
3
+ * @deprecated This file is intentionally empty.
3
4
  *
4
- * Shared between eval (scoring engine) and studio (display components).
5
- * Adding a dimension here ensures both packages show it correctly.
5
+ * Dimension names are now dynamic, derived at runtime from the scoring
6
+ * profile's dimension-weight map in packages/eval/config/rubrics.ts.
7
+ * Use `extractDimensions()` from @sanity/ailf-core to get dimension
8
+ * names for a given profile.
6
9
  */
7
- /**
8
- * Human-readable display labels for dimensions.
9
- *
10
- * Note: The Sanity Content Lake stores dimension names in kebab-case
11
- * (e.g., "task-completion") while TypeScript uses camelCase. Both
12
- * mappings are provided for cross-system compatibility.
13
- */
14
- export const DIMENSION_LABELS = {
15
- codeCorrectness: "Code Correctness",
16
- docCoverage: "Doc Coverage",
17
- taskCompletion: "Task Completion",
18
- };
19
- /**
20
- * Kebab-case dimension labels as stored in Sanity Content Lake.
21
- * Used by the studio package when rendering judgment data from GROQ.
22
- */
23
- export const DIMENSION_LABELS_KEBAB = {
24
- "code-correctness": "Code Correctness",
25
- "doc-coverage": "Doc Coverage",
26
- "task-completion": "Task Completion",
27
- };