@sanity/ailf 2.0.2 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (173) hide show
  1. package/LICENSE +21 -0
  2. package/dist/_vendor/ailf-core/examples/index.d.ts +50 -1
  3. package/dist/_vendor/ailf-core/examples/index.js +66 -1
  4. package/dist/agent-harness/assertions-runtime.d.ts +49 -0
  5. package/dist/agent-harness/assertions-runtime.js +138 -0
  6. package/dist/agent-harness/provider.d.ts +58 -0
  7. package/dist/agent-harness/provider.js +104 -0
  8. package/dist/cli.js +0 -0
  9. package/dist/commands/init.js +3 -0
  10. package/dist/orchestration/steps/generate-configs-step.d.ts +7 -0
  11. package/dist/orchestration/steps/generate-configs-step.js +35 -2
  12. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +39 -25
  13. package/dist/pipeline/compiler/compiler-to-yaml.js +78 -7
  14. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +9 -0
  15. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +28 -85
  16. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +22 -15
  17. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +8 -1
  18. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +42 -12
  19. package/package.json +25 -24
  20. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +0 -10
  21. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +0 -185
  22. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +0 -6
  23. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +0 -42
  24. package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
  25. package/dist/_vendor/ailf-tasks/cli.js +0 -61
  26. package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
  27. package/dist/_vendor/ailf-tasks/index.js +0 -16
  28. package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
  29. package/dist/_vendor/ailf-tasks/parser.js +0 -73
  30. package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
  31. package/dist/_vendor/ailf-tasks/schemas.js +0 -180
  32. package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
  33. package/dist/_vendor/ailf-tasks/validation.js +0 -162
  34. package/dist/adapters/task-sources/yaml-task-source.d.ts +0 -18
  35. package/dist/adapters/task-sources/yaml-task-source.js +0 -139
  36. package/dist/agent-observer/test-imports.d.ts +0 -7
  37. package/dist/agent-observer/test-imports.js +0 -185
  38. package/dist/commands/update-quality-scores.d.ts +0 -5
  39. package/dist/commands/update-quality-scores.js +0 -20
  40. package/dist/lib/agent-behavior-report.d.ts +0 -8
  41. package/dist/lib/agent-behavior-report.js +0 -185
  42. package/dist/lib/baseline.d.ts +0 -19
  43. package/dist/lib/baseline.js +0 -153
  44. package/dist/lib/calculate-scores.d.ts +0 -23
  45. package/dist/lib/calculate-scores.js +0 -42
  46. package/dist/lib/compare.d.ts +0 -18
  47. package/dist/lib/compare.js +0 -170
  48. package/dist/lib/coverage-audit.d.ts +0 -4
  49. package/dist/lib/coverage-audit.js +0 -42
  50. package/dist/lib/discovery-report.d.ts +0 -13
  51. package/dist/lib/discovery-report.js +0 -57
  52. package/dist/lib/fetch-docs.d.ts +0 -30
  53. package/dist/lib/fetch-docs.js +0 -171
  54. package/dist/lib/generate-configs.d.ts +0 -25
  55. package/dist/lib/generate-configs.js +0 -42
  56. package/dist/lib/grader-api.d.ts +0 -21
  57. package/dist/lib/grader-api.js +0 -34
  58. package/dist/lib/grader-compare.d.ts +0 -19
  59. package/dist/lib/grader-compare.js +0 -91
  60. package/dist/lib/grader-consistency.d.ts +0 -27
  61. package/dist/lib/grader-consistency.js +0 -79
  62. package/dist/lib/grader-sensitivity.d.ts +0 -19
  63. package/dist/lib/grader-sensitivity.js +0 -75
  64. package/dist/lib/grader-validate.d.ts +0 -19
  65. package/dist/lib/grader-validate.js +0 -78
  66. package/dist/lib/measure-retrieval.d.ts +0 -14
  67. package/dist/lib/measure-retrieval.js +0 -71
  68. package/dist/lib/pr-comment.d.ts +0 -16
  69. package/dist/lib/pr-comment.js +0 -28
  70. package/dist/lib/readiness-report.d.ts +0 -13
  71. package/dist/lib/readiness-report.js +0 -108
  72. package/dist/lib/webhook-server.d.ts +0 -11
  73. package/dist/lib/webhook-server.js +0 -24
  74. package/dist/lib/weekly-digest.d.ts +0 -24
  75. package/dist/lib/weekly-digest.js +0 -148
  76. package/dist/orchestration/env-bridge.d.ts +0 -21
  77. package/dist/orchestration/env-bridge.js +0 -66
  78. package/dist/orchestration/steps/fetch-docs-shell.d.ts +0 -17
  79. package/dist/orchestration/steps/fetch-docs-shell.js +0 -30
  80. package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +0 -9
  81. package/dist/pipeline/compiler/__tests__/task-bridge.test.js +0 -339
  82. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +0 -70
  83. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +0 -485
  84. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +0 -76
  85. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +0 -245
  86. package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +0 -89
  87. package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +0 -379
  88. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +0 -50
  89. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +0 -334
  90. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +0 -69
  91. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +0 -307
  92. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +0 -65
  93. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +0 -368
  94. package/dist/pipeline/compiler/task-bridge.d.ts +0 -41
  95. package/dist/pipeline/compiler/task-bridge.js +0 -92
  96. package/dist/pipeline/expand-tasks.d.ts +0 -232
  97. package/dist/pipeline/expand-tasks.js +0 -467
  98. package/dist/pipeline/generate-configs.d.ts +0 -92
  99. package/dist/pipeline/generate-configs.js +0 -445
  100. package/dist/pipeline/steps/calculate-scores-step.d.ts +0 -11
  101. package/dist/pipeline/steps/calculate-scores-step.js +0 -89
  102. package/dist/pipeline/steps/compare-step.d.ts +0 -18
  103. package/dist/pipeline/steps/compare-step.js +0 -90
  104. package/dist/pipeline/steps/eval-step.d.ts +0 -53
  105. package/dist/pipeline/steps/eval-step.js +0 -347
  106. package/dist/pipeline/steps/fetch-docs-step.d.ts +0 -11
  107. package/dist/pipeline/steps/fetch-docs-step.js +0 -84
  108. package/dist/pipeline/steps/generate-configs-step.d.ts +0 -11
  109. package/dist/pipeline/steps/generate-configs-step.js +0 -98
  110. package/dist/pipeline/steps/grader-consistency-step.d.ts +0 -21
  111. package/dist/pipeline/steps/grader-consistency-step.js +0 -74
  112. package/dist/pipeline/steps/publish-report-step.d.ts +0 -57
  113. package/dist/pipeline/steps/publish-report-step.js +0 -243
  114. package/dist/pipeline/steps/report-step.d.ts +0 -13
  115. package/dist/pipeline/steps/report-step.js +0 -56
  116. package/dist/pipeline/steps/update-scores-step.d.ts +0 -11
  117. package/dist/pipeline/steps/update-scores-step.js +0 -42
  118. package/dist/scripts/agent-behavior-report.d.ts +0 -19
  119. package/dist/scripts/agent-behavior-report.js +0 -315
  120. package/dist/scripts/baseline.d.ts +0 -43
  121. package/dist/scripts/baseline.js +0 -267
  122. package/dist/scripts/calculate-scores.d.ts +0 -166
  123. package/dist/scripts/calculate-scores.js +0 -1296
  124. package/dist/scripts/compare.d.ts +0 -22
  125. package/dist/scripts/compare.js +0 -334
  126. package/dist/scripts/coverage-audit.d.ts +0 -44
  127. package/dist/scripts/coverage-audit.js +0 -209
  128. package/dist/scripts/debug-eval.d.ts +0 -19
  129. package/dist/scripts/debug-eval.js +0 -73
  130. package/dist/scripts/discovery-report.d.ts +0 -58
  131. package/dist/scripts/discovery-report.js +0 -250
  132. package/dist/scripts/fetch-docs.d.ts +0 -35
  133. package/dist/scripts/fetch-docs.js +0 -472
  134. package/dist/scripts/generate-configs.d.ts +0 -66
  135. package/dist/scripts/generate-configs.js +0 -459
  136. package/dist/scripts/grader-api.d.ts +0 -27
  137. package/dist/scripts/grader-api.js +0 -206
  138. package/dist/scripts/grader-compare.d.ts +0 -22
  139. package/dist/scripts/grader-compare.js +0 -368
  140. package/dist/scripts/grader-consistency.d.ts +0 -20
  141. package/dist/scripts/grader-consistency.js +0 -313
  142. package/dist/scripts/grader-sensitivity.d.ts +0 -22
  143. package/dist/scripts/grader-sensitivity.js +0 -354
  144. package/dist/scripts/grader-validate.d.ts +0 -19
  145. package/dist/scripts/grader-validate.js +0 -267
  146. package/dist/scripts/measure-retrieval.d.ts +0 -10
  147. package/dist/scripts/measure-retrieval.js +0 -145
  148. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +0 -24
  149. package/dist/scripts/migrate-tasks-to-content-lake.js +0 -328
  150. package/dist/scripts/pipeline.d.ts +0 -76
  151. package/dist/scripts/pipeline.js +0 -1031
  152. package/dist/scripts/pr-comment.d.ts +0 -10
  153. package/dist/scripts/pr-comment.js +0 -510
  154. package/dist/scripts/readiness-report.d.ts +0 -88
  155. package/dist/scripts/readiness-report.js +0 -342
  156. package/dist/scripts/update-quality-scores.d.ts +0 -15
  157. package/dist/scripts/update-quality-scores.js +0 -184
  158. package/dist/scripts/validate-task-sources.d.ts +0 -21
  159. package/dist/scripts/validate-task-sources.js +0 -210
  160. package/dist/scripts/validate.d.ts +0 -13
  161. package/dist/scripts/validate.js +0 -79
  162. package/dist/scripts/webhook-server.d.ts +0 -26
  163. package/dist/scripts/webhook-server.js +0 -147
  164. package/dist/scripts/weekly-digest.d.ts +0 -24
  165. package/dist/scripts/weekly-digest.js +0 -144
  166. package/dist/sinks/format-slack.d.ts +0 -64
  167. package/dist/sinks/format-slack.js +0 -306
  168. package/dist/sinks/slack-sink.d.ts +0 -27
  169. package/dist/sinks/slack-sink.js +0 -78
  170. package/dist/sinks/webhook-sink.d.ts +0 -19
  171. package/dist/sinks/webhook-sink.js +0 -50
  172. package/tasks/.expanded.agentic.yaml +0 -280
  173. package/tasks/.expanded.yaml +0 -565
@@ -1,92 +0,0 @@
1
- /**
2
- * pipeline/generate-configs.ts
3
- *
4
- * @deprecated This is the LEGACY compilation path. New code should use the
5
- * config compiler pipeline instead:
6
- *
7
- * import { compileLiteracyTasks } from "./compiler/literacy-bridge.js"
8
- * import { buildTaskGraph, compileToPromptfoo } from "./compiler/index.js"
9
- *
10
- * This file is retained behind the `--legacy-compiler` CLI flag as an
11
- * emergency fallback during the migration period. It will be removed once
12
- * the new compiler has been validated in production.
13
- *
14
- * @see packages/eval/src/pipeline/compiler/ — the new compiler pipeline
15
- * @see docs/archive/exec-plans/architecture-overhaul/phase-7-migrate-literacy.md
16
- *
17
- * ---
18
- *
19
- * Reads config/models.yaml (the central model registry) and generates all
20
- * promptfoo config files with the correct provider entries.
21
- *
22
- * This keeps model definitions in one place — add a model to config/models.yaml
23
- * and run `pnpm generate-configs` to propagate it to all eval modes.
24
- *
25
- * Generated configs:
26
- * - promptfooconfig.yaml (baseline: with-docs vs without-docs)
27
- * - promptfooconfig.observed.yaml (instrumented HTTP recording)
28
- * - promptfooconfig.agentic.yaml (agentic tool-calling: naive vs optimized)
29
- *
30
- * All functions accept rootDir as a parameter — no module-level constants.
31
- * No process.argv parsing. No env var fallbacks. Callers provide typed options.
32
- *
33
- * @see config/models.yaml — the central model registry
34
- * @see docs/archive/exec-plans/eliminate-lib-layer.md
35
- */
36
- import { type LiteracyTaskDefinition, type Logger } from "../_vendor/ailf-core/index.d.ts";
37
- import type { FilterOptions } from "./types.js";
38
- import { type ResolvedSourceConfig } from "../sources.js";
39
- export { extractModelName, extractProvider, mergeConfig, modelMatchesMode, } from "../_vendor/ailf-core/index.d.ts";
40
- /** Auto-discover all task YAML files in the tasks/ directory. */
41
- export declare function discoverTaskFiles(rootDir: string): string[];
42
- interface LoadedPrompts {
43
- agentic: {
44
- id: string;
45
- label: string;
46
- raw: string;
47
- };
48
- withDocs: {
49
- id: string;
50
- label: string;
51
- raw: string;
52
- };
53
- withoutDocs: {
54
- id: string;
55
- label: string;
56
- raw: string;
57
- };
58
- }
59
- /**
60
- * Load prompt templates. Uses handler-owned literacy templates as defaults,
61
- * with config/prompts.ts as an override layer for user customization.
62
- */
63
- export declare function loadPrompts(rootDir: string): LoadedPrompts;
64
- /** Options for the generateConfigs function. */
65
- export interface GenerateConfigsOptions {
66
- /** Allowed origins for agentic mode (controls source isolation assertion) */
67
- allowedOrigins?: string[];
68
- /** Filter to specific feature areas or task IDs */
69
- filter?: FilterOptions;
70
- /** Logger instance (defaults to ConsoleLogger) */
71
- logger?: Logger;
72
- /** Pre-resolved source config (skips loadSource() call) */
73
- resolvedSource?: ResolvedSourceConfig;
74
- /** Root directory of the eval package (required) */
75
- rootDir: string;
76
- /** Search mode for agentic mode */
77
- searchMode?: string;
78
- /** Documentation source name (e.g., "branch", "local") */
79
- source?: string;
80
- /** Pre-loaded literacy task definitions from a TaskSource adapter.
81
- * When provided, expandTaskDefinitions() is used instead of
82
- * loadAndExpandTasks() (which reads from tasks/*.yaml files). */
83
- tasks?: LiteracyTaskDefinition[];
84
- }
85
- /**
86
- * Generate Promptfoo config files from models.yaml + task definitions.
87
- *
88
- * All parameters are passed via the typed options object — no process.argv
89
- * parsing or env var fallbacks. Callers (command handlers, orchestration
90
- * steps) are responsible for resolving options from their own context.
91
- */
92
- export declare function generateConfigs(options: GenerateConfigsOptions): void;
@@ -1,445 +0,0 @@
1
- /**
2
- * pipeline/generate-configs.ts
3
- *
4
- * @deprecated This is the LEGACY compilation path. New code should use the
5
- * config compiler pipeline instead:
6
- *
7
- * import { compileLiteracyTasks } from "./compiler/literacy-bridge.js"
8
- * import { buildTaskGraph, compileToPromptfoo } from "./compiler/index.js"
9
- *
10
- * This file is retained behind the `--legacy-compiler` CLI flag as an
11
- * emergency fallback during the migration period. It will be removed once
12
- * the new compiler has been validated in production.
13
- *
14
- * @see packages/eval/src/pipeline/compiler/ — the new compiler pipeline
15
- * @see docs/archive/exec-plans/architecture-overhaul/phase-7-migrate-literacy.md
16
- *
17
- * ---
18
- *
19
- * Reads config/models.yaml (the central model registry) and generates all
20
- * promptfoo config files with the correct provider entries.
21
- *
22
- * This keeps model definitions in one place — add a model to config/models.yaml
23
- * and run `pnpm generate-configs` to propagate it to all eval modes.
24
- *
25
- * Generated configs:
26
- * - promptfooconfig.yaml (baseline: with-docs vs without-docs)
27
- * - promptfooconfig.observed.yaml (instrumented HTTP recording)
28
- * - promptfooconfig.agentic.yaml (agentic tool-calling: naive vs optimized)
29
- *
30
- * All functions accept rootDir as a parameter — no module-level constants.
31
- * No process.argv parsing. No env var fallbacks. Callers provide typed options.
32
- *
33
- * @see config/models.yaml — the central model registry
34
- * @see docs/archive/exec-plans/eliminate-lib-layer.md
35
- */
36
- import { extractModelName, extractProvider, mergeConfig, } from "../_vendor/ailf-core/index.js";
37
- import { existsSync, readdirSync, writeFileSync } from "fs";
38
- import { resolve } from "path";
39
- import { dump } from "js-yaml";
40
- import { ConsoleLogger } from "../adapters/loggers/index.js";
41
- import { loadConfigFile } from "./compiler/config-loader.js";
42
- import { modelMatchesLiteracyVariant } from "./compiler/mode-bases/literacy.js";
43
- import { LITERACY_PROMPT_TEMPLATES } from "./compiler/mode-handlers/literacy/index.js";
44
- import { expandTaskDefinitions, loadAndExpandTasks } from "./expand-tasks.js";
45
- import { validateModelsYaml } from "./validate.js";
46
- import { LiteracyVariant } from "./normalize-mode.js";
47
- import { loadSource } from "../sources.js";
48
- // Re-export pure functions from core for backward compatibility.
49
- // Tests and other modules that previously imported from lib/generate-configs
50
- // can import from pipeline/generate-configs instead.
51
- export { extractModelName, extractProvider, mergeConfig, modelMatchesMode, } from "../_vendor/ailf-core/index.js";
52
- // ---------------------------------------------------------------------------
53
- // Helpers
54
- // ---------------------------------------------------------------------------
55
- /** Auto-discover all task YAML files in the tasks/ directory. */
56
- export function discoverTaskFiles(rootDir) {
57
- const tasksDir = resolve(rootDir, "tasks");
58
- if (!existsSync(tasksDir)) {
59
- return []; // tasks may come from Content Lake instead
60
- }
61
- return readdirSync(tasksDir)
62
- .filter((f) => (f.endsWith(".yaml") || f.endsWith(".yml")) && !f.startsWith("."))
63
- .sort()
64
- .map((f) => `file://tasks/${f}`);
65
- }
66
- /**
67
- * Load prompt templates. Uses handler-owned literacy templates as defaults,
68
- * with config/prompts.ts as an override layer for user customization.
69
- */
70
- export function loadPrompts(rootDir) {
71
- const toPrompt = (entry) => ({
72
- id: entry.id,
73
- label: entry.label,
74
- raw: entry.template,
75
- });
76
- // Load user overrides from config/prompts (may be empty after Wave 4)
77
- let overrides = {};
78
- try {
79
- const loaded = loadConfigFile("prompts", rootDir).data;
80
- // config/prompts.ts may export a Record (legacy) or an empty array (post-Wave 4)
81
- if (loaded && !Array.isArray(loaded)) {
82
- overrides = loaded;
83
- }
84
- }
85
- catch {
86
- // No config/prompts file — use handler defaults only
87
- }
88
- // Handler-owned templates are the canonical source; overrides take precedence
89
- return {
90
- agentic: overrides[LiteracyVariant.AGENTIC]
91
- ? toPrompt(overrides[LiteracyVariant.AGENTIC])
92
- : toPrompt(LITERACY_PROMPT_TEMPLATES[LiteracyVariant.AGENTIC]),
93
- withDocs: overrides["with-docs"]
94
- ? toPrompt(overrides["with-docs"])
95
- : toPrompt(LITERACY_PROMPT_TEMPLATES["with-docs"]),
96
- withoutDocs: overrides["without-docs"]
97
- ? toPrompt(overrides["without-docs"])
98
- : toPrompt(LITERACY_PROMPT_TEMPLATES["without-docs"]),
99
- };
100
- }
101
- function loadModels(rootDir) {
102
- return loadConfigFile("models", rootDir).data;
103
- }
104
- // ---------------------------------------------------------------------------
105
- // Shared components
106
- // ---------------------------------------------------------------------------
107
- const URL_EXTRACTION_ASSERT = {
108
- type: "javascript",
109
- value: `const urlPattern = /https?:\\/\\/[^\\s\\)\\"\\'\\\`>]+/g;
110
- const urls = [...new Set((output.match(urlPattern) || []))];
111
- const sanityUrls = urls.filter(u => u.includes('sanity.io'));
112
- return {
113
- pass: true,
114
- score: 1,
115
- reason: JSON.stringify({
116
- sanityUrls,
117
- otherUrls: urls.filter(u => !u.includes('sanity.io')),
118
- totalUrlCount: urls.length,
119
- sanityUrlCount: sanityUrls.length
120
- })
121
- };`,
122
- weight: 0,
123
- };
124
- /**
125
- * Source isolation assertion — advisory (weight: 0).
126
- * Verifies that the agentic provider only fetched docs from allowed origins.
127
- * Only injected when origin sandboxing is active.
128
- */
129
- const SOURCE_ISOLATION_ASSERT = {
130
- metadata: { dimension: "source-isolation" },
131
- type: "javascript",
132
- value: "file://dist/assertions/source-isolation.js",
133
- weight: 0,
134
- };
135
- // ---------------------------------------------------------------------------
136
- // Config generators
137
- // ---------------------------------------------------------------------------
138
- function generateAgenticConfig(models, tests, prompts, source, searchMode, allowedOrigins) {
139
- const naiveModels = models.models.filter((m) => modelMatchesLiteracyVariant(m, "agentic-naive"));
140
- const optimizedModels = models.models.filter((m) => modelMatchesLiteracyVariant(m, "agentic-optimized"));
141
- const providers = [];
142
- // Build doc source config to inject into providers
143
- const resolvedSearchMode = searchMode ?? "open";
144
- const sourceConfig = source
145
- ? {
146
- ...(source.allowedOrigins?.length
147
- ? { allowedOrigins: source.allowedOrigins }
148
- : {}),
149
- docBaseUrl: source.baseUrl,
150
- ...(source.headers && Object.keys(source.headers).length > 0
151
- ? { customHeaders: source.headers }
152
- : {}),
153
- llmsTxtUrl: source.llmsTxt,
154
- ...(source.priorityDomain
155
- ? { priorityDomain: source.priorityDomain }
156
- : {}),
157
- // Tool access control: search mode for web_search behavior
158
- ...(resolvedSearchMode !== "open"
159
- ? { searchMode: resolvedSearchMode }
160
- : {}),
161
- }
162
- : {};
163
- for (const model of naiveModels) {
164
- const modelName = extractModelName(model.id);
165
- const provider = extractProvider(model.id);
166
- providers.push({
167
- config: {
168
- ...mergeConfig(models.defaults, model.config, {
169
- agentMode: "naive",
170
- maxToolRounds: models.defaults.maxToolRounds ?? 5,
171
- model: modelName,
172
- provider,
173
- }),
174
- ...(model.timeoutMs ? { timeoutMs: model.timeoutMs } : {}),
175
- ...sourceConfig,
176
- observe: true,
177
- observerOptions: models.defaults.observerOptions ?? {},
178
- },
179
- id: "file://dist/agent-observer/agentic-provider.js",
180
- label: `${model.label} (Naive Agent)`,
181
- });
182
- }
183
- for (const model of optimizedModels) {
184
- const modelName = extractModelName(model.id);
185
- const provider = extractProvider(model.id);
186
- providers.push({
187
- config: {
188
- ...mergeConfig(models.defaults, model.config, {
189
- agentMode: "optimized",
190
- maxToolRounds: models.defaults.maxToolRounds ?? 5,
191
- model: modelName,
192
- provider,
193
- }),
194
- ...(model.timeoutMs ? { timeoutMs: model.timeoutMs } : {}),
195
- ...sourceConfig,
196
- observe: true,
197
- observerOptions: models.defaults.observerOptions ?? {},
198
- },
199
- id: "file://dist/agent-observer/agentic-provider.js",
200
- label: `${model.label} (Optimized Agent)`,
201
- });
202
- }
203
- // Inject source isolation assertion when origin sandboxing is active
204
- const hasOriginSandbox = Boolean(allowedOrigins?.length);
205
- const agenticAssertions = hasOriginSandbox ? [SOURCE_ISOLATION_ASSERT] : [];
206
- return {
207
- commandLineOptions: { table: false },
208
- defaultTest: {
209
- ...(agenticAssertions.length > 0 ? { assert: agenticAssertions } : {}),
210
- options: {
211
- provider: models.grader.id,
212
- rubricProvider: models.grader.id,
213
- },
214
- },
215
- description: "Sanity AI Literacy Evaluation — Agentic (naive vs optimized)",
216
- ...(models.maxConcurrency
217
- ? { evaluateOptions: { maxConcurrency: models.maxConcurrency } }
218
- : {}),
219
- outputPath: "results/latest/eval-results-agentic.json",
220
- prompts: [prompts.agentic],
221
- providers,
222
- tests,
223
- };
224
- }
225
- function generateBaselineConfig(models, tests, prompts) {
226
- const baselineModels = models.models.filter((m) => modelMatchesLiteracyVariant(m, "baseline"));
227
- const providers = baselineModels.map((model) => ({
228
- config: {
229
- ...mergeConfig(models.defaults, model.config),
230
- ...(model.timeoutMs ? { timeoutMs: model.timeoutMs } : {}),
231
- },
232
- id: model.id,
233
- label: model.label,
234
- }));
235
- return {
236
- commandLineOptions: { table: false },
237
- defaultTest: {
238
- assert: [URL_EXTRACTION_ASSERT],
239
- options: {
240
- provider: models.grader.id,
241
- rubricProvider: models.grader.id,
242
- },
243
- },
244
- description: "Sanity AI Literacy Evaluation — Baseline",
245
- ...(models.maxConcurrency
246
- ? { evaluateOptions: { maxConcurrency: models.maxConcurrency } }
247
- : {}),
248
- outputPath: "results/latest/eval-results.json",
249
- prompts: [prompts.withDocs, prompts.withoutDocs],
250
- providers,
251
- tests,
252
- };
253
- }
254
- function generateObservedConfig(models, tests, prompts) {
255
- const observedModels = models.models.filter((m) => modelMatchesLiteracyVariant(m, "observed"));
256
- const providers = observedModels.map((model) => {
257
- const modelName = extractModelName(model.id);
258
- return {
259
- config: {
260
- ...mergeConfig(models.defaults, model.config),
261
- ...(model.timeoutMs ? { timeoutMs: model.timeoutMs } : {}),
262
- modelName,
263
- observe: true,
264
- recordOptions: models.defaults.observerOptions ?? {},
265
- },
266
- id: "file://dist/agent-observer/provider.js",
267
- label: `${model.label} (Observed)`,
268
- };
269
- });
270
- return {
271
- commandLineOptions: { table: false },
272
- defaultTest: {
273
- options: {
274
- provider: models.grader.id,
275
- rubricProvider: models.grader.id,
276
- },
277
- },
278
- description: "Sanity AI Literacy Evaluation — Observed",
279
- ...(models.maxConcurrency
280
- ? { evaluateOptions: { maxConcurrency: models.maxConcurrency } }
281
- : {}),
282
- outputPath: "results/latest/eval-results-observed.json",
283
- prompts: [prompts.withDocs, prompts.withoutDocs],
284
- providers,
285
- tests,
286
- };
287
- }
288
- // ---------------------------------------------------------------------------
289
- // Main entry point
290
- // ---------------------------------------------------------------------------
291
- /**
292
- * Generate Promptfoo config files from models.yaml + task definitions.
293
- *
294
- * All parameters are passed via the typed options object — no process.argv
295
- * parsing or env var fallbacks. Callers (command handlers, orchestration
296
- * steps) are responsible for resolving options from their own context.
297
- */
298
- export function generateConfigs(options) {
299
- const { rootDir } = options;
300
- const log = options.logger ?? new ConsoleLogger();
301
- // Validate config/models.yaml before generating configs
302
- const modelIssues = validateModelsYaml(rootDir);
303
- const modelErrors = modelIssues.filter((i) => i.severity === "error");
304
- if (modelErrors.length > 0) {
305
- const details = modelErrors
306
- .map((e) => (e.path ? `${e.message} (at ${e.path})` : e.message))
307
- .join("; ");
308
- throw new Error(`config/models.yaml validation failed: ${details}. Run 'pnpm validate' for details.`);
309
- }
310
- log.info("Loading config/models.yaml...");
311
- const models = loadModels(rootDir);
312
- const activeModels = models.models.filter((m) => m.id && m.label);
313
- log.debug("Models loaded from config/models.yaml", {
314
- totalModels: models.models.length,
315
- activeModels: activeModels.length,
316
- modelIds: activeModels.map((m) => m.id),
317
- graderId: models.grader.id,
318
- maxConcurrency: models.maxConcurrency,
319
- });
320
- log.info(` Found ${activeModels.length} active model(s):`);
321
- for (const m of activeModels) {
322
- // oxlint-disable-next-line @typescript-eslint/prefer-nullish-coalescing -- empty array join → "all"
323
- const modes = m.modes?.join(", ") || "all";
324
- log.info(` - ${m.label} (${m.id}) → [${modes}]`);
325
- }
326
- // oxlint-disable-next-line @typescript-eslint/prefer-nullish-coalescing -- empty label falls through to id
327
- log.info(` Grader: ${models.grader.label || models.grader.id}`);
328
- // Build filter from options
329
- const filter = options.filter?.areas || options.filter?.taskIds
330
- ? options.filter
331
- : undefined;
332
- // Expand tasks — use GeneralizedTaskDefinition[] from TaskSource when provided,
333
- // otherwise fall back to loading from tasks/*.yaml files.
334
- let entries;
335
- let agenticEntries;
336
- if (options.tasks) {
337
- // TaskSource path — tasks already loaded and filtered by the adapter
338
- log.debug("Expanding tasks from TaskSource adapter", {
339
- taskCount: options.tasks.length,
340
- taskIds: options.tasks.map((t) => t.id),
341
- });
342
- const baselineResult = expandTaskDefinitions(options.tasks, rootDir, LiteracyVariant.STANDARD);
343
- entries = baselineResult.entries;
344
- log.info(` Expanded ${baselineResult.stats.totalTasks} task(s) → ${baselineResult.stats.expandedTotal} test entries (from TaskSource)`);
345
- const agenticResult = expandTaskDefinitions(options.tasks, rootDir, LiteracyVariant.AGENTIC);
346
- agenticEntries = agenticResult.entries;
347
- log.info(` Agentic: ${agenticResult.stats.expandedTotal} entries (gold only, no baseline)`);
348
- }
349
- else {
350
- // Legacy path — read from tasks/*.yaml files
351
- const { entries: baselineEntries, stats } = loadAndExpandTasks(rootDir, filter, LiteracyVariant.STANDARD, log);
352
- entries = baselineEntries;
353
- log.info(` Expanded ${stats.singleDefinitions} task(s) → ${stats.expandedTotal} test entries`);
354
- if (stats.legacyEntries > 0) {
355
- log.info(` ⚠ ${stats.legacyEntries} legacy (paired) entries passed through unchanged`);
356
- }
357
- if (filter) {
358
- const parts = [];
359
- if (filter.areas) {
360
- parts.push(`areas: ${filter.areas.join(", ")}`);
361
- }
362
- if (filter.taskIds) {
363
- parts.push(`tasks: ${filter.taskIds.join(", ")}`);
364
- }
365
- log.info(` Scoped to: ${parts.join("; ")}`);
366
- }
367
- const { entries: agenticFromYaml, stats: agenticStats } = loadAndExpandTasks(rootDir, filter, LiteracyVariant.AGENTIC, log);
368
- agenticEntries = agenticFromYaml;
369
- log.info(` Agentic: ${agenticStats.expandedTotal} entries (gold only, no baseline)`);
370
- }
371
- // Write expanded tasks to generated files for Promptfoo to consume
372
- const expandedPath = resolve(rootDir, "tasks", ".expanded.yaml");
373
- const expandedYaml = dump(entries, {
374
- forceQuotes: false,
375
- lineWidth: 120,
376
- noRefs: true,
377
- quotingType: "'",
378
- });
379
- writeFileSync(expandedPath, `# .expanded.yaml\n#\n# AUTO-GENERATED — do not edit directly.\n# Source: tasks/*.yaml (single-definition format)\n# Run: pnpm generate-configs\n\n${expandedYaml}`, "utf-8");
380
- log.info(` ✓ tasks/.expanded.yaml (${entries.length} entries)`);
381
- const agenticExpandedPath = resolve(rootDir, "tasks", ".expanded.agentic.yaml");
382
- const agenticExpandedYaml = dump(agenticEntries, {
383
- forceQuotes: false,
384
- lineWidth: 120,
385
- noRefs: true,
386
- quotingType: "'",
387
- });
388
- writeFileSync(agenticExpandedPath, `# .expanded.agentic.yaml\n#\n# AUTO-GENERATED — do not edit directly.\n# Gold entries only (no baseline) for agentic evaluation mode.\n# Source: tasks/*.yaml (single-definition format)\n# Run: pnpm generate-configs\n\n${agenticExpandedYaml}`, "utf-8");
389
- log.info(` ✓ tasks/.expanded.agentic.yaml (${agenticEntries.length} entries)`);
390
- const taskFiles = ["file://tasks/.expanded.yaml"];
391
- const agenticTaskFiles = ["file://tasks/.expanded.agentic.yaml"];
392
- // Load prompt templates
393
- const prompts = loadPrompts(rootDir);
394
- log.debug("Prompt templates loaded", {
395
- keys: Object.keys(prompts),
396
- withDocsId: prompts.withDocs.id,
397
- withoutDocsId: prompts.withoutDocs.id,
398
- agenticId: prompts.agentic.id,
399
- });
400
- log.info(` Loaded prompts: ${Object.keys(prompts).join(", ")}`);
401
- // Load optional documentation source configuration
402
- // Pre-resolved source wins over name-based lookup
403
- let source = options.resolvedSource;
404
- const sourceName = options.source;
405
- if (!source && sourceName) {
406
- log.info(`\nLoading source: ${sourceName}`);
407
- try {
408
- source = loadSource(sourceName);
409
- }
410
- catch (err) {
411
- const msg = err instanceof Error ? err.message : String(err);
412
- log.warn(`\n⚠ Failed to load source "${sourceName}": ${msg}`);
413
- }
414
- }
415
- if (source) {
416
- log.info(` Base URL: ${source.baseUrl}`);
417
- log.info(` Dataset: ${source.dataset}`);
418
- if (source.allowedOrigins?.length) {
419
- log.info(` Allowed origins: ${source.allowedOrigins.join(", ")}`);
420
- }
421
- }
422
- log.info("\nGenerating configs...");
423
- writeConfig(rootDir, "promptfooconfig.yaml", generateBaselineConfig(models, taskFiles, prompts), `# promptfooconfig.yaml\n#\n# AUTO-GENERATED from config/models.yaml — do not edit directly.\n# Run: pnpm generate-configs\n`, log);
424
- writeConfig(rootDir, "promptfooconfig.observed.yaml", generateObservedConfig(models, taskFiles, prompts), `# promptfooconfig.observed.yaml\n#\n# AUTO-GENERATED from config/models.yaml — do not edit directly.\n# Run: pnpm generate-configs\n`, log);
425
- writeConfig(rootDir, "promptfooconfig.agentic.yaml", generateAgenticConfig(models, agenticTaskFiles, prompts, source, options.searchMode, options.allowedOrigins), `# promptfooconfig.agentic.yaml\n#\n# AUTO-GENERATED from config/models.yaml — do not edit directly.\n# Run: pnpm generate-configs\n`, log);
426
- log.info("\nDone! Configs are ready.");
427
- if (source) {
428
- log.info(` (using doc source: ${sourceName})`);
429
- }
430
- }
431
- // ---------------------------------------------------------------------------
432
- // File writing
433
- // ---------------------------------------------------------------------------
434
- function writeConfig(rootDir, filename, config, header, log) {
435
- const yamlStr = dump(config, {
436
- forceQuotes: false,
437
- lineWidth: 120,
438
- noRefs: true,
439
- quotingType: "'",
440
- });
441
- const content = `${header}\n${yamlStr}`;
442
- const outPath = resolve(rootDir, filename);
443
- writeFileSync(outPath, content, "utf-8");
444
- log.info(` ✓ ${filename}`);
445
- }
@@ -1,11 +0,0 @@
1
- /**
2
- * Pipeline step: Calculate AI Literacy Scores from eval results.
3
- *
4
- * Preconditions: eval-results.json exists and is valid
5
- * Postconditions: score-summary.json exists and is valid
6
- *
7
- * Cache key: eval results JSON file(s)
8
- * Cache outputs: results/latest/score-summary.json
9
- */
10
- import type { EvalMode, StepResult } from "../types.js";
11
- export declare function runCalculateScores(source?: string, mode?: EvalMode, noCache?: boolean): StepResult;
@@ -1,89 +0,0 @@
1
- /**
2
- * Pipeline step: Calculate AI Literacy Scores from eval results.
3
- *
4
- * Preconditions: eval-results.json exists and is valid
5
- * Postconditions: score-summary.json exists and is valid
6
- *
7
- * Cache key: eval results JSON file(s)
8
- * Cache outputs: results/latest/score-summary.json
9
- */
10
- import { execSync } from "child_process";
11
- import { dirname, resolve } from "path";
12
- import { fileURLToPath } from "url";
13
- import { getStepInputPaths, hashFiles, lookupCache, recordCache, } from "../cache.js";
14
- import { checkResultsExist, checkScoreSummaryValid } from "../checks.js";
15
- import { RESULTS_FILES } from "./eval-step.js";
16
- const __dirname = dirname(fileURLToPath(import.meta.url));
17
- const ROOT = resolve(__dirname, "..", "..", "..");
18
- export function runCalculateScores(source, mode = "baseline", noCache = false) {
19
- const start = Date.now();
20
- // For full mode, use the baseline results file as the primary input
21
- // (calculate-scores reads all available results files internally)
22
- const primaryMode = mode === "full" ? "baseline" : mode;
23
- const resultsFile = RESULTS_FILES[primaryMode];
24
- const resultsIssues = checkResultsExist(ROOT, resultsFile);
25
- const resultsErrors = resultsIssues.filter((i) => i.severity === "error");
26
- if (resultsErrors.length > 0) {
27
- return {
28
- durationMs: Date.now() - start,
29
- error: `Results missing: ${resultsErrors.map((e) => e.message).join("; ")}`,
30
- status: "failed",
31
- };
32
- }
33
- // Cache check
34
- if (!noCache) {
35
- const cacheResult = lookupCache(ROOT, "calculate-scores");
36
- if (cacheResult.hit) {
37
- return {
38
- durationMs: Date.now() - start,
39
- status: "success",
40
- summary: `Skipped (cached) — ${cacheResult.entry.summary}`,
41
- };
42
- }
43
- }
44
- // Execute — note: calculate-scores exits 1 when areas are below critical,
45
- // which is expected behavior, not an error
46
- try {
47
- const sourceArg = source ? ` --source ${source}` : "";
48
- const resultsArg = primaryMode !== "baseline" ? ` ${resultsFile}` : "";
49
- execSync(`tsx src/lib/calculate-scores.ts${resultsArg}${sourceArg}`, {
50
- cwd: ROOT,
51
- env: process.env,
52
- stdio: "inherit",
53
- });
54
- }
55
- catch (err) {
56
- const code = err !== null && typeof err === "object" && "status" in err
57
- ? err.status
58
- : 1;
59
- // Exit code 1 means "areas below critical" — that's expected
60
- if (code !== 1) {
61
- return {
62
- durationMs: Date.now() - start,
63
- error: `calculate-scores failed with exit code ${code}`,
64
- status: "failed",
65
- };
66
- }
67
- }
68
- // Postcondition: score summary exists and is valid
69
- const summaryIssues = checkScoreSummaryValid(ROOT);
70
- const summaryErrors = summaryIssues.filter((i) => i.severity === "error");
71
- if (summaryErrors.length > 0) {
72
- return {
73
- durationMs: Date.now() - start,
74
- error: `Postcondition failed: ${summaryErrors.map((e) => e.message).join("; ")}`,
75
- status: "failed",
76
- };
77
- }
78
- const durationMs = Date.now() - start;
79
- const summary = "Scores calculated and summary written";
80
- // Record cache
81
- if (!noCache) {
82
- const inputPaths = getStepInputPaths(ROOT, "calculate-scores");
83
- const inputHash = hashFiles(inputPaths);
84
- recordCache(ROOT, "calculate-scores", inputHash, summary, durationMs, [
85
- "results/latest/score-summary.json",
86
- ]);
87
- }
88
- return { durationMs, status: "success", summary };
89
- }
@@ -1,18 +0,0 @@
1
- /**
2
- * Pipeline step: Compare current scores against a baseline.
3
- *
4
- * Preconditions: score-summary.json exists
5
- * Postconditions: comparison-report.json written to results/latest/
6
- *
7
- * This step is optional — it only runs when --compare is passed
8
- * (or a baseline exists and auto-compare is enabled).
9
- */
10
- import type { CompareOptions, StepResult } from "../types.js";
11
- /**
12
- * Run comparison against a baseline.
13
- *
14
- * @param rootDir Package root directory
15
- * @param baselinePath Explicit baseline file path (optional — uses latest if omitted)
16
- * @param options Compare options (noise threshold, etc.)
17
- */
18
- export declare function runCompare(rootDir: string, baselinePath?: string, options?: CompareOptions): StepResult;