@sanity/ailf 2.0.1 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (160) hide show
  1. package/LICENSE +21 -0
  2. package/dist/cli.js +0 -0
  3. package/dist/orchestration/steps/run-eval-step.js +1 -1
  4. package/dist/pipeline/checks.d.ts +8 -3
  5. package/dist/pipeline/checks.js +23 -3
  6. package/package.json +25 -25
  7. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +0 -10
  8. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +0 -185
  9. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +0 -6
  10. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +0 -42
  11. package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
  12. package/dist/_vendor/ailf-tasks/cli.js +0 -61
  13. package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
  14. package/dist/_vendor/ailf-tasks/index.js +0 -16
  15. package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
  16. package/dist/_vendor/ailf-tasks/parser.js +0 -73
  17. package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
  18. package/dist/_vendor/ailf-tasks/schemas.js +0 -180
  19. package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
  20. package/dist/_vendor/ailf-tasks/validation.js +0 -162
  21. package/dist/adapters/task-sources/yaml-task-source.d.ts +0 -18
  22. package/dist/adapters/task-sources/yaml-task-source.js +0 -139
  23. package/dist/agent-observer/test-imports.d.ts +0 -7
  24. package/dist/agent-observer/test-imports.js +0 -185
  25. package/dist/commands/update-quality-scores.d.ts +0 -5
  26. package/dist/commands/update-quality-scores.js +0 -20
  27. package/dist/lib/agent-behavior-report.d.ts +0 -8
  28. package/dist/lib/agent-behavior-report.js +0 -185
  29. package/dist/lib/baseline.d.ts +0 -19
  30. package/dist/lib/baseline.js +0 -153
  31. package/dist/lib/calculate-scores.d.ts +0 -23
  32. package/dist/lib/calculate-scores.js +0 -42
  33. package/dist/lib/compare.d.ts +0 -18
  34. package/dist/lib/compare.js +0 -170
  35. package/dist/lib/coverage-audit.d.ts +0 -4
  36. package/dist/lib/coverage-audit.js +0 -42
  37. package/dist/lib/discovery-report.d.ts +0 -13
  38. package/dist/lib/discovery-report.js +0 -57
  39. package/dist/lib/fetch-docs.d.ts +0 -30
  40. package/dist/lib/fetch-docs.js +0 -171
  41. package/dist/lib/generate-configs.d.ts +0 -25
  42. package/dist/lib/generate-configs.js +0 -42
  43. package/dist/lib/grader-api.d.ts +0 -21
  44. package/dist/lib/grader-api.js +0 -34
  45. package/dist/lib/grader-compare.d.ts +0 -19
  46. package/dist/lib/grader-compare.js +0 -91
  47. package/dist/lib/grader-consistency.d.ts +0 -27
  48. package/dist/lib/grader-consistency.js +0 -79
  49. package/dist/lib/grader-sensitivity.d.ts +0 -19
  50. package/dist/lib/grader-sensitivity.js +0 -75
  51. package/dist/lib/grader-validate.d.ts +0 -19
  52. package/dist/lib/grader-validate.js +0 -78
  53. package/dist/lib/measure-retrieval.d.ts +0 -14
  54. package/dist/lib/measure-retrieval.js +0 -71
  55. package/dist/lib/pr-comment.d.ts +0 -16
  56. package/dist/lib/pr-comment.js +0 -28
  57. package/dist/lib/readiness-report.d.ts +0 -13
  58. package/dist/lib/readiness-report.js +0 -108
  59. package/dist/lib/webhook-server.d.ts +0 -11
  60. package/dist/lib/webhook-server.js +0 -24
  61. package/dist/lib/weekly-digest.d.ts +0 -24
  62. package/dist/lib/weekly-digest.js +0 -148
  63. package/dist/orchestration/env-bridge.d.ts +0 -21
  64. package/dist/orchestration/env-bridge.js +0 -66
  65. package/dist/orchestration/steps/fetch-docs-shell.d.ts +0 -17
  66. package/dist/orchestration/steps/fetch-docs-shell.js +0 -30
  67. package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +0 -9
  68. package/dist/pipeline/compiler/__tests__/task-bridge.test.js +0 -339
  69. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +0 -70
  70. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +0 -485
  71. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +0 -76
  72. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +0 -245
  73. package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +0 -89
  74. package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +0 -379
  75. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +0 -50
  76. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +0 -334
  77. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +0 -69
  78. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +0 -307
  79. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +0 -65
  80. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +0 -368
  81. package/dist/pipeline/compiler/task-bridge.d.ts +0 -41
  82. package/dist/pipeline/compiler/task-bridge.js +0 -92
  83. package/dist/pipeline/expand-tasks.d.ts +0 -232
  84. package/dist/pipeline/expand-tasks.js +0 -467
  85. package/dist/pipeline/generate-configs.d.ts +0 -92
  86. package/dist/pipeline/generate-configs.js +0 -445
  87. package/dist/pipeline/steps/calculate-scores-step.d.ts +0 -11
  88. package/dist/pipeline/steps/calculate-scores-step.js +0 -89
  89. package/dist/pipeline/steps/compare-step.d.ts +0 -18
  90. package/dist/pipeline/steps/compare-step.js +0 -90
  91. package/dist/pipeline/steps/eval-step.d.ts +0 -53
  92. package/dist/pipeline/steps/eval-step.js +0 -347
  93. package/dist/pipeline/steps/fetch-docs-step.d.ts +0 -11
  94. package/dist/pipeline/steps/fetch-docs-step.js +0 -84
  95. package/dist/pipeline/steps/generate-configs-step.d.ts +0 -11
  96. package/dist/pipeline/steps/generate-configs-step.js +0 -98
  97. package/dist/pipeline/steps/grader-consistency-step.d.ts +0 -21
  98. package/dist/pipeline/steps/grader-consistency-step.js +0 -74
  99. package/dist/pipeline/steps/publish-report-step.d.ts +0 -57
  100. package/dist/pipeline/steps/publish-report-step.js +0 -243
  101. package/dist/pipeline/steps/report-step.d.ts +0 -13
  102. package/dist/pipeline/steps/report-step.js +0 -56
  103. package/dist/pipeline/steps/update-scores-step.d.ts +0 -11
  104. package/dist/pipeline/steps/update-scores-step.js +0 -42
  105. package/dist/scripts/agent-behavior-report.d.ts +0 -19
  106. package/dist/scripts/agent-behavior-report.js +0 -315
  107. package/dist/scripts/baseline.d.ts +0 -43
  108. package/dist/scripts/baseline.js +0 -267
  109. package/dist/scripts/calculate-scores.d.ts +0 -166
  110. package/dist/scripts/calculate-scores.js +0 -1296
  111. package/dist/scripts/compare.d.ts +0 -22
  112. package/dist/scripts/compare.js +0 -334
  113. package/dist/scripts/coverage-audit.d.ts +0 -44
  114. package/dist/scripts/coverage-audit.js +0 -209
  115. package/dist/scripts/debug-eval.d.ts +0 -19
  116. package/dist/scripts/debug-eval.js +0 -73
  117. package/dist/scripts/discovery-report.d.ts +0 -58
  118. package/dist/scripts/discovery-report.js +0 -250
  119. package/dist/scripts/fetch-docs.d.ts +0 -35
  120. package/dist/scripts/fetch-docs.js +0 -472
  121. package/dist/scripts/generate-configs.d.ts +0 -66
  122. package/dist/scripts/generate-configs.js +0 -459
  123. package/dist/scripts/grader-api.d.ts +0 -27
  124. package/dist/scripts/grader-api.js +0 -206
  125. package/dist/scripts/grader-compare.d.ts +0 -22
  126. package/dist/scripts/grader-compare.js +0 -368
  127. package/dist/scripts/grader-consistency.d.ts +0 -20
  128. package/dist/scripts/grader-consistency.js +0 -313
  129. package/dist/scripts/grader-sensitivity.d.ts +0 -22
  130. package/dist/scripts/grader-sensitivity.js +0 -354
  131. package/dist/scripts/grader-validate.d.ts +0 -19
  132. package/dist/scripts/grader-validate.js +0 -267
  133. package/dist/scripts/measure-retrieval.d.ts +0 -10
  134. package/dist/scripts/measure-retrieval.js +0 -145
  135. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +0 -24
  136. package/dist/scripts/migrate-tasks-to-content-lake.js +0 -328
  137. package/dist/scripts/pipeline.d.ts +0 -76
  138. package/dist/scripts/pipeline.js +0 -1031
  139. package/dist/scripts/pr-comment.d.ts +0 -10
  140. package/dist/scripts/pr-comment.js +0 -510
  141. package/dist/scripts/readiness-report.d.ts +0 -88
  142. package/dist/scripts/readiness-report.js +0 -342
  143. package/dist/scripts/update-quality-scores.d.ts +0 -15
  144. package/dist/scripts/update-quality-scores.js +0 -184
  145. package/dist/scripts/validate-task-sources.d.ts +0 -21
  146. package/dist/scripts/validate-task-sources.js +0 -210
  147. package/dist/scripts/validate.d.ts +0 -13
  148. package/dist/scripts/validate.js +0 -79
  149. package/dist/scripts/webhook-server.d.ts +0 -26
  150. package/dist/scripts/webhook-server.js +0 -147
  151. package/dist/scripts/weekly-digest.d.ts +0 -24
  152. package/dist/scripts/weekly-digest.js +0 -144
  153. package/dist/sinks/format-slack.d.ts +0 -64
  154. package/dist/sinks/format-slack.js +0 -306
  155. package/dist/sinks/slack-sink.d.ts +0 -27
  156. package/dist/sinks/slack-sink.js +0 -78
  157. package/dist/sinks/webhook-sink.d.ts +0 -19
  158. package/dist/sinks/webhook-sink.js +0 -50
  159. package/tasks/.expanded.agentic.yaml +0 -280
  160. package/tasks/.expanded.yaml +0 -565
@@ -1,92 +0,0 @@
1
- /**
2
- * pipeline/generate-configs.ts
3
- *
4
- * @deprecated This is the LEGACY compilation path. New code should use the
5
- * config compiler pipeline instead:
6
- *
7
- * import { compileLiteracyTasks } from "./compiler/literacy-bridge.js"
8
- * import { buildTaskGraph, compileToPromptfoo } from "./compiler/index.js"
9
- *
10
- * This file is retained behind the `--legacy-compiler` CLI flag as an
11
- * emergency fallback during the migration period. It will be removed once
12
- * the new compiler has been validated in production.
13
- *
14
- * @see packages/eval/src/pipeline/compiler/ — the new compiler pipeline
15
- * @see docs/archive/exec-plans/architecture-overhaul/phase-7-migrate-literacy.md
16
- *
17
- * ---
18
- *
19
- * Reads config/models.yaml (the central model registry) and generates all
20
- * promptfoo config files with the correct provider entries.
21
- *
22
- * This keeps model definitions in one place — add a model to config/models.yaml
23
- * and run `pnpm generate-configs` to propagate it to all eval modes.
24
- *
25
- * Generated configs:
26
- * - promptfooconfig.yaml (baseline: with-docs vs without-docs)
27
- * - promptfooconfig.observed.yaml (instrumented HTTP recording)
28
- * - promptfooconfig.agentic.yaml (agentic tool-calling: naive vs optimized)
29
- *
30
- * All functions accept rootDir as a parameter — no module-level constants.
31
- * No process.argv parsing. No env var fallbacks. Callers provide typed options.
32
- *
33
- * @see config/models.yaml — the central model registry
34
- * @see docs/archive/exec-plans/eliminate-lib-layer.md
35
- */
36
- import { type LiteracyTaskDefinition, type Logger } from "../_vendor/ailf-core/index.d.ts";
37
- import type { FilterOptions } from "./types.js";
38
- import { type ResolvedSourceConfig } from "../sources.js";
39
- export { extractModelName, extractProvider, mergeConfig, modelMatchesMode, } from "../_vendor/ailf-core/index.d.ts";
40
- /** Auto-discover all task YAML files in the tasks/ directory. */
41
- export declare function discoverTaskFiles(rootDir: string): string[];
42
- interface LoadedPrompts {
43
- agentic: {
44
- id: string;
45
- label: string;
46
- raw: string;
47
- };
48
- withDocs: {
49
- id: string;
50
- label: string;
51
- raw: string;
52
- };
53
- withoutDocs: {
54
- id: string;
55
- label: string;
56
- raw: string;
57
- };
58
- }
59
- /**
60
- * Load prompt templates. Uses handler-owned literacy templates as defaults,
61
- * with config/prompts.ts as an override layer for user customization.
62
- */
63
- export declare function loadPrompts(rootDir: string): LoadedPrompts;
64
- /** Options for the generateConfigs function. */
65
- export interface GenerateConfigsOptions {
66
- /** Allowed origins for agentic mode (controls source isolation assertion) */
67
- allowedOrigins?: string[];
68
- /** Filter to specific feature areas or task IDs */
69
- filter?: FilterOptions;
70
- /** Logger instance (defaults to ConsoleLogger) */
71
- logger?: Logger;
72
- /** Pre-resolved source config (skips loadSource() call) */
73
- resolvedSource?: ResolvedSourceConfig;
74
- /** Root directory of the eval package (required) */
75
- rootDir: string;
76
- /** Search mode for agentic mode */
77
- searchMode?: string;
78
- /** Documentation source name (e.g., "branch", "local") */
79
- source?: string;
80
- /** Pre-loaded literacy task definitions from a TaskSource adapter.
81
- * When provided, expandTaskDefinitions() is used instead of
82
- * loadAndExpandTasks() (which reads from tasks/*.yaml files). */
83
- tasks?: LiteracyTaskDefinition[];
84
- }
85
- /**
86
- * Generate Promptfoo config files from models.yaml + task definitions.
87
- *
88
- * All parameters are passed via the typed options object — no process.argv
89
- * parsing or env var fallbacks. Callers (command handlers, orchestration
90
- * steps) are responsible for resolving options from their own context.
91
- */
92
- export declare function generateConfigs(options: GenerateConfigsOptions): void;
@@ -1,445 +0,0 @@
1
- /**
2
- * pipeline/generate-configs.ts
3
- *
4
- * @deprecated This is the LEGACY compilation path. New code should use the
5
- * config compiler pipeline instead:
6
- *
7
- * import { compileLiteracyTasks } from "./compiler/literacy-bridge.js"
8
- * import { buildTaskGraph, compileToPromptfoo } from "./compiler/index.js"
9
- *
10
- * This file is retained behind the `--legacy-compiler` CLI flag as an
11
- * emergency fallback during the migration period. It will be removed once
12
- * the new compiler has been validated in production.
13
- *
14
- * @see packages/eval/src/pipeline/compiler/ — the new compiler pipeline
15
- * @see docs/archive/exec-plans/architecture-overhaul/phase-7-migrate-literacy.md
16
- *
17
- * ---
18
- *
19
- * Reads config/models.yaml (the central model registry) and generates all
20
- * promptfoo config files with the correct provider entries.
21
- *
22
- * This keeps model definitions in one place — add a model to config/models.yaml
23
- * and run `pnpm generate-configs` to propagate it to all eval modes.
24
- *
25
- * Generated configs:
26
- * - promptfooconfig.yaml (baseline: with-docs vs without-docs)
27
- * - promptfooconfig.observed.yaml (instrumented HTTP recording)
28
- * - promptfooconfig.agentic.yaml (agentic tool-calling: naive vs optimized)
29
- *
30
- * All functions accept rootDir as a parameter — no module-level constants.
31
- * No process.argv parsing. No env var fallbacks. Callers provide typed options.
32
- *
33
- * @see config/models.yaml — the central model registry
34
- * @see docs/archive/exec-plans/eliminate-lib-layer.md
35
- */
36
- import { extractModelName, extractProvider, mergeConfig, } from "../_vendor/ailf-core/index.js";
37
- import { existsSync, readdirSync, writeFileSync } from "fs";
38
- import { resolve } from "path";
39
- import { dump } from "js-yaml";
40
- import { ConsoleLogger } from "../adapters/loggers/index.js";
41
- import { loadConfigFile } from "./compiler/config-loader.js";
42
- import { modelMatchesLiteracyVariant } from "./compiler/mode-bases/literacy.js";
43
- import { LITERACY_PROMPT_TEMPLATES } from "./compiler/mode-handlers/literacy/index.js";
44
- import { expandTaskDefinitions, loadAndExpandTasks } from "./expand-tasks.js";
45
- import { validateModelsYaml } from "./validate.js";
46
- import { LiteracyVariant } from "./normalize-mode.js";
47
- import { loadSource } from "../sources.js";
48
- // Re-export pure functions from core for backward compatibility.
49
- // Tests and other modules that previously imported from lib/generate-configs
50
- // can import from pipeline/generate-configs instead.
51
- export { extractModelName, extractProvider, mergeConfig, modelMatchesMode, } from "../_vendor/ailf-core/index.js";
52
- // ---------------------------------------------------------------------------
53
- // Helpers
54
- // ---------------------------------------------------------------------------
55
- /** Auto-discover all task YAML files in the tasks/ directory. */
56
- export function discoverTaskFiles(rootDir) {
57
- const tasksDir = resolve(rootDir, "tasks");
58
- if (!existsSync(tasksDir)) {
59
- return []; // tasks may come from Content Lake instead
60
- }
61
- return readdirSync(tasksDir)
62
- .filter((f) => (f.endsWith(".yaml") || f.endsWith(".yml")) && !f.startsWith("."))
63
- .sort()
64
- .map((f) => `file://tasks/${f}`);
65
- }
66
- /**
67
- * Load prompt templates. Uses handler-owned literacy templates as defaults,
68
- * with config/prompts.ts as an override layer for user customization.
69
- */
70
- export function loadPrompts(rootDir) {
71
- const toPrompt = (entry) => ({
72
- id: entry.id,
73
- label: entry.label,
74
- raw: entry.template,
75
- });
76
- // Load user overrides from config/prompts (may be empty after Wave 4)
77
- let overrides = {};
78
- try {
79
- const loaded = loadConfigFile("prompts", rootDir).data;
80
- // config/prompts.ts may export a Record (legacy) or an empty array (post-Wave 4)
81
- if (loaded && !Array.isArray(loaded)) {
82
- overrides = loaded;
83
- }
84
- }
85
- catch {
86
- // No config/prompts file — use handler defaults only
87
- }
88
- // Handler-owned templates are the canonical source; overrides take precedence
89
- return {
90
- agentic: overrides[LiteracyVariant.AGENTIC]
91
- ? toPrompt(overrides[LiteracyVariant.AGENTIC])
92
- : toPrompt(LITERACY_PROMPT_TEMPLATES[LiteracyVariant.AGENTIC]),
93
- withDocs: overrides["with-docs"]
94
- ? toPrompt(overrides["with-docs"])
95
- : toPrompt(LITERACY_PROMPT_TEMPLATES["with-docs"]),
96
- withoutDocs: overrides["without-docs"]
97
- ? toPrompt(overrides["without-docs"])
98
- : toPrompt(LITERACY_PROMPT_TEMPLATES["without-docs"]),
99
- };
100
- }
101
- function loadModels(rootDir) {
102
- return loadConfigFile("models", rootDir).data;
103
- }
104
- // ---------------------------------------------------------------------------
105
- // Shared components
106
- // ---------------------------------------------------------------------------
107
- const URL_EXTRACTION_ASSERT = {
108
- type: "javascript",
109
- value: `const urlPattern = /https?:\\/\\/[^\\s\\)\\"\\'\\\`>]+/g;
110
- const urls = [...new Set((output.match(urlPattern) || []))];
111
- const sanityUrls = urls.filter(u => u.includes('sanity.io'));
112
- return {
113
- pass: true,
114
- score: 1,
115
- reason: JSON.stringify({
116
- sanityUrls,
117
- otherUrls: urls.filter(u => !u.includes('sanity.io')),
118
- totalUrlCount: urls.length,
119
- sanityUrlCount: sanityUrls.length
120
- })
121
- };`,
122
- weight: 0,
123
- };
124
- /**
125
- * Source isolation assertion — advisory (weight: 0).
126
- * Verifies that the agentic provider only fetched docs from allowed origins.
127
- * Only injected when origin sandboxing is active.
128
- */
129
- const SOURCE_ISOLATION_ASSERT = {
130
- metadata: { dimension: "source-isolation" },
131
- type: "javascript",
132
- value: "file://dist/assertions/source-isolation.js",
133
- weight: 0,
134
- };
135
- // ---------------------------------------------------------------------------
136
- // Config generators
137
- // ---------------------------------------------------------------------------
138
- function generateAgenticConfig(models, tests, prompts, source, searchMode, allowedOrigins) {
139
- const naiveModels = models.models.filter((m) => modelMatchesLiteracyVariant(m, "agentic-naive"));
140
- const optimizedModels = models.models.filter((m) => modelMatchesLiteracyVariant(m, "agentic-optimized"));
141
- const providers = [];
142
- // Build doc source config to inject into providers
143
- const resolvedSearchMode = searchMode ?? "open";
144
- const sourceConfig = source
145
- ? {
146
- ...(source.allowedOrigins?.length
147
- ? { allowedOrigins: source.allowedOrigins }
148
- : {}),
149
- docBaseUrl: source.baseUrl,
150
- ...(source.headers && Object.keys(source.headers).length > 0
151
- ? { customHeaders: source.headers }
152
- : {}),
153
- llmsTxtUrl: source.llmsTxt,
154
- ...(source.priorityDomain
155
- ? { priorityDomain: source.priorityDomain }
156
- : {}),
157
- // Tool access control: search mode for web_search behavior
158
- ...(resolvedSearchMode !== "open"
159
- ? { searchMode: resolvedSearchMode }
160
- : {}),
161
- }
162
- : {};
163
- for (const model of naiveModels) {
164
- const modelName = extractModelName(model.id);
165
- const provider = extractProvider(model.id);
166
- providers.push({
167
- config: {
168
- ...mergeConfig(models.defaults, model.config, {
169
- agentMode: "naive",
170
- maxToolRounds: models.defaults.maxToolRounds ?? 5,
171
- model: modelName,
172
- provider,
173
- }),
174
- ...(model.timeoutMs ? { timeoutMs: model.timeoutMs } : {}),
175
- ...sourceConfig,
176
- observe: true,
177
- observerOptions: models.defaults.observerOptions ?? {},
178
- },
179
- id: "file://dist/agent-observer/agentic-provider.js",
180
- label: `${model.label} (Naive Agent)`,
181
- });
182
- }
183
- for (const model of optimizedModels) {
184
- const modelName = extractModelName(model.id);
185
- const provider = extractProvider(model.id);
186
- providers.push({
187
- config: {
188
- ...mergeConfig(models.defaults, model.config, {
189
- agentMode: "optimized",
190
- maxToolRounds: models.defaults.maxToolRounds ?? 5,
191
- model: modelName,
192
- provider,
193
- }),
194
- ...(model.timeoutMs ? { timeoutMs: model.timeoutMs } : {}),
195
- ...sourceConfig,
196
- observe: true,
197
- observerOptions: models.defaults.observerOptions ?? {},
198
- },
199
- id: "file://dist/agent-observer/agentic-provider.js",
200
- label: `${model.label} (Optimized Agent)`,
201
- });
202
- }
203
- // Inject source isolation assertion when origin sandboxing is active
204
- const hasOriginSandbox = Boolean(allowedOrigins?.length);
205
- const agenticAssertions = hasOriginSandbox ? [SOURCE_ISOLATION_ASSERT] : [];
206
- return {
207
- commandLineOptions: { table: false },
208
- defaultTest: {
209
- ...(agenticAssertions.length > 0 ? { assert: agenticAssertions } : {}),
210
- options: {
211
- provider: models.grader.id,
212
- rubricProvider: models.grader.id,
213
- },
214
- },
215
- description: "Sanity AI Literacy Evaluation — Agentic (naive vs optimized)",
216
- ...(models.maxConcurrency
217
- ? { evaluateOptions: { maxConcurrency: models.maxConcurrency } }
218
- : {}),
219
- outputPath: "results/latest/eval-results-agentic.json",
220
- prompts: [prompts.agentic],
221
- providers,
222
- tests,
223
- };
224
- }
225
- function generateBaselineConfig(models, tests, prompts) {
226
- const baselineModels = models.models.filter((m) => modelMatchesLiteracyVariant(m, "baseline"));
227
- const providers = baselineModels.map((model) => ({
228
- config: {
229
- ...mergeConfig(models.defaults, model.config),
230
- ...(model.timeoutMs ? { timeoutMs: model.timeoutMs } : {}),
231
- },
232
- id: model.id,
233
- label: model.label,
234
- }));
235
- return {
236
- commandLineOptions: { table: false },
237
- defaultTest: {
238
- assert: [URL_EXTRACTION_ASSERT],
239
- options: {
240
- provider: models.grader.id,
241
- rubricProvider: models.grader.id,
242
- },
243
- },
244
- description: "Sanity AI Literacy Evaluation — Baseline",
245
- ...(models.maxConcurrency
246
- ? { evaluateOptions: { maxConcurrency: models.maxConcurrency } }
247
- : {}),
248
- outputPath: "results/latest/eval-results.json",
249
- prompts: [prompts.withDocs, prompts.withoutDocs],
250
- providers,
251
- tests,
252
- };
253
- }
254
- function generateObservedConfig(models, tests, prompts) {
255
- const observedModels = models.models.filter((m) => modelMatchesLiteracyVariant(m, "observed"));
256
- const providers = observedModels.map((model) => {
257
- const modelName = extractModelName(model.id);
258
- return {
259
- config: {
260
- ...mergeConfig(models.defaults, model.config),
261
- ...(model.timeoutMs ? { timeoutMs: model.timeoutMs } : {}),
262
- modelName,
263
- observe: true,
264
- recordOptions: models.defaults.observerOptions ?? {},
265
- },
266
- id: "file://dist/agent-observer/provider.js",
267
- label: `${model.label} (Observed)`,
268
- };
269
- });
270
- return {
271
- commandLineOptions: { table: false },
272
- defaultTest: {
273
- options: {
274
- provider: models.grader.id,
275
- rubricProvider: models.grader.id,
276
- },
277
- },
278
- description: "Sanity AI Literacy Evaluation — Observed",
279
- ...(models.maxConcurrency
280
- ? { evaluateOptions: { maxConcurrency: models.maxConcurrency } }
281
- : {}),
282
- outputPath: "results/latest/eval-results-observed.json",
283
- prompts: [prompts.withDocs, prompts.withoutDocs],
284
- providers,
285
- tests,
286
- };
287
- }
288
- // ---------------------------------------------------------------------------
289
- // Main entry point
290
- // ---------------------------------------------------------------------------
291
- /**
292
- * Generate Promptfoo config files from models.yaml + task definitions.
293
- *
294
- * All parameters are passed via the typed options object — no process.argv
295
- * parsing or env var fallbacks. Callers (command handlers, orchestration
296
- * steps) are responsible for resolving options from their own context.
297
- */
298
- export function generateConfigs(options) {
299
- const { rootDir } = options;
300
- const log = options.logger ?? new ConsoleLogger();
301
- // Validate config/models.yaml before generating configs
302
- const modelIssues = validateModelsYaml(rootDir);
303
- const modelErrors = modelIssues.filter((i) => i.severity === "error");
304
- if (modelErrors.length > 0) {
305
- const details = modelErrors
306
- .map((e) => (e.path ? `${e.message} (at ${e.path})` : e.message))
307
- .join("; ");
308
- throw new Error(`config/models.yaml validation failed: ${details}. Run 'pnpm validate' for details.`);
309
- }
310
- log.info("Loading config/models.yaml...");
311
- const models = loadModels(rootDir);
312
- const activeModels = models.models.filter((m) => m.id && m.label);
313
- log.debug("Models loaded from config/models.yaml", {
314
- totalModels: models.models.length,
315
- activeModels: activeModels.length,
316
- modelIds: activeModels.map((m) => m.id),
317
- graderId: models.grader.id,
318
- maxConcurrency: models.maxConcurrency,
319
- });
320
- log.info(` Found ${activeModels.length} active model(s):`);
321
- for (const m of activeModels) {
322
- // oxlint-disable-next-line @typescript-eslint/prefer-nullish-coalescing -- empty array join → "all"
323
- const modes = m.modes?.join(", ") || "all";
324
- log.info(` - ${m.label} (${m.id}) → [${modes}]`);
325
- }
326
- // oxlint-disable-next-line @typescript-eslint/prefer-nullish-coalescing -- empty label falls through to id
327
- log.info(` Grader: ${models.grader.label || models.grader.id}`);
328
- // Build filter from options
329
- const filter = options.filter?.areas || options.filter?.taskIds
330
- ? options.filter
331
- : undefined;
332
- // Expand tasks — use GeneralizedTaskDefinition[] from TaskSource when provided,
333
- // otherwise fall back to loading from tasks/*.yaml files.
334
- let entries;
335
- let agenticEntries;
336
- if (options.tasks) {
337
- // TaskSource path — tasks already loaded and filtered by the adapter
338
- log.debug("Expanding tasks from TaskSource adapter", {
339
- taskCount: options.tasks.length,
340
- taskIds: options.tasks.map((t) => t.id),
341
- });
342
- const baselineResult = expandTaskDefinitions(options.tasks, rootDir, LiteracyVariant.STANDARD);
343
- entries = baselineResult.entries;
344
- log.info(` Expanded ${baselineResult.stats.totalTasks} task(s) → ${baselineResult.stats.expandedTotal} test entries (from TaskSource)`);
345
- const agenticResult = expandTaskDefinitions(options.tasks, rootDir, LiteracyVariant.AGENTIC);
346
- agenticEntries = agenticResult.entries;
347
- log.info(` Agentic: ${agenticResult.stats.expandedTotal} entries (gold only, no baseline)`);
348
- }
349
- else {
350
- // Legacy path — read from tasks/*.yaml files
351
- const { entries: baselineEntries, stats } = loadAndExpandTasks(rootDir, filter, LiteracyVariant.STANDARD, log);
352
- entries = baselineEntries;
353
- log.info(` Expanded ${stats.singleDefinitions} task(s) → ${stats.expandedTotal} test entries`);
354
- if (stats.legacyEntries > 0) {
355
- log.info(` ⚠ ${stats.legacyEntries} legacy (paired) entries passed through unchanged`);
356
- }
357
- if (filter) {
358
- const parts = [];
359
- if (filter.areas) {
360
- parts.push(`areas: ${filter.areas.join(", ")}`);
361
- }
362
- if (filter.taskIds) {
363
- parts.push(`tasks: ${filter.taskIds.join(", ")}`);
364
- }
365
- log.info(` Scoped to: ${parts.join("; ")}`);
366
- }
367
- const { entries: agenticFromYaml, stats: agenticStats } = loadAndExpandTasks(rootDir, filter, LiteracyVariant.AGENTIC, log);
368
- agenticEntries = agenticFromYaml;
369
- log.info(` Agentic: ${agenticStats.expandedTotal} entries (gold only, no baseline)`);
370
- }
371
- // Write expanded tasks to generated files for Promptfoo to consume
372
- const expandedPath = resolve(rootDir, "tasks", ".expanded.yaml");
373
- const expandedYaml = dump(entries, {
374
- forceQuotes: false,
375
- lineWidth: 120,
376
- noRefs: true,
377
- quotingType: "'",
378
- });
379
- writeFileSync(expandedPath, `# .expanded.yaml\n#\n# AUTO-GENERATED — do not edit directly.\n# Source: tasks/*.yaml (single-definition format)\n# Run: pnpm generate-configs\n\n${expandedYaml}`, "utf-8");
380
- log.info(` ✓ tasks/.expanded.yaml (${entries.length} entries)`);
381
- const agenticExpandedPath = resolve(rootDir, "tasks", ".expanded.agentic.yaml");
382
- const agenticExpandedYaml = dump(agenticEntries, {
383
- forceQuotes: false,
384
- lineWidth: 120,
385
- noRefs: true,
386
- quotingType: "'",
387
- });
388
- writeFileSync(agenticExpandedPath, `# .expanded.agentic.yaml\n#\n# AUTO-GENERATED — do not edit directly.\n# Gold entries only (no baseline) for agentic evaluation mode.\n# Source: tasks/*.yaml (single-definition format)\n# Run: pnpm generate-configs\n\n${agenticExpandedYaml}`, "utf-8");
389
- log.info(` ✓ tasks/.expanded.agentic.yaml (${agenticEntries.length} entries)`);
390
- const taskFiles = ["file://tasks/.expanded.yaml"];
391
- const agenticTaskFiles = ["file://tasks/.expanded.agentic.yaml"];
392
- // Load prompt templates
393
- const prompts = loadPrompts(rootDir);
394
- log.debug("Prompt templates loaded", {
395
- keys: Object.keys(prompts),
396
- withDocsId: prompts.withDocs.id,
397
- withoutDocsId: prompts.withoutDocs.id,
398
- agenticId: prompts.agentic.id,
399
- });
400
- log.info(` Loaded prompts: ${Object.keys(prompts).join(", ")}`);
401
- // Load optional documentation source configuration
402
- // Pre-resolved source wins over name-based lookup
403
- let source = options.resolvedSource;
404
- const sourceName = options.source;
405
- if (!source && sourceName) {
406
- log.info(`\nLoading source: ${sourceName}`);
407
- try {
408
- source = loadSource(sourceName);
409
- }
410
- catch (err) {
411
- const msg = err instanceof Error ? err.message : String(err);
412
- log.warn(`\n⚠ Failed to load source "${sourceName}": ${msg}`);
413
- }
414
- }
415
- if (source) {
416
- log.info(` Base URL: ${source.baseUrl}`);
417
- log.info(` Dataset: ${source.dataset}`);
418
- if (source.allowedOrigins?.length) {
419
- log.info(` Allowed origins: ${source.allowedOrigins.join(", ")}`);
420
- }
421
- }
422
- log.info("\nGenerating configs...");
423
- writeConfig(rootDir, "promptfooconfig.yaml", generateBaselineConfig(models, taskFiles, prompts), `# promptfooconfig.yaml\n#\n# AUTO-GENERATED from config/models.yaml — do not edit directly.\n# Run: pnpm generate-configs\n`, log);
424
- writeConfig(rootDir, "promptfooconfig.observed.yaml", generateObservedConfig(models, taskFiles, prompts), `# promptfooconfig.observed.yaml\n#\n# AUTO-GENERATED from config/models.yaml — do not edit directly.\n# Run: pnpm generate-configs\n`, log);
425
- writeConfig(rootDir, "promptfooconfig.agentic.yaml", generateAgenticConfig(models, agenticTaskFiles, prompts, source, options.searchMode, options.allowedOrigins), `# promptfooconfig.agentic.yaml\n#\n# AUTO-GENERATED from config/models.yaml — do not edit directly.\n# Run: pnpm generate-configs\n`, log);
426
- log.info("\nDone! Configs are ready.");
427
- if (source) {
428
- log.info(` (using doc source: ${sourceName})`);
429
- }
430
- }
431
- // ---------------------------------------------------------------------------
432
- // File writing
433
- // ---------------------------------------------------------------------------
434
- function writeConfig(rootDir, filename, config, header, log) {
435
- const yamlStr = dump(config, {
436
- forceQuotes: false,
437
- lineWidth: 120,
438
- noRefs: true,
439
- quotingType: "'",
440
- });
441
- const content = `${header}\n${yamlStr}`;
442
- const outPath = resolve(rootDir, filename);
443
- writeFileSync(outPath, content, "utf-8");
444
- log.info(` ✓ ${filename}`);
445
- }
@@ -1,11 +0,0 @@
1
- /**
2
- * Pipeline step: Calculate AI Literacy Scores from eval results.
3
- *
4
- * Preconditions: eval-results.json exists and is valid
5
- * Postconditions: score-summary.json exists and is valid
6
- *
7
- * Cache key: eval results JSON file(s)
8
- * Cache outputs: results/latest/score-summary.json
9
- */
10
- import type { EvalMode, StepResult } from "../types.js";
11
- export declare function runCalculateScores(source?: string, mode?: EvalMode, noCache?: boolean): StepResult;
@@ -1,89 +0,0 @@
1
- /**
2
- * Pipeline step: Calculate AI Literacy Scores from eval results.
3
- *
4
- * Preconditions: eval-results.json exists and is valid
5
- * Postconditions: score-summary.json exists and is valid
6
- *
7
- * Cache key: eval results JSON file(s)
8
- * Cache outputs: results/latest/score-summary.json
9
- */
10
- import { execSync } from "child_process";
11
- import { dirname, resolve } from "path";
12
- import { fileURLToPath } from "url";
13
- import { getStepInputPaths, hashFiles, lookupCache, recordCache, } from "../cache.js";
14
- import { checkResultsExist, checkScoreSummaryValid } from "../checks.js";
15
- import { RESULTS_FILES } from "./eval-step.js";
16
- const __dirname = dirname(fileURLToPath(import.meta.url));
17
- const ROOT = resolve(__dirname, "..", "..", "..");
18
/**
 * Pipeline step: calculate AI Literacy Scores from eval results.
 *
 * Steps, in order:
 *   1. Precondition — the results file for the mode must exist
 *      (`checkResultsExist`); any "error"-severity issue fails the step.
 *   2. Cache lookup under the "calculate-scores" key (skipped when `noCache`).
 *   3. Run `tsx src/lib/calculate-scores.ts` with ROOT as the working directory.
 *   4. Postcondition — the score summary must validate (`checkScoreSummaryValid`).
 *   5. Record a cache entry for `results/latest/score-summary.json`
 *      (skipped when `noCache`).
 *
 * @param source  Optional value forwarded to the script as `--source <source>`.
 * @param mode    Eval mode. "full" uses the baseline results file as the
 *                primary input (the script reads the other results files itself).
 * @param noCache When true, the step cache is neither consulted nor updated.
 * @returns An object with `durationMs` and `status`; `summary` on success,
 *          `error` on failure.
 */
export function runCalculateScores(source, mode = "baseline", noCache = false) {
    const start = Date.now();
    // Single place that builds the failure result so every exit path has the same shape.
    const fail = (error) => ({
        durationMs: Date.now() - start,
        error,
        status: "failed",
    });
    // For full mode, use the baseline results file as the primary input
    // (calculate-scores reads all available results files internally)
    const primaryMode = mode === "full" ? "baseline" : mode;
    const resultsFile = RESULTS_FILES[primaryMode];
    const resultsErrors = checkResultsExist(ROOT, resultsFile).filter((i) => i.severity === "error");
    if (resultsErrors.length > 0) {
        return fail(`Results missing: ${resultsErrors.map((e) => e.message).join("; ")}`);
    }
    // Cache check
    if (!noCache) {
        const cacheResult = lookupCache(ROOT, "calculate-scores");
        if (cacheResult.hit) {
            return {
                durationMs: Date.now() - start,
                status: "success",
                summary: `Skipped (cached) — ${cacheResult.entry.summary}`,
            };
        }
    }
    // Execute — note: calculate-scores exits 1 when areas are below critical,
    // which is expected behavior, not an error
    try {
        const sourceArg = source ? ` --source ${source}` : "";
        const resultsArg = primaryMode !== "baseline" ? ` ${resultsFile}` : "";
        // NOTE(review): `source` and `resultsFile` are interpolated unquoted into a
        // shell command line; values containing whitespace or shell metacharacters
        // will not be passed through as a single argument — confirm callers only
        // supply plain identifiers/paths.
        execSync(`tsx src/lib/calculate-scores.ts${resultsArg}${sourceArg}`, {
            cwd: ROOT,
            env: process.env,
            stdio: "inherit",
        });
    }
    catch (err) {
        const failure = describeCalculateScoresFailure(err);
        if (failure !== null) {
            return fail(failure);
        }
    }
    // Postcondition: score summary exists and is valid
    const summaryErrors = checkScoreSummaryValid(ROOT).filter((i) => i.severity === "error");
    if (summaryErrors.length > 0) {
        return fail(`Postcondition failed: ${summaryErrors.map((e) => e.message).join("; ")}`);
    }
    const durationMs = Date.now() - start;
    const summary = "Scores calculated and summary written";
    // Record cache
    if (!noCache) {
        const inputPaths = getStepInputPaths(ROOT, "calculate-scores");
        const inputHash = hashFiles(inputPaths);
        recordCache(ROOT, "calculate-scores", inputHash, summary, durationMs, [
            "results/latest/score-summary.json",
        ]);
    }
    return { durationMs, status: "success", summary };
}
/**
 * Map an error thrown by `execSync` to a failure message, or `null` when the
 * error is the expected "exit code 1" outcome (areas below critical).
 *
 * Only a literal status of 1 is treated as benign. An error that carries no
 * numeric exit status (child killed by a signal, or the process could not be
 * started) is reported as a failure instead of being assumed to be exit code 1.
 */
function describeCalculateScoresFailure(err) {
    const details = err !== null && typeof err === "object" ? err : {};
    const status = details.status;
    if (status === 1) {
        return null;
    }
    if (typeof status === "number") {
        return `calculate-scores failed with exit code ${status}`;
    }
    if (typeof details.signal === "string") {
        return `calculate-scores terminated by signal ${details.signal}`;
    }
    const reason = err instanceof Error ? err.message : String(err);
    return `calculate-scores failed to run: ${reason}`;
}
@@ -1,18 +0,0 @@
1
- /**
2
- * Pipeline step: Compare current scores against a baseline.
3
- *
4
- * Preconditions: score-summary.json exists
5
- * Postconditions: comparison-report.json written to results/latest/
6
- *
7
- * This step is optional — it only runs when --compare is passed
8
- * (or a baseline exists and auto-compare is enabled).
9
- */
10
- import type { CompareOptions, StepResult } from "../types.js";
11
- /**
12
- * Run comparison against a baseline.
13
- *
14
- * @param rootDir Package root directory
15
- * @param baselinePath Explicit baseline file path (optional — uses latest if omitted)
16
- * @param options Compare options (noise threshold, etc.)
17
- */
18
- export declare function runCompare(rootDir: string, baselinePath?: string, options?: CompareOptions): StepResult;