@sanity/ailf 2.0.2 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (173) hide show
  1. package/LICENSE +21 -0
  2. package/dist/_vendor/ailf-core/examples/index.d.ts +50 -1
  3. package/dist/_vendor/ailf-core/examples/index.js +66 -1
  4. package/dist/agent-harness/assertions-runtime.d.ts +49 -0
  5. package/dist/agent-harness/assertions-runtime.js +138 -0
  6. package/dist/agent-harness/provider.d.ts +58 -0
  7. package/dist/agent-harness/provider.js +104 -0
  8. package/dist/cli.js +0 -0
  9. package/dist/commands/init.js +3 -0
  10. package/dist/orchestration/steps/generate-configs-step.d.ts +7 -0
  11. package/dist/orchestration/steps/generate-configs-step.js +35 -2
  12. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +39 -25
  13. package/dist/pipeline/compiler/compiler-to-yaml.js +78 -7
  14. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +9 -0
  15. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +28 -85
  16. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +22 -15
  17. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +8 -1
  18. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +42 -12
  19. package/package.json +25 -24
  20. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +0 -10
  21. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +0 -185
  22. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +0 -6
  23. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +0 -42
  24. package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
  25. package/dist/_vendor/ailf-tasks/cli.js +0 -61
  26. package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
  27. package/dist/_vendor/ailf-tasks/index.js +0 -16
  28. package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
  29. package/dist/_vendor/ailf-tasks/parser.js +0 -73
  30. package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
  31. package/dist/_vendor/ailf-tasks/schemas.js +0 -180
  32. package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
  33. package/dist/_vendor/ailf-tasks/validation.js +0 -162
  34. package/dist/adapters/task-sources/yaml-task-source.d.ts +0 -18
  35. package/dist/adapters/task-sources/yaml-task-source.js +0 -139
  36. package/dist/agent-observer/test-imports.d.ts +0 -7
  37. package/dist/agent-observer/test-imports.js +0 -185
  38. package/dist/commands/update-quality-scores.d.ts +0 -5
  39. package/dist/commands/update-quality-scores.js +0 -20
  40. package/dist/lib/agent-behavior-report.d.ts +0 -8
  41. package/dist/lib/agent-behavior-report.js +0 -185
  42. package/dist/lib/baseline.d.ts +0 -19
  43. package/dist/lib/baseline.js +0 -153
  44. package/dist/lib/calculate-scores.d.ts +0 -23
  45. package/dist/lib/calculate-scores.js +0 -42
  46. package/dist/lib/compare.d.ts +0 -18
  47. package/dist/lib/compare.js +0 -170
  48. package/dist/lib/coverage-audit.d.ts +0 -4
  49. package/dist/lib/coverage-audit.js +0 -42
  50. package/dist/lib/discovery-report.d.ts +0 -13
  51. package/dist/lib/discovery-report.js +0 -57
  52. package/dist/lib/fetch-docs.d.ts +0 -30
  53. package/dist/lib/fetch-docs.js +0 -171
  54. package/dist/lib/generate-configs.d.ts +0 -25
  55. package/dist/lib/generate-configs.js +0 -42
  56. package/dist/lib/grader-api.d.ts +0 -21
  57. package/dist/lib/grader-api.js +0 -34
  58. package/dist/lib/grader-compare.d.ts +0 -19
  59. package/dist/lib/grader-compare.js +0 -91
  60. package/dist/lib/grader-consistency.d.ts +0 -27
  61. package/dist/lib/grader-consistency.js +0 -79
  62. package/dist/lib/grader-sensitivity.d.ts +0 -19
  63. package/dist/lib/grader-sensitivity.js +0 -75
  64. package/dist/lib/grader-validate.d.ts +0 -19
  65. package/dist/lib/grader-validate.js +0 -78
  66. package/dist/lib/measure-retrieval.d.ts +0 -14
  67. package/dist/lib/measure-retrieval.js +0 -71
  68. package/dist/lib/pr-comment.d.ts +0 -16
  69. package/dist/lib/pr-comment.js +0 -28
  70. package/dist/lib/readiness-report.d.ts +0 -13
  71. package/dist/lib/readiness-report.js +0 -108
  72. package/dist/lib/webhook-server.d.ts +0 -11
  73. package/dist/lib/webhook-server.js +0 -24
  74. package/dist/lib/weekly-digest.d.ts +0 -24
  75. package/dist/lib/weekly-digest.js +0 -148
  76. package/dist/orchestration/env-bridge.d.ts +0 -21
  77. package/dist/orchestration/env-bridge.js +0 -66
  78. package/dist/orchestration/steps/fetch-docs-shell.d.ts +0 -17
  79. package/dist/orchestration/steps/fetch-docs-shell.js +0 -30
  80. package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +0 -9
  81. package/dist/pipeline/compiler/__tests__/task-bridge.test.js +0 -339
  82. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +0 -70
  83. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +0 -485
  84. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +0 -76
  85. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +0 -245
  86. package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +0 -89
  87. package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +0 -379
  88. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +0 -50
  89. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +0 -334
  90. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +0 -69
  91. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +0 -307
  92. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +0 -65
  93. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +0 -368
  94. package/dist/pipeline/compiler/task-bridge.d.ts +0 -41
  95. package/dist/pipeline/compiler/task-bridge.js +0 -92
  96. package/dist/pipeline/expand-tasks.d.ts +0 -232
  97. package/dist/pipeline/expand-tasks.js +0 -467
  98. package/dist/pipeline/generate-configs.d.ts +0 -92
  99. package/dist/pipeline/generate-configs.js +0 -445
  100. package/dist/pipeline/steps/calculate-scores-step.d.ts +0 -11
  101. package/dist/pipeline/steps/calculate-scores-step.js +0 -89
  102. package/dist/pipeline/steps/compare-step.d.ts +0 -18
  103. package/dist/pipeline/steps/compare-step.js +0 -90
  104. package/dist/pipeline/steps/eval-step.d.ts +0 -53
  105. package/dist/pipeline/steps/eval-step.js +0 -347
  106. package/dist/pipeline/steps/fetch-docs-step.d.ts +0 -11
  107. package/dist/pipeline/steps/fetch-docs-step.js +0 -84
  108. package/dist/pipeline/steps/generate-configs-step.d.ts +0 -11
  109. package/dist/pipeline/steps/generate-configs-step.js +0 -98
  110. package/dist/pipeline/steps/grader-consistency-step.d.ts +0 -21
  111. package/dist/pipeline/steps/grader-consistency-step.js +0 -74
  112. package/dist/pipeline/steps/publish-report-step.d.ts +0 -57
  113. package/dist/pipeline/steps/publish-report-step.js +0 -243
  114. package/dist/pipeline/steps/report-step.d.ts +0 -13
  115. package/dist/pipeline/steps/report-step.js +0 -56
  116. package/dist/pipeline/steps/update-scores-step.d.ts +0 -11
  117. package/dist/pipeline/steps/update-scores-step.js +0 -42
  118. package/dist/scripts/agent-behavior-report.d.ts +0 -19
  119. package/dist/scripts/agent-behavior-report.js +0 -315
  120. package/dist/scripts/baseline.d.ts +0 -43
  121. package/dist/scripts/baseline.js +0 -267
  122. package/dist/scripts/calculate-scores.d.ts +0 -166
  123. package/dist/scripts/calculate-scores.js +0 -1296
  124. package/dist/scripts/compare.d.ts +0 -22
  125. package/dist/scripts/compare.js +0 -334
  126. package/dist/scripts/coverage-audit.d.ts +0 -44
  127. package/dist/scripts/coverage-audit.js +0 -209
  128. package/dist/scripts/debug-eval.d.ts +0 -19
  129. package/dist/scripts/debug-eval.js +0 -73
  130. package/dist/scripts/discovery-report.d.ts +0 -58
  131. package/dist/scripts/discovery-report.js +0 -250
  132. package/dist/scripts/fetch-docs.d.ts +0 -35
  133. package/dist/scripts/fetch-docs.js +0 -472
  134. package/dist/scripts/generate-configs.d.ts +0 -66
  135. package/dist/scripts/generate-configs.js +0 -459
  136. package/dist/scripts/grader-api.d.ts +0 -27
  137. package/dist/scripts/grader-api.js +0 -206
  138. package/dist/scripts/grader-compare.d.ts +0 -22
  139. package/dist/scripts/grader-compare.js +0 -368
  140. package/dist/scripts/grader-consistency.d.ts +0 -20
  141. package/dist/scripts/grader-consistency.js +0 -313
  142. package/dist/scripts/grader-sensitivity.d.ts +0 -22
  143. package/dist/scripts/grader-sensitivity.js +0 -354
  144. package/dist/scripts/grader-validate.d.ts +0 -19
  145. package/dist/scripts/grader-validate.js +0 -267
  146. package/dist/scripts/measure-retrieval.d.ts +0 -10
  147. package/dist/scripts/measure-retrieval.js +0 -145
  148. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +0 -24
  149. package/dist/scripts/migrate-tasks-to-content-lake.js +0 -328
  150. package/dist/scripts/pipeline.d.ts +0 -76
  151. package/dist/scripts/pipeline.js +0 -1031
  152. package/dist/scripts/pr-comment.d.ts +0 -10
  153. package/dist/scripts/pr-comment.js +0 -510
  154. package/dist/scripts/readiness-report.d.ts +0 -88
  155. package/dist/scripts/readiness-report.js +0 -342
  156. package/dist/scripts/update-quality-scores.d.ts +0 -15
  157. package/dist/scripts/update-quality-scores.js +0 -184
  158. package/dist/scripts/validate-task-sources.d.ts +0 -21
  159. package/dist/scripts/validate-task-sources.js +0 -210
  160. package/dist/scripts/validate.d.ts +0 -13
  161. package/dist/scripts/validate.js +0 -79
  162. package/dist/scripts/webhook-server.d.ts +0 -26
  163. package/dist/scripts/webhook-server.js +0 -147
  164. package/dist/scripts/weekly-digest.d.ts +0 -24
  165. package/dist/scripts/weekly-digest.js +0 -144
  166. package/dist/sinks/format-slack.d.ts +0 -64
  167. package/dist/sinks/format-slack.js +0 -306
  168. package/dist/sinks/slack-sink.d.ts +0 -27
  169. package/dist/sinks/slack-sink.js +0 -78
  170. package/dist/sinks/webhook-sink.d.ts +0 -19
  171. package/dist/sinks/webhook-sink.js +0 -50
  172. package/tasks/.expanded.agentic.yaml +0 -280
  173. package/tasks/.expanded.yaml +0 -565
@@ -1,90 +0,0 @@
1
- /**
2
- * Pipeline step: Compare current scores against a baseline.
3
- *
4
- * Preconditions: score-summary.json exists
5
- * Postconditions: comparison-report.json written to results/latest/
6
- *
7
- * This step is optional — it only runs when --compare is passed
8
- * (or a baseline exists and auto-compare is enabled).
9
- */
10
- import { existsSync, readFileSync, readdirSync, writeFileSync } from "fs";
11
- import { join, resolve } from "path";
12
- import { compare } from "../compare.js";
13
- /**
14
- * Run comparison against a baseline.
15
- *
16
- * @param rootDir Package root directory
17
- * @param baselinePath Explicit baseline file path (optional — uses latest if omitted)
18
- * @param options Compare options (noise threshold, etc.)
19
- */
20
- export function runCompare(rootDir, baselinePath, options) {
21
- const start = Date.now();
22
- const scoreSummaryPath = resolve(rootDir, "results", "latest", "score-summary.json");
23
- if (!existsSync(scoreSummaryPath)) {
24
- return {
25
- durationMs: Date.now() - start,
26
- error: "score-summary.json not found. Run calculate-scores first.",
27
- status: "failed",
28
- };
29
- }
30
- // Load experiment (current run)
31
- const experiment = JSON.parse(readFileSync(scoreSummaryPath, "utf-8"));
32
- // Resolve baseline
33
- let resolvedBaselinePath;
34
- if (baselinePath) {
35
- resolvedBaselinePath = resolve(baselinePath);
36
- }
37
- else {
38
- const baselinesDir = resolve(rootDir, "results", "baselines");
39
- if (!existsSync(baselinesDir)) {
40
- return {
41
- reason: "No baselines directory found. Run 'pnpm baseline:save' first.",
42
- status: "skipped",
43
- };
44
- }
45
- const files = readdirSync(baselinesDir)
46
- .filter((f) => f.endsWith(".json"))
47
- .sort()
48
- .reverse();
49
- if (files.length === 0) {
50
- return {
51
- reason: "No baseline files found. Run 'pnpm baseline:save' first.",
52
- status: "skipped",
53
- };
54
- }
55
- resolvedBaselinePath = join(baselinesDir, files[0]);
56
- }
57
- if (!existsSync(resolvedBaselinePath)) {
58
- return {
59
- durationMs: Date.now() - start,
60
- error: `Baseline file not found: ${resolvedBaselinePath}`,
61
- status: "failed",
62
- };
63
- }
64
- const baseline = JSON.parse(readFileSync(resolvedBaselinePath, "utf-8"));
65
- // Run comparison
66
- const report = compare(baseline, experiment, options);
67
- // Write report
68
- const reportPath = resolve(rootDir, "results", "latest", "comparison-report.json");
69
- writeFileSync(reportPath, JSON.stringify(report, null, 2));
70
- // Build summary
71
- const improved = report.improved.length;
72
- const regressed = report.regressed.length;
73
- const unchanged = report.unchanged.length;
74
- const overallDelta = report.deltas.overall;
75
- const deltaStr = overallDelta > 0
76
- ? `+${Math.round(overallDelta)}`
77
- : String(Math.round(overallDelta));
78
- const parts = [`Overall: ${deltaStr}`];
79
- if (improved > 0)
80
- parts.push(`${improved} improved`);
81
- if (regressed > 0)
82
- parts.push(`${regressed} regressed`);
83
- if (unchanged > 0)
84
- parts.push(`${unchanged} unchanged`);
85
- return {
86
- durationMs: Date.now() - start,
87
- status: "success",
88
- summary: parts.join(", "),
89
- };
90
- }
@@ -1,53 +0,0 @@
1
- /**
2
- * Pipeline step: Run promptfoo evaluation.
3
- *
4
- * Preconditions: config files and context files exist
5
- * Postconditions: eval-results.json exists and is valid
6
- *
7
- * Cache key: promptfooconfig*.yaml + contexts/*.md + tasks/*.yaml +
8
- * canonical contexts + reference solutions + config/models.yaml
9
- * Cache outputs: results/latest/eval-results*.json
10
- *
11
- * Remote cache: When local cache misses and a Sanity token is available,
12
- * the step queries the Content Lake for a report with a matching eval
13
- * fingerprint. On a hit, the cached score-summary.json is written to disk
14
- * and the eval + calculate-scores steps are skipped entirely.
15
- *
16
- * @see docs/design-docs/content-lake-eval-caching.md
17
- */
18
- import type { ConcreteEvalMode, DebugOptions, FilterOptions, StepResult } from "../types.js";
19
- /** Each mode writes eval results to a different file (set in the config's outputPath) */
20
- export declare const RESULTS_FILES: Record<ConcreteEvalMode, string>;
21
- /** Extended step result that carries cache metadata for downstream steps */
22
- export interface EvalStepResult {
23
- /** The computed eval fingerprint (for publishing in provenance) */
24
- evalFingerprint?: string;
25
- /** Whether this result came from a remote cache hit */
26
- remoteCacheHit?: boolean;
27
- /** The step result */
28
- stepResult: StepResult;
29
- }
30
- /** Options for the remote cache (Content Lake fingerprint lookup) */
31
- export interface RemoteCacheOptions {
32
- /** Whether this is a debug run (debug runs don't use remote cache) */
33
- debug?: boolean;
34
- /** Filter options used for fingerprint computation */
35
- filter?: FilterOptions;
36
- /** Grader model identifier from models.yaml */
37
- graderModel: string;
38
- /** Disable remote cache lookup (--no-remote-cache) */
39
- noRemoteCache?: boolean;
40
- /** Sanity API token for reading cached reports */
41
- sanityToken?: string;
42
- }
43
- export declare function buildFilterFlags(debug?: DebugOptions): string;
44
- /**
45
- * Extract the Promptfoo share URL from the eval results JSON.
46
- *
47
- * Promptfoo writes a `shareableUrl` field into the results file when
48
- * `PROMPTFOO_API_KEY` is set. This replaces the previous approach of
49
- * scraping the URL from a captured log file (which required piping
50
- * through `tee` and broke TTY progress reporting).
51
- */
52
- export declare function extractShareUrl(mode: ConcreteEvalMode): string | undefined;
53
- export declare function runEval(mode: ConcreteEvalMode, debug?: DebugOptions, concurrency?: number, noCache?: boolean, remoteCacheOpts?: RemoteCacheOptions): Promise<EvalStepResult>;
@@ -1,347 +0,0 @@
1
- /**
2
- * Pipeline step: Run promptfoo evaluation.
3
- *
4
- * Preconditions: config files and context files exist
5
- * Postconditions: eval-results.json exists and is valid
6
- *
7
- * Cache key: promptfooconfig*.yaml + contexts/*.md + tasks/*.yaml +
8
- * canonical contexts + reference solutions + config/models.yaml
9
- * Cache outputs: results/latest/eval-results*.json
10
- *
11
- * Remote cache: When local cache misses and a Sanity token is available,
12
- * the step queries the Content Lake for a report with a matching eval
13
- * fingerprint. On a hit, the cached score-summary.json is written to disk
14
- * and the eval + calculate-scores steps are skipped entirely.
15
- *
16
- * @see docs/design-docs/content-lake-eval-caching.md
17
- */
18
- import { execSync } from "child_process";
19
- import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
20
- import { dirname, resolve } from "path";
21
- import { fileURLToPath } from "url";
22
- import { getStepInputPaths, hashFiles, lookupCache, recordCache, } from "../cache.js";
23
- import { checkCanonicalContextsExist, checkGeneratedConfigsExist, checkResultsExist, } from "../checks.js";
24
- import { computeEvalFingerprint } from "../eval-fingerprint.js";
25
- import { resolveMappings } from "../resolve-mappings.js";
26
- const __dirname = dirname(fileURLToPath(import.meta.url));
27
- const ROOT = resolve(__dirname, "..", "..", "..");
28
- const CONFIG_FILES = {
29
- agentic: "promptfooconfig.agentic.yaml",
30
- baseline: "promptfooconfig.yaml",
31
- observed: "promptfooconfig.observed.yaml",
32
- };
33
- /** Each mode writes eval results to a different file (set in the config's outputPath) */
34
- export const RESULTS_FILES = {
35
- agentic: "results/latest/eval-results-agentic.json",
36
- baseline: "results/latest/eval-results.json",
37
- observed: "results/latest/eval-results-observed.json",
38
- };
39
- export function buildFilterFlags(debug) {
40
- if (!debug?.enabled)
41
- return "";
42
- const flags = [];
43
- if (debug.pattern) {
44
- flags.push(`--filter-pattern '${debug.pattern}'`);
45
- }
46
- if (debug.sample) {
47
- flags.push(`--filter-sample ${debug.sample}`);
48
- }
49
- if (debug.firstN) {
50
- flags.push(`--filter-first-n ${debug.firstN}`);
51
- }
52
- // Default: first 2 tests when no other filters specified
53
- if (flags.length === 0) {
54
- flags.push("--filter-first-n 2");
55
- }
56
- return " " + flags.join(" ");
57
- }
58
- /**
59
- * Extract the Promptfoo share URL from the eval results JSON.
60
- *
61
- * Promptfoo writes a `shareableUrl` field into the results file when
62
- * `PROMPTFOO_API_KEY` is set. This replaces the previous approach of
63
- * scraping the URL from a captured log file (which required piping
64
- * through `tee` and broke TTY progress reporting).
65
- */
66
- export function extractShareUrl(mode) {
67
- const resultsPath = resolve(ROOT, RESULTS_FILES[mode]);
68
- if (!existsSync(resultsPath))
69
- return undefined;
70
- try {
71
- const raw = readFileSync(resultsPath, "utf-8");
72
- const data = JSON.parse(raw);
73
- return data.shareableUrl ?? undefined;
74
- }
75
- catch {
76
- return undefined;
77
- }
78
- }
79
- // ---------------------------------------------------------------------------
80
- // Post-eval error scanning
81
- // ---------------------------------------------------------------------------
82
- export async function runEval(mode, debug, concurrency, noCache = false, remoteCacheOpts) {
83
- const start = Date.now();
84
- // Precondition: config file exists
85
- const configIssues = checkGeneratedConfigsExist(ROOT);
86
- const configErrors = configIssues.filter((i) => i.severity === "error");
87
- if (configErrors.length > 0) {
88
- return {
89
- stepResult: {
90
- durationMs: Date.now() - start,
91
- error: `Config files missing: ${configErrors.map((e) => e.message).join("; ")}`,
92
- status: "failed",
93
- },
94
- };
95
- }
96
- // Precondition: canonical context files exist for all mapped tasks
97
- const mappings = resolveMappings(ROOT);
98
- const taskIds = Object.values(mappings.feature_areas).flatMap((area) => area.tasks.map((t) => t.id));
99
- const contextIssues = checkCanonicalContextsExist(ROOT, taskIds);
100
- const contextErrors = contextIssues.filter((i) => i.severity === "error");
101
- if (contextErrors.length > 0) {
102
- return {
103
- stepResult: {
104
- durationMs: Date.now() - start,
105
- error: `Context files missing. Run 'pnpm fetch-docs' first. ${contextErrors.map((e) => e.message).join("; ")}`,
106
- status: "failed",
107
- },
108
- };
109
- }
110
- // -----------------------------------------------------------------------
111
- // Compute eval fingerprint (used for both remote cache + provenance)
112
- // Only for non-debug runs — debug runs use test subsets.
113
- // -----------------------------------------------------------------------
114
- let evalFingerprint;
115
- if (!debug?.enabled && remoteCacheOpts?.graderModel) {
116
- try {
117
- evalFingerprint = computeEvalFingerprint({
118
- filter: remoteCacheOpts.filter,
119
- graderModel: remoteCacheOpts.graderModel,
120
- mode,
121
- rootDir: ROOT,
122
- });
123
- }
124
- catch (err) {
125
- console.warn(` ⚠️ Could not compute eval fingerprint: ${err instanceof Error ? err.message : String(err)}`);
126
- }
127
- }
128
- // -----------------------------------------------------------------------
129
- // Cache check — local first, then remote
130
- // -----------------------------------------------------------------------
131
- // Local cache check — skip eval if inputs unchanged (biggest cost saver).
132
- // Each mode gets its own cache key so that in `full` mode, a fresh agentic
133
- // cache doesn't force baseline to re-run (or vice versa).
134
- const cacheKey = `eval-${mode}`;
135
- if (!noCache) {
136
- const cacheResult = lookupCache(ROOT, cacheKey);
137
- if (cacheResult.hit) {
138
- return {
139
- evalFingerprint,
140
- stepResult: {
141
- durationMs: Date.now() - start,
142
- status: "success",
143
- summary: `Skipped (cached) — ${cacheResult.entry.summary}`,
144
- },
145
- };
146
- }
147
- }
148
- // Remote cache check — query Content Lake for matching fingerprint
149
- if (evalFingerprint &&
150
- !noCache &&
151
- !remoteCacheOpts?.noRemoteCache &&
152
- remoteCacheOpts?.sanityToken) {
153
- const remoteCacheResult = await checkRemoteCache(evalFingerprint, remoteCacheOpts.sanityToken);
154
- if (remoteCacheResult) {
155
- return {
156
- evalFingerprint,
157
- remoteCacheHit: true,
158
- stepResult: {
159
- durationMs: Date.now() - start,
160
- status: "success",
161
- summary: `Skipped (remote cache hit) — reusing report ${remoteCacheResult.reportId} from ${remoteCacheResult.completedAt}`,
162
- },
163
- };
164
- }
165
- }
166
- // Execute — run promptfoo directly with inherited stdio so the TTY
167
- // progress bar works in interactive terminals and the CI progress
168
- // reporter works in CI environments. Previously this was piped through
169
- // `tee` to capture a log file for share-URL extraction, but `tee`
170
- // destroyed TTY detection, disabling all progress output. The share URL
171
- // is now read from the eval results JSON (`shareableUrl` field) instead.
172
- //
173
- // Sharing is enabled by default (via PROMPTFOO_API_KEY / cloud config).
174
- // We set PROMPTFOO_DISABLE_SHARE_EMAIL_REQUEST=1 to prevent promptfoo's
175
- // interactive email prompt from blocking the terminal in local TTY
176
- // environments. In CI, isCI() already guards against the prompt, but
177
- // the env var provides defense-in-depth for all execution contexts.
178
- const configFile = CONFIG_FILES[mode];
179
- const filterFlags = buildFilterFlags(debug);
180
- const concurrencyFlag = concurrency ? ` --max-concurrency ${concurrency}` : "";
181
- const noCacheFlag = noCache ? " --no-cache" : "";
182
- const evalCmd = `dotenv -e ../../.env -o -- promptfoo eval --config ${configFile}${filterFlags}${concurrencyFlag}${noCacheFlag}`;
183
- let exitCode = 0;
184
- try {
185
- execSync(evalCmd, {
186
- cwd: ROOT,
187
- env: {
188
- ...process.env,
189
- PROMPTFOO_DISABLE_SHARE_EMAIL_REQUEST: "1",
190
- },
191
- stdio: "inherit",
192
- });
193
- }
194
- catch (err) {
195
- // promptfoo exits 100 when assertions fail — that's expected, not an error
196
- exitCode =
197
- err !== null && typeof err === "object" && "status" in err
198
- ? err.status
199
- : 1;
200
- if (exitCode !== 100) {
201
- return {
202
- evalFingerprint,
203
- stepResult: {
204
- durationMs: Date.now() - start,
205
- error: `promptfoo eval failed with exit code ${exitCode}`,
206
- status: "failed",
207
- },
208
- };
209
- }
210
- }
211
- // Postcondition: results file exists and is valid
212
- const resultsIssues = checkResultsExist(ROOT, RESULTS_FILES[mode]);
213
- const resultsErrors = resultsIssues.filter((i) => i.severity === "error");
214
- if (resultsErrors.length > 0) {
215
- return {
216
- evalFingerprint,
217
- stepResult: {
218
- durationMs: Date.now() - start,
219
- error: `Postcondition failed: ${resultsErrors.map((e) => e.message).join("; ")}`,
220
- status: "failed",
221
- },
222
- };
223
- }
224
- // Scan results for errors and surface them clearly
225
- const errorSummary = scanResultsForErrors(resolve(ROOT, RESULTS_FILES[mode]));
226
- if (errorSummary) {
227
- console.log();
228
- console.log(errorSummary);
229
- }
230
- const durationMs = Date.now() - start;
231
- const summary = `Evaluation complete (mode: ${mode}${debug?.enabled ? ", debug" : ""})`;
232
- // Record cache — only for non-debug runs (debug uses a subset of tests).
233
- // Uses per-mode cache key so baseline and agentic are independently cached.
234
- if (!noCache && !debug?.enabled) {
235
- const inputPaths = getStepInputPaths(ROOT, cacheKey);
236
- const inputHash = hashFiles(inputPaths);
237
- recordCache(ROOT, cacheKey, inputHash, summary, durationMs, [
238
- RESULTS_FILES[mode],
239
- ]);
240
- }
241
- return {
242
- evalFingerprint,
243
- stepResult: { durationMs, status: "success", summary },
244
- };
245
- }
246
- // ---------------------------------------------------------------------------
247
- // Remote cache helpers
248
- // ---------------------------------------------------------------------------
249
- /**
250
- * Query the Sanity Content Lake for a report with a matching eval fingerprint.
251
- *
252
- * On a hit, writes the cached score-summary.json to results/latest/ so that
253
- * downstream steps (report, compare, publish) can proceed as if the eval
254
- * had just run.
255
- *
256
- * @returns The matched report metadata on hit, null on miss or error
257
- */
258
- async function checkRemoteCache(fingerprint, sanityToken) {
259
- try {
260
- const { ReportStore } = await import("../../report-store.js");
261
- const store = new ReportStore({
262
- dataset: process.env.AILF_REPORT_DATASET ?? undefined,
263
- projectId: process.env.AILF_REPORT_PROJECT_ID ?? undefined,
264
- token: sanityToken,
265
- });
266
- const startQuery = Date.now();
267
- const cachedReport = await store.findByFingerprint(fingerprint);
268
- const queryMs = Date.now() - startQuery;
269
- if (!cachedReport) {
270
- console.log(` ℹ️ Remote cache miss — no report matches fingerprint (${queryMs}ms)`);
271
- return null;
272
- }
273
- // Write the cached score summary to disk so downstream steps work
274
- const outDir = resolve(ROOT, "results", "latest");
275
- if (!existsSync(outDir)) {
276
- mkdirSync(outDir, { recursive: true });
277
- }
278
- writeFileSync(resolve(outDir, "score-summary.json"), JSON.stringify(cachedReport.summary, null, 2));
279
- console.log(` ✅ Remote cache hit — reusing report ${cachedReport.id} from ${cachedReport.completedAt}`);
280
- console.log(` ℹ️ Fingerprint: ${fingerprint.slice(0, 16)}... (${queryMs}ms)`);
281
- console.log(" ⚠️ Cached scores are statistically equivalent, not identical");
282
- return {
283
- completedAt: cachedReport.completedAt,
284
- reportId: cachedReport.id,
285
- };
286
- }
287
- catch (err) {
288
- console.warn(` ⚠️ Remote cache check failed: ${err instanceof Error ? err.message : String(err)}`);
289
- return null;
290
- }
291
- }
292
- /**
293
- * Read the eval results JSON and produce a human-readable summary of any
294
- * errored or failed tests. This surfaces API errors, timeouts, and other
295
- * issues that would otherwise be buried in the Promptfoo table output.
296
- *
297
- * Returns null if there are no errors/failures worth reporting.
298
- */
299
- function scanResultsForErrors(resultsPath) {
300
- if (!existsSync(resultsPath))
301
- return null;
302
- let file;
303
- try {
304
- const raw = readFileSync(resultsPath, "utf-8");
305
- file = JSON.parse(raw);
306
- }
307
- catch {
308
- return null;
309
- }
310
- const results = file?.results?.results;
311
- if (!Array.isArray(results))
312
- return null;
313
- const errored = [];
314
- for (const r of results) {
315
- if (r.gradingResult !== null)
316
- continue;
317
- const desc = r.testCase?.description ?? r.description ?? "unknown";
318
- const provider = r.provider?.label ?? r.provider?.id ?? "unknown";
319
- // No grading result = the provider errored before producing a response.
320
- // This is the only case we surface — API 500s, timeouts, rate limits.
321
- // Note: r.error may also be set for assertion failures, but those have
322
- // a non-null gradingResult and are normal pass/fail outcomes.
323
- const errorMsg = r.error
324
- ? (typeof r.error === "string" ? r.error : JSON.stringify(r.error)).slice(0, 200)
325
- : "Provider returned no scorable result";
326
- errored.push({ description: desc, error: errorMsg, provider });
327
- }
328
- if (errored.length === 0)
329
- return null;
330
- const total = results.length;
331
- const lines = [];
332
- lines.push(` ┌─────────────────────────────────────────────────────────────`);
333
- lines.push(` │ ⚠️ ${errored.length} of ${total} eval result(s) errored (no gradingResult)`);
334
- lines.push(` │`);
335
- for (const e of errored) {
336
- lines.push(` │ ✗ [${e.provider}] ${e.description}`);
337
- lines.push(` │ → ${e.error}`);
338
- }
339
- const errorRate = Math.round((errored.length / total) * 100);
340
- if (errorRate >= 25) {
341
- lines.push(` │`);
342
- lines.push(` │ 🔥 High error rate (${errorRate}%) — check API keys, rate limits,`);
343
- lines.push(` │ or model availability. Errored results are excluded from scoring.`);
344
- }
345
- lines.push(` └─────────────────────────────────────────────────────────────`);
346
- return lines.join("\n");
347
- }
@@ -1,11 +0,0 @@
1
- /**
2
- * Pipeline step: Fetch documentation from Sanity CMS.
3
- *
4
- * Preconditions: tasks have inline canonical_docs
5
- * Postconditions: canonical context files exist for all mapped tasks
6
- *
7
- * Cache key: tasks/*.yaml + config/sources.yaml + config/models.yaml
8
- * Cache outputs: contexts/canonical/*.md files
9
- */
10
- import type { StepResult } from "../types.js";
11
- export declare function runFetchDocs(source?: string, noCache?: boolean): Promise<StepResult>;
@@ -1,84 +0,0 @@
1
- /**
2
- * Pipeline step: Fetch documentation from Sanity CMS.
3
- *
4
- * Preconditions: tasks have inline canonical_docs
5
- * Postconditions: canonical context files exist for all mapped tasks
6
- *
7
- * Cache key: tasks/*.yaml + config/sources.yaml + config/models.yaml
8
- * Cache outputs: contexts/canonical/*.md files
9
- */
10
- import { execSync } from "child_process";
11
- import { dirname, resolve } from "path";
12
- import { fileURLToPath } from "url";
13
- import { lookupCache, recordCache } from "../cache.js";
14
- import { checkCanonicalContextsExist } from "../checks.js";
15
- import { resolveMappings } from "../resolve-mappings.js";
16
- const __dirname = dirname(fileURLToPath(import.meta.url));
17
- const ROOT = resolve(__dirname, "..", "..", "..");
18
- export async function runFetchDocs(source, noCache = false) {
19
- const start = Date.now();
20
- // Precondition: at least one task has inline canonical mappings
21
- const mappings = resolveMappings(ROOT);
22
- const totalTasks = Object.values(mappings.feature_areas).reduce((sum, area) => sum + area.tasks.length, 0);
23
- if (totalTasks === 0) {
24
- return {
25
- durationMs: Date.now() - start,
26
- error: "No tasks with canonical_docs found in task files. Add canonical_docs to your task definitions.",
27
- status: "failed",
28
- };
29
- }
30
- // Cache check
31
- if (!noCache) {
32
- const cacheResult = lookupCache(ROOT, "fetch-docs");
33
- if (cacheResult.hit) {
34
- return {
35
- durationMs: Date.now() - start,
36
- status: "success",
37
- summary: `Skipped (cached) — ${cacheResult.entry.summary}`,
38
- };
39
- }
40
- }
41
- // Execute
42
- try {
43
- const sourceArg = source ? ` --source ${source}` : "";
44
- execSync(`pnpm fetch-docs${sourceArg}`, {
45
- cwd: ROOT,
46
- env: process.env,
47
- stdio: "inherit",
48
- });
49
- }
50
- catch (err) {
51
- return {
52
- durationMs: Date.now() - start,
53
- error: `fetch-docs failed: ${err instanceof Error ? err.message : String(err)}`,
54
- status: "failed",
55
- };
56
- }
57
- // Postcondition: canonical context files exist for all mapped tasks
58
- // Re-resolve in case fetch-docs modified things (unlikely but safe)
59
- const postMappings = resolveMappings(ROOT);
60
- const taskIds = Object.values(postMappings.feature_areas).flatMap((area) => area.tasks.map((t) => t.id));
61
- const contextIssues = checkCanonicalContextsExist(ROOT, taskIds);
62
- const contextErrors = contextIssues.filter((i) => i.severity === "error");
63
- if (contextErrors.length > 0) {
64
- return {
65
- durationMs: Date.now() - start,
66
- error: `Postcondition failed: ${contextErrors.map((e) => e.message).join("; ")}`,
67
- status: "failed",
68
- };
69
- }
70
- const durationMs = Date.now() - start;
71
- const summary = `Fetched canonical contexts for ${taskIds.length} tasks`;
72
- // Record cache
73
- if (!noCache) {
74
- const { getStepInputPaths, hashFiles } = await import("../cache.js");
75
- const inputPaths = getStepInputPaths(ROOT, "fetch-docs");
76
- const inputHash = hashFiles(inputPaths);
77
- const outputPaths = [
78
- ...taskIds.map((id) => `contexts/canonical/${id}.md`),
79
- "contexts/document-manifest.json",
80
- ];
81
- recordCache(ROOT, "fetch-docs", inputHash, summary, durationMs, outputPaths);
82
- }
83
- return { durationMs, status: "success", summary };
84
- }
@@ -1,11 +0,0 @@
1
- /**
2
- * Pipeline step: Generate promptfoo config files from config/models.yaml.
3
- *
4
- * Preconditions: config/models.yaml is valid
5
- * Postconditions: promptfooconfig*.yaml files exist
6
- *
7
- * Cache key: config/models.yaml + config/sources.yaml + tasks/*.yaml
8
- * Cache outputs: promptfooconfig*.yaml files
9
- */
10
- import type { StepResult } from "../types.js";
11
- export declare function runGenerateConfigs(source?: string, noCache?: boolean): StepResult;