@sanity/ailf 2.0.1 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/dist/cli.js +0 -0
- package/dist/orchestration/steps/run-eval-step.js +1 -1
- package/dist/pipeline/checks.d.ts +8 -3
- package/dist/pipeline/checks.js +23 -3
- package/package.json +25 -25
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +0 -10
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +0 -185
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +0 -6
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +0 -42
- package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
- package/dist/_vendor/ailf-tasks/cli.js +0 -61
- package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
- package/dist/_vendor/ailf-tasks/index.js +0 -16
- package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
- package/dist/_vendor/ailf-tasks/parser.js +0 -73
- package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
- package/dist/_vendor/ailf-tasks/schemas.js +0 -180
- package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
- package/dist/_vendor/ailf-tasks/validation.js +0 -162
- package/dist/adapters/task-sources/yaml-task-source.d.ts +0 -18
- package/dist/adapters/task-sources/yaml-task-source.js +0 -139
- package/dist/agent-observer/test-imports.d.ts +0 -7
- package/dist/agent-observer/test-imports.js +0 -185
- package/dist/commands/update-quality-scores.d.ts +0 -5
- package/dist/commands/update-quality-scores.js +0 -20
- package/dist/lib/agent-behavior-report.d.ts +0 -8
- package/dist/lib/agent-behavior-report.js +0 -185
- package/dist/lib/baseline.d.ts +0 -19
- package/dist/lib/baseline.js +0 -153
- package/dist/lib/calculate-scores.d.ts +0 -23
- package/dist/lib/calculate-scores.js +0 -42
- package/dist/lib/compare.d.ts +0 -18
- package/dist/lib/compare.js +0 -170
- package/dist/lib/coverage-audit.d.ts +0 -4
- package/dist/lib/coverage-audit.js +0 -42
- package/dist/lib/discovery-report.d.ts +0 -13
- package/dist/lib/discovery-report.js +0 -57
- package/dist/lib/fetch-docs.d.ts +0 -30
- package/dist/lib/fetch-docs.js +0 -171
- package/dist/lib/generate-configs.d.ts +0 -25
- package/dist/lib/generate-configs.js +0 -42
- package/dist/lib/grader-api.d.ts +0 -21
- package/dist/lib/grader-api.js +0 -34
- package/dist/lib/grader-compare.d.ts +0 -19
- package/dist/lib/grader-compare.js +0 -91
- package/dist/lib/grader-consistency.d.ts +0 -27
- package/dist/lib/grader-consistency.js +0 -79
- package/dist/lib/grader-sensitivity.d.ts +0 -19
- package/dist/lib/grader-sensitivity.js +0 -75
- package/dist/lib/grader-validate.d.ts +0 -19
- package/dist/lib/grader-validate.js +0 -78
- package/dist/lib/measure-retrieval.d.ts +0 -14
- package/dist/lib/measure-retrieval.js +0 -71
- package/dist/lib/pr-comment.d.ts +0 -16
- package/dist/lib/pr-comment.js +0 -28
- package/dist/lib/readiness-report.d.ts +0 -13
- package/dist/lib/readiness-report.js +0 -108
- package/dist/lib/webhook-server.d.ts +0 -11
- package/dist/lib/webhook-server.js +0 -24
- package/dist/lib/weekly-digest.d.ts +0 -24
- package/dist/lib/weekly-digest.js +0 -148
- package/dist/orchestration/env-bridge.d.ts +0 -21
- package/dist/orchestration/env-bridge.js +0 -66
- package/dist/orchestration/steps/fetch-docs-shell.d.ts +0 -17
- package/dist/orchestration/steps/fetch-docs-shell.js +0 -30
- package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +0 -9
- package/dist/pipeline/compiler/__tests__/task-bridge.test.js +0 -339
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +0 -70
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +0 -485
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +0 -76
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +0 -245
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +0 -89
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +0 -379
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +0 -50
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +0 -334
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +0 -69
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +0 -307
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +0 -65
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +0 -368
- package/dist/pipeline/compiler/task-bridge.d.ts +0 -41
- package/dist/pipeline/compiler/task-bridge.js +0 -92
- package/dist/pipeline/expand-tasks.d.ts +0 -232
- package/dist/pipeline/expand-tasks.js +0 -467
- package/dist/pipeline/generate-configs.d.ts +0 -92
- package/dist/pipeline/generate-configs.js +0 -445
- package/dist/pipeline/steps/calculate-scores-step.d.ts +0 -11
- package/dist/pipeline/steps/calculate-scores-step.js +0 -89
- package/dist/pipeline/steps/compare-step.d.ts +0 -18
- package/dist/pipeline/steps/compare-step.js +0 -90
- package/dist/pipeline/steps/eval-step.d.ts +0 -53
- package/dist/pipeline/steps/eval-step.js +0 -347
- package/dist/pipeline/steps/fetch-docs-step.d.ts +0 -11
- package/dist/pipeline/steps/fetch-docs-step.js +0 -84
- package/dist/pipeline/steps/generate-configs-step.d.ts +0 -11
- package/dist/pipeline/steps/generate-configs-step.js +0 -98
- package/dist/pipeline/steps/grader-consistency-step.d.ts +0 -21
- package/dist/pipeline/steps/grader-consistency-step.js +0 -74
- package/dist/pipeline/steps/publish-report-step.d.ts +0 -57
- package/dist/pipeline/steps/publish-report-step.js +0 -243
- package/dist/pipeline/steps/report-step.d.ts +0 -13
- package/dist/pipeline/steps/report-step.js +0 -56
- package/dist/pipeline/steps/update-scores-step.d.ts +0 -11
- package/dist/pipeline/steps/update-scores-step.js +0 -42
- package/dist/scripts/agent-behavior-report.d.ts +0 -19
- package/dist/scripts/agent-behavior-report.js +0 -315
- package/dist/scripts/baseline.d.ts +0 -43
- package/dist/scripts/baseline.js +0 -267
- package/dist/scripts/calculate-scores.d.ts +0 -166
- package/dist/scripts/calculate-scores.js +0 -1296
- package/dist/scripts/compare.d.ts +0 -22
- package/dist/scripts/compare.js +0 -334
- package/dist/scripts/coverage-audit.d.ts +0 -44
- package/dist/scripts/coverage-audit.js +0 -209
- package/dist/scripts/debug-eval.d.ts +0 -19
- package/dist/scripts/debug-eval.js +0 -73
- package/dist/scripts/discovery-report.d.ts +0 -58
- package/dist/scripts/discovery-report.js +0 -250
- package/dist/scripts/fetch-docs.d.ts +0 -35
- package/dist/scripts/fetch-docs.js +0 -472
- package/dist/scripts/generate-configs.d.ts +0 -66
- package/dist/scripts/generate-configs.js +0 -459
- package/dist/scripts/grader-api.d.ts +0 -27
- package/dist/scripts/grader-api.js +0 -206
- package/dist/scripts/grader-compare.d.ts +0 -22
- package/dist/scripts/grader-compare.js +0 -368
- package/dist/scripts/grader-consistency.d.ts +0 -20
- package/dist/scripts/grader-consistency.js +0 -313
- package/dist/scripts/grader-sensitivity.d.ts +0 -22
- package/dist/scripts/grader-sensitivity.js +0 -354
- package/dist/scripts/grader-validate.d.ts +0 -19
- package/dist/scripts/grader-validate.js +0 -267
- package/dist/scripts/measure-retrieval.d.ts +0 -10
- package/dist/scripts/measure-retrieval.js +0 -145
- package/dist/scripts/migrate-tasks-to-content-lake.d.ts +0 -24
- package/dist/scripts/migrate-tasks-to-content-lake.js +0 -328
- package/dist/scripts/pipeline.d.ts +0 -76
- package/dist/scripts/pipeline.js +0 -1031
- package/dist/scripts/pr-comment.d.ts +0 -10
- package/dist/scripts/pr-comment.js +0 -510
- package/dist/scripts/readiness-report.d.ts +0 -88
- package/dist/scripts/readiness-report.js +0 -342
- package/dist/scripts/update-quality-scores.d.ts +0 -15
- package/dist/scripts/update-quality-scores.js +0 -184
- package/dist/scripts/validate-task-sources.d.ts +0 -21
- package/dist/scripts/validate-task-sources.js +0 -210
- package/dist/scripts/validate.d.ts +0 -13
- package/dist/scripts/validate.js +0 -79
- package/dist/scripts/webhook-server.d.ts +0 -26
- package/dist/scripts/webhook-server.js +0 -147
- package/dist/scripts/weekly-digest.d.ts +0 -24
- package/dist/scripts/weekly-digest.js +0 -144
- package/dist/sinks/format-slack.d.ts +0 -64
- package/dist/sinks/format-slack.js +0 -306
- package/dist/sinks/slack-sink.d.ts +0 -27
- package/dist/sinks/slack-sink.js +0 -78
- package/dist/sinks/webhook-sink.d.ts +0 -19
- package/dist/sinks/webhook-sink.js +0 -50
- package/tasks/.expanded.agentic.yaml +0 -280
- package/tasks/.expanded.yaml +0 -565
|
@@ -1,90 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Pipeline step: Compare current scores against a baseline.
|
|
3
|
-
*
|
|
4
|
-
* Preconditions: score-summary.json exists
|
|
5
|
-
* Postconditions: comparison-report.json written to results/latest/
|
|
6
|
-
*
|
|
7
|
-
* This step is optional — it only runs when --compare is passed
|
|
8
|
-
* (or a baseline exists and auto-compare is enabled).
|
|
9
|
-
*/
|
|
10
|
-
import { existsSync, readFileSync, readdirSync, writeFileSync } from "fs";
|
|
11
|
-
import { join, resolve } from "path";
|
|
12
|
-
import { compare } from "../compare.js";
|
|
13
|
-
/**
 * Read and parse a JSON file, reporting failure as a value instead of throwing.
 *
 * @param filePath Absolute path of the JSON file to read
 * @returns `{ ok: true, value }` on success, `{ ok: false, message }` when the
 *          file cannot be read or does not contain valid JSON
 */
function readJsonForCompare(filePath) {
    try {
        return { ok: true, value: JSON.parse(readFileSync(filePath, "utf-8")) };
    }
    catch (err) {
        return { ok: false, message: err instanceof Error ? err.message : String(err) };
    }
}
/**
 * Run comparison against a baseline.
 *
 * Loads `results/latest/score-summary.json` (the current run), resolves a
 * baseline file, passes both to `compare()`, and writes the resulting report
 * to `results/latest/comparison-report.json`.
 *
 * @param rootDir       Package root directory
 * @param baselinePath  Explicit baseline file path (optional — uses latest if omitted)
 * @param options       Compare options (noise threshold, etc.)
 * @returns A step result object:
 *          - `status: "failed"` with `error` when score-summary.json or the
 *            baseline file is missing or is not valid JSON
 *          - `status: "skipped"` with `reason` when no baseline is available
 *          - `status: "success"` with a one-line `summary` otherwise
 */
export function runCompare(rootDir, baselinePath, options) {
    const start = Date.now();
    const scoreSummaryPath = resolve(rootDir, "results", "latest", "score-summary.json");
    if (!existsSync(scoreSummaryPath)) {
        return {
            durationMs: Date.now() - start,
            error: "score-summary.json not found. Run calculate-scores first.",
            status: "failed",
        };
    }
    // Load experiment (current run). A truncated or hand-edited file must
    // surface as a failed step rather than an uncaught exception.
    const experimentFile = readJsonForCompare(scoreSummaryPath);
    if (!experimentFile.ok) {
        return {
            durationMs: Date.now() - start,
            error: `Could not read score-summary.json: ${experimentFile.message}`,
            status: "failed",
        };
    }
    const experiment = experimentFile.value;
    // Resolve baseline
    let resolvedBaselinePath;
    if (baselinePath) {
        resolvedBaselinePath = resolve(baselinePath);
    }
    else {
        const baselinesDir = resolve(rootDir, "results", "baselines");
        if (!existsSync(baselinesDir)) {
            return {
                reason: "No baselines directory found. Run 'pnpm baseline:save' first.",
                status: "skipped",
            };
        }
        // Pick the file whose name sorts last.
        const files = readdirSync(baselinesDir)
            .filter((f) => f.endsWith(".json"))
            .sort()
            .reverse();
        if (files.length === 0) {
            return {
                reason: "No baseline files found. Run 'pnpm baseline:save' first.",
                status: "skipped",
            };
        }
        resolvedBaselinePath = join(baselinesDir, files[0]);
    }
    if (!existsSync(resolvedBaselinePath)) {
        return {
            durationMs: Date.now() - start,
            error: `Baseline file not found: ${resolvedBaselinePath}`,
            status: "failed",
        };
    }
    const baselineFile = readJsonForCompare(resolvedBaselinePath);
    if (!baselineFile.ok) {
        return {
            durationMs: Date.now() - start,
            error: `Could not read baseline file ${resolvedBaselinePath}: ${baselineFile.message}`,
            status: "failed",
        };
    }
    const baseline = baselineFile.value;
    // Run comparison
    const report = compare(baseline, experiment, options);
    // Write report
    const reportPath = resolve(rootDir, "results", "latest", "comparison-report.json");
    writeFileSync(reportPath, JSON.stringify(report, null, 2));
    // Build summary
    const improved = report.improved.length;
    const regressed = report.regressed.length;
    const unchanged = report.unchanged.length;
    const overallDelta = report.deltas.overall;
    // Positive deltas get an explicit "+" sign; negatives already carry "-".
    const deltaStr = overallDelta > 0
        ? `+${Math.round(overallDelta)}`
        : String(Math.round(overallDelta));
    const parts = [`Overall: ${deltaStr}`];
    if (improved > 0)
        parts.push(`${improved} improved`);
    if (regressed > 0)
        parts.push(`${regressed} regressed`);
    if (unchanged > 0)
        parts.push(`${unchanged} unchanged`);
    return {
        durationMs: Date.now() - start,
        status: "success",
        summary: parts.join(", "),
    };
}
|
|
@@ -1,53 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Pipeline step: Run promptfoo evaluation.
|
|
3
|
-
*
|
|
4
|
-
* Preconditions: config files and context files exist
|
|
5
|
-
* Postconditions: eval-results.json exists and is valid
|
|
6
|
-
*
|
|
7
|
-
* Cache key: promptfooconfig*.yaml + contexts/*.md + tasks/*.yaml +
|
|
8
|
-
* canonical contexts + reference solutions + config/models.yaml
|
|
9
|
-
* Cache outputs: results/latest/eval-results*.json
|
|
10
|
-
*
|
|
11
|
-
* Remote cache: When local cache misses and a Sanity token is available,
|
|
12
|
-
* the step queries the Content Lake for a report with a matching eval
|
|
13
|
-
* fingerprint. On a hit, the cached score-summary.json is written to disk
|
|
14
|
-
* and the eval + calculate-scores steps are skipped entirely.
|
|
15
|
-
*
|
|
16
|
-
* @see docs/design-docs/content-lake-eval-caching.md
|
|
17
|
-
*/
|
|
18
|
-
import type { ConcreteEvalMode, DebugOptions, FilterOptions, StepResult } from "../types.js";
|
|
19
|
-
/**
 * Eval results file for each concrete eval mode. Every mode writes to its
 * own file, as set in that mode's config `outputPath`.
 */
export declare const RESULTS_FILES: Record<ConcreteEvalMode, string>;
|
|
21
|
-
/**
 * Result of the eval step, wrapped together with cache metadata that
 * downstream steps consume.
 */
export interface EvalStepResult {
    /** Outcome of the step itself */
    stepResult: StepResult;
    /** Eval fingerprint computed for this run (published as provenance) */
    evalFingerprint?: string;
    /** True when the result was served from the remote cache */
    remoteCacheHit?: boolean;
}
|
|
30
|
-
/** Settings for the remote cache, i.e. the Content Lake fingerprint lookup */
export interface RemoteCacheOptions {
    /** Identifier of the grader model, taken from models.yaml */
    graderModel: string;
    /** Sanity API token used to read cached reports */
    sanityToken?: string;
    /** Filter options that feed into the fingerprint computation */
    filter?: FilterOptions;
    /** Marks a debug run (debug runs never use the remote cache) */
    debug?: boolean;
    /** Turns off the remote cache lookup (--no-remote-cache) */
    noRemoteCache?: boolean;
}
|
|
43
|
-
/**
 * Build the promptfoo CLI filter flags for a debug run.
 *
 * @param debug Debug options; when omitted or not enabled, no flags are produced
 * @returns An empty string, or a space-prefixed flag string built from
 *          `--filter-pattern`, `--filter-sample` and `--filter-first-n`
 *          (`--filter-first-n 2` when debug is enabled without any filter)
 */
export declare function buildFilterFlags(debug?: DebugOptions): string;
|
|
44
|
-
/**
 * Read the Promptfoo share URL for a mode from its eval results JSON.
 *
 * When `PROMPTFOO_API_KEY` is set, Promptfoo records the URL in the results
 * file under `shareableUrl`. Reading it from there supersedes the earlier
 * technique of scraping a captured log file, which needed the output piped
 * through `tee` and thereby broke TTY progress reporting.
 *
 * @returns The share URL, or `undefined` when none is available
 */
export declare function extractShareUrl(mode: ConcreteEvalMode): string | undefined;
|
|
53
|
-
/**
 * Run the promptfoo evaluation for one eval mode.
 *
 * @param mode            Eval mode; selects the promptfoo config and results file
 * @param debug           Debug options; when enabled, filter flags restrict the test set
 * @param concurrency     Passed to promptfoo as `--max-concurrency` when truthy
 * @param noCache         Skip the cache lookups and pass `--no-cache` to promptfoo
 * @param remoteCacheOpts Options for the Content Lake fingerprint lookup
 * @returns The step result plus the eval fingerprint and remote-cache-hit flag when applicable
 */
export declare function runEval(mode: ConcreteEvalMode, debug?: DebugOptions, concurrency?: number, noCache?: boolean, remoteCacheOpts?: RemoteCacheOptions): Promise<EvalStepResult>;
|
|
@@ -1,347 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Pipeline step: Run promptfoo evaluation.
|
|
3
|
-
*
|
|
4
|
-
* Preconditions: config files and context files exist
|
|
5
|
-
* Postconditions: eval-results.json exists and is valid
|
|
6
|
-
*
|
|
7
|
-
* Cache key: promptfooconfig*.yaml + contexts/*.md + tasks/*.yaml +
|
|
8
|
-
* canonical contexts + reference solutions + config/models.yaml
|
|
9
|
-
* Cache outputs: results/latest/eval-results*.json
|
|
10
|
-
*
|
|
11
|
-
* Remote cache: When local cache misses and a Sanity token is available,
|
|
12
|
-
* the step queries the Content Lake for a report with a matching eval
|
|
13
|
-
* fingerprint. On a hit, the cached score-summary.json is written to disk
|
|
14
|
-
* and the eval + calculate-scores steps are skipped entirely.
|
|
15
|
-
*
|
|
16
|
-
* @see docs/design-docs/content-lake-eval-caching.md
|
|
17
|
-
*/
|
|
18
|
-
import { execSync } from "child_process";
|
|
19
|
-
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
|
|
20
|
-
import { dirname, resolve } from "path";
|
|
21
|
-
import { fileURLToPath } from "url";
|
|
22
|
-
import { getStepInputPaths, hashFiles, lookupCache, recordCache, } from "../cache.js";
|
|
23
|
-
import { checkCanonicalContextsExist, checkGeneratedConfigsExist, checkResultsExist, } from "../checks.js";
|
|
24
|
-
import { computeEvalFingerprint } from "../eval-fingerprint.js";
|
|
25
|
-
import { resolveMappings } from "../resolve-mappings.js";
|
|
26
|
-
/** Directory of this module (ES modules have no built-in `__dirname`). */
const __dirname = dirname(fileURLToPath(import.meta.url));
/** Root directory, three levels above this module's directory. */
const ROOT = resolve(__dirname, "..", "..", "..");
/**
 * promptfoo config file name for each eval mode.
 * Frozen so the shared lookup table cannot be mutated by accident.
 */
const CONFIG_FILES = Object.freeze({
    agentic: "promptfooconfig.agentic.yaml",
    baseline: "promptfooconfig.yaml",
    observed: "promptfooconfig.observed.yaml",
});
/**
 * Each mode writes eval results to a different file (set in the config's outputPath).
 * Paths are resolved against ROOT by the code that uses them. Frozen because the
 * table is exported and shared between modules.
 */
export const RESULTS_FILES = Object.freeze({
    agentic: "results/latest/eval-results-agentic.json",
    baseline: "results/latest/eval-results.json",
    observed: "results/latest/eval-results-observed.json",
});
|
|
39
|
-
/**
 * Build the promptfoo CLI filter flags for a debug run.
 *
 * The returned string is appended to a shell command line, so the pattern is
 * wrapped in single quotes and any embedded single quote is escaped.
 *
 * @param debug Debug options (`enabled`, `pattern`, `sample`, `firstN`)
 * @returns `""` when debug is absent or not enabled; otherwise the flags
 *          joined by spaces with one leading space
 */
export function buildFilterFlags(debug) {
    if (!debug?.enabled) {
        return "";
    }
    const flags = [];
    if (debug.pattern) {
        // POSIX shell quoting: inside '...' nothing is special except the
        // quote itself, which is written as '\'' (close, escaped quote, reopen).
        const quotedPattern = String(debug.pattern).replace(/'/g, "'\\''");
        flags.push(`--filter-pattern '${quotedPattern}'`);
    }
    if (debug.sample) {
        flags.push(`--filter-sample ${debug.sample}`);
    }
    if (debug.firstN) {
        flags.push(`--filter-first-n ${debug.firstN}`);
    }
    // Default: first 2 tests when no other filters specified
    if (flags.length === 0) {
        flags.push("--filter-first-n 2");
    }
    return " " + flags.join(" ");
}
|
|
58
|
-
/**
 * Look up the Promptfoo share URL recorded in a mode's eval results JSON.
 *
 * Promptfoo puts a `shareableUrl` field in the results file when
 * `PROMPTFOO_API_KEY` is set. Reading that field supersedes the older
 * technique of scraping the URL from a captured log file, which needed the
 * output piped through `tee` and so broke TTY progress reporting.
 *
 * @param mode Eval mode whose results file should be inspected
 * @returns The share URL, or `undefined` when the file is missing, unreadable,
 *          not valid JSON, or has no `shareableUrl`
 */
export function extractShareUrl(mode) {
    const resultsFile = resolve(ROOT, RESULTS_FILES[mode]);
    if (existsSync(resultsFile)) {
        try {
            const parsed = JSON.parse(readFileSync(resultsFile, "utf-8"));
            const url = parsed.shareableUrl;
            // Normalize an explicit null to undefined.
            return url === null ? undefined : url;
        }
        catch {
            // Best effort: an unreadable or malformed file just means "no URL".
        }
    }
    return undefined;
}
|
|
79
|
-
// ---------------------------------------------------------------------------
|
|
80
|
-
// Post-eval error scanning
|
|
81
|
-
// ---------------------------------------------------------------------------
|
|
82
|
-
/**
 * Run the promptfoo evaluation for one eval mode.
 *
 * Order of work: check preconditions (generated configs, canonical contexts),
 * compute the eval fingerprint, consult the local then the remote cache, run
 * promptfoo, verify the results file, print an error scan, record the cache.
 *
 * @param mode            Eval mode; selects the config file and results file
 * @param debug           Debug options; when enabled, filter flags restrict the
 *                        test set and no fingerprint or cache entry is produced
 * @param concurrency     Passed to promptfoo as `--max-concurrency` when truthy
 * @param noCache         Skip both cache lookups, pass `--no-cache` to promptfoo,
 *                        and do not record a cache entry
 * @param remoteCacheOpts Options for the Content Lake fingerprint lookup
 * @returns `{ stepResult, evalFingerprint?, remoteCacheHit? }`; failures are
 *          reported through `stepResult.status === "failed"`
 */
export async function runEval(mode, debug, concurrency, noCache = false, remoteCacheOpts) {
    const start = Date.now();
    // Precondition: config file exists
    const configIssues = checkGeneratedConfigsExist(ROOT);
    const configErrors = configIssues.filter((i) => i.severity === "error");
    if (configErrors.length > 0) {
        return {
            stepResult: {
                durationMs: Date.now() - start,
                error: `Config files missing: ${configErrors.map((e) => e.message).join("; ")}`,
                status: "failed",
            },
        };
    }
    // Precondition: canonical context files exist for all mapped tasks
    const mappings = resolveMappings(ROOT);
    const taskIds = Object.values(mappings.feature_areas).flatMap((area) => area.tasks.map((t) => t.id));
    const contextIssues = checkCanonicalContextsExist(ROOT, taskIds);
    const contextErrors = contextIssues.filter((i) => i.severity === "error");
    if (contextErrors.length > 0) {
        return {
            stepResult: {
                durationMs: Date.now() - start,
                error: `Context files missing. Run 'pnpm fetch-docs' first. ${contextErrors.map((e) => e.message).join("; ")}`,
                status: "failed",
            },
        };
    }
    // -----------------------------------------------------------------------
    // Compute eval fingerprint (used for both remote cache + provenance)
    // Only for non-debug runs — debug runs use test subsets.
    // -----------------------------------------------------------------------
    let evalFingerprint;
    if (!debug?.enabled && remoteCacheOpts?.graderModel) {
        try {
            evalFingerprint = computeEvalFingerprint({
                filter: remoteCacheOpts.filter,
                graderModel: remoteCacheOpts.graderModel,
                mode,
                rootDir: ROOT,
            });
        }
        catch (err) {
            // Best effort: without a fingerprint the run simply skips the remote cache.
            console.warn(` ⚠️ Could not compute eval fingerprint: ${err instanceof Error ? err.message : String(err)}`);
        }
    }
    // -----------------------------------------------------------------------
    // Cache check — local first, then remote
    // -----------------------------------------------------------------------
    // Local cache check — skip eval if inputs unchanged (biggest cost saver).
    // Each mode gets its own cache key so that in `full` mode, a fresh agentic
    // cache doesn't force baseline to re-run (or vice versa).
    const cacheKey = `eval-${mode}`;
    if (!noCache) {
        const cacheResult = lookupCache(ROOT, cacheKey);
        if (cacheResult.hit) {
            return {
                evalFingerprint,
                stepResult: {
                    durationMs: Date.now() - start,
                    status: "success",
                    summary: `Skipped (cached) — ${cacheResult.entry.summary}`,
                },
            };
        }
    }
    // Remote cache check — query Content Lake for matching fingerprint
    if (evalFingerprint &&
        !noCache &&
        !remoteCacheOpts?.noRemoteCache &&
        remoteCacheOpts?.sanityToken) {
        const remoteCacheResult = await checkRemoteCache(evalFingerprint, remoteCacheOpts.sanityToken);
        if (remoteCacheResult) {
            return {
                evalFingerprint,
                remoteCacheHit: true,
                stepResult: {
                    durationMs: Date.now() - start,
                    status: "success",
                    summary: `Skipped (remote cache hit) — reusing report ${remoteCacheResult.reportId} from ${remoteCacheResult.completedAt}`,
                },
            };
        }
    }
    // Execute — run promptfoo directly with inherited stdio so the TTY
    // progress bar works in interactive terminals and the CI progress
    // reporter works in CI environments. Previously this was piped through
    // `tee` to capture a log file for share-URL extraction, but `tee`
    // destroyed TTY detection, disabling all progress output. The share URL
    // is now read from the eval results JSON (`shareableUrl` field) instead.
    //
    // Sharing is enabled by default (via PROMPTFOO_API_KEY / cloud config).
    // We set PROMPTFOO_DISABLE_SHARE_EMAIL_REQUEST=1 to prevent promptfoo's
    // interactive email prompt from blocking the terminal in local TTY
    // environments. In CI, isCI() already guards against the prompt, but
    // the env var provides defense-in-depth for all execution contexts.
    const configFile = CONFIG_FILES[mode];
    const filterFlags = buildFilterFlags(debug);
    const concurrencyFlag = concurrency ? ` --max-concurrency ${concurrency}` : "";
    const noCacheFlag = noCache ? " --no-cache" : "";
    const evalCmd = `dotenv -e ../../.env -o -- promptfoo eval --config ${configFile}${filterFlags}${concurrencyFlag}${noCacheFlag}`;
    try {
        execSync(evalCmd, {
            cwd: ROOT,
            env: {
                ...process.env,
                PROMPTFOO_DISABLE_SHARE_EMAIL_REQUEST: "1",
            },
            stdio: "inherit",
        });
    }
    catch (err) {
        const isErrorObject = err !== null && typeof err === "object";
        const rawStatus = isErrorObject && "status" in err ? err.status : 1;
        const signal = isErrorObject && "signal" in err ? err.signal : null;
        // `status` is null when the child was terminated by a signal; treat any
        // non-numeric status as a generic failure instead of printing "null".
        const exitCode = typeof rawStatus === "number" ? rawStatus : 1;
        // promptfoo exits 100 when assertions fail — that's expected, not an error
        if (exitCode !== 100) {
            const signalNote = signal ? ` (terminated by signal ${signal})` : "";
            return {
                evalFingerprint,
                stepResult: {
                    durationMs: Date.now() - start,
                    error: `promptfoo eval failed with exit code ${exitCode}${signalNote}`,
                    status: "failed",
                },
            };
        }
    }
    // Postcondition: results file exists and is valid
    const resultsIssues = checkResultsExist(ROOT, RESULTS_FILES[mode]);
    const resultsErrors = resultsIssues.filter((i) => i.severity === "error");
    if (resultsErrors.length > 0) {
        return {
            evalFingerprint,
            stepResult: {
                durationMs: Date.now() - start,
                error: `Postcondition failed: ${resultsErrors.map((e) => e.message).join("; ")}`,
                status: "failed",
            },
        };
    }
    // Scan results for errors and surface them clearly
    const errorSummary = scanResultsForErrors(resolve(ROOT, RESULTS_FILES[mode]));
    if (errorSummary) {
        console.log();
        console.log(errorSummary);
    }
    const durationMs = Date.now() - start;
    const summary = `Evaluation complete (mode: ${mode}${debug?.enabled ? ", debug" : ""})`;
    // Record cache — only for non-debug runs (debug uses a subset of tests).
    // Uses per-mode cache key so baseline and agentic are independently cached.
    if (!noCache && !debug?.enabled) {
        const inputPaths = getStepInputPaths(ROOT, cacheKey);
        const inputHash = hashFiles(inputPaths);
        recordCache(ROOT, cacheKey, inputHash, summary, durationMs, [
            RESULTS_FILES[mode],
        ]);
    }
    return {
        evalFingerprint,
        stepResult: { durationMs, status: "success", summary },
    };
}
|
|
246
|
-
// ---------------------------------------------------------------------------
|
|
247
|
-
// Remote cache helpers
|
|
248
|
-
// ---------------------------------------------------------------------------
|
|
249
|
-
/**
 * Look in the Sanity Content Lake for a report whose eval fingerprint matches.
 *
 * When one is found, its score summary is written to
 * results/latest/score-summary.json so the downstream steps (report, compare,
 * publish) can carry on as though the eval had just been executed.
 *
 * Any error is logged as a warning and treated as a miss.
 *
 * @param fingerprint Eval fingerprint to look up
 * @param sanityToken Sanity API token used by the report store
 * @returns `{ completedAt, reportId }` of the matched report on hit, null on miss or error
 */
async function checkRemoteCache(fingerprint, sanityToken) {
    try {
        // Loaded lazily, only when a remote lookup is actually attempted.
        const reportStoreModule = await import("../../report-store.js");
        const store = new reportStoreModule.ReportStore({
            dataset: process.env.AILF_REPORT_DATASET,
            projectId: process.env.AILF_REPORT_PROJECT_ID,
            token: sanityToken,
        });
        const queryStartedAt = Date.now();
        const hit = await store.findByFingerprint(fingerprint);
        const elapsedMs = Date.now() - queryStartedAt;
        if (hit) {
            // Materialize the cached score summary on disk for downstream steps
            const latestDir = resolve(ROOT, "results", "latest");
            if (!existsSync(latestDir)) {
                mkdirSync(latestDir, { recursive: true });
            }
            const summaryJson = JSON.stringify(hit.summary, null, 2);
            writeFileSync(resolve(latestDir, "score-summary.json"), summaryJson);
            console.log(` ✅ Remote cache hit — reusing report ${hit.id} from ${hit.completedAt}`);
            console.log(` ℹ️ Fingerprint: ${fingerprint.slice(0, 16)}... (${elapsedMs}ms)`);
            console.log(" ⚠️ Cached scores are statistically equivalent, not identical");
            return {
                completedAt: hit.completedAt,
                reportId: hit.id,
            };
        }
        console.log(` ℹ️ Remote cache miss — no report matches fingerprint (${elapsedMs}ms)`);
        return null;
    }
    catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        console.warn(` ⚠️ Remote cache check failed: ${reason}`);
        return null;
    }
}
|
|
292
|
-
/**
 * Read the eval results JSON and produce a human-readable summary of any
 * errored tests. This surfaces API errors, timeouts, and other issues that
 * would otherwise be buried in the Promptfoo table output.
 *
 * A result counts as errored when it has no grading result, i.e. its
 * `gradingResult` is null or the key is missing altogether. Array entries
 * that are not objects are ignored.
 *
 * @param resultsPath Absolute path of the eval results JSON file
 * @returns The formatted multi-line summary, or null when the file is
 *          missing, unreadable, has no `results.results` array, or contains
 *          no errored results
 */
function scanResultsForErrors(resultsPath) {
    if (!existsSync(resultsPath))
        return null;
    let file;
    try {
        const raw = readFileSync(resultsPath, "utf-8");
        file = JSON.parse(raw);
    }
    catch {
        // Unreadable or malformed results are reported by the postcondition
        // check in the caller; here there is simply nothing to scan.
        return null;
    }
    const results = file?.results?.results;
    if (!Array.isArray(results))
        return null;
    const errored = [];
    for (const r of results) {
        // Guard against malformed entries instead of throwing on property access.
        if (r === null || typeof r !== "object")
            continue;
        // `!= null` on purpose: a missing key (undefined) is just as much
        // "no grading result" as an explicit null.
        if (r.gradingResult != null)
            continue;
        const desc = r.testCase?.description ?? r.description ?? "unknown";
        const provider = r.provider?.label ?? r.provider?.id ?? "unknown";
        // No grading result = the provider errored before producing a response.
        // This is the only case we surface — API 500s, timeouts, rate limits.
        // Note: r.error may also be set for assertion failures, but those have
        // a non-null gradingResult and are normal pass/fail outcomes.
        const errorMsg = r.error
            ? (typeof r.error === "string" ? r.error : JSON.stringify(r.error)).slice(0, 200)
            : "Provider returned no scorable result";
        errored.push({ description: desc, error: errorMsg, provider });
    }
    if (errored.length === 0)
        return null;
    const total = results.length;
    const lines = [];
    lines.push(` ┌─────────────────────────────────────────────────────────────`);
    lines.push(` │ ⚠️ ${errored.length} of ${total} eval result(s) errored (no gradingResult)`);
    lines.push(` │`);
    for (const e of errored) {
        lines.push(` │ ✗ [${e.provider}] ${e.description}`);
        lines.push(` │ → ${e.error}`);
    }
    const errorRate = Math.round((errored.length / total) * 100);
    if (errorRate >= 25) {
        lines.push(` │`);
        lines.push(` │ 🔥 High error rate (${errorRate}%) — check API keys, rate limits,`);
        lines.push(` │ or model availability. Errored results are excluded from scoring.`);
    }
    lines.push(` └─────────────────────────────────────────────────────────────`);
    return lines.join("\n");
}
|
|
@@ -1,11 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Pipeline step: Fetch documentation from Sanity CMS.
|
|
3
|
-
*
|
|
4
|
-
* Preconditions: tasks have inline canonical_docs
|
|
5
|
-
* Postconditions: canonical context files exist for all mapped tasks
|
|
6
|
-
*
|
|
7
|
-
* Cache key: tasks/*.yaml + config/sources.yaml + config/models.yaml
|
|
8
|
-
* Cache outputs: contexts/canonical/*.md files
|
|
9
|
-
*/
|
|
10
|
-
import type { StepResult } from "../types.js";
|
|
11
|
-
/**
 * Run the fetch-docs pipeline step.
 *
 * @param source - Optional value forwarded as `--source <value>` to the
 *   `pnpm fetch-docs` command.
 * @param noCache - When true, skips both the step-cache lookup and the
 *   cache record after a successful run. Defaults to false.
 * @returns A step result whose `status` is "success" (with a `summary`) or
 *   "failed" (with an `error` message); both carry `durationMs`.
 */
export declare function runFetchDocs(source?: string, noCache?: boolean): Promise<StepResult>;
|
|
@@ -1,84 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Pipeline step: Fetch documentation from Sanity CMS.
|
|
3
|
-
*
|
|
4
|
-
* Preconditions: tasks have inline canonical_docs
|
|
5
|
-
* Postconditions: canonical context files exist for all mapped tasks
|
|
6
|
-
*
|
|
7
|
-
* Cache key: tasks/*.yaml + config/sources.yaml + config/models.yaml
|
|
8
|
-
* Cache outputs: contexts/canonical/*.md files
|
|
9
|
-
*/
|
|
10
|
-
import { execSync } from "child_process";
|
|
11
|
-
import { dirname, resolve } from "path";
|
|
12
|
-
import { fileURLToPath } from "url";
|
|
13
|
-
import { lookupCache, recordCache } from "../cache.js";
|
|
14
|
-
import { checkCanonicalContextsExist } from "../checks.js";
|
|
15
|
-
import { resolveMappings } from "../resolve-mappings.js";
|
|
16
|
-
// ES modules do not provide __dirname, so derive it from this module's URL.
// ROOT is the directory three levels above this file's directory; the step
// passes it to the mapping/cache helpers and uses it as the cwd for pnpm.
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
17
|
-
const ROOT = resolve(__dirname, "..", "..", "..");
|
|
18
|
-
/**
 * Run the fetch-docs pipeline step.
 *
 * Flow: check that at least one task has canonical mappings, return early on
 * a cache hit (unless `noCache`), shell out to `pnpm fetch-docs`, verify that
 * canonical context files exist for every mapped task, then record the run in
 * the step cache.
 *
 * @param {string} [source] - Optional value forwarded as `--source <value>`
 *   to `pnpm fetch-docs`. Must consist only of letters, digits and
 *   `_ . @ : / = + , -`, because it is placed on a shell command line.
 * @param {boolean} [noCache=false] - When true, skip both the cache lookup
 *   and the cache record.
 * @returns {Promise<object>} `{ durationMs, status: "success", summary }` on
 *   success, or `{ durationMs, error, status: "failed" }` on failure.
 */
export async function runFetchDocs(source, noCache = false) {
    const start = Date.now();
    // Single place that builds the failed-step shape used by every error path.
    const fail = (error) => ({
        durationMs: Date.now() - start,
        error,
        status: "failed",
    });
    // Precondition: at least one task has inline canonical mappings
    const mappings = resolveMappings(ROOT);
    const totalTasks = Object.values(mappings.feature_areas).reduce((sum, area) => sum + area.tasks.length, 0);
    if (totalTasks === 0) {
        return fail("No tasks with canonical_docs found in task files. Add canonical_docs to your task definitions.");
    }
    // Cache check
    if (!noCache) {
        const cacheResult = lookupCache(ROOT, "fetch-docs");
        if (cacheResult.hit) {
            return {
                durationMs: Date.now() - start,
                status: "success",
                summary: `Skipped (cached) — ${cacheResult.entry.summary}`,
            };
        }
    }
    // `source` ends up inside a shell command string, so refuse anything that
    // a shell could interpret (whitespace, quotes, `;`, `|`, `&`, `$`, ...)
    // rather than passing it through unescaped.
    let sourceArg = "";
    if (source) {
        const sourceText = String(source);
        if (!/^[\w.@:\/=+,-]+$/.test(sourceText)) {
            return fail(`fetch-docs failed: invalid source ${JSON.stringify(sourceText)} (allowed characters: letters, digits and _ . @ : / = + , -)`);
        }
        sourceArg = ` --source ${sourceText}`;
    }
    // Execute
    try {
        execSync(`pnpm fetch-docs${sourceArg}`, {
            cwd: ROOT,
            env: process.env,
            stdio: "inherit",
        });
    }
    catch (err) {
        return fail(`fetch-docs failed: ${err instanceof Error ? err.message : String(err)}`);
    }
    // Postcondition: canonical context files exist for all mapped tasks
    // Re-resolve in case fetch-docs modified things (unlikely but safe)
    const postMappings = resolveMappings(ROOT);
    const taskIds = Object.values(postMappings.feature_areas).flatMap((area) => area.tasks.map((t) => t.id));
    const contextIssues = checkCanonicalContextsExist(ROOT, taskIds);
    const contextErrors = contextIssues.filter((i) => i.severity === "error");
    if (contextErrors.length > 0) {
        return fail(`Postcondition failed: ${contextErrors.map((e) => e.message).join("; ")}`);
    }
    const durationMs = Date.now() - start;
    const summary = `Fetched canonical contexts for ${taskIds.length} tasks`;
    // Record cache
    if (!noCache) {
        const { getStepInputPaths, hashFiles } = await import("../cache.js");
        const inputPaths = getStepInputPaths(ROOT, "fetch-docs");
        const inputHash = hashFiles(inputPaths);
        const outputPaths = [
            ...taskIds.map((id) => `contexts/canonical/${id}.md`),
            "contexts/document-manifest.json",
        ];
        recordCache(ROOT, "fetch-docs", inputHash, summary, durationMs, outputPaths);
    }
    return { durationMs, status: "success", summary };
}
|
|
@@ -1,11 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Pipeline step: Generate promptfoo config files from config/models.yaml.
|
|
3
|
-
*
|
|
4
|
-
* Preconditions: config/models.yaml is valid
|
|
5
|
-
* Postconditions: promptfooconfig*.yaml files exist
|
|
6
|
-
*
|
|
7
|
-
* Cache key: config/models.yaml + config/sources.yaml + tasks/*.yaml
|
|
8
|
-
* Cache outputs: promptfooconfig*.yaml files
|
|
9
|
-
*/
|
|
10
|
-
import type { StepResult } from "../types.js";
|
|
11
|
-
/**
 * Run the generate-configs pipeline step (see the file header for its
 * preconditions, postconditions and cache key).
 *
 * Unlike an async step, this returns its result synchronously.
 *
 * @param source - NOTE(review): presumably a source selector passed through
 *   to config generation; confirm against the implementation.
 * @param noCache - NOTE(review): presumably bypasses the step cache when
 *   true; confirm against the implementation.
 * @returns The step result for this run.
 */
export declare function runGenerateConfigs(source?: string, noCache?: boolean): StepResult;
|