@sanity/ailf 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_vendor/ailf-core/examples/index.d.ts +16 -12
- package/dist/_vendor/ailf-core/examples/index.js +19 -12
- package/dist/_vendor/ailf-core/ports/context.d.ts +4 -0
- package/dist/adapters/task-sources/repo-schemas.d.ts +12 -2
- package/dist/adapters/task-sources/repo-schemas.js +28 -2
- package/dist/cli.js +0 -0
- package/dist/commands/init.js +17 -5
- package/dist/commands/pipeline-action.js +44 -6
- package/dist/commands/publish.js +2 -1
- package/dist/commands/validate-tasks.js +4 -1
- package/dist/composition-root.js +9 -5
- package/dist/orchestration/build-app-context.js +2 -0
- package/package.json +1 -1
- package/dist/commands/update-quality-scores.d.ts +0 -5
- package/dist/commands/update-quality-scores.js +0 -20
- package/dist/lib/agent-behavior-report.d.ts +0 -8
- package/dist/lib/agent-behavior-report.js +0 -185
- package/dist/lib/baseline.d.ts +0 -19
- package/dist/lib/baseline.js +0 -153
- package/dist/lib/calculate-scores.d.ts +0 -23
- package/dist/lib/calculate-scores.js +0 -42
- package/dist/lib/compare.d.ts +0 -18
- package/dist/lib/compare.js +0 -170
- package/dist/lib/coverage-audit.d.ts +0 -4
- package/dist/lib/coverage-audit.js +0 -42
- package/dist/lib/discovery-report.d.ts +0 -13
- package/dist/lib/discovery-report.js +0 -57
- package/dist/lib/fetch-docs.d.ts +0 -30
- package/dist/lib/fetch-docs.js +0 -171
- package/dist/lib/generate-configs.d.ts +0 -25
- package/dist/lib/generate-configs.js +0 -42
- package/dist/lib/grader-api.d.ts +0 -21
- package/dist/lib/grader-api.js +0 -34
- package/dist/lib/grader-compare.d.ts +0 -19
- package/dist/lib/grader-compare.js +0 -91
- package/dist/lib/grader-consistency.d.ts +0 -27
- package/dist/lib/grader-consistency.js +0 -79
- package/dist/lib/grader-sensitivity.d.ts +0 -19
- package/dist/lib/grader-sensitivity.js +0 -75
- package/dist/lib/grader-validate.d.ts +0 -19
- package/dist/lib/grader-validate.js +0 -78
- package/dist/lib/measure-retrieval.d.ts +0 -14
- package/dist/lib/measure-retrieval.js +0 -71
- package/dist/lib/pr-comment.d.ts +0 -16
- package/dist/lib/pr-comment.js +0 -28
- package/dist/lib/readiness-report.d.ts +0 -13
- package/dist/lib/readiness-report.js +0 -108
- package/dist/lib/webhook-server.d.ts +0 -11
- package/dist/lib/webhook-server.js +0 -24
- package/dist/lib/weekly-digest.d.ts +0 -24
- package/dist/lib/weekly-digest.js +0 -148
- package/dist/orchestration/env-bridge.d.ts +0 -21
- package/dist/orchestration/env-bridge.js +0 -66
- package/dist/orchestration/steps/fetch-docs-shell.d.ts +0 -17
- package/dist/orchestration/steps/fetch-docs-shell.js +0 -30
- package/dist/pipeline/steps/calculate-scores-step.d.ts +0 -11
- package/dist/pipeline/steps/calculate-scores-step.js +0 -89
- package/dist/pipeline/steps/compare-step.d.ts +0 -18
- package/dist/pipeline/steps/compare-step.js +0 -90
- package/dist/pipeline/steps/eval-step.d.ts +0 -53
- package/dist/pipeline/steps/eval-step.js +0 -347
- package/dist/pipeline/steps/fetch-docs-step.d.ts +0 -11
- package/dist/pipeline/steps/fetch-docs-step.js +0 -84
- package/dist/pipeline/steps/generate-configs-step.d.ts +0 -11
- package/dist/pipeline/steps/generate-configs-step.js +0 -98
- package/dist/pipeline/steps/grader-consistency-step.d.ts +0 -21
- package/dist/pipeline/steps/grader-consistency-step.js +0 -74
- package/dist/pipeline/steps/publish-report-step.d.ts +0 -57
- package/dist/pipeline/steps/publish-report-step.js +0 -243
- package/dist/pipeline/steps/report-step.d.ts +0 -13
- package/dist/pipeline/steps/report-step.js +0 -56
- package/dist/pipeline/steps/update-scores-step.d.ts +0 -11
- package/dist/pipeline/steps/update-scores-step.js +0 -42
- package/dist/scripts/agent-behavior-report.d.ts +0 -19
- package/dist/scripts/agent-behavior-report.js +0 -315
- package/dist/scripts/baseline.d.ts +0 -43
- package/dist/scripts/baseline.js +0 -267
- package/dist/scripts/calculate-scores.d.ts +0 -166
- package/dist/scripts/calculate-scores.js +0 -1296
- package/dist/scripts/compare.d.ts +0 -22
- package/dist/scripts/compare.js +0 -334
- package/dist/scripts/coverage-audit.d.ts +0 -44
- package/dist/scripts/coverage-audit.js +0 -209
- package/dist/scripts/debug-eval.d.ts +0 -19
- package/dist/scripts/debug-eval.js +0 -73
- package/dist/scripts/discovery-report.d.ts +0 -58
- package/dist/scripts/discovery-report.js +0 -250
- package/dist/scripts/fetch-docs.d.ts +0 -35
- package/dist/scripts/fetch-docs.js +0 -472
- package/dist/scripts/generate-configs.d.ts +0 -66
- package/dist/scripts/generate-configs.js +0 -459
- package/dist/scripts/grader-api.d.ts +0 -27
- package/dist/scripts/grader-api.js +0 -206
- package/dist/scripts/grader-compare.d.ts +0 -22
- package/dist/scripts/grader-compare.js +0 -368
- package/dist/scripts/grader-consistency.d.ts +0 -20
- package/dist/scripts/grader-consistency.js +0 -313
- package/dist/scripts/grader-sensitivity.d.ts +0 -22
- package/dist/scripts/grader-sensitivity.js +0 -354
- package/dist/scripts/grader-validate.d.ts +0 -19
- package/dist/scripts/grader-validate.js +0 -267
- package/dist/scripts/measure-retrieval.d.ts +0 -10
- package/dist/scripts/measure-retrieval.js +0 -145
- package/dist/scripts/pipeline.d.ts +0 -76
- package/dist/scripts/pipeline.js +0 -1031
- package/dist/scripts/pr-comment.d.ts +0 -10
- package/dist/scripts/pr-comment.js +0 -510
- package/dist/scripts/readiness-report.d.ts +0 -88
- package/dist/scripts/readiness-report.js +0 -342
- package/dist/scripts/update-quality-scores.d.ts +0 -15
- package/dist/scripts/update-quality-scores.js +0 -184
- package/dist/scripts/validate.d.ts +0 -13
- package/dist/scripts/validate.js +0 -79
- package/dist/scripts/webhook-server.d.ts +0 -26
- package/dist/scripts/webhook-server.js +0 -147
- package/dist/scripts/weekly-digest.d.ts +0 -24
- package/dist/scripts/weekly-digest.js +0 -144
- package/dist/sinks/format-slack.d.ts +0 -64
- package/dist/sinks/format-slack.js +0 -306
- package/dist/sinks/slack-sink.d.ts +0 -27
- package/dist/sinks/slack-sink.js +0 -78
- package/dist/sinks/webhook-sink.d.ts +0 -19
- package/dist/sinks/webhook-sink.js +0 -50
- package/tasks/.expanded.agentic.yaml +0 -51
- package/tasks/.expanded.yaml +0 -66
|
@@ -1,66 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Environment variable bridge — writes ResolvedConfig values to process.env
|
|
3
|
-
* so that lib/*.ts modules (which still read process.env) work correctly.
|
|
4
|
-
*
|
|
5
|
-
* This replaces the former global applyEnvironment() with an explicit
|
|
6
|
-
* per-step bridge. Each orchestration step calls this before invoking
|
|
7
|
-
* its lib/*.ts main() function.
|
|
8
|
-
*
|
|
9
|
-
* Phase 9 will eliminate this file entirely by giving lib/*.ts main()
|
|
10
|
-
* functions typed option parameters.
|
|
11
|
-
*
|
|
12
|
-
* @see docs/exec-plans/active/ports-and-adapters/phase-8-delete-legacy-step-layer.md
|
|
13
|
-
*/
|
|
14
|
-
/**
 * Bridge ResolvedConfig values to process.env.
 *
 * Copies each defined config value onto its corresponding environment
 * variable so that modules which still read process.env see the resolved
 * configuration.
 *
 * Idempotent — safe to call multiple times. Only sets env vars for
 * config values that are defined (never deletes or resets), so a variable
 * written by an earlier call survives a later call that omits the value.
 *
 * @param config Resolved configuration object; missing fields are skipped.
 */
export function bridgeConfigToEnv(config) {
    const env = process.env;
    // Mode. Guarded: assigning undefined to a process.env property stores the
    // literal string "undefined" instead of leaving the variable unset.
    if (config.mode != null) {
        env.EVAL_MODE = config.mode;
    }
    // Search mode — the value "open" is never written to the environment.
    if (config.searchMode != null && config.searchMode !== "open") {
        env.EVAL_SEARCH_MODE = config.searchMode;
    }
    // Source
    if (config.source) {
        env.DOC_SOURCE = config.source;
    }
    // URL-derived overrides — only the first URL is exported.
    if (config.urls?.[0]) {
        env.DOC_BASE_URL = config.urls[0];
    }
    // Sanity overrides
    if (config.datasetOverride) {
        env.SANITY_DATASET = config.datasetOverride;
    }
    if (config.projectIdOverride) {
        env.SANITY_PROJECT_ID = config.projectIdOverride;
    }
    if (config.perspectiveOverride) {
        env.SANITY_PERSPECTIVE = config.perspectiveOverride;
    }
    if (config.studioOriginOverride) {
        env.SANITY_STUDIO_ORIGIN = config.studioOriginOverride;
    }
    if (config.sanityDocumentArgs?.length) {
        env.SANITY_DOCUMENT_IDS = config.sanityDocumentArgs.join(",");
    }
    // Custom headers, serialized as JSON.
    if (config.headers) {
        env.DOC_HEADERS = JSON.stringify(config.headers);
    }
    // Allowed origins
    if (config.allowedOrigins?.length) {
        env.DOC_ALLOWED_ORIGINS = config.allowedOrigins.join(",");
    }
    // Scoping filters. Unlike the list fields above, an empty array is still
    // exported here (as an empty string); that existing behavior is kept.
    if (config.areas) {
        env.EVAL_FILTER_AREAS = config.areas.join(",");
    }
    if (config.tasks) {
        env.EVAL_FILTER_TASKS = config.tasks.join(",");
    }
}
|
|
@@ -1,17 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Shell delegation for the fetch-docs step.
|
|
3
|
-
*
|
|
4
|
-
* Isolates the execSync call so it can be replaced when the pipeline
|
|
5
|
-
* fully migrates to the DocFetcher port.
|
|
6
|
-
*/
|
|
7
|
-
/** Outcome of a shell-delegated step, returned instead of throwing. */
export interface ShellResult {
|
|
8
|
-
/** True when the shell command completed without throwing. */
ok: boolean;
|
|
9
|
-
/** Message of the error raised by the shell call; set when `ok` is false. */
error?: string;
|
|
10
|
-
}
|
|
11
|
-
/**
 * Run `pnpm fetch-docs` via shell.
 *
 * Returns a result object instead of throwing so the step can
 * handle the failure uniformly.
 *
 * @param rootDir Directory used as the working directory of the command.
 * @param source Optional value appended to the command as `--source <value>`.
 * @returns A ShellResult whose `ok` flag reports whether the command succeeded.
 */
|
|
17
|
-
export declare function runFetchDocsShell(rootDir: string, source?: string): ShellResult;
|
|
@@ -1,30 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Shell delegation for the fetch-docs step.
|
|
3
|
-
*
|
|
4
|
-
* Isolates the execSync call so it can be replaced when the pipeline
|
|
5
|
-
* fully migrates to the DocFetcher port.
|
|
6
|
-
*/
|
|
7
|
-
import { execSync } from "child_process";
|
|
8
|
-
/**
 * Run `pnpm fetch-docs` via shell.
 *
 * Returns a result object instead of throwing so the step can
 * handle the failure uniformly.
 *
 * @param rootDir Directory used as the working directory of the command.
 * @param source Optional value forwarded as `--source <value>`. The command
 *   line is interpreted by a shell, so a value containing anything other than
 *   letters, digits, `_` and `@ % + = : , . / -` is rejected, not executed.
 * @returns `{ ok: true }` on success, otherwise `{ ok: false, error }`.
 */
export function runFetchDocsShell(rootDir, source) {
    // Refuse values a shell would act on (spaces, quotes, `;`, `$(...)`, ...)
    // before any command string is built.
    if (source && !/^[\w@%+=:,.\/-]+$/.test(source)) {
        return {
            ok: false,
            error: `Invalid source (unsafe characters): ${JSON.stringify(source)}`,
        };
    }
    try {
        const sourceArg = source ? ` --source ${source}` : "";
        execSync(`pnpm fetch-docs${sourceArg}`, {
            cwd: rootDir,
            env: process.env,
            // Child output goes straight to the parent's stdio.
            stdio: "inherit",
        });
        return { ok: true };
    }
    catch (err) {
        return {
            ok: false,
            error: err instanceof Error ? err.message : String(err),
        };
    }
}
|
|
@@ -1,11 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Pipeline step: Calculate AI Literacy Scores from eval results.
|
|
3
|
-
*
|
|
4
|
-
* Preconditions: eval-results.json exists and is valid
|
|
5
|
-
* Postconditions: score-summary.json exists and is valid
|
|
6
|
-
*
|
|
7
|
-
* Cache key: eval results JSON file(s)
|
|
8
|
-
* Cache outputs: results/latest/score-summary.json
|
|
9
|
-
*/
|
|
10
|
-
import type { EvalMode, StepResult } from "../types.js";
|
|
11
|
-
/**
 * Run the calculate-scores pipeline step.
 *
 * @param source Optional value forwarded to the scoring script as `--source`.
 * @param mode Eval mode; defaults to "baseline". "full" uses the baseline
 *   results file as the primary input.
 * @param noCache When true, skips both the cache lookup and the cache record.
 *   Defaults to false.
 * @returns A StepResult with status "success" or "failed".
 */
export declare function runCalculateScores(source?: string, mode?: EvalMode, noCache?: boolean): StepResult;
|
|
@@ -1,89 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Pipeline step: Calculate AI Literacy Scores from eval results.
|
|
3
|
-
*
|
|
4
|
-
* Preconditions: eval-results.json exists and is valid
|
|
5
|
-
* Postconditions: score-summary.json exists and is valid
|
|
6
|
-
*
|
|
7
|
-
* Cache key: eval results JSON file(s)
|
|
8
|
-
* Cache outputs: results/latest/score-summary.json
|
|
9
|
-
*/
|
|
10
|
-
import { execSync } from "child_process";
|
|
11
|
-
import { dirname, resolve } from "path";
|
|
12
|
-
import { fileURLToPath } from "url";
|
|
13
|
-
import { getStepInputPaths, hashFiles, lookupCache, recordCache, } from "../cache.js";
|
|
14
|
-
import { checkResultsExist, checkScoreSummaryValid } from "../checks.js";
|
|
15
|
-
import { RESULTS_FILES } from "./eval-step.js";
|
|
16
|
-
const __dirname = dirname(fileURLToPath(import.meta.url));
// Package root: three directory levels above this module's directory.
const ROOT = resolve(__dirname, "..", "..", "..");
/**
 * Pipeline step: run the calculate-scores script and verify its output.
 *
 * Flow: validate `source`, check that the results file for the mode exists,
 * consult the step cache, run the script, check the score-summary
 * postcondition, then record the cache entry.
 *
 * @param source Optional value forwarded to the script as `--source <value>`.
 *   The command line is interpreted by a shell, so a value containing
 *   anything other than letters, digits, `_` and `@ % + = : , . / -` makes
 *   the step fail instead of being executed.
 * @param mode Eval mode; "full" is scored from the baseline results file.
 * @param noCache When true, skips both the cache lookup and the cache record.
 * @returns A StepResult with status "success" or "failed".
 */
export function runCalculateScores(source, mode = "baseline", noCache = false) {
    const start = Date.now();
    // `source` is spliced into a shell command below; refuse anything a shell
    // would act on (spaces, quotes, `;`, `$(...)`, ...).
    if (source && !/^[\w@%+=:,.\/-]+$/.test(source)) {
        return {
            durationMs: Date.now() - start,
            error: `Invalid source (unsafe characters): ${JSON.stringify(source)}`,
            status: "failed",
        };
    }
    // For full mode, use the baseline results file as the primary input
    // (calculate-scores reads all available results files internally)
    const primaryMode = mode === "full" ? "baseline" : mode;
    const resultsFile = RESULTS_FILES[primaryMode];
    // Precondition: only error-severity issues block the step.
    const resultsIssues = checkResultsExist(ROOT, resultsFile);
    const resultsErrors = resultsIssues.filter((i) => i.severity === "error");
    if (resultsErrors.length > 0) {
        return {
            durationMs: Date.now() - start,
            error: `Results missing: ${resultsErrors.map((e) => e.message).join("; ")}`,
            status: "failed",
        };
    }
    // Cache check
    if (!noCache) {
        const cacheResult = lookupCache(ROOT, "calculate-scores");
        if (cacheResult.hit) {
            return {
                durationMs: Date.now() - start,
                status: "success",
                summary: `Skipped (cached) — ${cacheResult.entry.summary}`,
            };
        }
    }
    // Execute — note: calculate-scores exits 1 when areas are below critical,
    // which is expected behavior, not an error
    try {
        const sourceArg = source ? ` --source ${source}` : "";
        // The baseline results file is the script's default, so it is only
        // passed explicitly for other modes.
        const resultsArg = primaryMode !== "baseline" ? ` ${resultsFile}` : "";
        execSync(`tsx src/lib/calculate-scores.ts${resultsArg}${sourceArg}`, {
            cwd: ROOT,
            env: process.env,
            stdio: "inherit",
        });
    }
    catch (err) {
        // Errors without a `status` property are treated as exit code 1.
        const code = err !== null && typeof err === "object" && "status" in err
            ? err.status
            : 1;
        // Exit code 1 means "areas below critical" — that's expected
        if (code !== 1) {
            return {
                durationMs: Date.now() - start,
                error: `calculate-scores failed with exit code ${code}`,
                status: "failed",
            };
        }
    }
    // Postcondition: score summary exists and is valid
    const summaryIssues = checkScoreSummaryValid(ROOT);
    const summaryErrors = summaryIssues.filter((i) => i.severity === "error");
    if (summaryErrors.length > 0) {
        return {
            durationMs: Date.now() - start,
            error: `Postcondition failed: ${summaryErrors.map((e) => e.message).join("; ")}`,
            status: "failed",
        };
    }
    const durationMs = Date.now() - start;
    const summary = "Scores calculated and summary written";
    // Record cache
    if (!noCache) {
        const inputPaths = getStepInputPaths(ROOT, "calculate-scores");
        const inputHash = hashFiles(inputPaths);
        recordCache(ROOT, "calculate-scores", inputHash, summary, durationMs, [
            "results/latest/score-summary.json",
        ]);
    }
    return { durationMs, status: "success", summary };
}
|
|
@@ -1,18 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Pipeline step: Compare current scores against a baseline.
|
|
3
|
-
*
|
|
4
|
-
* Preconditions: score-summary.json exists
|
|
5
|
-
* Postconditions: comparison-report.json written to results/latest/
|
|
6
|
-
*
|
|
7
|
-
* This step is optional — it only runs when --compare is passed
|
|
8
|
-
* (or a baseline exists and auto-compare is enabled).
|
|
9
|
-
*/
|
|
10
|
-
import type { CompareOptions, StepResult } from "../types.js";
|
|
11
|
-
/**
 * Run comparison against a baseline.
 *
 * Reads `results/latest/score-summary.json` under `rootDir`, compares it with
 * the baseline, and writes `results/latest/comparison-report.json`.
 *
 * @param rootDir Package root directory
 * @param baselinePath Explicit baseline file path (optional — when omitted, the
 *   `.json` file in `results/baselines` whose name sorts last is used)
 * @param options Compare options (noise threshold, etc.)
 * @returns A StepResult: "success" with a delta summary, "failed" when an
 *   input file is missing, or "skipped" when no baseline is available.
 */
|
|
18
|
-
export declare function runCompare(rootDir: string, baselinePath?: string, options?: CompareOptions): StepResult;
|
|
@@ -1,90 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Pipeline step: Compare current scores against a baseline.
|
|
3
|
-
*
|
|
4
|
-
* Preconditions: score-summary.json exists
|
|
5
|
-
* Postconditions: comparison-report.json written to results/latest/
|
|
6
|
-
*
|
|
7
|
-
* This step is optional — it only runs when --compare is passed
|
|
8
|
-
* (or a baseline exists and auto-compare is enabled).
|
|
9
|
-
*/
|
|
10
|
-
import { existsSync, readFileSync, readdirSync, writeFileSync } from "fs";
|
|
11
|
-
import { join, resolve } from "path";
|
|
12
|
-
import { compare } from "../compare.js";
|
|
13
|
-
/**
 * Run comparison against a baseline.
 *
 * Reads `results/latest/score-summary.json`, compares it with a baseline and
 * writes `results/latest/comparison-report.json`. Unreadable or malformed
 * input files, and a failed report write, are reported as a "failed" result
 * rather than thrown, matching how a missing input file is reported.
 *
 * @param rootDir Package root directory
 * @param baselinePath Explicit baseline file path (optional — when omitted, the
 *   `.json` file in `results/baselines` whose name sorts last is used)
 * @param options Compare options (noise threshold, etc.)
 * @returns A StepResult: "success" with a delta summary, "failed" with an
 *   error message, or "skipped" when no baseline is available.
 */
export function runCompare(rootDir, baselinePath, options) {
    const start = Date.now();
    // Build a failed StepResult stamped with the elapsed time.
    const fail = (error) => ({
        durationMs: Date.now() - start,
        error,
        status: "failed",
    });
    // Turn any thrown value into a printable message.
    const describe = (err) => (err instanceof Error ? err.message : String(err));
    const scoreSummaryPath = resolve(rootDir, "results", "latest", "score-summary.json");
    if (!existsSync(scoreSummaryPath)) {
        return fail("score-summary.json not found. Run calculate-scores first.");
    }
    // Load experiment (current run)
    let experiment;
    try {
        experiment = JSON.parse(readFileSync(scoreSummaryPath, "utf-8"));
    }
    catch (err) {
        return fail(`Could not read score-summary.json: ${describe(err)}`);
    }
    // Resolve baseline
    let resolvedBaselinePath;
    if (baselinePath) {
        resolvedBaselinePath = resolve(baselinePath);
    }
    else {
        const baselinesDir = resolve(rootDir, "results", "baselines");
        if (!existsSync(baselinesDir)) {
            return {
                reason: "No baselines directory found. Run 'pnpm baseline:save' first.",
                status: "skipped",
            };
        }
        // Descending name order, so the first entry is the name that sorts last.
        const files = readdirSync(baselinesDir)
            .filter((f) => f.endsWith(".json"))
            .sort()
            .reverse();
        if (files.length === 0) {
            return {
                reason: "No baseline files found. Run 'pnpm baseline:save' first.",
                status: "skipped",
            };
        }
        resolvedBaselinePath = join(baselinesDir, files[0]);
    }
    if (!existsSync(resolvedBaselinePath)) {
        return fail(`Baseline file not found: ${resolvedBaselinePath}`);
    }
    let baseline;
    try {
        baseline = JSON.parse(readFileSync(resolvedBaselinePath, "utf-8"));
    }
    catch (err) {
        return fail(`Could not read baseline ${resolvedBaselinePath}: ${describe(err)}`);
    }
    // Run comparison
    const report = compare(baseline, experiment, options);
    // Write report
    const reportPath = resolve(rootDir, "results", "latest", "comparison-report.json");
    try {
        writeFileSync(reportPath, JSON.stringify(report, null, 2));
    }
    catch (err) {
        return fail(`Could not write comparison report: ${describe(err)}`);
    }
    // Build summary
    const improved = report.improved.length;
    const regressed = report.regressed.length;
    const unchanged = report.unchanged.length;
    const overallDelta = report.deltas.overall;
    // Positive deltas get an explicit "+" sign.
    const deltaStr = overallDelta > 0
        ? `+${Math.round(overallDelta)}`
        : String(Math.round(overallDelta));
    const parts = [`Overall: ${deltaStr}`];
    if (improved > 0)
        parts.push(`${improved} improved`);
    if (regressed > 0)
        parts.push(`${regressed} regressed`);
    if (unchanged > 0)
        parts.push(`${unchanged} unchanged`);
    return {
        durationMs: Date.now() - start,
        status: "success",
        summary: parts.join(", "),
    };
}
|
|
@@ -1,53 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Pipeline step: Run promptfoo evaluation.
|
|
3
|
-
*
|
|
4
|
-
* Preconditions: config files and context files exist
|
|
5
|
-
* Postconditions: eval-results.json exists and is valid
|
|
6
|
-
*
|
|
7
|
-
* Cache key: promptfooconfig*.yaml + contexts/*.md + tasks/*.yaml +
|
|
8
|
-
* canonical contexts + reference solutions + config/models.yaml
|
|
9
|
-
* Cache outputs: results/latest/eval-results*.json
|
|
10
|
-
*
|
|
11
|
-
* Remote cache: When local cache misses and a Sanity token is available,
|
|
12
|
-
* the step queries the Content Lake for a report with a matching eval
|
|
13
|
-
* fingerprint. On a hit, the cached score-summary.json is written to disk
|
|
14
|
-
* and the eval + calculate-scores steps are skipped entirely.
|
|
15
|
-
*
|
|
16
|
-
* @see docs/design-docs/content-lake-eval-caching.md
|
|
17
|
-
*/
|
|
18
|
-
import type { ConcreteEvalMode, DebugOptions, FilterOptions, StepResult } from "../types.js";
|
|
19
|
-
/**
 * Results file for each concrete eval mode. Each mode writes eval results to
 * a different file (set in the config's outputPath).
 */
|
|
20
|
-
export declare const RESULTS_FILES: Record<ConcreteEvalMode, string>;
|
|
21
|
-
/**
 * Extended step result that carries cache metadata for downstream steps.
 * Wraps the plain StepResult instead of extending it.
 */
|
|
22
|
-
export interface EvalStepResult {
|
|
23
|
-
/** The computed eval fingerprint (for publishing in provenance); optional */
|
|
24
|
-
evalFingerprint?: string;
|
|
25
|
-
/** Whether this result came from a remote cache hit; optional */
|
|
26
|
-
remoteCacheHit?: boolean;
|
|
27
|
-
/** The step result of the eval step itself */
|
|
28
|
-
stepResult: StepResult;
|
|
29
|
-
}
|
|
30
|
-
/**
 * Options for the remote cache (Content Lake fingerprint lookup).
 * Only `graderModel` is required.
 */
|
|
31
|
-
export interface RemoteCacheOptions {
|
|
32
|
-
/** Whether this is a debug run (debug runs don't use remote cache) */
|
|
33
|
-
debug?: boolean;
|
|
34
|
-
/** Filter options used for fingerprint computation */
|
|
35
|
-
filter?: FilterOptions;
|
|
36
|
-
/** Grader model identifier from models.yaml (required) */
|
|
37
|
-
graderModel: string;
|
|
38
|
-
/** Disable remote cache lookup (--no-remote-cache) */
|
|
39
|
-
noRemoteCache?: boolean;
|
|
40
|
-
/** Sanity API token for reading cached reports */
|
|
41
|
-
sanityToken?: string;
|
|
42
|
-
}
|
|
43
|
-
/**
 * Build a flag string from optional debug options.
 *
 * NOTE(review): the implementation is not part of this declaration file.
 * Presumably the returned string holds command-line filter flags for the
 * eval command — confirm against eval-step.js.
 */
export declare function buildFilterFlags(debug?: DebugOptions): string;
|
|
44
|
-
/**
 * Extract the Promptfoo share URL from the eval results JSON.
 *
 * Promptfoo writes a `shareableUrl` field into the results file when
 * `PROMPTFOO_API_KEY` is set. This replaces the previous approach of
 * scraping the URL from a captured log file (which required piping
 * through `tee` and broke TTY progress reporting).
 *
 * @param mode Concrete eval mode; presumably selects which results file is
 *   read — confirm against the implementation.
 * @returns The share URL, or undefined when none is available.
 */
|
|
52
|
-
export declare function extractShareUrl(mode: ConcreteEvalMode): string | undefined;
|
|
53
|
-
/**
 * Run the promptfoo evaluation step for one concrete eval mode.
 *
 * NOTE(review): the implementation is not part of this declaration file; the
 * parameter notes below are inferred from names and types — confirm against
 * eval-step.js.
 *
 * @param mode Concrete eval mode to run.
 * @param debug Optional debug options.
 * @param concurrency Presumably the evaluation concurrency limit — TODO confirm.
 * @param noCache Presumably bypasses the local step cache — TODO confirm.
 * @param remoteCacheOpts Options for the remote cache lookup.
 * @returns Promise of an EvalStepResult wrapping the StepResult.
 */
export declare function runEval(mode: ConcreteEvalMode, debug?: DebugOptions, concurrency?: number, noCache?: boolean, remoteCacheOpts?: RemoteCacheOptions): Promise<EvalStepResult>;
|