@sanity/ailf 2.1.0 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/rubrics.ts +3 -3
- package/dist/_vendor/ailf-core/examples/index.d.ts +50 -1
- package/dist/_vendor/ailf-core/examples/index.js +66 -1
- package/dist/_vendor/ailf-core/types/index.d.ts +25 -0
- package/dist/agent-harness/assertions-runtime.d.ts +49 -0
- package/dist/agent-harness/assertions-runtime.js +138 -0
- package/dist/agent-harness/provider.d.ts +58 -0
- package/dist/agent-harness/provider.js +104 -0
- package/dist/commands/calculate-scores.js +7 -2
- package/dist/commands/capture-list.d.ts +1 -1
- package/dist/commands/capture-list.js +6 -3
- package/dist/commands/compare.js +11 -7
- package/dist/commands/explain-handler.js +22 -24
- package/dist/commands/fetch-docs.js +4 -2
- package/dist/commands/generate-configs.js +6 -2
- package/dist/commands/init.js +3 -0
- package/dist/commands/pipeline-action.js +8 -24
- package/dist/commands/pipeline.js +1 -1
- package/dist/commands/pr-comment.js +6 -2
- package/dist/commands/publish.d.ts +1 -0
- package/dist/commands/publish.js +12 -8
- package/dist/commands/remote-pipeline.js +1 -1
- package/dist/commands/remote-results.d.ts +8 -8
- package/dist/commands/remote-results.js +7 -7
- package/dist/commands/shared/options.d.ts +8 -0
- package/dist/commands/shared/options.js +10 -0
- package/dist/commands/shared/resolve-output-dir.d.ts +27 -0
- package/dist/commands/shared/resolve-output-dir.js +36 -0
- package/dist/composition-root.js +1 -1
- package/dist/config/rubrics.ts +3 -3
- package/dist/orchestration/build-app-context.js +1 -1
- package/dist/orchestration/steps/gap-analysis-step.js +86 -75
- package/dist/orchestration/steps/generate-configs-step.d.ts +7 -0
- package/dist/orchestration/steps/generate-configs-step.js +47 -2
- package/dist/pipeline/calculate-scores.js +113 -2
- package/dist/pipeline/compare.js +50 -19
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +103 -25
- package/dist/pipeline/compiler/compiler-to-yaml.js +78 -7
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +15 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +42 -85
- package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +22 -15
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +1 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +8 -1
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +42 -12
- package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +3 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +1 -27
- package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +2 -9
- package/dist/pipeline/compiler/rubric-resolution.d.ts +40 -0
- package/dist/pipeline/compiler/rubric-resolution.js +52 -0
- package/dist/pipeline/compiler/scoring-bridge.js +59 -7
- package/dist/pipeline/provenance.js +7 -1
- package/dist/pipeline/validate.d.ts +5 -4
- package/dist/pipeline/validate.js +34 -113
- package/package.json +2 -1
|
@@ -9,18 +9,21 @@ import { fileURLToPath } from "url";
|
|
|
9
9
|
import { Command } from "commander";
|
|
10
10
|
import { createAppContext } from "../composition-root.js";
|
|
11
11
|
import { calculateAndWriteScores } from "../pipeline/calculate-scores.js";
|
|
12
|
+
import { addOutputDirOption } from "./shared/options.js";
|
|
13
|
+
import { resolveOutputDir } from "./shared/resolve-output-dir.js";
|
|
12
14
|
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
13
15
|
const ROOT = resolve(__dirname, "..", "..");
|
|
14
16
|
export function createCalculateScoresCommand() {
|
|
15
|
-
|
|
17
|
+
const cmd = new Command("calculate-scores")
|
|
16
18
|
.description("Calculate AI Literacy Scores from Promptfoo evaluation results")
|
|
17
19
|
.option("--source <name>", "Documentation source name")
|
|
18
20
|
.argument("[results-path]", "Path to eval-results.json")
|
|
19
21
|
.action(async (resultsPath, opts) => {
|
|
20
22
|
try {
|
|
23
|
+
const outputDir = resolveOutputDir(opts.outputDir);
|
|
21
24
|
const ctx = createAppContext({
|
|
22
25
|
rootDir: ROOT,
|
|
23
|
-
outputDir
|
|
26
|
+
outputDir,
|
|
24
27
|
mode: "literacy",
|
|
25
28
|
noAutoScope: false,
|
|
26
29
|
skipFetch: true,
|
|
@@ -53,4 +56,6 @@ export function createCalculateScoresCommand() {
|
|
|
53
56
|
console.error(err.message);
|
|
54
57
|
}
|
|
55
58
|
});
|
|
59
|
+
addOutputDirOption(cmd);
|
|
60
|
+
return cmd;
|
|
56
61
|
}
|
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
* reads each manifest, and prints a summary table sorted by date.
|
|
6
6
|
*
|
|
7
7
|
* Usage:
|
|
8
|
-
* ailf capture list # default: results/captures/
|
|
8
|
+
* ailf capture list # default: .ailf/results/captures/
|
|
9
9
|
* ailf capture list ./my-captures # custom directory
|
|
10
10
|
*/
|
|
11
11
|
import { Command } from "commander";
|
|
@@ -5,22 +5,25 @@
|
|
|
5
5
|
* reads each manifest, and prints a summary table sorted by date.
|
|
6
6
|
*
|
|
7
7
|
* Usage:
|
|
8
|
-
* ailf capture list # default: results/captures/
|
|
8
|
+
* ailf capture list # default: .ailf/results/captures/
|
|
9
9
|
* ailf capture list ./my-captures # custom directory
|
|
10
10
|
*/
|
|
11
11
|
import { existsSync, readdirSync, readFileSync, statSync } from "node:fs";
|
|
12
12
|
import { join, resolve } from "node:path";
|
|
13
13
|
import { Command } from "commander";
|
|
14
|
+
import { resolveOutputDir } from "./shared/resolve-output-dir.js";
|
|
14
15
|
// ---------------------------------------------------------------------------
|
|
15
16
|
// Command factory
|
|
16
17
|
// ---------------------------------------------------------------------------
|
|
17
18
|
export function createCaptureListCommand() {
|
|
18
19
|
return new Command("list")
|
|
19
20
|
.description("List pipeline captures in a directory")
|
|
20
|
-
.argument("[dir]", "Captures directory (default: results/captures/)")
|
|
21
|
+
.argument("[dir]", "Captures directory (default: .ailf/results/captures/)")
|
|
21
22
|
.option("-f, --format <fmt>", "Output format: table or json", "table")
|
|
22
23
|
.action(async (dir, opts) => {
|
|
23
|
-
const captureDir =
|
|
24
|
+
const captureDir = dir
|
|
25
|
+
? resolve(dir)
|
|
26
|
+
: resolve(resolveOutputDir(), "..", "captures");
|
|
24
27
|
if (!existsSync(captureDir)) {
|
|
25
28
|
console.error(` No captures directory found at ${captureDir}`);
|
|
26
29
|
console.error(" Run 'ailf pipeline --capture' to create captures.");
|
package/dist/commands/compare.js
CHANGED
|
@@ -9,29 +9,31 @@ import { dirname, join, resolve } from "path";
|
|
|
9
9
|
import { fileURLToPath } from "url";
|
|
10
10
|
import { Command } from "commander";
|
|
11
11
|
import { compare } from "../pipeline/compare.js";
|
|
12
|
+
import { addOutputDirOption } from "./shared/options.js";
|
|
13
|
+
import { resolveOutputDir } from "./shared/resolve-output-dir.js";
|
|
12
14
|
import { DEFAULT_NOISE_THRESHOLD, } from "../pipeline/types.js";
|
|
13
15
|
import { formatComparisonTable } from "../_vendor/ailf-core/index.js";
|
|
14
16
|
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
15
17
|
const ROOT = resolve(__dirname, "..", "..");
|
|
16
18
|
const BASELINES_DIR = join(ROOT, "results", "baselines");
|
|
17
|
-
const SCORE_SUMMARY_PATH = join(ROOT, "results", "latest", "score-summary.json");
|
|
18
19
|
// ---------------------------------------------------------------------------
|
|
19
20
|
// Helpers
|
|
20
21
|
// ---------------------------------------------------------------------------
|
|
21
22
|
export function createCompareCommand() {
|
|
22
|
-
|
|
23
|
+
const cmd = new Command("compare")
|
|
23
24
|
.description("Compare two evaluation score summaries")
|
|
24
25
|
.option("-b, --baseline <path>", "Baseline score-summary.json (default: latest baseline)")
|
|
25
|
-
.option("-e, --experiment <path>", "Experiment score-summary.json (default: results/latest/score-summary.json)")
|
|
26
|
+
.option("-e, --experiment <path>", "Experiment score-summary.json (default: .ailf/results/latest/score-summary.json)")
|
|
26
27
|
.option("-t, --threshold <n>", "Noise threshold for unchanged classification", parseFloat)
|
|
27
28
|
.option("-o, --output <path>", "Write JSON report to file")
|
|
28
29
|
.option("-f, --format <fmt>", "Output format: table or json", "table")
|
|
29
30
|
.action(async (opts) => {
|
|
31
|
+
const outputDir = resolveOutputDir(opts.outputDir);
|
|
30
32
|
const threshold = opts.threshold ?? DEFAULT_NOISE_THRESHOLD;
|
|
31
33
|
// Resolve experiment path
|
|
32
34
|
const expPath = opts.experiment
|
|
33
35
|
? resolve(opts.experiment)
|
|
34
|
-
:
|
|
36
|
+
: join(outputDir, "score-summary.json");
|
|
35
37
|
const experiment = loadSummary(expPath);
|
|
36
38
|
// Resolve baseline path
|
|
37
39
|
let basePath;
|
|
@@ -48,7 +50,7 @@ export function createCompareCommand() {
|
|
|
48
50
|
}
|
|
49
51
|
const baseline = loadSummary(basePath);
|
|
50
52
|
// Try to load grader consistency data for empirical thresholds
|
|
51
|
-
const consistencyPath = join(
|
|
53
|
+
const consistencyPath = join(outputDir, "grader-consistency.json");
|
|
52
54
|
let graderConsistency;
|
|
53
55
|
if (existsSync(consistencyPath) && opts.threshold === undefined) {
|
|
54
56
|
try {
|
|
@@ -93,10 +95,12 @@ export function createCompareCommand() {
|
|
|
93
95
|
console.log(` ✅ Comparison report also written to ${opts.output}`);
|
|
94
96
|
}
|
|
95
97
|
}
|
|
96
|
-
// Write comparison report to
|
|
97
|
-
const latestComparisonPath = join(
|
|
98
|
+
// Write comparison report to output dir for other steps to consume
|
|
99
|
+
const latestComparisonPath = join(outputDir, "comparison-report.json");
|
|
98
100
|
writeFileSync(latestComparisonPath, JSON.stringify(report, null, 2));
|
|
99
101
|
});
|
|
102
|
+
addOutputDirOption(cmd);
|
|
103
|
+
return cmd;
|
|
100
104
|
}
|
|
101
105
|
function findLatestBaseline() {
|
|
102
106
|
if (!existsSync(BASELINES_DIR))
|
|
@@ -23,6 +23,7 @@ import { TASK_FILE_NAMES } from "../_vendor/ailf-core/index.js";
|
|
|
23
23
|
import { buildPipelinePlan, buildSimpleCommandPlan, } from "../pipeline/plan.js";
|
|
24
24
|
import { formatPlanConsole, formatPlanJson } from "../pipeline/plan-format.js";
|
|
25
25
|
import { computeResolvedOptions } from "./pipeline-action.js";
|
|
26
|
+
import { getCallerCwd } from "./shared/resolve-output-dir.js";
|
|
26
27
|
import { LiteracyVariant } from "../pipeline/normalize-mode.js";
|
|
27
28
|
// ---------------------------------------------------------------------------
|
|
28
29
|
// Registry
|
|
@@ -43,10 +44,10 @@ const EXPLAIN_REGISTRY = {
|
|
|
43
44
|
"agent-report": {
|
|
44
45
|
description: "Generate an agent behavior observation report from eval results",
|
|
45
46
|
filesCreated: [
|
|
46
|
-
"
|
|
47
|
-
"
|
|
47
|
+
"<outputDir>/agent-report.json",
|
|
48
|
+
"<outputDir>/agent-report.md",
|
|
48
49
|
],
|
|
49
|
-
filesRead: ["
|
|
50
|
+
filesRead: ["<outputDir>/eval-results.json"],
|
|
50
51
|
steps: [
|
|
51
52
|
{
|
|
52
53
|
cacheStatus: "miss",
|
|
@@ -82,9 +83,9 @@ const EXPLAIN_REGISTRY = {
|
|
|
82
83
|
},
|
|
83
84
|
"calculate-scores": {
|
|
84
85
|
description: "Calculate AI Literacy Scores from Promptfoo evaluation results",
|
|
85
|
-
filesCreated: ["
|
|
86
|
+
filesCreated: ["<outputDir>/score-summary.json"],
|
|
86
87
|
filesRead: [
|
|
87
|
-
"
|
|
88
|
+
"<outputDir>/eval-results.json",
|
|
88
89
|
"config/rubrics.ts",
|
|
89
90
|
"config/models.ts",
|
|
90
91
|
],
|
|
@@ -104,23 +105,20 @@ const EXPLAIN_REGISTRY = {
|
|
|
104
105
|
{
|
|
105
106
|
cacheStatus: "miss",
|
|
106
107
|
name: "Write summary",
|
|
107
|
-
reason: "Persist score-summary.json to
|
|
108
|
+
reason: "Persist score-summary.json to output directory",
|
|
108
109
|
willRun: true,
|
|
109
110
|
},
|
|
110
111
|
],
|
|
111
112
|
},
|
|
112
113
|
compare: {
|
|
113
114
|
description: "Compare current evaluation scores against a saved baseline",
|
|
114
|
-
filesCreated: ["
|
|
115
|
-
filesRead: [
|
|
116
|
-
"results/latest/score-summary.json",
|
|
117
|
-
"results/baselines/*.json",
|
|
118
|
-
],
|
|
115
|
+
filesCreated: ["<outputDir>/comparison-report.json"],
|
|
116
|
+
filesRead: ["<outputDir>/score-summary.json", "results/baselines/*.json"],
|
|
119
117
|
steps: [
|
|
120
118
|
{
|
|
121
119
|
cacheStatus: "miss",
|
|
122
120
|
name: "Load current scores",
|
|
123
|
-
reason: "Read
|
|
121
|
+
reason: "Read <outputDir>/score-summary.json",
|
|
124
122
|
willRun: true,
|
|
125
123
|
},
|
|
126
124
|
{
|
|
@@ -181,8 +179,8 @@ const EXPLAIN_REGISTRY = {
|
|
|
181
179
|
},
|
|
182
180
|
"discovery-report": {
|
|
183
181
|
description: "Generate agent discoverability report from agentic retrieval metrics",
|
|
184
|
-
filesCreated: ["
|
|
185
|
-
filesRead: ["
|
|
182
|
+
filesCreated: ["<outputDir>/discovery-report.md"],
|
|
183
|
+
filesRead: ["<outputDir>/score-summary.json"],
|
|
186
184
|
steps: [
|
|
187
185
|
{
|
|
188
186
|
cacheStatus: "miss",
|
|
@@ -206,7 +204,7 @@ const EXPLAIN_REGISTRY = {
|
|
|
206
204
|
},
|
|
207
205
|
eval: {
|
|
208
206
|
description: "Run Promptfoo evaluation directly (passthrough — all flags forwarded to promptfoo)",
|
|
209
|
-
filesCreated: ["
|
|
207
|
+
filesCreated: ["<outputDir>/eval-results.json"],
|
|
210
208
|
filesRead: ["promptfooconfig.yaml"],
|
|
211
209
|
steps: [
|
|
212
210
|
{
|
|
@@ -280,7 +278,7 @@ const EXPLAIN_REGISTRY = {
|
|
|
280
278
|
grader: {
|
|
281
279
|
description: "Grader reliability tools (consistency, compare, sensitivity, validate)",
|
|
282
280
|
filesRead: [
|
|
283
|
-
"
|
|
281
|
+
"<outputDir>/eval-results.json",
|
|
284
282
|
"config/rubrics.ts",
|
|
285
283
|
"canonical/reference-solutions/",
|
|
286
284
|
],
|
|
@@ -312,7 +310,7 @@ const EXPLAIN_REGISTRY = {
|
|
|
312
310
|
},
|
|
313
311
|
"measure-retrieval": {
|
|
314
312
|
description: "Measure Sanity text search retrieval quality against canonical document annotations",
|
|
315
|
-
filesCreated: ["
|
|
313
|
+
filesCreated: ["<outputDir>/retrieval-metrics.json"],
|
|
316
314
|
filesRead: ["tasks/literacy/*.task.ts"],
|
|
317
315
|
steps: [
|
|
318
316
|
{
|
|
@@ -337,7 +335,7 @@ const EXPLAIN_REGISTRY = {
|
|
|
337
335
|
},
|
|
338
336
|
"pr-comment": {
|
|
339
337
|
description: "Generate a markdown PR comment from evaluation scores for CI posting",
|
|
340
|
-
filesRead: ["
|
|
338
|
+
filesRead: ["<outputDir>/score-summary.json"],
|
|
341
339
|
steps: [
|
|
342
340
|
{
|
|
343
341
|
cacheStatus: "miss",
|
|
@@ -355,7 +353,7 @@ const EXPLAIN_REGISTRY = {
|
|
|
355
353
|
},
|
|
356
354
|
publish: {
|
|
357
355
|
description: "Publish a local evaluation report to the Sanity Content Lake (standalone)",
|
|
358
|
-
filesRead: ["
|
|
356
|
+
filesRead: ["<outputDir>/score-summary.json"],
|
|
359
357
|
steps: [
|
|
360
358
|
{
|
|
361
359
|
cacheStatus: "miss",
|
|
@@ -386,12 +384,12 @@ const EXPLAIN_REGISTRY = {
|
|
|
386
384
|
"readiness-report": {
|
|
387
385
|
description: "Generate launch readiness checklist for a feature area with threshold evaluation",
|
|
388
386
|
filesRead: [
|
|
389
|
-
"
|
|
390
|
-
"
|
|
387
|
+
"<outputDir>/score-summary.json",
|
|
388
|
+
"<outputDir>/gap-analysis.json",
|
|
391
389
|
"config/thresholds.ts",
|
|
392
390
|
"results/baselines/",
|
|
393
391
|
],
|
|
394
|
-
filesCreated: ["
|
|
392
|
+
filesCreated: ["<outputDir>/readiness-report.md"],
|
|
395
393
|
steps: [
|
|
396
394
|
{
|
|
397
395
|
cacheStatus: "miss",
|
|
@@ -603,7 +601,7 @@ function buildInitExplainPlan(actionCommand, rootDir) {
|
|
|
603
601
|
const configFile = format === "ts"
|
|
604
602
|
? "ailf.config.ts"
|
|
605
603
|
: `config.${format === "yaml" ? "yaml" : "json"}`;
|
|
606
|
-
const callerCwd =
|
|
604
|
+
const callerCwd = getCallerCwd();
|
|
607
605
|
const targetDir = opts.path ?? ".";
|
|
608
606
|
const ailfDir = `${targetDir}/.ailf`;
|
|
609
607
|
const tasksDir = `${ailfDir}/tasks`;
|
|
@@ -664,7 +662,7 @@ function buildBaselineExplainPlan(actionCommand, rootDir) {
|
|
|
664
662
|
command: `baseline ${subcommand}`,
|
|
665
663
|
description: descriptions[subcommand] ?? `Baseline operation: ${subcommand}`,
|
|
666
664
|
filesCreated: subcommand === "save" ? ["results/baselines/<timestamp>.json"] : [],
|
|
667
|
-
filesRead: ["
|
|
665
|
+
filesRead: ["<outputDir>/score-summary.json", "results/baselines/"],
|
|
668
666
|
rootDir,
|
|
669
667
|
});
|
|
670
668
|
}
|
|
@@ -11,7 +11,8 @@ import { Command } from "commander";
|
|
|
11
11
|
import { createAppContext } from "../composition-root.js";
|
|
12
12
|
import { loadSource } from "../sources.js";
|
|
13
13
|
import { configToSourceOverrides } from "../orchestration/config-to-source-overrides.js";
|
|
14
|
-
import { addSanitySourceOptions } from "./shared/options.js";
|
|
14
|
+
import { addOutputDirOption, addSanitySourceOptions } from "./shared/options.js";
|
|
15
|
+
import { resolveOutputDir } from "./shared/resolve-output-dir.js";
|
|
15
16
|
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
16
17
|
const ROOT = resolve(__dirname, "..", "..");
|
|
17
18
|
export function createFetchDocsCommand() {
|
|
@@ -31,6 +32,7 @@ export function createFetchDocsCommand() {
|
|
|
31
32
|
}
|
|
32
33
|
});
|
|
33
34
|
addSanitySourceOptions(cmd);
|
|
35
|
+
addOutputDirOption(cmd);
|
|
34
36
|
return cmd;
|
|
35
37
|
}
|
|
36
38
|
// ---------------------------------------------------------------------------
|
|
@@ -41,7 +43,7 @@ async function executeFetchDocs(opts) {
|
|
|
41
43
|
// Build a minimal ResolvedConfig for the composition root
|
|
42
44
|
const ctx = createAppContext({
|
|
43
45
|
rootDir: ROOT,
|
|
44
|
-
outputDir:
|
|
46
|
+
outputDir: resolveOutputDir(opts.outputDir),
|
|
45
47
|
mode: "literacy",
|
|
46
48
|
noAutoScope: false,
|
|
47
49
|
skipFetch: false,
|
|
@@ -9,17 +9,19 @@ import { fileURLToPath } from "url";
|
|
|
9
9
|
import { Command } from "commander";
|
|
10
10
|
import { createAppContext } from "../composition-root.js";
|
|
11
11
|
import { GenerateConfigsStep } from "../orchestration/steps/generate-configs-step.js";
|
|
12
|
+
import { addOutputDirOption } from "./shared/options.js";
|
|
13
|
+
import { resolveOutputDir } from "./shared/resolve-output-dir.js";
|
|
12
14
|
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
13
15
|
const ROOT = resolve(__dirname, "..", "..");
|
|
14
16
|
export function createGenerateConfigsCommand() {
|
|
15
|
-
|
|
17
|
+
const cmd = new Command("generate-configs")
|
|
16
18
|
.description("Generate promptfoo config files from config/models.yaml")
|
|
17
19
|
.option("-s, --source <name>", "Documentation source name")
|
|
18
20
|
.action(async (opts) => {
|
|
19
21
|
try {
|
|
20
22
|
const ctx = createAppContext({
|
|
21
23
|
rootDir: ROOT,
|
|
22
|
-
outputDir:
|
|
24
|
+
outputDir: resolveOutputDir(opts.outputDir),
|
|
23
25
|
mode: "literacy",
|
|
24
26
|
noAutoScope: false,
|
|
25
27
|
skipFetch: true,
|
|
@@ -58,4 +60,6 @@ export function createGenerateConfigsCommand() {
|
|
|
58
60
|
console.error(err.message);
|
|
59
61
|
}
|
|
60
62
|
});
|
|
63
|
+
addOutputDirOption(cmd);
|
|
64
|
+
return cmd;
|
|
61
65
|
}
|
package/dist/commands/init.js
CHANGED
|
@@ -138,6 +138,9 @@ async function runInit(opts) {
|
|
|
138
138
|
else if (modeFilter === "knowledge-probe") {
|
|
139
139
|
stemsToWrite = taskStemsForMode("knowledge-probe");
|
|
140
140
|
}
|
|
141
|
+
else if (modeFilter === "agent-harness") {
|
|
142
|
+
stemsToWrite = taskStemsForMode("agent-harness");
|
|
143
|
+
}
|
|
141
144
|
else {
|
|
142
145
|
// Default (no --mode): write all tasks
|
|
143
146
|
stemsToWrite = [...TASK_FILE_NAMES];
|
|
@@ -21,6 +21,7 @@ import { buildStepSequence } from "../orchestration/build-step-sequence.js";
|
|
|
21
21
|
import { orchestratePipeline } from "../orchestration/pipeline-orchestrator.js";
|
|
22
22
|
import { load } from "js-yaml";
|
|
23
23
|
import { parseRepoConfig, } from "../adapters/task-sources/repo-schemas.js";
|
|
24
|
+
import { getCallerCwd, resolveOutputDir } from "./shared/resolve-output-dir.js";
|
|
24
25
|
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
25
26
|
const ROOT = resolve(__dirname, "..", "..");
|
|
26
27
|
// ---------------------------------------------------------------------------
|
|
@@ -35,7 +36,7 @@ const VALID_SEARCH_MODES = ["open", "origin-only", "off"];
|
|
|
35
36
|
*/
|
|
36
37
|
export function computeResolvedOptions(opts) {
|
|
37
38
|
// Resolve paths relative to the caller's cwd, not the eval package root
|
|
38
|
-
const callerCwd =
|
|
39
|
+
const callerCwd = getCallerCwd();
|
|
39
40
|
// Validate + normalize mode via the single boundary function.
|
|
40
41
|
// normalizeMode() maps legacy variant names (baseline, agentic, etc.)
|
|
41
42
|
// to canonical mode "literacy" + variant, and throws on invalid input.
|
|
@@ -209,23 +210,12 @@ export function computeResolvedOptions(opts) {
|
|
|
209
210
|
const remote = opts.remote || process.env.AILF_REMOTE === "1";
|
|
210
211
|
const apiUrl = opts.apiUrl ?? process.env.AILF_API_URL ?? "https://ailf-api.sanity.build";
|
|
211
212
|
const apiKey = process.env.AILF_API_KEY ?? undefined;
|
|
212
|
-
// Output directory: explicit
|
|
213
|
+
// Output directory: explicit --output-dir → $CWD/.ailf/results/latest/
|
|
213
214
|
const resolvedRepoTasksPath = opts.repoTasksPath
|
|
214
215
|
? resolve(callerCwd, opts.repoTasksPath)
|
|
215
216
|
: undefined;
|
|
216
217
|
const resolvedTaskSourceType = resolveTaskSourceType(opts.taskSource);
|
|
217
|
-
|
|
218
|
-
if (opts.outputDir) {
|
|
219
|
-
outputDir = resolve(callerCwd, opts.outputDir);
|
|
220
|
-
}
|
|
221
|
-
else if (resolvedTaskSourceType === "repo" || resolvedRepoTasksPath) {
|
|
222
|
-
outputDir = resolvedRepoTasksPath
|
|
223
|
-
? resolve(resolvedRepoTasksPath, "..", "results", "latest")
|
|
224
|
-
: resolve(callerCwd, ".ailf", "results", "latest");
|
|
225
|
-
}
|
|
226
|
-
else {
|
|
227
|
-
outputDir = resolve(ROOT, "results", "latest");
|
|
228
|
-
}
|
|
218
|
+
const outputDir = resolveOutputDir(opts.outputDir);
|
|
229
219
|
return {
|
|
230
220
|
allowedOriginArgs,
|
|
231
221
|
apiKey,
|
|
@@ -310,7 +300,7 @@ export async function executePipeline(cliOpts) {
|
|
|
310
300
|
}
|
|
311
301
|
const { FileConfigAdapter } = await import("../adapters/config-sources/file-config-adapter.js");
|
|
312
302
|
const { createAppContext } = await import("../composition-root.js");
|
|
313
|
-
const callerCwd =
|
|
303
|
+
const callerCwd = getCallerCwd();
|
|
314
304
|
const adapter = new FileConfigAdapter(cliOpts.config, ROOT);
|
|
315
305
|
const config = await adapter.resolve();
|
|
316
306
|
// Merge CLI-only flags that aren't in the config file.
|
|
@@ -323,13 +313,8 @@ export async function executePipeline(cliOpts) {
|
|
|
323
313
|
if (cliOpts.output) {
|
|
324
314
|
config.outputPath = resolve(callerCwd, cliOpts.output);
|
|
325
315
|
}
|
|
326
|
-
// Output dir: explicit CLI flag →
|
|
327
|
-
|
|
328
|
-
config.outputDir = resolve(callerCwd, cliOpts.outputDir);
|
|
329
|
-
}
|
|
330
|
-
else if (config.repoTasksPath) {
|
|
331
|
-
config.outputDir = resolve(config.repoTasksPath, "..", "results", "latest");
|
|
332
|
-
}
|
|
316
|
+
// Output dir: explicit CLI flag → $CWD/.ailf/results/latest/
|
|
317
|
+
config.outputDir = resolveOutputDir(cliOpts.outputDir);
|
|
333
318
|
// Create AppContext directly from the merged config so adapters
|
|
334
319
|
// (especially taskSource) are wired from the file config's
|
|
335
320
|
// taskSourceType — not from CLI defaults.
|
|
@@ -350,8 +335,7 @@ export async function executePipeline(cliOpts) {
|
|
|
350
335
|
// cache which never contains .ailf/.
|
|
351
336
|
if (o.remote) {
|
|
352
337
|
const { runRemotePipeline } = await import("./remote-pipeline.js");
|
|
353
|
-
|
|
354
|
-
await runRemotePipeline(o, callerCwd);
|
|
338
|
+
await runRemotePipeline(o, getCallerCwd());
|
|
355
339
|
return;
|
|
356
340
|
}
|
|
357
341
|
// Dry-run: validate only, don't execute steps
|
|
@@ -55,7 +55,7 @@ export function createPipelineCommand() {
|
|
|
55
55
|
.option("--remote", "Submit evaluation to the AILF API instead of running locally", false)
|
|
56
56
|
.option("--api-url <url>", "AILF API base URL (default: https://ailf-api.sanity.build)")
|
|
57
57
|
.option("--capture", "Enable artifact capture for this run", false)
|
|
58
|
-
.option("--capture-dir <path>", "Base directory for capture output (default: results/captures/)")
|
|
58
|
+
.option("--capture-dir <path>", "Base directory for capture output (default: .ailf/results/captures/)")
|
|
59
59
|
.option("--no-capture-compress", "Disable tar.gz compression of captures")
|
|
60
60
|
.option("--no-capture-extras", "Exclude mode-specific artifacts from captures")
|
|
61
61
|
.action(async (opts) => {
|
|
@@ -9,10 +9,12 @@ import { fileURLToPath } from "url";
|
|
|
9
9
|
import { Command } from "commander";
|
|
10
10
|
import { createAppContext } from "../composition-root.js";
|
|
11
11
|
import { generatePrComment } from "../pipeline/pr-comment.js";
|
|
12
|
+
import { addOutputDirOption } from "./shared/options.js";
|
|
13
|
+
import { resolveOutputDir } from "./shared/resolve-output-dir.js";
|
|
12
14
|
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
13
15
|
const ROOT = resolve(__dirname, "..", "..");
|
|
14
16
|
export function createPrCommentCommand() {
|
|
15
|
-
|
|
17
|
+
const cmd = new Command("pr-comment")
|
|
16
18
|
.description("Generate a markdown PR comment from evaluation scores")
|
|
17
19
|
.option("--output <path>", "Write comment to file (default: stdout)")
|
|
18
20
|
.option("--promptfoo-url <url>", "Promptfoo share URL to include")
|
|
@@ -20,7 +22,7 @@ export function createPrCommentCommand() {
|
|
|
20
22
|
try {
|
|
21
23
|
const ctx = createAppContext({
|
|
22
24
|
rootDir: ROOT,
|
|
23
|
-
outputDir:
|
|
25
|
+
outputDir: resolveOutputDir(opts.outputDir),
|
|
24
26
|
mode: "literacy",
|
|
25
27
|
noAutoScope: false,
|
|
26
28
|
skipFetch: true,
|
|
@@ -48,4 +50,6 @@ export function createPrCommentCommand() {
|
|
|
48
50
|
console.error(err.message);
|
|
49
51
|
}
|
|
50
52
|
});
|
|
53
|
+
addOutputDirOption(cmd);
|
|
54
|
+
return cmd;
|
|
51
55
|
}
|
package/dist/commands/publish.js
CHANGED
|
@@ -23,22 +23,27 @@ import { dirname, resolve } from "path";
|
|
|
23
23
|
import { fileURLToPath } from "url";
|
|
24
24
|
import { Command } from "commander";
|
|
25
25
|
import { createAppContext } from "../composition-root.js";
|
|
26
|
+
import { addOutputDirOption } from "./shared/options.js";
|
|
27
|
+
import { getCallerCwd, resolveOutputDir } from "./shared/resolve-output-dir.js";
|
|
26
28
|
import { buildProvenance, } from "../pipeline/provenance.js";
|
|
27
29
|
import { generateReportTitle } from "../pipeline/report-title.js";
|
|
28
30
|
import { generateReportId, } from "../report-store.js";
|
|
29
31
|
import { withRetry } from "../sinks/retry.js";
|
|
30
32
|
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
31
33
|
const ROOT = resolve(__dirname, "..", "..");
|
|
32
|
-
const DEFAULT_SUMMARY_PATH = resolve(ROOT, "results", "latest", "score-summary.json");
|
|
33
34
|
export function createPublishCommand() {
|
|
34
|
-
|
|
35
|
+
const cmd = new Command("publish")
|
|
35
36
|
.description("Publish a local evaluation report to the Sanity Content Lake")
|
|
36
|
-
.argument("[summary-path]", "Path to score-summary.json"
|
|
37
|
+
.argument("[summary-path]", "Path to score-summary.json")
|
|
37
38
|
.option("-t, --tag <tag>", "Label for the published report")
|
|
38
39
|
.option("-n, --dry-run", "Preview the report without writing to Sanity or sinks", false)
|
|
39
40
|
.action(async (summaryPath, opts) => {
|
|
40
|
-
|
|
41
|
+
const outputDir = resolveOutputDir(opts.outputDir);
|
|
42
|
+
const effectivePath = summaryPath ?? resolve(outputDir, "score-summary.json");
|
|
43
|
+
await runPublishCommand(effectivePath, outputDir, opts);
|
|
41
44
|
});
|
|
45
|
+
addOutputDirOption(cmd);
|
|
46
|
+
return cmd;
|
|
42
47
|
}
|
|
43
48
|
// ---------------------------------------------------------------------------
|
|
44
49
|
// Provenance builder (from score summary, not full pipeline context)
|
|
@@ -77,7 +82,7 @@ function buildProvenanceFromSummary(summary) {
|
|
|
77
82
|
// ---------------------------------------------------------------------------
|
|
78
83
|
// Command implementation
|
|
79
84
|
// ---------------------------------------------------------------------------
|
|
80
|
-
async function runPublishCommand(summaryPath, opts) {
|
|
85
|
+
async function runPublishCommand(summaryPath, outputDir, opts) {
|
|
81
86
|
// Wire up infrastructure via composition root
|
|
82
87
|
const ctx = createAppContext({
|
|
83
88
|
compareEnabled: false,
|
|
@@ -87,7 +92,7 @@ async function runPublishCommand(summaryPath, opts) {
|
|
|
87
92
|
noAutoScope: false,
|
|
88
93
|
noCache: true,
|
|
89
94
|
noRemoteCache: true,
|
|
90
|
-
outputDir
|
|
95
|
+
outputDir,
|
|
91
96
|
publishEnabled: true,
|
|
92
97
|
publishTag: opts.tag,
|
|
93
98
|
readinessEnabled: false,
|
|
@@ -106,8 +111,7 @@ async function runPublishCommand(summaryPath, opts) {
|
|
|
106
111
|
// -----------------------------------------------------------------------
|
|
107
112
|
// 1. Resolve and read the score summary
|
|
108
113
|
// -----------------------------------------------------------------------
|
|
109
|
-
const
|
|
110
|
-
const resolvedPath = resolve(callerCwd, summaryPath);
|
|
114
|
+
const resolvedPath = resolve(getCallerCwd(), summaryPath);
|
|
111
115
|
if (!existsSync(resolvedPath)) {
|
|
112
116
|
console.error(` ✖ File not found: ${resolvedPath}`);
|
|
113
117
|
console.error();
|
|
@@ -4,9 +4,9 @@
|
|
|
4
4
|
* Produces the same file layout as local mode so downstream tools
|
|
5
5
|
* (workflow PR comments, score comparison, baseline save) work unchanged:
|
|
6
6
|
*
|
|
7
|
-
*
|
|
8
|
-
*
|
|
9
|
-
*
|
|
7
|
+
* <outputDir>/score-summary.json — scores by area + overall
|
|
8
|
+
* <outputDir>/report.md — rendered markdown report
|
|
9
|
+
* <outputDir>/job-metadata.json — job ID, timing, API URL
|
|
10
10
|
*
|
|
11
11
|
* @see packages/eval/src/commands/remote-pipeline.ts — caller
|
|
12
12
|
*/
|
|
@@ -14,8 +14,8 @@ import type { ApiClient } from "../adapters/api-client/api-client.js";
|
|
|
14
14
|
import type { JobResponse } from "../adapters/api-client/types.js";
|
|
15
15
|
/** Options for writing remote results. */
|
|
16
16
|
export interface WriteResultsOptions {
|
|
17
|
-
/**
|
|
18
|
-
|
|
17
|
+
/** Base directory for output artifacts. */
|
|
18
|
+
outputDir: string;
|
|
19
19
|
/** Optional output path override (--output flag). */
|
|
20
20
|
outputPath?: string;
|
|
21
21
|
/** API base URL (for metadata). */
|
|
@@ -25,9 +25,9 @@ export interface WriteResultsOptions {
|
|
|
25
25
|
* Fetch report artifacts from the API and write them to disk.
|
|
26
26
|
*
|
|
27
27
|
* Writes:
|
|
28
|
-
* -
|
|
29
|
-
* -
|
|
30
|
-
* -
|
|
28
|
+
* - `<outputDir>/score-summary.json` — score data from job response
|
|
29
|
+
* - `<outputDir>/report.md` — full markdown report (if reportId present)
|
|
30
|
+
* - `<outputDir>/job-metadata.json` — job tracking info
|
|
31
31
|
* - `--output` path — markdown report (if specified)
|
|
32
32
|
*/
|
|
33
33
|
export declare function writeRemoteResults(client: ApiClient, job: JobResponse, options: WriteResultsOptions): Promise<void>;
|
|
@@ -4,9 +4,9 @@
|
|
|
4
4
|
* Produces the same file layout as local mode so downstream tools
|
|
5
5
|
* (workflow PR comments, score comparison, baseline save) work unchanged:
|
|
6
6
|
*
|
|
7
|
-
*
|
|
8
|
-
*
|
|
9
|
-
*
|
|
7
|
+
* <outputDir>/score-summary.json — scores by area + overall
|
|
8
|
+
* <outputDir>/report.md — rendered markdown report
|
|
9
|
+
* <outputDir>/job-metadata.json — job ID, timing, API URL
|
|
10
10
|
*
|
|
11
11
|
* @see packages/eval/src/commands/remote-pipeline.ts — caller
|
|
12
12
|
*/
|
|
@@ -19,13 +19,13 @@ import { resolve } from "path";
|
|
|
19
19
|
* Fetch report artifacts from the API and write them to disk.
|
|
20
20
|
*
|
|
21
21
|
* Writes:
|
|
22
|
-
* -
|
|
23
|
-
* -
|
|
24
|
-
* -
|
|
22
|
+
* - `<outputDir>/score-summary.json` — score data from job response
|
|
23
|
+
* - `<outputDir>/report.md` — full markdown report (if reportId present)
|
|
24
|
+
* - `<outputDir>/job-metadata.json` — job tracking info
|
|
25
25
|
* - `--output` path — markdown report (if specified)
|
|
26
26
|
*/
|
|
27
27
|
export async function writeRemoteResults(client, job, options) {
|
|
28
|
-
const resultsDir =
|
|
28
|
+
const resultsDir = options.outputDir;
|
|
29
29
|
mkdirSync(resultsDir, { recursive: true });
|
|
30
30
|
// 1. Write score summary
|
|
31
31
|
const scoreSummary = buildScoreSummary(job);
|
|
@@ -18,6 +18,14 @@ export declare function addDebugOptions(cmd: Command): Command;
|
|
|
18
18
|
* Add output options: --output, --format
|
|
19
19
|
*/
|
|
20
20
|
export declare function addOutputOptions(cmd: Command): Command;
|
|
21
|
+
/**
|
|
22
|
+
* Add --output-dir option for commands that write pipeline artifacts.
|
|
23
|
+
*
|
|
24
|
+
* Pair with `resolveOutputDir()` from `./resolve-output-dir.js` to resolve
|
|
25
|
+
* the value. When omitted, `resolveOutputDir()` defaults to
|
|
26
|
+
* `$CWD/.ailf/results/latest/`.
|
|
27
|
+
*/
|
|
28
|
+
export declare function addOutputDirOption(cmd: Command): Command;
|
|
21
29
|
/**
|
|
22
30
|
* Add Sanity source options: --sanity-dataset, --sanity-project, etc.
|
|
23
31
|
*/
|