@sanity/ailf 2.2.0 → 2.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. package/config/rubrics.ts +3 -3
  2. package/dist/_vendor/ailf-core/types/index.d.ts +25 -0
  3. package/dist/adapters/task-sources/content-lake-task-source.js +15 -7
  4. package/dist/commands/calculate-scores.js +7 -2
  5. package/dist/commands/capture-list.d.ts +1 -1
  6. package/dist/commands/capture-list.js +6 -3
  7. package/dist/commands/compare.js +11 -7
  8. package/dist/commands/explain-handler.js +22 -24
  9. package/dist/commands/fetch-docs.js +4 -2
  10. package/dist/commands/generate-configs.js +6 -2
  11. package/dist/commands/pipeline-action.js +8 -24
  12. package/dist/commands/pipeline.js +1 -1
  13. package/dist/commands/pr-comment.js +6 -2
  14. package/dist/commands/publish.d.ts +1 -0
  15. package/dist/commands/publish.js +12 -8
  16. package/dist/commands/remote-pipeline.js +1 -1
  17. package/dist/commands/remote-results.d.ts +8 -8
  18. package/dist/commands/remote-results.js +7 -7
  19. package/dist/commands/shared/options.d.ts +8 -0
  20. package/dist/commands/shared/options.js +10 -0
  21. package/dist/commands/shared/resolve-output-dir.d.ts +27 -0
  22. package/dist/commands/shared/resolve-output-dir.js +36 -0
  23. package/dist/composition-root.js +1 -1
  24. package/dist/config/rubrics.ts +3 -3
  25. package/dist/orchestration/build-app-context.js +1 -1
  26. package/dist/orchestration/steps/fetch-docs-step.js +23 -9
  27. package/dist/orchestration/steps/gap-analysis-step.js +86 -75
  28. package/dist/orchestration/steps/generate-configs-step.d.ts +15 -0
  29. package/dist/orchestration/steps/generate-configs-step.js +56 -0
  30. package/dist/orchestration/steps/run-eval-step.js +14 -0
  31. package/dist/pipeline/calculate-scores.js +113 -2
  32. package/dist/pipeline/compare.js +50 -19
  33. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +64 -0
  34. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +6 -0
  35. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +14 -0
  36. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +1 -0
  37. package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +3 -0
  38. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +1 -27
  39. package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +2 -9
  40. package/dist/pipeline/compiler/rubric-resolution.d.ts +40 -0
  41. package/dist/pipeline/compiler/rubric-resolution.js +52 -0
  42. package/dist/pipeline/compiler/scoring-bridge.js +59 -7
  43. package/dist/pipeline/provenance.js +7 -1
  44. package/dist/pipeline/validate.d.ts +5 -4
  45. package/dist/pipeline/validate.js +34 -113
  46. package/dist/webhook/eval-request-handler.js +4 -0
  47. package/package.json +1 -1
package/config/rubrics.ts CHANGED
@@ -201,9 +201,9 @@ export default defineRubrics({
201
201
  currency: 0.2,
202
202
  },
203
203
  "agent-harness": {
204
- "agent-output": 0.45,
205
- "tool-usage": 0.4,
206
- "process-quality": 0.15,
204
+ "assertion-pass-rate": 0.35,
205
+ "agent-output": 0.35,
206
+ "tool-usage": 0.3,
207
207
  },
208
208
  },
209
209
 
@@ -126,31 +126,56 @@ export interface FeatureScore {
126
126
  * Only present when agentic evaluation data is available.
127
127
  */
128
128
  actualScore?: number;
129
+ /**
130
+ * Assertion pass rate — percentage of structural assertions that passed (0–100).
131
+ * Only present for modes with javascript assertions (agent-harness, agent-task).
132
+ */
133
+ assertionPassRate?: number;
129
134
  /**
130
135
  * Ceiling score — gold-standard docs injected directly.
131
136
  * This is the theoretical maximum score for this area given the current docs.
137
+ * Set to 0 for modes without with/without-docs variants (agent-harness).
132
138
  */
133
139
  ceilingScore: number;
134
140
  codeCorrectness: number;
141
+ /**
142
+ * Generic dimension scores map — all dimensions by kebab-case name (0–100).
143
+ * Includes the three named fields above plus any mode-specific dimensions
144
+ * (e.g., agent-output, tool-usage, assertion-pass-rate).
145
+ * New consumers should read from this map. The named fields are backward-
146
+ * compatible accessors populated from it.
147
+ */
148
+ dimensions?: Record<string, number>;
135
149
  docCoverage: number;
136
150
  /** Sanity documents used for this feature area's evaluation */
137
151
  documents?: DocumentRef[];
138
152
  /**
139
153
  * Doc Lift — documentation quality contribution (ceiling − floor).
140
154
  * Positive when docs help, negative when docs hurt (interference).
155
+ * Set to 0 for modes without with/without-docs variants (agent-harness).
141
156
  */
142
157
  docLift: number;
143
158
  /**
144
159
  * Doc quality gap — room for documentation improvement (100 − ceiling).
145
160
  * Lower is better.
161
+ * Set to 0 for modes without with/without-docs variants (agent-harness).
146
162
  */
147
163
  docQualityGap: number;
148
164
  feature: string;
149
165
  /**
150
166
  * Floor score — no docs, training data only.
151
167
  * The model's inherent knowledge baseline.
168
+ * Set to 0 for modes without with/without-docs variants (agent-harness).
152
169
  */
153
170
  floorScore: number;
171
+ /**
172
+ * How this score entry was grouped.
173
+ * - "feature": by documentation feature area (literacy mode)
174
+ * - "task": by individual task ID (agent-harness mode)
175
+ * - "aggregate": single aggregate across all tasks
176
+ * Defaults to "feature" when absent (backward compatibility).
177
+ */
178
+ groupType?: "aggregate" | "feature" | "task";
154
179
  /**
155
180
  * Infrastructure efficiency — actual / ceiling (0.0–1.0).
156
181
  * What fraction of documentation quality reaches agents through discovery?
@@ -28,7 +28,13 @@
28
28
  */
29
29
  const TASKS_QUERY = /* groq */ `
30
30
  *[_type == "ailf.task"
31
- && (!defined($areas) || area->areaId.current in $areas)
31
+ && (
32
+ !defined($areas)
33
+ // Current field name
34
+ || area->areaId.current in $areas
35
+ // Legacy field name (pre-schema-rename documents)
36
+ || featureArea->areaId.current in $areas
37
+ )
32
38
  && (!defined($taskIds) || id.current in $taskIds)
33
39
  && (
34
40
  // Status-based filtering (unified — replaces execution.enabled)
@@ -39,13 +45,15 @@ const TASKS_QUERY = /* groq */ `
39
45
  || (defined($taskIds) && status != "archived")
40
46
  )
41
47
  && (!defined($tags) || count((tags)[@ in $tags]) > 0)
42
- ] | order(area->areaId.current asc, id.current asc) {
48
+ ] | order(coalesce(area->areaId.current, featureArea->areaId.current) asc, id.current asc) {
43
49
  "taskId": id.current,
44
- title,
45
- "areaId": area->areaId.current,
46
- promptText,
50
+ // Coalesce current and legacy field names so documents created before
51
+ // the schema rename are still readable.
52
+ "title": coalesce(title, description),
53
+ "areaId": coalesce(area->areaId.current, featureArea->areaId.current),
54
+ "promptText": coalesce(promptText, taskPrompt),
47
55
  docCoverage,
48
- "contextDocs": contextDocs[] {
56
+ "contextDocs": coalesce(contextDocs, canonicalDocs)[] {
49
57
  refType,
50
58
  "slug": doc->slug.current,
51
59
  "docRefId": doc->_id,
@@ -55,7 +63,7 @@ const TASKS_QUERY = /* groq */ `
55
63
  perspective,
56
64
  reason
57
65
  },
58
- assertions,
66
+ "assertions": coalesce(assertions, assert),
59
67
  rawAssert,
60
68
  baseline,
61
69
  tags,
@@ -9,18 +9,21 @@ import { fileURLToPath } from "url";
9
9
  import { Command } from "commander";
10
10
  import { createAppContext } from "../composition-root.js";
11
11
  import { calculateAndWriteScores } from "../pipeline/calculate-scores.js";
12
+ import { addOutputDirOption } from "./shared/options.js";
13
+ import { resolveOutputDir } from "./shared/resolve-output-dir.js";
12
14
  const __dirname = dirname(fileURLToPath(import.meta.url));
13
15
  const ROOT = resolve(__dirname, "..", "..");
14
16
  export function createCalculateScoresCommand() {
15
- return new Command("calculate-scores")
17
+ const cmd = new Command("calculate-scores")
16
18
  .description("Calculate AI Literacy Scores from Promptfoo evaluation results")
17
19
  .option("--source <name>", "Documentation source name")
18
20
  .argument("[results-path]", "Path to eval-results.json")
19
21
  .action(async (resultsPath, opts) => {
20
22
  try {
23
+ const outputDir = resolveOutputDir(opts.outputDir);
21
24
  const ctx = createAppContext({
22
25
  rootDir: ROOT,
23
- outputDir: resolve(ROOT, "results", "latest"),
26
+ outputDir,
24
27
  mode: "literacy",
25
28
  noAutoScope: false,
26
29
  skipFetch: true,
@@ -53,4 +56,6 @@ export function createCalculateScoresCommand() {
53
56
  console.error(err.message);
54
57
  }
55
58
  });
59
+ addOutputDirOption(cmd);
60
+ return cmd;
56
61
  }
@@ -5,7 +5,7 @@
5
5
  * reads each manifest, and prints a summary table sorted by date.
6
6
  *
7
7
  * Usage:
8
- * ailf capture list # default: results/captures/
8
+ * ailf capture list # default: .ailf/results/captures/
9
9
  * ailf capture list ./my-captures # custom directory
10
10
  */
11
11
  import { Command } from "commander";
@@ -5,22 +5,25 @@
5
5
  * reads each manifest, and prints a summary table sorted by date.
6
6
  *
7
7
  * Usage:
8
- * ailf capture list # default: results/captures/
8
+ * ailf capture list # default: .ailf/results/captures/
9
9
  * ailf capture list ./my-captures # custom directory
10
10
  */
11
11
  import { existsSync, readdirSync, readFileSync, statSync } from "node:fs";
12
12
  import { join, resolve } from "node:path";
13
13
  import { Command } from "commander";
14
+ import { resolveOutputDir } from "./shared/resolve-output-dir.js";
14
15
  // ---------------------------------------------------------------------------
15
16
  // Command factory
16
17
  // ---------------------------------------------------------------------------
17
18
  export function createCaptureListCommand() {
18
19
  return new Command("list")
19
20
  .description("List pipeline captures in a directory")
20
- .argument("[dir]", "Captures directory (default: results/captures/)")
21
+ .argument("[dir]", "Captures directory (default: .ailf/results/captures/)")
21
22
  .option("-f, --format <fmt>", "Output format: table or json", "table")
22
23
  .action(async (dir, opts) => {
23
- const captureDir = resolve(dir ?? "results/captures");
24
+ const captureDir = dir
25
+ ? resolve(dir)
26
+ : resolve(resolveOutputDir(), "..", "captures");
24
27
  if (!existsSync(captureDir)) {
25
28
  console.error(` No captures directory found at ${captureDir}`);
26
29
  console.error(" Run 'ailf pipeline --capture' to create captures.");
@@ -9,29 +9,31 @@ import { dirname, join, resolve } from "path";
9
9
  import { fileURLToPath } from "url";
10
10
  import { Command } from "commander";
11
11
  import { compare } from "../pipeline/compare.js";
12
+ import { addOutputDirOption } from "./shared/options.js";
13
+ import { resolveOutputDir } from "./shared/resolve-output-dir.js";
12
14
  import { DEFAULT_NOISE_THRESHOLD, } from "../pipeline/types.js";
13
15
  import { formatComparisonTable } from "../_vendor/ailf-core/index.js";
14
16
  const __dirname = dirname(fileURLToPath(import.meta.url));
15
17
  const ROOT = resolve(__dirname, "..", "..");
16
18
  const BASELINES_DIR = join(ROOT, "results", "baselines");
17
- const SCORE_SUMMARY_PATH = join(ROOT, "results", "latest", "score-summary.json");
18
19
  // ---------------------------------------------------------------------------
19
20
  // Helpers
20
21
  // ---------------------------------------------------------------------------
21
22
  export function createCompareCommand() {
22
- return new Command("compare")
23
+ const cmd = new Command("compare")
23
24
  .description("Compare two evaluation score summaries")
24
25
  .option("-b, --baseline <path>", "Baseline score-summary.json (default: latest baseline)")
25
- .option("-e, --experiment <path>", "Experiment score-summary.json (default: results/latest/score-summary.json)")
26
+ .option("-e, --experiment <path>", "Experiment score-summary.json (default: .ailf/results/latest/score-summary.json)")
26
27
  .option("-t, --threshold <n>", "Noise threshold for unchanged classification", parseFloat)
27
28
  .option("-o, --output <path>", "Write JSON report to file")
28
29
  .option("-f, --format <fmt>", "Output format: table or json", "table")
29
30
  .action(async (opts) => {
31
+ const outputDir = resolveOutputDir(opts.outputDir);
30
32
  const threshold = opts.threshold ?? DEFAULT_NOISE_THRESHOLD;
31
33
  // Resolve experiment path
32
34
  const expPath = opts.experiment
33
35
  ? resolve(opts.experiment)
34
- : SCORE_SUMMARY_PATH;
36
+ : join(outputDir, "score-summary.json");
35
37
  const experiment = loadSummary(expPath);
36
38
  // Resolve baseline path
37
39
  let basePath;
@@ -48,7 +50,7 @@ export function createCompareCommand() {
48
50
  }
49
51
  const baseline = loadSummary(basePath);
50
52
  // Try to load grader consistency data for empirical thresholds
51
- const consistencyPath = join(ROOT, "results", "latest", "grader-consistency.json");
53
+ const consistencyPath = join(outputDir, "grader-consistency.json");
52
54
  let graderConsistency;
53
55
  if (existsSync(consistencyPath) && opts.threshold === undefined) {
54
56
  try {
@@ -93,10 +95,12 @@ export function createCompareCommand() {
93
95
  console.log(` ✅ Comparison report also written to ${opts.output}`);
94
96
  }
95
97
  }
96
- // Write comparison report to results/latest for other steps to consume
97
- const latestComparisonPath = join(ROOT, "results", "latest", "comparison-report.json");
98
+ // Write comparison report to output dir for other steps to consume
99
+ const latestComparisonPath = join(outputDir, "comparison-report.json");
98
100
  writeFileSync(latestComparisonPath, JSON.stringify(report, null, 2));
99
101
  });
102
+ addOutputDirOption(cmd);
103
+ return cmd;
100
104
  }
101
105
  function findLatestBaseline() {
102
106
  if (!existsSync(BASELINES_DIR))
@@ -23,6 +23,7 @@ import { TASK_FILE_NAMES } from "../_vendor/ailf-core/index.js";
23
23
  import { buildPipelinePlan, buildSimpleCommandPlan, } from "../pipeline/plan.js";
24
24
  import { formatPlanConsole, formatPlanJson } from "../pipeline/plan-format.js";
25
25
  import { computeResolvedOptions } from "./pipeline-action.js";
26
+ import { getCallerCwd } from "./shared/resolve-output-dir.js";
26
27
  import { LiteracyVariant } from "../pipeline/normalize-mode.js";
27
28
  // ---------------------------------------------------------------------------
28
29
  // Registry
@@ -43,10 +44,10 @@ const EXPLAIN_REGISTRY = {
43
44
  "agent-report": {
44
45
  description: "Generate an agent behavior observation report from eval results",
45
46
  filesCreated: [
46
- "results/latest/agent-report.json",
47
- "results/latest/agent-report.md",
47
+ "<outputDir>/agent-report.json",
48
+ "<outputDir>/agent-report.md",
48
49
  ],
49
- filesRead: ["results/latest/eval-results.json"],
50
+ filesRead: ["<outputDir>/eval-results.json"],
50
51
  steps: [
51
52
  {
52
53
  cacheStatus: "miss",
@@ -82,9 +83,9 @@ const EXPLAIN_REGISTRY = {
82
83
  },
83
84
  "calculate-scores": {
84
85
  description: "Calculate AI Literacy Scores from Promptfoo evaluation results",
85
- filesCreated: ["results/latest/score-summary.json"],
86
+ filesCreated: ["<outputDir>/score-summary.json"],
86
87
  filesRead: [
87
- "results/latest/eval-results.json",
88
+ "<outputDir>/eval-results.json",
88
89
  "config/rubrics.ts",
89
90
  "config/models.ts",
90
91
  ],
@@ -104,23 +105,20 @@ const EXPLAIN_REGISTRY = {
104
105
  {
105
106
  cacheStatus: "miss",
106
107
  name: "Write summary",
107
- reason: "Persist score-summary.json to results/latest/",
108
+ reason: "Persist score-summary.json to output directory",
108
109
  willRun: true,
109
110
  },
110
111
  ],
111
112
  },
112
113
  compare: {
113
114
  description: "Compare current evaluation scores against a saved baseline",
114
- filesCreated: ["results/latest/comparison-report.json"],
115
- filesRead: [
116
- "results/latest/score-summary.json",
117
- "results/baselines/*.json",
118
- ],
115
+ filesCreated: ["<outputDir>/comparison-report.json"],
116
+ filesRead: ["<outputDir>/score-summary.json", "results/baselines/*.json"],
119
117
  steps: [
120
118
  {
121
119
  cacheStatus: "miss",
122
120
  name: "Load current scores",
123
- reason: "Read results/latest/score-summary.json",
121
+ reason: "Read <outputDir>/score-summary.json",
124
122
  willRun: true,
125
123
  },
126
124
  {
@@ -181,8 +179,8 @@ const EXPLAIN_REGISTRY = {
181
179
  },
182
180
  "discovery-report": {
183
181
  description: "Generate agent discoverability report from agentic retrieval metrics",
184
- filesCreated: ["results/latest/discovery-report.md"],
185
- filesRead: ["results/latest/score-summary.json"],
182
+ filesCreated: ["<outputDir>/discovery-report.md"],
183
+ filesRead: ["<outputDir>/score-summary.json"],
186
184
  steps: [
187
185
  {
188
186
  cacheStatus: "miss",
@@ -206,7 +204,7 @@ const EXPLAIN_REGISTRY = {
206
204
  },
207
205
  eval: {
208
206
  description: "Run Promptfoo evaluation directly (passthrough — all flags forwarded to promptfoo)",
209
- filesCreated: ["results/latest/eval-results.json"],
207
+ filesCreated: ["<outputDir>/eval-results.json"],
210
208
  filesRead: ["promptfooconfig.yaml"],
211
209
  steps: [
212
210
  {
@@ -280,7 +278,7 @@ const EXPLAIN_REGISTRY = {
280
278
  grader: {
281
279
  description: "Grader reliability tools (consistency, compare, sensitivity, validate)",
282
280
  filesRead: [
283
- "results/latest/eval-results.json",
281
+ "<outputDir>/eval-results.json",
284
282
  "config/rubrics.ts",
285
283
  "canonical/reference-solutions/",
286
284
  ],
@@ -312,7 +310,7 @@ const EXPLAIN_REGISTRY = {
312
310
  },
313
311
  "measure-retrieval": {
314
312
  description: "Measure Sanity text search retrieval quality against canonical document annotations",
315
- filesCreated: ["results/latest/retrieval-metrics.json"],
313
+ filesCreated: ["<outputDir>/retrieval-metrics.json"],
316
314
  filesRead: ["tasks/literacy/*.task.ts"],
317
315
  steps: [
318
316
  {
@@ -337,7 +335,7 @@ const EXPLAIN_REGISTRY = {
337
335
  },
338
336
  "pr-comment": {
339
337
  description: "Generate a markdown PR comment from evaluation scores for CI posting",
340
- filesRead: ["results/latest/score-summary.json"],
338
+ filesRead: ["<outputDir>/score-summary.json"],
341
339
  steps: [
342
340
  {
343
341
  cacheStatus: "miss",
@@ -355,7 +353,7 @@ const EXPLAIN_REGISTRY = {
355
353
  },
356
354
  publish: {
357
355
  description: "Publish a local evaluation report to the Sanity Content Lake (standalone)",
358
- filesRead: ["results/latest/score-summary.json"],
356
+ filesRead: ["<outputDir>/score-summary.json"],
359
357
  steps: [
360
358
  {
361
359
  cacheStatus: "miss",
@@ -386,12 +384,12 @@ const EXPLAIN_REGISTRY = {
386
384
  "readiness-report": {
387
385
  description: "Generate launch readiness checklist for a feature area with threshold evaluation",
388
386
  filesRead: [
389
- "results/latest/score-summary.json",
390
- "results/latest/gap-analysis.json",
387
+ "<outputDir>/score-summary.json",
388
+ "<outputDir>/gap-analysis.json",
391
389
  "config/thresholds.ts",
392
390
  "results/baselines/",
393
391
  ],
394
- filesCreated: ["results/latest/readiness-report.md"],
392
+ filesCreated: ["<outputDir>/readiness-report.md"],
395
393
  steps: [
396
394
  {
397
395
  cacheStatus: "miss",
@@ -603,7 +601,7 @@ function buildInitExplainPlan(actionCommand, rootDir) {
603
601
  const configFile = format === "ts"
604
602
  ? "ailf.config.ts"
605
603
  : `config.${format === "yaml" ? "yaml" : "json"}`;
606
- const callerCwd = process.env.AILF_CALLER_CWD ?? process.cwd();
604
+ const callerCwd = getCallerCwd();
607
605
  const targetDir = opts.path ?? ".";
608
606
  const ailfDir = `${targetDir}/.ailf`;
609
607
  const tasksDir = `${ailfDir}/tasks`;
@@ -664,7 +662,7 @@ function buildBaselineExplainPlan(actionCommand, rootDir) {
664
662
  command: `baseline ${subcommand}`,
665
663
  description: descriptions[subcommand] ?? `Baseline operation: ${subcommand}`,
666
664
  filesCreated: subcommand === "save" ? ["results/baselines/<timestamp>.json"] : [],
667
- filesRead: ["results/latest/score-summary.json", "results/baselines/"],
665
+ filesRead: ["<outputDir>/score-summary.json", "results/baselines/"],
668
666
  rootDir,
669
667
  });
670
668
  }
@@ -11,7 +11,8 @@ import { Command } from "commander";
11
11
  import { createAppContext } from "../composition-root.js";
12
12
  import { loadSource } from "../sources.js";
13
13
  import { configToSourceOverrides } from "../orchestration/config-to-source-overrides.js";
14
- import { addSanitySourceOptions } from "./shared/options.js";
14
+ import { addOutputDirOption, addSanitySourceOptions } from "./shared/options.js";
15
+ import { resolveOutputDir } from "./shared/resolve-output-dir.js";
15
16
  const __dirname = dirname(fileURLToPath(import.meta.url));
16
17
  const ROOT = resolve(__dirname, "..", "..");
17
18
  export function createFetchDocsCommand() {
@@ -31,6 +32,7 @@ export function createFetchDocsCommand() {
31
32
  }
32
33
  });
33
34
  addSanitySourceOptions(cmd);
35
+ addOutputDirOption(cmd);
34
36
  return cmd;
35
37
  }
36
38
  // ---------------------------------------------------------------------------
@@ -41,7 +43,7 @@ async function executeFetchDocs(opts) {
41
43
  // Build a minimal ResolvedConfig for the composition root
42
44
  const ctx = createAppContext({
43
45
  rootDir: ROOT,
44
- outputDir: resolve(ROOT, "results", "latest"),
46
+ outputDir: resolveOutputDir(opts.outputDir),
45
47
  mode: "literacy",
46
48
  noAutoScope: false,
47
49
  skipFetch: false,
@@ -9,17 +9,19 @@ import { fileURLToPath } from "url";
9
9
  import { Command } from "commander";
10
10
  import { createAppContext } from "../composition-root.js";
11
11
  import { GenerateConfigsStep } from "../orchestration/steps/generate-configs-step.js";
12
+ import { addOutputDirOption } from "./shared/options.js";
13
+ import { resolveOutputDir } from "./shared/resolve-output-dir.js";
12
14
  const __dirname = dirname(fileURLToPath(import.meta.url));
13
15
  const ROOT = resolve(__dirname, "..", "..");
14
16
  export function createGenerateConfigsCommand() {
15
- return new Command("generate-configs")
17
+ const cmd = new Command("generate-configs")
16
18
  .description("Generate promptfoo config files from config/models.yaml")
17
19
  .option("-s, --source <name>", "Documentation source name")
18
20
  .action(async (opts) => {
19
21
  try {
20
22
  const ctx = createAppContext({
21
23
  rootDir: ROOT,
22
- outputDir: resolve(ROOT, "results", "latest"),
24
+ outputDir: resolveOutputDir(opts.outputDir),
23
25
  mode: "literacy",
24
26
  noAutoScope: false,
25
27
  skipFetch: true,
@@ -58,4 +60,6 @@ export function createGenerateConfigsCommand() {
58
60
  console.error(err.message);
59
61
  }
60
62
  });
63
+ addOutputDirOption(cmd);
64
+ return cmd;
61
65
  }
@@ -21,6 +21,7 @@ import { buildStepSequence } from "../orchestration/build-step-sequence.js";
21
21
  import { orchestratePipeline } from "../orchestration/pipeline-orchestrator.js";
22
22
  import { load } from "js-yaml";
23
23
  import { parseRepoConfig, } from "../adapters/task-sources/repo-schemas.js";
24
+ import { getCallerCwd, resolveOutputDir } from "./shared/resolve-output-dir.js";
24
25
  const __dirname = dirname(fileURLToPath(import.meta.url));
25
26
  const ROOT = resolve(__dirname, "..", "..");
26
27
  // ---------------------------------------------------------------------------
@@ -35,7 +36,7 @@ const VALID_SEARCH_MODES = ["open", "origin-only", "off"];
35
36
  */
36
37
  export function computeResolvedOptions(opts) {
37
38
  // Resolve paths relative to the caller's cwd, not the eval package root
38
- const callerCwd = process.env.AILF_CALLER_CWD ?? process.cwd();
39
+ const callerCwd = getCallerCwd();
39
40
  // Validate + normalize mode via the single boundary function.
40
41
  // normalizeMode() maps legacy variant names (baseline, agentic, etc.)
41
42
  // to canonical mode "literacy" + variant, and throws on invalid input.
@@ -209,23 +210,12 @@ export function computeResolvedOptions(opts) {
209
210
  const remote = opts.remote || process.env.AILF_REMOTE === "1";
210
211
  const apiUrl = opts.apiUrl ?? process.env.AILF_API_URL ?? "https://ailf-api.sanity.build";
211
212
  const apiKey = process.env.AILF_API_KEY ?? undefined;
212
- // Output directory: explicit flag → repo-task heuristic default
213
+ // Output directory: explicit --output-dir → $CWD/.ailf/results/latest/
213
214
  const resolvedRepoTasksPath = opts.repoTasksPath
214
215
  ? resolve(callerCwd, opts.repoTasksPath)
215
216
  : undefined;
216
217
  const resolvedTaskSourceType = resolveTaskSourceType(opts.taskSource);
217
- let outputDir;
218
- if (opts.outputDir) {
219
- outputDir = resolve(callerCwd, opts.outputDir);
220
- }
221
- else if (resolvedTaskSourceType === "repo" || resolvedRepoTasksPath) {
222
- outputDir = resolvedRepoTasksPath
223
- ? resolve(resolvedRepoTasksPath, "..", "results", "latest")
224
- : resolve(callerCwd, ".ailf", "results", "latest");
225
- }
226
- else {
227
- outputDir = resolve(ROOT, "results", "latest");
228
- }
218
+ const outputDir = resolveOutputDir(opts.outputDir);
229
219
  return {
230
220
  allowedOriginArgs,
231
221
  apiKey,
@@ -310,7 +300,7 @@ export async function executePipeline(cliOpts) {
310
300
  }
311
301
  const { FileConfigAdapter } = await import("../adapters/config-sources/file-config-adapter.js");
312
302
  const { createAppContext } = await import("../composition-root.js");
313
- const callerCwd = process.env.AILF_CALLER_CWD ?? process.cwd();
303
+ const callerCwd = getCallerCwd();
314
304
  const adapter = new FileConfigAdapter(cliOpts.config, ROOT);
315
305
  const config = await adapter.resolve();
316
306
  // Merge CLI-only flags that aren't in the config file.
@@ -323,13 +313,8 @@ export async function executePipeline(cliOpts) {
323
313
  if (cliOpts.output) {
324
314
  config.outputPath = resolve(callerCwd, cliOpts.output);
325
315
  }
326
- // Output dir: explicit CLI flag → repo-task heuristic → file-config default
327
- if (cliOpts.outputDir) {
328
- config.outputDir = resolve(callerCwd, cliOpts.outputDir);
329
- }
330
- else if (config.repoTasksPath) {
331
- config.outputDir = resolve(config.repoTasksPath, "..", "results", "latest");
332
- }
316
+ // Output dir: explicit CLI flag → $CWD/.ailf/results/latest/
317
+ config.outputDir = resolveOutputDir(cliOpts.outputDir);
333
318
  // Create AppContext directly from the merged config so adapters
334
319
  // (especially taskSource) are wired from the file config's
335
320
  // taskSourceType — not from CLI defaults.
@@ -350,8 +335,7 @@ export async function executePipeline(cliOpts) {
350
335
  // cache which never contains .ailf/.
351
336
  if (o.remote) {
352
337
  const { runRemotePipeline } = await import("./remote-pipeline.js");
353
- const callerCwd = process.env.AILF_CALLER_CWD ?? process.cwd();
354
- await runRemotePipeline(o, callerCwd);
338
+ await runRemotePipeline(o, getCallerCwd());
355
339
  return;
356
340
  }
357
341
  // Dry-run: validate only, don't execute steps
@@ -55,7 +55,7 @@ export function createPipelineCommand() {
55
55
  .option("--remote", "Submit evaluation to the AILF API instead of running locally", false)
56
56
  .option("--api-url <url>", "AILF API base URL (default: https://ailf-api.sanity.build)")
57
57
  .option("--capture", "Enable artifact capture for this run", false)
58
- .option("--capture-dir <path>", "Base directory for capture output (default: results/captures/)")
58
+ .option("--capture-dir <path>", "Base directory for capture output (default: .ailf/results/captures/)")
59
59
  .option("--no-capture-compress", "Disable tar.gz compression of captures")
60
60
  .option("--no-capture-extras", "Exclude mode-specific artifacts from captures")
61
61
  .action(async (opts) => {
@@ -9,10 +9,12 @@ import { fileURLToPath } from "url";
9
9
  import { Command } from "commander";
10
10
  import { createAppContext } from "../composition-root.js";
11
11
  import { generatePrComment } from "../pipeline/pr-comment.js";
12
+ import { addOutputDirOption } from "./shared/options.js";
13
+ import { resolveOutputDir } from "./shared/resolve-output-dir.js";
12
14
  const __dirname = dirname(fileURLToPath(import.meta.url));
13
15
  const ROOT = resolve(__dirname, "..", "..");
14
16
  export function createPrCommentCommand() {
15
- return new Command("pr-comment")
17
+ const cmd = new Command("pr-comment")
16
18
  .description("Generate a markdown PR comment from evaluation scores")
17
19
  .option("--output <path>", "Write comment to file (default: stdout)")
18
20
  .option("--promptfoo-url <url>", "Promptfoo share URL to include")
@@ -20,7 +22,7 @@ export function createPrCommentCommand() {
20
22
  try {
21
23
  const ctx = createAppContext({
22
24
  rootDir: ROOT,
23
- outputDir: resolve(ROOT, "results", "latest"),
25
+ outputDir: resolveOutputDir(opts.outputDir),
24
26
  mode: "literacy",
25
27
  noAutoScope: false,
26
28
  skipFetch: true,
@@ -48,4 +50,6 @@ export function createPrCommentCommand() {
48
50
  console.error(err.message);
49
51
  }
50
52
  });
53
+ addOutputDirOption(cmd);
54
+ return cmd;
51
55
  }
@@ -21,6 +21,7 @@
21
21
  import { Command } from "commander";
22
22
  export interface PublishCommandOptions {
23
23
  dryRun: boolean;
24
+ outputDir?: string;
24
25
  tag?: string;
25
26
  }
26
27
  export declare function createPublishCommand(): Command;
@@ -23,22 +23,27 @@ import { dirname, resolve } from "path";
23
23
  import { fileURLToPath } from "url";
24
24
  import { Command } from "commander";
25
25
  import { createAppContext } from "../composition-root.js";
26
+ import { addOutputDirOption } from "./shared/options.js";
27
+ import { getCallerCwd, resolveOutputDir } from "./shared/resolve-output-dir.js";
26
28
  import { buildProvenance, } from "../pipeline/provenance.js";
27
29
  import { generateReportTitle } from "../pipeline/report-title.js";
28
30
  import { generateReportId, } from "../report-store.js";
29
31
  import { withRetry } from "../sinks/retry.js";
30
32
  const __dirname = dirname(fileURLToPath(import.meta.url));
31
33
  const ROOT = resolve(__dirname, "..", "..");
32
- const DEFAULT_SUMMARY_PATH = resolve(ROOT, "results", "latest", "score-summary.json");
33
34
  export function createPublishCommand() {
34
- return new Command("publish")
35
+ const cmd = new Command("publish")
35
36
  .description("Publish a local evaluation report to the Sanity Content Lake")
36
- .argument("[summary-path]", "Path to score-summary.json", DEFAULT_SUMMARY_PATH)
37
+ .argument("[summary-path]", "Path to score-summary.json")
37
38
  .option("-t, --tag <tag>", "Label for the published report")
38
39
  .option("-n, --dry-run", "Preview the report without writing to Sanity or sinks", false)
39
40
  .action(async (summaryPath, opts) => {
40
- await runPublishCommand(summaryPath, opts);
41
+ const outputDir = resolveOutputDir(opts.outputDir);
42
+ const effectivePath = summaryPath ?? resolve(outputDir, "score-summary.json");
43
+ await runPublishCommand(effectivePath, outputDir, opts);
41
44
  });
45
+ addOutputDirOption(cmd);
46
+ return cmd;
42
47
  }
43
48
  // ---------------------------------------------------------------------------
44
49
  // Provenance builder (from score summary, not full pipeline context)
@@ -77,7 +82,7 @@ function buildProvenanceFromSummary(summary) {
77
82
  // ---------------------------------------------------------------------------
78
83
  // Command implementation
79
84
  // ---------------------------------------------------------------------------
80
- async function runPublishCommand(summaryPath, opts) {
85
+ async function runPublishCommand(summaryPath, outputDir, opts) {
81
86
  // Wire up infrastructure via composition root
82
87
  const ctx = createAppContext({
83
88
  compareEnabled: false,
@@ -87,7 +92,7 @@ async function runPublishCommand(summaryPath, opts) {
87
92
  noAutoScope: false,
88
93
  noCache: true,
89
94
  noRemoteCache: true,
90
- outputDir: resolve(ROOT, "results", "latest"),
95
+ outputDir,
91
96
  publishEnabled: true,
92
97
  publishTag: opts.tag,
93
98
  readinessEnabled: false,
@@ -106,8 +111,7 @@ async function runPublishCommand(summaryPath, opts) {
106
111
  // -----------------------------------------------------------------------
107
112
  // 1. Resolve and read the score summary
108
113
  // -----------------------------------------------------------------------
109
- const callerCwd = process.env.AILF_CALLER_CWD ?? process.cwd();
110
- const resolvedPath = resolve(callerCwd, summaryPath);
114
+ const resolvedPath = resolve(getCallerCwd(), summaryPath);
111
115
  if (!existsSync(resolvedPath)) {
112
116
  console.error(` ✖ File not found: ${resolvedPath}`);
113
117
  console.error();
@@ -88,7 +88,7 @@ export async function runRemotePipeline(opts, rootDir) {
88
88
  }
89
89
  // 7. Fetch and write output artifacts
90
90
  await writeRemoteResults(client, job, {
91
- rootDir,
91
+ outputDir: opts.outputDir,
92
92
  outputPath: opts.outputPath,
93
93
  apiUrl: opts.apiUrl,
94
94
  });