@sanity/ailf 2.2.0 → 2.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/rubrics.ts +3 -3
- package/dist/_vendor/ailf-core/types/index.d.ts +25 -0
- package/dist/adapters/task-sources/content-lake-task-source.js +15 -7
- package/dist/commands/calculate-scores.js +7 -2
- package/dist/commands/capture-list.d.ts +1 -1
- package/dist/commands/capture-list.js +6 -3
- package/dist/commands/compare.js +11 -7
- package/dist/commands/explain-handler.js +22 -24
- package/dist/commands/fetch-docs.js +4 -2
- package/dist/commands/generate-configs.js +6 -2
- package/dist/commands/pipeline-action.js +8 -24
- package/dist/commands/pipeline.js +1 -1
- package/dist/commands/pr-comment.js +6 -2
- package/dist/commands/publish.d.ts +1 -0
- package/dist/commands/publish.js +12 -8
- package/dist/commands/remote-pipeline.js +1 -1
- package/dist/commands/remote-results.d.ts +8 -8
- package/dist/commands/remote-results.js +7 -7
- package/dist/commands/shared/options.d.ts +8 -0
- package/dist/commands/shared/options.js +10 -0
- package/dist/commands/shared/resolve-output-dir.d.ts +27 -0
- package/dist/commands/shared/resolve-output-dir.js +36 -0
- package/dist/composition-root.js +1 -1
- package/dist/config/rubrics.ts +3 -3
- package/dist/orchestration/build-app-context.js +1 -1
- package/dist/orchestration/steps/fetch-docs-step.js +23 -9
- package/dist/orchestration/steps/gap-analysis-step.js +86 -75
- package/dist/orchestration/steps/generate-configs-step.d.ts +15 -0
- package/dist/orchestration/steps/generate-configs-step.js +56 -0
- package/dist/orchestration/steps/run-eval-step.js +14 -0
- package/dist/pipeline/calculate-scores.js +113 -2
- package/dist/pipeline/compare.js +50 -19
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +64 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +6 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +14 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +1 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +3 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +1 -27
- package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +2 -9
- package/dist/pipeline/compiler/rubric-resolution.d.ts +40 -0
- package/dist/pipeline/compiler/rubric-resolution.js +52 -0
- package/dist/pipeline/compiler/scoring-bridge.js +59 -7
- package/dist/pipeline/provenance.js +7 -1
- package/dist/pipeline/validate.d.ts +5 -4
- package/dist/pipeline/validate.js +34 -113
- package/dist/webhook/eval-request-handler.js +4 -0
- package/package.json +1 -1
package/config/rubrics.ts
CHANGED
|
@@ -201,9 +201,9 @@ export default defineRubrics({
|
|
|
201
201
|
currency: 0.2,
|
|
202
202
|
},
|
|
203
203
|
"agent-harness": {
|
|
204
|
-
"
|
|
205
|
-
"
|
|
206
|
-
"
|
|
204
|
+
"assertion-pass-rate": 0.35,
|
|
205
|
+
"agent-output": 0.35,
|
|
206
|
+
"tool-usage": 0.3,
|
|
207
207
|
},
|
|
208
208
|
},
|
|
209
209
|
|
|
@@ -126,31 +126,56 @@ export interface FeatureScore {
|
|
|
126
126
|
* Only present when agentic evaluation data is available.
|
|
127
127
|
*/
|
|
128
128
|
actualScore?: number;
|
|
129
|
+
/**
|
|
130
|
+
* Assertion pass rate — fraction of structural assertions that passed (0–100).
|
|
131
|
+
* Only present for modes with javascript assertions (agent-harness, agent-task).
|
|
132
|
+
*/
|
|
133
|
+
assertionPassRate?: number;
|
|
129
134
|
/**
|
|
130
135
|
* Ceiling score — gold-standard docs injected directly.
|
|
131
136
|
* This is the theoretical maximum score for this area given the current docs.
|
|
137
|
+
* Set to 0 for modes without with/without-docs variants (agent-harness).
|
|
132
138
|
*/
|
|
133
139
|
ceilingScore: number;
|
|
134
140
|
codeCorrectness: number;
|
|
141
|
+
/**
|
|
142
|
+
* Generic dimension scores map — all dimensions by kebab-case name (0–100).
|
|
143
|
+
* Includes the three named fields above plus any mode-specific dimensions
|
|
144
|
+
* (e.g., agent-output, tool-usage, assertion-pass-rate).
|
|
145
|
+
* New consumers should read from this map. The named fields are backward-
|
|
146
|
+
* compatible accessors populated from it.
|
|
147
|
+
*/
|
|
148
|
+
dimensions?: Record<string, number>;
|
|
135
149
|
docCoverage: number;
|
|
136
150
|
/** Sanity documents used for this feature area's evaluation */
|
|
137
151
|
documents?: DocumentRef[];
|
|
138
152
|
/**
|
|
139
153
|
* Doc Lift — documentation quality contribution (ceiling − floor).
|
|
140
154
|
* Positive when docs help, negative when docs hurt (interference).
|
|
155
|
+
* Set to 0 for modes without with/without-docs variants (agent-harness).
|
|
141
156
|
*/
|
|
142
157
|
docLift: number;
|
|
143
158
|
/**
|
|
144
159
|
* Doc quality gap — room for documentation improvement (100 − ceiling).
|
|
145
160
|
* Lower is better.
|
|
161
|
+
* Set to 0 for modes without with/without-docs variants (agent-harness).
|
|
146
162
|
*/
|
|
147
163
|
docQualityGap: number;
|
|
148
164
|
feature: string;
|
|
149
165
|
/**
|
|
150
166
|
* Floor score — no docs, training data only.
|
|
151
167
|
* The model's inherent knowledge baseline.
|
|
168
|
+
* Set to 0 for modes without with/without-docs variants (agent-harness).
|
|
152
169
|
*/
|
|
153
170
|
floorScore: number;
|
|
171
|
+
/**
|
|
172
|
+
* How this score entry was grouped.
|
|
173
|
+
* - "feature": by documentation feature area (literacy mode)
|
|
174
|
+
* - "task": by individual task ID (agent-harness mode)
|
|
175
|
+
* - "aggregate": single aggregate across all tasks
|
|
176
|
+
* Defaults to "feature" when absent (backward compatibility).
|
|
177
|
+
*/
|
|
178
|
+
groupType?: "aggregate" | "feature" | "task";
|
|
154
179
|
/**
|
|
155
180
|
* Infrastructure efficiency — actual / ceiling (0.0–1.0).
|
|
156
181
|
* What fraction of documentation quality reaches agents through discovery?
|
|
@@ -28,7 +28,13 @@
|
|
|
28
28
|
*/
|
|
29
29
|
const TASKS_QUERY = /* groq */ `
|
|
30
30
|
*[_type == "ailf.task"
|
|
31
|
-
&& (
|
|
31
|
+
&& (
|
|
32
|
+
!defined($areas)
|
|
33
|
+
// Current field name
|
|
34
|
+
|| area->areaId.current in $areas
|
|
35
|
+
// Legacy field name (pre-schema-rename documents)
|
|
36
|
+
|| featureArea->areaId.current in $areas
|
|
37
|
+
)
|
|
32
38
|
&& (!defined($taskIds) || id.current in $taskIds)
|
|
33
39
|
&& (
|
|
34
40
|
// Status-based filtering (unified — replaces execution.enabled)
|
|
@@ -39,13 +45,15 @@ const TASKS_QUERY = /* groq */ `
|
|
|
39
45
|
|| (defined($taskIds) && status != "archived")
|
|
40
46
|
)
|
|
41
47
|
&& (!defined($tags) || count((tags)[@ in $tags]) > 0)
|
|
42
|
-
] | order(area->areaId.current asc, id.current asc) {
|
|
48
|
+
] | order(coalesce(area->areaId.current, featureArea->areaId.current) asc, id.current asc) {
|
|
43
49
|
"taskId": id.current,
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
50
|
+
// Coalesce current and legacy field names so documents created before
|
|
51
|
+
// the schema rename are still readable.
|
|
52
|
+
"title": coalesce(title, description),
|
|
53
|
+
"areaId": coalesce(area->areaId.current, featureArea->areaId.current),
|
|
54
|
+
"promptText": coalesce(promptText, taskPrompt),
|
|
47
55
|
docCoverage,
|
|
48
|
-
"contextDocs": contextDocs[] {
|
|
56
|
+
"contextDocs": coalesce(contextDocs, canonicalDocs)[] {
|
|
49
57
|
refType,
|
|
50
58
|
"slug": doc->slug.current,
|
|
51
59
|
"docRefId": doc->_id,
|
|
@@ -55,7 +63,7 @@ const TASKS_QUERY = /* groq */ `
|
|
|
55
63
|
perspective,
|
|
56
64
|
reason
|
|
57
65
|
},
|
|
58
|
-
assertions,
|
|
66
|
+
"assertions": coalesce(assertions, assert),
|
|
59
67
|
rawAssert,
|
|
60
68
|
baseline,
|
|
61
69
|
tags,
|
|
@@ -9,18 +9,21 @@ import { fileURLToPath } from "url";
|
|
|
9
9
|
import { Command } from "commander";
|
|
10
10
|
import { createAppContext } from "../composition-root.js";
|
|
11
11
|
import { calculateAndWriteScores } from "../pipeline/calculate-scores.js";
|
|
12
|
+
import { addOutputDirOption } from "./shared/options.js";
|
|
13
|
+
import { resolveOutputDir } from "./shared/resolve-output-dir.js";
|
|
12
14
|
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
13
15
|
const ROOT = resolve(__dirname, "..", "..");
|
|
14
16
|
export function createCalculateScoresCommand() {
|
|
15
|
-
|
|
17
|
+
const cmd = new Command("calculate-scores")
|
|
16
18
|
.description("Calculate AI Literacy Scores from Promptfoo evaluation results")
|
|
17
19
|
.option("--source <name>", "Documentation source name")
|
|
18
20
|
.argument("[results-path]", "Path to eval-results.json")
|
|
19
21
|
.action(async (resultsPath, opts) => {
|
|
20
22
|
try {
|
|
23
|
+
const outputDir = resolveOutputDir(opts.outputDir);
|
|
21
24
|
const ctx = createAppContext({
|
|
22
25
|
rootDir: ROOT,
|
|
23
|
-
outputDir
|
|
26
|
+
outputDir,
|
|
24
27
|
mode: "literacy",
|
|
25
28
|
noAutoScope: false,
|
|
26
29
|
skipFetch: true,
|
|
@@ -53,4 +56,6 @@ export function createCalculateScoresCommand() {
|
|
|
53
56
|
console.error(err.message);
|
|
54
57
|
}
|
|
55
58
|
});
|
|
59
|
+
addOutputDirOption(cmd);
|
|
60
|
+
return cmd;
|
|
56
61
|
}
|
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
* reads each manifest, and prints a summary table sorted by date.
|
|
6
6
|
*
|
|
7
7
|
* Usage:
|
|
8
|
-
* ailf capture list # default: results/captures/
|
|
8
|
+
* ailf capture list # default: .ailf/results/captures/
|
|
9
9
|
* ailf capture list ./my-captures # custom directory
|
|
10
10
|
*/
|
|
11
11
|
import { Command } from "commander";
|
|
@@ -5,22 +5,25 @@
|
|
|
5
5
|
* reads each manifest, and prints a summary table sorted by date.
|
|
6
6
|
*
|
|
7
7
|
* Usage:
|
|
8
|
-
* ailf capture list # default: results/captures/
|
|
8
|
+
* ailf capture list # default: .ailf/results/captures/
|
|
9
9
|
* ailf capture list ./my-captures # custom directory
|
|
10
10
|
*/
|
|
11
11
|
import { existsSync, readdirSync, readFileSync, statSync } from "node:fs";
|
|
12
12
|
import { join, resolve } from "node:path";
|
|
13
13
|
import { Command } from "commander";
|
|
14
|
+
import { resolveOutputDir } from "./shared/resolve-output-dir.js";
|
|
14
15
|
// ---------------------------------------------------------------------------
|
|
15
16
|
// Command factory
|
|
16
17
|
// ---------------------------------------------------------------------------
|
|
17
18
|
export function createCaptureListCommand() {
|
|
18
19
|
return new Command("list")
|
|
19
20
|
.description("List pipeline captures in a directory")
|
|
20
|
-
.argument("[dir]", "Captures directory (default: results/captures/)")
|
|
21
|
+
.argument("[dir]", "Captures directory (default: .ailf/results/captures/)")
|
|
21
22
|
.option("-f, --format <fmt>", "Output format: table or json", "table")
|
|
22
23
|
.action(async (dir, opts) => {
|
|
23
|
-
const captureDir =
|
|
24
|
+
const captureDir = dir
|
|
25
|
+
? resolve(dir)
|
|
26
|
+
: resolve(resolveOutputDir(), "..", "captures");
|
|
24
27
|
if (!existsSync(captureDir)) {
|
|
25
28
|
console.error(` No captures directory found at ${captureDir}`);
|
|
26
29
|
console.error(" Run 'ailf pipeline --capture' to create captures.");
|
package/dist/commands/compare.js
CHANGED
|
@@ -9,29 +9,31 @@ import { dirname, join, resolve } from "path";
|
|
|
9
9
|
import { fileURLToPath } from "url";
|
|
10
10
|
import { Command } from "commander";
|
|
11
11
|
import { compare } from "../pipeline/compare.js";
|
|
12
|
+
import { addOutputDirOption } from "./shared/options.js";
|
|
13
|
+
import { resolveOutputDir } from "./shared/resolve-output-dir.js";
|
|
12
14
|
import { DEFAULT_NOISE_THRESHOLD, } from "../pipeline/types.js";
|
|
13
15
|
import { formatComparisonTable } from "../_vendor/ailf-core/index.js";
|
|
14
16
|
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
15
17
|
const ROOT = resolve(__dirname, "..", "..");
|
|
16
18
|
const BASELINES_DIR = join(ROOT, "results", "baselines");
|
|
17
|
-
const SCORE_SUMMARY_PATH = join(ROOT, "results", "latest", "score-summary.json");
|
|
18
19
|
// ---------------------------------------------------------------------------
|
|
19
20
|
// Helpers
|
|
20
21
|
// ---------------------------------------------------------------------------
|
|
21
22
|
export function createCompareCommand() {
|
|
22
|
-
|
|
23
|
+
const cmd = new Command("compare")
|
|
23
24
|
.description("Compare two evaluation score summaries")
|
|
24
25
|
.option("-b, --baseline <path>", "Baseline score-summary.json (default: latest baseline)")
|
|
25
|
-
.option("-e, --experiment <path>", "Experiment score-summary.json (default: results/latest/score-summary.json)")
|
|
26
|
+
.option("-e, --experiment <path>", "Experiment score-summary.json (default: .ailf/results/latest/score-summary.json)")
|
|
26
27
|
.option("-t, --threshold <n>", "Noise threshold for unchanged classification", parseFloat)
|
|
27
28
|
.option("-o, --output <path>", "Write JSON report to file")
|
|
28
29
|
.option("-f, --format <fmt>", "Output format: table or json", "table")
|
|
29
30
|
.action(async (opts) => {
|
|
31
|
+
const outputDir = resolveOutputDir(opts.outputDir);
|
|
30
32
|
const threshold = opts.threshold ?? DEFAULT_NOISE_THRESHOLD;
|
|
31
33
|
// Resolve experiment path
|
|
32
34
|
const expPath = opts.experiment
|
|
33
35
|
? resolve(opts.experiment)
|
|
34
|
-
:
|
|
36
|
+
: join(outputDir, "score-summary.json");
|
|
35
37
|
const experiment = loadSummary(expPath);
|
|
36
38
|
// Resolve baseline path
|
|
37
39
|
let basePath;
|
|
@@ -48,7 +50,7 @@ export function createCompareCommand() {
|
|
|
48
50
|
}
|
|
49
51
|
const baseline = loadSummary(basePath);
|
|
50
52
|
// Try to load grader consistency data for empirical thresholds
|
|
51
|
-
const consistencyPath = join(
|
|
53
|
+
const consistencyPath = join(outputDir, "grader-consistency.json");
|
|
52
54
|
let graderConsistency;
|
|
53
55
|
if (existsSync(consistencyPath) && opts.threshold === undefined) {
|
|
54
56
|
try {
|
|
@@ -93,10 +95,12 @@ export function createCompareCommand() {
|
|
|
93
95
|
console.log(` ✅ Comparison report also written to ${opts.output}`);
|
|
94
96
|
}
|
|
95
97
|
}
|
|
96
|
-
// Write comparison report to
|
|
97
|
-
const latestComparisonPath = join(
|
|
98
|
+
// Write comparison report to output dir for other steps to consume
|
|
99
|
+
const latestComparisonPath = join(outputDir, "comparison-report.json");
|
|
98
100
|
writeFileSync(latestComparisonPath, JSON.stringify(report, null, 2));
|
|
99
101
|
});
|
|
102
|
+
addOutputDirOption(cmd);
|
|
103
|
+
return cmd;
|
|
100
104
|
}
|
|
101
105
|
function findLatestBaseline() {
|
|
102
106
|
if (!existsSync(BASELINES_DIR))
|
|
@@ -23,6 +23,7 @@ import { TASK_FILE_NAMES } from "../_vendor/ailf-core/index.js";
|
|
|
23
23
|
import { buildPipelinePlan, buildSimpleCommandPlan, } from "../pipeline/plan.js";
|
|
24
24
|
import { formatPlanConsole, formatPlanJson } from "../pipeline/plan-format.js";
|
|
25
25
|
import { computeResolvedOptions } from "./pipeline-action.js";
|
|
26
|
+
import { getCallerCwd } from "./shared/resolve-output-dir.js";
|
|
26
27
|
import { LiteracyVariant } from "../pipeline/normalize-mode.js";
|
|
27
28
|
// ---------------------------------------------------------------------------
|
|
28
29
|
// Registry
|
|
@@ -43,10 +44,10 @@ const EXPLAIN_REGISTRY = {
|
|
|
43
44
|
"agent-report": {
|
|
44
45
|
description: "Generate an agent behavior observation report from eval results",
|
|
45
46
|
filesCreated: [
|
|
46
|
-
"
|
|
47
|
-
"
|
|
47
|
+
"<outputDir>/agent-report.json",
|
|
48
|
+
"<outputDir>/agent-report.md",
|
|
48
49
|
],
|
|
49
|
-
filesRead: ["
|
|
50
|
+
filesRead: ["<outputDir>/eval-results.json"],
|
|
50
51
|
steps: [
|
|
51
52
|
{
|
|
52
53
|
cacheStatus: "miss",
|
|
@@ -82,9 +83,9 @@ const EXPLAIN_REGISTRY = {
|
|
|
82
83
|
},
|
|
83
84
|
"calculate-scores": {
|
|
84
85
|
description: "Calculate AI Literacy Scores from Promptfoo evaluation results",
|
|
85
|
-
filesCreated: ["
|
|
86
|
+
filesCreated: ["<outputDir>/score-summary.json"],
|
|
86
87
|
filesRead: [
|
|
87
|
-
"
|
|
88
|
+
"<outputDir>/eval-results.json",
|
|
88
89
|
"config/rubrics.ts",
|
|
89
90
|
"config/models.ts",
|
|
90
91
|
],
|
|
@@ -104,23 +105,20 @@ const EXPLAIN_REGISTRY = {
|
|
|
104
105
|
{
|
|
105
106
|
cacheStatus: "miss",
|
|
106
107
|
name: "Write summary",
|
|
107
|
-
reason: "Persist score-summary.json to
|
|
108
|
+
reason: "Persist score-summary.json to output directory",
|
|
108
109
|
willRun: true,
|
|
109
110
|
},
|
|
110
111
|
],
|
|
111
112
|
},
|
|
112
113
|
compare: {
|
|
113
114
|
description: "Compare current evaluation scores against a saved baseline",
|
|
114
|
-
filesCreated: ["
|
|
115
|
-
filesRead: [
|
|
116
|
-
"results/latest/score-summary.json",
|
|
117
|
-
"results/baselines/*.json",
|
|
118
|
-
],
|
|
115
|
+
filesCreated: ["<outputDir>/comparison-report.json"],
|
|
116
|
+
filesRead: ["<outputDir>/score-summary.json", "results/baselines/*.json"],
|
|
119
117
|
steps: [
|
|
120
118
|
{
|
|
121
119
|
cacheStatus: "miss",
|
|
122
120
|
name: "Load current scores",
|
|
123
|
-
reason: "Read
|
|
121
|
+
reason: "Read <outputDir>/score-summary.json",
|
|
124
122
|
willRun: true,
|
|
125
123
|
},
|
|
126
124
|
{
|
|
@@ -181,8 +179,8 @@ const EXPLAIN_REGISTRY = {
|
|
|
181
179
|
},
|
|
182
180
|
"discovery-report": {
|
|
183
181
|
description: "Generate agent discoverability report from agentic retrieval metrics",
|
|
184
|
-
filesCreated: ["
|
|
185
|
-
filesRead: ["
|
|
182
|
+
filesCreated: ["<outputDir>/discovery-report.md"],
|
|
183
|
+
filesRead: ["<outputDir>/score-summary.json"],
|
|
186
184
|
steps: [
|
|
187
185
|
{
|
|
188
186
|
cacheStatus: "miss",
|
|
@@ -206,7 +204,7 @@ const EXPLAIN_REGISTRY = {
|
|
|
206
204
|
},
|
|
207
205
|
eval: {
|
|
208
206
|
description: "Run Promptfoo evaluation directly (passthrough — all flags forwarded to promptfoo)",
|
|
209
|
-
filesCreated: ["
|
|
207
|
+
filesCreated: ["<outputDir>/eval-results.json"],
|
|
210
208
|
filesRead: ["promptfooconfig.yaml"],
|
|
211
209
|
steps: [
|
|
212
210
|
{
|
|
@@ -280,7 +278,7 @@ const EXPLAIN_REGISTRY = {
|
|
|
280
278
|
grader: {
|
|
281
279
|
description: "Grader reliability tools (consistency, compare, sensitivity, validate)",
|
|
282
280
|
filesRead: [
|
|
283
|
-
"
|
|
281
|
+
"<outputDir>/eval-results.json",
|
|
284
282
|
"config/rubrics.ts",
|
|
285
283
|
"canonical/reference-solutions/",
|
|
286
284
|
],
|
|
@@ -312,7 +310,7 @@ const EXPLAIN_REGISTRY = {
|
|
|
312
310
|
},
|
|
313
311
|
"measure-retrieval": {
|
|
314
312
|
description: "Measure Sanity text search retrieval quality against canonical document annotations",
|
|
315
|
-
filesCreated: ["
|
|
313
|
+
filesCreated: ["<outputDir>/retrieval-metrics.json"],
|
|
316
314
|
filesRead: ["tasks/literacy/*.task.ts"],
|
|
317
315
|
steps: [
|
|
318
316
|
{
|
|
@@ -337,7 +335,7 @@ const EXPLAIN_REGISTRY = {
|
|
|
337
335
|
},
|
|
338
336
|
"pr-comment": {
|
|
339
337
|
description: "Generate a markdown PR comment from evaluation scores for CI posting",
|
|
340
|
-
filesRead: ["
|
|
338
|
+
filesRead: ["<outputDir>/score-summary.json"],
|
|
341
339
|
steps: [
|
|
342
340
|
{
|
|
343
341
|
cacheStatus: "miss",
|
|
@@ -355,7 +353,7 @@ const EXPLAIN_REGISTRY = {
|
|
|
355
353
|
},
|
|
356
354
|
publish: {
|
|
357
355
|
description: "Publish a local evaluation report to the Sanity Content Lake (standalone)",
|
|
358
|
-
filesRead: ["
|
|
356
|
+
filesRead: ["<outputDir>/score-summary.json"],
|
|
359
357
|
steps: [
|
|
360
358
|
{
|
|
361
359
|
cacheStatus: "miss",
|
|
@@ -386,12 +384,12 @@ const EXPLAIN_REGISTRY = {
|
|
|
386
384
|
"readiness-report": {
|
|
387
385
|
description: "Generate launch readiness checklist for a feature area with threshold evaluation",
|
|
388
386
|
filesRead: [
|
|
389
|
-
"
|
|
390
|
-
"
|
|
387
|
+
"<outputDir>/score-summary.json",
|
|
388
|
+
"<outputDir>/gap-analysis.json",
|
|
391
389
|
"config/thresholds.ts",
|
|
392
390
|
"results/baselines/",
|
|
393
391
|
],
|
|
394
|
-
filesCreated: ["
|
|
392
|
+
filesCreated: ["<outputDir>/readiness-report.md"],
|
|
395
393
|
steps: [
|
|
396
394
|
{
|
|
397
395
|
cacheStatus: "miss",
|
|
@@ -603,7 +601,7 @@ function buildInitExplainPlan(actionCommand, rootDir) {
|
|
|
603
601
|
const configFile = format === "ts"
|
|
604
602
|
? "ailf.config.ts"
|
|
605
603
|
: `config.${format === "yaml" ? "yaml" : "json"}`;
|
|
606
|
-
const callerCwd =
|
|
604
|
+
const callerCwd = getCallerCwd();
|
|
607
605
|
const targetDir = opts.path ?? ".";
|
|
608
606
|
const ailfDir = `${targetDir}/.ailf`;
|
|
609
607
|
const tasksDir = `${ailfDir}/tasks`;
|
|
@@ -664,7 +662,7 @@ function buildBaselineExplainPlan(actionCommand, rootDir) {
|
|
|
664
662
|
command: `baseline ${subcommand}`,
|
|
665
663
|
description: descriptions[subcommand] ?? `Baseline operation: ${subcommand}`,
|
|
666
664
|
filesCreated: subcommand === "save" ? ["results/baselines/<timestamp>.json"] : [],
|
|
667
|
-
filesRead: ["
|
|
665
|
+
filesRead: ["<outputDir>/score-summary.json", "results/baselines/"],
|
|
668
666
|
rootDir,
|
|
669
667
|
});
|
|
670
668
|
}
|
|
@@ -11,7 +11,8 @@ import { Command } from "commander";
|
|
|
11
11
|
import { createAppContext } from "../composition-root.js";
|
|
12
12
|
import { loadSource } from "../sources.js";
|
|
13
13
|
import { configToSourceOverrides } from "../orchestration/config-to-source-overrides.js";
|
|
14
|
-
import { addSanitySourceOptions } from "./shared/options.js";
|
|
14
|
+
import { addOutputDirOption, addSanitySourceOptions } from "./shared/options.js";
|
|
15
|
+
import { resolveOutputDir } from "./shared/resolve-output-dir.js";
|
|
15
16
|
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
16
17
|
const ROOT = resolve(__dirname, "..", "..");
|
|
17
18
|
export function createFetchDocsCommand() {
|
|
@@ -31,6 +32,7 @@ export function createFetchDocsCommand() {
|
|
|
31
32
|
}
|
|
32
33
|
});
|
|
33
34
|
addSanitySourceOptions(cmd);
|
|
35
|
+
addOutputDirOption(cmd);
|
|
34
36
|
return cmd;
|
|
35
37
|
}
|
|
36
38
|
// ---------------------------------------------------------------------------
|
|
@@ -41,7 +43,7 @@ async function executeFetchDocs(opts) {
|
|
|
41
43
|
// Build a minimal ResolvedConfig for the composition root
|
|
42
44
|
const ctx = createAppContext({
|
|
43
45
|
rootDir: ROOT,
|
|
44
|
-
outputDir:
|
|
46
|
+
outputDir: resolveOutputDir(opts.outputDir),
|
|
45
47
|
mode: "literacy",
|
|
46
48
|
noAutoScope: false,
|
|
47
49
|
skipFetch: false,
|
|
@@ -9,17 +9,19 @@ import { fileURLToPath } from "url";
|
|
|
9
9
|
import { Command } from "commander";
|
|
10
10
|
import { createAppContext } from "../composition-root.js";
|
|
11
11
|
import { GenerateConfigsStep } from "../orchestration/steps/generate-configs-step.js";
|
|
12
|
+
import { addOutputDirOption } from "./shared/options.js";
|
|
13
|
+
import { resolveOutputDir } from "./shared/resolve-output-dir.js";
|
|
12
14
|
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
13
15
|
const ROOT = resolve(__dirname, "..", "..");
|
|
14
16
|
export function createGenerateConfigsCommand() {
|
|
15
|
-
|
|
17
|
+
const cmd = new Command("generate-configs")
|
|
16
18
|
.description("Generate promptfoo config files from config/models.yaml")
|
|
17
19
|
.option("-s, --source <name>", "Documentation source name")
|
|
18
20
|
.action(async (opts) => {
|
|
19
21
|
try {
|
|
20
22
|
const ctx = createAppContext({
|
|
21
23
|
rootDir: ROOT,
|
|
22
|
-
outputDir:
|
|
24
|
+
outputDir: resolveOutputDir(opts.outputDir),
|
|
23
25
|
mode: "literacy",
|
|
24
26
|
noAutoScope: false,
|
|
25
27
|
skipFetch: true,
|
|
@@ -58,4 +60,6 @@ export function createGenerateConfigsCommand() {
|
|
|
58
60
|
console.error(err.message);
|
|
59
61
|
}
|
|
60
62
|
});
|
|
63
|
+
addOutputDirOption(cmd);
|
|
64
|
+
return cmd;
|
|
61
65
|
}
|
|
@@ -21,6 +21,7 @@ import { buildStepSequence } from "../orchestration/build-step-sequence.js";
|
|
|
21
21
|
import { orchestratePipeline } from "../orchestration/pipeline-orchestrator.js";
|
|
22
22
|
import { load } from "js-yaml";
|
|
23
23
|
import { parseRepoConfig, } from "../adapters/task-sources/repo-schemas.js";
|
|
24
|
+
import { getCallerCwd, resolveOutputDir } from "./shared/resolve-output-dir.js";
|
|
24
25
|
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
25
26
|
const ROOT = resolve(__dirname, "..", "..");
|
|
26
27
|
// ---------------------------------------------------------------------------
|
|
@@ -35,7 +36,7 @@ const VALID_SEARCH_MODES = ["open", "origin-only", "off"];
|
|
|
35
36
|
*/
|
|
36
37
|
export function computeResolvedOptions(opts) {
|
|
37
38
|
// Resolve paths relative to the caller's cwd, not the eval package root
|
|
38
|
-
const callerCwd =
|
|
39
|
+
const callerCwd = getCallerCwd();
|
|
39
40
|
// Validate + normalize mode via the single boundary function.
|
|
40
41
|
// normalizeMode() maps legacy variant names (baseline, agentic, etc.)
|
|
41
42
|
// to canonical mode "literacy" + variant, and throws on invalid input.
|
|
@@ -209,23 +210,12 @@ export function computeResolvedOptions(opts) {
|
|
|
209
210
|
const remote = opts.remote || process.env.AILF_REMOTE === "1";
|
|
210
211
|
const apiUrl = opts.apiUrl ?? process.env.AILF_API_URL ?? "https://ailf-api.sanity.build";
|
|
211
212
|
const apiKey = process.env.AILF_API_KEY ?? undefined;
|
|
212
|
-
// Output directory: explicit
|
|
213
|
+
// Output directory: explicit --output-dir → $CWD/.ailf/results/latest/
|
|
213
214
|
const resolvedRepoTasksPath = opts.repoTasksPath
|
|
214
215
|
? resolve(callerCwd, opts.repoTasksPath)
|
|
215
216
|
: undefined;
|
|
216
217
|
const resolvedTaskSourceType = resolveTaskSourceType(opts.taskSource);
|
|
217
|
-
|
|
218
|
-
if (opts.outputDir) {
|
|
219
|
-
outputDir = resolve(callerCwd, opts.outputDir);
|
|
220
|
-
}
|
|
221
|
-
else if (resolvedTaskSourceType === "repo" || resolvedRepoTasksPath) {
|
|
222
|
-
outputDir = resolvedRepoTasksPath
|
|
223
|
-
? resolve(resolvedRepoTasksPath, "..", "results", "latest")
|
|
224
|
-
: resolve(callerCwd, ".ailf", "results", "latest");
|
|
225
|
-
}
|
|
226
|
-
else {
|
|
227
|
-
outputDir = resolve(ROOT, "results", "latest");
|
|
228
|
-
}
|
|
218
|
+
const outputDir = resolveOutputDir(opts.outputDir);
|
|
229
219
|
return {
|
|
230
220
|
allowedOriginArgs,
|
|
231
221
|
apiKey,
|
|
@@ -310,7 +300,7 @@ export async function executePipeline(cliOpts) {
|
|
|
310
300
|
}
|
|
311
301
|
const { FileConfigAdapter } = await import("../adapters/config-sources/file-config-adapter.js");
|
|
312
302
|
const { createAppContext } = await import("../composition-root.js");
|
|
313
|
-
const callerCwd =
|
|
303
|
+
const callerCwd = getCallerCwd();
|
|
314
304
|
const adapter = new FileConfigAdapter(cliOpts.config, ROOT);
|
|
315
305
|
const config = await adapter.resolve();
|
|
316
306
|
// Merge CLI-only flags that aren't in the config file.
|
|
@@ -323,13 +313,8 @@ export async function executePipeline(cliOpts) {
|
|
|
323
313
|
if (cliOpts.output) {
|
|
324
314
|
config.outputPath = resolve(callerCwd, cliOpts.output);
|
|
325
315
|
}
|
|
326
|
-
// Output dir: explicit CLI flag →
|
|
327
|
-
|
|
328
|
-
config.outputDir = resolve(callerCwd, cliOpts.outputDir);
|
|
329
|
-
}
|
|
330
|
-
else if (config.repoTasksPath) {
|
|
331
|
-
config.outputDir = resolve(config.repoTasksPath, "..", "results", "latest");
|
|
332
|
-
}
|
|
316
|
+
// Output dir: explicit CLI flag → $CWD/.ailf/results/latest/
|
|
317
|
+
config.outputDir = resolveOutputDir(cliOpts.outputDir);
|
|
333
318
|
// Create AppContext directly from the merged config so adapters
|
|
334
319
|
// (especially taskSource) are wired from the file config's
|
|
335
320
|
// taskSourceType — not from CLI defaults.
|
|
@@ -350,8 +335,7 @@ export async function executePipeline(cliOpts) {
|
|
|
350
335
|
// cache which never contains .ailf/.
|
|
351
336
|
if (o.remote) {
|
|
352
337
|
const { runRemotePipeline } = await import("./remote-pipeline.js");
|
|
353
|
-
|
|
354
|
-
await runRemotePipeline(o, callerCwd);
|
|
338
|
+
await runRemotePipeline(o, getCallerCwd());
|
|
355
339
|
return;
|
|
356
340
|
}
|
|
357
341
|
// Dry-run: validate only, don't execute steps
|
|
@@ -55,7 +55,7 @@ export function createPipelineCommand() {
|
|
|
55
55
|
.option("--remote", "Submit evaluation to the AILF API instead of running locally", false)
|
|
56
56
|
.option("--api-url <url>", "AILF API base URL (default: https://ailf-api.sanity.build)")
|
|
57
57
|
.option("--capture", "Enable artifact capture for this run", false)
|
|
58
|
-
.option("--capture-dir <path>", "Base directory for capture output (default: results/captures/)")
|
|
58
|
+
.option("--capture-dir <path>", "Base directory for capture output (default: .ailf/results/captures/)")
|
|
59
59
|
.option("--no-capture-compress", "Disable tar.gz compression of captures")
|
|
60
60
|
.option("--no-capture-extras", "Exclude mode-specific artifacts from captures")
|
|
61
61
|
.action(async (opts) => {
|
|
@@ -9,10 +9,12 @@ import { fileURLToPath } from "url";
|
|
|
9
9
|
import { Command } from "commander";
|
|
10
10
|
import { createAppContext } from "../composition-root.js";
|
|
11
11
|
import { generatePrComment } from "../pipeline/pr-comment.js";
|
|
12
|
+
import { addOutputDirOption } from "./shared/options.js";
|
|
13
|
+
import { resolveOutputDir } from "./shared/resolve-output-dir.js";
|
|
12
14
|
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
13
15
|
const ROOT = resolve(__dirname, "..", "..");
|
|
14
16
|
export function createPrCommentCommand() {
|
|
15
|
-
|
|
17
|
+
const cmd = new Command("pr-comment")
|
|
16
18
|
.description("Generate a markdown PR comment from evaluation scores")
|
|
17
19
|
.option("--output <path>", "Write comment to file (default: stdout)")
|
|
18
20
|
.option("--promptfoo-url <url>", "Promptfoo share URL to include")
|
|
@@ -20,7 +22,7 @@ export function createPrCommentCommand() {
|
|
|
20
22
|
try {
|
|
21
23
|
const ctx = createAppContext({
|
|
22
24
|
rootDir: ROOT,
|
|
23
|
-
outputDir:
|
|
25
|
+
outputDir: resolveOutputDir(opts.outputDir),
|
|
24
26
|
mode: "literacy",
|
|
25
27
|
noAutoScope: false,
|
|
26
28
|
skipFetch: true,
|
|
@@ -48,4 +50,6 @@ export function createPrCommentCommand() {
|
|
|
48
50
|
console.error(err.message);
|
|
49
51
|
}
|
|
50
52
|
});
|
|
53
|
+
addOutputDirOption(cmd);
|
|
54
|
+
return cmd;
|
|
51
55
|
}
|
package/dist/commands/publish.js
CHANGED
|
@@ -23,22 +23,27 @@ import { dirname, resolve } from "path";
|
|
|
23
23
|
import { fileURLToPath } from "url";
|
|
24
24
|
import { Command } from "commander";
|
|
25
25
|
import { createAppContext } from "../composition-root.js";
|
|
26
|
+
import { addOutputDirOption } from "./shared/options.js";
|
|
27
|
+
import { getCallerCwd, resolveOutputDir } from "./shared/resolve-output-dir.js";
|
|
26
28
|
import { buildProvenance, } from "../pipeline/provenance.js";
|
|
27
29
|
import { generateReportTitle } from "../pipeline/report-title.js";
|
|
28
30
|
import { generateReportId, } from "../report-store.js";
|
|
29
31
|
import { withRetry } from "../sinks/retry.js";
|
|
30
32
|
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
31
33
|
const ROOT = resolve(__dirname, "..", "..");
|
|
32
|
-
const DEFAULT_SUMMARY_PATH = resolve(ROOT, "results", "latest", "score-summary.json");
|
|
33
34
|
export function createPublishCommand() {
|
|
34
|
-
|
|
35
|
+
const cmd = new Command("publish")
|
|
35
36
|
.description("Publish a local evaluation report to the Sanity Content Lake")
|
|
36
|
-
.argument("[summary-path]", "Path to score-summary.json"
|
|
37
|
+
.argument("[summary-path]", "Path to score-summary.json")
|
|
37
38
|
.option("-t, --tag <tag>", "Label for the published report")
|
|
38
39
|
.option("-n, --dry-run", "Preview the report without writing to Sanity or sinks", false)
|
|
39
40
|
.action(async (summaryPath, opts) => {
|
|
40
|
-
|
|
41
|
+
const outputDir = resolveOutputDir(opts.outputDir);
|
|
42
|
+
const effectivePath = summaryPath ?? resolve(outputDir, "score-summary.json");
|
|
43
|
+
await runPublishCommand(effectivePath, outputDir, opts);
|
|
41
44
|
});
|
|
45
|
+
addOutputDirOption(cmd);
|
|
46
|
+
return cmd;
|
|
42
47
|
}
|
|
43
48
|
// ---------------------------------------------------------------------------
|
|
44
49
|
// Provenance builder (from score summary, not full pipeline context)
|
|
@@ -77,7 +82,7 @@ function buildProvenanceFromSummary(summary) {
|
|
|
77
82
|
// ---------------------------------------------------------------------------
|
|
78
83
|
// Command implementation
|
|
79
84
|
// ---------------------------------------------------------------------------
|
|
80
|
-
async function runPublishCommand(summaryPath, opts) {
|
|
85
|
+
async function runPublishCommand(summaryPath, outputDir, opts) {
|
|
81
86
|
// Wire up infrastructure via composition root
|
|
82
87
|
const ctx = createAppContext({
|
|
83
88
|
compareEnabled: false,
|
|
@@ -87,7 +92,7 @@ async function runPublishCommand(summaryPath, opts) {
|
|
|
87
92
|
noAutoScope: false,
|
|
88
93
|
noCache: true,
|
|
89
94
|
noRemoteCache: true,
|
|
90
|
-
outputDir
|
|
95
|
+
outputDir,
|
|
91
96
|
publishEnabled: true,
|
|
92
97
|
publishTag: opts.tag,
|
|
93
98
|
readinessEnabled: false,
|
|
@@ -106,8 +111,7 @@ async function runPublishCommand(summaryPath, opts) {
|
|
|
106
111
|
// -----------------------------------------------------------------------
|
|
107
112
|
// 1. Resolve and read the score summary
|
|
108
113
|
// -----------------------------------------------------------------------
|
|
109
|
-
const
|
|
110
|
-
const resolvedPath = resolve(callerCwd, summaryPath);
|
|
114
|
+
const resolvedPath = resolve(getCallerCwd(), summaryPath);
|
|
111
115
|
if (!existsSync(resolvedPath)) {
|
|
112
116
|
console.error(` ✖ File not found: ${resolvedPath}`);
|
|
113
117
|
console.error();
|