@sanity/ailf 0.1.34 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/config/airbyte/ai_literacy_framework.connector.yaml +6 -0
- package/config/bigquery/views/reports.sql +1 -0
- package/dist/_vendor/ailf-core/examples/index.d.ts +10 -20
- package/dist/_vendor/ailf-core/examples/index.js +10 -20
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +2 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +65 -0
- package/dist/_vendor/ailf-tasks/schemas.d.ts +12 -0
- package/dist/_vendor/ailf-tasks/schemas.js +4 -0
- package/dist/adapters/task-sources/content-lake-task-source.js +9 -1
- package/dist/adapters/task-sources/repo-task-source.js +19 -4
- package/dist/commands/calculate-scores.js +5 -1
- package/dist/commands/publish.js +3 -0
- package/dist/composition-root.js +7 -2
- package/dist/orchestration/pipeline-orchestrator.js +27 -2
- package/dist/orchestration/step-runner.js +8 -0
- package/dist/orchestration/steps/calculate-scores-step.js +22 -19
- package/dist/orchestration/steps/generate-configs-step.js +1 -0
- package/dist/orchestration/steps/grader-consistency-step.js +1 -0
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +2 -1
- package/dist/orchestration/steps/publish-report-step.js +3 -0
- package/dist/pipeline/calculate-scores.d.ts +11 -1
- package/dist/pipeline/calculate-scores.js +222 -157
- package/dist/pipeline/coverage-audit.d.ts +2 -1
- package/dist/pipeline/coverage-audit.js +5 -3
- package/dist/pipeline/expand-tasks.d.ts +2 -1
- package/dist/pipeline/expand-tasks.js +33 -2
- package/dist/pipeline/generate-configs.d.ts +3 -1
- package/dist/pipeline/generate-configs.js +51 -37
- package/dist/pipeline/grader-api.d.ts +2 -1
- package/dist/pipeline/grader-api.js +11 -9
- package/dist/pipeline/grader-compare-runner.d.ts +3 -0
- package/dist/pipeline/grader-compare-runner.js +21 -19
- package/dist/pipeline/grader-consistency-runner.d.ts +3 -0
- package/dist/pipeline/grader-consistency-runner.js +16 -14
- package/dist/pipeline/grader-sensitivity-runner.d.ts +3 -0
- package/dist/pipeline/grader-sensitivity-runner.js +18 -16
- package/dist/pipeline/grader-validate-runner.d.ts +3 -0
- package/dist/pipeline/grader-validate-runner.js +16 -14
- package/dist/pipeline/mirror-repo-tasks.d.ts +80 -1
- package/dist/pipeline/mirror-repo-tasks.js +148 -32
- package/dist/pipeline/provenance.d.ts +3 -0
- package/dist/pipeline/provenance.js +25 -3
- package/dist/pipeline/report-title.d.ts +66 -0
- package/dist/pipeline/report-title.js +118 -0
- package/dist/report-store.js +2 -0
- package/dist/sinks/bigquery/index.d.ts +1 -0
- package/dist/sinks/bigquery/index.js +1 -0
- package/dist/sources.d.ts +2 -1
- package/dist/sources.js +28 -1
- package/package.json +23 -23
|
package/dist/commands/calculate-scores.js
CHANGED
|
@@ -36,11 +36,15 @@ export function createCalculateScoresCommand() {
|
|
|
36
36
|
remote: false,
|
|
37
37
|
apiUrl: "https://ailf-api.sanity.build",
|
|
38
38
|
});
|
|
39
|
-
calculateAndWriteScores({
|
|
39
|
+
const result = calculateAndWriteScores({
|
|
40
40
|
resultsPath,
|
|
41
41
|
rootDir: ctx.config.rootDir,
|
|
42
42
|
source: opts.source,
|
|
43
43
|
});
|
|
44
|
+
// At the CLI boundary, exit non-zero if areas are below threshold
|
|
45
|
+
if (result.belowCritical.length > 0) {
|
|
46
|
+
process.exitCode = 1;
|
|
47
|
+
}
|
|
44
48
|
}
|
|
45
49
|
catch (err) {
|
|
46
50
|
process.exitCode = 1;
|
package/dist/commands/publish.js
CHANGED
|
@@ -24,6 +24,7 @@ import { fileURLToPath } from "url";
|
|
|
24
24
|
import { Command } from "commander";
|
|
25
25
|
import { createAppContext } from "../composition-root.js";
|
|
26
26
|
import { buildProvenance, } from "../pipeline/provenance.js";
|
|
27
|
+
import { generateReportTitle } from "../pipeline/report-title.js";
|
|
27
28
|
import { generateReportId, } from "../report-store.js";
|
|
28
29
|
import { withRetry } from "../sinks/retry.js";
|
|
29
30
|
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
@@ -166,6 +167,7 @@ async function runPublishCommand(summaryPath, opts) {
|
|
|
166
167
|
};
|
|
167
168
|
}
|
|
168
169
|
const reportId = generateReportId();
|
|
170
|
+
const title = generateReportTitle({ provenance });
|
|
169
171
|
const report = {
|
|
170
172
|
comparison: comparison ?? undefined,
|
|
171
173
|
completedAt: now,
|
|
@@ -174,6 +176,7 @@ async function runPublishCommand(summaryPath, opts) {
|
|
|
174
176
|
provenance,
|
|
175
177
|
summary,
|
|
176
178
|
tag: opts.tag,
|
|
179
|
+
title,
|
|
177
180
|
};
|
|
178
181
|
// -----------------------------------------------------------------------
|
|
179
182
|
// 4. Dry run — print preview and exit
|
package/dist/composition-root.js
CHANGED
|
@@ -63,9 +63,14 @@ export function createAppContext(config) {
|
|
|
63
63
|
function createLogger() {
|
|
64
64
|
if (process.env.AILF_LOG_FORMAT === "json")
|
|
65
65
|
return new JsonLogger();
|
|
66
|
-
if (process.env.
|
|
66
|
+
if (process.env.AILF_LOG_LEVEL === "quiet" ||
|
|
67
|
+
process.env.AILF_QUIET === "1") {
|
|
67
68
|
return new QuietLogger();
|
|
68
|
-
|
|
69
|
+
}
|
|
70
|
+
return new ConsoleLogger({
|
|
71
|
+
verbose: process.env.AILF_LOG_LEVEL === "verbose" ||
|
|
72
|
+
process.env.AILF_VERBOSE === "1",
|
|
73
|
+
});
|
|
69
74
|
}
|
|
70
75
|
function createCache(config) {
|
|
71
76
|
const local = new FilesystemCache(config.rootDir);
|
|
package/dist/orchestration/pipeline-orchestrator.js
CHANGED
|
@@ -77,12 +77,20 @@ export async function orchestratePipeline(ctx, steps) {
|
|
|
77
77
|
const pipelineStart = Date.now();
|
|
78
78
|
const hasJob = !!ctx.config.jobId;
|
|
79
79
|
ctx.logger.section("ai-literacy-framework — Evaluation Pipeline");
|
|
80
|
+
ctx.logger.debug(`Pipeline starting with ${steps.length} steps`, {
|
|
81
|
+
steps: steps.map((s) => s.name),
|
|
82
|
+
mode: ctx.config.mode,
|
|
83
|
+
source: ctx.config.source,
|
|
84
|
+
noCache: ctx.config.noCache,
|
|
85
|
+
jobId: ctx.config.jobId,
|
|
86
|
+
});
|
|
80
87
|
// Report initial running status
|
|
81
88
|
if (hasJob) {
|
|
82
89
|
await reportJobProgress(ctx, steps[0]?.name ?? "init", 0, steps.length, "running");
|
|
83
90
|
}
|
|
84
91
|
for (let i = 0; i < steps.length; i++) {
|
|
85
92
|
const step = steps[i];
|
|
93
|
+
ctx.logger.debug(`Starting step ${i + 1}/${steps.length}: ${step.name}`);
|
|
86
94
|
ctx.logger.section(step.name);
|
|
87
95
|
// Report current step progress
|
|
88
96
|
if (hasJob) {
|
|
@@ -97,16 +105,23 @@ export async function orchestratePipeline(ctx, steps) {
|
|
|
97
105
|
// Fail fast on required step failure
|
|
98
106
|
if (result.status === "failed" && !step.optional) {
|
|
99
107
|
ctx.logger.error(`Pipeline aborted: ${step.name} failed`);
|
|
108
|
+
const failedError = result.status === "failed" ? result.error : `${step.name} failed`;
|
|
100
109
|
// Report failure to job store
|
|
101
110
|
if (hasJob) {
|
|
102
|
-
const errorMsg = result.status === "failed" ? result.error : `${step.name} failed`;
|
|
103
111
|
await reportJobProgress(ctx, step.name, i + 1, steps.length, "failed", {
|
|
104
|
-
message: errorMsg,
|
|
112
|
+
message: failedError,
|
|
105
113
|
step: step.name,
|
|
106
114
|
});
|
|
107
115
|
}
|
|
108
116
|
return {
|
|
117
|
+
belowCritical: state.belowCritical,
|
|
109
118
|
durationMs: Date.now() - pipelineStart,
|
|
119
|
+
failureReason: {
|
|
120
|
+
type: "step-failed",
|
|
121
|
+
step: step.name,
|
|
122
|
+
message: failedError,
|
|
123
|
+
},
|
|
124
|
+
promptfooUrls: state.promptfooUrls,
|
|
110
125
|
steps: results,
|
|
111
126
|
success: false,
|
|
112
127
|
validation,
|
|
@@ -120,6 +135,13 @@ export async function orchestratePipeline(ctx, steps) {
|
|
|
120
135
|
const durationMs = Date.now() - pipelineStart;
|
|
121
136
|
ctx.logger.section("Pipeline Complete");
|
|
122
137
|
ctx.logger.info(`All steps completed in ${durationMs}ms`);
|
|
138
|
+
ctx.logger.debug("Pipeline state at completion", {
|
|
139
|
+
belowCritical: state.belowCritical,
|
|
140
|
+
reportId: state.reportId,
|
|
141
|
+
remoteCacheHits: state.remoteCacheHits
|
|
142
|
+
? [...state.remoteCacheHits]
|
|
143
|
+
: undefined,
|
|
144
|
+
});
|
|
123
145
|
// Report completion to job store (with reportId from state if available)
|
|
124
146
|
if (hasJob) {
|
|
125
147
|
try {
|
|
@@ -145,9 +167,12 @@ export async function orchestratePipeline(ctx, steps) {
|
|
|
145
167
|
}
|
|
146
168
|
}
|
|
147
169
|
return {
|
|
170
|
+
belowCritical: state.belowCritical,
|
|
148
171
|
durationMs,
|
|
172
|
+
promptfooUrls: state.promptfooUrls,
|
|
149
173
|
steps: results,
|
|
150
174
|
success: true,
|
|
175
|
+
testSummary: state.testSummary,
|
|
151
176
|
validation,
|
|
152
177
|
};
|
|
153
178
|
}
|
|
package/dist/orchestration/step-runner.js
CHANGED
|
@@ -36,10 +36,13 @@ export async function runStep(step, ctx, state = {}) {
|
|
|
36
36
|
if (canCache) {
|
|
37
37
|
try {
|
|
38
38
|
const inputs = step.cacheInputs(ctx);
|
|
39
|
+
ctx.logger.debug(`[${step.name}] Cache inputs: ${inputs.length} files`);
|
|
39
40
|
const key = await ctx.cache.computeKey(inputs);
|
|
40
41
|
cacheKey = key;
|
|
42
|
+
ctx.logger.debug(`[${step.name}] Cache key: ${key}`);
|
|
41
43
|
const cached = await ctx.cache.lookup(step.name, key);
|
|
42
44
|
if (cached.hit) {
|
|
45
|
+
ctx.logger.debug(`[${step.name}] Cache HIT — skipping execution`);
|
|
43
46
|
const result = {
|
|
44
47
|
durationMs: Date.now() - start,
|
|
45
48
|
status: "success",
|
|
@@ -48,11 +51,16 @@ export async function runStep(step, ctx, state = {}) {
|
|
|
48
51
|
ctx.logger.step(step.name, result);
|
|
49
52
|
return result;
|
|
50
53
|
}
|
|
54
|
+
ctx.logger.debug(`[${step.name}] Cache MISS — executing`);
|
|
51
55
|
}
|
|
52
56
|
catch {
|
|
53
57
|
// Cache lookup failure is non-fatal — proceed to execute
|
|
58
|
+
ctx.logger.debug(`[${step.name}] Cache lookup failed — proceeding`);
|
|
54
59
|
}
|
|
55
60
|
}
|
|
61
|
+
else {
|
|
62
|
+
ctx.logger.debug(`[${step.name}] Cache skipped (${!ctx.cache ? "no cache adapter" : ctx.config.noCache ? "--no-cache" : "no cacheInputs"})`);
|
|
63
|
+
}
|
|
56
64
|
// 3. Execute
|
|
57
65
|
try {
|
|
58
66
|
const result = await step.execute(ctx, state);
|
|
package/dist/orchestration/steps/calculate-scores-step.js
CHANGED
|
@@ -63,9 +63,11 @@ export class CalculateScoresStep {
|
|
|
63
63
|
catch {
|
|
64
64
|
// Non-fatal — proceed without source metadata
|
|
65
65
|
}
|
|
66
|
+
let belowCritical = [];
|
|
66
67
|
try {
|
|
67
|
-
calculateAndWriteScores({
|
|
68
|
+
const result = calculateAndWriteScores({
|
|
68
69
|
allowedOrigins: ctx.config.allowedOrigins,
|
|
70
|
+
logger: ctx.logger,
|
|
69
71
|
mode: ctx.config.mode,
|
|
70
72
|
resolvedSource,
|
|
71
73
|
resultsPath: primaryMode !== "baseline"
|
|
@@ -75,25 +77,17 @@ export class CalculateScoresStep {
|
|
|
75
77
|
searchMode: ctx.config.searchMode,
|
|
76
78
|
source: ctx.config.source,
|
|
77
79
|
});
|
|
80
|
+
belowCritical = result.belowCritical;
|
|
81
|
+
if (result.testSummary) {
|
|
82
|
+
state.testSummary = result.testSummary;
|
|
83
|
+
}
|
|
78
84
|
}
|
|
79
85
|
catch (err) {
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
:
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
durationMs: Date.now() - start,
|
|
86
|
-
error: `calculate-scores failed with exit code ${code}`,
|
|
87
|
-
status: "failed",
|
|
88
|
-
};
|
|
89
|
-
}
|
|
90
|
-
if (code === undefined) {
|
|
91
|
-
return {
|
|
92
|
-
durationMs: Date.now() - start,
|
|
93
|
-
error: `calculate-scores failed: ${err instanceof Error ? err.message : String(err)}`,
|
|
94
|
-
status: "failed",
|
|
95
|
-
};
|
|
96
|
-
}
|
|
86
|
+
return {
|
|
87
|
+
durationMs: Date.now() - start,
|
|
88
|
+
error: `calculate-scores failed: ${err instanceof Error ? err.message : String(err)}`,
|
|
89
|
+
status: "failed",
|
|
90
|
+
};
|
|
97
91
|
}
|
|
98
92
|
// Postcondition: score summary exists and is valid
|
|
99
93
|
const summaryIssues = checkScoreSummaryValid(ctx.config.rootDir);
|
|
@@ -105,10 +99,19 @@ export class CalculateScoresStep {
|
|
|
105
99
|
status: "failed",
|
|
106
100
|
};
|
|
107
101
|
}
|
|
102
|
+
// Propagate belowCritical into pipeline state for downstream consumers
|
|
103
|
+
// (e.g., orchestrator reporting, publish step metadata).
|
|
104
|
+
// This is informational — the pipeline continues to run subsequent steps.
|
|
105
|
+
if (belowCritical.length > 0) {
|
|
106
|
+
state.belowCritical = belowCritical;
|
|
107
|
+
}
|
|
108
|
+
const criticalSuffix = belowCritical.length > 0
|
|
109
|
+
? ` (${belowCritical.length} area(s) below critical threshold: ${belowCritical.join(", ")})`
|
|
110
|
+
: "";
|
|
108
111
|
return {
|
|
109
112
|
durationMs: Date.now() - start,
|
|
110
113
|
status: "success",
|
|
111
|
-
summary:
|
|
114
|
+
summary: `Scores calculated and summary written${criticalSuffix}`,
|
|
112
115
|
};
|
|
113
116
|
}
|
|
114
117
|
cacheInputs(ctx) {
|
|
package/dist/orchestration/steps/publish-report-step.js
CHANGED
|
@@ -14,6 +14,7 @@ import { readFileSync } from "fs";
|
|
|
14
14
|
import { resolve } from "path";
|
|
15
15
|
import { checkScoreSummaryValid } from "../../pipeline/checks.js";
|
|
16
16
|
import { buildProvenance, } from "../../pipeline/provenance.js";
|
|
17
|
+
import { generateReportTitle } from "../../pipeline/report-title.js";
|
|
17
18
|
import { generateReportId } from "../../report-store.js";
|
|
18
19
|
import { withRetry } from "../../sinks/retry.js";
|
|
19
20
|
export class PublishReportStep {
|
|
@@ -101,6 +102,7 @@ export class PublishReportStep {
|
|
|
101
102
|
comparedAgainst: autoCompareResult.baselineReportId,
|
|
102
103
|
};
|
|
103
104
|
}
|
|
105
|
+
const title = generateReportTitle({ provenance });
|
|
104
106
|
const report = {
|
|
105
107
|
comparison: comparison ?? undefined,
|
|
106
108
|
completedAt: now,
|
|
@@ -109,6 +111,7 @@ export class PublishReportStep {
|
|
|
109
111
|
provenance,
|
|
110
112
|
summary,
|
|
111
113
|
tag: this.options.publishTag ?? ctx.config.publishTag,
|
|
114
|
+
title,
|
|
112
115
|
};
|
|
113
116
|
// Share reportId with downstream steps (CallbackStep + orchestrator job update)
|
|
114
117
|
state.reportId = reportId;
|
|
package/dist/pipeline/calculate-scores.d.ts
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import type { Logger, TestSummary } from "../_vendor/ailf-core/index.d.ts";
|
|
1
2
|
import { type ResolvedSourceConfig } from "../sources.js";
|
|
2
3
|
import { type ActualScoreEntry, type ComponentResult } from "../_vendor/ailf-core/index.d.ts";
|
|
3
4
|
import type { GraderJudgment, PerModelEntry } from "./types.js";
|
|
@@ -86,6 +87,8 @@ export declare function scoreAgenticResults(resultsPath: string, weights: Record
|
|
|
86
87
|
export interface CalculateScoresOptions {
|
|
87
88
|
/** Allowed origins for source isolation reporting */
|
|
88
89
|
allowedOrigins?: string[];
|
|
90
|
+
/** Logger instance (defaults to ConsoleLogger if not provided) */
|
|
91
|
+
logger?: Logger;
|
|
89
92
|
/** Evaluation mode (controls which result files are read) */
|
|
90
93
|
mode?: string;
|
|
91
94
|
/** Pre-resolved source config (skips loadSource() call) */
|
|
@@ -99,4 +102,11 @@ export interface CalculateScoresOptions {
|
|
|
99
102
|
/** Documentation source name */
|
|
100
103
|
source?: string;
|
|
101
104
|
}
|
|
102
|
-
|
|
105
|
+
/** Result from calculateAndWriteScores — replaces process.exit() calls. */
|
|
106
|
+
export interface CalculateScoresResult {
|
|
107
|
+
/** Feature areas that scored below the critical threshold (40). */
|
|
108
|
+
belowCritical: string[];
|
|
109
|
+
/** Summary of test execution outcomes (total, passed, failed, errored). */
|
|
110
|
+
testSummary?: TestSummary;
|
|
111
|
+
}
|
|
112
|
+
export declare function calculateAndWriteScores(options: CalculateScoresOptions): CalculateScoresResult;
|