@sanity/ailf 0.2.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/models.yaml +3 -2
- package/dist/_vendor/ailf-core/types/index.d.ts +53 -0
- package/dist/composition-root.js +7 -2
- package/dist/orchestration/pipeline-orchestrator.js +27 -2
- package/dist/orchestration/step-runner.js +8 -0
- package/dist/orchestration/steps/calculate-scores-step.js +4 -0
- package/dist/orchestration/steps/generate-configs-step.js +1 -0
- package/dist/orchestration/steps/grader-consistency-step.js +1 -0
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +2 -1
- package/dist/pipeline/calculate-scores.d.ts +5 -0
- package/dist/pipeline/calculate-scores.js +219 -146
- package/dist/pipeline/coverage-audit.d.ts +2 -1
- package/dist/pipeline/coverage-audit.js +5 -3
- package/dist/pipeline/expand-tasks.d.ts +2 -1
- package/dist/pipeline/expand-tasks.js +33 -2
- package/dist/pipeline/generate-configs.d.ts +3 -1
- package/dist/pipeline/generate-configs.js +47 -28
- package/dist/pipeline/grader-api.d.ts +2 -1
- package/dist/pipeline/grader-api.js +11 -9
- package/dist/pipeline/grader-compare-runner.d.ts +3 -0
- package/dist/pipeline/grader-compare-runner.js +21 -19
- package/dist/pipeline/grader-consistency-runner.d.ts +3 -0
- package/dist/pipeline/grader-consistency-runner.js +16 -14
- package/dist/pipeline/grader-sensitivity-runner.d.ts +3 -0
- package/dist/pipeline/grader-sensitivity-runner.js +18 -16
- package/dist/pipeline/grader-validate-runner.d.ts +3 -0
- package/dist/pipeline/grader-validate-runner.js +16 -14
- package/dist/pipeline/mirror-repo-tasks.d.ts +3 -1
- package/dist/pipeline/mirror-repo-tasks.js +8 -6
- package/dist/pipeline/provenance.d.ts +3 -0
- package/dist/pipeline/provenance.js +25 -3
- package/dist/sources.d.ts +2 -1
- package/dist/sources.js +28 -1
- package/package.json +3 -3
package/config/models.yaml
CHANGED
|
@@ -46,10 +46,11 @@ models:
|
|
|
46
46
|
max_tokens: 4096
|
|
47
47
|
modes: [baseline, observed, agentic-naive, agentic-optimized]
|
|
48
48
|
- id: openai:chat:gpt-5.4
|
|
49
|
-
label: GPT 5.4
|
|
49
|
+
label: GPT 5.4
|
|
50
50
|
config:
|
|
51
|
-
reasoning_effort: "
|
|
51
|
+
reasoning_effort: "medium"
|
|
52
52
|
max_output_tokens: 4096
|
|
53
|
+
maxRetries: 1
|
|
53
54
|
modes: [baseline, observed, agentic-naive, agentic-optimized]
|
|
54
55
|
|
|
55
56
|
# ── Anthropic ───────────────────────────────────────────────
|
|
@@ -462,6 +462,12 @@ export interface PipelineState {
|
|
|
462
462
|
* below threshold — this is informational, not a hard failure.
|
|
463
463
|
*/
|
|
464
464
|
belowCritical?: string[];
|
|
465
|
+
/**
|
|
466
|
+
* Test execution summary. Set by CalculateScoresStep which reads
|
|
467
|
+
* the eval results and counts passed/failed/errored tests.
|
|
468
|
+
* Consumed by the orchestrator for the enriched PipelineResult.
|
|
469
|
+
*/
|
|
470
|
+
testSummary?: TestSummary;
|
|
465
471
|
}
|
|
466
472
|
/**
|
|
467
473
|
* Release auto-scope metadata — which tasks are affected by a content
|
|
@@ -484,7 +490,48 @@ export interface ReleaseAutoScope {
|
|
|
484
490
|
};
|
|
485
491
|
}
|
|
486
492
|
/** Result of a full pipeline run */
|
|
493
|
+
/** Classified failure reason for a pipeline run. */
|
|
494
|
+
export type PipelineFailureReason = {
|
|
495
|
+
type: "validation";
|
|
496
|
+
message: string;
|
|
497
|
+
} | {
|
|
498
|
+
type: "missing-results";
|
|
499
|
+
message: string;
|
|
500
|
+
} | {
|
|
501
|
+
type: "step-failed";
|
|
502
|
+
step: string;
|
|
503
|
+
message: string;
|
|
504
|
+
} | {
|
|
505
|
+
type: "all-tests-errored";
|
|
506
|
+
message: string;
|
|
507
|
+
};
|
|
508
|
+
/** Summary of test execution outcomes. */
|
|
509
|
+
export interface TestSummary {
|
|
510
|
+
/** Total test cases executed */
|
|
511
|
+
total: number;
|
|
512
|
+
/** Tests that passed grading */
|
|
513
|
+
passed: number;
|
|
514
|
+
/** Tests that failed grading */
|
|
515
|
+
failed: number;
|
|
516
|
+
/** Tests that errored (API timeout, malformed response, etc.) */
|
|
517
|
+
errored: number;
|
|
518
|
+
/** Details of errored tests (model, task, error message) */
|
|
519
|
+
errors?: {
|
|
520
|
+
model: string;
|
|
521
|
+
task: string;
|
|
522
|
+
error: string;
|
|
523
|
+
}[];
|
|
524
|
+
}
|
|
525
|
+
/** Token usage and estimated cost for a pipeline run. */
|
|
526
|
+
export interface PipelineUsage {
|
|
527
|
+
totalTokens: number;
|
|
528
|
+
evalTokens: number;
|
|
529
|
+
graderTokens: number;
|
|
530
|
+
estimatedCostUsd?: number;
|
|
531
|
+
}
|
|
487
532
|
export interface PipelineResult {
|
|
533
|
+
/** Feature areas that scored below the critical threshold. Informational — not a failure. */
|
|
534
|
+
belowCritical?: string[];
|
|
488
535
|
/** Cache hit/miss statistics for this run */
|
|
489
536
|
cache?: {
|
|
490
537
|
hits: number;
|
|
@@ -494,6 +541,8 @@ export interface PipelineResult {
|
|
|
494
541
|
};
|
|
495
542
|
/** Total duration in milliseconds */
|
|
496
543
|
durationMs: number;
|
|
544
|
+
/** Classified failure reason (when success is false). */
|
|
545
|
+
failureReason?: PipelineFailureReason;
|
|
497
546
|
/** @deprecated Use `promptfooUrls` — kept for backward compatibility */
|
|
498
547
|
promptfooUrl?: string;
|
|
499
548
|
/** Per-mode Promptfoo share URLs (one per sub-eval that produced a shareable link) */
|
|
@@ -502,6 +551,10 @@ export interface PipelineResult {
|
|
|
502
551
|
steps: Record<string, StepResult>;
|
|
503
552
|
/** Overall success (all non-skipped steps succeeded) */
|
|
504
553
|
success: boolean;
|
|
554
|
+
/** Summary of test execution outcomes. */
|
|
555
|
+
testSummary?: TestSummary;
|
|
556
|
+
/** Token usage and estimated cost. */
|
|
557
|
+
usage?: PipelineUsage;
|
|
505
558
|
/** Validation issues found (if any) */
|
|
506
559
|
validation: ValidationResult;
|
|
507
560
|
}
|
package/dist/composition-root.js
CHANGED
|
@@ -63,9 +63,14 @@ export function createAppContext(config) {
|
|
|
63
63
|
function createLogger() {
|
|
64
64
|
if (process.env.AILF_LOG_FORMAT === "json")
|
|
65
65
|
return new JsonLogger();
|
|
66
|
-
if (process.env.
|
|
66
|
+
if (process.env.AILF_LOG_LEVEL === "quiet" ||
|
|
67
|
+
process.env.AILF_QUIET === "1") {
|
|
67
68
|
return new QuietLogger();
|
|
68
|
-
|
|
69
|
+
}
|
|
70
|
+
return new ConsoleLogger({
|
|
71
|
+
verbose: process.env.AILF_LOG_LEVEL === "verbose" ||
|
|
72
|
+
process.env.AILF_VERBOSE === "1",
|
|
73
|
+
});
|
|
69
74
|
}
|
|
70
75
|
function createCache(config) {
|
|
71
76
|
const local = new FilesystemCache(config.rootDir);
|
|
@@ -77,12 +77,20 @@ export async function orchestratePipeline(ctx, steps) {
|
|
|
77
77
|
const pipelineStart = Date.now();
|
|
78
78
|
const hasJob = !!ctx.config.jobId;
|
|
79
79
|
ctx.logger.section("ai-literacy-framework — Evaluation Pipeline");
|
|
80
|
+
ctx.logger.debug(`Pipeline starting with ${steps.length} steps`, {
|
|
81
|
+
steps: steps.map((s) => s.name),
|
|
82
|
+
mode: ctx.config.mode,
|
|
83
|
+
source: ctx.config.source,
|
|
84
|
+
noCache: ctx.config.noCache,
|
|
85
|
+
jobId: ctx.config.jobId,
|
|
86
|
+
});
|
|
80
87
|
// Report initial running status
|
|
81
88
|
if (hasJob) {
|
|
82
89
|
await reportJobProgress(ctx, steps[0]?.name ?? "init", 0, steps.length, "running");
|
|
83
90
|
}
|
|
84
91
|
for (let i = 0; i < steps.length; i++) {
|
|
85
92
|
const step = steps[i];
|
|
93
|
+
ctx.logger.debug(`Starting step ${i + 1}/${steps.length}: ${step.name}`);
|
|
86
94
|
ctx.logger.section(step.name);
|
|
87
95
|
// Report current step progress
|
|
88
96
|
if (hasJob) {
|
|
@@ -97,16 +105,23 @@ export async function orchestratePipeline(ctx, steps) {
|
|
|
97
105
|
// Fail fast on required step failure
|
|
98
106
|
if (result.status === "failed" && !step.optional) {
|
|
99
107
|
ctx.logger.error(`Pipeline aborted: ${step.name} failed`);
|
|
108
|
+
const failedError = result.status === "failed" ? result.error : `${step.name} failed`;
|
|
100
109
|
// Report failure to job store
|
|
101
110
|
if (hasJob) {
|
|
102
|
-
const errorMsg = result.status === "failed" ? result.error : `${step.name} failed`;
|
|
103
111
|
await reportJobProgress(ctx, step.name, i + 1, steps.length, "failed", {
|
|
104
|
-
message:
|
|
112
|
+
message: failedError,
|
|
105
113
|
step: step.name,
|
|
106
114
|
});
|
|
107
115
|
}
|
|
108
116
|
return {
|
|
117
|
+
belowCritical: state.belowCritical,
|
|
109
118
|
durationMs: Date.now() - pipelineStart,
|
|
119
|
+
failureReason: {
|
|
120
|
+
type: "step-failed",
|
|
121
|
+
step: step.name,
|
|
122
|
+
message: failedError,
|
|
123
|
+
},
|
|
124
|
+
promptfooUrls: state.promptfooUrls,
|
|
110
125
|
steps: results,
|
|
111
126
|
success: false,
|
|
112
127
|
validation,
|
|
@@ -120,6 +135,13 @@ export async function orchestratePipeline(ctx, steps) {
|
|
|
120
135
|
const durationMs = Date.now() - pipelineStart;
|
|
121
136
|
ctx.logger.section("Pipeline Complete");
|
|
122
137
|
ctx.logger.info(`All steps completed in ${durationMs}ms`);
|
|
138
|
+
ctx.logger.debug("Pipeline state at completion", {
|
|
139
|
+
belowCritical: state.belowCritical,
|
|
140
|
+
reportId: state.reportId,
|
|
141
|
+
remoteCacheHits: state.remoteCacheHits
|
|
142
|
+
? [...state.remoteCacheHits]
|
|
143
|
+
: undefined,
|
|
144
|
+
});
|
|
123
145
|
// Report completion to job store (with reportId from state if available)
|
|
124
146
|
if (hasJob) {
|
|
125
147
|
try {
|
|
@@ -145,9 +167,12 @@ export async function orchestratePipeline(ctx, steps) {
|
|
|
145
167
|
}
|
|
146
168
|
}
|
|
147
169
|
return {
|
|
170
|
+
belowCritical: state.belowCritical,
|
|
148
171
|
durationMs,
|
|
172
|
+
promptfooUrls: state.promptfooUrls,
|
|
149
173
|
steps: results,
|
|
150
174
|
success: true,
|
|
175
|
+
testSummary: state.testSummary,
|
|
151
176
|
validation,
|
|
152
177
|
};
|
|
153
178
|
}
|
|
@@ -36,10 +36,13 @@ export async function runStep(step, ctx, state = {}) {
|
|
|
36
36
|
if (canCache) {
|
|
37
37
|
try {
|
|
38
38
|
const inputs = step.cacheInputs(ctx);
|
|
39
|
+
ctx.logger.debug(`[${step.name}] Cache inputs: ${inputs.length} files`);
|
|
39
40
|
const key = await ctx.cache.computeKey(inputs);
|
|
40
41
|
cacheKey = key;
|
|
42
|
+
ctx.logger.debug(`[${step.name}] Cache key: ${key}`);
|
|
41
43
|
const cached = await ctx.cache.lookup(step.name, key);
|
|
42
44
|
if (cached.hit) {
|
|
45
|
+
ctx.logger.debug(`[${step.name}] Cache HIT — skipping execution`);
|
|
43
46
|
const result = {
|
|
44
47
|
durationMs: Date.now() - start,
|
|
45
48
|
status: "success",
|
|
@@ -48,11 +51,16 @@ export async function runStep(step, ctx, state = {}) {
|
|
|
48
51
|
ctx.logger.step(step.name, result);
|
|
49
52
|
return result;
|
|
50
53
|
}
|
|
54
|
+
ctx.logger.debug(`[${step.name}] Cache MISS — executing`);
|
|
51
55
|
}
|
|
52
56
|
catch {
|
|
53
57
|
// Cache lookup failure is non-fatal — proceed to execute
|
|
58
|
+
ctx.logger.debug(`[${step.name}] Cache lookup failed — proceeding`);
|
|
54
59
|
}
|
|
55
60
|
}
|
|
61
|
+
else {
|
|
62
|
+
ctx.logger.debug(`[${step.name}] Cache skipped (${!ctx.cache ? "no cache adapter" : ctx.config.noCache ? "--no-cache" : "no cacheInputs"})`);
|
|
63
|
+
}
|
|
56
64
|
// 3. Execute
|
|
57
65
|
try {
|
|
58
66
|
const result = await step.execute(ctx, state);
|
|
@@ -67,6 +67,7 @@ export class CalculateScoresStep {
|
|
|
67
67
|
try {
|
|
68
68
|
const result = calculateAndWriteScores({
|
|
69
69
|
allowedOrigins: ctx.config.allowedOrigins,
|
|
70
|
+
logger: ctx.logger,
|
|
70
71
|
mode: ctx.config.mode,
|
|
71
72
|
resolvedSource,
|
|
72
73
|
resultsPath: primaryMode !== "baseline"
|
|
@@ -77,6 +78,9 @@ export class CalculateScoresStep {
|
|
|
77
78
|
source: ctx.config.source,
|
|
78
79
|
});
|
|
79
80
|
belowCritical = result.belowCritical;
|
|
81
|
+
if (result.testSummary) {
|
|
82
|
+
state.testSummary = result.testSummary;
|
|
83
|
+
}
|
|
80
84
|
}
|
|
81
85
|
catch (err) {
|
|
82
86
|
return {
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import type { Logger, TestSummary } from "../_vendor/ailf-core/index.d.ts";
|
|
1
2
|
import { type ResolvedSourceConfig } from "../sources.js";
|
|
2
3
|
import { type ActualScoreEntry, type ComponentResult } from "../_vendor/ailf-core/index.d.ts";
|
|
3
4
|
import type { GraderJudgment, PerModelEntry } from "./types.js";
|
|
@@ -86,6 +87,8 @@ export declare function scoreAgenticResults(resultsPath: string, weights: Record
|
|
|
86
87
|
export interface CalculateScoresOptions {
|
|
87
88
|
/** Allowed origins for source isolation reporting */
|
|
88
89
|
allowedOrigins?: string[];
|
|
90
|
+
/** Logger instance (defaults to ConsoleLogger if not provided) */
|
|
91
|
+
logger?: Logger;
|
|
89
92
|
/** Evaluation mode (controls which result files are read) */
|
|
90
93
|
mode?: string;
|
|
91
94
|
/** Pre-resolved source config (skips loadSource() call) */
|
|
@@ -103,5 +106,7 @@ export interface CalculateScoresOptions {
|
|
|
103
106
|
export interface CalculateScoresResult {
|
|
104
107
|
/** Feature areas that scored below the critical threshold (40). */
|
|
105
108
|
belowCritical: string[];
|
|
109
|
+
/** Summary of test execution outcomes (total, passed, failed, errored). */
|
|
110
|
+
testSummary?: TestSummary;
|
|
106
111
|
}
|
|
107
112
|
export declare function calculateAndWriteScores(options: CalculateScoresOptions): CalculateScoresResult;
|