@sanity/ailf 2.1.0 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/rubrics.ts +3 -3
- package/dist/_vendor/ailf-core/examples/index.d.ts +50 -1
- package/dist/_vendor/ailf-core/examples/index.js +66 -1
- package/dist/_vendor/ailf-core/types/index.d.ts +25 -0
- package/dist/agent-harness/assertions-runtime.d.ts +49 -0
- package/dist/agent-harness/assertions-runtime.js +138 -0
- package/dist/agent-harness/provider.d.ts +58 -0
- package/dist/agent-harness/provider.js +104 -0
- package/dist/commands/calculate-scores.js +7 -2
- package/dist/commands/capture-list.d.ts +1 -1
- package/dist/commands/capture-list.js +6 -3
- package/dist/commands/compare.js +11 -7
- package/dist/commands/explain-handler.js +22 -24
- package/dist/commands/fetch-docs.js +4 -2
- package/dist/commands/generate-configs.js +6 -2
- package/dist/commands/init.js +3 -0
- package/dist/commands/pipeline-action.js +8 -24
- package/dist/commands/pipeline.js +1 -1
- package/dist/commands/pr-comment.js +6 -2
- package/dist/commands/publish.d.ts +1 -0
- package/dist/commands/publish.js +12 -8
- package/dist/commands/remote-pipeline.js +1 -1
- package/dist/commands/remote-results.d.ts +8 -8
- package/dist/commands/remote-results.js +7 -7
- package/dist/commands/shared/options.d.ts +8 -0
- package/dist/commands/shared/options.js +10 -0
- package/dist/commands/shared/resolve-output-dir.d.ts +27 -0
- package/dist/commands/shared/resolve-output-dir.js +36 -0
- package/dist/composition-root.js +1 -1
- package/dist/config/rubrics.ts +3 -3
- package/dist/orchestration/build-app-context.js +1 -1
- package/dist/orchestration/steps/gap-analysis-step.js +86 -75
- package/dist/orchestration/steps/generate-configs-step.d.ts +7 -0
- package/dist/orchestration/steps/generate-configs-step.js +47 -2
- package/dist/pipeline/calculate-scores.js +113 -2
- package/dist/pipeline/compare.js +50 -19
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +103 -25
- package/dist/pipeline/compiler/compiler-to-yaml.js +78 -7
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +15 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +42 -85
- package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +22 -15
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +1 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +8 -1
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +42 -12
- package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +3 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +1 -27
- package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +2 -9
- package/dist/pipeline/compiler/rubric-resolution.d.ts +40 -0
- package/dist/pipeline/compiler/rubric-resolution.js +52 -0
- package/dist/pipeline/compiler/scoring-bridge.js +59 -7
- package/dist/pipeline/provenance.js +7 -1
- package/dist/pipeline/validate.d.ts +5 -4
- package/dist/pipeline/validate.js +34 -113
- package/package.json +2 -1
|
@@ -4,14 +4,26 @@
|
|
|
4
4
|
* Builds Promptfoo beforeEach/afterEach hooks for provisioning and
|
|
5
5
|
* tearing down sandbox working directories.
|
|
6
6
|
*/
|
|
7
|
+
import { resolve } from "path";
|
|
7
8
|
// ---------------------------------------------------------------------------
|
|
8
9
|
// Sandbox configuration
|
|
9
10
|
// ---------------------------------------------------------------------------
|
|
10
|
-
|
|
11
|
+
/**
|
|
12
|
+
* Build sandbox configuration from a task definition.
|
|
13
|
+
*
|
|
14
|
+
* Fixture paths are resolved to absolute at compile time using callerCwd
|
|
15
|
+
* (the directory the pipeline was invoked from), because promptfoo runs
|
|
16
|
+
* with cwd set to packages/eval/ — not the monorepo root where apps/ lives.
|
|
17
|
+
*/
|
|
18
|
+
export function buildSandboxConfig(task, callerCwd) {
|
|
19
|
+
const cwd = callerCwd ?? process.cwd();
|
|
11
20
|
return {
|
|
12
21
|
type: task.sandbox?.type ?? "tempdir",
|
|
13
22
|
image: task.sandbox?.image,
|
|
14
|
-
fixtures: task.fixtures ?? []
|
|
23
|
+
fixtures: (task.fixtures ?? []).map((f) => {
|
|
24
|
+
const stripped = f.startsWith("file://") ? f.slice(7) : f;
|
|
25
|
+
return resolve(cwd, stripped);
|
|
26
|
+
}),
|
|
15
27
|
limits: task.sandbox?.limits
|
|
16
28
|
? {
|
|
17
29
|
cpus: task.sandbox.limits.cpus,
|
|
@@ -39,23 +51,41 @@ export function buildLifecycleExtensions(task, sandboxConfig) {
|
|
|
39
51
|
return extensions;
|
|
40
52
|
}
|
|
41
53
|
export function buildBeforeEachHook(taskId, config) {
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
54
|
+
// Promptfoo extension hooks receive (hookName, context).
|
|
55
|
+
// beforeEach context is { test } — vars live at context.test.vars.
|
|
56
|
+
// Must return context for mutations to persist.
|
|
57
|
+
//
|
|
58
|
+
// The sandbox directory is created by the YAML writer at config-gen time
|
|
59
|
+
// (deterministic path in results/latest/sandbox-{taskId}/) so it exists
|
|
60
|
+
// before the provider is initialized. This hook copies fixtures into it.
|
|
61
|
+
//
|
|
62
|
+
// @see https://www.promptfoo.dev/docs/configuration/reference/ — extensions
|
|
63
|
+
return (`// beforeEach: copy fixtures into sandbox for ${taskId}\n` +
|
|
64
|
+
`async function(context) {\n` +
|
|
65
|
+
` const { cpSync, existsSync, mkdirSync } = require('fs');\n` +
|
|
46
66
|
` const { resolve } = require('path');\n` +
|
|
47
|
-
` const
|
|
48
|
-
`
|
|
67
|
+
` const workDir = context.test.vars?.__workingDir;\n` +
|
|
68
|
+
` if (!workDir) return context;\n` +
|
|
49
69
|
` mkdirSync(workDir, { recursive: true });\n` +
|
|
50
|
-
`
|
|
51
|
-
`
|
|
52
|
-
`
|
|
70
|
+
` // Copy fixtures into sandbox\n` +
|
|
71
|
+
` const fixtures = ${JSON.stringify(config.fixtures)};\n` +
|
|
72
|
+
` for (const fixture of fixtures) {\n` +
|
|
73
|
+
` const src = resolve(process.cwd(), fixture);\n` +
|
|
74
|
+
` if (existsSync(src)) {\n` +
|
|
75
|
+
` cpSync(src, workDir, { recursive: true });\n` +
|
|
76
|
+
` }\n` +
|
|
77
|
+
` }\n` +
|
|
78
|
+
` return context;\n` +
|
|
53
79
|
`}`);
|
|
54
80
|
}
|
|
55
81
|
export function buildAfterEachHook(taskId) {
|
|
82
|
+
// Promptfoo extension hooks receive (hookName, context).
|
|
83
|
+
// afterEach context is { test, result } — vars live at context.test.vars.
|
|
84
|
+
// @see https://www.promptfoo.dev/docs/configuration/reference/ — extensions
|
|
56
85
|
return (`// afterEach: collect artifacts + teardown for ${taskId}\n` +
|
|
57
|
-
`async function(
|
|
86
|
+
`async function(context) {\n` +
|
|
58
87
|
` const { rmSync, readdirSync, existsSync } = require('fs');\n` +
|
|
88
|
+
` const vars = context.test.vars || {};\n` +
|
|
59
89
|
` const workDir = vars.__workingDir;\n` +
|
|
60
90
|
` if (workDir && existsSync(workDir)) {\n` +
|
|
61
91
|
` try {\n` +
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
* Shared types for the agent harness mode handler.
|
|
3
3
|
*/
|
|
4
4
|
import type { PromptfooPrompt, PromptfooProvider, PromptfooTestCase } from "../../promptfoo-compiler.js";
|
|
5
|
+
import type { RubricConfig } from "../../rubric-resolution.js";
|
|
5
6
|
import type { SandboxType } from "../../sandbox/sandbox-strategy.js";
|
|
6
7
|
/** Options for compiling an agent harness task */
|
|
7
8
|
export interface AgentHarnessCompileOptions {
|
|
@@ -9,6 +10,8 @@ export interface AgentHarnessCompileOptions {
|
|
|
9
10
|
graderProvider?: string;
|
|
10
11
|
/** Root directory for fixture resolution */
|
|
11
12
|
rootDir?: string;
|
|
13
|
+
/** Rubric config (templates, weights, profiles) — loaded from rubrics config */
|
|
14
|
+
rubricConfig?: RubricConfig;
|
|
12
15
|
}
|
|
13
16
|
/** Result of compiling a single agent harness task */
|
|
14
17
|
export interface AgentHarnessCompileResult {
|
|
@@ -4,6 +4,7 @@
|
|
|
4
4
|
* Handles rubric template resolution, doc-coverage auto-generation,
|
|
5
5
|
* and baseline assertion filtering.
|
|
6
6
|
*/
|
|
7
|
+
import { resolveTemplatedAssertion } from "../../rubric-resolution.js";
|
|
7
8
|
// ---------------------------------------------------------------------------
|
|
8
9
|
// Assertion resolution
|
|
9
10
|
// ---------------------------------------------------------------------------
|
|
@@ -37,33 +38,6 @@ export function resolveAssertions(task, options, warnings) {
|
|
|
37
38
|
return assertions;
|
|
38
39
|
}
|
|
39
40
|
// ---------------------------------------------------------------------------
|
|
40
|
-
// Rubric template resolution
|
|
41
|
-
// ---------------------------------------------------------------------------
|
|
42
|
-
function resolveTemplatedAssertion(a, rubricConfig, graderProvider, warnings) {
|
|
43
|
-
if (!rubricConfig) {
|
|
44
|
-
warnings.push(`No rubric config — template "${a.template}" cannot be resolved`);
|
|
45
|
-
return null;
|
|
46
|
-
}
|
|
47
|
-
const template = rubricConfig.templates[a.template];
|
|
48
|
-
if (!template) {
|
|
49
|
-
warnings.push(`Unknown rubric template: "${a.template}"`);
|
|
50
|
-
return null;
|
|
51
|
-
}
|
|
52
|
-
const scaleText = template.scale.map((s) => `- ${s}`).join("\n");
|
|
53
|
-
const criteriaText = a.criteria.map((c) => `- ${c}`).join("\n");
|
|
54
|
-
const rubricValue = `${template.header}\n${scaleText}\n\n` +
|
|
55
|
-
`${template.criteria_label ?? "Check for:"}\n${criteriaText}\n\n` +
|
|
56
|
-
`Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}`;
|
|
57
|
-
return {
|
|
58
|
-
type: "llm-rubric",
|
|
59
|
-
value: rubricValue,
|
|
60
|
-
...(graderProvider ? { provider: graderProvider } : {}),
|
|
61
|
-
...(template.dimension
|
|
62
|
-
? { metadata: { dimension: template.dimension, maxScore: 100 } }
|
|
63
|
-
: {}),
|
|
64
|
-
};
|
|
65
|
-
}
|
|
66
|
-
// ---------------------------------------------------------------------------
|
|
67
41
|
// Doc-coverage assertion
|
|
68
42
|
// ---------------------------------------------------------------------------
|
|
69
43
|
function buildDocCoverageAssertion(rubricConfig, graderProvider) {
|
|
@@ -2,6 +2,8 @@
|
|
|
2
2
|
* Shared types for the literacy mode handler.
|
|
3
3
|
*/
|
|
4
4
|
import type { PromptfooPrompt, PromptfooProvider, PromptfooTestCase } from "../../promptfoo-compiler.js";
|
|
5
|
+
export type { RubricConfig } from "../../rubric-resolution.js";
|
|
6
|
+
import type { RubricConfig } from "../../rubric-resolution.js";
|
|
5
7
|
/** Options for compiling a literacy task */
|
|
6
8
|
export interface LiteracyCompileOptions {
|
|
7
9
|
/** Grader provider for LLM-graded assertions */
|
|
@@ -19,15 +21,6 @@ export interface LiteracyCompileOptions {
|
|
|
19
21
|
/** Rubric config (templates, weights, profiles) — loaded from rubrics config */
|
|
20
22
|
rubricConfig?: RubricConfig;
|
|
21
23
|
}
|
|
22
|
-
/** Minimal rubric config needed by the handler */
|
|
23
|
-
export interface RubricConfig {
|
|
24
|
-
templates: Record<string, {
|
|
25
|
-
dimension?: string;
|
|
26
|
-
header: string;
|
|
27
|
-
scale: string[];
|
|
28
|
-
criteria_label?: string;
|
|
29
|
-
}>;
|
|
30
|
-
}
|
|
31
24
|
/** Result of compiling a single literacy task */
|
|
32
25
|
export interface LiteracyCompileResult {
|
|
33
26
|
/** Promptfoo provider configs */
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared rubric template resolution for all evaluation modes.
|
|
3
|
+
*
|
|
4
|
+
* Resolves templated LLM-rubric assertions (those with `template` + `criteria`
|
|
5
|
+
* fields) into fully assembled Promptfoo assertions with rubric text and
|
|
6
|
+
* dimension metadata.
|
|
7
|
+
*
|
|
8
|
+
* Used by both literacy and agent-harness compilers. Extracted from
|
|
9
|
+
* literacy/assertions.ts to fix the compilation bug where agent-harness
|
|
10
|
+
* tasks with templated rubrics produced empty rubric text (DOC-2029).
|
|
11
|
+
*
|
|
12
|
+
* @see docs/design-docs/mode-agnostic-scoring.md
|
|
13
|
+
* @see config/rubrics.ts — template definitions
|
|
14
|
+
*/
|
|
15
|
+
import type { PromptfooAssertion } from "./assertion-mapper.js";
|
|
16
|
+
/** Minimal rubric config needed for template resolution */
|
|
17
|
+
export interface RubricConfig {
|
|
18
|
+
templates: Record<string, {
|
|
19
|
+
criteria_label?: string;
|
|
20
|
+
dimension?: string;
|
|
21
|
+
header: string;
|
|
22
|
+
scale: string[];
|
|
23
|
+
}>;
|
|
24
|
+
}
|
|
25
|
+
/**
|
|
26
|
+
* Resolve a templated LLM-rubric assertion into a fully assembled
|
|
27
|
+
* Promptfoo assertion with rubric text and dimension metadata.
|
|
28
|
+
*
|
|
29
|
+
* A "templated" assertion has `template` (referencing a key in rubrics.ts)
|
|
30
|
+
* and `criteria` (task-specific bullet points). The template provides the
|
|
31
|
+
* scoring header, scale, and dimension metadata. The criteria are appended
|
|
32
|
+
* to create the final rubric prompt.
|
|
33
|
+
*
|
|
34
|
+
* Returns null (with a warning) if the template can't be resolved.
|
|
35
|
+
*/
|
|
36
|
+
export declare function resolveTemplatedAssertion(assertion: {
|
|
37
|
+
criteria: string[];
|
|
38
|
+
template: string;
|
|
39
|
+
type: string;
|
|
40
|
+
}, rubricConfig: RubricConfig | undefined, graderProvider: string | undefined, warnings: string[]): PromptfooAssertion | null;
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared rubric template resolution for all evaluation modes.
|
|
3
|
+
*
|
|
4
|
+
* Resolves templated LLM-rubric assertions (those with `template` + `criteria`
|
|
5
|
+
* fields) into fully assembled Promptfoo assertions with rubric text and
|
|
6
|
+
* dimension metadata.
|
|
7
|
+
*
|
|
8
|
+
* Used by both literacy and agent-harness compilers. Extracted from
|
|
9
|
+
* literacy/assertions.ts to fix the compilation bug where agent-harness
|
|
10
|
+
* tasks with templated rubrics produced empty rubric text (DOC-2029).
|
|
11
|
+
*
|
|
12
|
+
* @see docs/design-docs/mode-agnostic-scoring.md
|
|
13
|
+
* @see config/rubrics.ts — template definitions
|
|
14
|
+
*/
|
|
15
|
+
// ---------------------------------------------------------------------------
|
|
16
|
+
// Template resolution
|
|
17
|
+
// ---------------------------------------------------------------------------
|
|
18
|
+
/**
|
|
19
|
+
* Resolve a templated LLM-rubric assertion into a fully assembled
|
|
20
|
+
* Promptfoo assertion with rubric text and dimension metadata.
|
|
21
|
+
*
|
|
22
|
+
* A "templated" assertion has `template` (referencing a key in rubrics.ts)
|
|
23
|
+
* and `criteria` (task-specific bullet points). The template provides the
|
|
24
|
+
* scoring header, scale, and dimension metadata. The criteria are appended
|
|
25
|
+
* to create the final rubric prompt.
|
|
26
|
+
*
|
|
27
|
+
* Returns null (with a warning) if the template can't be resolved.
|
|
28
|
+
*/
|
|
29
|
+
export function resolveTemplatedAssertion(assertion, rubricConfig, graderProvider, warnings) {
|
|
30
|
+
if (!rubricConfig) {
|
|
31
|
+
warnings.push(`No rubric config — template "${assertion.template}" cannot be resolved`);
|
|
32
|
+
return null;
|
|
33
|
+
}
|
|
34
|
+
const template = rubricConfig.templates[assertion.template];
|
|
35
|
+
if (!template) {
|
|
36
|
+
warnings.push(`Unknown rubric template: "${assertion.template}"`);
|
|
37
|
+
return null;
|
|
38
|
+
}
|
|
39
|
+
const scaleText = template.scale.map((s) => `- ${s}`).join("\n");
|
|
40
|
+
const criteriaText = assertion.criteria.map((c) => `- ${c}`).join("\n");
|
|
41
|
+
const rubricValue = `${template.header}\n${scaleText}\n\n` +
|
|
42
|
+
`${template.criteria_label ?? "Check for:"}\n${criteriaText}\n\n` +
|
|
43
|
+
`Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}`;
|
|
44
|
+
return {
|
|
45
|
+
type: "llm-rubric",
|
|
46
|
+
value: rubricValue,
|
|
47
|
+
...(graderProvider ? { provider: graderProvider } : {}),
|
|
48
|
+
...(template.dimension
|
|
49
|
+
? { metadata: { dimension: template.dimension, maxScore: 100 } }
|
|
50
|
+
: {}),
|
|
51
|
+
};
|
|
52
|
+
}
|
|
@@ -41,19 +41,25 @@ import { classifyRubric, parseRubricScore } from "../../_vendor/ailf-core/index.
|
|
|
41
41
|
export function scoreTestGroup(tests, profile, taskId) {
|
|
42
42
|
let totalCost = 0;
|
|
43
43
|
// Step 1: Convert all ComponentResults into AssertionScore[] (0–1 scale)
|
|
44
|
+
//
|
|
45
|
+
// Two assertion types contribute to scoring:
|
|
46
|
+
// - llm-rubric: dimension from metadata, score from grader (0–100 → [0,1])
|
|
47
|
+
// - javascript: mapped to "assertion-pass-rate" dimension (pass=1, fail=0)
|
|
48
|
+
//
|
|
49
|
+
// Other types (cost, trajectory, contains, etc.) are metadata or guards —
|
|
50
|
+
// they don't produce dimension scores.
|
|
44
51
|
const assertionScores = [];
|
|
45
52
|
for (const test of tests) {
|
|
46
53
|
totalCost += test.cost;
|
|
47
54
|
for (const comp of test.gradingResult.componentResults) {
|
|
48
|
-
|
|
49
|
-
continue;
|
|
50
|
-
const converted = componentToAssertionScore(comp);
|
|
55
|
+
const converted = componentToScore(comp);
|
|
51
56
|
if (converted)
|
|
52
57
|
assertionScores.push(converted);
|
|
53
58
|
}
|
|
54
59
|
}
|
|
55
60
|
// Step 2: Aggregate into DimensionScores (0–1 scale)
|
|
56
61
|
const dimensionLabels = {
|
|
62
|
+
"assertion-pass-rate": "Assertion Pass Rate",
|
|
57
63
|
"code-correctness": "Code Correctness",
|
|
58
64
|
"doc-coverage": "Doc Coverage",
|
|
59
65
|
"task-completion": "Task Completion",
|
|
@@ -86,12 +92,34 @@ export function scoreTestGroup(tests, profile, taskId) {
|
|
|
86
92
|
// Conversion helpers
|
|
87
93
|
// ---------------------------------------------------------------------------
|
|
88
94
|
/**
|
|
89
|
-
*
|
|
90
|
-
* AssertionScore format.
|
|
95
|
+
* Route a ComponentResult to the appropriate scoring conversion.
|
|
91
96
|
*
|
|
92
|
-
*
|
|
97
|
+
* Dispatches by assertion type:
|
|
98
|
+
* - llm-rubric → dimension from metadata, grader score (0–100 → [0,1])
|
|
99
|
+
* - javascript → "assertion-pass-rate" dimension, binary (pass=1, fail=0)
|
|
100
|
+
* - everything else → null (not a scoring-relevant assertion)
|
|
101
|
+
*
|
|
102
|
+
* This replaces the previous llm-rubric-only filter that caused agent-harness
|
|
103
|
+
* javascript assertions to be invisible to the scoring engine (DOC-2029).
|
|
104
|
+
*/
|
|
105
|
+
function componentToScore(comp) {
|
|
106
|
+
const type = comp.assertion?.type;
|
|
107
|
+
if (type === "llm-rubric") {
|
|
108
|
+
return llmRubricToScore(comp);
|
|
109
|
+
}
|
|
110
|
+
if (type === "javascript") {
|
|
111
|
+
return javascriptAssertionToScore(comp);
|
|
112
|
+
}
|
|
113
|
+
// Other types (cost, trajectory, contains, etc.) don't produce scores
|
|
114
|
+
return null;
|
|
115
|
+
}
|
|
116
|
+
/**
|
|
117
|
+
* Convert an LLM-rubric ComponentResult into an AssertionScore.
|
|
118
|
+
*
|
|
119
|
+
* The dimension comes from metadata (set during rubric template resolution).
|
|
120
|
+
* Returns null if the component doesn't map to any dimension.
|
|
93
121
|
*/
|
|
94
|
-
function
|
|
122
|
+
function llmRubricToScore(comp) {
|
|
95
123
|
const dim = classifyRubric(comp);
|
|
96
124
|
if (!dim)
|
|
97
125
|
return null;
|
|
@@ -108,6 +136,30 @@ function componentToAssertionScore(comp) {
|
|
|
108
136
|
weight: 1.0,
|
|
109
137
|
};
|
|
110
138
|
}
|
|
139
|
+
/**
|
|
140
|
+
* Convert a javascript assertion ComponentResult into an AssertionScore.
|
|
141
|
+
*
|
|
142
|
+
* Javascript assertions (fileExists, fileContains, commandSucceeds, etc.)
|
|
143
|
+
* produce binary pass/fail results. They map to the "assertion-pass-rate"
|
|
144
|
+
* dimension — the fraction of structural assertions that passed.
|
|
145
|
+
*
|
|
146
|
+
* Zero-weight assertions (like URL extraction) are excluded from scoring.
|
|
147
|
+
*/
|
|
148
|
+
function javascriptAssertionToScore(comp) {
|
|
149
|
+
// Skip zero-weight assertions (diagnostic-only, e.g., URL extraction)
|
|
150
|
+
const weight = comp.assertion?.weight;
|
|
151
|
+
if (weight === 0)
|
|
152
|
+
return null;
|
|
153
|
+
return {
|
|
154
|
+
assertionType: "javascript",
|
|
155
|
+
dimension: "assertion-pass-rate",
|
|
156
|
+
latencyMs: 0,
|
|
157
|
+
pass: comp.pass,
|
|
158
|
+
reason: comp.reason ?? "",
|
|
159
|
+
score: comp.pass ? 1.0 : 0.0,
|
|
160
|
+
weight: 1.0,
|
|
161
|
+
};
|
|
162
|
+
}
|
|
111
163
|
/** Convert kebab-case dimension key to camelCase (e.g., "task-completion" → "taskCompletion") */
|
|
112
164
|
function kebabToCamel(kebab) {
|
|
113
165
|
return kebab.replace(/-([a-z])/g, (_, c) => c.toUpperCase());
|
|
@@ -59,6 +59,12 @@ export function buildProvenance(input) {
|
|
|
59
59
|
evalFingerprint: input.evalFingerprint,
|
|
60
60
|
hasLineage: Boolean(lineage),
|
|
61
61
|
});
|
|
62
|
+
// Non-literacy modes (agent-harness, mcp-server, etc.) don't use the
|
|
63
|
+
// config/models.ts model matrix — listing those models would be misleading.
|
|
64
|
+
// Only include them for literacy mode where they're the actual eval targets.
|
|
65
|
+
const evaluatedModels = input.mode === "literacy"
|
|
66
|
+
? models.models.map((m) => ({ id: m.id, label: m.label }))
|
|
67
|
+
: [];
|
|
62
68
|
return {
|
|
63
69
|
areas: input.areas,
|
|
64
70
|
autoScope: input.autoScope,
|
|
@@ -68,7 +74,7 @@ export function buildProvenance(input) {
|
|
|
68
74
|
graderModel: models.grader.id,
|
|
69
75
|
lineage,
|
|
70
76
|
mode: input.mode,
|
|
71
|
-
models:
|
|
77
|
+
models: evaluatedModels,
|
|
72
78
|
promptfooUrl: input.promptfooUrl,
|
|
73
79
|
promptfooUrls: input.promptfooUrls,
|
|
74
80
|
source: {
|
|
@@ -51,10 +51,11 @@ export declare function validateReferenceSolutions(rootDir: string): ValidationI
|
|
|
51
51
|
*/
|
|
52
52
|
export declare function validateRubricsYaml(rootDir: string): ValidationIssue[];
|
|
53
53
|
/**
|
|
54
|
-
* Check that
|
|
55
|
-
*
|
|
56
|
-
*
|
|
57
|
-
*
|
|
54
|
+
* Check that task definition files exist.
|
|
55
|
+
*
|
|
56
|
+
* Tasks live as `*.task.ts` files in mode subdirectories (e.g.
|
|
57
|
+
* `tasks/literacy/groq.task.ts`). Legacy YAML task files are no longer
|
|
58
|
+
* used. Warns only if no task files are found at all.
|
|
58
59
|
*/
|
|
59
60
|
export declare function validateTaskFiles(rootDir: string): ValidationIssue[];
|
|
60
61
|
/**
|
|
@@ -9,10 +9,9 @@
|
|
|
9
9
|
*/
|
|
10
10
|
import fs from "fs";
|
|
11
11
|
import path from "path";
|
|
12
|
-
import { load } from "js-yaml";
|
|
13
12
|
import { tryLoadConfigFile } from "./compiler/config-loader.js";
|
|
14
13
|
import { resolveMappings } from "./resolve-mappings.js";
|
|
15
|
-
import { FeatureRegistrySchema, formatZodErrors, RubricConfigSchema,
|
|
14
|
+
import { FeatureRegistrySchema, formatZodErrors, RubricConfigSchema, ThresholdConfigSchema, } from "./schemas.js";
|
|
16
15
|
// ---------------------------------------------------------------------------
|
|
17
16
|
// Helpers
|
|
18
17
|
// ---------------------------------------------------------------------------
|
|
@@ -248,10 +247,11 @@ export function validateRubricsYaml(rootDir) {
|
|
|
248
247
|
return issues;
|
|
249
248
|
}
|
|
250
249
|
/**
|
|
251
|
-
* Check that
|
|
252
|
-
*
|
|
253
|
-
*
|
|
254
|
-
*
|
|
250
|
+
* Check that task definition files exist.
|
|
251
|
+
*
|
|
252
|
+
* Tasks live as `*.task.ts` files in mode subdirectories (e.g.
|
|
253
|
+
* `tasks/literacy/groq.task.ts`). Legacy YAML task files are no longer
|
|
254
|
+
* used. Warns only if no task files are found at all.
|
|
255
255
|
*/
|
|
256
256
|
export function validateTaskFiles(rootDir) {
|
|
257
257
|
const source = "validateTaskFiles";
|
|
@@ -261,70 +261,9 @@ export function validateTaskFiles(rootDir) {
|
|
|
261
261
|
issues.push(warning(source, "tasks/ directory not found (using Content Lake tasks?)", tasksDir));
|
|
262
262
|
return issues;
|
|
263
263
|
}
|
|
264
|
-
const
|
|
265
|
-
|
|
266
|
-
.
|
|
267
|
-
if (yamlFiles.length === 0) {
|
|
268
|
-
issues.push(warning(source, "No task YAML files found in tasks/ (using Content Lake tasks?)", tasksDir));
|
|
269
|
-
return issues;
|
|
270
|
-
}
|
|
271
|
-
const allIds = new Map(); // id → source file
|
|
272
|
-
const templateKeys = loadTemplateKeys(rootDir);
|
|
273
|
-
for (const file of yamlFiles) {
|
|
274
|
-
const filePath = path.join(tasksDir, file);
|
|
275
|
-
// Step 1: Parse YAML
|
|
276
|
-
const result = parseYamlFile(filePath, source);
|
|
277
|
-
if (!result.ok) {
|
|
278
|
-
issues.push(result.issue);
|
|
279
|
-
continue;
|
|
280
|
-
}
|
|
281
|
-
const { data } = result;
|
|
282
|
-
if (!Array.isArray(data)) {
|
|
283
|
-
issues.push(error(source, `${file} did not parse to an array of tasks`, filePath));
|
|
284
|
-
continue;
|
|
285
|
-
}
|
|
286
|
-
// Step 2: Validate each entry with Zod schema
|
|
287
|
-
const zodResult = TaskFileSchema.safeParse(data);
|
|
288
|
-
if (!zodResult.success) {
|
|
289
|
-
const lines = formatZodErrors(zodResult.error);
|
|
290
|
-
for (const line of lines) {
|
|
291
|
-
issues.push(error(source, `${file}: ${line.trim()}`, filePath));
|
|
292
|
-
}
|
|
293
|
-
continue;
|
|
294
|
-
}
|
|
295
|
-
// Step 3: Cross-entry validation (duplicate IDs, docs path consistency)
|
|
296
|
-
for (const entry of zodResult.data) {
|
|
297
|
-
if ("id" in entry && typeof entry.id === "string") {
|
|
298
|
-
// Check for duplicate IDs across all files
|
|
299
|
-
if (allIds.has(entry.id)) {
|
|
300
|
-
issues.push(error(source, `${file}: duplicate id '${entry.id}' (also in ${allIds.get(entry.id)})`, filePath));
|
|
301
|
-
}
|
|
302
|
-
else {
|
|
303
|
-
allIds.set(entry.id, file);
|
|
304
|
-
}
|
|
305
|
-
// Check docs path matches task id
|
|
306
|
-
const vars = entry.vars;
|
|
307
|
-
if (vars.docs && typeof vars.docs === "string") {
|
|
308
|
-
const expectedPath = `file://contexts/canonical/${entry.id}.md`;
|
|
309
|
-
if (vars.docs !== expectedPath) {
|
|
310
|
-
issues.push(warning(source, `${file}: id is '${entry.id}' but docs path is '${vars.docs}' (expected '${expectedPath}')`, filePath));
|
|
311
|
-
}
|
|
312
|
-
}
|
|
313
|
-
// Check that llm-rubric template references exist in config/rubrics
|
|
314
|
-
const asserts = entry.assert;
|
|
315
|
-
if (Array.isArray(asserts) && templateKeys.size > 0) {
|
|
316
|
-
for (const a of asserts) {
|
|
317
|
-
const assertion = a;
|
|
318
|
-
if (assertion.type === "llm-rubric" &&
|
|
319
|
-
typeof assertion.template === "string") {
|
|
320
|
-
if (!templateKeys.has(assertion.template)) {
|
|
321
|
-
issues.push(error(source, `${file}: task '${entry.id}' references unknown rubric template '${assertion.template}' (available: ${[...templateKeys].join(", ")})`, filePath));
|
|
322
|
-
}
|
|
323
|
-
}
|
|
324
|
-
}
|
|
325
|
-
}
|
|
326
|
-
}
|
|
327
|
-
}
|
|
264
|
+
const taskAreas = collectTaskAreas(tasksDir);
|
|
265
|
+
if (taskAreas.size === 0) {
|
|
266
|
+
issues.push(warning(source, "No task files found in tasks/ (using Content Lake tasks?)", tasksDir));
|
|
328
267
|
}
|
|
329
268
|
return issues;
|
|
330
269
|
}
|
|
@@ -355,15 +294,10 @@ export function validateThresholdsYaml(rootDir) {
|
|
|
355
294
|
// Cross-reference: warn if an area override references an area with no task file
|
|
356
295
|
if (zodResult.data.areas) {
|
|
357
296
|
const tasksDir = path.join(rootDir, "tasks");
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
.
|
|
362
|
-
.map((f) => f.replace(/\.(yaml|yml|task\.ts|task\.js)$/, "")));
|
|
363
|
-
for (const areaName of Object.keys(zodResult.data.areas)) {
|
|
364
|
-
if (!taskFiles.has(areaName)) {
|
|
365
|
-
issues.push(warning(source, `config/thresholds: area override '${areaName}' has no matching tasks/${areaName}`, loaded.filePath));
|
|
366
|
-
}
|
|
297
|
+
const taskAreas = collectTaskAreas(tasksDir);
|
|
298
|
+
for (const areaName of Object.keys(zodResult.data.areas)) {
|
|
299
|
+
if (!taskAreas.has(areaName)) {
|
|
300
|
+
issues.push(warning(source, `config/thresholds: area override '${areaName}' has no matching task file`, loaded.filePath));
|
|
367
301
|
}
|
|
368
302
|
}
|
|
369
303
|
}
|
|
@@ -378,44 +312,31 @@ function error(source, message, filePath) {
|
|
|
378
312
|
};
|
|
379
313
|
}
|
|
380
314
|
/**
|
|
381
|
-
*
|
|
382
|
-
*
|
|
315
|
+
* Collect task area names from all subdirectories of `tasksDir`.
|
|
316
|
+
*
|
|
317
|
+
* Task files live in mode subdirectories (e.g. `tasks/literacy/groq.task.ts`).
|
|
318
|
+
* Returns a set of basenames without the `.task.ts`/`.task.js` extension.
|
|
383
319
|
*/
|
|
384
|
-
function
|
|
385
|
-
|
|
386
|
-
if (!loaded)
|
|
320
|
+
function collectTaskAreas(tasksDir) {
|
|
321
|
+
if (!fs.existsSync(tasksDir))
|
|
387
322
|
return new Set();
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
323
|
+
const areas = new Set();
|
|
324
|
+
const taskFilePattern = /\.task\.(ts|js)$/;
|
|
325
|
+
for (const entry of fs.readdirSync(tasksDir, { withFileTypes: true })) {
|
|
326
|
+
if (entry.isDirectory()) {
|
|
327
|
+
const subdir = path.join(tasksDir, entry.name);
|
|
328
|
+
for (const file of fs.readdirSync(subdir)) {
|
|
329
|
+
if (taskFilePattern.test(file)) {
|
|
330
|
+
areas.add(file.replace(taskFilePattern, ""));
|
|
331
|
+
}
|
|
332
|
+
}
|
|
333
|
+
}
|
|
334
|
+
// Also check top-level task files for backwards compatibility
|
|
335
|
+
if (entry.isFile() && taskFilePattern.test(entry.name)) {
|
|
336
|
+
areas.add(entry.name.replace(taskFilePattern, ""));
|
|
392
337
|
}
|
|
393
338
|
}
|
|
394
|
-
|
|
395
|
-
// Ignore — structural errors are caught by validateRubricsYaml
|
|
396
|
-
}
|
|
397
|
-
return new Set();
|
|
398
|
-
}
|
|
399
|
-
/** Safely parse a YAML file, returning the parsed value or a validation issue. */
|
|
400
|
-
function parseYamlFile(filePath, source) {
|
|
401
|
-
if (!fs.existsSync(filePath)) {
|
|
402
|
-
return {
|
|
403
|
-
issue: error(source, `File not found: ${filePath}`, filePath),
|
|
404
|
-
ok: false,
|
|
405
|
-
};
|
|
406
|
-
}
|
|
407
|
-
try {
|
|
408
|
-
const raw = fs.readFileSync(filePath, "utf-8");
|
|
409
|
-
const data = load(raw);
|
|
410
|
-
return { data, ok: true };
|
|
411
|
-
}
|
|
412
|
-
catch (err) {
|
|
413
|
-
const message = err instanceof Error ? err.message : "Unknown YAML parse error";
|
|
414
|
-
return {
|
|
415
|
-
issue: error(source, `Failed to parse YAML: ${message}`, filePath),
|
|
416
|
-
ok: false,
|
|
417
|
-
};
|
|
418
|
-
}
|
|
339
|
+
return areas;
|
|
419
340
|
}
|
|
420
341
|
// ---------------------------------------------------------------------------
|
|
421
342
|
// Main entry point
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@sanity/ailf",
|
|
3
|
-
"version": "2.
|
|
3
|
+
"version": "2.3.0",
|
|
4
4
|
"private": false,
|
|
5
5
|
"publishConfig": {
|
|
6
6
|
"access": "public"
|
|
@@ -46,6 +46,7 @@
|
|
|
46
46
|
"zod": "^4.3.6"
|
|
47
47
|
},
|
|
48
48
|
"devDependencies": {
|
|
49
|
+
"@anthropic-ai/claude-agent-sdk": "^0.2.105",
|
|
49
50
|
"@types/js-yaml": "^4.0.9",
|
|
50
51
|
"@types/node": "^22.13.1",
|
|
51
52
|
"tsx": "^4.19.2",
|