@sanity/ailf 1.0.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +0 -1
- package/config/models.ts +15 -3
- package/dist/_vendor/ailf-core/config-helpers.d.ts +14 -17
- package/dist/_vendor/ailf-core/config-helpers.js +22 -2
- package/dist/_vendor/ailf-core/examples/index.d.ts +16 -0
- package/dist/_vendor/ailf-core/examples/index.js +25 -0
- package/dist/_vendor/ailf-core/index.d.ts +2 -2
- package/dist/_vendor/ailf-core/index.js +1 -1
- package/dist/_vendor/ailf-core/ports/context.d.ts +2 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +10 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +2 -0
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +0 -2
- package/dist/_vendor/ailf-core/schemas/pipeline.js +0 -1
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +13 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +1 -3
- package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +78 -23
- package/dist/_vendor/ailf-core/types/plugin-registry.js +73 -20
- package/dist/adapters/config-sources/file-config-adapter.js +1 -0
- package/dist/adapters/config-sources/ts-config-loader.js +21 -13
- package/dist/adapters/task-sources/content-lake-task-source.js +17 -20
- package/dist/adapters/task-sources/index.d.ts +2 -2
- package/dist/adapters/task-sources/index.js +2 -2
- package/dist/adapters/task-sources/repo-schemas.d.ts +218 -16
- package/dist/adapters/task-sources/repo-schemas.js +227 -19
- package/dist/adapters/task-sources/repo-task-source.d.ts +14 -10
- package/dist/adapters/task-sources/repo-task-source.js +81 -122
- package/dist/adapters/task-sources/repo-validation.d.ts +36 -5
- package/dist/adapters/task-sources/repo-validation.js +126 -5
- package/dist/adapters/task-sources/task-file-loader.d.ts +2 -2
- package/dist/adapters/task-sources/task-file-loader.js +2 -2
- package/dist/commands/coverage-audit.js +3 -1
- package/dist/commands/init.d.ts +6 -4
- package/dist/commands/init.js +302 -23
- package/dist/commands/validate-tasks.d.ts +2 -2
- package/dist/commands/validate-tasks.js +26 -15
- package/dist/composition-root.d.ts +13 -1
- package/dist/composition-root.js +73 -41
- package/dist/index.d.ts +41 -0
- package/dist/index.js +48 -0
- package/dist/orchestration/build-step-sequence.js +4 -2
- package/dist/orchestration/steps/fetch-docs-step.js +2 -3
- package/dist/orchestration/steps/generate-configs-step.js +28 -12
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +1 -1
- package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +1 -1
- package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +1 -1
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +105 -68
- package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +33 -100
- package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
- package/dist/pipeline/compiler/literacy-bridge.js +1 -1
- package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/agent-harness.js +21 -0
- package/dist/pipeline/compiler/mode-bases/index.d.ts +4 -0
- package/dist/pipeline/compiler/mode-bases/index.js +4 -0
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +22 -0
- package/dist/pipeline/compiler/mode-bases/literacy.d.ts +12 -0
- package/dist/pipeline/compiler/mode-bases/literacy.js +78 -0
- package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/mcp-server.js +70 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +43 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +187 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +138 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +16 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +43 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.js +29 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +12 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +82 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.d.ts +4 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.js +19 -0
- package/dist/pipeline/compiler/mode-handlers/{agent-harness-handler.d.ts → agent-harness/types.d.ts} +3 -24
- package/dist/pipeline/compiler/mode-handlers/agent-harness/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.js +16 -0
- package/dist/pipeline/compiler/mode-handlers/index.d.ts +4 -5
- package/dist/pipeline/compiler/mode-handlers/index.js +4 -6
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +16 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +61 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.d.ts +18 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.js +112 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +26 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +49 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +44 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.js +24 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +18 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +118 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/compiler.d.ts +14 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +105 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +11 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/index.js +38 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/prompts.js +74 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +41 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/validation.d.ts +12 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/validation.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/{mcp-assertions.d.ts → mcp-server/assertions.d.ts} +2 -10
- package/dist/pipeline/compiler/mode-handlers/{mcp-assertions.js → mcp-server/assertions.js} +63 -6
- package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.js +100 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +27 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +54 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.d.ts +8 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.d.ts +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +104 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/types.d.ts +37 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.js +43 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.d.ts +33 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +174 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +95 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.js +172 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +14 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +16 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +93 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.js +4 -0
- package/dist/pipeline/compiler/preset-loader.d.ts +22 -0
- package/dist/pipeline/compiler/preset-loader.js +99 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +6 -9
- package/dist/pipeline/compiler/presets/sanity-literacy.js +10 -156
- package/dist/pipeline/expand-tasks.d.ts +2 -2
- package/dist/pipeline/expand-tasks.js +2 -2
- package/dist/pipeline/generate-configs.js +1 -1
- package/dist/pipeline/map-request-to-config.js +1 -0
- package/dist/pipeline/mirror-repo-tasks.d.ts +7 -7
- package/dist/pipeline/mirror-repo-tasks.js +9 -9
- package/dist/pipeline/plan.js +1 -1
- package/package.json +11 -3
- package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
- package/dist/_vendor/ailf-tasks/cli.js +0 -61
- package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
- package/dist/_vendor/ailf-tasks/index.js +0 -16
- package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
- package/dist/_vendor/ailf-tasks/parser.js +0 -73
- package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
- package/dist/_vendor/ailf-tasks/schemas.js +0 -180
- package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
- package/dist/_vendor/ailf-tasks/validation.js +0 -162
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +0 -485
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +0 -76
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +0 -245
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +0 -89
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +0 -379
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +0 -67
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +0 -309
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Assertion resolution for literacy tasks.
|
|
3
|
+
*
|
|
4
|
+
* Handles rubric template resolution, doc-coverage auto-generation,
|
|
5
|
+
* and baseline assertion filtering.
|
|
6
|
+
*/
|
|
7
|
+
// ---------------------------------------------------------------------------
|
|
8
|
+
// Assertion resolution
|
|
9
|
+
// ---------------------------------------------------------------------------
|
|
10
|
+
/**
 * Build the concrete assertion list for a literacy task.
 *
 * Entries of type `llm-rubric` that carry a `template` key are delegated to
 * `resolveTemplatedAssertion`; a falsy result is skipped. Every other entry is
 * reduced to its `type`, its `value` (when the key is present), its `weight`
 * (only when numeric) and, for rubrics, the configured grader provider.
 * When `task.docCoverage` is truthy, the assertion returned by
 * `buildDocCoverageAssertion` (if any) is appended last.
 *
 * @param task     Task definition; `assertions` and `docCoverage` are read.
 * @param options  Optional compile options; `rubricConfig` and `graderProvider` are read.
 * @param warnings Mutable list passed through to template resolution.
 * @returns Resolved assertions in task order.
 */
export function resolveAssertions(task, options, warnings) {
    const rubricConfig = options?.rubricConfig;
    const graderProvider = options?.graderProvider;
    const resolved = [];
    for (const entry of task.assertions ?? []) {
        const isRubric = entry.type === "llm-rubric";
        if (isRubric && "template" in entry) {
            const templated = resolveTemplatedAssertion(entry, rubricConfig, graderProvider, warnings);
            if (templated) {
                resolved.push(templated);
            }
            continue;
        }
        // Optional parts are built separately so absent keys never appear on the result.
        const valuePart = "value" in entry ? { value: entry.value } : {};
        const weightPart = typeof entry.weight === "number" ? { weight: entry.weight } : {};
        const providerPart = isRubric && graderProvider ? { provider: graderProvider } : {};
        resolved.push({ type: entry.type, ...valuePart, ...weightPart, ...providerPart });
    }
    if (task.docCoverage) {
        const coverage = buildDocCoverageAssertion(rubricConfig, graderProvider);
        if (coverage) {
            resolved.push(coverage);
        }
    }
    return resolved;
}
|
|
39
|
+
// ---------------------------------------------------------------------------
|
|
40
|
+
// Rubric template resolution
|
|
41
|
+
// ---------------------------------------------------------------------------
|
|
42
|
+
/**
 * Look up a rubric template by name, considering only the config's own keys.
 *
 * A plain `templates[name]` lookup would also match inherited members such as
 * "toString" or "constructor", which are truthy but are not templates.
 *
 * @returns The template object, or `undefined` when the config, its template
 *          map, or the named entry is missing.
 */
function findRubricTemplate(rubricConfig, name) {
    const templates = rubricConfig?.templates;
    if (!templates || !Object.prototype.hasOwnProperty.call(templates, name)) {
        return undefined;
    }
    return templates[name] || undefined;
}
/**
 * Render the grader prompt for a rubric template.
 *
 * @param template Rubric template (`header`, `scale`, optional `criteria_label`).
 * @param criteria Optional list of criteria; when omitted, no criteria section is rendered.
 */
function formatRubricPrompt(template, criteria) {
    const scale = Array.isArray(template.scale) ? template.scale : [];
    const scaleText = scale.map((s) => `- ${s}`).join("\n");
    let prompt = `${template.header}\n${scaleText}\n\n`;
    if (criteria) {
        const criteriaText = criteria.map((c) => `- ${c}`).join("\n");
        prompt += `${template.criteria_label ?? "Check for:"}\n${criteriaText}\n\n`;
    }
    return (prompt +
        `Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}`);
}
/**
 * Wrap a rendered rubric prompt in an `llm-rubric` assertion, attaching the
 * grader provider and the template's dimension metadata when present.
 */
function toRubricAssertion(template, rubricValue, graderProvider) {
    return {
        type: "llm-rubric",
        value: rubricValue,
        ...(graderProvider ? { provider: graderProvider } : {}),
        ...(template.dimension
            ? { metadata: { dimension: template.dimension, maxScore: 100 } }
            : {}),
    };
}
/**
 * Resolve a templated `llm-rubric` assertion against the rubric config.
 *
 * @param a              Assertion carrying `template` (template name) and `criteria`.
 * @param rubricConfig   Rubric configuration, or a falsy value when none was loaded.
 * @param graderProvider Optional provider attached to the resulting assertion.
 * @param warnings       Mutable list; a message is appended for every problem found.
 * @returns The resolved assertion, or `null` when there is no rubric config or
 *          the template name is unknown. A missing or non-array `criteria` is
 *          reported as a warning and rendered as an empty criteria list
 *          instead of throwing.
 */
function resolveTemplatedAssertion(a, rubricConfig, graderProvider, warnings) {
    if (!rubricConfig) {
        warnings.push(`No rubric config — template "${a.template}" cannot be resolved`);
        return null;
    }
    const template = findRubricTemplate(rubricConfig, a.template);
    if (!template) {
        warnings.push(`Unknown rubric template: "${a.template}"`);
        return null;
    }
    let criteria = a.criteria;
    if (!Array.isArray(criteria)) {
        warnings.push(`Rubric template "${a.template}" used without a criteria list`);
        criteria = [];
    }
    // NOTE(review): a per-assertion `weight` is not forwarded here, unlike the
    // non-templated branch of resolveAssertions — confirm this is intentional.
    return toRubricAssertion(template, formatRubricPrompt(template, criteria), graderProvider);
}
// ---------------------------------------------------------------------------
// Doc-coverage assertion
// ---------------------------------------------------------------------------
/**
 * Build the doc-coverage rubric assertion from the "doc-coverage" template.
 *
 * @returns The assertion, or `null` when the rubric config has no own
 *          "doc-coverage" template. No criteria section is rendered.
 */
function buildDocCoverageAssertion(rubricConfig, graderProvider) {
    const template = findRubricTemplate(rubricConfig, "doc-coverage");
    if (!template) {
        return null;
    }
    return toRubricAssertion(template, formatRubricPrompt(template), graderProvider);
}
|
|
85
|
+
// ---------------------------------------------------------------------------
|
|
86
|
+
// Baseline assertion filtering
|
|
87
|
+
// ---------------------------------------------------------------------------
|
|
88
|
+
/**
 * Derive the baseline test's assertions from the gold assertions.
 *
 * Rubric modes (defaulting to "full" when `rubricMode` is nullish):
 * - "none": returns an empty list.
 * - "full": returns a shallow copy of `goldAssertions`.
 * - anything else (abbreviated): returns a single shortened `llm-rubric`
 *   assertion that reuses the provider of the first gold `llm-rubric`, or an
 *   empty list when the gold set contains no `llm-rubric`.
 *
 * @param goldAssertions Assertions resolved for the gold entry.
 * @param rubricMode     Optional baseline rubric mode.
 */
export function buildBaselineAssertions(goldAssertions, rubricMode) {
    switch (rubricMode ?? "full") {
        case "none":
            return [];
        case "full":
            return [...goldAssertions];
    }
    // Abbreviated: only the first llm-rubric survives, as a summary prompt.
    const rubrics = Array.from(goldAssertions).filter((entry) => entry.type === "llm-rubric");
    if (rubrics.length === 0) {
        return [];
    }
    const [firstRubric] = rubrics;
    const providerPart = firstRubric.provider ? { provider: firstRubric.provider } : {};
    return [
        {
            type: "llm-rubric",
            value: "Score task completion from 0 to 100 (same criteria as above).\n" +
                'Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}',
            ...providerPart,
        },
    ];
}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Literacy task compilation — core compilation logic.
|
|
3
|
+
*
|
|
4
|
+
* Produces the same structure as the legacy expand-tasks.ts path:
|
|
5
|
+
* - Gold entry with with-docs prompt and canonical doc context
|
|
6
|
+
* - Baseline entry with without-docs prompt and empty docs
|
|
7
|
+
* - Rubric assertions with structured dimension metadata
|
|
8
|
+
*/
|
|
9
|
+
import type { LiteracyTaskDefinition } from "../../../../_vendor/ailf-core/index.d.ts";
|
|
10
|
+
import type { LiteracyCompileOptions, LiteracyCompileResult } from "./types.js";
|
|
11
|
+
/**
|
|
12
|
+
* Compile a literacy task into Promptfoo configuration.
|
|
13
|
+
*/
|
|
14
|
+
export declare function compileLiteracyTask(task: LiteracyTaskDefinition, options?: LiteracyCompileOptions): LiteracyCompileResult;
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Literacy task compilation — core compilation logic.
|
|
3
|
+
*
|
|
4
|
+
* Produces the same structure as the legacy expand-tasks.ts path:
|
|
5
|
+
* - Gold entry with with-docs prompt and canonical doc context
|
|
6
|
+
* - Baseline entry with without-docs prompt and empty docs
|
|
7
|
+
* - Rubric assertions with structured dimension metadata
|
|
8
|
+
*/
|
|
9
|
+
import { LiteracyVariant, } from "../../../normalize-mode.js";
|
|
10
|
+
import { buildBaselineAssertions, resolveAssertions } from "./assertions.js";
|
|
11
|
+
import { LITERACY_PROMPT_TEMPLATES } from "./prompts.js";
|
|
12
|
+
import { validateLiteracyTask } from "./validation.js";
|
|
13
|
+
/**
 * Compile a literacy task into the parts of a Promptfoo configuration.
 *
 * Validation problems do not abort compilation; each one is recorded as a
 * warning string prefixed with the task id.
 *
 * @param task    Literacy task definition.
 * @param options Optional compile options; `evalMode` defaults to
 *                `LiteracyVariant.STANDARD`.
 * @returns `{ providers, tests, prompts, warnings }`.
 */
export function compileLiteracyTask(task, options) {
    const evalMode = options?.evalMode ?? LiteracyVariant.STANDARD;
    // Seed the warning list with validation findings; test assembly may append more.
    const warnings = validateLiteracyTask(task).map((issue) => `Literacy task "${task.id}": ${issue.field} — ${issue.message}`);
    const providers = buildProviders(options);
    const prompts = buildPrompts(evalMode);
    const tests = buildTestCases(task, evalMode, options, warnings);
    return { providers, tests, prompts, warnings };
}
|
|
28
|
+
// ---------------------------------------------------------------------------
|
|
29
|
+
// Provider assembly
|
|
30
|
+
// ---------------------------------------------------------------------------
|
|
31
|
+
/**
 * Project the configured models onto provider entries.
 *
 * @param options Optional compile options; only `models` is read.
 * @returns One `{ id, label, config }` object per model, or an empty list
 *          when no models are configured.
 */
function buildProviders(options) {
    const models = options?.models;
    if (!models || !(models.length > 0)) {
        return [];
    }
    // Copy only the three provider fields; anything else on a model is dropped.
    return models.map(({ id, label, config }) => ({ id, label, config }));
}
|
|
41
|
+
// ---------------------------------------------------------------------------
|
|
42
|
+
// Prompt assembly
|
|
43
|
+
// ---------------------------------------------------------------------------
|
|
44
|
+
/**
 * Convert a prompt template into a prompt entry: `id` and `label` are copied,
 * and the template text is exposed under `raw`.
 */
function templateToPromptfoo(pt) {
    const { id, label, template: raw } = pt;
    return { id, label, raw };
}
|
|
47
|
+
/**
 * Select the prompt entries for an evaluation sub-mode.
 *
 * "agentic" uses the single agentic template; every other mode uses the
 * with-docs / without-docs pair, in that order.
 */
function buildPrompts(evalMode) {
    const templateIds = evalMode === "agentic" ? ["agentic"] : ["with-docs", "without-docs"];
    return templateIds.map((id) => templateToPromptfoo(LITERACY_PROMPT_TEMPLATES[id]));
}
|
|
56
|
+
// ---------------------------------------------------------------------------
|
|
57
|
+
// Test case assembly
|
|
58
|
+
// ---------------------------------------------------------------------------
|
|
59
|
+
/**
 * Assemble the test cases for a literacy task.
 *
 * Always emits a gold entry whose `docs` variable points at the task's
 * canonical context file (or is empty when the task lists no context docs).
 * Unless the mode is "agentic" or `task.baseline.enabled` is `false`, a
 * baseline entry with empty `docs` and the "without-docs" prompt follows.
 * Variables from `task.prompt.vars` are spread last, so they take precedence
 * over the generated `task`, `docs` and `__featureArea` values.
 *
 * @param task     Literacy task definition.
 * @param evalMode Evaluation sub-mode.
 * @param options  Compile options, forwarded to assertion resolution.
 * @param warnings Mutable warning list, forwarded to assertion resolution.
 * @returns The gold entry, optionally followed by the baseline entry.
 */
function buildTestCases(task, evalMode, options, warnings) {
    const prompt = task.prompt;
    const promptText = prompt?.text ?? prompt?.template ?? "";
    const extraVars = prompt?.vars ?? {};
    const featureArea = task.area ?? "";
    const title = task.title;
    const contextDocs = task.context?.docs ?? [];
    const canonicalDocs = contextDocs.length > 0 ? `file://contexts/canonical/${task.id}.md` : "";
    // Both entries share the same variable layout; only `docs` differs.
    const makeVars = (docs) => ({
        task: promptText,
        docs,
        __featureArea: featureArea,
        ...extraVars,
    });
    const goldAssertions = resolveAssertions(task, options, warnings);
    // Gold entry — canonical docs injected.
    const goldEntry = {
        description: `${title} (gold)`,
        vars: makeVars(canonicalDocs),
        ...(evalMode === LiteracyVariant.STANDARD
            ? { prompts: ["with-docs"] }
            : {}),
        ...(goldAssertions.length > 0 ? { assert: goldAssertions } : {}),
    };
    const wantsBaseline = evalMode !== "agentic" && task.baseline?.enabled !== false;
    if (!wantsBaseline) {
        return [goldEntry];
    }
    // Baseline entry — no docs (floor measurement).
    const baselineAssertions = buildBaselineAssertions(goldAssertions, task.baseline?.rubric);
    const baselineEntry = {
        description: `${title} (baseline)`,
        vars: makeVars(""),
        prompts: ["without-docs"],
        ...(baselineAssertions.length > 0
            ? { assert: baselineAssertions }
            : {}),
    };
    return [goldEntry, baselineEntry];
}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Literacy mode handler — compiles LiteracyTaskDefinition into Promptfoo config.
|
|
3
|
+
*
|
|
4
|
+
* @see docs/exec-plans/architecture-overhaul/phase-7-migrate-literacy.md
|
|
5
|
+
*/
|
|
6
|
+
import type { ModeHandler } from "../../../../_vendor/ailf-core/index.d.ts";
|
|
7
|
+
export { LITERACY_PROMPT_TEMPLATES } from "./prompts.js";
|
|
8
|
+
export { validateLiteracyTask, type LiteracyValidationError, } from "./validation.js";
|
|
9
|
+
export { compileLiteracyTask } from "./compiler.js";
|
|
10
|
+
export type { LiteracyCompileOptions, LiteracyCompileResult, RubricConfig, } from "./types.js";
|
|
11
|
+
export declare const handler: ModeHandler;
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Literacy mode handler — compiles LiteracyTaskDefinition into Promptfoo config.
|
|
3
|
+
*
|
|
4
|
+
* @see docs/exec-plans/architecture-overhaul/phase-7-migrate-literacy.md
|
|
5
|
+
*/
|
|
6
|
+
import { compileLiteracyTask } from "./compiler.js";
|
|
7
|
+
import { LITERACY_PROMPT_TEMPLATES } from "./prompts.js";
|
|
8
|
+
// Re-export public API
|
|
9
|
+
export { LITERACY_PROMPT_TEMPLATES } from "./prompts.js";
|
|
10
|
+
export { validateLiteracyTask, } from "./validation.js";
|
|
11
|
+
export { compileLiteracyTask } from "./compiler.js";
|
|
12
|
+
// ---------------------------------------------------------------------------
|
|
13
|
+
// ModeHandler adapter — wraps compileLiteracyTask for registry dispatch
|
|
14
|
+
// ---------------------------------------------------------------------------
|
|
15
|
+
/**
 * Mode-handler adapter for literacy tasks, used for registry dispatch.
 *
 * - `getPrompts()` returns the literacy prompt template registry.
 * - `compileTask(task, ctx)` rejects tasks whose `mode` is not "literacy",
 *   then compiles the task with options taken from the context and returns
 *   the `providers`, `tests`, `prompts` and `warnings` of the result.
 */
export const handler = {
    getPrompts: () => LITERACY_PROMPT_TEMPLATES,
    compileTask(task, ctx) {
        if (task.mode !== "literacy") {
            throw new Error(`Literacy handler received task with mode "${task.mode}" — expected "literacy"`);
        }
        const { graderProvider, rootDir, models, rubricConfig, evalMode } = ctx;
        const { providers, tests, prompts, warnings } = compileLiteracyTask(task, {
            graderProvider,
            rootDir,
            models,
            rubricConfig,
            evalMode,
        });
        return { providers, tests, prompts, warnings };
    },
};
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Canonical prompt templates for literacy-mode evaluations.
|
|
3
|
+
*
|
|
4
|
+
* These are the source-of-truth templates. Previously lived in
|
|
5
|
+
* config/prompts.ts as global templates; now handler-owned so
|
|
6
|
+
* non-literacy modes can define their own prompts without collision.
|
|
7
|
+
*/
|
|
8
|
+
import type { PromptTemplate } from "../../../../_vendor/ailf-core/index.d.ts";
|
|
9
|
+
export declare const LITERACY_PROMPT_TEMPLATES: Record<string, PromptTemplate>;
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Canonical prompt templates for literacy-mode evaluations.
|
|
3
|
+
*
|
|
4
|
+
* These are the source-of-truth templates. Previously lived in
|
|
5
|
+
* config/prompts.ts as global templates; now handler-owned so
|
|
6
|
+
* non-literacy modes can define their own prompts without collision.
|
|
7
|
+
*/
|
|
8
|
+
// Prompt bodies are kept as standalone constants so the registry below stays
// scannable. `{{docs}}` and `{{task}}` are the placeholders named in each
// entry's `variables` list.
const WITH_DOCS_PROMPT = `You are an expert Sanity.io developer. Use the following documentation to help implement the task.

## Sanity Documentation
{{docs}}

## Task
{{task}}

## Requirements

1. Use ONLY the APIs and patterns shown in the documentation
2. Provide a complete, working implementation
3. Include all necessary imports
4. Follow Sanity best practices as documented

Provide your implementation:
`;
const WITHOUT_DOCS_PROMPT = `You are an expert Sanity.io developer.

## Task
{{task}}

## Requirements

1. Provide a complete, working implementation
2. Include all necessary imports
3. Follow Sanity best practices

Provide your implementation:
`;
const AGENTIC_PROMPT = `You are an expert developer helping implement a Sanity.io feature.
You have access to web search and page fetching tools.

IMPORTANT: Before writing any code, search for and read the relevant
Sanity.io documentation to ensure you are using the latest APIs and
best practices. Do not rely on memory alone.

## Task
{{task}}

## Requirements

1. Search for relevant Sanity documentation before implementing
2. Use ONLY the APIs and patterns from the current official docs
3. Provide a complete, working implementation
4. Include all necessary imports
5. Follow Sanity best practices as documented

Provide your implementation:
`;
/**
 * Literacy prompt templates keyed by template id: "with-docs" (documentation
 * injected), "without-docs" (baseline, no documentation) and "agentic"
 * (the model is told to retrieve documentation itself).
 */
export const LITERACY_PROMPT_TEMPLATES = {
    "with-docs": {
        id: "with-docs",
        label: "With Documentation",
        template: WITH_DOCS_PROMPT,
        variables: ["docs", "task"],
    },
    "without-docs": {
        id: "without-docs",
        label: "Baseline (No Docs)",
        template: WITHOUT_DOCS_PROMPT,
        variables: ["task"],
    },
    agentic: {
        id: "agentic",
        label: "Agentic (self-retrieval)",
        template: AGENTIC_PROMPT,
        variables: ["task"],
    },
};
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared types for the literacy mode handler.
|
|
3
|
+
*/
|
|
4
|
+
import type { PromptfooPrompt, PromptfooProvider, PromptfooTestCase } from "../../promptfoo-compiler.js";
|
|
5
|
+
/**
 * Caller-supplied options for compiling a single literacy task.
 * Every field is optional.
 */
export interface LiteracyCompileOptions {
    /** Evaluation sub-mode; controls which test entries are generated. */
    evalMode?: import("../../../normalize-mode.js").LiteracyEvalSubMode;
    /** Provider used for LLM-graded assertions. */
    graderProvider?: string;
    /** Model providers to include in the compiled output. */
    models?: Array<{
        id: string;
        label: string;
        config?: Record<string, unknown>;
    }>;
    /** Root directory, for resolving `file://` doc paths. */
    rootDir?: string;
    /** Rubric configuration (templates, weights, profiles) loaded from the rubrics config. */
    rubricConfig?: RubricConfig;
}
|
|
22
|
+
/** The subset of the rubric configuration that the literacy handler reads. */
export interface RubricConfig {
    /** Rubric templates, keyed by template name. */
    templates: {
        [templateName: string]: {
            /** Text placed at the top of the rendered rubric prompt. */
            header: string;
            /** Scale lines; each is rendered as a bullet under the header. */
            scale: string[];
            /** Heading shown above the criteria list ("Check for:" when omitted). */
            criteria_label?: string;
            /** When set, attached to the compiled assertion's metadata. */
            dimension?: string;
        };
    };
}
|
|
31
|
+
/** Everything produced by compiling one literacy task. */
export interface LiteracyCompileResult {
    /** Prompts to evaluate. */
    prompts: PromptfooPrompt[];
    /** Promptfoo provider configurations. */
    providers: PromptfooProvider[];
    /** Compiled test cases: the gold entry plus an optional baseline entry. */
    tests: PromptfooTestCase[];
    /** Warnings collected while compiling. */
    warnings: string[];
}
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Validation for literacy task definitions.
|
|
3
|
+
*/
|
|
4
|
+
import type { LiteracyTaskDefinition } from "../../../../_vendor/ailf-core/index.d.ts";
|
|
5
|
+
/** A single problem found while validating a literacy task definition. */
export interface LiteracyValidationError {
    /** Human-readable description of the problem. */
    message: string;
    /** Name of the task field the problem relates to. */
    field: string;
}
|
|
9
|
+
/**
|
|
10
|
+
* Validate a literacy task definition.
|
|
11
|
+
*/
|
|
12
|
+
export declare function validateLiteracyTask(task: LiteracyTaskDefinition): LiteracyValidationError[];
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Validation for literacy task definitions.
|
|
3
|
+
*/
|
|
4
|
+
/**
 * Check a literacy task definition for missing required fields.
 *
 * Three checks run, in this order: `id`, `title`, and prompt text. The prompt
 * text is the first non-nullish of `prompt.text`, `prompt.template` and
 * `prompt.vars.task`; an empty string counts as missing.
 *
 * @param task Task definition to validate.
 * @returns One `{ field, message }` entry per failed check; empty when valid.
 */
export function validateLiteracyTask(task) {
    const prompt = task.prompt;
    const promptText = prompt?.text ?? prompt?.template ?? prompt?.vars?.task ?? "";
    // [value that must be truthy, reported field, reported message]
    const checks = [
        [task.id, "id", "Task ID is required"],
        [task.title, "title", "Task title is required"],
        [promptText, "prompt", "Task prompt text is required"],
    ];
    return checks
        .filter(([value]) => !value)
        .map(([, field, message]) => ({ field, message }));
}
|
package/dist/pipeline/compiler/mode-handlers/{mcp-assertions.d.ts → mcp-server/assertions.d.ts}
RENAMED
|
@@ -18,16 +18,8 @@
|
|
|
18
18
|
*
|
|
19
19
|
* @see docs/exec-plans/architecture-overhaul/phase-3-mcp-server-mode.md
|
|
20
20
|
*/
|
|
21
|
-
import type { PromptfooAssertion } from "
|
|
22
|
-
|
|
23
|
-
export interface MCPAssertionContext {
|
|
24
|
-
/** Task ID (for error messages) */
|
|
25
|
-
taskId: string;
|
|
26
|
-
/** Expected server capabilities */
|
|
27
|
-
capabilities: string[];
|
|
28
|
-
/** Grader provider for LLM-graded assertions */
|
|
29
|
-
graderProvider?: string;
|
|
30
|
-
}
|
|
21
|
+
import type { PromptfooAssertion } from "../../assertion-mapper.js";
|
|
22
|
+
import type { MCPAssertionContext } from "./types.js";
|
|
31
23
|
/** An AILF assertion definition — accepts both core and generalized types */
|
|
32
24
|
interface AssertionInput {
|
|
33
25
|
type: string;
|
|
@@ -83,18 +83,72 @@ function mapMCPAssertion(assertion, context, warnings) {
|
|
|
83
83
|
// ---------------------------------------------------------------------------
|
|
84
84
|
function buildToolCalledAssertion(assertion, _context) {
|
|
85
85
|
const toolName = String(assertion.value ?? "");
|
|
86
|
+
// Strategy: check multiple sources for tool call evidence.
|
|
87
|
+
// 1. context.vars.__toolCalls (structured, if Promptfoo populates it)
|
|
88
|
+
// 2. MCP_TOOLS_CALLED summary comment in the output (appended by the custom mcp-tool-provider)
|
|
89
|
+
// 3. Response output text (LLM+MCP providers embed tool_use JSON blocks)
|
|
86
90
|
return {
|
|
87
91
|
type: "javascript",
|
|
88
92
|
value: buildJsAssertion(`tool-called: ${toolName}`, `
|
|
89
|
-
|
|
90
|
-
|
|
93
|
+
var toolName = ${JSON.stringify(toolName)};
|
|
94
|
+
|
|
95
|
+
// Strategy 1: structured tool calls from Promptfoo
|
|
96
|
+
var toolCalls = context.vars.__toolCalls || [];
|
|
97
|
+
if (Array.isArray(toolCalls) && toolCalls.length > 0) {
|
|
98
|
+
var called = toolCalls.some(function(tc) { return tc.name === toolName; });
|
|
99
|
+
return {
|
|
100
|
+
pass: called,
|
|
101
|
+
score: called ? 1 : 0,
|
|
102
|
+
reason: called
|
|
103
|
+
? 'Tool "' + toolName + '" was called (via __toolCalls)'
|
|
104
|
+
: 'Expected tool "' + toolName + '" but found: ' + toolCalls.map(function(tc) { return tc.name; }).join(', '),
|
|
105
|
+
};
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
// Strategy 2: MCP_TOOLS_CALLED summary appended by custom mcp-tool-provider
|
|
109
|
+
var outputStr = typeof output === 'string' ? output : JSON.stringify(output || '');
|
|
110
|
+
var summaryMatch = outputStr.match(/<!-- MCP_TOOLS_CALLED: (\\[.*?\\]) -->/);
|
|
111
|
+
if (summaryMatch) {
|
|
112
|
+
try {
|
|
113
|
+
var calledTools = JSON.parse(summaryMatch[1]);
|
|
114
|
+
var called = calledTools.includes(toolName);
|
|
115
|
+
var count = calledTools.filter(function(n) { return n === toolName; }).length;
|
|
116
|
+
return {
|
|
117
|
+
pass: called,
|
|
118
|
+
score: called ? 1 : 0,
|
|
119
|
+
reason: called
|
|
120
|
+
? 'Tool "' + toolName + '" was called ' + count + ' time(s)'
|
|
121
|
+
: 'Expected tool "' + toolName + '" but found: ' + calledTools.join(', '),
|
|
122
|
+
};
|
|
123
|
+
} catch (e) { /* fall through to Strategy 3 */ }
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
// Strategy 3: parse output for tool_use blocks (built-in provider fallback)
|
|
127
|
+
var outputStr = typeof output === 'string' ? output : JSON.stringify(output || '');
|
|
128
|
+
var toolUsePattern = /"type"\\s*:\\s*"tool_use"[^}]*"name"\\s*:\\s*"([^"]+)"/g;
|
|
129
|
+
var foundTools = [];
|
|
130
|
+
var match;
|
|
131
|
+
while ((match = toolUsePattern.exec(outputStr)) !== null) {
|
|
132
|
+
foundTools.push(match[1]);
|
|
133
|
+
}
|
|
134
|
+
var fnCallPattern = /"function"\\s*:\\s*\\{[^}]*"name"\\s*:\\s*"([^"]+)"/g;
|
|
135
|
+
while ((match = fnCallPattern.exec(outputStr)) !== null) {
|
|
136
|
+
foundTools.push(match[1]);
|
|
137
|
+
}
|
|
138
|
+
if (foundTools.length === 0 && outputStr.includes(toolName) && outputStr.includes('tool_use')) {
|
|
139
|
+
foundTools.push(toolName);
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
var called = foundTools.includes(toolName);
|
|
91
143
|
return {
|
|
92
144
|
pass: called,
|
|
93
145
|
score: called ? 1 : 0,
|
|
94
146
|
reason: called
|
|
95
|
-
? 'Tool ' +
|
|
96
|
-
: 'Expected tool ' +
|
|
97
|
-
|
|
147
|
+
? 'Tool "' + toolName + '" was called (detected in output)'
|
|
148
|
+
: 'Expected tool "' + toolName + '" to be called. ' +
|
|
149
|
+
(foundTools.length > 0
|
|
150
|
+
? 'Tools found in output: ' + foundTools.join(', ')
|
|
151
|
+
: 'No tool calls detected in output'),
|
|
98
152
|
};`),
|
|
99
153
|
...(assertion.weight !== undefined ? { weight: assertion.weight } : {}),
|
|
100
154
|
};
|
|
@@ -273,5 +327,8 @@ function buildCapabilityAssertion(assertion, _context) {
|
|
|
273
327
|
* from Promptfoo's assertion runner.
|
|
274
328
|
*/
|
|
275
329
|
function buildJsAssertion(label, body) {
    // The result is a bare function body, deliberately not wrapped in an IIFE:
    // Promptfoo builds its own function around the assertion via
    // new Function('output', 'context', ...), so `body` has to `return` at the
    // top level for the result to reach Promptfoo's validator.
    const header = `// MCP assertion: ${label}`;
    const trimmedBody = body.trim();
    return [header, trimmedBody].join("\n");
}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MCP server task compilation — core compiler logic.
|
|
3
|
+
*
|
|
4
|
+
* Produces Promptfoo configuration from MCP server task definitions:
|
|
5
|
+
* 1. A provider config pointing to the MCP server
|
|
6
|
+
* 2. Test cases with tool-call assertions
|
|
7
|
+
* 3. Appropriate prompts for the evaluation
|
|
8
|
+
*/
|
|
9
|
+
import type { MCPServerTaskDefinition } from "../../../../_vendor/ailf-core/index.d.ts";
|
|
10
|
+
import type { MCPCompileOptions, MCPCompileResult } from "./types.js";
|
|
11
|
+
/**
 * Compile an MCP server task definition into Promptfoo configuration.
 *
 * This is the core of the MCP mode handler. It produces:
 * 1. A provider config pointing to the MCP server
 * 2. Test cases with tool-call assertions
 * 3. Appropriate prompts for the evaluation
 *
 * @param task - The MCP server task definition to compile.
 * @param options - Optional compile options; may be omitted.
 * @returns The compilation result, typed as `MCPCompileResult`.
 */
export declare function compileMCPTask(task: MCPServerTaskDefinition, options?: MCPCompileOptions): MCPCompileResult;
|