@sanity/ailf 2.0.2 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/dist/cli.js +0 -0
- package/package.json +24 -24
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +0 -10
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +0 -185
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +0 -6
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +0 -42
- package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
- package/dist/_vendor/ailf-tasks/cli.js +0 -61
- package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
- package/dist/_vendor/ailf-tasks/index.js +0 -16
- package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
- package/dist/_vendor/ailf-tasks/parser.js +0 -73
- package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
- package/dist/_vendor/ailf-tasks/schemas.js +0 -180
- package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
- package/dist/_vendor/ailf-tasks/validation.js +0 -162
- package/dist/adapters/task-sources/yaml-task-source.d.ts +0 -18
- package/dist/adapters/task-sources/yaml-task-source.js +0 -139
- package/dist/agent-observer/test-imports.d.ts +0 -7
- package/dist/agent-observer/test-imports.js +0 -185
- package/dist/commands/update-quality-scores.d.ts +0 -5
- package/dist/commands/update-quality-scores.js +0 -20
- package/dist/lib/agent-behavior-report.d.ts +0 -8
- package/dist/lib/agent-behavior-report.js +0 -185
- package/dist/lib/baseline.d.ts +0 -19
- package/dist/lib/baseline.js +0 -153
- package/dist/lib/calculate-scores.d.ts +0 -23
- package/dist/lib/calculate-scores.js +0 -42
- package/dist/lib/compare.d.ts +0 -18
- package/dist/lib/compare.js +0 -170
- package/dist/lib/coverage-audit.d.ts +0 -4
- package/dist/lib/coverage-audit.js +0 -42
- package/dist/lib/discovery-report.d.ts +0 -13
- package/dist/lib/discovery-report.js +0 -57
- package/dist/lib/fetch-docs.d.ts +0 -30
- package/dist/lib/fetch-docs.js +0 -171
- package/dist/lib/generate-configs.d.ts +0 -25
- package/dist/lib/generate-configs.js +0 -42
- package/dist/lib/grader-api.d.ts +0 -21
- package/dist/lib/grader-api.js +0 -34
- package/dist/lib/grader-compare.d.ts +0 -19
- package/dist/lib/grader-compare.js +0 -91
- package/dist/lib/grader-consistency.d.ts +0 -27
- package/dist/lib/grader-consistency.js +0 -79
- package/dist/lib/grader-sensitivity.d.ts +0 -19
- package/dist/lib/grader-sensitivity.js +0 -75
- package/dist/lib/grader-validate.d.ts +0 -19
- package/dist/lib/grader-validate.js +0 -78
- package/dist/lib/measure-retrieval.d.ts +0 -14
- package/dist/lib/measure-retrieval.js +0 -71
- package/dist/lib/pr-comment.d.ts +0 -16
- package/dist/lib/pr-comment.js +0 -28
- package/dist/lib/readiness-report.d.ts +0 -13
- package/dist/lib/readiness-report.js +0 -108
- package/dist/lib/webhook-server.d.ts +0 -11
- package/dist/lib/webhook-server.js +0 -24
- package/dist/lib/weekly-digest.d.ts +0 -24
- package/dist/lib/weekly-digest.js +0 -148
- package/dist/orchestration/env-bridge.d.ts +0 -21
- package/dist/orchestration/env-bridge.js +0 -66
- package/dist/orchestration/steps/fetch-docs-shell.d.ts +0 -17
- package/dist/orchestration/steps/fetch-docs-shell.js +0 -30
- package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +0 -9
- package/dist/pipeline/compiler/__tests__/task-bridge.test.js +0 -339
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +0 -70
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +0 -485
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +0 -76
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +0 -245
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +0 -89
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +0 -379
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +0 -50
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +0 -334
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +0 -69
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +0 -307
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +0 -65
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +0 -368
- package/dist/pipeline/compiler/task-bridge.d.ts +0 -41
- package/dist/pipeline/compiler/task-bridge.js +0 -92
- package/dist/pipeline/expand-tasks.d.ts +0 -232
- package/dist/pipeline/expand-tasks.js +0 -467
- package/dist/pipeline/generate-configs.d.ts +0 -92
- package/dist/pipeline/generate-configs.js +0 -445
- package/dist/pipeline/steps/calculate-scores-step.d.ts +0 -11
- package/dist/pipeline/steps/calculate-scores-step.js +0 -89
- package/dist/pipeline/steps/compare-step.d.ts +0 -18
- package/dist/pipeline/steps/compare-step.js +0 -90
- package/dist/pipeline/steps/eval-step.d.ts +0 -53
- package/dist/pipeline/steps/eval-step.js +0 -347
- package/dist/pipeline/steps/fetch-docs-step.d.ts +0 -11
- package/dist/pipeline/steps/fetch-docs-step.js +0 -84
- package/dist/pipeline/steps/generate-configs-step.d.ts +0 -11
- package/dist/pipeline/steps/generate-configs-step.js +0 -98
- package/dist/pipeline/steps/grader-consistency-step.d.ts +0 -21
- package/dist/pipeline/steps/grader-consistency-step.js +0 -74
- package/dist/pipeline/steps/publish-report-step.d.ts +0 -57
- package/dist/pipeline/steps/publish-report-step.js +0 -243
- package/dist/pipeline/steps/report-step.d.ts +0 -13
- package/dist/pipeline/steps/report-step.js +0 -56
- package/dist/pipeline/steps/update-scores-step.d.ts +0 -11
- package/dist/pipeline/steps/update-scores-step.js +0 -42
- package/dist/scripts/agent-behavior-report.d.ts +0 -19
- package/dist/scripts/agent-behavior-report.js +0 -315
- package/dist/scripts/baseline.d.ts +0 -43
- package/dist/scripts/baseline.js +0 -267
- package/dist/scripts/calculate-scores.d.ts +0 -166
- package/dist/scripts/calculate-scores.js +0 -1296
- package/dist/scripts/compare.d.ts +0 -22
- package/dist/scripts/compare.js +0 -334
- package/dist/scripts/coverage-audit.d.ts +0 -44
- package/dist/scripts/coverage-audit.js +0 -209
- package/dist/scripts/debug-eval.d.ts +0 -19
- package/dist/scripts/debug-eval.js +0 -73
- package/dist/scripts/discovery-report.d.ts +0 -58
- package/dist/scripts/discovery-report.js +0 -250
- package/dist/scripts/fetch-docs.d.ts +0 -35
- package/dist/scripts/fetch-docs.js +0 -472
- package/dist/scripts/generate-configs.d.ts +0 -66
- package/dist/scripts/generate-configs.js +0 -459
- package/dist/scripts/grader-api.d.ts +0 -27
- package/dist/scripts/grader-api.js +0 -206
- package/dist/scripts/grader-compare.d.ts +0 -22
- package/dist/scripts/grader-compare.js +0 -368
- package/dist/scripts/grader-consistency.d.ts +0 -20
- package/dist/scripts/grader-consistency.js +0 -313
- package/dist/scripts/grader-sensitivity.d.ts +0 -22
- package/dist/scripts/grader-sensitivity.js +0 -354
- package/dist/scripts/grader-validate.d.ts +0 -19
- package/dist/scripts/grader-validate.js +0 -267
- package/dist/scripts/measure-retrieval.d.ts +0 -10
- package/dist/scripts/measure-retrieval.js +0 -145
- package/dist/scripts/migrate-tasks-to-content-lake.d.ts +0 -24
- package/dist/scripts/migrate-tasks-to-content-lake.js +0 -328
- package/dist/scripts/pipeline.d.ts +0 -76
- package/dist/scripts/pipeline.js +0 -1031
- package/dist/scripts/pr-comment.d.ts +0 -10
- package/dist/scripts/pr-comment.js +0 -510
- package/dist/scripts/readiness-report.d.ts +0 -88
- package/dist/scripts/readiness-report.js +0 -342
- package/dist/scripts/update-quality-scores.d.ts +0 -15
- package/dist/scripts/update-quality-scores.js +0 -184
- package/dist/scripts/validate-task-sources.d.ts +0 -21
- package/dist/scripts/validate-task-sources.js +0 -210
- package/dist/scripts/validate.d.ts +0 -13
- package/dist/scripts/validate.js +0 -79
- package/dist/scripts/webhook-server.d.ts +0 -26
- package/dist/scripts/webhook-server.js +0 -147
- package/dist/scripts/weekly-digest.d.ts +0 -24
- package/dist/scripts/weekly-digest.js +0 -144
- package/dist/sinks/format-slack.d.ts +0 -64
- package/dist/sinks/format-slack.js +0 -306
- package/dist/sinks/slack-sink.d.ts +0 -27
- package/dist/sinks/slack-sink.js +0 -78
- package/dist/sinks/webhook-sink.d.ts +0 -19
- package/dist/sinks/webhook-sink.js +0 -50
- package/tasks/.expanded.agentic.yaml +0 -280
- package/tasks/.expanded.yaml +0 -565
|
@@ -1,30 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Shell delegation for the fetch-docs step.
|
|
3
|
-
*
|
|
4
|
-
* Isolates the execSync call so it can be replaced when the pipeline
|
|
5
|
-
* fully migrates to the DocFetcher port.
|
|
6
|
-
*/
|
|
7
|
-
import { execSync } from "child_process";
|
|
8
|
-
/**
|
|
9
|
-
* Run `pnpm fetch-docs` via shell.
|
|
10
|
-
*
|
|
11
|
-
* Returns a result object instead of throwing so the step can
|
|
12
|
-
* handle the failure uniformly.
|
|
13
|
-
*/
|
|
14
|
-
export function runFetchDocsShell(rootDir, source) {
|
|
15
|
-
try {
|
|
16
|
-
const sourceArg = source ? ` --source ${source}` : "";
|
|
17
|
-
execSync(`pnpm fetch-docs${sourceArg}`, {
|
|
18
|
-
cwd: rootDir,
|
|
19
|
-
env: process.env,
|
|
20
|
-
stdio: "inherit",
|
|
21
|
-
});
|
|
22
|
-
return { ok: true };
|
|
23
|
-
}
|
|
24
|
-
catch (err) {
|
|
25
|
-
return {
|
|
26
|
-
ok: false,
|
|
27
|
-
error: err instanceof Error ? err.message : String(err),
|
|
28
|
-
};
|
|
29
|
-
}
|
|
30
|
-
}
|
|
@@ -1,9 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* task-bridge.test.ts — Unit tests for the TaskDefinition ↔ LiteracyTaskDefinition bridge.
|
|
3
|
-
*
|
|
4
|
-
* Covers round-trip fidelity, edge cases (missing optionals, all optionals),
|
|
5
|
-
* assertion type mapping, and all four CanonicalDocRef / GeneralizedDocRef variants.
|
|
6
|
-
*
|
|
7
|
-
* Run: npx tsx --test src/pipeline/compiler/__tests__/task-bridge.test.ts
|
|
8
|
-
*/
|
|
9
|
-
export {};
|
|
@@ -1,339 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* task-bridge.test.ts — Unit tests for the TaskDefinition ↔ LiteracyTaskDefinition bridge.
|
|
3
|
-
*
|
|
4
|
-
* Covers round-trip fidelity, edge cases (missing optionals, all optionals),
|
|
5
|
-
* assertion type mapping, and all four CanonicalDocRef / GeneralizedDocRef variants.
|
|
6
|
-
*
|
|
7
|
-
* Run: npx tsx --test src/pipeline/compiler/__tests__/task-bridge.test.ts
|
|
8
|
-
*/
|
|
9
|
-
import assert from "node:assert/strict";
|
|
10
|
-
import { describe, it } from "node:test";
|
|
11
|
-
import { toGeneralized, toLiteracyTask } from "../task-bridge.js";
|
|
12
|
-
// ---------------------------------------------------------------------------
|
|
13
|
-
// Fixtures
|
|
14
|
-
// ---------------------------------------------------------------------------
|
|
15
|
-
/** Minimal old-style task — only required fields, no optionals */
|
|
16
|
-
const minimalOldTask = {
|
|
17
|
-
id: "groq-filter-basic",
|
|
18
|
-
description: "Filter documents with GROQ",
|
|
19
|
-
featureArea: "groq",
|
|
20
|
-
taskPrompt: "Write a GROQ query that filters by _type",
|
|
21
|
-
canonicalDocs: [],
|
|
22
|
-
referenceSolution: "solutions/groq-filter-basic.ts",
|
|
23
|
-
docCoverage: false,
|
|
24
|
-
assertions: [],
|
|
25
|
-
};
|
|
26
|
-
/** Fully-populated old-style task — every optional filled */
|
|
27
|
-
const fullOldTask = {
|
|
28
|
-
id: "mutations-create-advanced",
|
|
29
|
-
description: "Create documents with mutations API",
|
|
30
|
-
featureArea: "mutations",
|
|
31
|
-
taskPrompt: "Use the mutations API to create a document with references",
|
|
32
|
-
canonicalDocs: [
|
|
33
|
-
{ slug: "mutations-overview", reason: "Primary mutations guide" },
|
|
34
|
-
{ path: "/docs/mutations/create", reason: "Create-specific docs" },
|
|
35
|
-
{
|
|
36
|
-
id: "doc-123",
|
|
37
|
-
reason: "Imported draft",
|
|
38
|
-
slug: "draft-slug",
|
|
39
|
-
path: "/docs/draft",
|
|
40
|
-
},
|
|
41
|
-
{ perspective: "release-v3", reason: "V3 release content" },
|
|
42
|
-
],
|
|
43
|
-
referenceSolution: "solutions/mutations-create-advanced.ts",
|
|
44
|
-
docCoverage: true,
|
|
45
|
-
assertions: [
|
|
46
|
-
{
|
|
47
|
-
type: "llm-rubric",
|
|
48
|
-
template: "code-quality",
|
|
49
|
-
criteria: ["correct", "idiomatic"],
|
|
50
|
-
weight: 2,
|
|
51
|
-
},
|
|
52
|
-
{ type: "contains", value: "createIfNotExists" },
|
|
53
|
-
{ type: "javascript", value: "output.includes('mutation')", weight: 1 },
|
|
54
|
-
],
|
|
55
|
-
baseline: { enabled: true, rubric: "full" },
|
|
56
|
-
tags: ["mutations", "advanced", "references"],
|
|
57
|
-
status: "active",
|
|
58
|
-
extraVars: { customHint: "Use createIfNotExists", maxRetries: 3 },
|
|
59
|
-
};
|
|
60
|
-
/** Minimal new-style literacy task — only required/mode fields */
|
|
61
|
-
const minimalNewTask = {
|
|
62
|
-
mode: "literacy",
|
|
63
|
-
id: "studio-config-basic",
|
|
64
|
-
title: "Configure a Sanity Studio",
|
|
65
|
-
};
|
|
66
|
-
/** Fully-populated new-style literacy task (only fields that round-trip through old type) */
|
|
67
|
-
const fullNewTask = {
|
|
68
|
-
mode: "literacy",
|
|
69
|
-
id: "groq-projection-advanced",
|
|
70
|
-
title: "Advanced GROQ projections",
|
|
71
|
-
area: "groq",
|
|
72
|
-
tags: ["groq", "projections"],
|
|
73
|
-
status: "active",
|
|
74
|
-
assertions: [
|
|
75
|
-
{
|
|
76
|
-
type: "llm-rubric",
|
|
77
|
-
template: "completeness",
|
|
78
|
-
criteria: ["covers edge cases"],
|
|
79
|
-
weight: 3,
|
|
80
|
-
},
|
|
81
|
-
{ type: "contains", value: "coalesce" },
|
|
82
|
-
],
|
|
83
|
-
prompt: {
|
|
84
|
-
text: "Write a GROQ query using projections with coalesce",
|
|
85
|
-
vars: { difficulty: "advanced", topic: "projections" },
|
|
86
|
-
},
|
|
87
|
-
context: {
|
|
88
|
-
docs: [
|
|
89
|
-
{ slug: "groq-projections", reason: "Projection docs" },
|
|
90
|
-
{ path: "/docs/groq/projections", reason: "Path-based ref" },
|
|
91
|
-
{ id: "groq-doc-456", reason: "By ID" },
|
|
92
|
-
{ perspective: "release-groq-v2", reason: "GROQ v2 release" },
|
|
93
|
-
],
|
|
94
|
-
},
|
|
95
|
-
referenceSolution: "solutions/groq-projection-advanced.ts",
|
|
96
|
-
docCoverage: true,
|
|
97
|
-
baseline: { enabled: false, rubric: "abbreviated" },
|
|
98
|
-
};
|
|
99
|
-
// ---------------------------------------------------------------------------
|
|
100
|
-
// toGeneralized — old TaskDefinition → LiteracyTaskDefinition
|
|
101
|
-
// ---------------------------------------------------------------------------
|
|
102
|
-
describe("toGeneralized", () => {
|
|
103
|
-
it("converts a minimal old task to a LiteracyTaskDefinition", () => {
|
|
104
|
-
const result = toGeneralized(minimalOldTask);
|
|
105
|
-
assert.equal(result.mode, "literacy");
|
|
106
|
-
assert.equal(result.id, "groq-filter-basic");
|
|
107
|
-
assert.equal(result.title, "Filter documents with GROQ");
|
|
108
|
-
assert.equal(result.area, "groq");
|
|
109
|
-
assert.equal(result.referenceSolution, "solutions/groq-filter-basic.ts");
|
|
110
|
-
assert.equal(result.docCoverage, false);
|
|
111
|
-
assert.deepEqual(result.assertions, []);
|
|
112
|
-
assert.deepEqual(result.context?.docs, []);
|
|
113
|
-
assert.equal(result.prompt?.text, "Write a GROQ query that filters by _type");
|
|
114
|
-
});
|
|
115
|
-
it("converts a fully-populated old task preserving all fields", () => {
|
|
116
|
-
const result = toGeneralized(fullOldTask);
|
|
117
|
-
assert.equal(result.mode, "literacy");
|
|
118
|
-
assert.equal(result.id, "mutations-create-advanced");
|
|
119
|
-
assert.equal(result.title, "Create documents with mutations API");
|
|
120
|
-
assert.equal(result.area, "mutations");
|
|
121
|
-
assert.equal(result.referenceSolution, "solutions/mutations-create-advanced.ts");
|
|
122
|
-
assert.equal(result.docCoverage, true);
|
|
123
|
-
assert.deepEqual(result.baseline, { enabled: true, rubric: "full" });
|
|
124
|
-
assert.deepEqual(result.tags, ["mutations", "advanced", "references"]);
|
|
125
|
-
assert.equal(result.status, "active");
|
|
126
|
-
assert.deepEqual(result.prompt?.vars, {
|
|
127
|
-
customHint: "Use createIfNotExists",
|
|
128
|
-
maxRetries: 3,
|
|
129
|
-
});
|
|
130
|
-
assert.equal(result.prompt?.text, "Use the mutations API to create a document with references");
|
|
131
|
-
});
|
|
132
|
-
it("does not set optional fields when absent in old task", () => {
|
|
133
|
-
const result = toGeneralized(minimalOldTask);
|
|
134
|
-
assert.equal(result.baseline, undefined);
|
|
135
|
-
assert.equal(result.tags, undefined);
|
|
136
|
-
assert.equal(result.status, undefined);
|
|
137
|
-
assert.equal(result.prompt?.vars, undefined);
|
|
138
|
-
});
|
|
139
|
-
});
|
|
140
|
-
// ---------------------------------------------------------------------------
|
|
141
|
-
// toLiteracyTask — LiteracyTaskDefinition → old TaskDefinition
|
|
142
|
-
// ---------------------------------------------------------------------------
|
|
143
|
-
describe("toLiteracyTask", () => {
|
|
144
|
-
it("converts a minimal new task to a TaskDefinition", () => {
|
|
145
|
-
const result = toLiteracyTask(minimalNewTask);
|
|
146
|
-
assert.equal(result.id, "studio-config-basic");
|
|
147
|
-
assert.equal(result.description, "Configure a Sanity Studio");
|
|
148
|
-
assert.equal(result.featureArea, "");
|
|
149
|
-
assert.equal(result.taskPrompt, "");
|
|
150
|
-
assert.deepEqual(result.canonicalDocs, []);
|
|
151
|
-
assert.equal(result.referenceSolution, "");
|
|
152
|
-
assert.equal(result.docCoverage, false);
|
|
153
|
-
assert.deepEqual(result.assertions, []);
|
|
154
|
-
});
|
|
155
|
-
it("converts a fully-populated new task preserving all mappable fields", () => {
|
|
156
|
-
const result = toLiteracyTask(fullNewTask);
|
|
157
|
-
assert.equal(result.id, "groq-projection-advanced");
|
|
158
|
-
assert.equal(result.description, "Advanced GROQ projections");
|
|
159
|
-
assert.equal(result.featureArea, "groq");
|
|
160
|
-
assert.equal(result.taskPrompt, "Write a GROQ query using projections with coalesce");
|
|
161
|
-
assert.equal(result.referenceSolution, "solutions/groq-projection-advanced.ts");
|
|
162
|
-
assert.equal(result.docCoverage, true);
|
|
163
|
-
assert.deepEqual(result.baseline, { enabled: false, rubric: "abbreviated" });
|
|
164
|
-
assert.deepEqual(result.tags, ["groq", "projections"]);
|
|
165
|
-
assert.equal(result.status, "active");
|
|
166
|
-
assert.deepEqual(result.extraVars, {
|
|
167
|
-
difficulty: "advanced",
|
|
168
|
-
topic: "projections",
|
|
169
|
-
});
|
|
170
|
-
});
|
|
171
|
-
it("uses prompt.template as fallback when prompt.text is absent", () => {
|
|
172
|
-
const task = {
|
|
173
|
-
mode: "literacy",
|
|
174
|
-
id: "template-task",
|
|
175
|
-
title: "Template-based task",
|
|
176
|
-
prompt: { template: "my-named-template" },
|
|
177
|
-
};
|
|
178
|
-
const result = toLiteracyTask(task);
|
|
179
|
-
assert.equal(result.taskPrompt, "my-named-template");
|
|
180
|
-
});
|
|
181
|
-
it("does not set optional fields when absent in new task", () => {
|
|
182
|
-
const result = toLiteracyTask(minimalNewTask);
|
|
183
|
-
assert.equal(result.baseline, undefined);
|
|
184
|
-
assert.equal(result.tags, undefined);
|
|
185
|
-
assert.equal(result.status, undefined);
|
|
186
|
-
assert.equal(result.extraVars, undefined);
|
|
187
|
-
});
|
|
188
|
-
});
|
|
189
|
-
// ---------------------------------------------------------------------------
|
|
190
|
-
// Round-trip: old → new → old (must be lossless)
|
|
191
|
-
// ---------------------------------------------------------------------------
|
|
192
|
-
describe("round-trip: toLiteracyTask(toGeneralized(oldTask))", () => {
|
|
193
|
-
it("preserves all fields of a minimal old task", () => {
|
|
194
|
-
const roundTripped = toLiteracyTask(toGeneralized(minimalOldTask));
|
|
195
|
-
assert.deepEqual(roundTripped, minimalOldTask);
|
|
196
|
-
});
|
|
197
|
-
it("preserves all fields of a fully-populated old task", () => {
|
|
198
|
-
const roundTripped = toLiteracyTask(toGeneralized(fullOldTask));
|
|
199
|
-
assert.deepEqual(roundTripped, fullOldTask);
|
|
200
|
-
});
|
|
201
|
-
});
|
|
202
|
-
// ---------------------------------------------------------------------------
|
|
203
|
-
// Round-trip: new → old → new (lossless for mappable fields)
|
|
204
|
-
// ---------------------------------------------------------------------------
|
|
205
|
-
describe("round-trip: toGeneralized(toLiteracyTask(newTask))", () => {
|
|
206
|
-
it("preserves all fields of a minimal new task", () => {
|
|
207
|
-
const roundTripped = toGeneralized(toLiteracyTask(minimalNewTask));
|
|
208
|
-
assert.equal(roundTripped.mode, "literacy");
|
|
209
|
-
assert.equal(roundTripped.id, minimalNewTask.id);
|
|
210
|
-
assert.equal(roundTripped.title, minimalNewTask.title);
|
|
211
|
-
});
|
|
212
|
-
it("preserves all mappable fields of a fully-populated new task", () => {
|
|
213
|
-
const roundTripped = toGeneralized(toLiteracyTask(fullNewTask));
|
|
214
|
-
assert.equal(roundTripped.mode, "literacy");
|
|
215
|
-
assert.equal(roundTripped.id, fullNewTask.id);
|
|
216
|
-
assert.equal(roundTripped.title, fullNewTask.title);
|
|
217
|
-
assert.equal(roundTripped.area, fullNewTask.area);
|
|
218
|
-
assert.deepEqual(roundTripped.tags, fullNewTask.tags);
|
|
219
|
-
assert.equal(roundTripped.status, fullNewTask.status);
|
|
220
|
-
assert.deepEqual(roundTripped.assertions, fullNewTask.assertions);
|
|
221
|
-
assert.equal(roundTripped.prompt?.text, fullNewTask.prompt?.text);
|
|
222
|
-
assert.deepEqual(roundTripped.prompt?.vars, fullNewTask.prompt?.vars);
|
|
223
|
-
assert.deepEqual(roundTripped.context?.docs, fullNewTask.context?.docs);
|
|
224
|
-
assert.equal(roundTripped.referenceSolution, fullNewTask.referenceSolution);
|
|
225
|
-
assert.equal(roundTripped.docCoverage, fullNewTask.docCoverage);
|
|
226
|
-
assert.deepEqual(roundTripped.baseline, fullNewTask.baseline);
|
|
227
|
-
});
|
|
228
|
-
});
|
|
229
|
-
// ---------------------------------------------------------------------------
|
|
230
|
-
// CanonicalDocRef ↔ GeneralizedDocRef mapping (all 4 variants)
|
|
231
|
-
// ---------------------------------------------------------------------------
|
|
232
|
-
describe("doc ref mapping", () => {
|
|
233
|
-
const slugRef = {
|
|
234
|
-
slug: "my-article",
|
|
235
|
-
reason: "testing slug",
|
|
236
|
-
};
|
|
237
|
-
const pathRef = {
|
|
238
|
-
path: "/docs/my-article",
|
|
239
|
-
reason: "testing path",
|
|
240
|
-
};
|
|
241
|
-
const idRef = {
|
|
242
|
-
id: "abc-123",
|
|
243
|
-
reason: "testing id",
|
|
244
|
-
slug: "annotated-slug",
|
|
245
|
-
path: "/docs/annotated",
|
|
246
|
-
};
|
|
247
|
-
const perspectiveRef = {
|
|
248
|
-
perspective: "release-v4",
|
|
249
|
-
reason: "testing perspective",
|
|
250
|
-
};
|
|
251
|
-
it("preserves slug ref through round-trip", () => {
|
|
252
|
-
const task = { ...minimalOldTask, canonicalDocs: [slugRef] };
|
|
253
|
-
const roundTripped = toLiteracyTask(toGeneralized(task));
|
|
254
|
-
assert.deepEqual(roundTripped.canonicalDocs, [slugRef]);
|
|
255
|
-
});
|
|
256
|
-
it("preserves path ref through round-trip", () => {
|
|
257
|
-
const task = { ...minimalOldTask, canonicalDocs: [pathRef] };
|
|
258
|
-
const roundTripped = toLiteracyTask(toGeneralized(task));
|
|
259
|
-
assert.deepEqual(roundTripped.canonicalDocs, [pathRef]);
|
|
260
|
-
});
|
|
261
|
-
it("preserves id ref (with optional slug/path annotations) through round-trip", () => {
|
|
262
|
-
const task = { ...minimalOldTask, canonicalDocs: [idRef] };
|
|
263
|
-
const roundTripped = toLiteracyTask(toGeneralized(task));
|
|
264
|
-
assert.deepEqual(roundTripped.canonicalDocs, [idRef]);
|
|
265
|
-
});
|
|
266
|
-
it("preserves perspective ref through round-trip", () => {
|
|
267
|
-
const task = {
|
|
268
|
-
...minimalOldTask,
|
|
269
|
-
canonicalDocs: [perspectiveRef],
|
|
270
|
-
};
|
|
271
|
-
const roundTripped = toLiteracyTask(toGeneralized(task));
|
|
272
|
-
assert.deepEqual(roundTripped.canonicalDocs, [perspectiveRef]);
|
|
273
|
-
});
|
|
274
|
-
it("preserves all 4 ref variants together through round-trip", () => {
|
|
275
|
-
const allRefs = [slugRef, pathRef, idRef, perspectiveRef];
|
|
276
|
-
const task = { ...minimalOldTask, canonicalDocs: allRefs };
|
|
277
|
-
const roundTripped = toLiteracyTask(toGeneralized(task));
|
|
278
|
-
assert.deepEqual(roundTripped.canonicalDocs, allRefs);
|
|
279
|
-
});
|
|
280
|
-
});
|
|
281
|
-
// ---------------------------------------------------------------------------
|
|
282
|
-
// Assertion type mapping
|
|
283
|
-
// ---------------------------------------------------------------------------
|
|
284
|
-
describe("assertion type mapping", () => {
|
|
285
|
-
const templatedAssertion = {
|
|
286
|
-
type: "llm-rubric",
|
|
287
|
-
template: "code-quality",
|
|
288
|
-
criteria: ["correct", "idiomatic", "secure"],
|
|
289
|
-
weight: 2,
|
|
290
|
-
};
|
|
291
|
-
const valueAssertion = {
|
|
292
|
-
type: "contains",
|
|
293
|
-
value: "createDocument",
|
|
294
|
-
};
|
|
295
|
-
const jsAssertion = {
|
|
296
|
-
type: "javascript",
|
|
297
|
-
value: "output.includes('done')",
|
|
298
|
-
weight: 1,
|
|
299
|
-
};
|
|
300
|
-
it("preserves templated assertions through old→new→old round-trip", () => {
|
|
301
|
-
const task = {
|
|
302
|
-
...minimalOldTask,
|
|
303
|
-
assertions: [templatedAssertion],
|
|
304
|
-
};
|
|
305
|
-
const roundTripped = toLiteracyTask(toGeneralized(task));
|
|
306
|
-
assert.deepEqual(roundTripped.assertions, [templatedAssertion]);
|
|
307
|
-
});
|
|
308
|
-
it("preserves value assertions through old→new→old round-trip", () => {
|
|
309
|
-
const task = {
|
|
310
|
-
...minimalOldTask,
|
|
311
|
-
assertions: [valueAssertion],
|
|
312
|
-
};
|
|
313
|
-
const roundTripped = toLiteracyTask(toGeneralized(task));
|
|
314
|
-
assert.deepEqual(roundTripped.assertions, [valueAssertion]);
|
|
315
|
-
});
|
|
316
|
-
it("preserves mixed assertion types through old→new→old round-trip", () => {
|
|
317
|
-
const mixed = [templatedAssertion, valueAssertion, jsAssertion];
|
|
318
|
-
const task = { ...minimalOldTask, assertions: mixed };
|
|
319
|
-
const roundTripped = toLiteracyTask(toGeneralized(task));
|
|
320
|
-
assert.deepEqual(roundTripped.assertions, mixed);
|
|
321
|
-
});
|
|
322
|
-
it("preserves assertions through new→old→new round-trip", () => {
|
|
323
|
-
const genAssertions = [
|
|
324
|
-
{
|
|
325
|
-
type: "llm-rubric",
|
|
326
|
-
template: "completeness",
|
|
327
|
-
criteria: ["thorough"],
|
|
328
|
-
weight: 1,
|
|
329
|
-
},
|
|
330
|
-
{ type: "regex", value: "^import.*sanity" },
|
|
331
|
-
];
|
|
332
|
-
const task = {
|
|
333
|
-
...minimalNewTask,
|
|
334
|
-
assertions: genAssertions,
|
|
335
|
-
};
|
|
336
|
-
const roundTripped = toGeneralized(toLiteracyTask(task));
|
|
337
|
-
assert.deepEqual(roundTripped.assertions, genAssertions);
|
|
338
|
-
});
|
|
339
|
-
});
|
|
@@ -1,70 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* AgentHarnessModeHandler — compilation rules for `agent-harness` mode.
|
|
3
|
-
*
|
|
4
|
-
* Maps agent harness task definitions to Promptfoo configuration with:
|
|
5
|
-
* - Claude Agent SDK / OpenAI Codex SDK providers
|
|
6
|
-
* - Tool permission configuration (preset/allowed/disallowed)
|
|
7
|
-
* - Sandbox setup/teardown via Promptfoo extensions
|
|
8
|
-
* - Fixture provisioning into sandbox working directory
|
|
9
|
-
*
|
|
10
|
-
* @see docs/exec-plans/architecture-overhaul/phase-4-agent-harness.md
|
|
11
|
-
* @see packages/core/src/types/generalized-task.ts — AgentHarnessTaskDefinition
|
|
12
|
-
*/
|
|
13
|
-
import type { AgentHarnessTaskDefinition, ModeHandler, PromptTemplate } from "../../../_vendor/ailf-core/index.d.ts";
|
|
14
|
-
import type { PromptfooPrompt, PromptfooProvider, PromptfooTestCase } from "../promptfoo-compiler.js";
|
|
15
|
-
import type { SandboxType } from "../sandbox/sandbox-strategy.js";
|
|
16
|
-
export declare const AGENT_HARNESS_PROMPT_TEMPLATES: Record<string, PromptTemplate>;
|
|
17
|
-
/** Options for compiling an agent harness task */
|
|
18
|
-
export interface AgentHarnessCompileOptions {
|
|
19
|
-
/** Grader provider for LLM-graded assertions */
|
|
20
|
-
graderProvider?: string;
|
|
21
|
-
/** Root directory for fixture resolution */
|
|
22
|
-
rootDir?: string;
|
|
23
|
-
}
|
|
24
|
-
/** Result of compiling a single agent harness task */
|
|
25
|
-
export interface AgentHarnessCompileResult {
|
|
26
|
-
/** Promptfoo provider config */
|
|
27
|
-
providers: PromptfooProvider[];
|
|
28
|
-
/** Compiled test cases */
|
|
29
|
-
tests: PromptfooTestCase[];
|
|
30
|
-
/** Prompts for evaluation */
|
|
31
|
-
prompts: PromptfooPrompt[];
|
|
32
|
-
/** Promptfoo extensions for sandbox lifecycle */
|
|
33
|
-
extensions: PromptfooExtension[];
|
|
34
|
-
/** Sandbox configuration metadata */
|
|
35
|
-
sandboxConfig: SandboxConfigMeta;
|
|
36
|
-
/** Warnings generated during compilation */
|
|
37
|
-
warnings: string[];
|
|
38
|
-
}
|
|
39
|
-
/** Promptfoo extension hook */
|
|
40
|
-
export interface PromptfooExtension {
|
|
41
|
-
type: "afterEach" | "beforeEach";
|
|
42
|
-
/** JavaScript code or module path for the hook */
|
|
43
|
-
code: string;
|
|
44
|
-
}
|
|
45
|
-
/** Metadata about sandbox configuration for this task */
|
|
46
|
-
export interface SandboxConfigMeta {
|
|
47
|
-
type: SandboxType;
|
|
48
|
-
image?: string;
|
|
49
|
-
fixtures: string[];
|
|
50
|
-
limits?: {
|
|
51
|
-
cpus?: number;
|
|
52
|
-
memoryBytes?: number;
|
|
53
|
-
networkAccess?: boolean;
|
|
54
|
-
};
|
|
55
|
-
}
|
|
56
|
-
/** Validation errors for agent harness task definitions */
|
|
57
|
-
export interface AgentHarnessValidationError {
|
|
58
|
-
field: string;
|
|
59
|
-
message: string;
|
|
60
|
-
}
|
|
61
|
-
/**
|
|
62
|
-
* Validate that an agent harness task definition has all required fields.
|
|
63
|
-
*/
|
|
64
|
-
export declare function validateAgentHarnessTask(task: AgentHarnessTaskDefinition): AgentHarnessValidationError[];
|
|
65
|
-
/**
|
|
66
|
-
* Compile an agent harness task definition into Promptfoo configuration.
|
|
67
|
-
*/
|
|
68
|
-
export declare function compileAgentHarnessTask(task: AgentHarnessTaskDefinition, options?: AgentHarnessCompileOptions): AgentHarnessCompileResult;
|
|
69
|
-
/** ModeHandler-conformant export for the agent-harness evaluation mode. */
|
|
70
|
-
export declare const handler: ModeHandler;
|