@sanity/ailf 2.0.2 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/dist/cli.js +0 -0
- package/package.json +24 -24
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +0 -10
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +0 -185
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +0 -6
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +0 -42
- package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
- package/dist/_vendor/ailf-tasks/cli.js +0 -61
- package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
- package/dist/_vendor/ailf-tasks/index.js +0 -16
- package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
- package/dist/_vendor/ailf-tasks/parser.js +0 -73
- package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
- package/dist/_vendor/ailf-tasks/schemas.js +0 -180
- package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
- package/dist/_vendor/ailf-tasks/validation.js +0 -162
- package/dist/adapters/task-sources/yaml-task-source.d.ts +0 -18
- package/dist/adapters/task-sources/yaml-task-source.js +0 -139
- package/dist/agent-observer/test-imports.d.ts +0 -7
- package/dist/agent-observer/test-imports.js +0 -185
- package/dist/commands/update-quality-scores.d.ts +0 -5
- package/dist/commands/update-quality-scores.js +0 -20
- package/dist/lib/agent-behavior-report.d.ts +0 -8
- package/dist/lib/agent-behavior-report.js +0 -185
- package/dist/lib/baseline.d.ts +0 -19
- package/dist/lib/baseline.js +0 -153
- package/dist/lib/calculate-scores.d.ts +0 -23
- package/dist/lib/calculate-scores.js +0 -42
- package/dist/lib/compare.d.ts +0 -18
- package/dist/lib/compare.js +0 -170
- package/dist/lib/coverage-audit.d.ts +0 -4
- package/dist/lib/coverage-audit.js +0 -42
- package/dist/lib/discovery-report.d.ts +0 -13
- package/dist/lib/discovery-report.js +0 -57
- package/dist/lib/fetch-docs.d.ts +0 -30
- package/dist/lib/fetch-docs.js +0 -171
- package/dist/lib/generate-configs.d.ts +0 -25
- package/dist/lib/generate-configs.js +0 -42
- package/dist/lib/grader-api.d.ts +0 -21
- package/dist/lib/grader-api.js +0 -34
- package/dist/lib/grader-compare.d.ts +0 -19
- package/dist/lib/grader-compare.js +0 -91
- package/dist/lib/grader-consistency.d.ts +0 -27
- package/dist/lib/grader-consistency.js +0 -79
- package/dist/lib/grader-sensitivity.d.ts +0 -19
- package/dist/lib/grader-sensitivity.js +0 -75
- package/dist/lib/grader-validate.d.ts +0 -19
- package/dist/lib/grader-validate.js +0 -78
- package/dist/lib/measure-retrieval.d.ts +0 -14
- package/dist/lib/measure-retrieval.js +0 -71
- package/dist/lib/pr-comment.d.ts +0 -16
- package/dist/lib/pr-comment.js +0 -28
- package/dist/lib/readiness-report.d.ts +0 -13
- package/dist/lib/readiness-report.js +0 -108
- package/dist/lib/webhook-server.d.ts +0 -11
- package/dist/lib/webhook-server.js +0 -24
- package/dist/lib/weekly-digest.d.ts +0 -24
- package/dist/lib/weekly-digest.js +0 -148
- package/dist/orchestration/env-bridge.d.ts +0 -21
- package/dist/orchestration/env-bridge.js +0 -66
- package/dist/orchestration/steps/fetch-docs-shell.d.ts +0 -17
- package/dist/orchestration/steps/fetch-docs-shell.js +0 -30
- package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +0 -9
- package/dist/pipeline/compiler/__tests__/task-bridge.test.js +0 -339
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +0 -70
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +0 -485
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +0 -76
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +0 -245
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +0 -89
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +0 -379
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +0 -50
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +0 -334
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +0 -69
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +0 -307
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +0 -65
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +0 -368
- package/dist/pipeline/compiler/task-bridge.d.ts +0 -41
- package/dist/pipeline/compiler/task-bridge.js +0 -92
- package/dist/pipeline/expand-tasks.d.ts +0 -232
- package/dist/pipeline/expand-tasks.js +0 -467
- package/dist/pipeline/generate-configs.d.ts +0 -92
- package/dist/pipeline/generate-configs.js +0 -445
- package/dist/pipeline/steps/calculate-scores-step.d.ts +0 -11
- package/dist/pipeline/steps/calculate-scores-step.js +0 -89
- package/dist/pipeline/steps/compare-step.d.ts +0 -18
- package/dist/pipeline/steps/compare-step.js +0 -90
- package/dist/pipeline/steps/eval-step.d.ts +0 -53
- package/dist/pipeline/steps/eval-step.js +0 -347
- package/dist/pipeline/steps/fetch-docs-step.d.ts +0 -11
- package/dist/pipeline/steps/fetch-docs-step.js +0 -84
- package/dist/pipeline/steps/generate-configs-step.d.ts +0 -11
- package/dist/pipeline/steps/generate-configs-step.js +0 -98
- package/dist/pipeline/steps/grader-consistency-step.d.ts +0 -21
- package/dist/pipeline/steps/grader-consistency-step.js +0 -74
- package/dist/pipeline/steps/publish-report-step.d.ts +0 -57
- package/dist/pipeline/steps/publish-report-step.js +0 -243
- package/dist/pipeline/steps/report-step.d.ts +0 -13
- package/dist/pipeline/steps/report-step.js +0 -56
- package/dist/pipeline/steps/update-scores-step.d.ts +0 -11
- package/dist/pipeline/steps/update-scores-step.js +0 -42
- package/dist/scripts/agent-behavior-report.d.ts +0 -19
- package/dist/scripts/agent-behavior-report.js +0 -315
- package/dist/scripts/baseline.d.ts +0 -43
- package/dist/scripts/baseline.js +0 -267
- package/dist/scripts/calculate-scores.d.ts +0 -166
- package/dist/scripts/calculate-scores.js +0 -1296
- package/dist/scripts/compare.d.ts +0 -22
- package/dist/scripts/compare.js +0 -334
- package/dist/scripts/coverage-audit.d.ts +0 -44
- package/dist/scripts/coverage-audit.js +0 -209
- package/dist/scripts/debug-eval.d.ts +0 -19
- package/dist/scripts/debug-eval.js +0 -73
- package/dist/scripts/discovery-report.d.ts +0 -58
- package/dist/scripts/discovery-report.js +0 -250
- package/dist/scripts/fetch-docs.d.ts +0 -35
- package/dist/scripts/fetch-docs.js +0 -472
- package/dist/scripts/generate-configs.d.ts +0 -66
- package/dist/scripts/generate-configs.js +0 -459
- package/dist/scripts/grader-api.d.ts +0 -27
- package/dist/scripts/grader-api.js +0 -206
- package/dist/scripts/grader-compare.d.ts +0 -22
- package/dist/scripts/grader-compare.js +0 -368
- package/dist/scripts/grader-consistency.d.ts +0 -20
- package/dist/scripts/grader-consistency.js +0 -313
- package/dist/scripts/grader-sensitivity.d.ts +0 -22
- package/dist/scripts/grader-sensitivity.js +0 -354
- package/dist/scripts/grader-validate.d.ts +0 -19
- package/dist/scripts/grader-validate.js +0 -267
- package/dist/scripts/measure-retrieval.d.ts +0 -10
- package/dist/scripts/measure-retrieval.js +0 -145
- package/dist/scripts/migrate-tasks-to-content-lake.d.ts +0 -24
- package/dist/scripts/migrate-tasks-to-content-lake.js +0 -328
- package/dist/scripts/pipeline.d.ts +0 -76
- package/dist/scripts/pipeline.js +0 -1031
- package/dist/scripts/pr-comment.d.ts +0 -10
- package/dist/scripts/pr-comment.js +0 -510
- package/dist/scripts/readiness-report.d.ts +0 -88
- package/dist/scripts/readiness-report.js +0 -342
- package/dist/scripts/update-quality-scores.d.ts +0 -15
- package/dist/scripts/update-quality-scores.js +0 -184
- package/dist/scripts/validate-task-sources.d.ts +0 -21
- package/dist/scripts/validate-task-sources.js +0 -210
- package/dist/scripts/validate.d.ts +0 -13
- package/dist/scripts/validate.js +0 -79
- package/dist/scripts/webhook-server.d.ts +0 -26
- package/dist/scripts/webhook-server.js +0 -147
- package/dist/scripts/weekly-digest.d.ts +0 -24
- package/dist/scripts/weekly-digest.js +0 -144
- package/dist/sinks/format-slack.d.ts +0 -64
- package/dist/sinks/format-slack.js +0 -306
- package/dist/sinks/slack-sink.d.ts +0 -27
- package/dist/sinks/slack-sink.js +0 -78
- package/dist/sinks/webhook-sink.d.ts +0 -19
- package/dist/sinks/webhook-sink.js +0 -50
- package/tasks/.expanded.agentic.yaml +0 -280
- package/tasks/.expanded.yaml +0 -565
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Sanity.io
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/dist/cli.js
CHANGED
|
File without changes
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@sanity/ailf",
|
|
3
|
-
"version": "2.0.2",
|
|
3
|
+
"version": "2.1.0",
|
|
4
4
|
"private": false,
|
|
5
5
|
"publishConfig": {
|
|
6
6
|
"access": "public"
|
|
@@ -31,6 +31,28 @@
|
|
|
31
31
|
"canonical",
|
|
32
32
|
"tasks"
|
|
33
33
|
],
|
|
34
|
+
"dependencies": {
|
|
35
|
+
"@google-cloud/bigquery": "^8.1.1",
|
|
36
|
+
"@inquirer/prompts": "^8.3.0",
|
|
37
|
+
"@modelcontextprotocol/sdk": "^1.29.0",
|
|
38
|
+
"@portabletext/markdown": "^1.0.0",
|
|
39
|
+
"@sanity/client": "^7.3.0",
|
|
40
|
+
"commander": "^14.0.3",
|
|
41
|
+
"dotenv": "^16.4.7",
|
|
42
|
+
"dotenv-cli": "^11.0.0",
|
|
43
|
+
"jiti": "^2.6.1",
|
|
44
|
+
"js-yaml": "^4.1.0",
|
|
45
|
+
"promptfoo": "^0.120.24",
|
|
46
|
+
"zod": "^4.3.6"
|
|
47
|
+
},
|
|
48
|
+
"devDependencies": {
|
|
49
|
+
"@types/js-yaml": "^4.0.9",
|
|
50
|
+
"@types/node": "^22.13.1",
|
|
51
|
+
"tsx": "^4.19.2",
|
|
52
|
+
"typescript": "^5.7.3",
|
|
53
|
+
"@sanity/ailf-core": "0.1.0",
|
|
54
|
+
"@sanity/ailf-shared": "0.1.0"
|
|
55
|
+
},
|
|
34
56
|
"scripts": {
|
|
35
57
|
"build": "tsc && tsx scripts/bundle-workspace-deps.ts",
|
|
36
58
|
"generate-configs": "tsx src/cli.ts generate-configs",
|
|
@@ -58,27 +80,5 @@
|
|
|
58
80
|
"discovery-report": "tsx src/cli.ts discovery-report",
|
|
59
81
|
"webhook-server": "tsx src/cli.ts webhook-server",
|
|
60
82
|
"weekly-digest": "tsx src/cli.ts weekly-digest"
|
|
61
|
-
},
|
|
62
|
-
"dependencies": {
|
|
63
|
-
"@google-cloud/bigquery": "^8.1.1",
|
|
64
|
-
"@inquirer/prompts": "^8.3.0",
|
|
65
|
-
"@modelcontextprotocol/sdk": "^1.29.0",
|
|
66
|
-
"@portabletext/markdown": "^1.0.0",
|
|
67
|
-
"@sanity/client": "^7.3.0",
|
|
68
|
-
"commander": "^14.0.3",
|
|
69
|
-
"dotenv": "^16.4.7",
|
|
70
|
-
"dotenv-cli": "^11.0.0",
|
|
71
|
-
"jiti": "^2.6.1",
|
|
72
|
-
"js-yaml": "^4.1.0",
|
|
73
|
-
"promptfoo": "^0.120.24",
|
|
74
|
-
"zod": "^4.3.6"
|
|
75
|
-
},
|
|
76
|
-
"devDependencies": {
|
|
77
|
-
"@sanity/ailf-core": "workspace:*",
|
|
78
|
-
"@sanity/ailf-shared": "workspace:*",
|
|
79
|
-
"@types/js-yaml": "^4.0.9",
|
|
80
|
-
"@types/node": "^22.13.1",
|
|
81
|
-
"tsx": "^4.19.2",
|
|
82
|
-
"typescript": "^5.7.3"
|
|
83
83
|
}
|
|
84
|
-
}
|
|
84
|
+
}
|
|
@@ -1,10 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* comparison-formatters.test.ts
|
|
3
|
-
*
|
|
4
|
-
* Verifies that formatComparisonMarkdown() and formatComparisonTable()
|
|
5
|
-
* dynamically derive column headers from the dimension keys present
|
|
6
|
-
* in the report data, rather than hardcoding literacy-specific names.
|
|
7
|
-
*
|
|
8
|
-
* Run: npx tsx --test src/__tests__/comparison-formatters.test.ts
|
|
9
|
-
*/
|
|
10
|
-
export {};
|
|
@@ -1,185 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* comparison-formatters.test.ts
|
|
3
|
-
*
|
|
4
|
-
* Verifies that formatComparisonMarkdown() and formatComparisonTable()
|
|
5
|
-
* dynamically derive column headers from the dimension keys present
|
|
6
|
-
* in the report data, rather than hardcoding literacy-specific names.
|
|
7
|
-
*
|
|
8
|
-
* Run: npx tsx --test src/__tests__/comparison-formatters.test.ts
|
|
9
|
-
*/
|
|
10
|
-
import assert from "node:assert/strict";
|
|
11
|
-
import { describe, it } from "node:test";
|
|
12
|
-
import { formatComparisonMarkdown, formatComparisonTable, } from "../services/comparison-formatters.js";
|
|
13
|
-
// ---------------------------------------------------------------------------
|
|
14
|
-
// Helpers
|
|
15
|
-
// ---------------------------------------------------------------------------
|
|
16
|
-
/** Minimal ScoreSummary stub — only fields the formatters actually read */
|
|
17
|
-
function stubSummary(avgScore) {
|
|
18
|
-
return {
|
|
19
|
-
belowCritical: [],
|
|
20
|
-
lowestArea: "area-a",
|
|
21
|
-
lowestScore: 40,
|
|
22
|
-
overall: {
|
|
23
|
-
avgCeilingScore: 80,
|
|
24
|
-
avgScore,
|
|
25
|
-
avgDocLift: 10,
|
|
26
|
-
avgDocQualityGap: 20,
|
|
27
|
-
avgFloorScore: 30,
|
|
28
|
-
negativeDocLiftCount: 0,
|
|
29
|
-
},
|
|
30
|
-
scores: [],
|
|
31
|
-
timestamp: "2026-04-05T00:00:00.000Z",
|
|
32
|
-
};
|
|
33
|
-
}
|
|
34
|
-
function makeReport(overrides) {
|
|
35
|
-
return {
|
|
36
|
-
areas: [
|
|
37
|
-
{
|
|
38
|
-
area: "area-a",
|
|
39
|
-
baseline: 60,
|
|
40
|
-
experiment: 65,
|
|
41
|
-
delta: 5,
|
|
42
|
-
change: "improved",
|
|
43
|
-
dimensions: overrides.areaDimensions,
|
|
44
|
-
ceilingDelta: 0,
|
|
45
|
-
docLiftDelta: 2,
|
|
46
|
-
floorDelta: 0,
|
|
47
|
-
},
|
|
48
|
-
],
|
|
49
|
-
baseline: stubSummary(60),
|
|
50
|
-
experiment: stubSummary(65),
|
|
51
|
-
deltas: {
|
|
52
|
-
overall: 5,
|
|
53
|
-
perArea: { "area-a": 5 },
|
|
54
|
-
perDimension: overrides.perDimension,
|
|
55
|
-
docLift: 2,
|
|
56
|
-
},
|
|
57
|
-
generatedAt: "2026-04-05T00:00:00.000Z",
|
|
58
|
-
improved: ["area-a"],
|
|
59
|
-
regressed: [],
|
|
60
|
-
unchanged: [],
|
|
61
|
-
notEvaluated: [],
|
|
62
|
-
mismatched: { onlyInBaseline: [], onlyInExperiment: [] },
|
|
63
|
-
noiseThreshold: 2,
|
|
64
|
-
noiseThresholdEmpirical: false,
|
|
65
|
-
};
|
|
66
|
-
}
|
|
67
|
-
// ---------------------------------------------------------------------------
|
|
68
|
-
// Tests — literacy dimensions (backward compatibility)
|
|
69
|
-
// ---------------------------------------------------------------------------
|
|
70
|
-
describe("formatComparisonMarkdown", () => {
|
|
71
|
-
it("renders literacy dimension columns dynamically", () => {
|
|
72
|
-
const report = makeReport({
|
|
73
|
-
areaDimensions: {
|
|
74
|
-
"task-completion": { baseline: 60, experiment: 65, delta: 5 },
|
|
75
|
-
"code-correctness": { baseline: 50, experiment: 55, delta: 5 },
|
|
76
|
-
"doc-coverage": { baseline: 40, experiment: 42, delta: 2 },
|
|
77
|
-
},
|
|
78
|
-
perDimension: {
|
|
79
|
-
"task-completion": 5,
|
|
80
|
-
"code-correctness": 5,
|
|
81
|
-
"doc-coverage": 2,
|
|
82
|
-
},
|
|
83
|
-
});
|
|
84
|
-
const md = formatComparisonMarkdown(report);
|
|
85
|
-
// Column headers should be title-cased from kebab-case
|
|
86
|
-
assert.ok(md.includes("Task Completion"), "should have Task Completion column header");
|
|
87
|
-
assert.ok(md.includes("Code Correctness"), "should have Code Correctness column header");
|
|
88
|
-
assert.ok(md.includes("Doc Coverage"), "should have Doc Coverage column header");
|
|
89
|
-
// Per-dimension averages section should also show dynamic labels
|
|
90
|
-
assert.ok(md.includes("| Task Completion |"), "dimension averages should include Task Completion");
|
|
91
|
-
assert.ok(md.includes("| Code Correctness |"), "dimension averages should include Code Correctness");
|
|
92
|
-
assert.ok(md.includes("| Doc Coverage |"), "dimension averages should include Doc Coverage");
|
|
93
|
-
});
|
|
94
|
-
it("renders MCP dimension columns dynamically", () => {
|
|
95
|
-
const report = makeReport({
|
|
96
|
-
areaDimensions: {
|
|
97
|
-
"input-validation": { baseline: 50, experiment: 60, delta: 10 },
|
|
98
|
-
"output-correctness": { baseline: 70, experiment: 75, delta: 5 },
|
|
99
|
-
"error-handling": { baseline: 40, experiment: 45, delta: 5 },
|
|
100
|
-
security: { baseline: 80, experiment: 82, delta: 2 },
|
|
101
|
-
},
|
|
102
|
-
perDimension: {
|
|
103
|
-
"input-validation": 10,
|
|
104
|
-
"output-correctness": 5,
|
|
105
|
-
"error-handling": 5,
|
|
106
|
-
security: 2,
|
|
107
|
-
},
|
|
108
|
-
});
|
|
109
|
-
const md = formatComparisonMarkdown(report);
|
|
110
|
-
// 4 MCP columns instead of 3 literacy columns
|
|
111
|
-
assert.ok(md.includes("Input Validation"), "should have Input Validation column");
|
|
112
|
-
assert.ok(md.includes("Output Correctness"), "should have Output Correctness column");
|
|
113
|
-
assert.ok(md.includes("Error Handling"), "should have Error Handling column");
|
|
114
|
-
assert.ok(md.includes("Security"), "should have Security column");
|
|
115
|
-
// Per-dimension averages
|
|
116
|
-
assert.ok(md.includes("| Input Validation |"), "dimension averages should include Input Validation");
|
|
117
|
-
assert.ok(md.includes("| Security |"), "dimension averages should include Security");
|
|
118
|
-
});
|
|
119
|
-
});
|
|
120
|
-
describe("formatComparisonTable", () => {
|
|
121
|
-
it("renders literacy dimension columns dynamically", () => {
|
|
122
|
-
const report = makeReport({
|
|
123
|
-
areaDimensions: {
|
|
124
|
-
"task-completion": { baseline: 60, experiment: 65, delta: 5 },
|
|
125
|
-
"code-correctness": { baseline: 50, experiment: 55, delta: 5 },
|
|
126
|
-
"doc-coverage": { baseline: 40, experiment: 42, delta: 2 },
|
|
127
|
-
},
|
|
128
|
-
perDimension: {
|
|
129
|
-
"task-completion": 5,
|
|
130
|
-
"code-correctness": 5,
|
|
131
|
-
"doc-coverage": 2,
|
|
132
|
-
},
|
|
133
|
-
});
|
|
134
|
-
const table = formatComparisonTable(report);
|
|
135
|
-
// Dimension averages section
|
|
136
|
-
assert.ok(table.includes("Task Completion:"), "should show Task Completion in dimension averages");
|
|
137
|
-
assert.ok(table.includes("Code Correctness:"), "should show Code Correctness in dimension averages");
|
|
138
|
-
assert.ok(table.includes("Doc Coverage:"), "should show Doc Coverage in dimension averages");
|
|
139
|
-
// Per-area table header
|
|
140
|
-
assert.ok(table.includes("Task Completion"), "per-area table should have Task Completion header");
|
|
141
|
-
assert.ok(table.includes("Code Correctness"), "per-area table should have Code Correctness header");
|
|
142
|
-
assert.ok(table.includes("Doc Coverage"), "per-area table should have Doc Coverage header");
|
|
143
|
-
});
|
|
144
|
-
it("renders MCP dimension columns dynamically", () => {
|
|
145
|
-
const report = makeReport({
|
|
146
|
-
areaDimensions: {
|
|
147
|
-
"input-validation": { baseline: 50, experiment: 60, delta: 10 },
|
|
148
|
-
"output-correctness": { baseline: 70, experiment: 75, delta: 5 },
|
|
149
|
-
"error-handling": { baseline: 40, experiment: 45, delta: 5 },
|
|
150
|
-
security: { baseline: 80, experiment: 82, delta: 2 },
|
|
151
|
-
},
|
|
152
|
-
perDimension: {
|
|
153
|
-
"input-validation": 10,
|
|
154
|
-
"output-correctness": 5,
|
|
155
|
-
"error-handling": 5,
|
|
156
|
-
security: 2,
|
|
157
|
-
},
|
|
158
|
-
});
|
|
159
|
-
const table = formatComparisonTable(report);
|
|
160
|
-
// 4 MCP columns in the per-area table
|
|
161
|
-
assert.ok(table.includes("Input Validation"), "should have Input Validation");
|
|
162
|
-
assert.ok(table.includes("Output Correctness"), "should have Output Correctness");
|
|
163
|
-
assert.ok(table.includes("Error Handling"), "should have Error Handling");
|
|
164
|
-
assert.ok(table.includes("Security"), "should have Security");
|
|
165
|
-
// Should NOT have literacy dimension headers
|
|
166
|
-
assert.ok(!table.includes("Task Completion"), "should not contain Task Completion");
|
|
167
|
-
assert.ok(!table.includes("Doc Coverage"), "should not contain Doc Coverage");
|
|
168
|
-
});
|
|
169
|
-
it("includes delta values for each dimension in the per-area rows", () => {
|
|
170
|
-
const report = makeReport({
|
|
171
|
-
areaDimensions: {
|
|
172
|
-
"input-validation": { baseline: 50, experiment: 60, delta: 10 },
|
|
173
|
-
"output-correctness": { baseline: 70, experiment: 75, delta: 5 },
|
|
174
|
-
},
|
|
175
|
-
perDimension: {
|
|
176
|
-
"input-validation": 10,
|
|
177
|
-
"output-correctness": 5,
|
|
178
|
-
},
|
|
179
|
-
});
|
|
180
|
-
const table = formatComparisonTable(report);
|
|
181
|
-
// The per-area row should include the delta values (+10 and +5)
|
|
182
|
-
assert.ok(table.includes("+10"), "should show +10 delta for area-a");
|
|
183
|
-
assert.ok(table.includes("+5"), "should show +5 delta for area-a");
|
|
184
|
-
});
|
|
185
|
-
});
|
|
@@ -1,42 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* noop-collector.test.ts — verifies the NoOpArtifactCollector is truly zero-cost.
|
|
3
|
-
*
|
|
4
|
-
* Run: npx tsx --test src/artifact-capture/__tests__/noop-collector.test.ts
|
|
5
|
-
*/
|
|
6
|
-
import assert from "node:assert/strict";
|
|
7
|
-
import { describe, it } from "node:test";
|
|
8
|
-
import { NoOpArtifactCollector } from "../noop-collector.js";
|
|
9
|
-
describe("NoOpArtifactCollector", () => {
|
|
10
|
-
it("enabled returns false", () => {
|
|
11
|
-
const collector = new NoOpArtifactCollector();
|
|
12
|
-
assert.equal(collector.enabled, false);
|
|
13
|
-
});
|
|
14
|
-
it("extrasEnabled returns false", () => {
|
|
15
|
-
const collector = new NoOpArtifactCollector();
|
|
16
|
-
assert.equal(collector.extrasEnabled, false);
|
|
17
|
-
});
|
|
18
|
-
it("capture() is callable and returns void", () => {
|
|
19
|
-
const collector = new NoOpArtifactCollector();
|
|
20
|
-
const result = collector.capture("step", "type", { data: true });
|
|
21
|
-
assert.equal(result, undefined);
|
|
22
|
-
});
|
|
23
|
-
it("captureFile() is callable and returns void", () => {
|
|
24
|
-
const collector = new NoOpArtifactCollector();
|
|
25
|
-
const result = collector.captureFile("step", "type", "/some/path");
|
|
26
|
-
assert.equal(result, undefined);
|
|
27
|
-
});
|
|
28
|
-
it("flush() returns zero-count result", async () => {
|
|
29
|
-
const collector = new NoOpArtifactCollector();
|
|
30
|
-
const result = await collector.flush();
|
|
31
|
-
assert.equal(result.artifactCount, 0);
|
|
32
|
-
assert.equal(result.destination, "");
|
|
33
|
-
assert.equal(result.totalBytes, 0);
|
|
34
|
-
assert.equal(result.compressed, false);
|
|
35
|
-
});
|
|
36
|
-
it("flush() returns the same frozen object every time", async () => {
|
|
37
|
-
const collector = new NoOpArtifactCollector();
|
|
38
|
-
const a = await collector.flush();
|
|
39
|
-
const b = await collector.flush();
|
|
40
|
-
assert.equal(a, b);
|
|
41
|
-
});
|
|
42
|
-
});
|
|
@@ -1,61 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* cli.ts — Minimal CLI for standalone task validation.
|
|
3
|
-
*
|
|
4
|
-
* Usage:
|
|
5
|
-
* npx @sanity/ailf-tasks validate .ailf/tasks/
|
|
6
|
-
* npx @sanity/ailf-tasks validate # defaults to .ailf/tasks/
|
|
7
|
-
*/
|
|
8
|
-
import { loadTaskDir } from "./parser.js";
|
|
9
|
-
import { formatValidationResult, validateRepoTasks } from "./validation.js";
|
|
10
|
-
export function run() {
|
|
11
|
-
const args = process.argv.slice(2);
|
|
12
|
-
const command = args[0];
|
|
13
|
-
if (command === "validate") {
|
|
14
|
-
const dir = args[1] ?? ".ailf/tasks";
|
|
15
|
-
validateCommand(dir);
|
|
16
|
-
}
|
|
17
|
-
else if (command === "--help" ||
|
|
18
|
-
command === "-h" ||
|
|
19
|
-
command === undefined) {
|
|
20
|
-
printUsage();
|
|
21
|
-
}
|
|
22
|
-
else {
|
|
23
|
-
console.error(`Unknown command: ${command}`);
|
|
24
|
-
printUsage();
|
|
25
|
-
process.exit(1);
|
|
26
|
-
}
|
|
27
|
-
}
|
|
28
|
-
function validateCommand(dir) {
|
|
29
|
-
try {
|
|
30
|
-
const tasks = loadTaskDir(dir);
|
|
31
|
-
// Run semantic validation
|
|
32
|
-
const result = validateRepoTasks(tasks);
|
|
33
|
-
const formatted = formatValidationResult(result);
|
|
34
|
-
console.log(`✅ ${tasks.length} task(s) validated from ${dir}`);
|
|
35
|
-
for (const task of tasks) {
|
|
36
|
-
console.log(` ${task.id} — ${task.description}`);
|
|
37
|
-
}
|
|
38
|
-
if (result.warnings.length > 0 || result.errors.length > 0) {
|
|
39
|
-
console.log("");
|
|
40
|
-
console.log(formatted);
|
|
41
|
-
}
|
|
42
|
-
if (!result.valid) {
|
|
43
|
-
process.exit(1);
|
|
44
|
-
}
|
|
45
|
-
}
|
|
46
|
-
catch (err) {
|
|
47
|
-
console.error(`❌ ${err instanceof Error ? err.message : String(err)}`);
|
|
48
|
-
process.exit(1);
|
|
49
|
-
}
|
|
50
|
-
}
|
|
51
|
-
function printUsage() {
|
|
52
|
-
console.log("Usage: ailf-tasks <command> [options]");
|
|
53
|
-
console.log("");
|
|
54
|
-
console.log("Commands:");
|
|
55
|
-
console.log(" validate [dir] Validate task YAML files (default: .ailf/tasks/)");
|
|
56
|
-
console.log("");
|
|
57
|
-
console.log("Examples:");
|
|
58
|
-
console.log(" ailf-tasks validate");
|
|
59
|
-
console.log(" ailf-tasks validate .ailf/tasks/");
|
|
60
|
-
console.log(" ailf-tasks validate /path/to/tasks/");
|
|
61
|
-
}
|
|
@@ -1,13 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* @sanity/ailf-tasks — Task definition schemas and YAML parser.
|
|
3
|
-
*
|
|
4
|
-
* Lightweight package for parsing and validating .ailf/tasks/*.yaml files
|
|
5
|
-
* without depending on the full AILF CLI or its heavyweight dependencies
|
|
6
|
-
* (Promptfoo, LLM SDKs, Sanity client).
|
|
7
|
-
*
|
|
8
|
-
* Usage:
|
|
9
|
-
* import { parseTaskFile, loadTaskDir, RepoTaskSchema } from '@sanity/ailf-tasks'
|
|
10
|
-
*/
|
|
11
|
-
export { CURATED_ASSERTION_TYPES, RepoTaskFileSchema, RepoTaskSchema, RUBRIC_TEMPLATE_NAMES, type CuratedAssertionType, type RepoTask, type RubricTemplateName, } from "./schemas.js";
|
|
12
|
-
export { loadTaskDir, parseTaskFile } from "./parser.js";
|
|
13
|
-
export { detectSnakeCaseFields, formatValidationResult, validateRepoTasks, type ValidationMessage, type ValidationResult, } from "./validation.js";
|
|
@@ -1,16 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* @sanity/ailf-tasks — Task definition schemas and YAML parser.
|
|
3
|
-
*
|
|
4
|
-
* Lightweight package for parsing and validating .ailf/tasks/*.yaml files
|
|
5
|
-
* without depending on the full AILF CLI or its heavyweight dependencies
|
|
6
|
-
* (Promptfoo, LLM SDKs, Sanity client).
|
|
7
|
-
*
|
|
8
|
-
* Usage:
|
|
9
|
-
* import { parseTaskFile, loadTaskDir, RepoTaskSchema } from '@sanity/ailf-tasks'
|
|
10
|
-
*/
|
|
11
|
-
// Schemas and types
|
|
12
|
-
export { CURATED_ASSERTION_TYPES, RepoTaskFileSchema, RepoTaskSchema, RUBRIC_TEMPLATE_NAMES, } from "./schemas.js";
|
|
13
|
-
// Parsing
|
|
14
|
-
export { loadTaskDir, parseTaskFile } from "./parser.js";
|
|
15
|
-
// Validation
|
|
16
|
-
export { detectSnakeCaseFields, formatValidationResult, validateRepoTasks, } from "./validation.js";
|
|
@@ -1,27 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* parser.ts — Standalone task file and directory parsing.
|
|
3
|
-
*
|
|
4
|
-
* High-level functions for loading and validating .ailf/tasks/ YAML
|
|
5
|
-
* files without any dependency on the eval pipeline.
|
|
6
|
-
*
|
|
7
|
-
* Usage:
|
|
8
|
-
* import { parseTaskFile, loadTaskDir } from '@sanity/ailf-tasks'
|
|
9
|
-
*/
|
|
10
|
-
import { type RepoTask } from "./schemas.js";
|
|
11
|
-
/**
|
|
12
|
-
* Parse a single task YAML string and return validated tasks.
|
|
13
|
-
*
|
|
14
|
-
* @param content - Raw YAML string content
|
|
15
|
-
* @param filename - Source filename (for error messages)
|
|
16
|
-
* @returns Validated array of RepoTask objects
|
|
17
|
-
* @throws Error if YAML parsing or Zod validation fails
|
|
18
|
-
*/
|
|
19
|
-
export declare function parseTaskFile(content: string, filename?: string): RepoTask[];
|
|
20
|
-
/**
|
|
21
|
-
* Load and parse all task YAML files from a directory.
|
|
22
|
-
*
|
|
23
|
-
* @param dirPath - Path to directory containing .yaml/.yml files
|
|
24
|
-
* @returns All validated tasks, sorted by filename
|
|
25
|
-
* @throws Error if directory not found, no YAML files, or validation fails
|
|
26
|
-
*/
|
|
27
|
-
export declare function loadTaskDir(dirPath: string): RepoTask[];
|
|
@@ -1,73 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* parser.ts — Standalone task file and directory parsing.
|
|
3
|
-
*
|
|
4
|
-
* High-level functions for loading and validating .ailf/tasks/ YAML
|
|
5
|
-
* files without any dependency on the eval pipeline.
|
|
6
|
-
*
|
|
7
|
-
* Usage:
|
|
8
|
-
* import { parseTaskFile, loadTaskDir } from '@sanity/ailf-tasks'
|
|
9
|
-
*/
|
|
10
|
-
import { existsSync, readdirSync, readFileSync } from "fs";
|
|
11
|
-
import { resolve } from "path";
|
|
12
|
-
import { load } from "js-yaml";
|
|
13
|
-
import { RepoTaskFileSchema } from "./schemas.js";
|
|
14
|
-
// ---------------------------------------------------------------------------
|
|
15
|
-
// Public API
|
|
16
|
-
// ---------------------------------------------------------------------------
|
|
17
|
-
/**
|
|
18
|
-
* Parse a single task YAML string and return validated tasks.
|
|
19
|
-
*
|
|
20
|
-
* @param content - Raw YAML string content
|
|
21
|
-
* @param filename - Source filename (for error messages)
|
|
22
|
-
* @returns Validated array of RepoTask objects
|
|
23
|
-
* @throws Error if YAML parsing or Zod validation fails
|
|
24
|
-
*/
|
|
25
|
-
export function parseTaskFile(content, filename = "<string>") {
|
|
26
|
-
const parsed = load(content);
|
|
27
|
-
if (!Array.isArray(parsed)) {
|
|
28
|
-
throw new Error(`${filename} did not parse to an array of tasks. ` +
|
|
29
|
-
"Task files must contain a YAML array of task definitions.");
|
|
30
|
-
}
|
|
31
|
-
const result = RepoTaskFileSchema.safeParse(parsed);
|
|
32
|
-
if (!result.success) {
|
|
33
|
-
const messages = result.error.issues
|
|
34
|
-
.map((i) => ` [${i.path.join(".")}]: ${i.message}`)
|
|
35
|
-
.join("\n");
|
|
36
|
-
throw new Error(`Invalid task file "${filename}":\n${messages}`);
|
|
37
|
-
}
|
|
38
|
-
return result.data;
|
|
39
|
-
}
|
|
40
|
-
/**
|
|
41
|
-
* Load and parse all task YAML files from a directory.
|
|
42
|
-
*
|
|
43
|
-
* @param dirPath - Path to directory containing .yaml/.yml files
|
|
44
|
-
* @returns All validated tasks, sorted by filename
|
|
45
|
-
* @throws Error if directory not found, no YAML files, or validation fails
|
|
46
|
-
*/
|
|
47
|
-
export function loadTaskDir(dirPath) {
|
|
48
|
-
if (!existsSync(dirPath)) {
|
|
49
|
-
throw new Error(`Tasks directory not found: ${dirPath}\n` +
|
|
50
|
-
" Expected a directory containing .ailf/tasks/*.yaml files.");
|
|
51
|
-
}
|
|
52
|
-
const yamlFiles = readdirSync(dirPath)
|
|
53
|
-
.filter((f) => (f.endsWith(".yaml") || f.endsWith(".yml")) && !f.startsWith("."))
|
|
54
|
-
.sort();
|
|
55
|
-
if (yamlFiles.length === 0) {
|
|
56
|
-
throw new Error(`No YAML files found in ${dirPath}\n` +
|
|
57
|
-
" Expected .ailf/tasks/*.yaml files with task definitions.");
|
|
58
|
-
}
|
|
59
|
-
const allTasks = [];
|
|
60
|
-
for (const file of yamlFiles) {
|
|
61
|
-
const filePath = resolve(dirPath, file);
|
|
62
|
-
const content = readFileSync(filePath, "utf-8");
|
|
63
|
-
try {
|
|
64
|
-
const tasks = parseTaskFile(content, file);
|
|
65
|
-
allTasks.push(...tasks);
|
|
66
|
-
}
|
|
67
|
-
catch (err) {
|
|
68
|
-
const msg = err instanceof Error ? err.message : String(err);
|
|
69
|
-
throw new Error(`Failed to load ${file}:\n${msg}`, { cause: err });
|
|
70
|
-
}
|
|
71
|
-
}
|
|
72
|
-
return allTasks;
|
|
73
|
-
}
|