@sanity/ailf 2.0.1 → 2.1.0

This diff shows the differences between the contents of two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions exactly as they appear in their respective public registries.
Files changed (160)
  1. package/LICENSE +21 -0
  2. package/dist/cli.js +0 -0
  3. package/dist/orchestration/steps/run-eval-step.js +1 -1
  4. package/dist/pipeline/checks.d.ts +8 -3
  5. package/dist/pipeline/checks.js +23 -3
  6. package/package.json +25 -25
  7. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +0 -10
  8. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +0 -185
  9. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +0 -6
  10. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +0 -42
  11. package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
  12. package/dist/_vendor/ailf-tasks/cli.js +0 -61
  13. package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
  14. package/dist/_vendor/ailf-tasks/index.js +0 -16
  15. package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
  16. package/dist/_vendor/ailf-tasks/parser.js +0 -73
  17. package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
  18. package/dist/_vendor/ailf-tasks/schemas.js +0 -180
  19. package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
  20. package/dist/_vendor/ailf-tasks/validation.js +0 -162
  21. package/dist/adapters/task-sources/yaml-task-source.d.ts +0 -18
  22. package/dist/adapters/task-sources/yaml-task-source.js +0 -139
  23. package/dist/agent-observer/test-imports.d.ts +0 -7
  24. package/dist/agent-observer/test-imports.js +0 -185
  25. package/dist/commands/update-quality-scores.d.ts +0 -5
  26. package/dist/commands/update-quality-scores.js +0 -20
  27. package/dist/lib/agent-behavior-report.d.ts +0 -8
  28. package/dist/lib/agent-behavior-report.js +0 -185
  29. package/dist/lib/baseline.d.ts +0 -19
  30. package/dist/lib/baseline.js +0 -153
  31. package/dist/lib/calculate-scores.d.ts +0 -23
  32. package/dist/lib/calculate-scores.js +0 -42
  33. package/dist/lib/compare.d.ts +0 -18
  34. package/dist/lib/compare.js +0 -170
  35. package/dist/lib/coverage-audit.d.ts +0 -4
  36. package/dist/lib/coverage-audit.js +0 -42
  37. package/dist/lib/discovery-report.d.ts +0 -13
  38. package/dist/lib/discovery-report.js +0 -57
  39. package/dist/lib/fetch-docs.d.ts +0 -30
  40. package/dist/lib/fetch-docs.js +0 -171
  41. package/dist/lib/generate-configs.d.ts +0 -25
  42. package/dist/lib/generate-configs.js +0 -42
  43. package/dist/lib/grader-api.d.ts +0 -21
  44. package/dist/lib/grader-api.js +0 -34
  45. package/dist/lib/grader-compare.d.ts +0 -19
  46. package/dist/lib/grader-compare.js +0 -91
  47. package/dist/lib/grader-consistency.d.ts +0 -27
  48. package/dist/lib/grader-consistency.js +0 -79
  49. package/dist/lib/grader-sensitivity.d.ts +0 -19
  50. package/dist/lib/grader-sensitivity.js +0 -75
  51. package/dist/lib/grader-validate.d.ts +0 -19
  52. package/dist/lib/grader-validate.js +0 -78
  53. package/dist/lib/measure-retrieval.d.ts +0 -14
  54. package/dist/lib/measure-retrieval.js +0 -71
  55. package/dist/lib/pr-comment.d.ts +0 -16
  56. package/dist/lib/pr-comment.js +0 -28
  57. package/dist/lib/readiness-report.d.ts +0 -13
  58. package/dist/lib/readiness-report.js +0 -108
  59. package/dist/lib/webhook-server.d.ts +0 -11
  60. package/dist/lib/webhook-server.js +0 -24
  61. package/dist/lib/weekly-digest.d.ts +0 -24
  62. package/dist/lib/weekly-digest.js +0 -148
  63. package/dist/orchestration/env-bridge.d.ts +0 -21
  64. package/dist/orchestration/env-bridge.js +0 -66
  65. package/dist/orchestration/steps/fetch-docs-shell.d.ts +0 -17
  66. package/dist/orchestration/steps/fetch-docs-shell.js +0 -30
  67. package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +0 -9
  68. package/dist/pipeline/compiler/__tests__/task-bridge.test.js +0 -339
  69. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +0 -70
  70. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +0 -485
  71. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +0 -76
  72. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +0 -245
  73. package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +0 -89
  74. package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +0 -379
  75. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +0 -50
  76. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +0 -334
  77. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +0 -69
  78. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +0 -307
  79. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +0 -65
  80. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +0 -368
  81. package/dist/pipeline/compiler/task-bridge.d.ts +0 -41
  82. package/dist/pipeline/compiler/task-bridge.js +0 -92
  83. package/dist/pipeline/expand-tasks.d.ts +0 -232
  84. package/dist/pipeline/expand-tasks.js +0 -467
  85. package/dist/pipeline/generate-configs.d.ts +0 -92
  86. package/dist/pipeline/generate-configs.js +0 -445
  87. package/dist/pipeline/steps/calculate-scores-step.d.ts +0 -11
  88. package/dist/pipeline/steps/calculate-scores-step.js +0 -89
  89. package/dist/pipeline/steps/compare-step.d.ts +0 -18
  90. package/dist/pipeline/steps/compare-step.js +0 -90
  91. package/dist/pipeline/steps/eval-step.d.ts +0 -53
  92. package/dist/pipeline/steps/eval-step.js +0 -347
  93. package/dist/pipeline/steps/fetch-docs-step.d.ts +0 -11
  94. package/dist/pipeline/steps/fetch-docs-step.js +0 -84
  95. package/dist/pipeline/steps/generate-configs-step.d.ts +0 -11
  96. package/dist/pipeline/steps/generate-configs-step.js +0 -98
  97. package/dist/pipeline/steps/grader-consistency-step.d.ts +0 -21
  98. package/dist/pipeline/steps/grader-consistency-step.js +0 -74
  99. package/dist/pipeline/steps/publish-report-step.d.ts +0 -57
  100. package/dist/pipeline/steps/publish-report-step.js +0 -243
  101. package/dist/pipeline/steps/report-step.d.ts +0 -13
  102. package/dist/pipeline/steps/report-step.js +0 -56
  103. package/dist/pipeline/steps/update-scores-step.d.ts +0 -11
  104. package/dist/pipeline/steps/update-scores-step.js +0 -42
  105. package/dist/scripts/agent-behavior-report.d.ts +0 -19
  106. package/dist/scripts/agent-behavior-report.js +0 -315
  107. package/dist/scripts/baseline.d.ts +0 -43
  108. package/dist/scripts/baseline.js +0 -267
  109. package/dist/scripts/calculate-scores.d.ts +0 -166
  110. package/dist/scripts/calculate-scores.js +0 -1296
  111. package/dist/scripts/compare.d.ts +0 -22
  112. package/dist/scripts/compare.js +0 -334
  113. package/dist/scripts/coverage-audit.d.ts +0 -44
  114. package/dist/scripts/coverage-audit.js +0 -209
  115. package/dist/scripts/debug-eval.d.ts +0 -19
  116. package/dist/scripts/debug-eval.js +0 -73
  117. package/dist/scripts/discovery-report.d.ts +0 -58
  118. package/dist/scripts/discovery-report.js +0 -250
  119. package/dist/scripts/fetch-docs.d.ts +0 -35
  120. package/dist/scripts/fetch-docs.js +0 -472
  121. package/dist/scripts/generate-configs.d.ts +0 -66
  122. package/dist/scripts/generate-configs.js +0 -459
  123. package/dist/scripts/grader-api.d.ts +0 -27
  124. package/dist/scripts/grader-api.js +0 -206
  125. package/dist/scripts/grader-compare.d.ts +0 -22
  126. package/dist/scripts/grader-compare.js +0 -368
  127. package/dist/scripts/grader-consistency.d.ts +0 -20
  128. package/dist/scripts/grader-consistency.js +0 -313
  129. package/dist/scripts/grader-sensitivity.d.ts +0 -22
  130. package/dist/scripts/grader-sensitivity.js +0 -354
  131. package/dist/scripts/grader-validate.d.ts +0 -19
  132. package/dist/scripts/grader-validate.js +0 -267
  133. package/dist/scripts/measure-retrieval.d.ts +0 -10
  134. package/dist/scripts/measure-retrieval.js +0 -145
  135. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +0 -24
  136. package/dist/scripts/migrate-tasks-to-content-lake.js +0 -328
  137. package/dist/scripts/pipeline.d.ts +0 -76
  138. package/dist/scripts/pipeline.js +0 -1031
  139. package/dist/scripts/pr-comment.d.ts +0 -10
  140. package/dist/scripts/pr-comment.js +0 -510
  141. package/dist/scripts/readiness-report.d.ts +0 -88
  142. package/dist/scripts/readiness-report.js +0 -342
  143. package/dist/scripts/update-quality-scores.d.ts +0 -15
  144. package/dist/scripts/update-quality-scores.js +0 -184
  145. package/dist/scripts/validate-task-sources.d.ts +0 -21
  146. package/dist/scripts/validate-task-sources.js +0 -210
  147. package/dist/scripts/validate.d.ts +0 -13
  148. package/dist/scripts/validate.js +0 -79
  149. package/dist/scripts/webhook-server.d.ts +0 -26
  150. package/dist/scripts/webhook-server.js +0 -147
  151. package/dist/scripts/weekly-digest.d.ts +0 -24
  152. package/dist/scripts/weekly-digest.js +0 -144
  153. package/dist/sinks/format-slack.d.ts +0 -64
  154. package/dist/sinks/format-slack.js +0 -306
  155. package/dist/sinks/slack-sink.d.ts +0 -27
  156. package/dist/sinks/slack-sink.js +0 -78
  157. package/dist/sinks/webhook-sink.d.ts +0 -19
  158. package/dist/sinks/webhook-sink.js +0 -50
  159. package/tasks/.expanded.agentic.yaml +0 -280
  160. package/tasks/.expanded.yaml +0 -565
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Sanity.io
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/dist/cli.js CHANGED
File without changes
@@ -29,7 +29,7 @@ export class RunEvalStep {
29
29
  const start = Date.now();
30
30
  const { rootDir, debug, concurrency, noCache } = ctx.config;
31
31
  // Precondition: config file exists
32
- const configIssues = checkGeneratedConfigsExist(rootDir);
32
+ const configIssues = checkGeneratedConfigsExist(rootDir, this.mode);
33
33
  const configErrors = configIssues.filter((i) => i.severity === "error");
34
34
  if (configErrors.length > 0) {
35
35
  return {
@@ -23,10 +23,15 @@ export declare function checkContextsExist(rootDir: string, areas: string[]): Va
23
23
  */
24
24
  export declare function checkEnvironment(rootDir: string): ValidationIssue[];
25
25
  /**
26
- * Check that the baseline `promptfooconfig.yaml` exists. Optionally check
27
- * for `promptfooconfig.observed.yaml` and `promptfooconfig.agentic.yaml`.
26
+ * Check that the generated promptfoo config for a given mode exists.
27
+ *
28
+ * When `mode` is provided, checks only for that mode's config file
29
+ * (e.g. `promptfooconfig.agent-harness.yaml` for mode `"agent-harness"`).
30
+ *
31
+ * When `mode` is omitted, falls back to the legacy literacy check:
32
+ * baseline `promptfooconfig.yaml` (required) plus optional observed/agentic.
28
33
  */
29
- export declare function checkGeneratedConfigsExist(rootDir: string): ValidationIssue[];
34
+ export declare function checkGeneratedConfigsExist(rootDir: string, mode?: string): ValidationIssue[];
30
35
  /**
31
36
  * Check that the eval results JSON file exists, is valid JSON, and contains
32
37
  * a `results` array.
@@ -8,6 +8,7 @@
8
8
  import { config as loadEnv } from "dotenv";
9
9
  import { existsSync, readFileSync, statSync } from "fs";
10
10
  import { join, resolve } from "path";
11
+ import { configFileForMode } from "./eval-constants.js";
11
12
  // ---------------------------------------------------------------------------
12
13
  // Precondition: contexts exist for each feature area
13
14
  // ---------------------------------------------------------------------------
@@ -109,11 +110,30 @@ export function checkEnvironment(rootDir) {
109
110
  // Postcondition: score summary is valid
110
111
  // ---------------------------------------------------------------------------
111
112
  /**
112
- * Check that the baseline `promptfooconfig.yaml` exists. Optionally check
113
- * for `promptfooconfig.observed.yaml` and `promptfooconfig.agentic.yaml`.
113
+ * Check that the generated promptfoo config for a given mode exists.
114
+ *
115
+ * When `mode` is provided, checks only for that mode's config file
116
+ * (e.g. `promptfooconfig.agent-harness.yaml` for mode `"agent-harness"`).
117
+ *
118
+ * When `mode` is omitted, falls back to the legacy literacy check:
119
+ * baseline `promptfooconfig.yaml` (required) plus optional observed/agentic.
114
120
  */
115
- export function checkGeneratedConfigsExist(rootDir) {
121
+ export function checkGeneratedConfigsExist(rootDir, mode) {
116
122
  const issues = [];
123
+ if (mode) {
124
+ const configName = configFileForMode(mode);
125
+ const configPath = resolve(rootDir, configName);
126
+ if (!existsSync(configPath)) {
127
+ issues.push({
128
+ message: `Config '${configName}' not found for mode '${mode}'. Run the pipeline to generate it.`,
129
+ path: configPath,
130
+ severity: "error",
131
+ source: "checkGeneratedConfigsExist",
132
+ });
133
+ }
134
+ return issues;
135
+ }
136
+ // Legacy literacy check: baseline required, observed/agentic optional
117
137
  const baselinePath = resolve(rootDir, "promptfooconfig.yaml");
118
138
  if (!existsSync(baselinePath)) {
119
139
  issues.push({
package/package.json CHANGED
@@ -1,9 +1,9 @@
1
1
  {
2
2
  "name": "@sanity/ailf",
3
- "version": "2.0.1",
3
+ "version": "2.1.0",
4
4
  "private": false,
5
5
  "publishConfig": {
6
- "access": "restricted"
6
+ "access": "public"
7
7
  },
8
8
  "license": "MIT",
9
9
  "repository": {
@@ -31,6 +31,28 @@
31
31
  "canonical",
32
32
  "tasks"
33
33
  ],
34
+ "dependencies": {
35
+ "@google-cloud/bigquery": "^8.1.1",
36
+ "@inquirer/prompts": "^8.3.0",
37
+ "@modelcontextprotocol/sdk": "^1.29.0",
38
+ "@portabletext/markdown": "^1.0.0",
39
+ "@sanity/client": "^7.3.0",
40
+ "commander": "^14.0.3",
41
+ "dotenv": "^16.4.7",
42
+ "dotenv-cli": "^11.0.0",
43
+ "jiti": "^2.6.1",
44
+ "js-yaml": "^4.1.0",
45
+ "promptfoo": "^0.120.24",
46
+ "zod": "^4.3.6"
47
+ },
48
+ "devDependencies": {
49
+ "@types/js-yaml": "^4.0.9",
50
+ "@types/node": "^22.13.1",
51
+ "tsx": "^4.19.2",
52
+ "typescript": "^5.7.3",
53
+ "@sanity/ailf-core": "0.1.0",
54
+ "@sanity/ailf-shared": "0.1.0"
55
+ },
34
56
  "scripts": {
35
57
  "build": "tsc && tsx scripts/bundle-workspace-deps.ts",
36
58
  "generate-configs": "tsx src/cli.ts generate-configs",
@@ -58,27 +80,5 @@
58
80
  "discovery-report": "tsx src/cli.ts discovery-report",
59
81
  "webhook-server": "tsx src/cli.ts webhook-server",
60
82
  "weekly-digest": "tsx src/cli.ts weekly-digest"
61
- },
62
- "dependencies": {
63
- "@google-cloud/bigquery": "^8.1.1",
64
- "@inquirer/prompts": "^8.3.0",
65
- "@modelcontextprotocol/sdk": "^1.29.0",
66
- "@portabletext/markdown": "^1.0.0",
67
- "@sanity/client": "^7.3.0",
68
- "commander": "^14.0.3",
69
- "dotenv": "^16.4.7",
70
- "dotenv-cli": "^11.0.0",
71
- "jiti": "^2.6.1",
72
- "js-yaml": "^4.1.0",
73
- "promptfoo": "^0.120.24",
74
- "zod": "^4.3.6"
75
- },
76
- "devDependencies": {
77
- "@sanity/ailf-core": "workspace:*",
78
- "@sanity/ailf-shared": "workspace:*",
79
- "@types/js-yaml": "^4.0.9",
80
- "@types/node": "^22.13.1",
81
- "tsx": "^4.19.2",
82
- "typescript": "^5.7.3"
83
83
  }
84
- }
84
+ }
@@ -1,10 +0,0 @@
1
- /**
2
- * comparison-formatters.test.ts
3
- *
4
- * Verifies that formatComparisonMarkdown() and formatComparisonTable()
5
- * dynamically derive column headers from the dimension keys present
6
- * in the report data, rather than hardcoding literacy-specific names.
7
- *
8
- * Run: npx tsx --test src/__tests__/comparison-formatters.test.ts
9
- */
10
- export {};
@@ -1,185 +0,0 @@
1
- /**
2
- * comparison-formatters.test.ts
3
- *
4
- * Verifies that formatComparisonMarkdown() and formatComparisonTable()
5
- * dynamically derive column headers from the dimension keys present
6
- * in the report data, rather than hardcoding literacy-specific names.
7
- *
8
- * Run: npx tsx --test src/__tests__/comparison-formatters.test.ts
9
- */
10
- import assert from "node:assert/strict";
11
- import { describe, it } from "node:test";
12
- import { formatComparisonMarkdown, formatComparisonTable, } from "../services/comparison-formatters.js";
13
- // ---------------------------------------------------------------------------
14
- // Helpers
15
- // ---------------------------------------------------------------------------
16
- /** Minimal ScoreSummary stub — only fields the formatters actually read */
17
- function stubSummary(avgScore) {
18
- return {
19
- belowCritical: [],
20
- lowestArea: "area-a",
21
- lowestScore: 40,
22
- overall: {
23
- avgCeilingScore: 80,
24
- avgScore,
25
- avgDocLift: 10,
26
- avgDocQualityGap: 20,
27
- avgFloorScore: 30,
28
- negativeDocLiftCount: 0,
29
- },
30
- scores: [],
31
- timestamp: "2026-04-05T00:00:00.000Z",
32
- };
33
- }
34
- function makeReport(overrides) {
35
- return {
36
- areas: [
37
- {
38
- area: "area-a",
39
- baseline: 60,
40
- experiment: 65,
41
- delta: 5,
42
- change: "improved",
43
- dimensions: overrides.areaDimensions,
44
- ceilingDelta: 0,
45
- docLiftDelta: 2,
46
- floorDelta: 0,
47
- },
48
- ],
49
- baseline: stubSummary(60),
50
- experiment: stubSummary(65),
51
- deltas: {
52
- overall: 5,
53
- perArea: { "area-a": 5 },
54
- perDimension: overrides.perDimension,
55
- docLift: 2,
56
- },
57
- generatedAt: "2026-04-05T00:00:00.000Z",
58
- improved: ["area-a"],
59
- regressed: [],
60
- unchanged: [],
61
- notEvaluated: [],
62
- mismatched: { onlyInBaseline: [], onlyInExperiment: [] },
63
- noiseThreshold: 2,
64
- noiseThresholdEmpirical: false,
65
- };
66
- }
67
- // ---------------------------------------------------------------------------
68
- // Tests — literacy dimensions (backward compatibility)
69
- // ---------------------------------------------------------------------------
70
- describe("formatComparisonMarkdown", () => {
71
- it("renders literacy dimension columns dynamically", () => {
72
- const report = makeReport({
73
- areaDimensions: {
74
- "task-completion": { baseline: 60, experiment: 65, delta: 5 },
75
- "code-correctness": { baseline: 50, experiment: 55, delta: 5 },
76
- "doc-coverage": { baseline: 40, experiment: 42, delta: 2 },
77
- },
78
- perDimension: {
79
- "task-completion": 5,
80
- "code-correctness": 5,
81
- "doc-coverage": 2,
82
- },
83
- });
84
- const md = formatComparisonMarkdown(report);
85
- // Column headers should be title-cased from kebab-case
86
- assert.ok(md.includes("Task Completion"), "should have Task Completion column header");
87
- assert.ok(md.includes("Code Correctness"), "should have Code Correctness column header");
88
- assert.ok(md.includes("Doc Coverage"), "should have Doc Coverage column header");
89
- // Per-dimension averages section should also show dynamic labels
90
- assert.ok(md.includes("| Task Completion |"), "dimension averages should include Task Completion");
91
- assert.ok(md.includes("| Code Correctness |"), "dimension averages should include Code Correctness");
92
- assert.ok(md.includes("| Doc Coverage |"), "dimension averages should include Doc Coverage");
93
- });
94
- it("renders MCP dimension columns dynamically", () => {
95
- const report = makeReport({
96
- areaDimensions: {
97
- "input-validation": { baseline: 50, experiment: 60, delta: 10 },
98
- "output-correctness": { baseline: 70, experiment: 75, delta: 5 },
99
- "error-handling": { baseline: 40, experiment: 45, delta: 5 },
100
- security: { baseline: 80, experiment: 82, delta: 2 },
101
- },
102
- perDimension: {
103
- "input-validation": 10,
104
- "output-correctness": 5,
105
- "error-handling": 5,
106
- security: 2,
107
- },
108
- });
109
- const md = formatComparisonMarkdown(report);
110
- // 4 MCP columns instead of 3 literacy columns
111
- assert.ok(md.includes("Input Validation"), "should have Input Validation column");
112
- assert.ok(md.includes("Output Correctness"), "should have Output Correctness column");
113
- assert.ok(md.includes("Error Handling"), "should have Error Handling column");
114
- assert.ok(md.includes("Security"), "should have Security column");
115
- // Per-dimension averages
116
- assert.ok(md.includes("| Input Validation |"), "dimension averages should include Input Validation");
117
- assert.ok(md.includes("| Security |"), "dimension averages should include Security");
118
- });
119
- });
120
- describe("formatComparisonTable", () => {
121
- it("renders literacy dimension columns dynamically", () => {
122
- const report = makeReport({
123
- areaDimensions: {
124
- "task-completion": { baseline: 60, experiment: 65, delta: 5 },
125
- "code-correctness": { baseline: 50, experiment: 55, delta: 5 },
126
- "doc-coverage": { baseline: 40, experiment: 42, delta: 2 },
127
- },
128
- perDimension: {
129
- "task-completion": 5,
130
- "code-correctness": 5,
131
- "doc-coverage": 2,
132
- },
133
- });
134
- const table = formatComparisonTable(report);
135
- // Dimension averages section
136
- assert.ok(table.includes("Task Completion:"), "should show Task Completion in dimension averages");
137
- assert.ok(table.includes("Code Correctness:"), "should show Code Correctness in dimension averages");
138
- assert.ok(table.includes("Doc Coverage:"), "should show Doc Coverage in dimension averages");
139
- // Per-area table header
140
- assert.ok(table.includes("Task Completion"), "per-area table should have Task Completion header");
141
- assert.ok(table.includes("Code Correctness"), "per-area table should have Code Correctness header");
142
- assert.ok(table.includes("Doc Coverage"), "per-area table should have Doc Coverage header");
143
- });
144
- it("renders MCP dimension columns dynamically", () => {
145
- const report = makeReport({
146
- areaDimensions: {
147
- "input-validation": { baseline: 50, experiment: 60, delta: 10 },
148
- "output-correctness": { baseline: 70, experiment: 75, delta: 5 },
149
- "error-handling": { baseline: 40, experiment: 45, delta: 5 },
150
- security: { baseline: 80, experiment: 82, delta: 2 },
151
- },
152
- perDimension: {
153
- "input-validation": 10,
154
- "output-correctness": 5,
155
- "error-handling": 5,
156
- security: 2,
157
- },
158
- });
159
- const table = formatComparisonTable(report);
160
- // 4 MCP columns in the per-area table
161
- assert.ok(table.includes("Input Validation"), "should have Input Validation");
162
- assert.ok(table.includes("Output Correctness"), "should have Output Correctness");
163
- assert.ok(table.includes("Error Handling"), "should have Error Handling");
164
- assert.ok(table.includes("Security"), "should have Security");
165
- // Should NOT have literacy dimension headers
166
- assert.ok(!table.includes("Task Completion"), "should not contain Task Completion");
167
- assert.ok(!table.includes("Doc Coverage"), "should not contain Doc Coverage");
168
- });
169
- it("includes delta values for each dimension in the per-area rows", () => {
170
- const report = makeReport({
171
- areaDimensions: {
172
- "input-validation": { baseline: 50, experiment: 60, delta: 10 },
173
- "output-correctness": { baseline: 70, experiment: 75, delta: 5 },
174
- },
175
- perDimension: {
176
- "input-validation": 10,
177
- "output-correctness": 5,
178
- },
179
- });
180
- const table = formatComparisonTable(report);
181
- // The per-area row should include the delta values (+10 and +5)
182
- assert.ok(table.includes("+10"), "should show +10 delta for area-a");
183
- assert.ok(table.includes("+5"), "should show +5 delta for area-a");
184
- });
185
- });
@@ -1,6 +0,0 @@
1
- /**
2
- * noop-collector.test.ts — verifies the NoOpArtifactCollector is truly zero-cost.
3
- *
4
- * Run: npx tsx --test src/artifact-capture/__tests__/noop-collector.test.ts
5
- */
6
- export {};
@@ -1,42 +0,0 @@
1
- /**
2
- * noop-collector.test.ts — verifies the NoOpArtifactCollector is truly zero-cost.
3
- *
4
- * Run: npx tsx --test src/artifact-capture/__tests__/noop-collector.test.ts
5
- */
6
- import assert from "node:assert/strict";
7
- import { describe, it } from "node:test";
8
- import { NoOpArtifactCollector } from "../noop-collector.js";
9
- describe("NoOpArtifactCollector", () => {
10
- it("enabled returns false", () => {
11
- const collector = new NoOpArtifactCollector();
12
- assert.equal(collector.enabled, false);
13
- });
14
- it("extrasEnabled returns false", () => {
15
- const collector = new NoOpArtifactCollector();
16
- assert.equal(collector.extrasEnabled, false);
17
- });
18
- it("capture() is callable and returns void", () => {
19
- const collector = new NoOpArtifactCollector();
20
- const result = collector.capture("step", "type", { data: true });
21
- assert.equal(result, undefined);
22
- });
23
- it("captureFile() is callable and returns void", () => {
24
- const collector = new NoOpArtifactCollector();
25
- const result = collector.captureFile("step", "type", "/some/path");
26
- assert.equal(result, undefined);
27
- });
28
- it("flush() returns zero-count result", async () => {
29
- const collector = new NoOpArtifactCollector();
30
- const result = await collector.flush();
31
- assert.equal(result.artifactCount, 0);
32
- assert.equal(result.destination, "");
33
- assert.equal(result.totalBytes, 0);
34
- assert.equal(result.compressed, false);
35
- });
36
- it("flush() returns the same frozen object every time", async () => {
37
- const collector = new NoOpArtifactCollector();
38
- const a = await collector.flush();
39
- const b = await collector.flush();
40
- assert.equal(a, b);
41
- });
42
- });
@@ -1,8 +0,0 @@
1
- /**
2
- * cli.ts — Minimal CLI for standalone task validation.
3
- *
4
- * Usage:
5
- * npx @sanity/ailf-tasks validate .ailf/tasks/
6
- * npx @sanity/ailf-tasks validate # defaults to .ailf/tasks/
7
- */
8
- export declare function run(): void;
@@ -1,61 +0,0 @@
1
- /**
2
- * cli.ts — Minimal CLI for standalone task validation.
3
- *
4
- * Usage:
5
- * npx @sanity/ailf-tasks validate .ailf/tasks/
6
- * npx @sanity/ailf-tasks validate # defaults to .ailf/tasks/
7
- */
8
- import { loadTaskDir } from "./parser.js";
9
- import { formatValidationResult, validateRepoTasks } from "./validation.js";
10
- export function run() {
11
- const args = process.argv.slice(2);
12
- const command = args[0];
13
- if (command === "validate") {
14
- const dir = args[1] ?? ".ailf/tasks";
15
- validateCommand(dir);
16
- }
17
- else if (command === "--help" ||
18
- command === "-h" ||
19
- command === undefined) {
20
- printUsage();
21
- }
22
- else {
23
- console.error(`Unknown command: ${command}`);
24
- printUsage();
25
- process.exit(1);
26
- }
27
- }
28
- function validateCommand(dir) {
29
- try {
30
- const tasks = loadTaskDir(dir);
31
- // Run semantic validation
32
- const result = validateRepoTasks(tasks);
33
- const formatted = formatValidationResult(result);
34
- console.log(`✅ ${tasks.length} task(s) validated from ${dir}`);
35
- for (const task of tasks) {
36
- console.log(` ${task.id} — ${task.description}`);
37
- }
38
- if (result.warnings.length > 0 || result.errors.length > 0) {
39
- console.log("");
40
- console.log(formatted);
41
- }
42
- if (!result.valid) {
43
- process.exit(1);
44
- }
45
- }
46
- catch (err) {
47
- console.error(`❌ ${err instanceof Error ? err.message : String(err)}`);
48
- process.exit(1);
49
- }
50
- }
51
- function printUsage() {
52
- console.log("Usage: ailf-tasks <command> [options]");
53
- console.log("");
54
- console.log("Commands:");
55
- console.log(" validate [dir] Validate task YAML files (default: .ailf/tasks/)");
56
- console.log("");
57
- console.log("Examples:");
58
- console.log(" ailf-tasks validate");
59
- console.log(" ailf-tasks validate .ailf/tasks/");
60
- console.log(" ailf-tasks validate /path/to/tasks/");
61
- }
@@ -1,13 +0,0 @@
1
- /**
2
- * @sanity/ailf-tasks — Task definition schemas and YAML parser.
3
- *
4
- * Lightweight package for parsing and validating .ailf/tasks/*.yaml files
5
- * without depending on the full AILF CLI or its heavyweight dependencies
6
- * (Promptfoo, LLM SDKs, Sanity client).
7
- *
8
- * Usage:
9
- * import { parseTaskFile, loadTaskDir, RepoTaskSchema } from '@sanity/ailf-tasks'
10
- */
11
- export { CURATED_ASSERTION_TYPES, RepoTaskFileSchema, RepoTaskSchema, RUBRIC_TEMPLATE_NAMES, type CuratedAssertionType, type RepoTask, type RubricTemplateName, } from "./schemas.js";
12
- export { loadTaskDir, parseTaskFile } from "./parser.js";
13
- export { detectSnakeCaseFields, formatValidationResult, validateRepoTasks, type ValidationMessage, type ValidationResult, } from "./validation.js";
@@ -1,16 +0,0 @@
1
- /**
2
- * @sanity/ailf-tasks — Task definition schemas and YAML parser.
3
- *
4
- * Lightweight package for parsing and validating .ailf/tasks/*.yaml files
5
- * without depending on the full AILF CLI or its heavyweight dependencies
6
- * (Promptfoo, LLM SDKs, Sanity client).
7
- *
8
- * Usage:
9
- * import { parseTaskFile, loadTaskDir, RepoTaskSchema } from '@sanity/ailf-tasks'
10
- */
11
- // Schemas and types
12
- export { CURATED_ASSERTION_TYPES, RepoTaskFileSchema, RepoTaskSchema, RUBRIC_TEMPLATE_NAMES, } from "./schemas.js";
13
- // Parsing
14
- export { loadTaskDir, parseTaskFile } from "./parser.js";
15
- // Validation
16
- export { detectSnakeCaseFields, formatValidationResult, validateRepoTasks, } from "./validation.js";
@@ -1,27 +0,0 @@
1
- /**
2
- * parser.ts — Standalone task file and directory parsing.
3
- *
4
- * High-level functions for loading and validating .ailf/tasks/ YAML
5
- * files without any dependency on the eval pipeline.
6
- *
7
- * Usage:
8
- * import { parseTaskFile, loadTaskDir } from '@sanity/ailf-tasks'
9
- */
10
- import { type RepoTask } from "./schemas.js";
11
- /**
12
- * Parse a single task YAML string and return validated tasks.
13
- *
14
- * @param content - Raw YAML string content
15
- * @param filename - Source filename (for error messages)
16
- * @returns Validated array of RepoTask objects
17
- * @throws Error if YAML parsing or Zod validation fails
18
- */
19
- export declare function parseTaskFile(content: string, filename?: string): RepoTask[];
20
- /**
21
- * Load and parse all task YAML files from a directory.
22
- *
23
- * @param dirPath - Path to directory containing .yaml/.yml files
24
- * @returns All validated tasks, sorted by filename
25
- * @throws Error if directory not found, no YAML files, or validation fails
26
- */
27
- export declare function loadTaskDir(dirPath: string): RepoTask[];
@@ -1,73 +0,0 @@
1
- /**
2
- * parser.ts — Standalone task file and directory parsing.
3
- *
4
- * High-level functions for loading and validating .ailf/tasks/ YAML
5
- * files without any dependency on the eval pipeline.
6
- *
7
- * Usage:
8
- * import { parseTaskFile, loadTaskDir } from '@sanity/ailf-tasks'
9
- */
10
- import { existsSync, readdirSync, readFileSync } from "fs";
11
- import { resolve } from "path";
12
- import { load } from "js-yaml";
13
- import { RepoTaskFileSchema } from "./schemas.js";
14
- // ---------------------------------------------------------------------------
15
- // Public API
16
- // ---------------------------------------------------------------------------
17
- /**
18
- * Parse a single task YAML string and return validated tasks.
19
- *
20
- * @param content - Raw YAML string content
21
- * @param filename - Source filename (for error messages)
22
- * @returns Validated array of RepoTask objects
23
- * @throws Error if YAML parsing or Zod validation fails
24
- */
25
- export function parseTaskFile(content, filename = "<string>") {
26
- const parsed = load(content);
27
- if (!Array.isArray(parsed)) {
28
- throw new Error(`${filename} did not parse to an array of tasks. ` +
29
- "Task files must contain a YAML array of task definitions.");
30
- }
31
- const result = RepoTaskFileSchema.safeParse(parsed);
32
- if (!result.success) {
33
- const messages = result.error.issues
34
- .map((i) => ` [${i.path.join(".")}]: ${i.message}`)
35
- .join("\n");
36
- throw new Error(`Invalid task file "${filename}":\n${messages}`);
37
- }
38
- return result.data;
39
- }
40
- /**
41
- * Load and parse all task YAML files from a directory.
42
- *
43
- * @param dirPath - Path to directory containing .yaml/.yml files
44
- * @returns All validated tasks, sorted by filename
45
- * @throws Error if directory not found, no YAML files, or validation fails
46
- */
47
- export function loadTaskDir(dirPath) {
48
- if (!existsSync(dirPath)) {
49
- throw new Error(`Tasks directory not found: ${dirPath}\n` +
50
- " Expected a directory containing .ailf/tasks/*.yaml files.");
51
- }
52
- const yamlFiles = readdirSync(dirPath)
53
- .filter((f) => (f.endsWith(".yaml") || f.endsWith(".yml")) && !f.startsWith("."))
54
- .sort();
55
- if (yamlFiles.length === 0) {
56
- throw new Error(`No YAML files found in ${dirPath}\n` +
57
- " Expected .ailf/tasks/*.yaml files with task definitions.");
58
- }
59
- const allTasks = [];
60
- for (const file of yamlFiles) {
61
- const filePath = resolve(dirPath, file);
62
- const content = readFileSync(filePath, "utf-8");
63
- try {
64
- const tasks = parseTaskFile(content, file);
65
- allTasks.push(...tasks);
66
- }
67
- catch (err) {
68
- const msg = err instanceof Error ? err.message : String(err);
69
- throw new Error(`Failed to load ${file}:\n${msg}`, { cause: err });
70
- }
71
- }
72
- return allTasks;
73
- }