@sanity/ailf 2.0.2 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (173) hide show
  1. package/LICENSE +21 -0
  2. package/dist/_vendor/ailf-core/examples/index.d.ts +50 -1
  3. package/dist/_vendor/ailf-core/examples/index.js +66 -1
  4. package/dist/agent-harness/assertions-runtime.d.ts +49 -0
  5. package/dist/agent-harness/assertions-runtime.js +138 -0
  6. package/dist/agent-harness/provider.d.ts +58 -0
  7. package/dist/agent-harness/provider.js +104 -0
  8. package/dist/cli.js +0 -0
  9. package/dist/commands/init.js +3 -0
  10. package/dist/orchestration/steps/generate-configs-step.d.ts +7 -0
  11. package/dist/orchestration/steps/generate-configs-step.js +35 -2
  12. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +39 -25
  13. package/dist/pipeline/compiler/compiler-to-yaml.js +78 -7
  14. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +9 -0
  15. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +28 -85
  16. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +22 -15
  17. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +8 -1
  18. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +42 -12
  19. package/package.json +25 -24
  20. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +0 -10
  21. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +0 -185
  22. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +0 -6
  23. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +0 -42
  24. package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
  25. package/dist/_vendor/ailf-tasks/cli.js +0 -61
  26. package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
  27. package/dist/_vendor/ailf-tasks/index.js +0 -16
  28. package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
  29. package/dist/_vendor/ailf-tasks/parser.js +0 -73
  30. package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
  31. package/dist/_vendor/ailf-tasks/schemas.js +0 -180
  32. package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
  33. package/dist/_vendor/ailf-tasks/validation.js +0 -162
  34. package/dist/adapters/task-sources/yaml-task-source.d.ts +0 -18
  35. package/dist/adapters/task-sources/yaml-task-source.js +0 -139
  36. package/dist/agent-observer/test-imports.d.ts +0 -7
  37. package/dist/agent-observer/test-imports.js +0 -185
  38. package/dist/commands/update-quality-scores.d.ts +0 -5
  39. package/dist/commands/update-quality-scores.js +0 -20
  40. package/dist/lib/agent-behavior-report.d.ts +0 -8
  41. package/dist/lib/agent-behavior-report.js +0 -185
  42. package/dist/lib/baseline.d.ts +0 -19
  43. package/dist/lib/baseline.js +0 -153
  44. package/dist/lib/calculate-scores.d.ts +0 -23
  45. package/dist/lib/calculate-scores.js +0 -42
  46. package/dist/lib/compare.d.ts +0 -18
  47. package/dist/lib/compare.js +0 -170
  48. package/dist/lib/coverage-audit.d.ts +0 -4
  49. package/dist/lib/coverage-audit.js +0 -42
  50. package/dist/lib/discovery-report.d.ts +0 -13
  51. package/dist/lib/discovery-report.js +0 -57
  52. package/dist/lib/fetch-docs.d.ts +0 -30
  53. package/dist/lib/fetch-docs.js +0 -171
  54. package/dist/lib/generate-configs.d.ts +0 -25
  55. package/dist/lib/generate-configs.js +0 -42
  56. package/dist/lib/grader-api.d.ts +0 -21
  57. package/dist/lib/grader-api.js +0 -34
  58. package/dist/lib/grader-compare.d.ts +0 -19
  59. package/dist/lib/grader-compare.js +0 -91
  60. package/dist/lib/grader-consistency.d.ts +0 -27
  61. package/dist/lib/grader-consistency.js +0 -79
  62. package/dist/lib/grader-sensitivity.d.ts +0 -19
  63. package/dist/lib/grader-sensitivity.js +0 -75
  64. package/dist/lib/grader-validate.d.ts +0 -19
  65. package/dist/lib/grader-validate.js +0 -78
  66. package/dist/lib/measure-retrieval.d.ts +0 -14
  67. package/dist/lib/measure-retrieval.js +0 -71
  68. package/dist/lib/pr-comment.d.ts +0 -16
  69. package/dist/lib/pr-comment.js +0 -28
  70. package/dist/lib/readiness-report.d.ts +0 -13
  71. package/dist/lib/readiness-report.js +0 -108
  72. package/dist/lib/webhook-server.d.ts +0 -11
  73. package/dist/lib/webhook-server.js +0 -24
  74. package/dist/lib/weekly-digest.d.ts +0 -24
  75. package/dist/lib/weekly-digest.js +0 -148
  76. package/dist/orchestration/env-bridge.d.ts +0 -21
  77. package/dist/orchestration/env-bridge.js +0 -66
  78. package/dist/orchestration/steps/fetch-docs-shell.d.ts +0 -17
  79. package/dist/orchestration/steps/fetch-docs-shell.js +0 -30
  80. package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +0 -9
  81. package/dist/pipeline/compiler/__tests__/task-bridge.test.js +0 -339
  82. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +0 -70
  83. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +0 -485
  84. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +0 -76
  85. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +0 -245
  86. package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +0 -89
  87. package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +0 -379
  88. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +0 -50
  89. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +0 -334
  90. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +0 -69
  91. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +0 -307
  92. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +0 -65
  93. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +0 -368
  94. package/dist/pipeline/compiler/task-bridge.d.ts +0 -41
  95. package/dist/pipeline/compiler/task-bridge.js +0 -92
  96. package/dist/pipeline/expand-tasks.d.ts +0 -232
  97. package/dist/pipeline/expand-tasks.js +0 -467
  98. package/dist/pipeline/generate-configs.d.ts +0 -92
  99. package/dist/pipeline/generate-configs.js +0 -445
  100. package/dist/pipeline/steps/calculate-scores-step.d.ts +0 -11
  101. package/dist/pipeline/steps/calculate-scores-step.js +0 -89
  102. package/dist/pipeline/steps/compare-step.d.ts +0 -18
  103. package/dist/pipeline/steps/compare-step.js +0 -90
  104. package/dist/pipeline/steps/eval-step.d.ts +0 -53
  105. package/dist/pipeline/steps/eval-step.js +0 -347
  106. package/dist/pipeline/steps/fetch-docs-step.d.ts +0 -11
  107. package/dist/pipeline/steps/fetch-docs-step.js +0 -84
  108. package/dist/pipeline/steps/generate-configs-step.d.ts +0 -11
  109. package/dist/pipeline/steps/generate-configs-step.js +0 -98
  110. package/dist/pipeline/steps/grader-consistency-step.d.ts +0 -21
  111. package/dist/pipeline/steps/grader-consistency-step.js +0 -74
  112. package/dist/pipeline/steps/publish-report-step.d.ts +0 -57
  113. package/dist/pipeline/steps/publish-report-step.js +0 -243
  114. package/dist/pipeline/steps/report-step.d.ts +0 -13
  115. package/dist/pipeline/steps/report-step.js +0 -56
  116. package/dist/pipeline/steps/update-scores-step.d.ts +0 -11
  117. package/dist/pipeline/steps/update-scores-step.js +0 -42
  118. package/dist/scripts/agent-behavior-report.d.ts +0 -19
  119. package/dist/scripts/agent-behavior-report.js +0 -315
  120. package/dist/scripts/baseline.d.ts +0 -43
  121. package/dist/scripts/baseline.js +0 -267
  122. package/dist/scripts/calculate-scores.d.ts +0 -166
  123. package/dist/scripts/calculate-scores.js +0 -1296
  124. package/dist/scripts/compare.d.ts +0 -22
  125. package/dist/scripts/compare.js +0 -334
  126. package/dist/scripts/coverage-audit.d.ts +0 -44
  127. package/dist/scripts/coverage-audit.js +0 -209
  128. package/dist/scripts/debug-eval.d.ts +0 -19
  129. package/dist/scripts/debug-eval.js +0 -73
  130. package/dist/scripts/discovery-report.d.ts +0 -58
  131. package/dist/scripts/discovery-report.js +0 -250
  132. package/dist/scripts/fetch-docs.d.ts +0 -35
  133. package/dist/scripts/fetch-docs.js +0 -472
  134. package/dist/scripts/generate-configs.d.ts +0 -66
  135. package/dist/scripts/generate-configs.js +0 -459
  136. package/dist/scripts/grader-api.d.ts +0 -27
  137. package/dist/scripts/grader-api.js +0 -206
  138. package/dist/scripts/grader-compare.d.ts +0 -22
  139. package/dist/scripts/grader-compare.js +0 -368
  140. package/dist/scripts/grader-consistency.d.ts +0 -20
  141. package/dist/scripts/grader-consistency.js +0 -313
  142. package/dist/scripts/grader-sensitivity.d.ts +0 -22
  143. package/dist/scripts/grader-sensitivity.js +0 -354
  144. package/dist/scripts/grader-validate.d.ts +0 -19
  145. package/dist/scripts/grader-validate.js +0 -267
  146. package/dist/scripts/measure-retrieval.d.ts +0 -10
  147. package/dist/scripts/measure-retrieval.js +0 -145
  148. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +0 -24
  149. package/dist/scripts/migrate-tasks-to-content-lake.js +0 -328
  150. package/dist/scripts/pipeline.d.ts +0 -76
  151. package/dist/scripts/pipeline.js +0 -1031
  152. package/dist/scripts/pr-comment.d.ts +0 -10
  153. package/dist/scripts/pr-comment.js +0 -510
  154. package/dist/scripts/readiness-report.d.ts +0 -88
  155. package/dist/scripts/readiness-report.js +0 -342
  156. package/dist/scripts/update-quality-scores.d.ts +0 -15
  157. package/dist/scripts/update-quality-scores.js +0 -184
  158. package/dist/scripts/validate-task-sources.d.ts +0 -21
  159. package/dist/scripts/validate-task-sources.js +0 -210
  160. package/dist/scripts/validate.d.ts +0 -13
  161. package/dist/scripts/validate.js +0 -79
  162. package/dist/scripts/webhook-server.d.ts +0 -26
  163. package/dist/scripts/webhook-server.js +0 -147
  164. package/dist/scripts/weekly-digest.d.ts +0 -24
  165. package/dist/scripts/weekly-digest.js +0 -144
  166. package/dist/sinks/format-slack.d.ts +0 -64
  167. package/dist/sinks/format-slack.js +0 -306
  168. package/dist/sinks/slack-sink.d.ts +0 -27
  169. package/dist/sinks/slack-sink.js +0 -78
  170. package/dist/sinks/webhook-sink.d.ts +0 -19
  171. package/dist/sinks/webhook-sink.js +0 -50
  172. package/tasks/.expanded.agentic.yaml +0 -280
  173. package/tasks/.expanded.yaml +0 -565
@@ -1,166 +0,0 @@
1
- /**
2
- * Calculate-scores.ts
3
- *
4
- * Reads Promptfoo evaluation output and computes the AI Literacy Score
5
- * for each feature area. Each dimension is scored on a uniform 0–100 scale:
6
- *
7
- * Task Completion (0–100) — Can the LLM implement the feature?
8
- * Code Correctness (0–100) — Is the code idiomatic and correct?
9
- * Doc Coverage (0–100) — Did docs provide the needed info?
10
- *
11
- * Dimensions are combined into a weighted composite (0–100) using weights
12
- * from config/rubrics.yaml (default: Task×0.50 + Code×0.25 + Docs×0.25).
13
- *
14
- * Additionally compares with-docs vs without-docs scores to calculate
15
- * the "Doc Lift" — how much documentation helps vs parametric knowledge.
16
- *
17
- * When tests are run with the InstrumentedProvider (agent-observer),
18
- * this script also aggregates and reports agent behavior data: which
19
- * documentation pages were visited, what searches were performed, and
20
- * overall network activity patterns.
21
- */
22
- import "dotenv/config";
23
- import type { FeatureScore, GraderJudgment, PerModelEntry } from "../pipeline/types.js";
24
- export interface ComponentResult {
25
- assertion?: {
26
- type: string;
27
- value?: string;
28
- /** Structured metadata propagated from rubric templates (Approach 5). */
29
- metadata?: Record<string, unknown>;
30
- };
31
- pass: boolean;
32
- reason?: string;
33
- score?: number;
34
- }
35
- export interface PromptfooResultsWrapper {
36
- results: RawTestResult[];
37
- stats: {
38
- successes: number;
39
- failures: number;
40
- tokenUsage?: {
41
- assertions?: {
42
- completion: number;
43
- prompt: number;
44
- total: number;
45
- };
46
- completion: number;
47
- prompt: number;
48
- total: number;
49
- };
50
- };
51
- }
52
- export interface RawPromptfooFile {
53
- [key: string]: unknown;
54
- config?: {
55
- defaultTest?: {
56
- options?: {
57
- provider?: string;
58
- rubricProvider?: string;
59
- };
60
- };
61
- };
62
- results: PromptfooResultsWrapper;
63
- }
64
- export interface RawTestResult {
65
- cost?: number;
66
- error?: string;
67
- gradingResult: null | {
68
- componentResults: ComponentResult[];
69
- pass: boolean;
70
- };
71
- metadata?: Record<string, unknown>;
72
- provider?: {
73
- id?: string;
74
- label?: string;
75
- };
76
- response: {
77
- output: string;
78
- };
79
- testCase?: {
80
- description?: string;
81
- vars?: Record<string, string>;
82
- };
83
- vars: Record<string, string>;
84
- }
85
- export interface TestResult {
86
- cost: number;
87
- description: string;
88
- gradingResult: {
89
- componentResults: ComponentResult[];
90
- pass: boolean;
91
- };
92
- metadata?: Record<string, unknown>;
93
- /** Provider identifier (e.g., "openai:gpt-4o") */
94
- providerId?: string;
95
- /** Provider label (e.g., "GPT-4o") */
96
- providerLabel?: string;
97
- response: {
98
- output: string;
99
- };
100
- vars: Record<string, string>;
101
- }
102
- export interface UrlMetadata {
103
- otherUrls: string[];
104
- sanityUrlCount: number;
105
- sanityUrls: string[];
106
- totalUrlCount: number;
107
- }
108
- /**
109
- * Calculate scores grouped by model. Each model gets its own FeatureScore[]
110
- * and model-level aggregates.
111
- *
112
- * Uses the provider.id from Promptfoo results to identify models.
113
- * Falls back to provider.label, then "unknown" if neither is available.
114
- *
115
- * @returns Record keyed by model ID, or null if only one model was used
116
- * (per-model breakdown is redundant when there's only one model).
117
- */
118
- export declare function calculateScoresPerModel(resultsPath: string, weights: Record<string, number>): null | PerModelEntry[];
119
- export declare function classifyRubric(component: ComponentResult): "codeCorrectness" | "docCoverage" | "taskCompletion" | null;
120
- export declare function detectFeatureArea(description: string): string;
121
- /**
122
- * Extract grader judgments (reason text + scores) from evaluation results.
123
- *
124
- * This preserves the grader's natural language reasoning for downstream
125
- * analysis (failure mode classification, gap analysis). Each llm-rubric
126
- * assertion produces one GraderJudgment entry.
127
- *
128
- * Phase 3a prerequisite: structured judgment data for failure mode extraction.
129
- */
130
- export declare function extractGraderJudgments(resultsPath: string): GraderJudgment[];
131
- /**
132
- * Finds the URL-extraction assertion result in a test's componentResults
133
- * and parses the structured JSON from its `reason` field.
134
- */
135
- export declare function extractUrlMetadata(test: TestResult): null | UrlMetadata;
136
- export declare function parseRubricScore(component: ComponentResult): number;
137
- /**
138
- * Score agentic evaluation results. In agentic mode, all test entries are
139
- * gold-only (no baseline entries — the .expanded.agentic.yaml fix ensures this).
140
- * The model retrieves docs via tools, so all results map to "actual" scores.
141
- *
142
- * Returns a record keyed by feature area with the composite actual score.
143
- */
144
- interface ActualScoreEntry {
145
- actualScore: number;
146
- codeCorrectness: number;
147
- docCoverage: number;
148
- taskCompletion: number;
149
- testCount: number;
150
- totalCost: number;
151
- }
152
- export declare function scoreAgenticResults(resultsPath: string, weights: Record<string, number>): Record<string, ActualScoreEntry>;
153
- /**
154
- * Merge baseline FeatureScore[] with agentic actual scores to produce
155
- * the full three-layer decomposition.
156
- *
157
- * The merge is per feature area. For each area:
158
- * - If baseline data exists: floor, ceiling, docLift, docQualityGap are populated
159
- * - If agentic data exists: actualScore is populated
160
- * - If both exist: retrievalGap and infrastructureEfficiency are computed
161
- *
162
- * @param baselineScores Floor/ceiling scores from baseline evaluation (may be empty)
163
- * @param agenticScores Actual scores from agentic evaluation (may be empty)
164
- */
165
- export declare function mergeScores(baselineScores: FeatureScore[], agenticScores: Record<string, ActualScoreEntry>): FeatureScore[];
166
- export {};