@sanity/ailf 4.6.0 → 6.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/canonical/grader-references/agent-harness-tools.yaml +42 -0
- package/canonical/grader-references/knowledge-probe-recall.yaml +36 -0
- package/canonical/grader-references/mcp-server-spec.yaml +51 -0
- package/canonical/grader-references/portable-text.yaml +48 -0
- package/config/diagnosis-cards.ts +318 -0
- package/config/models.ts +12 -0
- package/config/rubrics.ts +38 -2
- package/dist/_vendor/ailf-core/artifact-registry.d.ts +60 -2
- package/dist/_vendor/ailf-core/artifact-registry.js +288 -7
- package/dist/_vendor/ailf-core/examples/index.d.ts +125 -26
- package/dist/_vendor/ailf-core/examples/index.js +146 -47
- package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.js +16 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/common.d.ts +14 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/common.js +18 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/index.d.ts +45 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/index.js +109 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.js +17 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/literacy.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/literacy.js +17 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/mcp.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/mcp.js +17 -0
- package/dist/_vendor/ailf-core/index.d.ts +1 -0
- package/dist/_vendor/ailf-core/index.js +4 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +8 -0
- package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +15 -0
- package/dist/_vendor/ailf-core/schemas/branded-string.d.ts +40 -0
- package/dist/_vendor/ailf-core/schemas/branded-string.js +45 -0
- package/dist/_vendor/ailf-core/schemas/confidence-schema.d.ts +36 -0
- package/dist/_vendor/ailf-core/schemas/confidence-schema.js +32 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -4
- package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/schemas/index.js +9 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +1 -0
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +34 -8
- package/dist/_vendor/ailf-core/schemas/pipeline.js +23 -1
- package/dist/_vendor/ailf-core/services/diagnosis/card-validators.d.ts +41 -0
- package/dist/_vendor/ailf-core/services/diagnosis/card-validators.js +40 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.d.ts +7 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.js +131 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.d.ts +7 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.js +171 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.d.ts +7 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.js +155 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.d.ts +17 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.js +43 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.d.ts +46 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.js +104 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.d.ts +28 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.js +96 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/index.d.ts +39 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/index.js +52 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.d.ts +27 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.js +77 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.d.ts +32 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.js +71 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.d.ts +44 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.js +126 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.d.ts +41 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.js +107 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.d.ts +43 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.js +114 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.d.ts +72 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.js +273 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.d.ts +17 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.js +58 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.d.ts +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.js +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.d.ts +15 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.js +53 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.d.ts +14 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.js +63 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.d.ts +16 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.js +78 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.d.ts +16 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.js +86 -0
- package/dist/_vendor/ailf-core/services/diagnosis/registry.d.ts +50 -0
- package/dist/_vendor/ailf-core/services/diagnosis/registry.js +35 -0
- package/dist/_vendor/ailf-core/services/diagnosis-runner.d.ts +136 -0
- package/dist/_vendor/ailf-core/services/diagnosis-runner.js +153 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +6 -0
- package/dist/_vendor/ailf-core/services/index.js +18 -0
- package/dist/_vendor/ailf-core/services/llm-client-factory.d.ts +64 -0
- package/dist/_vendor/ailf-core/services/llm-client-factory.js +54 -0
- package/dist/_vendor/ailf-core/services/report-to-markdown.js +3 -2
- package/dist/_vendor/ailf-core/types/attribution.d.ts +82 -0
- package/dist/_vendor/ailf-core/types/attribution.js +18 -0
- package/dist/_vendor/ailf-core/types/branded-ids.d.ts +26 -1
- package/dist/_vendor/ailf-core/types/branded-ids.js +80 -4
- package/dist/_vendor/ailf-core/types/confidence.d.ts +1 -1
- package/dist/_vendor/ailf-core/types/confidence.js +7 -0
- package/dist/_vendor/ailf-core/types/diagnosis.d.ts +271 -0
- package/dist/_vendor/ailf-core/types/diagnosis.js +19 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +16 -1
- package/dist/_vendor/ailf-core/types/grader-judgment.d.ts +125 -0
- package/dist/_vendor/ailf-core/types/grader-judgment.js +30 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +80 -29
- package/dist/_vendor/ailf-core/types/index.js +15 -1
- package/dist/_vendor/ailf-core/types/legacy-grader-judgment.d.ts +55 -0
- package/dist/_vendor/ailf-core/types/legacy-grader-judgment.js +30 -0
- package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +1 -0
- package/dist/_vendor/ailf-core/types/repo-config.d.ts +8 -0
- package/dist/_vendor/ailf-shared/document-ref.d.ts +1 -1
- package/dist/adapters/api-client/build-request.d.ts +1 -0
- package/dist/adapters/api-client/build-request.js +3 -0
- package/dist/adapters/attribution/attribution-meta-writer.d.ts +35 -0
- package/dist/adapters/attribution/attribution-meta-writer.js +34 -0
- package/dist/adapters/attribution/index.d.ts +9 -0
- package/dist/adapters/attribution/index.js +8 -0
- package/dist/adapters/attribution/per-entry-attribution-writer.d.ts +56 -0
- package/dist/adapters/attribution/per-entry-attribution-writer.js +49 -0
- package/dist/adapters/config-sources/file-config-adapter.js +1 -0
- package/dist/adapters/grader-outputs/index.d.ts +10 -0
- package/dist/adapters/grader-outputs/index.js +8 -0
- package/dist/adapters/grader-outputs/legacy/index.d.ts +11 -0
- package/dist/adapters/grader-outputs/legacy/index.js +10 -0
- package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.d.ts +49 -0
- package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.js +48 -0
- package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +102 -0
- package/dist/adapters/grader-outputs/promptfoo-grader-output.js +93 -0
- package/dist/adapters/index.d.ts +3 -0
- package/dist/adapters/index.js +4 -0
- package/dist/adapters/llm/fake-llm-client.d.ts +20 -0
- package/dist/adapters/llm/fake-llm-client.js +38 -1
- package/dist/adapters/llm/openai-llm-client.js +52 -3
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +5 -1
- package/dist/adapters/task-sources/content-lake-task-source.js +28 -2
- package/dist/adapters/task-sources/repo-schemas.d.ts +79 -11
- package/dist/adapters/task-sources/repo-schemas.js +19 -2
- package/dist/cli-program.js +3 -0
- package/dist/commands/calculate-scores.js +1 -1
- package/dist/commands/explain-handler.js +1 -1
- package/dist/commands/interpret.d.ts +50 -0
- package/dist/commands/interpret.js +212 -0
- package/dist/commands/lookup-doc.d.ts +1 -1
- package/dist/commands/lookup-doc.js +3 -3
- package/dist/commands/pipeline-action.d.ts +6 -0
- package/dist/commands/pipeline-action.js +2 -0
- package/dist/commands/remote-pipeline.js +1 -0
- package/dist/composition-root.d.ts +57 -23
- package/dist/composition-root.js +155 -41
- package/dist/config/diagnosis-cards.ts +318 -0
- package/dist/config/models.ts +12 -0
- package/dist/config/rubrics.ts +38 -2
- package/dist/grader/agent-harness.d.ts +9 -0
- package/dist/grader/agent-harness.js +9 -0
- package/dist/grader/common.d.ts +9 -0
- package/dist/grader/common.js +9 -0
- package/dist/grader/index.d.ts +24 -0
- package/dist/grader/index.js +24 -0
- package/dist/grader/knowledge-probe.d.ts +9 -0
- package/dist/grader/knowledge-probe.js +9 -0
- package/dist/grader/literacy.d.ts +9 -0
- package/dist/grader/literacy.js +9 -0
- package/dist/grader/mcp.d.ts +9 -0
- package/dist/grader/mcp.js +9 -0
- package/dist/orchestration/build-app-context.js +1 -0
- package/dist/orchestration/build-step-sequence.js +5 -0
- package/dist/orchestration/steps/calculate-scores-step.js +23 -1
- package/dist/orchestration/steps/compute-attribution-step.d.ts +44 -0
- package/dist/orchestration/steps/compute-attribution-step.js +279 -0
- package/dist/orchestration/steps/gap-analysis-step.js +35 -7
- package/dist/orchestration/steps/index.d.ts +1 -0
- package/dist/orchestration/steps/index.js +1 -0
- package/dist/pipeline/attribution.d.ts +15 -0
- package/dist/pipeline/attribution.js +18 -9
- package/dist/pipeline/borderline-consensus-runner.d.ts +63 -0
- package/dist/pipeline/borderline-consensus-runner.js +124 -0
- package/dist/pipeline/borderline-detector.d.ts +24 -0
- package/dist/pipeline/borderline-detector.js +26 -0
- package/dist/pipeline/calculate-scores.d.ts +114 -3
- package/dist/pipeline/calculate-scores.js +426 -24
- package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
- package/dist/pipeline/compiler/literacy-bridge.js +35 -17
- package/dist/pipeline/compiler/rubric-resolution.d.ts +15 -0
- package/dist/pipeline/compiler/rubric-resolution.js +9 -1
- package/dist/pipeline/compute-attribution.d.ts +80 -0
- package/dist/pipeline/compute-attribution.js +196 -0
- package/dist/pipeline/failure-modes.d.ts +52 -17
- package/dist/pipeline/failure-modes.js +178 -117
- package/dist/pipeline/map-request-to-config.js +1 -0
- package/package.json +7 -5
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Per-dimension failure-mode taxonomy barrel — relocated to @sanity/ailf-core.
|
|
3
|
+
*
|
|
4
|
+
* Previously lived in packages/eval/src/grader/index.ts. Moved here (D-05)
|
|
5
|
+
* so card files in packages/core/src/services/diagnosis/cards/ can import
|
|
6
|
+
* `failureModesForDimension` without violating the core→eval import rule.
|
|
7
|
+
*
|
|
8
|
+
* Named re-exports only (W0124 — never `export *`).
|
|
9
|
+
*
|
|
10
|
+
* The eval-side packages/eval/src/grader/index.ts now re-exports from here,
|
|
11
|
+
* keeping all existing eval-side callers working without source changes.
|
|
12
|
+
*
|
|
13
|
+
* @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
|
|
14
|
+
* §"Per-dimension failure-mode taxonomies" (lines 239-283).
|
|
15
|
+
* @see docs/decisions/D0005-grader-model-separation.md — single grader model;
|
|
16
|
+
* taxonomies travel with the rubric prompt for reproducibility.
|
|
17
|
+
*/
|
|
18
|
+
export { COMMON_FAILURE_MODES } from "./common.js";
|
|
19
|
+
export { LITERACY_FAILURE_MODES } from "./literacy.js";
|
|
20
|
+
export { MCP_FAILURE_MODES } from "./mcp.js";
|
|
21
|
+
export { KP_FAILURE_MODES } from "./knowledge-probe.js";
|
|
22
|
+
export { AGENT_FAILURE_MODES } from "./agent-harness.js";
|
|
23
|
+
import { COMMON_FAILURE_MODES } from "./common.js";
|
|
24
|
+
import { LITERACY_FAILURE_MODES } from "./literacy.js";
|
|
25
|
+
import { MCP_FAILURE_MODES } from "./mcp.js";
|
|
26
|
+
import { KP_FAILURE_MODES } from "./knowledge-probe.js";
|
|
27
|
+
import { AGENT_FAILURE_MODES } from "./agent-harness.js";
|
|
28
|
+
/**
 * Return the legal failure-mode list for a given rubric dimension.
 *
 * Accepts both family-level keys (`mcp-behavior`, `knowledge-probe`,
 * `agent-harness`) and the per-template `dimension` strings used in
 * `config/rubrics.ts` (`task-completion`, `input-validation`,
 * `factual-correctness`, `process-quality`, …). The cross-cutting
 * `COMMON_FAILURE_MODES` always come first, followed by the modes of the
 * family the dimension belongs to.
 *
 * Unknown dimensions fall through to the common modes only — safe
 * default, the grader can still pick `unclassified`.
 *
 * Every branch, including the fallback, returns a freshly allocated array,
 * so callers may mutate the result without touching the shared constants.
 *
 * @param {string} dimension - Family key or per-template dimension name.
 * @returns {string[]} Common modes followed by the family-specific modes.
 */
export function failureModesForDimension(dimension) {
    switch (dimension) {
        // ── Literacy family ──────────────────────────────────────
        case "task-completion":
        case "code-correctness":
        case "doc-coverage":
            return [...COMMON_FAILURE_MODES, ...LITERACY_FAILURE_MODES];
        // ── MCP family ───────────────────────────────────────────
        case "mcp-behavior":
        case "input-validation":
        case "output-correctness":
        case "error-handling":
        case "security":
            return [...COMMON_FAILURE_MODES, ...MCP_FAILURE_MODES];
        // ── Knowledge-probe family ───────────────────────────────
        case "knowledge-probe":
        case "factual-correctness":
        case "completeness":
        case "currency":
            return [...COMMON_FAILURE_MODES, ...KP_FAILURE_MODES];
        // ── Agent-harness family ─────────────────────────────────
        case "agent-harness":
        case "process-quality":
        case "agent-output":
        case "tool-usage":
            return [...COMMON_FAILURE_MODES, ...AGENT_FAILURE_MODES];
        default:
            // Copy rather than hand out the shared constant: the other
            // branches already return fresh arrays, and a caller that
            // mutates the fallback result must not corrupt the module-level
            // list.
            return [...COMMON_FAILURE_MODES];
    }
}
|
|
70
|
+
/**
 * Every dimension name that `failureModesForDimension` maps to a family,
 * grouped family by family in the same order as that function's `switch`.
 * Per the original note, consumed by card-validators and test calibration.
 */
export const CANONICAL_DIMENSIONS = [
    // Literacy
    "task-completion", "code-correctness", "doc-coverage",
    // MCP
    "mcp-behavior", "input-validation", "output-correctness", "error-handling", "security",
    // Knowledge probe
    "knowledge-probe", "factual-correctness", "completeness", "currency",
    // Agent harness
    "agent-harness", "process-quality", "agent-output", "tool-usage",
];
|
|
96
|
+
/**
 * Report whether `mode` is listed in at least one failure-mode family:
 * the common list or any of the literacy, MCP, knowledge-probe, or
 * agent-harness lists.
 *
 * @param {unknown} mode - Candidate failure-mode value.
 * @returns {boolean} `true` when some family list contains `mode`.
 */
export function isCanonicalFailureMode(mode) {
    const families = [
        COMMON_FAILURE_MODES,
        LITERACY_FAILURE_MODES,
        MCP_FAILURE_MODES,
        KP_FAILURE_MODES,
        AGENT_FAILURE_MODES,
    ];
    // `includes` uses the same SameValueZero comparison as `Set#has`.
    return families.some((family) => family.includes(mode));
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Knowledge-probe failure modes — valid for the `knowledge-probe` dimension
|
|
3
|
+
* family (factual-correctness, completeness, currency).
|
|
4
|
+
*
|
|
5
|
+
* Relocated from packages/eval/src/grader/knowledge-probe.ts to @sanity/ailf-core
|
|
6
|
+
* so card files in packages/core/src/services/diagnosis/cards/ can import
|
|
7
|
+
* them without violating the core→eval import direction rule (D-05).
|
|
8
|
+
*
|
|
9
|
+
* @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
|
|
10
|
+
* §"Per-dimension failure-mode taxonomies" (lines 239-283).
|
|
11
|
+
*/
|
|
12
|
+
export declare const KP_FAILURE_MODES: readonly ["factual-error", "incompleteness", "currency-violation", "hallucination"];
|
|
13
|
+
/** Union of the string literals listed in `KP_FAILURE_MODES`. */
export type KPFailureMode = (typeof KP_FAILURE_MODES)[number];
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Knowledge-probe failure modes — valid for the `knowledge-probe` dimension
|
|
3
|
+
* family (factual-correctness, completeness, currency).
|
|
4
|
+
*
|
|
5
|
+
* Relocated from packages/eval/src/grader/knowledge-probe.ts to @sanity/ailf-core
|
|
6
|
+
* so card files in packages/core/src/services/diagnosis/cards/ can import
|
|
7
|
+
* them without violating the core→eval import direction rule (D-05).
|
|
8
|
+
*
|
|
9
|
+
* @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
|
|
10
|
+
* §"Per-dimension failure-mode taxonomies" (lines 239-283).
|
|
11
|
+
*/
|
|
12
|
+
export const KP_FAILURE_MODES = [
    // The assistant asserts something demonstrably false.
    "factual-error",
    // The assistant covers part of the answer but misses a key piece.
    "incompleteness",
    // The assistant cites stale facts beyond the doc currency horizon.
    "currency-violation",
    // The assistant invents details not present in any doc.
    "hallucination",
];
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Literacy failure modes — valid for `task-completion`, `code-correctness`,
|
|
3
|
+
* `doc-coverage` (the literacy dimension family).
|
|
4
|
+
*
|
|
5
|
+
* Relocated from packages/eval/src/grader/literacy.ts to @sanity/ailf-core
|
|
6
|
+
* so card files in packages/core/src/services/diagnosis/cards/ can import
|
|
7
|
+
* them without violating the core→eval import direction rule (D-05).
|
|
8
|
+
*
|
|
9
|
+
* @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
|
|
10
|
+
* §"Per-dimension failure-mode taxonomies" (lines 239-283).
|
|
11
|
+
*/
|
|
12
|
+
export declare const LITERACY_FAILURE_MODES: readonly ["missing-docs", "outdated-docs", "incorrect-docs", "poor-structure"];
|
|
13
|
+
/** Union of the string literals listed in `LITERACY_FAILURE_MODES`. */
export type LiteracyFailureMode = (typeof LITERACY_FAILURE_MODES)[number];
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Literacy failure modes — valid for `task-completion`, `code-correctness`,
|
|
3
|
+
* `doc-coverage` (the literacy dimension family).
|
|
4
|
+
*
|
|
5
|
+
* Relocated from packages/eval/src/grader/literacy.ts to @sanity/ailf-core
|
|
6
|
+
* so card files in packages/core/src/services/diagnosis/cards/ can import
|
|
7
|
+
* them without violating the core→eval import direction rule (D-05).
|
|
8
|
+
*
|
|
9
|
+
* @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
|
|
10
|
+
* §"Per-dimension failure-mode taxonomies" (lines 239-283).
|
|
11
|
+
*/
|
|
12
|
+
export const LITERACY_FAILURE_MODES = [
    // The relevant doc didn't exist.
    "missing-docs",
    // The doc reflects an older API/version.
    "outdated-docs",
    // The doc states something factually wrong.
    "incorrect-docs",
    // The doc exists but is hard to find or follow.
    "poor-structure",
];
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MCP failure modes — valid for the `mcp-behavior` dimension family
|
|
3
|
+
* (input-validation, output-correctness, error-handling, security).
|
|
4
|
+
*
|
|
5
|
+
* Relocated from packages/eval/src/grader/mcp.ts to @sanity/ailf-core
|
|
6
|
+
* so card files in packages/core/src/services/diagnosis/cards/ can import
|
|
7
|
+
* them without violating the core→eval import direction rule (D-05).
|
|
8
|
+
*
|
|
9
|
+
* @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
|
|
10
|
+
* §"Per-dimension failure-mode taxonomies" (lines 239-283).
|
|
11
|
+
*/
|
|
12
|
+
export declare const MCP_FAILURE_MODES: readonly ["spec-mismatch", "missing-error-handling", "over-privileged", "missing-docs"];
|
|
13
|
+
/** Union of the string literals listed in `MCP_FAILURE_MODES`. */
export type MCPFailureMode = (typeof MCP_FAILURE_MODES)[number];
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MCP failure modes — valid for the `mcp-behavior` dimension family
|
|
3
|
+
* (input-validation, output-correctness, error-handling, security).
|
|
4
|
+
*
|
|
5
|
+
* Relocated from packages/eval/src/grader/mcp.ts to @sanity/ailf-core
|
|
6
|
+
* so card files in packages/core/src/services/diagnosis/cards/ can import
|
|
7
|
+
* them without violating the core→eval import direction rule (D-05).
|
|
8
|
+
*
|
|
9
|
+
* @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
|
|
10
|
+
* §"Per-dimension failure-mode taxonomies" (lines 239-283).
|
|
11
|
+
*/
|
|
12
|
+
export const MCP_FAILURE_MODES = [
    // Tool/server output doesn't match the published MCP spec.
    "spec-mismatch",
    // Tool failure path is under-documented or absent.
    "missing-error-handling",
    // Tool exposes operations the doc didn't sanction.
    "over-privileged",
    // Cross-cutting: same literal as the literacy family's `missing-docs`.
    // It is repeated here as a plain string, not imported from that list.
    "missing-docs",
];
|
|
@@ -23,3 +23,4 @@ export type { PricingEntry, PromptEntry, SourceEntry, } from "./config-helpers.j
|
|
|
23
23
|
export { env } from "./env-helper.js";
|
|
24
24
|
export { NoOpArtifactWriter, NotImplementedError, } from "./ports/artifact-writer.js";
|
|
25
25
|
export { assoc, resolveVariantMode, splitTaskVariant, type AssocContext, type TaskVariantSplit, } from "./artifact-capture/association.js";
|
|
26
|
+
export { AGENT_FAILURE_MODES, CANONICAL_DIMENSIONS, COMMON_FAILURE_MODES, KP_FAILURE_MODES, LITERACY_FAILURE_MODES, MCP_FAILURE_MODES, failureModesForDimension, isCanonicalFailureMode, type AgentFailureMode, type CommonFailureMode, type KPFailureMode, type LiteracyFailureMode, type MCPFailureMode, } from "./grader/failure-modes/index.js";
|
|
@@ -25,3 +25,7 @@ export { defineCanaryTasks, defineConfig, defineFeatures, defineModeBase, define
|
|
|
25
25
|
export { env } from "./env-helper.js";
|
|
26
26
|
export { NoOpArtifactWriter, NotImplementedError, } from "./ports/artifact-writer.js";
|
|
27
27
|
export { assoc, resolveVariantMode, splitTaskVariant, } from "./artifact-capture/association.js";
|
|
28
|
+
// ---------------------------------------------------------------------------
|
|
29
|
+
// Phase 5 — failure-mode taxonomy (D-05 hoist)
|
|
30
|
+
// ---------------------------------------------------------------------------
|
|
31
|
+
export { AGENT_FAILURE_MODES, CANONICAL_DIMENSIONS, COMMON_FAILURE_MODES, KP_FAILURE_MODES, LITERACY_FAILURE_MODES, MCP_FAILURE_MODES, failureModesForDimension, isCanonicalFailureMode, } from "./grader/failure-modes/index.js";
|
|
@@ -80,6 +80,14 @@ export interface ResolvedConfig {
|
|
|
80
80
|
noRemoteCache: boolean;
|
|
81
81
|
/** Grader replications for consistency measurement */
|
|
82
82
|
graderReplications?: number;
|
|
83
|
+
/**
|
|
84
|
+
* Replications per borderline judgment for the GRAD-04 intra-grader
|
|
85
|
+
* consensus pass (re-grade scores within ±5 of the severity boundaries
|
|
86
|
+
* 30/50/60). Sourced from `RepoConfig.execution.borderlineReplications`
|
|
87
|
+
* or the equivalent CLI/env surface. When unset, the runner factory
|
|
88
|
+
* falls back to its built-in default (3 replications).
|
|
89
|
+
*/
|
|
90
|
+
borderlineReplications?: number;
|
|
83
91
|
/**
|
|
84
92
|
* Grader context policy. Controls whether canonical reference docs are
|
|
85
93
|
* injected into the grader's assertion `rubricPrompt`. `"rubric-only"`
|
|
@@ -81,7 +81,22 @@ export interface ModeRubricConfig {
|
|
|
81
81
|
header: string;
|
|
82
82
|
scale: string[];
|
|
83
83
|
criteria_label?: string;
|
|
84
|
+
/**
|
|
85
|
+
* Plan 03-02 — per-dimension legal failure-mode list. The runtime
|
|
86
|
+
* rubric assembler announces these modes to the grader before the
|
|
87
|
+
* structured-shape footer when the list is non-empty. Stamped at
|
|
88
|
+
* config-load time by `failureModesForDimension(dimension)` in
|
|
89
|
+
* `packages/eval/src/grader/index.ts`.
|
|
90
|
+
*/
|
|
91
|
+
failureModes?: readonly string[];
|
|
84
92
|
}>;
|
|
93
|
+
/**
|
|
94
|
+
* Plan 03-01 — formerly hard-coded literal in
|
|
95
|
+
* `pipeline/compiler/rubric-resolution.ts`; now sourced from
|
|
96
|
+
* RubricConfig.footer and threaded through to the runtime prompt
|
|
97
|
+
* assembler so the grader is told exactly what wire format to emit.
|
|
98
|
+
*/
|
|
99
|
+
footer: string;
|
|
85
100
|
}
|
|
86
101
|
/** A provider entry in the compile result */
|
|
87
102
|
export interface CompileResultProvider {
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* branded-string.ts — single, audited cast site for Zod schemas that
|
|
3
|
+
* parse a non-empty string into a {@link Brand}-tagged domain type.
|
|
4
|
+
*
|
|
5
|
+
* Project rule (`.claude/rules/typescript.md`): "Parse, don't validate
|
|
6
|
+
* — use schema libs at boundaries; no `as` on `unknown`." The
|
|
7
|
+
* `Brand<string, "Tag">` utility in `branded-ids.ts` uses a `unique
|
|
8
|
+
* symbol` (`__brand`) that is not structurally identical to Zod 4's
|
|
9
|
+
* `BRAND` symbol, so a direct `.brand<"Tag">()` swap does not produce
|
|
10
|
+
* a compatible type. Wrapping the unavoidable cast in a single typed
|
|
11
|
+
* helper centralizes the rule violation to one auditable place
|
|
12
|
+
* instead of duplicating `as unknown as z.ZodType<Brand<…, …>>` at
|
|
13
|
+
* every adapter branded-field declaration.
|
|
14
|
+
*
|
|
15
|
+
* Usage:
|
|
16
|
+
*
|
|
17
|
+
* ```ts
|
|
18
|
+
* import { brandedString } from "../schemas/index.js"
|
|
19
|
+
*
|
|
20
|
+
* judgmentId: brandedString<"JudgmentId">().optional(),
|
|
21
|
+
* ```
|
|
22
|
+
*
|
|
23
|
+
* This is the **only** allowed place to elide the no-`as`-on-`unknown`
|
|
24
|
+
* rule for branded-string schemas. Phase 7 will reconsider the
|
|
25
|
+
* `Brand<>` shape against Zod 4's `BRAND` so the cast can be retired.
|
|
26
|
+
*
|
|
27
|
+
* @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
|
|
28
|
+
* @see packages/core/src/types/branded-ids.ts
|
|
29
|
+
*/
|
|
30
|
+
import { z } from "zod";
|
|
31
|
+
import type { Brand } from "../types/branded-ids.js";
|
|
32
|
+
/**
|
|
33
|
+
* Schema for a non-empty string typed as `Brand<string, TBrand>`.
|
|
34
|
+
*
|
|
35
|
+
* The single `as unknown as` cast inside this helper is the audited
|
|
36
|
+
* exit from the project's no-`as`-on-`unknown` rule. Adapters MUST
|
|
37
|
+
* NOT replicate the cast at their own call sites — call this helper
|
|
38
|
+
* instead so the rule violation stays centralized.
|
|
39
|
+
*/
|
|
40
|
+
// Takes no runtime arguments; `TBrand` is the brand tag carried by the parsed string's type.
export declare function brandedString<TBrand extends string>(): z.ZodType<Brand<string, TBrand>>;
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* branded-string.ts — single, audited cast site for Zod schemas that
|
|
3
|
+
* parse a non-empty string into a {@link Brand}-tagged domain type.
|
|
4
|
+
*
|
|
5
|
+
* Project rule (`.claude/rules/typescript.md`): "Parse, don't validate
|
|
6
|
+
* — use schema libs at boundaries; no `as` on `unknown`." The
|
|
7
|
+
* `Brand<string, "Tag">` utility in `branded-ids.ts` uses a `unique
|
|
8
|
+
* symbol` (`__brand`) that is not structurally identical to Zod 4's
|
|
9
|
+
* `BRAND` symbol, so a direct `.brand<"Tag">()` swap does not produce
|
|
10
|
+
* a compatible type. Wrapping the unavoidable cast in a single typed
|
|
11
|
+
* helper centralizes the rule violation to one auditable place
|
|
12
|
+
* instead of duplicating `as unknown as z.ZodType<Brand<…, …>>` at
|
|
13
|
+
* every adapter branded-field declaration.
|
|
14
|
+
*
|
|
15
|
+
* Usage:
|
|
16
|
+
*
|
|
17
|
+
* ```ts
|
|
18
|
+
* import { brandedString } from "../schemas/index.js"
|
|
19
|
+
*
|
|
20
|
+
* judgmentId: brandedString<"JudgmentId">().optional(),
|
|
21
|
+
* ```
|
|
22
|
+
*
|
|
23
|
+
* This is the **only** allowed place to elide the no-`as`-on-`unknown`
|
|
24
|
+
* rule for branded-string schemas. Phase 7 will reconsider the
|
|
25
|
+
* `Brand<>` shape against Zod 4's `BRAND` so the cast can be retired.
|
|
26
|
+
*
|
|
27
|
+
* @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
|
|
28
|
+
* @see packages/core/src/types/branded-ids.ts
|
|
29
|
+
*/
|
|
30
|
+
import { z } from "zod";
|
|
31
|
+
/**
 * Build the Zod schema used for branded string fields.
 *
 * In this compiled output the schema is simply "string with at least one
 * character". The `Brand<string, TBrand>` typing — and the audited
 * `as unknown as` cast the file header describes — exist only at the type
 * level, so nothing brand-related happens at runtime. Adapters should call
 * this helper rather than repeating that cast at their own call sites.
 *
 * @returns A Zod string schema that rejects the empty string.
 */
export function brandedString() {
    // Zod 4's own `.brand()` uses a different symbol shape than the
    // project's `Brand<>` helper (see branded-ids.ts), so the schema is left
    // unbranded here and typed by the accompanying declaration file.
    const nonEmptyString = z.string().min(1);
    return nonEmptyString;
}
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* confidence-schema.ts — shared Zod schema for the D0049 `Confidence` triple.
|
|
3
|
+
*
|
|
4
|
+
* Authored ONCE here so adapter schemas under
|
|
5
|
+
* `packages/eval/src/adapters/grader-outputs/` and
|
|
6
|
+
* `packages/eval/src/adapters/attribution/` import a single shared schema
|
|
7
|
+
* fragment instead of redeclaring the shape inline. The schema asserts
|
|
8
|
+
* `satisfies z.ZodType<Confidence>` against the domain type in
|
|
9
|
+
* `packages/core/src/types/confidence.ts` so drift is a build error.
|
|
10
|
+
*
|
|
11
|
+
* NON-BOUNDARY HELPER: this file lives outside the D0045 SCAN_ROOTS gate
|
|
12
|
+
* by intent — it is a reusable schema fragment, not a trust boundary.
|
|
13
|
+
* Consumers import via the pinned subpath export
|
|
14
|
+
* `@sanity/ailf-core/schemas` (declared in `packages/core/package.json`),
|
|
15
|
+
* NOT through the top-level barrel — that pin is the
|
|
16
|
+
* single legal access path so all adapter sites use the same specifier.
|
|
17
|
+
*
|
|
18
|
+
* @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
|
|
19
|
+
* @see docs/decisions/D0049-shared-confidence-contract.md
|
|
20
|
+
*/
|
|
21
|
+
import { z } from "zod";
|
|
22
|
+
/**
 * Zod schema shared by every consumer of {@link Confidence}.
 *
 * `derivation` carries the open `ConfidenceDerivation` tag: any non-empty
 * string is accepted, so future emitters can introduce their own
 * identifiers without a change to this package (this mirrors the runtime
 * guard in `isConfidence`).
 */
export declare const ConfidenceSchema: z.ZodObject<
  {
    level: z.ZodEnum<{
      high: "high";
      medium: "medium";
      low: "low";
    }>;
    signalsPresent: z.ZodNumber;
    derivation: z.ZodString;
  },
  z.core.$strip
>;
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* confidence-schema.ts — shared Zod schema for the D0049 `Confidence` triple.
|
|
3
|
+
*
|
|
4
|
+
* Authored ONCE here so adapter schemas under
|
|
5
|
+
* `packages/eval/src/adapters/grader-outputs/` and
|
|
6
|
+
* `packages/eval/src/adapters/attribution/` import a single shared schema
|
|
7
|
+
* fragment instead of redeclaring the shape inline. The schema asserts
|
|
8
|
+
* `satisfies z.ZodType<Confidence>` against the domain type in
|
|
9
|
+
* `packages/core/src/types/confidence.ts` so drift is a build error.
|
|
10
|
+
*
|
|
11
|
+
* NON-BOUNDARY HELPER: this file lives outside the D0045 SCAN_ROOTS gate
|
|
12
|
+
* by intent — it is a reusable schema fragment, not a trust boundary.
|
|
13
|
+
* Consumers import via the pinned subpath export
|
|
14
|
+
* `@sanity/ailf-core/schemas` (declared in `packages/core/package.json`),
|
|
15
|
+
* NOT through the top-level barrel — that pin is the
|
|
16
|
+
* single legal access path so all adapter sites use the same specifier.
|
|
17
|
+
*
|
|
18
|
+
* @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
|
|
19
|
+
* @see docs/decisions/D0049-shared-confidence-contract.md
|
|
20
|
+
*/
|
|
21
|
+
import { z } from "zod";
|
|
22
|
+
// Field-by-field shape of the confidence object, kept separate from the
// `z.object(...)` call so each constraint reads on its own line.
const confidenceShape = {
  // One of the three named confidence levels.
  level: z.enum(["high", "medium", "low"]),
  // A non-negative integer.
  signalsPresent: z.number().int().nonnegative(),
  // Open `ConfidenceDerivation` tag: any non-empty string is accepted.
  derivation: z.string().min(1),
};

/**
 * Zod schema shared by every consumer of {@link Confidence}.
 *
 * `derivation` is deliberately not an enum: accepting any non-empty string
 * lets future emitters introduce their own identifiers without a change to
 * this package (this mirrors the runtime guard in `isConfidence`).
 */
export const ConfidenceSchema = z.object(confidenceShape);
|
|
@@ -41,6 +41,7 @@ export declare const EvalConfigSchema: z.ZodObject<{
|
|
|
41
41
|
execution: z.ZodOptional<z.ZodObject<{
|
|
42
42
|
concurrency: z.ZodOptional<z.ZodNumber>;
|
|
43
43
|
graderReplications: z.ZodOptional<z.ZodNumber>;
|
|
44
|
+
borderlineReplications: z.ZodOptional<z.ZodNumber>;
|
|
44
45
|
gapAnalysis: z.ZodOptional<z.ZodBoolean>;
|
|
45
46
|
apiUrl: z.ZodOptional<z.ZodString>;
|
|
46
47
|
}, z.core.$strip>>;
|
|
@@ -85,15 +85,19 @@ export const EvalConfigSchema = z
|
|
|
85
85
|
* `concurrency`, `gapAnalysis`, and `graderReplications` fields and adds
|
|
86
86
|
* `apiUrl` to the same group.
|
|
87
87
|
*
|
|
88
|
-
* - `concurrency`
|
|
89
|
-
* - `graderReplications`
|
|
90
|
-
* - `
|
|
91
|
-
* -
|
|
88
|
+
* - `concurrency` — max parallel API calls
|
|
89
|
+
* - `graderReplications` — grader consistency replications
|
|
90
|
+
* - `borderlineReplications` — replications per borderline judgment
|
|
91
|
+
* for the GRAD-04 intra-grader consensus
|
|
92
|
+
* pass (default 3 in composition-root)
|
|
93
|
+
* - `gapAnalysis` — enable failure-mode + impact analysis (default true)
|
|
94
|
+
* - `apiUrl` — AILF API base URL (default https://ailf-api.sanity.build)
|
|
92
95
|
*/
|
|
93
96
|
execution: z
|
|
94
97
|
.object({
|
|
95
98
|
concurrency: z.number().int().positive().optional(),
|
|
96
99
|
graderReplications: z.number().int().positive().optional(),
|
|
100
|
+
borderlineReplications: z.number().int().positive().optional(),
|
|
97
101
|
gapAnalysis: z.boolean().optional(),
|
|
98
102
|
apiUrl: z.string().url().optional(),
|
|
99
103
|
})
|
|
@@ -19,3 +19,12 @@ export * from "./schedules.js";
|
|
|
19
19
|
export * from "./sinks.js";
|
|
20
20
|
export * from "./symbol-preflight-report.js";
|
|
21
21
|
export * from "./test-budgets.js";
|
|
22
|
+
// Phase 1 Plan 02 — shared schema fragment for D0049 Confidence.
|
|
23
|
+
// Named re-export only (W0124 / D0045) and pinned-subpath access path
|
|
24
|
+
// `@sanity/ailf-core/schemas` for adapter consumers.
|
|
25
|
+
export { ConfidenceSchema } from "./confidence-schema.js";
|
|
26
|
+
// Phase 1 Plan 03 — single audited cast site for `Brand<string, T>`
|
|
27
|
+
// schemas. Adapters MUST route branded-field declarations through this
|
|
28
|
+
// helper instead of replicating `as unknown as z.ZodType<…>` at each
|
|
29
|
+
// schema author site (project rule: no `as` on `unknown`).
|
|
30
|
+
export { brandedString } from "./branded-string.js";
|
|
@@ -48,6 +48,7 @@ export declare const PipelineRequestSchema: z.ZodObject<{
|
|
|
48
48
|
"with-docs": "with-docs";
|
|
49
49
|
}>>;
|
|
50
50
|
graderReplications: z.ZodOptional<z.ZodNumber>;
|
|
51
|
+
borderlineReplications: z.ZodOptional<z.ZodNumber>;
|
|
51
52
|
headers: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
|
|
52
53
|
inlineTasks: z.ZodOptional<z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>>>;
|
|
53
54
|
jobId: z.ZodOptional<z.ZodString>;
|
|
@@ -114,6 +114,7 @@ export const PipelineRequestSchema = z.object({
|
|
|
114
114
|
*/
|
|
115
115
|
graderContext: z.enum(["rubric-only", "with-docs"]).optional(),
|
|
116
116
|
graderReplications: z.number().int().positive().optional(),
|
|
117
|
+
borderlineReplications: z.number().int().positive().optional(),
|
|
117
118
|
headers: z.record(z.string(), z.string()).optional(),
|
|
118
119
|
inlineTasks: z.array(z.record(z.string(), z.unknown())).optional(),
|
|
119
120
|
jobId: z.string().optional(),
|
|
@@ -20,6 +20,7 @@ import { z } from "zod";
|
|
|
20
20
|
export declare const RubricTemplateSchema: z.ZodObject<{
|
|
21
21
|
criteria_label: z.ZodOptional<z.ZodNullable<z.ZodString>>;
|
|
22
22
|
dimension: z.ZodOptional<z.ZodString>;
|
|
23
|
+
failureModes: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
23
24
|
header: z.ZodString;
|
|
24
25
|
scale: z.ZodArray<z.ZodString>;
|
|
25
26
|
}, z.core.$strip>;
|
|
@@ -52,6 +53,7 @@ export declare const RubricConfigSchema: z.ZodObject<{
|
|
|
52
53
|
templates: z.ZodRecord<z.ZodString, z.ZodObject<{
|
|
53
54
|
criteria_label: z.ZodOptional<z.ZodNullable<z.ZodString>>;
|
|
54
55
|
dimension: z.ZodOptional<z.ZodString>;
|
|
56
|
+
failureModes: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
55
57
|
header: z.ZodString;
|
|
56
58
|
scale: z.ZodArray<z.ZodString>;
|
|
57
59
|
}, z.core.$strip>>;
|
|
@@ -112,7 +114,10 @@ export type FeatureRegistry = z.infer<typeof FeatureRegistrySchema>;
|
|
|
112
114
|
* and provides task-specific criteria.
|
|
113
115
|
*/
|
|
114
116
|
declare const TemplatedLlmRubricAssertSchema: z.ZodObject<{
|
|
115
|
-
criteria: z.ZodArray<z.
|
|
117
|
+
criteria: z.ZodArray<z.ZodObject<{
|
|
118
|
+
id: z.ZodString;
|
|
119
|
+
text: z.ZodString;
|
|
120
|
+
}, z.core.$strip>>;
|
|
116
121
|
template: z.ZodString;
|
|
117
122
|
type: z.ZodLiteral<"llm-rubric">;
|
|
118
123
|
weight: z.ZodOptional<z.ZodNumber>;
|
|
@@ -129,7 +134,10 @@ export type TemplatedLlmRubricAssert = z.infer<typeof TemplatedLlmRubricAssertSc
|
|
|
129
134
|
* is gone, but union is more flexible for future additions).
|
|
130
135
|
*/
|
|
131
136
|
export declare const AssertionSchema: z.ZodUnion<readonly [z.ZodObject<{
|
|
132
|
-
criteria: z.ZodArray<z.
|
|
137
|
+
criteria: z.ZodArray<z.ZodObject<{
|
|
138
|
+
id: z.ZodString;
|
|
139
|
+
text: z.ZodString;
|
|
140
|
+
}, z.core.$strip>>;
|
|
133
141
|
template: z.ZodString;
|
|
134
142
|
type: z.ZodLiteral<"llm-rubric">;
|
|
135
143
|
weight: z.ZodOptional<z.ZodNumber>;
|
|
@@ -174,7 +182,10 @@ export type CanonicalDoc = z.infer<typeof CanonicalDocSchema>;
|
|
|
174
182
|
*/
|
|
175
183
|
export declare const SingleTaskSchema: z.ZodObject<{
|
|
176
184
|
assert: z.ZodArray<z.ZodUnion<readonly [z.ZodObject<{
|
|
177
|
-
criteria: z.ZodArray<z.
|
|
185
|
+
criteria: z.ZodArray<z.ZodObject<{
|
|
186
|
+
id: z.ZodString;
|
|
187
|
+
text: z.ZodString;
|
|
188
|
+
}, z.core.$strip>>;
|
|
178
189
|
template: z.ZodString;
|
|
179
190
|
type: z.ZodLiteral<"llm-rubric">;
|
|
180
191
|
weight: z.ZodOptional<z.ZodNumber>;
|
|
@@ -233,7 +244,10 @@ export type SingleTask = z.infer<typeof SingleTaskSchema>;
|
|
|
233
244
|
*/
|
|
234
245
|
export declare const LegacyTaskSchema: z.ZodObject<{
|
|
235
246
|
assert: z.ZodOptional<z.ZodArray<z.ZodUnion<readonly [z.ZodObject<{
|
|
236
|
-
criteria: z.ZodArray<z.
|
|
247
|
+
criteria: z.ZodArray<z.ZodObject<{
|
|
248
|
+
id: z.ZodString;
|
|
249
|
+
text: z.ZodString;
|
|
250
|
+
}, z.core.$strip>>;
|
|
237
251
|
template: z.ZodString;
|
|
238
252
|
type: z.ZodLiteral<"llm-rubric">;
|
|
239
253
|
weight: z.ZodOptional<z.ZodNumber>;
|
|
@@ -269,7 +283,10 @@ export type LegacyTask = z.infer<typeof LegacyTaskSchema>;
|
|
|
269
283
|
*/
|
|
270
284
|
export declare const TaskEntrySchema: z.ZodUnion<readonly [z.ZodObject<{
|
|
271
285
|
assert: z.ZodArray<z.ZodUnion<readonly [z.ZodObject<{
|
|
272
|
-
criteria: z.ZodArray<z.
|
|
286
|
+
criteria: z.ZodArray<z.ZodObject<{
|
|
287
|
+
id: z.ZodString;
|
|
288
|
+
text: z.ZodString;
|
|
289
|
+
}, z.core.$strip>>;
|
|
273
290
|
template: z.ZodString;
|
|
274
291
|
type: z.ZodLiteral<"llm-rubric">;
|
|
275
292
|
weight: z.ZodOptional<z.ZodNumber>;
|
|
@@ -321,7 +338,10 @@ export declare const TaskEntrySchema: z.ZodUnion<readonly [z.ZodObject<{
|
|
|
321
338
|
}, z.core.$loose>;
|
|
322
339
|
}, z.core.$strip>, z.ZodObject<{
|
|
323
340
|
assert: z.ZodOptional<z.ZodArray<z.ZodUnion<readonly [z.ZodObject<{
|
|
324
|
-
criteria: z.ZodArray<z.
|
|
341
|
+
criteria: z.ZodArray<z.ZodObject<{
|
|
342
|
+
id: z.ZodString;
|
|
343
|
+
text: z.ZodString;
|
|
344
|
+
}, z.core.$strip>>;
|
|
325
345
|
template: z.ZodString;
|
|
326
346
|
type: z.ZodLiteral<"llm-rubric">;
|
|
327
347
|
weight: z.ZodOptional<z.ZodNumber>;
|
|
@@ -355,7 +375,10 @@ export type TaskEntryParsed = z.infer<typeof TaskEntrySchema>;
|
|
|
355
375
|
*/
|
|
356
376
|
export declare const TaskFileSchema: z.ZodArray<z.ZodUnion<readonly [z.ZodObject<{
|
|
357
377
|
assert: z.ZodArray<z.ZodUnion<readonly [z.ZodObject<{
|
|
358
|
-
criteria: z.ZodArray<z.
|
|
378
|
+
criteria: z.ZodArray<z.ZodObject<{
|
|
379
|
+
id: z.ZodString;
|
|
380
|
+
text: z.ZodString;
|
|
381
|
+
}, z.core.$strip>>;
|
|
359
382
|
template: z.ZodString;
|
|
360
383
|
type: z.ZodLiteral<"llm-rubric">;
|
|
361
384
|
weight: z.ZodOptional<z.ZodNumber>;
|
|
@@ -407,7 +430,10 @@ export declare const TaskFileSchema: z.ZodArray<z.ZodUnion<readonly [z.ZodObject
|
|
|
407
430
|
}, z.core.$loose>;
|
|
408
431
|
}, z.core.$strip>, z.ZodObject<{
|
|
409
432
|
assert: z.ZodOptional<z.ZodArray<z.ZodUnion<readonly [z.ZodObject<{
|
|
410
|
-
criteria: z.ZodArray<z.
|
|
433
|
+
criteria: z.ZodArray<z.ZodObject<{
|
|
434
|
+
id: z.ZodString;
|
|
435
|
+
text: z.ZodString;
|
|
436
|
+
}, z.core.$strip>>;
|
|
411
437
|
template: z.ZodString;
|
|
412
438
|
type: z.ZodLiteral<"llm-rubric">;
|
|
413
439
|
weight: z.ZodOptional<z.ZodNumber>;
|
|
@@ -26,6 +26,14 @@ export const RubricTemplateSchema = z.object({
|
|
|
26
26
|
.min(1, "criteria_label must be a non-empty string")
|
|
27
27
|
.nullish(),
|
|
28
28
|
dimension: z.string().min(1).optional(),
|
|
29
|
+
/**
|
|
30
|
+
* Plan 03-02 — per-dimension legal failure-mode list. When present and
|
|
31
|
+
* non-empty, the runtime rubric assembler announces the legal modes to
|
|
32
|
+
* the grader before the structured-shape footer (Plan 03-01).
|
|
33
|
+
* Stamped at config-load time by `failureModesForDimension(dimension)`
|
|
34
|
+
* from `packages/eval/src/grader/index.ts`.
|
|
35
|
+
*/
|
|
36
|
+
failureModes: z.array(z.string().min(1)).optional(),
|
|
29
37
|
header: z.string().min(1, "header must be a non-empty string"),
|
|
30
38
|
scale: z
|
|
31
39
|
.array(z.string().min(1))
|
|
@@ -118,13 +126,27 @@ export const FeatureRegistrySchema = z.object({
|
|
|
118
126
|
// ---------------------------------------------------------------------------
|
|
119
127
|
// Assertion schemas — one per Promptfoo assertion type
|
|
120
128
|
// ---------------------------------------------------------------------------
|
|
129
|
+
// TODO(GRAD-01 follow-up): This schema duplicates
|
|
130
|
+
// packages/eval/src/adapters/task-sources/repo-schemas.ts:TemplatedAssertionSchema.
|
|
131
|
+
// Retiring this duplicate requires reverse-extracting the canonical schema into
|
|
132
|
+
// packages/core/src/schemas/ (D0048 prevents importing from packages/eval).
|
|
133
|
+
// Out of Phase 2 scope; tracked separately.
|
|
134
|
+
//
|
|
135
|
+
// The `satisfies z.ZodType<CriterionRef>` clause asserts this duplicate
|
|
136
|
+
// stays shape-compatible with the canonical domain type in
|
|
137
|
+
// `@sanity/ailf-core` (D0045). If a future edit adds a third field to one
|
|
138
|
+
// schema and not the other, this build error catches the drift.
|
|
139
|
+
// A criterion reference is a pair of non-empty strings: an `id` slug and
// the criterion `text`. Each field schema is named so its validation
// message sits next to the constraint it belongs to.
const criterionIdField = z.string().min(1, "id must be a non-empty slug");
const criterionTextField = z
  .string()
  .min(1, "text must be a non-empty string");
const CriterionRefShape = z.object({
  id: criterionIdField,
  text: criterionTextField,
});
|
|
121
143
|
/**
|
|
122
144
|
* Templated llm-rubric assertion — references a rubric template by key
|
|
123
145
|
* and provides task-specific criteria.
|
|
124
146
|
*/
|
|
125
147
|
const TemplatedLlmRubricAssertSchema = z.object({
|
|
126
148
|
criteria: z
|
|
127
|
-
.array(
|
|
149
|
+
.array(CriterionRefShape)
|
|
128
150
|
.min(1, "criteria must have at least one entry"),
|
|
129
151
|
template: z.string().min(1, "template must be a non-empty string"),
|
|
130
152
|
type: z.literal("llm-rubric"),
|