@sanity/ailf 4.6.0 → 5.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/canonical/grader-references/agent-harness-tools.yaml +42 -0
- package/canonical/grader-references/knowledge-probe-recall.yaml +36 -0
- package/canonical/grader-references/mcp-server-spec.yaml +51 -0
- package/canonical/grader-references/portable-text.yaml +48 -0
- package/config/rubrics.ts +38 -2
- package/dist/_vendor/ailf-core/artifact-registry.d.ts +60 -2
- package/dist/_vendor/ailf-core/artifact-registry.js +288 -7
- package/dist/_vendor/ailf-core/examples/index.d.ts +125 -26
- package/dist/_vendor/ailf-core/examples/index.js +146 -47
- package/dist/_vendor/ailf-core/ports/context.d.ts +8 -0
- package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +15 -0
- package/dist/_vendor/ailf-core/schemas/branded-string.d.ts +40 -0
- package/dist/_vendor/ailf-core/schemas/branded-string.js +45 -0
- package/dist/_vendor/ailf-core/schemas/confidence-schema.d.ts +36 -0
- package/dist/_vendor/ailf-core/schemas/confidence-schema.js +32 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -4
- package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/schemas/index.js +9 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +1 -0
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +34 -8
- package/dist/_vendor/ailf-core/schemas/pipeline.js +23 -1
- package/dist/_vendor/ailf-core/services/diagnosis/registry.d.ts +40 -0
- package/dist/_vendor/ailf-core/services/diagnosis/registry.js +25 -0
- package/dist/_vendor/ailf-core/services/diagnosis-runner.d.ts +19 -0
- package/dist/_vendor/ailf-core/services/diagnosis-runner.js +19 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/services/index.js +5 -0
- package/dist/_vendor/ailf-core/services/report-to-markdown.js +3 -2
- package/dist/_vendor/ailf-core/types/attribution.d.ts +82 -0
- package/dist/_vendor/ailf-core/types/attribution.js +18 -0
- package/dist/_vendor/ailf-core/types/branded-ids.d.ts +26 -1
- package/dist/_vendor/ailf-core/types/branded-ids.js +80 -4
- package/dist/_vendor/ailf-core/types/confidence.d.ts +1 -1
- package/dist/_vendor/ailf-core/types/confidence.js +7 -0
- package/dist/_vendor/ailf-core/types/diagnosis.d.ts +169 -0
- package/dist/_vendor/ailf-core/types/diagnosis.js +17 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +16 -1
- package/dist/_vendor/ailf-core/types/grader-judgment.d.ts +125 -0
- package/dist/_vendor/ailf-core/types/grader-judgment.js +30 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +80 -29
- package/dist/_vendor/ailf-core/types/index.js +15 -1
- package/dist/_vendor/ailf-core/types/legacy-grader-judgment.d.ts +55 -0
- package/dist/_vendor/ailf-core/types/legacy-grader-judgment.js +30 -0
- package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +1 -0
- package/dist/_vendor/ailf-core/types/repo-config.d.ts +8 -0
- package/dist/_vendor/ailf-shared/document-ref.d.ts +1 -1
- package/dist/adapters/api-client/build-request.d.ts +1 -0
- package/dist/adapters/api-client/build-request.js +3 -0
- package/dist/adapters/attribution/attribution-meta-writer.d.ts +35 -0
- package/dist/adapters/attribution/attribution-meta-writer.js +34 -0
- package/dist/adapters/attribution/index.d.ts +9 -0
- package/dist/adapters/attribution/index.js +8 -0
- package/dist/adapters/attribution/per-entry-attribution-writer.d.ts +56 -0
- package/dist/adapters/attribution/per-entry-attribution-writer.js +49 -0
- package/dist/adapters/config-sources/file-config-adapter.js +1 -0
- package/dist/adapters/grader-outputs/index.d.ts +10 -0
- package/dist/adapters/grader-outputs/index.js +8 -0
- package/dist/adapters/grader-outputs/legacy/index.d.ts +11 -0
- package/dist/adapters/grader-outputs/legacy/index.js +10 -0
- package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.d.ts +49 -0
- package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.js +48 -0
- package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +102 -0
- package/dist/adapters/grader-outputs/promptfoo-grader-output.js +93 -0
- package/dist/adapters/index.d.ts +3 -0
- package/dist/adapters/index.js +4 -0
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +5 -1
- package/dist/adapters/task-sources/content-lake-task-source.js +28 -2
- package/dist/adapters/task-sources/repo-schemas.d.ts +79 -11
- package/dist/adapters/task-sources/repo-schemas.js +19 -2
- package/dist/commands/calculate-scores.js +1 -1
- package/dist/commands/explain-handler.js +1 -1
- package/dist/commands/lookup-doc.d.ts +1 -1
- package/dist/commands/lookup-doc.js +3 -3
- package/dist/commands/pipeline-action.d.ts +6 -0
- package/dist/commands/pipeline-action.js +2 -0
- package/dist/commands/remote-pipeline.js +1 -0
- package/dist/composition-root.d.ts +36 -0
- package/dist/composition-root.js +48 -0
- package/dist/config/rubrics.ts +38 -2
- package/dist/grader/agent-harness.d.ts +14 -0
- package/dist/grader/agent-harness.js +17 -0
- package/dist/grader/common.d.ts +17 -0
- package/dist/grader/common.js +21 -0
- package/dist/grader/index.d.ts +38 -0
- package/dist/grader/index.js +75 -0
- package/dist/grader/knowledge-probe.d.ts +14 -0
- package/dist/grader/knowledge-probe.js +18 -0
- package/dist/grader/literacy.d.ts +13 -0
- package/dist/grader/literacy.js +17 -0
- package/dist/grader/mcp.d.ts +14 -0
- package/dist/grader/mcp.js +18 -0
- package/dist/orchestration/build-app-context.js +1 -0
- package/dist/orchestration/build-step-sequence.js +5 -0
- package/dist/orchestration/steps/calculate-scores-step.js +23 -1
- package/dist/orchestration/steps/compute-attribution-step.d.ts +44 -0
- package/dist/orchestration/steps/compute-attribution-step.js +279 -0
- package/dist/orchestration/steps/gap-analysis-step.js +35 -7
- package/dist/orchestration/steps/index.d.ts +1 -0
- package/dist/orchestration/steps/index.js +1 -0
- package/dist/pipeline/attribution.d.ts +15 -0
- package/dist/pipeline/attribution.js +18 -9
- package/dist/pipeline/borderline-consensus-runner.d.ts +63 -0
- package/dist/pipeline/borderline-consensus-runner.js +124 -0
- package/dist/pipeline/borderline-detector.d.ts +24 -0
- package/dist/pipeline/borderline-detector.js +26 -0
- package/dist/pipeline/calculate-scores.d.ts +114 -3
- package/dist/pipeline/calculate-scores.js +426 -24
- package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
- package/dist/pipeline/compiler/literacy-bridge.js +35 -17
- package/dist/pipeline/compiler/rubric-resolution.d.ts +15 -0
- package/dist/pipeline/compiler/rubric-resolution.js +9 -1
- package/dist/pipeline/compute-attribution.d.ts +80 -0
- package/dist/pipeline/compute-attribution.js +196 -0
- package/dist/pipeline/failure-modes.d.ts +52 -17
- package/dist/pipeline/failure-modes.js +178 -117
- package/dist/pipeline/map-request-to-config.js +1 -0
- package/package.json +6 -4
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# agent-harness-tools.yaml — Human-graded reference sample for agent-harness tasks
|
|
2
|
+
#
|
|
3
|
+
# Phase 3 GRAD-03 calibration fixture (Plan 03-02). Top-level
|
|
4
|
+
# `expectedFailureMode` + `dimension` are read by the calibration test in
|
|
5
|
+
# packages/eval/src/grader/__tests__/calibration.test.ts.
|
|
6
|
+
|
|
7
|
+
- taskId: agent-multi-step-schema-edit
|
|
8
|
+
area: agent-harness
|
|
9
|
+
dimension: agent-harness
|
|
10
|
+
expectedFailureMode: tool-misuse
|
|
11
|
+
response: |
|
|
12
|
+
To add a `coverImage` field to the `post` schema and migrate existing
|
|
13
|
+
documents, the agent ran the following tool calls:
|
|
14
|
+
|
|
15
|
+
1. `read_file({ path: "schemas/post.ts" })` — read the schema.
|
|
16
|
+
2. `write_file({ path: "schemas/post.ts", content: "<new schema>" })` — wrote the updated schema.
|
|
17
|
+
3. `run_command({ command: "sanity migrate add-cover-image" })` — attempted to run a migration.
|
|
18
|
+
|
|
19
|
+
The `sanity migrate` command failed because no migration script exists at
|
|
20
|
+
that name. The agent then stopped without creating the migration file.
|
|
21
|
+
rubrics:
|
|
22
|
+
- dimension: tool-usage
|
|
23
|
+
rubricText: |
|
|
24
|
+
Score the agent's tool usage from 0 to 100:
|
|
25
|
+
- 0: Completely wrong tool usage — called wrong tools or with invalid inputs
|
|
26
|
+
- 25: Poor tool usage — correct tools but wrong parameters or sequencing
|
|
27
|
+
- 50: Adequate — correct tools and basic parameters, some inefficiency
|
|
28
|
+
- 75: Good — efficient tool usage with proper error handling
|
|
29
|
+
- 100: Excellent — optimal tool selection, correct inputs, minimal redundancy
|
|
30
|
+
|
|
31
|
+
Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
|
|
32
|
+
humanScore: 25
|
|
33
|
+
notes: |
|
|
34
|
+
The agent invoked `run_command` with a migration name that did not
|
|
35
|
+
exist in the project, instead of first listing available migrations
|
|
36
|
+
with `list_migrations` (or equivalent) or scaffolding a new migration
|
|
37
|
+
file with the correct tool. The migration step was a tool-misuse:
|
|
38
|
+
the right tool exists, but the agent supplied invalid arguments
|
|
39
|
+
(a non-existent migration name) and then failed to recover by
|
|
40
|
+
scaffolding a new migration. Failure mode: tool-misuse.
|
|
41
|
+
gradedBy: "senior-engineer"
|
|
42
|
+
gradedAt: "2026-05-10"
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# knowledge-probe-recall.yaml — Human-graded reference sample for KP tasks
|
|
2
|
+
#
|
|
3
|
+
# Phase 3 GRAD-03 calibration fixture (Plan 03-02). Top-level
|
|
4
|
+
# `expectedFailureMode` + `dimension` are read by the calibration test in
|
|
5
|
+
# packages/eval/src/grader/__tests__/calibration.test.ts.
|
|
6
|
+
|
|
7
|
+
- taskId: kp-default-dataset
|
|
8
|
+
area: knowledge-probe
|
|
9
|
+
dimension: knowledge-probe
|
|
10
|
+
expectedFailureMode: factual-error
|
|
11
|
+
response: |
|
|
12
|
+
The default dataset for new Sanity Studio projects is `production`. When
|
|
13
|
+
you run `npm create sanity@latest` and accept the defaults, the CLI
|
|
14
|
+
creates a dataset named `production`.
|
|
15
|
+
rubrics:
|
|
16
|
+
- dimension: factual-correctness
|
|
17
|
+
rubricText: |
|
|
18
|
+
Score factual correctness from 0 to 100:
|
|
19
|
+
- 0: Completely wrong — fabricated facts, hallucinated APIs, incorrect syntax
|
|
20
|
+
- 25: Mostly wrong — some real concepts but significant factual errors
|
|
21
|
+
- 50: Mixed — some correct facts alongside notable inaccuracies
|
|
22
|
+
- 75: Mostly correct — minor inaccuracies or imprecisions
|
|
23
|
+
- 100: Fully correct — all statements are accurate and verifiable
|
|
24
|
+
|
|
25
|
+
Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
|
|
26
|
+
humanScore: 30
|
|
27
|
+
notes: |
|
|
28
|
+
Per current Sanity CLI docs the default dataset chosen by
|
|
29
|
+
`npm create sanity@latest` is selectable, with `production` being
|
|
30
|
+
the conventional default name surfaced in the prompt — but the
|
|
31
|
+
candidate phrases this as a deterministic default rather than a
|
|
32
|
+
prompt-driven choice. The factual claim "the default is production"
|
|
33
|
+
is wrong as stated; the CLI prompts for a dataset name and offers
|
|
34
|
+
`production` as a suggested value. Failure mode: factual-error.
|
|
35
|
+
gradedBy: "senior-engineer"
|
|
36
|
+
gradedAt: "2026-05-10"
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
# mcp-server-spec.yaml — Human-graded reference sample for MCP server tasks
|
|
2
|
+
#
|
|
3
|
+
# Phase 3 GRAD-03 calibration fixture (Plan 03-02). Top-level
|
|
4
|
+
# `expectedFailureMode` + `dimension` are read by the calibration test in
|
|
5
|
+
# packages/eval/src/grader/__tests__/calibration.test.ts.
|
|
6
|
+
|
|
7
|
+
- taskId: mcp-search-tool-pagination
|
|
8
|
+
area: mcp-server
|
|
9
|
+
dimension: mcp-behavior
|
|
10
|
+
expectedFailureMode: spec-mismatch
|
|
11
|
+
response: |
|
|
12
|
+
Here's an MCP `search` tool that returns paginated results:
|
|
13
|
+
|
|
14
|
+
```typescript
|
|
15
|
+
import { Server } from "@modelcontextprotocol/sdk/server/index.js"
|
|
16
|
+
|
|
17
|
+
server.setRequestHandler({ method: "tools/call" }, async (request) => {
|
|
18
|
+
if (request.params.name === "search") {
|
|
19
|
+
const { query, page = 1, pageSize = 10 } = request.params.arguments
|
|
20
|
+
const results = await searchIndex(query, page, pageSize)
|
|
21
|
+
return {
|
|
22
|
+
content: [
|
|
23
|
+
{ type: "text", text: JSON.stringify(results) },
|
|
24
|
+
],
|
|
25
|
+
nextPageToken: page + 1,
|
|
26
|
+
}
|
|
27
|
+
}
|
|
28
|
+
})
|
|
29
|
+
```
|
|
30
|
+
rubrics:
|
|
31
|
+
- dimension: output-correctness
|
|
32
|
+
rubricText: |
|
|
33
|
+
Score MCP tool output handling from 0 to 100:
|
|
34
|
+
- 0: Completely failed to use tool outputs
|
|
35
|
+
- 25: Used outputs but misinterpreted the data
|
|
36
|
+
- 50: Partially correct interpretation of tool outputs
|
|
37
|
+
- 75: Mostly correct — minor misuse of returned data
|
|
38
|
+
- 100: Perfect output handling — correctly interpreted all tool responses
|
|
39
|
+
|
|
40
|
+
Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
|
|
41
|
+
humanScore: 30
|
|
42
|
+
notes: |
|
|
43
|
+
The candidate adds a `nextPageToken` field at the top level of the
|
|
44
|
+
tool-call response. The published MCP `tools/call` response spec only
|
|
45
|
+
defines `content`, `isError`, and `_meta` at the top level; arbitrary
|
|
46
|
+
top-level fields like `nextPageToken` are not part of the spec and
|
|
47
|
+
are ignored by spec-compliant clients. Pagination metadata belongs
|
|
48
|
+
inside the `content` block (e.g., as a structured JSON payload) or
|
|
49
|
+
in `_meta`. Failure mode: spec-mismatch.
|
|
50
|
+
gradedBy: "senior-engineer"
|
|
51
|
+
gradedAt: "2026-05-10"
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# portable-text.yaml — Human-graded reference sample for Portable Text tasks
|
|
2
|
+
#
|
|
3
|
+
# Phase 3 GRAD-03 calibration fixture (Plan 03-02). Top-level
|
|
4
|
+
# `expectedFailureMode` + `dimension` are read by the calibration test in
|
|
5
|
+
# packages/eval/src/grader/__tests__/calibration.test.ts.
|
|
6
|
+
|
|
7
|
+
- taskId: portable-text-react-marks
|
|
8
|
+
area: portable-text
|
|
9
|
+
dimension: doc-coverage
|
|
10
|
+
expectedFailureMode: outdated-docs
|
|
11
|
+
response: |
|
|
12
|
+
To customize the marks renderer in @portabletext/react, you can pass a `serializers` prop:
|
|
13
|
+
|
|
14
|
+
```tsx
|
|
15
|
+
import { PortableText } from "@portabletext/react"
|
|
16
|
+
|
|
17
|
+
const serializers = {
|
|
18
|
+
marks: {
|
|
19
|
+
link: ({ children, mark }) => (
|
|
20
|
+
<a href={mark.href} target="_blank" rel="noopener">{children}</a>
|
|
21
|
+
),
|
|
22
|
+
},
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
export function Body({ value }: { value: any }) {
|
|
26
|
+
return <PortableText blocks={value} serializers={serializers} />
|
|
27
|
+
}
|
|
28
|
+
```
|
|
29
|
+
rubrics:
|
|
30
|
+
- dimension: doc-coverage
|
|
31
|
+
rubricText: |
|
|
32
|
+
Score documentation coverage from 0 to 100:
|
|
33
|
+
- 0: Had to hallucinate/guess most implementation details
|
|
34
|
+
- 30: Significant gaps — filled with assumptions
|
|
35
|
+
- 50: Some gaps — inferred from partial information
|
|
36
|
+
- 80: Minor gaps — almost everything was documented
|
|
37
|
+
- 100: Complete coverage — all necessary info was in docs
|
|
38
|
+
|
|
39
|
+
Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
|
|
40
|
+
humanScore: 25
|
|
41
|
+
notes: |
|
|
42
|
+
The candidate uses the v2 `serializers` prop and `blocks` prop, both of
|
|
43
|
+
which were renamed in @portabletext/react v3+ to `components` and
|
|
44
|
+
`value`. The doc the candidate is following is outdated — current
|
|
45
|
+
docs document the v3 component-based API. Failure mode:
|
|
46
|
+
outdated-docs (the doc reflects an older API).
|
|
47
|
+
gradedBy: "senior-engineer"
|
|
48
|
+
gradedAt: "2026-05-10"
|
package/config/rubrics.ts
CHANGED
|
@@ -11,6 +11,15 @@
|
|
|
11
11
|
|
|
12
12
|
import { defineRubrics } from "@sanity/ailf-core"
|
|
13
13
|
|
|
14
|
+
// Plan 03-02 — per-dimension failure-mode taxonomies stamped onto each
|
|
15
|
+
// template entry below. Source of truth lives in packages/eval/src/grader/;
|
|
16
|
+
// the helper picks the right list by dimension family.
|
|
17
|
+
import { failureModesForDimension } from "../src/grader/index.js"
|
|
18
|
+
// Single source of truth for the wire-format version stamped into the
|
|
19
|
+
// grader-prompt footer (VER-01 D-02). Interpolated below so the
|
|
20
|
+
// announced version cannot drift from the schema's expected value.
|
|
21
|
+
import { graderJudgmentsVersion } from "../src/adapters/grader-outputs/index.js"
|
|
22
|
+
|
|
14
23
|
export default defineRubrics({
|
|
15
24
|
templates: {
|
|
16
25
|
// ── Core literacy dimensions ────────────────────────────
|
|
@@ -25,6 +34,7 @@ export default defineRubrics({
|
|
|
25
34
|
"100: Fully functional code — works as expected",
|
|
26
35
|
],
|
|
27
36
|
criteria_label: "Must demonstrate:",
|
|
37
|
+
failureModes: failureModesForDimension("task-completion"),
|
|
28
38
|
},
|
|
29
39
|
"code-correctness": {
|
|
30
40
|
dimension: "code-correctness",
|
|
@@ -37,6 +47,7 @@ export default defineRubrics({
|
|
|
37
47
|
"100: Follows all best practices, idiomatic implementation",
|
|
38
48
|
],
|
|
39
49
|
criteria_label: "Check for:",
|
|
50
|
+
failureModes: failureModesForDimension("code-correctness"),
|
|
40
51
|
},
|
|
41
52
|
"doc-coverage": {
|
|
42
53
|
dimension: "doc-coverage",
|
|
@@ -48,6 +59,7 @@ export default defineRubrics({
|
|
|
48
59
|
"80: Minor gaps — almost everything was documented",
|
|
49
60
|
"100: Complete coverage — all necessary info was in docs",
|
|
50
61
|
],
|
|
62
|
+
failureModes: failureModesForDimension("doc-coverage"),
|
|
51
63
|
},
|
|
52
64
|
|
|
53
65
|
// ── MCP server dimensions ───────────────────────────────
|
|
@@ -62,6 +74,7 @@ export default defineRubrics({
|
|
|
62
74
|
"100: Perfect tool inputs — all parameters correct and well-formed",
|
|
63
75
|
],
|
|
64
76
|
criteria_label: "Evaluate:",
|
|
77
|
+
failureModes: failureModesForDimension("input-validation"),
|
|
65
78
|
},
|
|
66
79
|
"mcp-output-correctness": {
|
|
67
80
|
dimension: "output-correctness",
|
|
@@ -74,6 +87,7 @@ export default defineRubrics({
|
|
|
74
87
|
"100: Perfect output handling — correctly interpreted all tool responses",
|
|
75
88
|
],
|
|
76
89
|
criteria_label: "Check for:",
|
|
90
|
+
failureModes: failureModesForDimension("output-correctness"),
|
|
77
91
|
},
|
|
78
92
|
"mcp-error-handling": {
|
|
79
93
|
dimension: "error-handling",
|
|
@@ -86,6 +100,7 @@ export default defineRubrics({
|
|
|
86
100
|
"100: Excellent — handled all errors appropriately with clear messaging",
|
|
87
101
|
],
|
|
88
102
|
criteria_label: "Evaluate:",
|
|
103
|
+
failureModes: failureModesForDimension("error-handling"),
|
|
89
104
|
},
|
|
90
105
|
"mcp-security": {
|
|
91
106
|
dimension: "security",
|
|
@@ -98,6 +113,7 @@ export default defineRubrics({
|
|
|
98
113
|
"100: Perfect security — only used authorized tools with safe inputs",
|
|
99
114
|
],
|
|
100
115
|
criteria_label: "Check for:",
|
|
116
|
+
failureModes: failureModesForDimension("security"),
|
|
101
117
|
},
|
|
102
118
|
|
|
103
119
|
// ── Knowledge probe dimensions ──────────────────────────
|
|
@@ -112,6 +128,7 @@ export default defineRubrics({
|
|
|
112
128
|
"100: Fully correct — all statements are accurate and verifiable",
|
|
113
129
|
],
|
|
114
130
|
criteria_label: "Verify:",
|
|
131
|
+
failureModes: failureModesForDimension("factual-correctness"),
|
|
115
132
|
},
|
|
116
133
|
completeness: {
|
|
117
134
|
dimension: "completeness",
|
|
@@ -124,6 +141,7 @@ export default defineRubrics({
|
|
|
124
141
|
"100: Comprehensive — thorough coverage of all important aspects",
|
|
125
142
|
],
|
|
126
143
|
criteria_label: "Check coverage of:",
|
|
144
|
+
failureModes: failureModesForDimension("completeness"),
|
|
127
145
|
},
|
|
128
146
|
currency: {
|
|
129
147
|
dimension: "currency",
|
|
@@ -136,6 +154,7 @@ export default defineRubrics({
|
|
|
136
154
|
"100: Fully current — references latest APIs, patterns, and best practices",
|
|
137
155
|
],
|
|
138
156
|
criteria_label: "Check for:",
|
|
157
|
+
failureModes: failureModesForDimension("currency"),
|
|
139
158
|
},
|
|
140
159
|
|
|
141
160
|
// ── Agent harness dimensions ────────────────────────────
|
|
@@ -151,6 +170,7 @@ export default defineRubrics({
|
|
|
151
170
|
"100: Excellent process — optimal tool usage, clear planning, graceful recovery",
|
|
152
171
|
],
|
|
153
172
|
criteria_label: "Evaluate:",
|
|
173
|
+
failureModes: failureModesForDimension("process-quality"),
|
|
154
174
|
},
|
|
155
175
|
"agent-output": {
|
|
156
176
|
dimension: "agent-output",
|
|
@@ -163,6 +183,7 @@ export default defineRubrics({
|
|
|
163
183
|
"100: Excellent output — fully correct, clean, and complete",
|
|
164
184
|
],
|
|
165
185
|
criteria_label: "Check for:",
|
|
186
|
+
failureModes: failureModesForDimension("agent-output"),
|
|
166
187
|
},
|
|
167
188
|
"agent-tool-usage": {
|
|
168
189
|
dimension: "tool-usage",
|
|
@@ -175,6 +196,7 @@ export default defineRubrics({
|
|
|
175
196
|
"100: Excellent — optimal tool selection, correct inputs, minimal redundancy",
|
|
176
197
|
],
|
|
177
198
|
criteria_label: "Evaluate:",
|
|
199
|
+
failureModes: failureModesForDimension("tool-usage"),
|
|
178
200
|
},
|
|
179
201
|
},
|
|
180
202
|
|
|
@@ -220,6 +242,20 @@ export default defineRubrics({
|
|
|
220
242
|
"agent-harness": { gold: "agent-harness" },
|
|
221
243
|
},
|
|
222
244
|
|
|
223
|
-
|
|
224
|
-
|
|
245
|
+
// Phase 3 GRAD-05 (Plan 03-01) — structured GraderJudgment JSON sketch.
|
|
246
|
+
// Documents the target wire format the grader emits. The strict schema's
|
|
247
|
+
// GRAD-02 additive fields stay optional in this plan; Plan 03-04 flips
|
|
248
|
+
// them to required and bumps graderJudgmentsVersion to 1.0.0.
|
|
249
|
+
footer: `Return ONLY a JSON object with this exact shape:
|
|
250
|
+
{
|
|
251
|
+
"judgmentId": "<string>",
|
|
252
|
+
"score": <number 0-100>,
|
|
253
|
+
"reason": "<explanation, ≤500 chars>",
|
|
254
|
+
"subJudgments": [{ "criterionId": "<id>", "met": <bool>, "evidence": "<≤280 chars>", "confidence": { "level": "high|medium|low", "signalsPresent": <int>, "derivation": "<string>" } }],
|
|
255
|
+
"docCitations": [{ "documentId": "<id>", "slug": "<optional slug>", "role": "supports|contradicts|missing|irrelevant", "hallucinated": <bool> }],
|
|
256
|
+
"failureMode": "<one of the declared modes for this dimension or 'unclassified'>",
|
|
257
|
+
"confidence": { "level": "high|medium|low", "signalsPresent": <int>, "derivation": "<string>" },
|
|
258
|
+
"hallucinationCheckedAgainst": ["<doc id>"],
|
|
259
|
+
"metadata": { "graderModel": "<string>", "graderJudgmentsVersion": "${graderJudgmentsVersion}" }
|
|
260
|
+
}`,
|
|
225
261
|
})
|
|
@@ -79,7 +79,7 @@ export type WriteSource = WritePolicy;
|
|
|
79
79
|
*/
|
|
80
80
|
export type ArtifactTruncationPolicy = "reject" | "trailing-truncate" | "fielded-truncate" | "trial-oversize";
|
|
81
81
|
/** The union of every artifact type known to AILF. */
|
|
82
|
-
export type ArtifactType = "runManifest" | "scoreSummary" | "pipelineResult" | "pipelineContext" | "documentManifest" | "prComment" | "readinessReport" | "reportSnapshot" | "autoComparison" | "gapReport" | "sinkResults" | "callbackRequest" | "callbackResponse" | "configSnapshot" | "evalConfigGenerated" | "comparisonReport" | "discoveryReport" | "failureModes" | "renderedPrompts" | "rawResults" | "testOutputs" | "graderPrompts" | "graderJudgments" | "symbolPreflight" | "traces";
|
|
82
|
+
export type ArtifactType = "runManifest" | "scoreSummary" | "pipelineResult" | "pipelineContext" | "documentManifest" | "prComment" | "readinessReport" | "reportSnapshot" | "autoComparison" | "gapReport" | "sinkResults" | "callbackRequest" | "callbackResponse" | "configSnapshot" | "evalConfigGenerated" | "comparisonReport" | "discoveryReport" | "failureModes" | "renderedPrompts" | "rawResults" | "testOutputs" | "graderPrompts" | "graderJudgments" | "symbolPreflight" | "traces" | "diagnosis" | "perEntryAttribution" | "attributionMeta";
|
|
83
83
|
/**
|
|
84
84
|
* Result of parsing a per-entry key into a sanitized filename component.
|
|
85
85
|
* Success carries the sanitized value; failure carries a reason for 4xx responses.
|
|
@@ -114,6 +114,32 @@ export interface ManifestPreviewDeclaration<TPreview = unknown> {
|
|
|
114
114
|
readonly extract: (entry: unknown) => TPreview;
|
|
115
115
|
readonly capBytes: number;
|
|
116
116
|
}
|
|
117
|
+
/**
|
|
118
|
+
* Callback signature for `ArtifactDescriptor.objectPath`. Aliased so
|
|
119
|
+
* external path builders (e.g. `diagnosisPathBuilder`) declare a named
|
|
120
|
+
* return type rather than re-typing the inline arrow signature.
|
|
121
|
+
*/
|
|
122
|
+
export type ArtifactObjectPath = (runId: RunId, entryKey?: string, version?: string) => string;
|
|
123
|
+
/**
|
|
124
|
+
* Opt-in marker for the bulk-versioned-with-report-axis carve-out
|
|
125
|
+
* (Plan 01-03 / ITER2-BLOCKER-01). Any descriptor that needs to combine
|
|
126
|
+
* `layout: "bulk"` with a `report` axis (e.g. the post-hoc diagnosis
|
|
127
|
+
* artifact) must set
|
|
128
|
+
* `pathSafetyMarker: BULK_VERSIONED_WITH_REPORT_AXIS` to opt in to the
|
|
129
|
+
* carve-out in `assertValidArtifactDescriptor`.
|
|
130
|
+
*
|
|
131
|
+
* `Symbol.for(...)` is registry-keyed: it survives minification (the
|
|
132
|
+
* key is a string literal that minifiers cannot rename) and yields
|
|
133
|
+
* cross-module-instance equality by spec. This makes it bundler-stable
|
|
134
|
+
* — a deliberate replacement for any source-text reflection on the
|
|
135
|
+
* descriptor's `objectPath` closure body.
|
|
136
|
+
*
|
|
137
|
+
* The marker alone does NOT trigger the carve-out — the four structural
|
|
138
|
+
* conditions (bulk + versionedBy + run+report axes) must also hold.
|
|
139
|
+
* Conversely, the structural conditions alone do NOT trigger the
|
|
140
|
+
* carve-out without the marker. This is an explicit opt-in.
|
|
141
|
+
*/
|
|
142
|
+
export declare const BULK_VERSIONED_WITH_REPORT_AXIS: unique symbol;
|
|
117
143
|
/**
|
|
118
144
|
* Per-type declaration consumed by writers, signers, and readers.
|
|
119
145
|
*
|
|
@@ -157,6 +183,14 @@ export interface ArtifactDescriptor<TEntry = unknown, TPreview = unknown> {
|
|
|
157
183
|
* When unset, the path is unversioned (legacy behavior).
|
|
158
184
|
*/
|
|
159
185
|
readonly versionedBy?: VersionedBy;
|
|
186
|
+
/**
|
|
187
|
+
* Optional opt-in marker for the bulk-versioned-with-report-axis
|
|
188
|
+
* carve-out in `assertValidArtifactDescriptor` (Plan 01-03 /
|
|
189
|
+
* ITER2-BLOCKER-01). See `BULK_VERSIONED_WITH_REPORT_AXIS` for full
|
|
190
|
+
* semantics. Descriptors without this marker are subject to the
|
|
191
|
+
* standard bounded-axis rule.
|
|
192
|
+
*/
|
|
193
|
+
readonly pathSafetyMarker?: typeof BULK_VERSIONED_WITH_REPORT_AXIS;
|
|
160
194
|
/**
|
|
161
195
|
* Build the GCS object path for this artifact.
|
|
162
196
|
* - bulk: returns `runs/{runId}/{slug}.{ext}`; `entryKey` is ignored.
|
|
@@ -165,7 +199,7 @@ export interface ArtifactDescriptor<TEntry = unknown, TPreview = unknown> {
|
|
|
165
199
|
* `runs/{runId}/{slug}-{version}.{ext}` for bulk-shaped versioned
|
|
166
200
|
* descriptors. `entryKey` is ignored on versioned bulk paths.
|
|
167
201
|
*/
|
|
168
|
-
readonly objectPath:
|
|
202
|
+
readonly objectPath: ArtifactObjectPath;
|
|
169
203
|
/**
|
|
170
204
|
* Build a filename-safe entry key from association values. Only meaningful
|
|
171
205
|
* for `layout === "per-entry"` — bulk descriptors omit it.
|
|
@@ -201,6 +235,30 @@ export interface ArtifactDescriptor<TEntry = unknown, TPreview = unknown> {
|
|
|
201
235
|
* collapsed to an unversioned path.
|
|
202
236
|
*/
|
|
203
237
|
export declare function versionedPathBuilder(slug: string, mime: ArtifactMime): (runId: RunId, _entryKey?: string, version?: string) => string;
|
|
238
|
+
/**
|
|
239
|
+
* Two-axis post-hoc path builder for the diagnosis descriptor (Plan 01-03,
|
|
240
|
+
* D-09). Filename: `diagnosis-{diagnosisVersion}-{cardSetShortHash}.json`
|
|
241
|
+
* where `cardSetShortHash = sha256(cardVersion).slice(0, 12)`.
|
|
242
|
+
*
|
|
243
|
+
* The compound version argument is `${diagnosisVersion}|${cardVersion}`
|
|
244
|
+
* to fit the existing 3-arg `objectPath` signature without growing the
|
|
245
|
+
* shared `ArtifactObjectPath` shape. Callers should use
|
|
246
|
+
* `encodeDiagnosisPathVersion()` rather than building the compound
|
|
247
|
+
* string by hand.
|
|
248
|
+
*
|
|
249
|
+
* The diagnosis descriptor opts into the bulk-versioned-with-report-axis
|
|
250
|
+
* carve-out via `pathSafetyMarker: BULK_VERSIONED_WITH_REPORT_AXIS`
|
|
251
|
+
* (ITER2-BLOCKER-01). The builder itself does not need to know about
|
|
252
|
+
* the marker — it just builds the path; the carve-out is a registration-
|
|
253
|
+
* time concern in `assertValidArtifactDescriptor`.
|
|
254
|
+
*/
|
|
255
|
+
export declare function diagnosisPathBuilder(): ArtifactObjectPath;
|
|
256
|
+
/**
|
|
257
|
+
* Pack the diagnosis descriptor's two version segments into the existing
|
|
258
|
+
* single-string `version` slot of `ArtifactObjectPath`. Separator is `|`;
|
|
259
|
+
* `diagnosisVersion` MUST NOT contain `|` (the function rejects that case).
|
|
260
|
+
*/
|
|
261
|
+
export declare function encodeDiagnosisPathVersion(diagnosisVersion: string, cardVersion: string): string;
|
|
204
262
|
/** Test-only reset for the legacy-key warning flag. Not exported publicly. */
|
|
205
263
|
export declare function __resetLegacyTestOutputsWarning(): void;
|
|
206
264
|
/**
|