@sanity/ailf 4.6.0 → 5.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/canonical/grader-references/agent-harness-tools.yaml +42 -0
- package/canonical/grader-references/knowledge-probe-recall.yaml +36 -0
- package/canonical/grader-references/mcp-server-spec.yaml +51 -0
- package/canonical/grader-references/portable-text.yaml +48 -0
- package/config/rubrics.ts +38 -2
- package/dist/_vendor/ailf-core/artifact-registry.d.ts +60 -2
- package/dist/_vendor/ailf-core/artifact-registry.js +288 -7
- package/dist/_vendor/ailf-core/examples/index.d.ts +125 -26
- package/dist/_vendor/ailf-core/examples/index.js +146 -47
- package/dist/_vendor/ailf-core/ports/context.d.ts +8 -0
- package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +15 -0
- package/dist/_vendor/ailf-core/schemas/branded-string.d.ts +40 -0
- package/dist/_vendor/ailf-core/schemas/branded-string.js +45 -0
- package/dist/_vendor/ailf-core/schemas/confidence-schema.d.ts +36 -0
- package/dist/_vendor/ailf-core/schemas/confidence-schema.js +32 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -4
- package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/schemas/index.js +9 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +1 -0
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +34 -8
- package/dist/_vendor/ailf-core/schemas/pipeline.js +23 -1
- package/dist/_vendor/ailf-core/services/diagnosis/registry.d.ts +40 -0
- package/dist/_vendor/ailf-core/services/diagnosis/registry.js +25 -0
- package/dist/_vendor/ailf-core/services/diagnosis-runner.d.ts +19 -0
- package/dist/_vendor/ailf-core/services/diagnosis-runner.js +19 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/services/index.js +5 -0
- package/dist/_vendor/ailf-core/services/report-to-markdown.js +3 -2
- package/dist/_vendor/ailf-core/types/attribution.d.ts +82 -0
- package/dist/_vendor/ailf-core/types/attribution.js +18 -0
- package/dist/_vendor/ailf-core/types/branded-ids.d.ts +26 -1
- package/dist/_vendor/ailf-core/types/branded-ids.js +80 -4
- package/dist/_vendor/ailf-core/types/confidence.d.ts +1 -1
- package/dist/_vendor/ailf-core/types/confidence.js +7 -0
- package/dist/_vendor/ailf-core/types/diagnosis.d.ts +169 -0
- package/dist/_vendor/ailf-core/types/diagnosis.js +17 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +16 -1
- package/dist/_vendor/ailf-core/types/grader-judgment.d.ts +125 -0
- package/dist/_vendor/ailf-core/types/grader-judgment.js +30 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +80 -29
- package/dist/_vendor/ailf-core/types/index.js +15 -1
- package/dist/_vendor/ailf-core/types/legacy-grader-judgment.d.ts +55 -0
- package/dist/_vendor/ailf-core/types/legacy-grader-judgment.js +30 -0
- package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +1 -0
- package/dist/_vendor/ailf-core/types/repo-config.d.ts +8 -0
- package/dist/_vendor/ailf-shared/document-ref.d.ts +1 -1
- package/dist/adapters/api-client/build-request.d.ts +1 -0
- package/dist/adapters/api-client/build-request.js +3 -0
- package/dist/adapters/attribution/attribution-meta-writer.d.ts +35 -0
- package/dist/adapters/attribution/attribution-meta-writer.js +34 -0
- package/dist/adapters/attribution/index.d.ts +9 -0
- package/dist/adapters/attribution/index.js +8 -0
- package/dist/adapters/attribution/per-entry-attribution-writer.d.ts +56 -0
- package/dist/adapters/attribution/per-entry-attribution-writer.js +49 -0
- package/dist/adapters/config-sources/file-config-adapter.js +1 -0
- package/dist/adapters/grader-outputs/index.d.ts +10 -0
- package/dist/adapters/grader-outputs/index.js +8 -0
- package/dist/adapters/grader-outputs/legacy/index.d.ts +11 -0
- package/dist/adapters/grader-outputs/legacy/index.js +10 -0
- package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.d.ts +49 -0
- package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.js +48 -0
- package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +102 -0
- package/dist/adapters/grader-outputs/promptfoo-grader-output.js +93 -0
- package/dist/adapters/index.d.ts +3 -0
- package/dist/adapters/index.js +4 -0
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +5 -1
- package/dist/adapters/task-sources/content-lake-task-source.js +28 -2
- package/dist/adapters/task-sources/repo-schemas.d.ts +79 -11
- package/dist/adapters/task-sources/repo-schemas.js +19 -2
- package/dist/commands/calculate-scores.js +1 -1
- package/dist/commands/explain-handler.js +1 -1
- package/dist/commands/lookup-doc.d.ts +1 -1
- package/dist/commands/lookup-doc.js +3 -3
- package/dist/commands/pipeline-action.d.ts +6 -0
- package/dist/commands/pipeline-action.js +2 -0
- package/dist/commands/remote-pipeline.js +1 -0
- package/dist/composition-root.d.ts +36 -0
- package/dist/composition-root.js +48 -0
- package/dist/config/rubrics.ts +38 -2
- package/dist/grader/agent-harness.d.ts +14 -0
- package/dist/grader/agent-harness.js +17 -0
- package/dist/grader/common.d.ts +17 -0
- package/dist/grader/common.js +21 -0
- package/dist/grader/index.d.ts +38 -0
- package/dist/grader/index.js +75 -0
- package/dist/grader/knowledge-probe.d.ts +14 -0
- package/dist/grader/knowledge-probe.js +18 -0
- package/dist/grader/literacy.d.ts +13 -0
- package/dist/grader/literacy.js +17 -0
- package/dist/grader/mcp.d.ts +14 -0
- package/dist/grader/mcp.js +18 -0
- package/dist/orchestration/build-app-context.js +1 -0
- package/dist/orchestration/build-step-sequence.js +5 -0
- package/dist/orchestration/steps/calculate-scores-step.js +23 -1
- package/dist/orchestration/steps/compute-attribution-step.d.ts +44 -0
- package/dist/orchestration/steps/compute-attribution-step.js +279 -0
- package/dist/orchestration/steps/gap-analysis-step.js +35 -7
- package/dist/orchestration/steps/index.d.ts +1 -0
- package/dist/orchestration/steps/index.js +1 -0
- package/dist/pipeline/attribution.d.ts +15 -0
- package/dist/pipeline/attribution.js +18 -9
- package/dist/pipeline/borderline-consensus-runner.d.ts +63 -0
- package/dist/pipeline/borderline-consensus-runner.js +124 -0
- package/dist/pipeline/borderline-detector.d.ts +24 -0
- package/dist/pipeline/borderline-detector.js +26 -0
- package/dist/pipeline/calculate-scores.d.ts +114 -3
- package/dist/pipeline/calculate-scores.js +426 -24
- package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
- package/dist/pipeline/compiler/literacy-bridge.js +35 -17
- package/dist/pipeline/compiler/rubric-resolution.d.ts +15 -0
- package/dist/pipeline/compiler/rubric-resolution.js +9 -1
- package/dist/pipeline/compute-attribution.d.ts +80 -0
- package/dist/pipeline/compute-attribution.js +196 -0
- package/dist/pipeline/failure-modes.d.ts +52 -17
- package/dist/pipeline/failure-modes.js +178 -117
- package/dist/pipeline/map-request-to-config.js +1 -0
- package/package.json +6 -4
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LegacyGraderJudgment — Phase 1 superset core only, with NO GRAD-02
|
|
3
|
+
* additive surface. Used by the read-only legacy parser at
|
|
4
|
+
* `packages/eval/src/adapters/grader-outputs/legacy/` for historical
|
|
5
|
+
* pre-Phase-3 reports.
|
|
6
|
+
*
|
|
7
|
+
* Reports are immutable events — once a Report is written to Content
|
|
8
|
+
* Lake the structured grader-judgment shape it captures cannot be
|
|
9
|
+
* back-filled. The legacy parser exists so historical-report rendering
|
|
10
|
+
* paths can keep deserializing pre-Phase-3 output through Phase 7
|
|
11
|
+
* (GRAD-06 cutover removes Studio's `reason`-only fallback rendering
|
|
12
|
+
* paths and the legacy adapter alongside).
|
|
13
|
+
*
|
|
14
|
+
* Authored INDEPENDENTLY of any Zod schema (D0045 doctrine — the
|
|
15
|
+
* legacy schema in
|
|
16
|
+
* `packages/eval/src/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.ts`
|
|
17
|
+
* `satisfies z.ZodType<LegacyGraderJudgment>` against this type, not
|
|
18
|
+
* the other way around). A tautological
|
|
19
|
+
* `satisfies z.ZodType<z.infer<typeof Schema>>` is forbidden.
|
|
20
|
+
*
|
|
21
|
+
* Invariant — live grader output that fails the strict
|
|
22
|
+
* `GraderJudgmentSchema` MUST NOT fall back to this schema. Drop to
|
|
23
|
+
* `failureMode: "unclassified"` instead. The legacy parser is invoked
|
|
24
|
+
* ONLY by historical-report rendering paths.
|
|
25
|
+
*
|
|
26
|
+
* @see ./grader-judgment.ts — the Phase 1+ structured shape (live path)
|
|
27
|
+
* @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
|
|
28
|
+
* @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
|
|
29
|
+
*/
|
|
30
|
+
/**
|
|
31
|
+
* The Phase 1 free-prose grader judgment as historical reports captured
|
|
32
|
+
* it. Mirrors the existing-pipeline-core surface of {@link GraderJudgment}
|
|
33
|
+
* (the required fields) and the pre-existing optional `outputFailure`
|
|
34
|
+
* flag. NO GRAD-02 additive fields (`subJudgments`, `docCitations`,
|
|
35
|
+
* `failureMode`, `confidence`, `hallucinationCheckedAgainst`,
|
|
36
|
+
* `metadata`) — those are by construction absent on pre-Phase-3 output.
|
|
37
|
+
*/
|
|
38
|
+
export interface LegacyGraderJudgment {
|
|
39
|
+
/** Rubric template name (e.g. "task-completion", "code-correctness"). */
|
|
40
|
+
dimension: string;
|
|
41
|
+
/** The model that produced the response being graded. */
|
|
42
|
+
modelId: string;
|
|
43
|
+
/**
|
|
44
|
+
* True when the model failed to produce meaningful output (empty
|
|
45
|
+
* response, API error, or refusal). Same semantics as
|
|
46
|
+
* {@link GraderJudgment.outputFailure}.
|
|
47
|
+
*/
|
|
48
|
+
outputFailure?: boolean;
|
|
49
|
+
/** The grader's natural-language reasoning (free-prose Phase 1 shape). */
|
|
50
|
+
reason: string;
|
|
51
|
+
/** Numeric score in [0, 100] (normalized). */
|
|
52
|
+
score: number;
|
|
53
|
+
/** The task this judgment belongs to. */
|
|
54
|
+
taskId: string;
|
|
55
|
+
}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LegacyGraderJudgment — Phase 1 superset core only, with NO GRAD-02
|
|
3
|
+
* additive surface. Used by the read-only legacy parser at
|
|
4
|
+
* `packages/eval/src/adapters/grader-outputs/legacy/` for historical
|
|
5
|
+
* pre-Phase-3 reports.
|
|
6
|
+
*
|
|
7
|
+
* Reports are immutable events — once a Report is written to Content
|
|
8
|
+
* Lake the structured grader-judgment shape it captures cannot be
|
|
9
|
+
* back-filled. The legacy parser exists so historical-report rendering
|
|
10
|
+
* paths can keep deserializing pre-Phase-3 output through Phase 7
|
|
11
|
+
* (GRAD-06 cutover removes Studio's `reason`-only fallback rendering
|
|
12
|
+
* paths and the legacy adapter alongside).
|
|
13
|
+
*
|
|
14
|
+
* Authored INDEPENDENTLY of any Zod schema (D0045 doctrine — the
|
|
15
|
+
* legacy schema in
|
|
16
|
+
* `packages/eval/src/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.ts`
|
|
17
|
+
* `satisfies z.ZodType<LegacyGraderJudgment>` against this type, not
|
|
18
|
+
* the other way around). A tautological
|
|
19
|
+
* `satisfies z.ZodType<z.infer<typeof Schema>>` is forbidden.
|
|
20
|
+
*
|
|
21
|
+
* Invariant — live grader output that fails the strict
|
|
22
|
+
* `GraderJudgmentSchema` MUST NOT fall back to this schema. Drop to
|
|
23
|
+
* `failureMode: "unclassified"` instead. The legacy parser is invoked
|
|
24
|
+
* ONLY by historical-report rendering paths.
|
|
25
|
+
*
|
|
26
|
+
* @see ./grader-judgment.ts — the Phase 1+ structured shape (live path)
|
|
27
|
+
* @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
|
|
28
|
+
* @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
|
|
29
|
+
*/
|
|
30
|
+
export {};
|
|
@@ -84,6 +84,7 @@ export interface PipelineRequest {
|
|
|
84
84
|
dataset?: string;
|
|
85
85
|
debug?: PipelineRequestDebug | boolean;
|
|
86
86
|
executor?: PipelineRequestCallerExecutor;
|
|
87
|
+
borderlineReplications?: number;
|
|
87
88
|
gapAnalysis?: boolean;
|
|
88
89
|
graderContext?: "rubric-only" | "with-docs";
|
|
89
90
|
graderReplications?: number;
|
|
@@ -47,6 +47,14 @@ export interface RepoPublishConfig {
|
|
|
47
47
|
/** Execution-tier knobs — replaces the retired `--concurrency` / `--api-url` flags. */
|
|
48
48
|
export interface RepoExecutionConfig {
|
|
49
49
|
apiUrl?: string;
|
|
50
|
+
/**
|
|
51
|
+
* Plan 03-04 GRAD-04 — replications per borderline judgment for the
|
|
52
|
+
* intra-grader consensus pass. Default 3 (set in composition-root).
|
|
53
|
+
* A judgment is "borderline" when its score lies within ±5 of any
|
|
54
|
+
* severity boundary (30/50/60). Non-borderline judgments are not
|
|
55
|
+
* re-graded.
|
|
56
|
+
*/
|
|
57
|
+
borderlineReplications?: number;
|
|
50
58
|
concurrency?: number;
|
|
51
59
|
gapAnalysis?: boolean;
|
|
52
60
|
graderReplications?: number;
|
|
@@ -8,7 +8,7 @@
|
|
|
8
8
|
* Attachable at every level of the report hierarchy:
|
|
9
9
|
* - ScoreSummary.documentManifest — all docs used in the evaluation
|
|
10
10
|
* - FeatureScore.documents — docs used for a specific area
|
|
11
|
-
* - StoredJudgment.canonicalDocs — docs expected for a specific task
|
|
11
|
+
* - StoredJudgment.contextDocs (legacy alias: canonicalDocs) — docs expected for a specific task
|
|
12
12
|
*/
|
|
13
13
|
export interface DocumentRef {
|
|
14
14
|
/**
|
|
@@ -130,6 +130,9 @@ export async function buildRemoteRequest(options) {
|
|
|
130
130
|
if (config.graderReplications) {
|
|
131
131
|
raw.graderReplications = config.graderReplications;
|
|
132
132
|
}
|
|
133
|
+
if (config.borderlineReplications) {
|
|
134
|
+
raw.borderlineReplications = config.borderlineReplications;
|
|
135
|
+
}
|
|
133
136
|
if (config.gapAnalysisEnabled)
|
|
134
137
|
raw.gapAnalysis = true;
|
|
135
138
|
if (config.noRemoteCache)
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* attribution-meta-writer.ts — Zod schema for the run-scoped
|
|
3
|
+
* attribution metadata artifact (ATTR-01) emitted by Phase 4 and read
|
|
4
|
+
* back alongside the per-entry attribution objects.
|
|
5
|
+
*
|
|
6
|
+
* The schema asserts `satisfies z.ZodType<AttributionMeta>` against the
|
|
7
|
+
* canonical domain type in `packages/core/src/types/attribution.ts`
|
|
8
|
+
* (D0045 / W0187) — drift is a build error.
|
|
9
|
+
*
|
|
10
|
+
* `embeddingModel` is REQUIRED (Pitfall #6): silently downgrading to a
|
|
11
|
+
* default has caused regressions in adjacent codebases — model swaps
|
|
12
|
+
* MUST invalidate cached weights.
|
|
13
|
+
*
|
|
14
|
+
* Phase 1 lands the SHAPE only — no compute, no file I/O.
|
|
15
|
+
*
|
|
16
|
+
* @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
|
|
17
|
+
* @see docs/design-docs/actionability-ladder/04-per-document-attribution-ensemble.md
|
|
18
|
+
*/
|
|
19
|
+
import { z } from "zod";
|
|
20
|
+
/**
|
|
21
|
+
* Canonical schema for {@link AttributionMeta}. Persisted at
|
|
22
|
+
* `runs/{runId}/attribution/_meta.json` (or whatever bulk path the
|
|
23
|
+
* Phase 4 descriptor pins) and parsed on read.
|
|
24
|
+
*/
|
|
25
|
+
export declare const AttributionMetaSchema: z.ZodObject<{
|
|
26
|
+
ensembleVersion: z.ZodString;
|
|
27
|
+
embeddingModel: z.ZodString;
|
|
28
|
+
calibrationSetVersion: z.ZodOptional<z.ZodString>;
|
|
29
|
+
weights: z.ZodObject<{
|
|
30
|
+
citation: z.ZodNumber;
|
|
31
|
+
canonical: z.ZodNumber;
|
|
32
|
+
retrieved: z.ZodNumber;
|
|
33
|
+
}, z.core.$strip>;
|
|
34
|
+
}, z.core.$strip>;
|
|
35
|
+
export type { AttributionMeta } from "../../_vendor/ailf-core/index.d.ts";
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* attribution-meta-writer.ts — Zod schema for the run-scoped
|
|
3
|
+
* attribution metadata artifact (ATTR-01) emitted by Phase 4 and read
|
|
4
|
+
* back alongside the per-entry attribution objects.
|
|
5
|
+
*
|
|
6
|
+
* The schema asserts `satisfies z.ZodType<AttributionMeta>` against the
|
|
7
|
+
* canonical domain type in `packages/core/src/types/attribution.ts`
|
|
8
|
+
* (D0045 / W0187) — drift is a build error.
|
|
9
|
+
*
|
|
10
|
+
* `embeddingModel` is REQUIRED (Pitfall #6): silently downgrading to a
|
|
11
|
+
* default has caused regressions in adjacent codebases — model swaps
|
|
12
|
+
* MUST invalidate cached weights.
|
|
13
|
+
*
|
|
14
|
+
* Phase 1 lands the SHAPE only — no compute, no file I/O.
|
|
15
|
+
*
|
|
16
|
+
* @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
|
|
17
|
+
* @see docs/design-docs/actionability-ladder/04-per-document-attribution-ensemble.md
|
|
18
|
+
*/
|
|
19
|
+
import { z } from "zod";
|
|
20
|
+
/**
|
|
21
|
+
* Canonical schema for {@link AttributionMeta}. Persisted at
|
|
22
|
+
* `runs/{runId}/attribution/_meta.json` (or whatever bulk path the
|
|
23
|
+
* Phase 4 descriptor pins) and parsed on read.
|
|
24
|
+
*/
|
|
25
|
+
export const AttributionMetaSchema = z.object({
|
|
26
|
+
ensembleVersion: z.string().min(1),
|
|
27
|
+
embeddingModel: z.string().min(1),
|
|
28
|
+
calibrationSetVersion: z.string().optional(),
|
|
29
|
+
weights: z.object({
|
|
30
|
+
citation: z.number(),
|
|
31
|
+
canonical: z.number(),
|
|
32
|
+
retrieved: z.number(),
|
|
33
|
+
}),
|
|
34
|
+
});
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* attribution adapter barrel — named re-exports only (W0124 / D0045).
|
|
3
|
+
*
|
|
4
|
+
* The attribution schemas live here so they enter the D0045
|
|
5
|
+
* `pnpm check-trust-boundary-satisfies` SCAN_ROOTS gate.
|
|
6
|
+
*/
|
|
7
|
+
export { JudgmentAttributionSchema } from "./per-entry-attribution-writer.js";
|
|
8
|
+
export { AttributionMetaSchema } from "./attribution-meta-writer.js";
|
|
9
|
+
export type { AttributionMeta, DocAttribution, JudgmentAttribution, } from "../../_vendor/ailf-core/index.d.ts";
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* attribution adapter barrel — named re-exports only (W0124 / D0045).
|
|
3
|
+
*
|
|
4
|
+
* The attribution schemas live here so they enter the D0045
|
|
5
|
+
* `pnpm check-trust-boundary-satisfies` SCAN_ROOTS gate.
|
|
6
|
+
*/
|
|
7
|
+
export { JudgmentAttributionSchema } from "./per-entry-attribution-writer.js";
|
|
8
|
+
export { AttributionMetaSchema } from "./attribution-meta-writer.js";
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* per-entry-attribution-writer.ts — Zod schema for the per-judgment
|
|
3
|
+
* attribution artifact (ATTR-01) emitted by Phase 4's
|
|
4
|
+
* `ComputeAttributionStep` and read back by Phase 5's diagnosis runner.
|
|
5
|
+
*
|
|
6
|
+
* The schema asserts `satisfies z.ZodType<JudgmentAttribution>` against
|
|
7
|
+
* the canonical domain type in `packages/core/src/types/attribution.ts`
|
|
8
|
+
* (D0045 / W0187) — drift between schema and type is a build error.
|
|
9
|
+
*
|
|
10
|
+
* Phase 1 lands the SHAPE only — no compute, no file I/O. Phase 4 wires
|
|
11
|
+
* the writer; Phase 5 wires the reader. Both `satisfies` against this
|
|
12
|
+
* single source-of-truth schema.
|
|
13
|
+
*
|
|
14
|
+
* `hallucinationCheckedAgainst` is REQUIRED (Pitfall #11): consumers
|
|
15
|
+
* must be able to audit citation grounding without re-deriving the
|
|
16
|
+
* resolvable-set. The canonical task field is `contextDocs`; do NOT
|
|
17
|
+
* invent `expectedDocs` / `usedDocs` synonyms.
|
|
18
|
+
*
|
|
19
|
+
* @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
|
|
20
|
+
* @see docs/decisions/D0049-shared-confidence-contract.md
|
|
21
|
+
* @see docs/decisions/D0052-judgment-ref-granularity.md
|
|
22
|
+
* @see docs/design-docs/actionability-ladder/04-per-document-attribution-ensemble.md
|
|
23
|
+
*/
|
|
24
|
+
import { z } from "zod";
|
|
25
|
+
/**
|
|
26
|
+
* Canonical schema for {@link JudgmentAttribution}. Persisted at
|
|
27
|
+
* `runs/{runId}/attribution/{entryKey}.json` (Phase 4) and parsed by
|
|
28
|
+
* the diagnosis runner on read (Phase 5).
|
|
29
|
+
*/
|
|
30
|
+
export declare const JudgmentAttributionSchema: z.ZodObject<{
|
|
31
|
+
judgmentRef: z.ZodString;
|
|
32
|
+
taskId: z.ZodString;
|
|
33
|
+
modelId: z.ZodString;
|
|
34
|
+
dimension: z.ZodString;
|
|
35
|
+
attributions: z.ZodArray<z.ZodObject<{
|
|
36
|
+
documentId: z.ZodString;
|
|
37
|
+
slug: z.ZodOptional<z.ZodString>;
|
|
38
|
+
score: z.ZodNumber;
|
|
39
|
+
signals: z.ZodObject<{
|
|
40
|
+
citation: z.ZodOptional<z.ZodNumber>;
|
|
41
|
+
canonical: z.ZodOptional<z.ZodNumber>;
|
|
42
|
+
retrieved: z.ZodOptional<z.ZodNumber>;
|
|
43
|
+
}, z.core.$strip>;
|
|
44
|
+
confidence: z.ZodObject<{
|
|
45
|
+
level: z.ZodEnum<{
|
|
46
|
+
low: "low";
|
|
47
|
+
medium: "medium";
|
|
48
|
+
high: "high";
|
|
49
|
+
}>;
|
|
50
|
+
signalsPresent: z.ZodNumber;
|
|
51
|
+
derivation: z.ZodString;
|
|
52
|
+
}, z.core.$strip>;
|
|
53
|
+
}, z.core.$strip>>;
|
|
54
|
+
hallucinationCheckedAgainst: z.ZodArray<z.ZodString>;
|
|
55
|
+
}, z.core.$strip>;
|
|
56
|
+
export type { DocAttribution, JudgmentAttribution } from "../../_vendor/ailf-core/index.d.ts";
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* per-entry-attribution-writer.ts — Zod schema for the per-judgment
|
|
3
|
+
* attribution artifact (ATTR-01) emitted by Phase 4's
|
|
4
|
+
* `ComputeAttributionStep` and read back by Phase 5's diagnosis runner.
|
|
5
|
+
*
|
|
6
|
+
* The schema asserts `satisfies z.ZodType<JudgmentAttribution>` against
|
|
7
|
+
* the canonical domain type in `packages/core/src/types/attribution.ts`
|
|
8
|
+
* (D0045 / W0187) — drift between schema and type is a build error.
|
|
9
|
+
*
|
|
10
|
+
* Phase 1 lands the SHAPE only — no compute, no file I/O. Phase 4 wires
|
|
11
|
+
* the writer; Phase 5 wires the reader. Both `satisfies` against this
|
|
12
|
+
* single source-of-truth schema.
|
|
13
|
+
*
|
|
14
|
+
* `hallucinationCheckedAgainst` is REQUIRED (Pitfall #11): consumers
|
|
15
|
+
* must be able to audit citation grounding without re-deriving the
|
|
16
|
+
* resolvable-set. The canonical task field is `contextDocs`; do NOT
|
|
17
|
+
* invent `expectedDocs` / `usedDocs` synonyms.
|
|
18
|
+
*
|
|
19
|
+
* @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
|
|
20
|
+
* @see docs/decisions/D0049-shared-confidence-contract.md
|
|
21
|
+
* @see docs/decisions/D0052-judgment-ref-granularity.md
|
|
22
|
+
* @see docs/design-docs/actionability-ladder/04-per-document-attribution-ensemble.md
|
|
23
|
+
*/
|
|
24
|
+
import { z } from "zod";
|
|
25
|
+
import { ConfidenceSchema } from "../../_vendor/ailf-core/schemas/index.js";
|
|
26
|
+
const DocAttributionSchema = z.object({
|
|
27
|
+
documentId: z.string().min(1),
|
|
28
|
+
slug: z.string().optional(),
|
|
29
|
+
score: z.number().min(0).max(1),
|
|
30
|
+
signals: z.object({
|
|
31
|
+
citation: z.number().min(0).max(1).optional(),
|
|
32
|
+
canonical: z.number().min(0).max(1).optional(),
|
|
33
|
+
retrieved: z.number().min(0).max(1).optional(),
|
|
34
|
+
}),
|
|
35
|
+
confidence: ConfidenceSchema,
|
|
36
|
+
});
|
|
37
|
+
/**
|
|
38
|
+
* Canonical schema for {@link JudgmentAttribution}. Persisted at
|
|
39
|
+
* `runs/{runId}/attribution/{entryKey}.json` (Phase 4) and parsed by
|
|
40
|
+
* the diagnosis runner on read (Phase 5).
|
|
41
|
+
*/
|
|
42
|
+
export const JudgmentAttributionSchema = z.object({
|
|
43
|
+
judgmentRef: z.string().min(1),
|
|
44
|
+
taskId: z.string().min(1),
|
|
45
|
+
modelId: z.string().min(1),
|
|
46
|
+
dimension: z.string().min(1),
|
|
47
|
+
attributions: z.array(DocAttributionSchema),
|
|
48
|
+
hallucinationCheckedAgainst: z.array(z.string()),
|
|
49
|
+
});
|
|
@@ -125,6 +125,7 @@ function mapEvalConfigToResolvedConfig(config, rootDir) {
|
|
|
125
125
|
noCache: config.noCache ?? false,
|
|
126
126
|
noRemoteCache: config.noRemoteCache ?? false,
|
|
127
127
|
graderReplications: config.execution?.graderReplications,
|
|
128
|
+
borderlineReplications: config.execution?.borderlineReplications,
|
|
128
129
|
graderContext: config.grader?.context,
|
|
129
130
|
urls: config.urls,
|
|
130
131
|
headers: config.agentic?.headers,
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* grader-outputs adapter barrel — named re-exports only (W0124 / D0045).
|
|
3
|
+
*
|
|
4
|
+
* The grader-output schema lives here so it enters the D0045
|
|
5
|
+
* `pnpm check-trust-boundary-satisfies` SCAN_ROOTS gate.
|
|
6
|
+
*/
|
|
7
|
+
export { GraderJudgmentSchema, graderJudgmentsVersion, } from "./promptfoo-grader-output.js";
|
|
8
|
+
export type { GraderJudgment } from "../../_vendor/ailf-core/index.d.ts";
|
|
9
|
+
export { LegacyGraderJudgmentSchema } from "./legacy/index.js";
|
|
10
|
+
export type { LegacyGraderJudgment } from "../../_vendor/ailf-core/index.d.ts";
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* grader-outputs adapter barrel — named re-exports only (W0124 / D0045).
|
|
3
|
+
*
|
|
4
|
+
* The grader-output schema lives here so it enters the D0045
|
|
5
|
+
* `pnpm check-trust-boundary-satisfies` SCAN_ROOTS gate.
|
|
6
|
+
*/
|
|
7
|
+
export { GraderJudgmentSchema, graderJudgmentsVersion, } from "./promptfoo-grader-output.js";
|
|
8
|
+
export { LegacyGraderJudgmentSchema } from "./legacy/index.js";
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* legacy grader-outputs adapter sub-barrel — named re-exports only
|
|
3
|
+
* (W0124 / D0045).
|
|
4
|
+
*
|
|
5
|
+
* Read-only schema for the Phase 1 free-prose grader-output shape,
|
|
6
|
+
* invoked only by historical-report rendering paths through Phase 7
|
|
7
|
+
* (GRAD-06 cutover). The schema lives here so it enters the D0045
|
|
8
|
+
* `pnpm check-trust-boundary-satisfies` SCAN_ROOTS gate.
|
|
9
|
+
*/
|
|
10
|
+
export { LegacyGraderJudgmentSchema } from "./promptfoo-grader-output-legacy.js";
|
|
11
|
+
export type { LegacyGraderJudgment } from "../../../_vendor/ailf-core/index.d.ts";
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* legacy grader-outputs adapter sub-barrel — named re-exports only
|
|
3
|
+
* (W0124 / D0045).
|
|
4
|
+
*
|
|
5
|
+
* Read-only schema for the Phase 1 free-prose grader-output shape,
|
|
6
|
+
* invoked only by historical-report rendering paths through Phase 7
|
|
7
|
+
* (GRAD-06 cutover). The schema lives here so it enters the D0045
|
|
8
|
+
* `pnpm check-trust-boundary-satisfies` SCAN_ROOTS gate.
|
|
9
|
+
*/
|
|
10
|
+
export { LegacyGraderJudgmentSchema } from "./promptfoo-grader-output-legacy.js";
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* promptfoo-grader-output-legacy.ts — Zod schema for the Phase 1
|
|
3
|
+
* free-prose grader-output shape, used by historical-report rendering
|
|
4
|
+
* paths.
|
|
5
|
+
*
|
|
6
|
+
* READ-ONLY: invoked only by historical-report rendering paths through
|
|
7
|
+
* Phase 7 (GRAD-06 cutover). Reports are immutable events — once a
|
|
8
|
+
* Report is written to Content Lake, the structured grader-judgment
|
|
9
|
+
* shape it captures cannot be back-filled. The legacy schema exists so
|
|
10
|
+
* pre-Phase-3 reports continue to deserialize cleanly.
|
|
11
|
+
*
|
|
12
|
+
* Live grader output that fails the strict {@link GraderJudgmentSchema}
|
|
13
|
+
* parse must NOT fall back to this schema. Drop to
|
|
14
|
+
* `failureMode: "unclassified"` instead. Strict and legacy schemas are
|
|
15
|
+
* deliberate siblings, not a legacy/canonical pair to consolidate.
|
|
16
|
+
*
|
|
17
|
+
* The schema asserts `satisfies z.ZodType<LegacyGraderJudgment>` against
|
|
18
|
+
* the canonical domain type in
|
|
19
|
+
* `packages/core/src/types/legacy-grader-judgment.ts` (D0045 / W0187) —
|
|
20
|
+
* drift between schema and type is a build error. The domain type is
|
|
21
|
+
* authored independently in `@sanity/ailf-core`; this file authors ONLY
|
|
22
|
+
* the schema and never derives the domain type from the schema itself
|
|
23
|
+
* (no schema-derived self-reference allowed by D0045).
|
|
24
|
+
*
|
|
25
|
+
* @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
|
|
26
|
+
* @see ../promptfoo-grader-output.ts — the strict (live-path) sibling
|
|
27
|
+
* @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
|
|
28
|
+
* §"Backwards compatibility"
|
|
29
|
+
*/
|
|
30
|
+
import { z } from "zod";
|
|
31
|
+
/**
|
|
32
|
+
* Canonical schema for {@link LegacyGraderJudgment}. Mirrors the Phase 1
|
|
33
|
+
* superset core (`taskId`, `modelId`, `dimension`, `reason`, `score`,
|
|
34
|
+
* optional `outputFailure`). NO GRAD-02 additive fields — those are by
|
|
35
|
+
* construction absent on pre-Phase-3 output.
|
|
36
|
+
*
|
|
37
|
+
* Intentionally NOT `.strict()` — pre-Phase-3 reports may carry stray
|
|
38
|
+
* keys; the legacy parser tolerates them so historical-report rendering
|
|
39
|
+
* keeps working through the GRAD-06 cutover.
|
|
40
|
+
*/
|
|
41
|
+
export declare const LegacyGraderJudgmentSchema: z.ZodObject<{
|
|
42
|
+
taskId: z.ZodString;
|
|
43
|
+
modelId: z.ZodString;
|
|
44
|
+
dimension: z.ZodString;
|
|
45
|
+
reason: z.ZodString;
|
|
46
|
+
score: z.ZodNumber;
|
|
47
|
+
outputFailure: z.ZodOptional<z.ZodBoolean>;
|
|
48
|
+
}, z.core.$strip>;
|
|
49
|
+
export type { LegacyGraderJudgment } from "../../../_vendor/ailf-core/index.d.ts";
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* promptfoo-grader-output-legacy.ts — Zod schema for the Phase 1
|
|
3
|
+
* free-prose grader-output shape, used by historical-report rendering
|
|
4
|
+
* paths.
|
|
5
|
+
*
|
|
6
|
+
* READ-ONLY: invoked only by historical-report rendering paths through
|
|
7
|
+
* Phase 7 (GRAD-06 cutover). Reports are immutable events — once a
|
|
8
|
+
* Report is written to Content Lake, the structured grader-judgment
|
|
9
|
+
* shape it captures cannot be back-filled. The legacy schema exists so
|
|
10
|
+
* pre-Phase-3 reports continue to deserialize cleanly.
|
|
11
|
+
*
|
|
12
|
+
* Live grader output that fails the strict {@link GraderJudgmentSchema}
|
|
13
|
+
* parse must NOT fall back to this schema. Drop to
|
|
14
|
+
* `failureMode: "unclassified"` instead. Strict and legacy schemas are
|
|
15
|
+
* deliberate siblings, not a legacy/canonical pair to consolidate.
|
|
16
|
+
*
|
|
17
|
+
* The schema asserts `satisfies z.ZodType<LegacyGraderJudgment>` against
|
|
18
|
+
* the canonical domain type in
|
|
19
|
+
* `packages/core/src/types/legacy-grader-judgment.ts` (D0045 / W0187) —
|
|
20
|
+
* drift between schema and type is a build error. The domain type is
|
|
21
|
+
* authored independently in `@sanity/ailf-core`; this file authors ONLY
|
|
22
|
+
* the schema and never derives the domain type from the schema itself
|
|
23
|
+
* (no schema-derived self-reference allowed by D0045).
|
|
24
|
+
*
|
|
25
|
+
* @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
|
|
26
|
+
* @see ../promptfoo-grader-output.ts — the strict (live-path) sibling
|
|
27
|
+
* @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
|
|
28
|
+
* §"Backwards compatibility"
|
|
29
|
+
*/
|
|
30
|
+
import { z } from "zod";
|
|
31
|
+
/**
 * Builds a fresh "non-empty string" validator. A new instance is created on
 * every call so each field owns its own schema object, exactly as if
 * `z.string().min(1)` had been written inline for that field.
 */
const nonEmptyString = () => z.string().min(1);

/**
 * Zod schema for {@link LegacyGraderJudgment} — the judgment shape carried by
 * reports produced before Phase 3.
 *
 * The fields are limited to the Phase 1 superset core: `taskId`, `modelId`,
 * `dimension`, `reason`, `score`, plus an optional `outputFailure` flag.
 * The GRAD-02 additive fields are deliberately left out: pre-Phase-3 output
 * never carried them.
 *
 * The object is intentionally NOT `.strict()`. Pre-Phase-3 reports may
 * contain stray keys, and tolerating them keeps historical-report rendering
 * working through the GRAD-06 cutover.
 */
export const LegacyGraderJudgmentSchema = z.object({
    // Identifying fields: each must be a non-empty string.
    taskId: nonEmptyString(),
    modelId: nonEmptyString(),
    dimension: nonEmptyString(),
    // Grader verdict: `reason` may be an empty string.
    reason: z.string(),
    score: z.number(),
    // May be absent from a stored judgment.
    outputFailure: z.optional(z.boolean()),
});
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* promptfoo-grader-output.ts — Zod schema for the structured grader output
|
|
3
|
+
* (GRAD-02) emitted by the promptfoo grader process and consumed by the
|
|
4
|
+
* eval pipeline.
|
|
5
|
+
*
|
|
6
|
+
* The schema asserts `satisfies z.ZodType<GraderJudgment>` against the
|
|
7
|
+
* canonical domain type in `packages/core/src/types/grader-judgment.ts`
|
|
8
|
+
* (D0045 / W0187) — drift between schema and type is a build error.
|
|
9
|
+
* The domain type was authored independently in Plan 01-01; this file
|
|
10
|
+
* authors ONLY the schema and never derives the domain type from the
|
|
11
|
+
* schema itself (no schema-derived self-reference allowed by D0045).
|
|
12
|
+
*
|
|
13
|
+
* `graderJudgmentsVersion` is co-located with the schema (VER-01 D-02 —
|
|
14
|
+
* source-of-truth file owns its version constant). Bumped by hand when
|
|
15
|
+
* the grader rubric, prompt template, or judgment shape changes.
|
|
16
|
+
*
|
|
17
|
+
* Phase 3 will replace the inline `JSON.parse` at
|
|
18
|
+
* `pipeline/calculate-scores.ts:380-392` (Pitfall #4) so all grader
|
|
19
|
+
* output flows through this schema.
|
|
20
|
+
*
|
|
21
|
+
* @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
|
|
22
|
+
* @see docs/decisions/D0049-shared-confidence-contract.md
|
|
23
|
+
* @see docs/decisions/D0052-judgment-ref-granularity.md
|
|
24
|
+
* @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
|
|
25
|
+
*/
|
|
26
|
+
import { z } from "zod";
|
|
27
|
+
/**
 * Version of the grader-judgment contract, kept next to the schema it
 * describes (VER-01 D-02: the source-of-truth file owns its version
 * constant).
 *
 * Bump it by hand whenever the grader rubric, the prompt template, or the
 * judgment shape changes in a way that should invalidate cached Diagnoses.
 *
 * History: Phase 3 GRAD-05 moved it from `"0.1.0"` to `"1.0.0"` (a semver
 * major bump) because the additive GRAD-02 surface became required and the
 * schema became `.strict()`. AILF has no installed external base; reports
 * that are already stored are read through the legacy parser at
 * `./legacy/promptfoo-grader-output-legacy.ts`.
 */
export declare const graderJudgmentsVersion: "1.0.0";
|
|
39
|
+
/** Branded string type produced by the `judgmentId` field. */
type JudgmentIdBrand = import("@sanity/ailf-core").Brand<string, "JudgmentId">;

/**
 * Confidence object shape. Used both for the judgment as a whole and for
 * each entry of `subJudgments`.
 */
type ConfidenceObjectSchema = z.ZodObject<{
    level: z.ZodEnum<{
        low: "low";
        medium: "medium";
        high: "high";
    }>;
    signalsPresent: z.ZodNumber;
    derivation: z.ZodString;
}, z.core.$strip>;

/** One entry of `subJudgments`: a single criterion and whether it was met. */
type SubJudgmentObjectSchema = z.ZodObject<{
    criterionId: z.ZodString;
    met: z.ZodBoolean;
    evidence: z.ZodString;
    confidence: ConfidenceObjectSchema;
}, z.core.$strip>;

/** One entry of `docCitations`: a cited document and the role it plays. */
type DocCitationObjectSchema = z.ZodObject<{
    documentId: z.ZodString;
    slug: z.ZodOptional<z.ZodString>;
    role: z.ZodEnum<{
        supports: "supports";
        contradicts: "contradicts";
        missing: "missing";
        irrelevant: "irrelevant";
    }>;
    hallucinated: z.ZodOptional<z.ZodBoolean>;
}, z.core.$strip>;

/**
 * Canonical schema for {@link GraderJudgment}.
 *
 * The first required fields mirror the existing pipeline core
 * (Doc 03 §"existing, unchanged"): `taskId`, `modelId`, `dimension`,
 * `reason`, `score`. Phase 3 GRAD-05 made the additive surface required and
 * added `.strict()`, so unknown keys are rejected — defense-in-depth against
 * future prompt-injection attempts that try to smuggle keys through the
 * grader emission. In this declaration only the top-level object is typed
 * `$strict`; the nested objects (`subJudgments` entries, `docCitations`
 * entries, `confidence`, `metadata`) are typed `$strip`.
 *
 * The branded `JudgmentId` is a non-empty string at runtime. The schema
 * routes the brand through `brandedString<"JudgmentId">()`, the project's
 * single audited cast site for branded-string schemas (project typescript
 * rule: no `as` on `unknown`).
 */
export declare const GraderJudgmentSchema: z.ZodObject<{
    taskId: z.ZodString;
    modelId: z.ZodString;
    dimension: z.ZodString;
    reason: z.ZodString;
    score: z.ZodNumber;
    outputFailure: z.ZodOptional<z.ZodBoolean>;
    judgmentId: z.ZodType<JudgmentIdBrand, unknown, z.core.$ZodTypeInternals<JudgmentIdBrand, unknown>>;
    subJudgments: z.ZodArray<SubJudgmentObjectSchema>;
    docCitations: z.ZodArray<DocCitationObjectSchema>;
    failureMode: z.ZodString;
    confidence: ConfidenceObjectSchema;
    hallucinationCheckedAgainst: z.ZodArray<z.ZodString>;
    metadata: z.ZodObject<{
        graderModel: z.ZodString;
        graderJudgmentsVersion: z.ZodString;
    }, z.core.$strip>;
}, z.core.$strict>;
|
|
102
|
+
export type { GraderJudgment } from "../../_vendor/ailf-core/index.d.ts";
|