@sanity/ailf 4.6.0 → 6.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/canonical/grader-references/agent-harness-tools.yaml +42 -0
- package/canonical/grader-references/knowledge-probe-recall.yaml +36 -0
- package/canonical/grader-references/mcp-server-spec.yaml +51 -0
- package/canonical/grader-references/portable-text.yaml +48 -0
- package/config/diagnosis-cards.ts +318 -0
- package/config/models.ts +12 -0
- package/config/rubrics.ts +38 -2
- package/dist/_vendor/ailf-core/artifact-registry.d.ts +60 -2
- package/dist/_vendor/ailf-core/artifact-registry.js +288 -7
- package/dist/_vendor/ailf-core/examples/index.d.ts +125 -26
- package/dist/_vendor/ailf-core/examples/index.js +146 -47
- package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.js +16 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/common.d.ts +14 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/common.js +18 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/index.d.ts +45 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/index.js +109 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.js +17 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/literacy.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/literacy.js +17 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/mcp.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/mcp.js +17 -0
- package/dist/_vendor/ailf-core/index.d.ts +1 -0
- package/dist/_vendor/ailf-core/index.js +4 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +8 -0
- package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +15 -0
- package/dist/_vendor/ailf-core/schemas/branded-string.d.ts +40 -0
- package/dist/_vendor/ailf-core/schemas/branded-string.js +45 -0
- package/dist/_vendor/ailf-core/schemas/confidence-schema.d.ts +36 -0
- package/dist/_vendor/ailf-core/schemas/confidence-schema.js +32 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -4
- package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/schemas/index.js +9 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +1 -0
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +34 -8
- package/dist/_vendor/ailf-core/schemas/pipeline.js +23 -1
- package/dist/_vendor/ailf-core/services/diagnosis/card-validators.d.ts +41 -0
- package/dist/_vendor/ailf-core/services/diagnosis/card-validators.js +40 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.d.ts +7 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.js +131 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.d.ts +7 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.js +171 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.d.ts +7 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.js +155 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.d.ts +17 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.js +43 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.d.ts +46 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.js +104 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.d.ts +28 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.js +96 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/index.d.ts +39 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/index.js +52 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.d.ts +27 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.js +77 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.d.ts +32 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.js +71 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.d.ts +44 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.js +126 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.d.ts +41 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.js +107 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.d.ts +43 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.js +114 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.d.ts +72 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.js +273 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.d.ts +17 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.js +58 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.d.ts +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.js +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.d.ts +15 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.js +53 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.d.ts +14 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.js +63 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.d.ts +16 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.js +78 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.d.ts +16 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.js +86 -0
- package/dist/_vendor/ailf-core/services/diagnosis/registry.d.ts +50 -0
- package/dist/_vendor/ailf-core/services/diagnosis/registry.js +35 -0
- package/dist/_vendor/ailf-core/services/diagnosis-runner.d.ts +136 -0
- package/dist/_vendor/ailf-core/services/diagnosis-runner.js +153 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +6 -0
- package/dist/_vendor/ailf-core/services/index.js +18 -0
- package/dist/_vendor/ailf-core/services/llm-client-factory.d.ts +64 -0
- package/dist/_vendor/ailf-core/services/llm-client-factory.js +54 -0
- package/dist/_vendor/ailf-core/services/report-to-markdown.js +3 -2
- package/dist/_vendor/ailf-core/types/attribution.d.ts +82 -0
- package/dist/_vendor/ailf-core/types/attribution.js +18 -0
- package/dist/_vendor/ailf-core/types/branded-ids.d.ts +26 -1
- package/dist/_vendor/ailf-core/types/branded-ids.js +80 -4
- package/dist/_vendor/ailf-core/types/confidence.d.ts +1 -1
- package/dist/_vendor/ailf-core/types/confidence.js +7 -0
- package/dist/_vendor/ailf-core/types/diagnosis.d.ts +271 -0
- package/dist/_vendor/ailf-core/types/diagnosis.js +19 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +16 -1
- package/dist/_vendor/ailf-core/types/grader-judgment.d.ts +125 -0
- package/dist/_vendor/ailf-core/types/grader-judgment.js +30 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +80 -29
- package/dist/_vendor/ailf-core/types/index.js +15 -1
- package/dist/_vendor/ailf-core/types/legacy-grader-judgment.d.ts +55 -0
- package/dist/_vendor/ailf-core/types/legacy-grader-judgment.js +30 -0
- package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +1 -0
- package/dist/_vendor/ailf-core/types/repo-config.d.ts +8 -0
- package/dist/_vendor/ailf-shared/document-ref.d.ts +1 -1
- package/dist/adapters/api-client/build-request.d.ts +1 -0
- package/dist/adapters/api-client/build-request.js +3 -0
- package/dist/adapters/attribution/attribution-meta-writer.d.ts +35 -0
- package/dist/adapters/attribution/attribution-meta-writer.js +34 -0
- package/dist/adapters/attribution/index.d.ts +9 -0
- package/dist/adapters/attribution/index.js +8 -0
- package/dist/adapters/attribution/per-entry-attribution-writer.d.ts +56 -0
- package/dist/adapters/attribution/per-entry-attribution-writer.js +49 -0
- package/dist/adapters/config-sources/file-config-adapter.js +1 -0
- package/dist/adapters/grader-outputs/index.d.ts +10 -0
- package/dist/adapters/grader-outputs/index.js +8 -0
- package/dist/adapters/grader-outputs/legacy/index.d.ts +11 -0
- package/dist/adapters/grader-outputs/legacy/index.js +10 -0
- package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.d.ts +49 -0
- package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.js +48 -0
- package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +102 -0
- package/dist/adapters/grader-outputs/promptfoo-grader-output.js +93 -0
- package/dist/adapters/index.d.ts +3 -0
- package/dist/adapters/index.js +4 -0
- package/dist/adapters/llm/fake-llm-client.d.ts +20 -0
- package/dist/adapters/llm/fake-llm-client.js +38 -1
- package/dist/adapters/llm/openai-llm-client.js +52 -3
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +5 -1
- package/dist/adapters/task-sources/content-lake-task-source.js +28 -2
- package/dist/adapters/task-sources/repo-schemas.d.ts +79 -11
- package/dist/adapters/task-sources/repo-schemas.js +19 -2
- package/dist/cli-program.js +3 -0
- package/dist/commands/calculate-scores.js +1 -1
- package/dist/commands/explain-handler.js +1 -1
- package/dist/commands/interpret.d.ts +50 -0
- package/dist/commands/interpret.js +212 -0
- package/dist/commands/lookup-doc.d.ts +1 -1
- package/dist/commands/lookup-doc.js +3 -3
- package/dist/commands/pipeline-action.d.ts +6 -0
- package/dist/commands/pipeline-action.js +2 -0
- package/dist/commands/remote-pipeline.js +1 -0
- package/dist/composition-root.d.ts +57 -23
- package/dist/composition-root.js +155 -41
- package/dist/config/diagnosis-cards.ts +318 -0
- package/dist/config/models.ts +12 -0
- package/dist/config/rubrics.ts +38 -2
- package/dist/grader/agent-harness.d.ts +9 -0
- package/dist/grader/agent-harness.js +9 -0
- package/dist/grader/common.d.ts +9 -0
- package/dist/grader/common.js +9 -0
- package/dist/grader/index.d.ts +24 -0
- package/dist/grader/index.js +24 -0
- package/dist/grader/knowledge-probe.d.ts +9 -0
- package/dist/grader/knowledge-probe.js +9 -0
- package/dist/grader/literacy.d.ts +9 -0
- package/dist/grader/literacy.js +9 -0
- package/dist/grader/mcp.d.ts +9 -0
- package/dist/grader/mcp.js +9 -0
- package/dist/orchestration/build-app-context.js +1 -0
- package/dist/orchestration/build-step-sequence.js +5 -0
- package/dist/orchestration/steps/calculate-scores-step.js +23 -1
- package/dist/orchestration/steps/compute-attribution-step.d.ts +44 -0
- package/dist/orchestration/steps/compute-attribution-step.js +279 -0
- package/dist/orchestration/steps/gap-analysis-step.js +35 -7
- package/dist/orchestration/steps/index.d.ts +1 -0
- package/dist/orchestration/steps/index.js +1 -0
- package/dist/pipeline/attribution.d.ts +15 -0
- package/dist/pipeline/attribution.js +18 -9
- package/dist/pipeline/borderline-consensus-runner.d.ts +63 -0
- package/dist/pipeline/borderline-consensus-runner.js +124 -0
- package/dist/pipeline/borderline-detector.d.ts +24 -0
- package/dist/pipeline/borderline-detector.js +26 -0
- package/dist/pipeline/calculate-scores.d.ts +114 -3
- package/dist/pipeline/calculate-scores.js +426 -24
- package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
- package/dist/pipeline/compiler/literacy-bridge.js +35 -17
- package/dist/pipeline/compiler/rubric-resolution.d.ts +15 -0
- package/dist/pipeline/compiler/rubric-resolution.js +9 -1
- package/dist/pipeline/compute-attribution.d.ts +80 -0
- package/dist/pipeline/compute-attribution.js +196 -0
- package/dist/pipeline/failure-modes.d.ts +52 -17
- package/dist/pipeline/failure-modes.js +178 -117
- package/dist/pipeline/map-request-to-config.js +1 -0
- package/package.json +7 -5
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* per-entry-attribution-writer.ts — Zod schema for the per-judgment
|
|
3
|
+
* attribution artifact (ATTR-01) emitted by Phase 4's
|
|
4
|
+
* `ComputeAttributionStep` and read back by Phase 5's diagnosis runner.
|
|
5
|
+
*
|
|
6
|
+
* The schema asserts `satisfies z.ZodType<JudgmentAttribution>` against
|
|
7
|
+
* the canonical domain type in `packages/core/src/types/attribution.ts`
|
|
8
|
+
* (D0045 / W0187) — drift between schema and type is a build error.
|
|
9
|
+
*
|
|
10
|
+
* Phase 1 lands the SHAPE only — no compute, no file I/O. Phase 4 wires
|
|
11
|
+
* the writer; Phase 5 wires the reader. Both are checked with `satisfies` against this
|
|
12
|
+
* single source-of-truth schema.
|
|
13
|
+
*
|
|
14
|
+
* `hallucinationCheckedAgainst` is REQUIRED (Pitfall #11): consumers
|
|
15
|
+
* must be able to audit citation grounding without re-deriving the
|
|
16
|
+
* resolvable-set. The canonical task field is `contextDocs`; do NOT
|
|
17
|
+
* invent `expectedDocs` / `usedDocs` synonyms.
|
|
18
|
+
*
|
|
19
|
+
* @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
|
|
20
|
+
* @see docs/decisions/D0049-shared-confidence-contract.md
|
|
21
|
+
* @see docs/decisions/D0052-judgment-ref-granularity.md
|
|
22
|
+
* @see docs/design-docs/actionability-ladder/04-per-document-attribution-ensemble.md
|
|
23
|
+
*/
|
|
24
|
+
import { z } from "zod";
|
|
25
|
+
import { ConfidenceSchema } from "../../_vendor/ailf-core/schemas/index.js";
|
|
26
|
+
const DocAttributionSchema = z.object({
|
|
27
|
+
documentId: z.string().min(1),
|
|
28
|
+
slug: z.string().optional(),
|
|
29
|
+
score: z.number().min(0).max(1),
|
|
30
|
+
signals: z.object({
|
|
31
|
+
citation: z.number().min(0).max(1).optional(),
|
|
32
|
+
canonical: z.number().min(0).max(1).optional(),
|
|
33
|
+
retrieved: z.number().min(0).max(1).optional(),
|
|
34
|
+
}),
|
|
35
|
+
confidence: ConfidenceSchema,
|
|
36
|
+
});
|
|
37
|
+
/**
|
|
38
|
+
* Canonical schema for {@link JudgmentAttribution}. Persisted at
|
|
39
|
+
* `runs/{runId}/attribution/{entryKey}.json` (Phase 4) and parsed by
|
|
40
|
+
* the diagnosis runner on read (Phase 5).
|
|
41
|
+
*/
|
|
42
|
+
export const JudgmentAttributionSchema = z.object({
|
|
43
|
+
judgmentRef: z.string().min(1),
|
|
44
|
+
taskId: z.string().min(1),
|
|
45
|
+
modelId: z.string().min(1),
|
|
46
|
+
dimension: z.string().min(1),
|
|
47
|
+
attributions: z.array(DocAttributionSchema),
|
|
48
|
+
hallucinationCheckedAgainst: z.array(z.string()),
|
|
49
|
+
});
|
|
@@ -125,6 +125,7 @@ function mapEvalConfigToResolvedConfig(config, rootDir) {
|
|
|
125
125
|
noCache: config.noCache ?? false,
|
|
126
126
|
noRemoteCache: config.noRemoteCache ?? false,
|
|
127
127
|
graderReplications: config.execution?.graderReplications,
|
|
128
|
+
borderlineReplications: config.execution?.borderlineReplications,
|
|
128
129
|
graderContext: config.grader?.context,
|
|
129
130
|
urls: config.urls,
|
|
130
131
|
headers: config.agentic?.headers,
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* grader-outputs adapter barrel — named re-exports only (W0124 / D0045).
|
|
3
|
+
*
|
|
4
|
+
* The grader-output schema lives here so it enters the D0045
|
|
5
|
+
* `pnpm check-trust-boundary-satisfies` SCAN_ROOTS gate.
|
|
6
|
+
*/
|
|
7
|
+
export { GraderJudgmentSchema, graderJudgmentsVersion, } from "./promptfoo-grader-output.js";
|
|
8
|
+
export type { GraderJudgment } from "../../_vendor/ailf-core/index.d.ts";
|
|
9
|
+
export { LegacyGraderJudgmentSchema } from "./legacy/index.js";
|
|
10
|
+
export type { LegacyGraderJudgment } from "../../_vendor/ailf-core/index.d.ts";
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* grader-outputs adapter barrel — named re-exports only (W0124 / D0045).
|
|
3
|
+
*
|
|
4
|
+
* The grader-output schema lives here so it enters the D0045
|
|
5
|
+
* `pnpm check-trust-boundary-satisfies` SCAN_ROOTS gate.
|
|
6
|
+
*/
|
|
7
|
+
export { GraderJudgmentSchema, graderJudgmentsVersion, } from "./promptfoo-grader-output.js";
|
|
8
|
+
export { LegacyGraderJudgmentSchema } from "./legacy/index.js";
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* legacy grader-outputs adapter sub-barrel — named re-exports only
|
|
3
|
+
* (W0124 / D0045).
|
|
4
|
+
*
|
|
5
|
+
* Read-only schema for the Phase 1 free-prose grader-output shape,
|
|
6
|
+
* invoked only by historical-report rendering paths through Phase 7
|
|
7
|
+
* (GRAD-06 cutover). The schema lives here so it enters the D0045
|
|
8
|
+
* `pnpm check-trust-boundary-satisfies` SCAN_ROOTS gate.
|
|
9
|
+
*/
|
|
10
|
+
export { LegacyGraderJudgmentSchema } from "./promptfoo-grader-output-legacy.js";
|
|
11
|
+
export type { LegacyGraderJudgment } from "../../../_vendor/ailf-core/index.d.ts";
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* legacy grader-outputs adapter sub-barrel — named re-exports only
|
|
3
|
+
* (W0124 / D0045).
|
|
4
|
+
*
|
|
5
|
+
* Read-only schema for the Phase 1 free-prose grader-output shape,
|
|
6
|
+
* invoked only by historical-report rendering paths through Phase 7
|
|
7
|
+
* (GRAD-06 cutover). The schema lives here so it enters the D0045
|
|
8
|
+
* `pnpm check-trust-boundary-satisfies` SCAN_ROOTS gate.
|
|
9
|
+
*/
|
|
10
|
+
export { LegacyGraderJudgmentSchema } from "./promptfoo-grader-output-legacy.js";
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* promptfoo-grader-output-legacy.ts — Zod schema for the Phase 1
|
|
3
|
+
* free-prose grader-output shape, used by historical-report rendering
|
|
4
|
+
* paths.
|
|
5
|
+
*
|
|
6
|
+
* READ-ONLY: invoked only by historical-report rendering paths through
|
|
7
|
+
* Phase 7 (GRAD-06 cutover). Reports are immutable events — once a
|
|
8
|
+
* Report is written to Content Lake, the structured grader-judgment
|
|
9
|
+
* shape it captures cannot be back-filled. The legacy schema exists so
|
|
10
|
+
* pre-Phase-3 reports continue to deserialize cleanly.
|
|
11
|
+
*
|
|
12
|
+
* Live grader output that fails the strict {@link GraderJudgmentSchema}
|
|
13
|
+
* parse must NOT fall back to this schema. Drop to
|
|
14
|
+
* `failureMode: "unclassified"` instead. Strict and legacy schemas are
|
|
15
|
+
* deliberate siblings, not a legacy/canonical pair to consolidate.
|
|
16
|
+
*
|
|
17
|
+
* The schema asserts `satisfies z.ZodType<LegacyGraderJudgment>` against
|
|
18
|
+
* the canonical domain type in
|
|
19
|
+
* `packages/core/src/types/legacy-grader-judgment.ts` (D0045 / W0187) —
|
|
20
|
+
* drift between schema and type is a build error. The domain type is
|
|
21
|
+
* authored independently in `@sanity/ailf-core`; this file authors ONLY
|
|
22
|
+
* the schema and never derives the domain type from the schema itself
|
|
23
|
+
* (no schema-derived self-reference allowed by D0045).
|
|
24
|
+
*
|
|
25
|
+
* @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
|
|
26
|
+
* @see ../promptfoo-grader-output.ts — the strict (live-path) sibling
|
|
27
|
+
* @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
|
|
28
|
+
* §"Backwards compatibility"
|
|
29
|
+
*/
|
|
30
|
+
import { z } from "zod";
|
|
31
|
+
/**
|
|
32
|
+
* Canonical schema for {@link LegacyGraderJudgment}. Mirrors the Phase 1
|
|
33
|
+
* superset core (`taskId`, `modelId`, `dimension`, `reason`, `score`,
|
|
34
|
+
* optional `outputFailure`). NO GRAD-02 additive fields — those are by
|
|
35
|
+
* construction absent on pre-Phase-3 output.
|
|
36
|
+
*
|
|
37
|
+
* Intentionally NOT `.strict()` — pre-Phase-3 reports may carry stray
|
|
38
|
+
* keys; the legacy parser tolerates them so historical-report rendering
|
|
39
|
+
* keeps working through the GRAD-06 cutover.
|
|
40
|
+
*/
|
|
41
|
+
export declare const LegacyGraderJudgmentSchema: z.ZodObject<{
|
|
42
|
+
taskId: z.ZodString;
|
|
43
|
+
modelId: z.ZodString;
|
|
44
|
+
dimension: z.ZodString;
|
|
45
|
+
reason: z.ZodString;
|
|
46
|
+
score: z.ZodNumber;
|
|
47
|
+
outputFailure: z.ZodOptional<z.ZodBoolean>;
|
|
48
|
+
}, z.core.$strip>;
|
|
49
|
+
export type { LegacyGraderJudgment } from "../../../_vendor/ailf-core/index.d.ts";
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* promptfoo-grader-output-legacy.ts — Zod schema for the Phase 1
|
|
3
|
+
* free-prose grader-output shape, used by historical-report rendering
|
|
4
|
+
* paths.
|
|
5
|
+
*
|
|
6
|
+
* READ-ONLY: invoked only by historical-report rendering paths through
|
|
7
|
+
* Phase 7 (GRAD-06 cutover). Reports are immutable events — once a
|
|
8
|
+
* Report is written to Content Lake, the structured grader-judgment
|
|
9
|
+
* shape it captures cannot be back-filled. The legacy schema exists so
|
|
10
|
+
* pre-Phase-3 reports continue to deserialize cleanly.
|
|
11
|
+
*
|
|
12
|
+
* Live grader output that fails the strict {@link GraderJudgmentSchema}
|
|
13
|
+
* parse must NOT fall back to this schema. Drop to
|
|
14
|
+
* `failureMode: "unclassified"` instead. Strict and legacy schemas are
|
|
15
|
+
* deliberate siblings, not a legacy/canonical pair to consolidate.
|
|
16
|
+
*
|
|
17
|
+
* The schema asserts `satisfies z.ZodType<LegacyGraderJudgment>` against
|
|
18
|
+
* the canonical domain type in
|
|
19
|
+
* `packages/core/src/types/legacy-grader-judgment.ts` (D0045 / W0187) —
|
|
20
|
+
* drift between schema and type is a build error. The domain type is
|
|
21
|
+
* authored independently in `@sanity/ailf-core`; this file authors ONLY
|
|
22
|
+
* the schema and never derives the domain type from the schema itself
|
|
23
|
+
* (no schema-derived self-reference allowed by D0045).
|
|
24
|
+
*
|
|
25
|
+
* @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
|
|
26
|
+
* @see ../promptfoo-grader-output.ts — the strict (live-path) sibling
|
|
27
|
+
* @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
|
|
28
|
+
* §"Backwards compatibility"
|
|
29
|
+
*/
|
|
30
|
+
import { z } from "zod";
|
|
31
|
+
/**
|
|
32
|
+
* Canonical schema for {@link LegacyGraderJudgment}. Mirrors the Phase 1
|
|
33
|
+
* superset core (`taskId`, `modelId`, `dimension`, `reason`, `score`,
|
|
34
|
+
* optional `outputFailure`). NO GRAD-02 additive fields — those are by
|
|
35
|
+
* construction absent on pre-Phase-3 output.
|
|
36
|
+
*
|
|
37
|
+
* Intentionally NOT `.strict()` — pre-Phase-3 reports may carry stray
|
|
38
|
+
* keys; the legacy parser tolerates them so historical-report rendering
|
|
39
|
+
* keeps working through the GRAD-06 cutover.
|
|
40
|
+
*/
|
|
41
|
+
export const LegacyGraderJudgmentSchema = z.object({
|
|
42
|
+
taskId: z.string().min(1),
|
|
43
|
+
modelId: z.string().min(1),
|
|
44
|
+
dimension: z.string().min(1),
|
|
45
|
+
reason: z.string(),
|
|
46
|
+
score: z.number(),
|
|
47
|
+
outputFailure: z.boolean().optional(),
|
|
48
|
+
});
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* promptfoo-grader-output.ts — Zod schema for the structured grader output
|
|
3
|
+
* (GRAD-02) emitted by the promptfoo grader process and consumed by the
|
|
4
|
+
* eval pipeline.
|
|
5
|
+
*
|
|
6
|
+
* The schema asserts `satisfies z.ZodType<GraderJudgment>` against the
|
|
7
|
+
* canonical domain type in `packages/core/src/types/grader-judgment.ts`
|
|
8
|
+
* (D0045 / W0187) — drift between schema and type is a build error.
|
|
9
|
+
* The domain type was authored independently in Plan 01-01; this file
|
|
10
|
+
* authors ONLY the schema and never derives the domain type from the
|
|
11
|
+
* schema itself (no schema-derived self-reference allowed by D0045).
|
|
12
|
+
*
|
|
13
|
+
* `graderJudgmentsVersion` is co-located with the schema (VER-01 D-02 —
|
|
14
|
+
* source-of-truth file owns its version constant). Bumped by hand when
|
|
15
|
+
* the grader rubric, prompt template, or judgment shape changes.
|
|
16
|
+
*
|
|
17
|
+
* Phase 3 replaces the inline `JSON.parse` at
|
|
18
|
+
* `pipeline/calculate-scores.ts:380-392` (Pitfall #4) so all grader
|
|
19
|
+
* output flows through this schema.
|
|
20
|
+
*
|
|
21
|
+
* @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
|
|
22
|
+
* @see docs/decisions/D0049-shared-confidence-contract.md
|
|
23
|
+
* @see docs/decisions/D0052-judgment-ref-granularity.md
|
|
24
|
+
* @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
|
|
25
|
+
*/
|
|
26
|
+
import { z } from "zod";
|
|
27
|
+
/**
|
|
28
|
+
* VER-01 D-02 — co-located version constant. Bumped by hand when the
|
|
29
|
+
* grader rubric, prompt template, or judgment shape changes in a way
|
|
30
|
+
* that should invalidate cached Diagnoses.
|
|
31
|
+
*
|
|
32
|
+
* Phase 3 GRAD-05 bumped this from `"0.1.0"` to `"1.0.0"` (semver
|
|
33
|
+
* major) — the additive GRAD-02 surface is now required + the schema
|
|
34
|
+
* is `.strict()`. AILF has no external installed base; the legacy
|
|
35
|
+
* parser at `./legacy/promptfoo-grader-output-legacy.ts` is the named
|
|
36
|
+
* consumer for already-stored historical reports.
|
|
37
|
+
*/
|
|
38
|
+
export declare const graderJudgmentsVersion = "1.0.0";
|
|
39
|
+
/**
|
|
40
|
+
* Canonical schema for {@link GraderJudgment}. Required fields mirror
|
|
41
|
+
* the existing pipeline core (Doc 03 §"existing, unchanged"):
|
|
42
|
+
* `taskId`, `modelId`, `dimension`, `reason`, `score`. Phase 3 GRAD-05
|
|
43
|
+
* has tightened the additive surface to required and added `.strict()`
|
|
44
|
+
* — the schema rejects unknown fields (defense-in-depth against future
|
|
45
|
+
* prompt-injection attempts that try to smuggle keys through the
|
|
46
|
+
* grader emission).
|
|
47
|
+
*
|
|
48
|
+
* Branded `JudgmentId` is represented at runtime by a non-empty string;
|
|
49
|
+
* the schema routes the brand through `brandedString<"JudgmentId">()`
|
|
50
|
+
* — the project's single audited cast site for branded-string
|
|
51
|
+
* schemas (project typescript rule: no `as` on `unknown`).
|
|
52
|
+
*/
|
|
53
|
+
export declare const GraderJudgmentSchema: z.ZodObject<{
|
|
54
|
+
taskId: z.ZodString;
|
|
55
|
+
modelId: z.ZodString;
|
|
56
|
+
dimension: z.ZodString;
|
|
57
|
+
reason: z.ZodString;
|
|
58
|
+
score: z.ZodNumber;
|
|
59
|
+
outputFailure: z.ZodOptional<z.ZodBoolean>;
|
|
60
|
+
judgmentId: z.ZodType<import("@sanity/ailf-core").Brand<string, "JudgmentId">, unknown, z.core.$ZodTypeInternals<import("@sanity/ailf-core").Brand<string, "JudgmentId">, unknown>>;
|
|
61
|
+
subJudgments: z.ZodArray<z.ZodObject<{
|
|
62
|
+
criterionId: z.ZodString;
|
|
63
|
+
met: z.ZodBoolean;
|
|
64
|
+
evidence: z.ZodString;
|
|
65
|
+
confidence: z.ZodObject<{
|
|
66
|
+
level: z.ZodEnum<{
|
|
67
|
+
low: "low";
|
|
68
|
+
medium: "medium";
|
|
69
|
+
high: "high";
|
|
70
|
+
}>;
|
|
71
|
+
signalsPresent: z.ZodNumber;
|
|
72
|
+
derivation: z.ZodString;
|
|
73
|
+
}, z.core.$strip>;
|
|
74
|
+
}, z.core.$strip>>;
|
|
75
|
+
docCitations: z.ZodArray<z.ZodObject<{
|
|
76
|
+
documentId: z.ZodString;
|
|
77
|
+
slug: z.ZodOptional<z.ZodString>;
|
|
78
|
+
role: z.ZodEnum<{
|
|
79
|
+
supports: "supports";
|
|
80
|
+
contradicts: "contradicts";
|
|
81
|
+
missing: "missing";
|
|
82
|
+
irrelevant: "irrelevant";
|
|
83
|
+
}>;
|
|
84
|
+
hallucinated: z.ZodOptional<z.ZodBoolean>;
|
|
85
|
+
}, z.core.$strip>>;
|
|
86
|
+
failureMode: z.ZodString;
|
|
87
|
+
confidence: z.ZodObject<{
|
|
88
|
+
level: z.ZodEnum<{
|
|
89
|
+
low: "low";
|
|
90
|
+
medium: "medium";
|
|
91
|
+
high: "high";
|
|
92
|
+
}>;
|
|
93
|
+
signalsPresent: z.ZodNumber;
|
|
94
|
+
derivation: z.ZodString;
|
|
95
|
+
}, z.core.$strip>;
|
|
96
|
+
hallucinationCheckedAgainst: z.ZodArray<z.ZodString>;
|
|
97
|
+
metadata: z.ZodObject<{
|
|
98
|
+
graderModel: z.ZodString;
|
|
99
|
+
graderJudgmentsVersion: z.ZodString;
|
|
100
|
+
}, z.core.$strip>;
|
|
101
|
+
}, z.core.$strict>;
|
|
102
|
+
export type { GraderJudgment } from "../../_vendor/ailf-core/index.d.ts";
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* promptfoo-grader-output.ts — Zod schema for the structured grader output
|
|
3
|
+
* (GRAD-02) emitted by the promptfoo grader process and consumed by the
|
|
4
|
+
* eval pipeline.
|
|
5
|
+
*
|
|
6
|
+
* The schema asserts `satisfies z.ZodType<GraderJudgment>` against the
|
|
7
|
+
* canonical domain type in `packages/core/src/types/grader-judgment.ts`
|
|
8
|
+
* (D0045 / W0187) — drift between schema and type is a build error.
|
|
9
|
+
* The domain type was authored independently in Plan 01-01; this file
|
|
10
|
+
* authors ONLY the schema and never derives the domain type from the
|
|
11
|
+
* schema itself (no schema-derived self-reference allowed by D0045).
|
|
12
|
+
*
|
|
13
|
+
* `graderJudgmentsVersion` is co-located with the schema (VER-01 D-02 —
|
|
14
|
+
* source-of-truth file owns its version constant). Bumped by hand when
|
|
15
|
+
* the grader rubric, prompt template, or judgment shape changes.
|
|
16
|
+
*
|
|
17
|
+
* Phase 3 replaces the inline `JSON.parse` at
|
|
18
|
+
* `pipeline/calculate-scores.ts:380-392` (Pitfall #4) so all grader
|
|
19
|
+
* output flows through this schema.
|
|
20
|
+
*
|
|
21
|
+
* @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
|
|
22
|
+
* @see docs/decisions/D0049-shared-confidence-contract.md
|
|
23
|
+
* @see docs/decisions/D0052-judgment-ref-granularity.md
|
|
24
|
+
* @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
|
|
25
|
+
*/
|
|
26
|
+
import { z } from "zod";
|
|
27
|
+
import { brandedString, ConfidenceSchema } from "../../_vendor/ailf-core/schemas/index.js";
|
|
28
|
+
/**
|
|
29
|
+
* VER-01 D-02 — co-located version constant. Bumped by hand when the
|
|
30
|
+
* grader rubric, prompt template, or judgment shape changes in a way
|
|
31
|
+
* that should invalidate cached Diagnoses.
|
|
32
|
+
*
|
|
33
|
+
* Phase 3 GRAD-05 bumped this from `"0.1.0"` to `"1.0.0"` (semver
|
|
34
|
+
* major) — the additive GRAD-02 surface is now required + the schema
|
|
35
|
+
* is `.strict()`. AILF has no external installed base; the legacy
|
|
36
|
+
* parser at `./legacy/promptfoo-grader-output-legacy.ts` is the named
|
|
37
|
+
* consumer for already-stored historical reports.
|
|
38
|
+
*/
|
|
39
|
+
export const graderJudgmentsVersion = "1.0.0";
|
|
40
|
+
const DocCitationRoleSchema = z.enum([
|
|
41
|
+
"supports",
|
|
42
|
+
"contradicts",
|
|
43
|
+
"missing",
|
|
44
|
+
"irrelevant",
|
|
45
|
+
]);
|
|
46
|
+
const DocCitationSchema = z.object({
|
|
47
|
+
documentId: z.string().min(1),
|
|
48
|
+
slug: z.string().optional(),
|
|
49
|
+
role: DocCitationRoleSchema,
|
|
50
|
+
hallucinated: z.boolean().optional(),
|
|
51
|
+
});
|
|
52
|
+
const CriterionSubJudgmentSchema = z.object({
|
|
53
|
+
criterionId: z.string().min(1),
|
|
54
|
+
met: z.boolean(),
|
|
55
|
+
evidence: z.string().max(280),
|
|
56
|
+
confidence: ConfidenceSchema,
|
|
57
|
+
});
|
|
58
|
+
/**
 * Canonical Zod schema for {@link GraderJudgment}.
 *
 * The schema is assembled from two field groups, in this order:
 *
 * 1. The existing pipeline core (Doc 03 §"existing, unchanged"):
 *    `taskId`, `modelId`, `dimension`, `reason`, `score`, plus the optional
 *    `outputFailure` flag.
 * 2. The GRAD-02 additive surface, which Phase 3 GRAD-05 tightened from
 *    optional to required.
 *
 * The combined object is `.strict()`, so unknown top-level keys are rejected
 * rather than ignored (defense-in-depth against future prompt-injection
 * attempts that try to smuggle keys through the grader emission).
 *
 * Branded `JudgmentId` is represented at runtime by a non-empty string; the
 * brand is routed through `brandedString<"JudgmentId">()` — the project's
 * single audited cast site for branded-string schemas (project typescript
 * rule: no `as` on `unknown`).
 */
// ── Existing pipeline core (required — Doc 03 §"existing, unchanged") ─
const graderJudgmentCoreShape = {
    taskId: z.string().min(1),
    modelId: z.string().min(1),
    dimension: z.string().min(1),
    reason: z.string(),
    score: z.number(),
    outputFailure: z.boolean().optional(),
};
// ── GRAD-02 additive — required from Phase 3 GRAD-05 ───────────────
const graderJudgmentAdditiveShape = {
    judgmentId: brandedString(),
    subJudgments: z.array(CriterionSubJudgmentSchema),
    docCitations: z.array(DocCitationSchema),
    failureMode: z.string(),
    confidence: ConfidenceSchema,
    hallucinationCheckedAgainst: z.array(z.string()),
    // Provenance of the judgment: which grader model produced it and under
    // which judgments-schema version.
    metadata: z.object({
        graderModel: z.string().min(1),
        graderJudgmentsVersion: z.string().min(1),
    }),
};
export const GraderJudgmentSchema = z
    .object({ ...graderJudgmentCoreShape, ...graderJudgmentAdditiveShape })
    .strict();
|
package/dist/adapters/index.d.ts
CHANGED
|
@@ -10,3 +10,6 @@ export { PromptfooEvalAdapter } from "./eval-runners/index.js";
|
|
|
10
10
|
export { ConsoleLogger, type ConsoleLoggerOptions, JsonLogger, QuietLogger, } from "./loggers/index.js";
|
|
11
11
|
export { CliConfigAdapter, FileConfigAdapter } from "./config-sources/index.js";
|
|
12
12
|
export { DtsPackageSurface, InMemoryPackageSurface, type DtsPackageSurfaceOptions, type PackageRootResolver, parseDtsExports, type ParsedDtsExports, } from "./package-surface/index.js";
|
|
13
|
+
export { GraderJudgmentSchema, graderJudgmentsVersion, } from "./grader-outputs/index.js";
|
|
14
|
+
export { AttributionMetaSchema, JudgmentAttributionSchema, } from "./attribution/index.js";
|
|
15
|
+
export type { AttributionMeta, DocAttribution, GraderJudgment, JudgmentAttribution, } from "../_vendor/ailf-core/index.d.ts";
|
package/dist/adapters/index.js
CHANGED
|
@@ -10,3 +10,7 @@ export { PromptfooEvalAdapter } from "./eval-runners/index.js";
|
|
|
10
10
|
export { ConsoleLogger, JsonLogger, QuietLogger, } from "./loggers/index.js";
|
|
11
11
|
export { CliConfigAdapter, FileConfigAdapter } from "./config-sources/index.js";
|
|
12
12
|
export { DtsPackageSurface, InMemoryPackageSurface, parseDtsExports, } from "./package-surface/index.js";
|
|
13
|
+
// Phase 1 Plan 02 — actionability-ladder adapter schemas (GRAD-02, ATTR-01).
|
|
14
|
+
// Named re-exports only (W0124 / D0045).
|
|
15
|
+
export { GraderJudgmentSchema, graderJudgmentsVersion, } from "./grader-outputs/index.js";
|
|
16
|
+
export { AttributionMetaSchema, JudgmentAttributionSchema, } from "./attribution/index.js";
|
|
@@ -40,9 +40,29 @@ export declare class FakeLLMClient implements LLMClient {
|
|
|
40
40
|
readonly calls: FakeCallRecord[];
|
|
41
41
|
private readonly completeQueue;
|
|
42
42
|
private readonly structuredQueue;
|
|
43
|
+
/**
|
|
44
|
+
* Per-cardId keyed responses. A single-value entry is returned on every
|
|
45
|
+
* call for that cardId (repeated calls always get the same response). An
|
|
46
|
+
* array-value entry is consumed in order; once exhausted, calls for that
|
|
47
|
+
* cardId fall back to the FIFO structuredQueue.
|
|
48
|
+
*
|
|
49
|
+
* This is the substrate Plan 07's 17-fixture eval matrix uses to wire
|
|
50
|
+
* deterministic responses to specific LLM cards.
|
|
51
|
+
*/
|
|
52
|
+
private readonly keyedResponses;
|
|
43
53
|
constructor(args?: {
|
|
44
54
|
completeResponses?: FakeCompletionResponse[];
|
|
45
55
|
structuredResponses?: FakeStructuredResponse[];
|
|
56
|
+
/**
|
|
57
|
+
* Optional keyed-response map. Keys are `cardId` values from
|
|
58
|
+
* `args.context.cardId`. When a call matches a key the keyed entry is
|
|
59
|
+
* used instead of the FIFO queue.
|
|
60
|
+
*
|
|
61
|
+
* - Single-value entry: same response on every call for this cardId.
|
|
62
|
+
* - Array-value entry: entries consumed in insertion order; falls back
|
|
63
|
+
* to FIFO (or throws) when the array is exhausted.
|
|
64
|
+
*/
|
|
65
|
+
keyedResponses?: Record<string, FakeStructuredResponse | FakeStructuredResponse[]>;
|
|
46
66
|
});
|
|
47
67
|
complete(args: LLMCompleteArgs): Promise<LLMCompletion>;
|
|
48
68
|
completeStructured<T>(args: LLMCompleteStructuredArgs<T>): Promise<LLMStructuredCompletion<T>>;
|
|
@@ -11,9 +11,25 @@ export class FakeLLMClient {
|
|
|
11
11
|
calls = [];
|
|
12
12
|
completeQueue;
|
|
13
13
|
structuredQueue;
|
|
14
|
+
/**
|
|
15
|
+
* Per-cardId keyed responses. A single-value entry is returned on every
|
|
16
|
+
* call for that cardId (repeated calls always get the same response). An
|
|
17
|
+
* array-value entry is consumed in order; once exhausted, calls for that
|
|
18
|
+
* cardId fall back to the FIFO structuredQueue.
|
|
19
|
+
*
|
|
20
|
+
* This is the substrate Plan 07's 17-fixture eval matrix uses to wire
|
|
21
|
+
* deterministic responses to specific LLM cards.
|
|
22
|
+
*/
|
|
23
|
+
keyedResponses;
|
|
14
24
|
constructor(args = {}) {
|
|
15
25
|
this.completeQueue = [...(args.completeResponses ?? [])];
|
|
16
26
|
this.structuredQueue = [...(args.structuredResponses ?? [])];
|
|
27
|
+
// Deep-copy arrays so the caller's fixture data is not mutated.
|
|
28
|
+
const keyed = {};
|
|
29
|
+
for (const [key, val] of Object.entries(args.keyedResponses ?? {})) {
|
|
30
|
+
keyed[key] = Array.isArray(val) ? [...val] : val;
|
|
31
|
+
}
|
|
32
|
+
this.keyedResponses = keyed;
|
|
17
33
|
}
|
|
18
34
|
async complete(args) {
|
|
19
35
|
this.calls.push({
|
|
@@ -37,13 +53,34 @@ export class FakeLLMClient {
|
|
|
37
53
|
};
|
|
38
54
|
}
|
|
39
55
|
async completeStructured(args) {
|
|
56
|
+
// Record every call first so test assertions on this.calls are never
|
|
57
|
+
// affected by which branch (keyed vs FIFO) handles the response.
|
|
40
58
|
this.calls.push({
|
|
41
59
|
kind: "completeStructured",
|
|
42
60
|
model: args.model,
|
|
43
61
|
prompt: args.prompt,
|
|
44
62
|
...(args.context ? { context: args.context } : {}),
|
|
45
63
|
});
|
|
46
|
-
|
|
64
|
+
let next;
|
|
65
|
+
const cardId = args.context?.cardId;
|
|
66
|
+
if (cardId !== undefined && cardId in this.keyedResponses) {
|
|
67
|
+
const entry = this.keyedResponses[cardId];
|
|
68
|
+
if (Array.isArray(entry)) {
|
|
69
|
+
// Array-value: consume one entry per call. When exhausted, fall
|
|
70
|
+
// through to the FIFO queue below.
|
|
71
|
+
if (entry.length > 0) {
|
|
72
|
+
next = entry.shift();
|
|
73
|
+
}
|
|
74
|
+
}
|
|
75
|
+
else {
|
|
76
|
+
// Single-value: return the same response on every call.
|
|
77
|
+
next = entry;
|
|
78
|
+
}
|
|
79
|
+
}
|
|
80
|
+
if (next === undefined) {
|
|
81
|
+
// FIFO fallback (existing behavior)
|
|
82
|
+
next = this.structuredQueue.shift();
|
|
83
|
+
}
|
|
47
84
|
if (!next) {
|
|
48
85
|
throw new Error("FakeLLMClient: no more queued structured responses (call exceeded queue)");
|
|
49
86
|
}
|
|
@@ -10,6 +10,7 @@
|
|
|
10
10
|
* the adapter never reads `process.env`. The composition root maps env vars
|
|
11
11
|
* to typed constructor args.
|
|
12
12
|
*/
|
|
13
|
+
import { z } from "zod";
|
|
13
14
|
import { OpenAIChatResponseSchema, splitModelId, } from "../../_vendor/ailf-core/index.js";
|
|
14
15
|
import { DEFAULT_RETRY_POLICY, parseRetryAfterSeconds, runWithRetry, } from "./retry.js";
|
|
15
16
|
const DEFAULT_BASE_URL = "https://api.openai.com/v1/chat/completions";
|
|
@@ -67,10 +68,25 @@ export class OpenAILLMClient {
|
|
|
67
68
|
}
|
|
68
69
|
async completeStructured(args) {
|
|
69
70
|
const { modelName } = splitModelId(args.model);
|
|
71
|
+
// Derive the JSON Schema from the caller's Zod schema. Zod v4 natively
|
|
72
|
+
// emits `additionalProperties: false` on every nested z.object node —
|
|
73
|
+
// this is required for OpenAI strict-mode.
|
|
74
|
+
const jsonSchema = z.toJSONSchema(args.schema, { target: "draft-2020-12" });
|
|
75
|
+
// OpenAI strict-mode requires the root to be a plain object schema (no
|
|
76
|
+
// anyOf/oneOf/allOf at the top level). Discriminated unions produce
|
|
77
|
+
// anyOf at the root — callers must wrap them in a discriminator object.
|
|
78
|
+
assertSchemaIsObjectRoot(jsonSchema, args.model);
|
|
70
79
|
const body = buildBody(modelName, args.prompt, {
|
|
71
|
-
temperature: args.temperature,
|
|
72
|
-
maxTokens: args.maxTokens,
|
|
73
|
-
responseFormat: {
|
|
80
|
+
temperature: args.temperature ?? 0.1,
|
|
81
|
+
maxTokens: args.maxTokens ?? 2000,
|
|
82
|
+
responseFormat: {
|
|
83
|
+
type: "json_schema",
|
|
84
|
+
json_schema: {
|
|
85
|
+
name: args.context?.cardId ?? "structured_output",
|
|
86
|
+
schema: jsonSchema,
|
|
87
|
+
strict: true,
|
|
88
|
+
},
|
|
89
|
+
},
|
|
74
90
|
});
|
|
75
91
|
const data = await this.callApi(body);
|
|
76
92
|
const raw = data.choices?.[0]?.message?.content;
|
|
@@ -84,6 +100,9 @@ export class OpenAILLMClient {
|
|
|
84
100
|
catch (err) {
|
|
85
101
|
throw new Error(`OpenAI structured completion returned invalid JSON for model ${args.model}: ${err instanceof Error ? err.message : String(err)}`, { cause: err });
|
|
86
102
|
}
|
|
103
|
+
// strict:true guarantees a valid-against-the-schema JSON document, but
|
|
104
|
+
// the Zod parse is still load-bearing — it brands the result as T and is
|
|
105
|
+
// the only contract the engine trusts (D0045 parse-don't-validate).
|
|
87
106
|
const value = args.schema.parse(parsed);
|
|
88
107
|
const usage = extractUsage(data.usage);
|
|
89
108
|
const cost = this.computeCost(modelName, usage);
|
|
@@ -145,6 +164,36 @@ export class OpenAILLMClient {
|
|
|
145
164
|
`cost_usd=${cost.toFixed(6)}`);
|
|
146
165
|
}
|
|
147
166
|
}
|
|
167
|
+
/**
 * Assert that the JSON Schema root is a plain object type.
 *
 * OpenAI strict-mode requires the root schema to be `{ type: "object" }`.
 * A union produces `{ anyOf: [...] }` (or `oneOf`) at the root — callers must
 * wrap the union in a discriminator object before passing it to
 * `completeStructured`.
 *
 * Per AI-SPEC §3 Pitfall 6 + T-05-03-01: caught at request-build time to
 * avoid wasting API budget on a guaranteed 400.
 *
 * @param {unknown} schema - JSON Schema derived from the caller's Zod schema.
 * @param {string} modelId - Model identifier, echoed in every error message.
 * @returns {void}
 * @throws {Error} When `schema` is not an object, or its `type` is not
 *   `"object"`.
 */
function assertSchemaIsObjectRoot(schema, modelId) {
    const prefix = `OpenAILLMClient: OpenAI strict-mode requires a single z.object at the ` +
        `schema root for model ${modelId}`;
    if (typeof schema !== "object" || schema === null) {
        throw new Error(`${prefix}; got non-object JSON Schema root.`);
    }
    if (schema.type === "object") {
        return;
    }
    // Identify the kind so the error message is actionable.
    const isUnion = "anyOf" in schema || "oneOf" in schema;
    const kind = "anyOf" in schema
        ? "z.union"
        : "oneOf" in schema
            ? "z.discriminatedUnion"
            : "allOf" in schema
                ? "z.intersection"
                : String(schema.type ?? "unknown");
    // Only suggest the discriminator wrapper when the root really is a union;
    // for scalars, arrays, or intersections that advice would be misleading.
    const hint = isUnion
        ? "Wrap the union in a discriminator object."
        : "Wrap the schema in a z.object.";
    throw new Error(`${prefix}; got ${kind}. ${hint}`);
}
|
|
148
197
|
function buildBody(modelName, prompt, opts) {
|
|
149
198
|
const body = {
|
|
150
199
|
model: modelName,
|
|
@@ -55,9 +55,13 @@ interface ContentLakeCanonicalDoc {
|
|
|
55
55
|
sectionSlug?: string;
|
|
56
56
|
slug?: string;
|
|
57
57
|
}
|
|
58
|
+
/**
 * One entry of an assertion's `criteria` array as returned by the Content
 * Lake. Both members are optional, so consumers must check them before use.
 */
interface ContentLakeCriterion {
    /** Criterion identifier, when the document provides one. */
    id?: string;
    /** Criterion text, when the document provides one. */
    text?: string;
}
|
|
58
62
|
/** Assertion shape from the Content Lake (mirrors the Studio schema). */
|
|
59
63
|
interface ContentLakeAssertion {
|
|
60
|
-
criteria?:
|
|
64
|
+
criteria?: ContentLakeCriterion[];
|
|
61
65
|
template?: string;
|
|
62
66
|
threshold?: number;
|
|
63
67
|
type?: string;
|
|
@@ -73,7 +73,13 @@ const TASKS_QUERY = /* groq */ `
|
|
|
73
73
|
perspective,
|
|
74
74
|
reason
|
|
75
75
|
},
|
|
76
|
-
"assertions": coalesce(assertions, assert)
|
|
76
|
+
"assertions": coalesce(assertions, assert)[] {
|
|
77
|
+
type, template, weight, value, threshold,
|
|
78
|
+
"criteria": criteria[] {
|
|
79
|
+
"id": coalesce(id.current, _key),
|
|
80
|
+
"text": coalesce(text, @)
|
|
81
|
+
}
|
|
82
|
+
},
|
|
77
83
|
rawAssert,
|
|
78
84
|
baseline,
|
|
79
85
|
tags,
|
|
@@ -256,8 +262,28 @@ function mapAssertions(raw) {
|
|
|
256
262
|
.filter((a) => !!a.type)
|
|
257
263
|
.map((a) => {
|
|
258
264
|
if (a.type === "llm-rubric" && a.template && a.criteria) {
|
|
265
|
+
// Tighten the runtime contract: the GROQ projection's
|
|
266
|
+
// `coalesce(text, @)` falls through to the entire criterion
|
|
267
|
+
// element when `text` is missing, so a partial legacy criterion
|
|
268
|
+
// like `{_key: "abc"}` arrives here as `{ id: "abc", text: {...} }`
|
|
269
|
+
// — `text` set to the whole `@` object. Explicit type checks
|
|
270
|
+
// drop those with a diagnostic, instead of letting the non-string
|
|
271
|
+
// `text` propagate until the outer ContentLakeAuthorableTaskSchema
|
|
272
|
+
// parse fails deep inside the assertions array (noisy diagnostic).
|
|
259
273
|
return {
|
|
260
|
-
criteria: a.criteria
|
|
274
|
+
criteria: a.criteria
|
|
275
|
+
.filter((c) => {
|
|
276
|
+
if (!c)
|
|
277
|
+
return false;
|
|
278
|
+
const idOk = typeof c.id === "string" && c.id.length > 0;
|
|
279
|
+
const textOk = typeof c.text === "string" && c.text.length > 0;
|
|
280
|
+
if (!idOk || !textOk) {
|
|
281
|
+
console.warn(`[ContentLakeTaskSource] dropping malformed criterion: ${JSON.stringify(c).slice(0, 100)}`);
|
|
282
|
+
return false;
|
|
283
|
+
}
|
|
284
|
+
return true;
|
|
285
|
+
})
|
|
286
|
+
.map((c) => ({ id: c.id, text: c.text })),
|
|
261
287
|
template: a.template,
|
|
262
288
|
type: "llm-rubric",
|
|
263
289
|
...(a.weight !== undefined ? { weight: a.weight } : {}),
|