@sanity/ailf 4.5.0 → 5.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. package/canonical/grader-references/agent-harness-tools.yaml +42 -0
  2. package/canonical/grader-references/knowledge-probe-recall.yaml +36 -0
  3. package/canonical/grader-references/mcp-server-spec.yaml +51 -0
  4. package/canonical/grader-references/portable-text.yaml +48 -0
  5. package/config/rubrics.ts +38 -2
  6. package/dist/_vendor/ailf-core/artifact-registry.d.ts +197 -2
  7. package/dist/_vendor/ailf-core/artifact-registry.js +419 -5
  8. package/dist/_vendor/ailf-core/examples/index.d.ts +125 -26
  9. package/dist/_vendor/ailf-core/examples/index.js +146 -47
  10. package/dist/_vendor/ailf-core/ports/context.d.ts +26 -0
  11. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
  12. package/dist/_vendor/ailf-core/ports/index.js +1 -0
  13. package/dist/_vendor/ailf-core/ports/llm-client.d.ts +112 -0
  14. package/dist/_vendor/ailf-core/ports/llm-client.js +68 -0
  15. package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +15 -0
  16. package/dist/_vendor/ailf-core/schemas/branded-string.d.ts +40 -0
  17. package/dist/_vendor/ailf-core/schemas/branded-string.js +45 -0
  18. package/dist/_vendor/ailf-core/schemas/confidence-schema.d.ts +36 -0
  19. package/dist/_vendor/ailf-core/schemas/confidence-schema.js +32 -0
  20. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
  21. package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -4
  22. package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
  23. package/dist/_vendor/ailf-core/schemas/index.js +9 -0
  24. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +1 -0
  25. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +1 -0
  26. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +34 -8
  27. package/dist/_vendor/ailf-core/schemas/pipeline.js +23 -1
  28. package/dist/_vendor/ailf-core/services/diagnosis/registry.d.ts +40 -0
  29. package/dist/_vendor/ailf-core/services/diagnosis/registry.js +25 -0
  30. package/dist/_vendor/ailf-core/services/diagnosis-runner.d.ts +19 -0
  31. package/dist/_vendor/ailf-core/services/diagnosis-runner.js +19 -0
  32. package/dist/_vendor/ailf-core/services/index.d.ts +2 -0
  33. package/dist/_vendor/ailf-core/services/index.js +5 -0
  34. package/dist/_vendor/ailf-core/services/report-to-markdown.js +3 -2
  35. package/dist/_vendor/ailf-core/types/attribution.d.ts +82 -0
  36. package/dist/_vendor/ailf-core/types/attribution.js +18 -0
  37. package/dist/_vendor/ailf-core/types/branded-ids.d.ts +26 -1
  38. package/dist/_vendor/ailf-core/types/branded-ids.js +80 -4
  39. package/dist/_vendor/ailf-core/types/confidence.d.ts +68 -0
  40. package/dist/_vendor/ailf-core/types/confidence.js +56 -0
  41. package/dist/_vendor/ailf-core/types/diagnosis.d.ts +169 -0
  42. package/dist/_vendor/ailf-core/types/diagnosis.js +17 -0
  43. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +16 -1
  44. package/dist/_vendor/ailf-core/types/grader-judgment.d.ts +125 -0
  45. package/dist/_vendor/ailf-core/types/grader-judgment.js +30 -0
  46. package/dist/_vendor/ailf-core/types/index.d.ts +82 -29
  47. package/dist/_vendor/ailf-core/types/index.js +16 -1
  48. package/dist/_vendor/ailf-core/types/legacy-grader-judgment.d.ts +55 -0
  49. package/dist/_vendor/ailf-core/types/legacy-grader-judgment.js +30 -0
  50. package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +1 -0
  51. package/dist/_vendor/ailf-core/types/repo-config.d.ts +8 -0
  52. package/dist/_vendor/ailf-shared/document-ref.d.ts +1 -1
  53. package/dist/adapters/api-client/build-request.d.ts +1 -0
  54. package/dist/adapters/api-client/build-request.js +3 -0
  55. package/dist/adapters/attribution/attribution-meta-writer.d.ts +35 -0
  56. package/dist/adapters/attribution/attribution-meta-writer.js +34 -0
  57. package/dist/adapters/attribution/index.d.ts +9 -0
  58. package/dist/adapters/attribution/index.js +8 -0
  59. package/dist/adapters/attribution/per-entry-attribution-writer.d.ts +56 -0
  60. package/dist/adapters/attribution/per-entry-attribution-writer.js +49 -0
  61. package/dist/adapters/config-sources/file-config-adapter.js +1 -0
  62. package/dist/adapters/grader-outputs/index.d.ts +10 -0
  63. package/dist/adapters/grader-outputs/index.js +8 -0
  64. package/dist/adapters/grader-outputs/legacy/index.d.ts +11 -0
  65. package/dist/adapters/grader-outputs/legacy/index.js +10 -0
  66. package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.d.ts +49 -0
  67. package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.js +48 -0
  68. package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +102 -0
  69. package/dist/adapters/grader-outputs/promptfoo-grader-output.js +93 -0
  70. package/dist/adapters/index.d.ts +3 -0
  71. package/dist/adapters/index.js +4 -0
  72. package/dist/adapters/llm/anthropic-llm-client.d.ts +48 -0
  73. package/dist/adapters/llm/anthropic-llm-client.js +205 -0
  74. package/dist/adapters/llm/fake-llm-client.d.ts +49 -0
  75. package/dist/adapters/llm/fake-llm-client.js +63 -0
  76. package/dist/adapters/llm/index.d.ts +9 -0
  77. package/dist/adapters/llm/index.js +4 -0
  78. package/dist/adapters/llm/openai-llm-client.d.ts +44 -0
  79. package/dist/adapters/llm/openai-llm-client.js +168 -0
  80. package/dist/adapters/llm/pricing.d.ts +12 -0
  81. package/dist/adapters/llm/pricing.js +8 -0
  82. package/dist/adapters/llm/retry.d.ts +56 -0
  83. package/dist/adapters/llm/retry.js +66 -0
  84. package/dist/adapters/task-sources/content-lake-task-source.d.ts +5 -1
  85. package/dist/adapters/task-sources/content-lake-task-source.js +28 -2
  86. package/dist/adapters/task-sources/repo-schemas.d.ts +90 -22
  87. package/dist/adapters/task-sources/repo-schemas.js +19 -2
  88. package/dist/artifact-capture/api-gateway-artifact-writer.js +2 -1
  89. package/dist/artifact-capture/batching-api-gateway-artifact-writer.js +2 -1
  90. package/dist/artifact-capture/gcs-artifact-writer.js +3 -1
  91. package/dist/artifact-capture/local-fs-artifact-writer.js +3 -1
  92. package/dist/commands/calculate-scores.js +1 -1
  93. package/dist/commands/explain-handler.js +1 -1
  94. package/dist/commands/lookup-doc.d.ts +1 -1
  95. package/dist/commands/lookup-doc.js +3 -3
  96. package/dist/commands/pipeline-action.d.ts +6 -0
  97. package/dist/commands/pipeline-action.js +2 -0
  98. package/dist/commands/remote-pipeline.js +1 -0
  99. package/dist/composition-root.d.ts +59 -1
  100. package/dist/composition-root.js +95 -0
  101. package/dist/config/rubrics.ts +38 -2
  102. package/dist/grader/agent-harness.d.ts +14 -0
  103. package/dist/grader/agent-harness.js +17 -0
  104. package/dist/grader/common.d.ts +17 -0
  105. package/dist/grader/common.js +21 -0
  106. package/dist/grader/index.d.ts +38 -0
  107. package/dist/grader/index.js +75 -0
  108. package/dist/grader/knowledge-probe.d.ts +14 -0
  109. package/dist/grader/knowledge-probe.js +18 -0
  110. package/dist/grader/literacy.d.ts +13 -0
  111. package/dist/grader/literacy.js +17 -0
  112. package/dist/grader/mcp.d.ts +14 -0
  113. package/dist/grader/mcp.js +18 -0
  114. package/dist/orchestration/build-app-context.js +1 -0
  115. package/dist/orchestration/build-step-sequence.js +5 -0
  116. package/dist/orchestration/steps/calculate-scores-step.js +23 -1
  117. package/dist/orchestration/steps/compute-attribution-step.d.ts +44 -0
  118. package/dist/orchestration/steps/compute-attribution-step.js +279 -0
  119. package/dist/orchestration/steps/gap-analysis-step.js +35 -7
  120. package/dist/orchestration/steps/index.d.ts +1 -0
  121. package/dist/orchestration/steps/index.js +1 -0
  122. package/dist/pipeline/attribution.d.ts +15 -0
  123. package/dist/pipeline/attribution.js +18 -9
  124. package/dist/pipeline/borderline-consensus-runner.d.ts +63 -0
  125. package/dist/pipeline/borderline-consensus-runner.js +124 -0
  126. package/dist/pipeline/borderline-detector.d.ts +24 -0
  127. package/dist/pipeline/borderline-detector.js +26 -0
  128. package/dist/pipeline/calculate-scores.d.ts +114 -3
  129. package/dist/pipeline/calculate-scores.js +426 -24
  130. package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
  131. package/dist/pipeline/compiler/literacy-bridge.js +35 -17
  132. package/dist/pipeline/compiler/rubric-resolution.d.ts +15 -0
  133. package/dist/pipeline/compiler/rubric-resolution.js +9 -1
  134. package/dist/pipeline/compute-attribution.d.ts +80 -0
  135. package/dist/pipeline/compute-attribution.js +196 -0
  136. package/dist/pipeline/failure-modes.d.ts +52 -17
  137. package/dist/pipeline/failure-modes.js +178 -117
  138. package/dist/pipeline/map-request-to-config.js +1 -0
  139. package/package.json +6 -4
@@ -0,0 +1,49 @@
1
+ /**
2
+ * per-entry-attribution-writer.ts — Zod schema for the per-judgment
3
+ * attribution artifact (ATTR-01) emitted by Phase 4's
4
+ * `ComputeAttributionStep` and read back by Phase 5's diagnosis runner.
5
+ *
6
+ * The schema asserts `satisfies z.ZodType<JudgmentAttribution>` against
7
+ * the canonical domain type in `packages/core/src/types/attribution.ts`
8
+ * (D0045 / W0187) — drift between schema and type is a build error.
9
+ *
10
+ * Phase 1 lands the SHAPE only — no compute, no file I/O. Phase 4 wires
11
+ * the writer; Phase 5 wires the reader. Both `satisfies` against this
12
+ * single source-of-truth schema.
13
+ *
14
+ * `hallucinationCheckedAgainst` is REQUIRED (Pitfall #11): consumers
15
+ * must be able to audit citation grounding without re-deriving the
16
+ * resolvable-set. The canonical task field is `contextDocs`; do NOT
17
+ * invent `expectedDocs` / `usedDocs` synonyms.
18
+ *
19
+ * @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
20
+ * @see docs/decisions/D0049-shared-confidence-contract.md
21
+ * @see docs/decisions/D0052-judgment-ref-granularity.md
22
+ * @see docs/design-docs/actionability-ladder/04-per-document-attribution-ensemble.md
23
+ */
24
+ import { z } from "zod";
25
+ import { ConfidenceSchema } from "../../_vendor/ailf-core/schemas/index.js";
26
+ const DocAttributionSchema = z.object({
27
+ documentId: z.string().min(1),
28
+ slug: z.string().optional(),
29
+ score: z.number().min(0).max(1),
30
+ signals: z.object({
31
+ citation: z.number().min(0).max(1).optional(),
32
+ canonical: z.number().min(0).max(1).optional(),
33
+ retrieved: z.number().min(0).max(1).optional(),
34
+ }),
35
+ confidence: ConfidenceSchema,
36
+ });
37
+ /**
38
+ * Canonical schema for {@link JudgmentAttribution}. Persisted at
39
+ * `runs/{runId}/attribution/{entryKey}.json` (Phase 4) and parsed by
40
+ * the diagnosis runner on read (Phase 5).
41
+ */
42
+ export const JudgmentAttributionSchema = z.object({
43
+ judgmentRef: z.string().min(1),
44
+ taskId: z.string().min(1),
45
+ modelId: z.string().min(1),
46
+ dimension: z.string().min(1),
47
+ attributions: z.array(DocAttributionSchema),
48
+ hallucinationCheckedAgainst: z.array(z.string()),
49
+ });
@@ -125,6 +125,7 @@ function mapEvalConfigToResolvedConfig(config, rootDir) {
125
125
  noCache: config.noCache ?? false,
126
126
  noRemoteCache: config.noRemoteCache ?? false,
127
127
  graderReplications: config.execution?.graderReplications,
128
+ borderlineReplications: config.execution?.borderlineReplications,
128
129
  graderContext: config.grader?.context,
129
130
  urls: config.urls,
130
131
  headers: config.agentic?.headers,
@@ -0,0 +1,10 @@
1
+ /**
2
+ * grader-outputs adapter barrel — named re-exports only (W0124 / D0045).
3
+ *
4
+ * The grader-output schema lives here so it enters the D0045
5
+ * `pnpm check-trust-boundary-satisfies` SCAN_ROOTS gate.
6
+ */
7
+ export { GraderJudgmentSchema, graderJudgmentsVersion, } from "./promptfoo-grader-output.js";
8
+ export type { GraderJudgment } from "../../_vendor/ailf-core/index.d.ts";
9
+ export { LegacyGraderJudgmentSchema } from "./legacy/index.js";
10
+ export type { LegacyGraderJudgment } from "../../_vendor/ailf-core/index.d.ts";
@@ -0,0 +1,8 @@
1
+ /**
2
+ * grader-outputs adapter barrel — named re-exports only (W0124 / D0045).
3
+ *
4
+ * The grader-output schema lives here so it enters the D0045
5
+ * `pnpm check-trust-boundary-satisfies` SCAN_ROOTS gate.
6
+ */
7
+ export { GraderJudgmentSchema, graderJudgmentsVersion, } from "./promptfoo-grader-output.js";
8
+ export { LegacyGraderJudgmentSchema } from "./legacy/index.js";
@@ -0,0 +1,11 @@
1
+ /**
2
+ * legacy grader-outputs adapter sub-barrel — named re-exports only
3
+ * (W0124 / D0045).
4
+ *
5
+ * Read-only schema for the Phase 1 free-prose grader-output shape,
6
+ * invoked only by historical-report rendering paths through Phase 7
7
+ * (GRAD-06 cutover). The schema lives here so it enters the D0045
8
+ * `pnpm check-trust-boundary-satisfies` SCAN_ROOTS gate.
9
+ */
10
+ export { LegacyGraderJudgmentSchema } from "./promptfoo-grader-output-legacy.js";
11
+ export type { LegacyGraderJudgment } from "../../../_vendor/ailf-core/index.d.ts";
@@ -0,0 +1,10 @@
1
+ /**
2
+ * legacy grader-outputs adapter sub-barrel — named re-exports only
3
+ * (W0124 / D0045).
4
+ *
5
+ * Read-only schema for the Phase 1 free-prose grader-output shape,
6
+ * invoked only by historical-report rendering paths through Phase 7
7
+ * (GRAD-06 cutover). The schema lives here so it enters the D0045
8
+ * `pnpm check-trust-boundary-satisfies` SCAN_ROOTS gate.
9
+ */
10
+ export { LegacyGraderJudgmentSchema } from "./promptfoo-grader-output-legacy.js";
@@ -0,0 +1,49 @@
1
+ /**
2
+ * promptfoo-grader-output-legacy.ts — Zod schema for the Phase 1
3
+ * free-prose grader-output shape, used by historical-report rendering
4
+ * paths.
5
+ *
6
+ * READ-ONLY: invoked only by historical-report rendering paths through
7
+ * Phase 7 (GRAD-06 cutover). Reports are immutable events — once a
8
+ * Report is written to Content Lake, the structured grader-judgment
9
+ * shape it captures cannot be back-filled. The legacy schema exists so
10
+ * pre-Phase-3 reports continue to deserialize cleanly.
11
+ *
12
+ * Live grader output that fails the strict {@link GraderJudgmentSchema}
13
+ * parse must NOT fall back to this schema. Drop to
14
+ * `failureMode: "unclassified"` instead. Strict and legacy schemas are
15
+ * deliberate siblings, not a legacy/canonical pair to consolidate.
16
+ *
17
+ * The schema asserts `satisfies z.ZodType<LegacyGraderJudgment>` against
18
+ * the canonical domain type in
19
+ * `packages/core/src/types/legacy-grader-judgment.ts` (D0045 / W0187) —
20
+ * drift between schema and type is a build error. The domain type is
21
+ * authored independently in `@sanity/ailf-core`; this file authors ONLY
22
+ * the schema and never derives the domain type from the schema itself
23
+ * (no schema-derived self-reference allowed by D0045).
24
+ *
25
+ * @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
26
+ * @see ../promptfoo-grader-output.ts — the strict (live-path) sibling
27
+ * @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
28
+ * §"Backwards compatibility"
29
+ */
30
+ import { z } from "zod";
31
+ /**
32
+ * Canonical schema for {@link LegacyGraderJudgment}. Mirrors the Phase 1
33
+ * superset core (`taskId`, `modelId`, `dimension`, `reason`, `score`,
34
+ * optional `outputFailure`). NO GRAD-02 additive fields — those are by
35
+ * construction absent on pre-Phase-3 output.
36
+ *
37
+ * Intentionally NOT `.strict()` — pre-Phase-3 reports may carry stray
38
+ * keys; the legacy parser tolerates them so historical-report rendering
39
+ * keeps working through the GRAD-06 cutover.
40
+ */
41
+ export declare const LegacyGraderJudgmentSchema: z.ZodObject<{
42
+ taskId: z.ZodString;
43
+ modelId: z.ZodString;
44
+ dimension: z.ZodString;
45
+ reason: z.ZodString;
46
+ score: z.ZodNumber;
47
+ outputFailure: z.ZodOptional<z.ZodBoolean>;
48
+ }, z.core.$strip>;
49
+ export type { LegacyGraderJudgment } from "../../../_vendor/ailf-core/index.d.ts";
@@ -0,0 +1,48 @@
1
+ /**
2
+ * promptfoo-grader-output-legacy.ts — Zod schema for the Phase 1
3
+ * free-prose grader-output shape, used by historical-report rendering
4
+ * paths.
5
+ *
6
+ * READ-ONLY: invoked only by historical-report rendering paths through
7
+ * Phase 7 (GRAD-06 cutover). Reports are immutable events — once a
8
+ * Report is written to Content Lake, the structured grader-judgment
9
+ * shape it captures cannot be back-filled. The legacy schema exists so
10
+ * pre-Phase-3 reports continue to deserialize cleanly.
11
+ *
12
+ * Live grader output that fails the strict {@link GraderJudgmentSchema}
13
+ * parse must NOT fall back to this schema. Drop to
14
+ * `failureMode: "unclassified"` instead. Strict and legacy schemas are
15
+ * deliberate siblings, not a legacy/canonical pair to consolidate.
16
+ *
17
+ * The schema asserts `satisfies z.ZodType<LegacyGraderJudgment>` against
18
+ * the canonical domain type in
19
+ * `packages/core/src/types/legacy-grader-judgment.ts` (D0045 / W0187) —
20
+ * drift between schema and type is a build error. The domain type is
21
+ * authored independently in `@sanity/ailf-core`; this file authors ONLY
22
+ * the schema and never derives the domain type from the schema itself
23
+ * (no schema-derived self-reference allowed by D0045).
24
+ *
25
+ * @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
26
+ * @see ../promptfoo-grader-output.ts — the strict (live-path) sibling
27
+ * @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
28
+ * §"Backwards compatibility"
29
+ */
30
+ import { z } from "zod";
31
+ /**
32
+ * Canonical schema for {@link LegacyGraderJudgment}. Mirrors the Phase 1
33
+ * superset core (`taskId`, `modelId`, `dimension`, `reason`, `score`,
34
+ * optional `outputFailure`). NO GRAD-02 additive fields — those are by
35
+ * construction absent on pre-Phase-3 output.
36
+ *
37
+ * Intentionally NOT `.strict()` — pre-Phase-3 reports may carry stray
38
+ * keys; the legacy parser tolerates them so historical-report rendering
39
+ * keeps working through the GRAD-06 cutover.
40
+ */
41
+ export const LegacyGraderJudgmentSchema = z.object({
42
+ taskId: z.string().min(1),
43
+ modelId: z.string().min(1),
44
+ dimension: z.string().min(1),
45
+ reason: z.string(),
46
+ score: z.number(),
47
+ outputFailure: z.boolean().optional(),
48
+ });
@@ -0,0 +1,102 @@
1
+ /**
2
+ * promptfoo-grader-output.ts — Zod schema for the structured grader output
3
+ * (GRAD-02) emitted by the promptfoo grader process and consumed by the
4
+ * eval pipeline.
5
+ *
6
+ * The schema asserts `satisfies z.ZodType<GraderJudgment>` against the
7
+ * canonical domain type in `packages/core/src/types/grader-judgment.ts`
8
+ * (D0045 / W0187) — drift between schema and type is a build error.
9
+ * The domain type was authored independently in Plan 01-01; this file
10
+ * authors ONLY the schema and never derives the domain type from the
11
+ * schema itself (no schema-derived self-reference allowed by D0045).
12
+ *
13
+ * `graderJudgmentsVersion` is co-located with the schema (VER-01 D-02 —
14
+ * source-of-truth file owns its version constant). Bumped by hand when
15
+ * the grader rubric, prompt template, or judgment shape changes.
16
+ *
17
+ * Phase 3 will replace the inline `JSON.parse` at
18
+ * `pipeline/calculate-scores.ts:380-392` (Pitfall #4) so all grader
19
+ * output flows through this schema.
20
+ *
21
+ * @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
22
+ * @see docs/decisions/D0049-shared-confidence-contract.md
23
+ * @see docs/decisions/D0052-judgment-ref-granularity.md
24
+ * @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
25
+ */
26
+ import { z } from "zod";
27
+ /**
28
+ * VER-01 D-02 — co-located version constant. Bumped by hand when the
29
+ * grader rubric, prompt template, or judgment shape changes in a way
30
+ * that should invalidate cached Diagnoses.
31
+ *
32
+ * Phase 3 GRAD-05 bumped this from `"0.1.0"` to `"1.0.0"` (semver
33
+ * major) — the additive GRAD-02 surface is now required + the schema
34
+ * is `.strict()`. AILF has no installed external base; the legacy
35
+ * parser at `./legacy/promptfoo-grader-output-legacy.ts` is the named
36
+ * consumer for already-stored historical reports.
37
+ */
38
+ export declare const graderJudgmentsVersion = "1.0.0";
39
+ /**
40
+ * Canonical schema for {@link GraderJudgment}. Required fields mirror
41
+ * the existing pipeline core (Doc 03 §"existing, unchanged"):
42
+ * `taskId`, `modelId`, `dimension`, `reason`, `score`. Phase 3 GRAD-05
43
+ * has tightened the additive surface to required and added `.strict()`
44
+ * — the schema rejects unknown fields (defense-in-depth against future
45
+ * prompt-injection attempts that try to smuggle keys through the
46
+ * grader emission).
47
+ *
48
+ * Branded `JudgmentId` is represented at runtime by a non-empty string;
49
+ * the schema routes the brand through `brandedString<"JudgmentId">()`
50
+ * — the project's single audited cast site for branded-string
51
+ * schemas (project typescript rule: no `as` on `unknown`).
52
+ */
53
+ export declare const GraderJudgmentSchema: z.ZodObject<{
54
+ taskId: z.ZodString;
55
+ modelId: z.ZodString;
56
+ dimension: z.ZodString;
57
+ reason: z.ZodString;
58
+ score: z.ZodNumber;
59
+ outputFailure: z.ZodOptional<z.ZodBoolean>;
60
+ judgmentId: z.ZodType<import("@sanity/ailf-core").Brand<string, "JudgmentId">, unknown, z.core.$ZodTypeInternals<import("@sanity/ailf-core").Brand<string, "JudgmentId">, unknown>>;
61
+ subJudgments: z.ZodArray<z.ZodObject<{
62
+ criterionId: z.ZodString;
63
+ met: z.ZodBoolean;
64
+ evidence: z.ZodString;
65
+ confidence: z.ZodObject<{
66
+ level: z.ZodEnum<{
67
+ low: "low";
68
+ medium: "medium";
69
+ high: "high";
70
+ }>;
71
+ signalsPresent: z.ZodNumber;
72
+ derivation: z.ZodString;
73
+ }, z.core.$strip>;
74
+ }, z.core.$strip>>;
75
+ docCitations: z.ZodArray<z.ZodObject<{
76
+ documentId: z.ZodString;
77
+ slug: z.ZodOptional<z.ZodString>;
78
+ role: z.ZodEnum<{
79
+ supports: "supports";
80
+ contradicts: "contradicts";
81
+ missing: "missing";
82
+ irrelevant: "irrelevant";
83
+ }>;
84
+ hallucinated: z.ZodOptional<z.ZodBoolean>;
85
+ }, z.core.$strip>>;
86
+ failureMode: z.ZodString;
87
+ confidence: z.ZodObject<{
88
+ level: z.ZodEnum<{
89
+ low: "low";
90
+ medium: "medium";
91
+ high: "high";
92
+ }>;
93
+ signalsPresent: z.ZodNumber;
94
+ derivation: z.ZodString;
95
+ }, z.core.$strip>;
96
+ hallucinationCheckedAgainst: z.ZodArray<z.ZodString>;
97
+ metadata: z.ZodObject<{
98
+ graderModel: z.ZodString;
99
+ graderJudgmentsVersion: z.ZodString;
100
+ }, z.core.$strip>;
101
+ }, z.core.$strict>;
102
+ export type { GraderJudgment } from "../../_vendor/ailf-core/index.d.ts";
@@ -0,0 +1,93 @@
1
+ /**
2
+ * promptfoo-grader-output.ts — Zod schema for the structured grader output
3
+ * (GRAD-02) emitted by the promptfoo grader process and consumed by the
4
+ * eval pipeline.
5
+ *
6
+ * The schema asserts `satisfies z.ZodType<GraderJudgment>` against the
7
+ * canonical domain type in `packages/core/src/types/grader-judgment.ts`
8
+ * (D0045 / W0187) — drift between schema and type is a build error.
9
+ * The domain type was authored independently in Plan 01-01; this file
10
+ * authors ONLY the schema and never derives the domain type from the
11
+ * schema itself (no schema-derived self-reference allowed by D0045).
12
+ *
13
+ * `graderJudgmentsVersion` is co-located with the schema (VER-01 D-02 —
14
+ * source-of-truth file owns its version constant). Bumped by hand when
15
+ * the grader rubric, prompt template, or judgment shape changes.
16
+ *
17
+ * Phase 3 will replace the inline `JSON.parse` at
18
+ * `pipeline/calculate-scores.ts:380-392` (Pitfall #4) so all grader
19
+ * output flows through this schema.
20
+ *
21
+ * @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
22
+ * @see docs/decisions/D0049-shared-confidence-contract.md
23
+ * @see docs/decisions/D0052-judgment-ref-granularity.md
24
+ * @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
25
+ */
26
+ import { z } from "zod";
27
+ import { brandedString, ConfidenceSchema } from "../../_vendor/ailf-core/schemas/index.js";
28
+ /**
29
+ * VER-01 D-02 — co-located version constant. Bumped by hand when the
30
+ * grader rubric, prompt template, or judgment shape changes in a way
31
+ * that should invalidate cached Diagnoses.
32
+ *
33
+ * Phase 3 GRAD-05 bumped this from `"0.1.0"` to `"1.0.0"` (semver
34
+ * major) — the additive GRAD-02 surface is now required + the schema
35
+ * is `.strict()`. AILF has no installed external base; the legacy
36
+ * parser at `./legacy/promptfoo-grader-output-legacy.ts` is the named
37
+ * consumer for already-stored historical reports.
38
+ */
39
+ export const graderJudgmentsVersion = "1.0.0";
40
+ const DocCitationRoleSchema = z.enum([
41
+ "supports",
42
+ "contradicts",
43
+ "missing",
44
+ "irrelevant",
45
+ ]);
46
+ const DocCitationSchema = z.object({
47
+ documentId: z.string().min(1),
48
+ slug: z.string().optional(),
49
+ role: DocCitationRoleSchema,
50
+ hallucinated: z.boolean().optional(),
51
+ });
52
+ const CriterionSubJudgmentSchema = z.object({
53
+ criterionId: z.string().min(1),
54
+ met: z.boolean(),
55
+ evidence: z.string().max(280),
56
+ confidence: ConfidenceSchema,
57
+ });
58
+ /**
59
+ * Canonical schema for {@link GraderJudgment}. Required fields mirror
60
+ * the existing pipeline core (Doc 03 §"existing, unchanged"):
61
+ * `taskId`, `modelId`, `dimension`, `reason`, `score`. Phase 3 GRAD-05
62
+ * has tightened the additive surface to required and added `.strict()`
63
+ * — the schema rejects unknown fields (defense-in-depth against future
64
+ * prompt-injection attempts that try to smuggle keys through the
65
+ * grader emission).
66
+ *
67
+ * Branded `JudgmentId` is represented at runtime by a non-empty string;
68
+ * the schema routes the brand through `brandedString<"JudgmentId">()`
69
+ * — the project's single audited cast site for branded-string
70
+ * schemas (project typescript rule: no `as` on `unknown`).
71
+ */
72
+ export const GraderJudgmentSchema = z
73
+ .object({
74
+ // ── Existing pipeline core (required — Doc 03 §"existing, unchanged") ─
75
+ taskId: z.string().min(1),
76
+ modelId: z.string().min(1),
77
+ dimension: z.string().min(1),
78
+ reason: z.string(),
79
+ score: z.number(),
80
+ outputFailure: z.boolean().optional(),
81
+ // ── GRAD-02 additive — required from Phase 3 GRAD-05 ───────────────
82
+ judgmentId: brandedString(),
83
+ subJudgments: z.array(CriterionSubJudgmentSchema),
84
+ docCitations: z.array(DocCitationSchema),
85
+ failureMode: z.string(),
86
+ confidence: ConfidenceSchema,
87
+ hallucinationCheckedAgainst: z.array(z.string()),
88
+ metadata: z.object({
89
+ graderModel: z.string().min(1),
90
+ graderJudgmentsVersion: z.string().min(1),
91
+ }),
92
+ })
93
+ .strict();
@@ -10,3 +10,6 @@ export { PromptfooEvalAdapter } from "./eval-runners/index.js";
10
10
  export { ConsoleLogger, type ConsoleLoggerOptions, JsonLogger, QuietLogger, } from "./loggers/index.js";
11
11
  export { CliConfigAdapter, FileConfigAdapter } from "./config-sources/index.js";
12
12
  export { DtsPackageSurface, InMemoryPackageSurface, type DtsPackageSurfaceOptions, type PackageRootResolver, parseDtsExports, type ParsedDtsExports, } from "./package-surface/index.js";
13
+ export { GraderJudgmentSchema, graderJudgmentsVersion, } from "./grader-outputs/index.js";
14
+ export { AttributionMetaSchema, JudgmentAttributionSchema, } from "./attribution/index.js";
15
+ export type { AttributionMeta, DocAttribution, GraderJudgment, JudgmentAttribution, } from "../_vendor/ailf-core/index.d.ts";
@@ -10,3 +10,7 @@ export { PromptfooEvalAdapter } from "./eval-runners/index.js";
10
10
  export { ConsoleLogger, JsonLogger, QuietLogger, } from "./loggers/index.js";
11
11
  export { CliConfigAdapter, FileConfigAdapter } from "./config-sources/index.js";
12
12
  export { DtsPackageSurface, InMemoryPackageSurface, parseDtsExports, } from "./package-surface/index.js";
13
+ // Phase 1 Plan 02 — actionability-ladder adapter schemas (GRAD-02, ATTR-01).
14
+ // Named re-exports only (W0124 / D0045).
15
+ export { GraderJudgmentSchema, graderJudgmentsVersion, } from "./grader-outputs/index.js";
16
+ export { AttributionMetaSchema, JudgmentAttributionSchema, } from "./attribution/index.js";
@@ -0,0 +1,48 @@
1
+ /**
2
+ * Anthropic adapter for the LLMClient port.
3
+ *
4
+ * Uses fetch() against the Messages API — same transport pattern as the
5
+ * existing grader. No SDK dependency. Centralizes retry, rate-limit handling,
6
+ * cost accounting, and per-call telemetry tagging via `context.feature`.
7
+ *
8
+ * Anthropic does not have a first-class JSON mode like OpenAI. For
9
+ * `completeStructured`, the adapter uses the API's top-level `system`
10
+ * field to instruct the model to return JSON only (top-level system is
11
+ * harder for user-controlled content in `prompt` to override than a
12
+ * user-turn prefix), then strips any surrounding ``` fences before
13
+ * parsing through the Zod schema (parse-don't-validate).
14
+ *
15
+ * Per `.claude/rules/eval-pipeline.md` and `.claude/rules/typescript.md`:
16
+ * the adapter never reads `process.env`. Typed constructor args only.
17
+ */
18
+ import { type LLMClient, type LLMCompleteArgs, type LLMCompleteStructuredArgs, type LLMCompletion, type LLMStructuredCompletion, type Logger } from "../../_vendor/ailf-core/index.d.ts";
19
+ import type { ModelPricing } from "./pricing.js";
20
+ import { type RetryPolicy } from "./retry.js";
21
+ export interface AnthropicLLMClientOptions {
22
+ apiKey: string;
23
+ baseUrl?: string;
24
+ /** Pricing keyed by canonical model id (without `anthropic:` prefix or `messages:` segment). */
25
+ pricing?: Record<string, ModelPricing>;
26
+ retryPolicy?: Partial<RetryPolicy>;
27
+ logger?: Logger;
28
+ sleep?: (ms: number) => Promise<void>;
29
+ rng?: () => number;
30
+ /** API version header. Default "2023-06-01" — matches the existing grader. */
31
+ apiVersion?: string;
32
+ }
33
+ export declare class AnthropicLLMClient implements LLMClient {
34
+ private readonly apiKey;
35
+ private readonly baseUrl;
36
+ private readonly apiVersion;
37
+ private readonly pricing;
38
+ private readonly retryPolicy;
39
+ private readonly logger?;
40
+ private readonly sleep?;
41
+ private readonly rng?;
42
+ constructor(options: AnthropicLLMClientOptions);
43
+ complete(args: LLMCompleteArgs): Promise<LLMCompletion>;
44
+ completeStructured<T>(args: LLMCompleteStructuredArgs<T>): Promise<LLMStructuredCompletion<T>>;
45
+ private callApi;
46
+ private computeCost;
47
+ private logTelemetry;
48
+ }
@@ -0,0 +1,205 @@
1
+ /**
2
+ * Anthropic adapter for the LLMClient port.
3
+ *
4
+ * Uses fetch() against the Messages API — same transport pattern as the
5
+ * existing grader. No SDK dependency. Centralizes retry, rate-limit handling,
6
+ * cost accounting, and per-call telemetry tagging via `context.feature`.
7
+ *
8
+ * Anthropic does not have a first-class JSON mode like OpenAI. For
9
+ * `completeStructured`, the adapter uses the API's top-level `system`
10
+ * field to instruct the model to return JSON only (top-level system is
11
+ * harder for user-controlled content in `prompt` to override than a
12
+ * user-turn prefix), then strips any surrounding ``` fences before
13
+ * parsing through the Zod schema (parse-don't-validate).
14
+ *
15
+ * Per `.claude/rules/eval-pipeline.md` and `.claude/rules/typescript.md`:
16
+ * the adapter never reads `process.env`. Typed constructor args only.
17
+ */
18
+ import { AnthropicResponseSchema, splitModelId, } from "../../_vendor/ailf-core/index.js";
19
+ import { DEFAULT_RETRY_POLICY, parseRetryAfterSeconds, runWithRetry, } from "./retry.js";
20
/** Messages API endpoint used when no `baseUrl` option is supplied. */
const DEFAULT_BASE_URL = "https://api.anthropic.com/v1/messages";
/** `anthropic-version` header value used when no `apiVersion` option is supplied. */
const DEFAULT_API_VERSION = "2023-06-01";
/** `max_tokens` sent when the caller does not specify `maxTokens`. */
const DEFAULT_MAX_TOKENS = 4096;
/**
 * Built-in price table, in USD per 1,000 tokens, keyed by model name.
 *
 * Pricing reference: https://www.anthropic.com/pricing#api
 * Update when models or vendor pricing changes.
 *
 * Opus 4.5 / 4.6 are listed at $5 / $25 per million tokens (0.005 / 0.025
 * per 1k); the earlier 0.015 / 0.075 figures were the Opus 4 / 4.1 rates and
 * over-reported cost by 3x.
 */
const DEFAULT_PRICING = {
  "claude-opus-4-6": { inputPer1k: 0.005, outputPer1k: 0.025 },
  "claude-opus-4-5-20251101": { inputPer1k: 0.005, outputPer1k: 0.025 },
  "claude-sonnet-4-6": { inputPer1k: 0.003, outputPer1k: 0.015 },
};
/** Top-level `system` instruction used for structured (JSON-only) completions. */
const STRUCTURED_SYSTEM =
  "Respond with only a single JSON object that conforms to the requested schema. " +
  "Do not include any prose, commentary, or markdown code fences. " +
  "Return raw JSON only.";
35
/**
 * Anthropic adapter for the LLMClient port.
 *
 * Sends requests with `fetch()` through `runWithRetry`, converts token usage
 * into a cost via a pricing table, and emits one debug telemetry line per
 * call when a logger is configured.
 */
export class AnthropicLLMClient {
  apiKey;
  baseUrl;
  apiVersion;
  pricing;
  retryPolicy;
  logger;
  sleep;
  rng;

  /**
   * @param options Adapter configuration. `apiKey` is required; `baseUrl` and
   *   `apiVersion` fall back to module defaults; `pricing` and `retryPolicy`
   *   are merged over the module defaults (caller-supplied keys win).
   */
  constructor(options) {
    this.apiKey = options.apiKey;
    this.baseUrl = options.baseUrl ?? DEFAULT_BASE_URL;
    this.apiVersion = options.apiVersion ?? DEFAULT_API_VERSION;
    this.pricing = { ...DEFAULT_PRICING, ...(options.pricing ?? {}) };
    this.retryPolicy = {
      ...DEFAULT_RETRY_POLICY,
      ...(options.retryPolicy ?? {}),
    };
    // Optional collaborators are only assigned when supplied, so the fields
    // stay `undefined` otherwise.
    if (options.logger) {
      this.logger = options.logger;
    }
    if (options.sleep) {
      this.sleep = options.sleep;
    }
    if (options.rng) {
      this.rng = options.rng;
    }
  }

  /**
   * Request a plain-text completion.
   *
   * @param args Completion arguments: `model`, `prompt`, and optional
   *   `temperature`, `maxTokens`, `stop`, and `context`.
   * @returns `{ text, usage, cost, model }`, where `model` echoes `args.model`.
   * @throws {Error} When the response contains no text.
   */
  async complete(args) {
    const { modelName } = splitModelId(args.model);
    const body = buildBody(modelName, args.prompt, {
      temperature: args.temperature,
      maxTokens: args.maxTokens,
      stop: args.stop,
    });
    const data = await this.callApi(body);
    const text = extractText(data.content);
    if (text === "") {
      throw new Error(`Anthropic returned empty completion for model ${args.model}`);
    }
    const usage = extractUsage(data.usage);
    const cost = this.computeCost(modelName, usage);
    this.logTelemetry(args.context, args.model, usage, cost);
    return { text, usage, cost, model: args.model };
  }

  /**
   * Request a JSON completion and parse it through `args.schema`.
   *
   * The JSON-only instruction is sent in the request's top-level `system`
   * field; any surrounding code fence is stripped before `JSON.parse`.
   *
   * @param args Completion arguments: `model`, `prompt`, `schema`, and
   *   optional `temperature`, `maxTokens`, and `context`.
   * @returns `{ value, usage, cost, model }`, where `value` is the result of
   *   `args.schema.parse`.
   * @throws {Error} When the response contains no text or is not valid JSON
   *   (the original parse error is attached as `cause`). Errors raised by
   *   `args.schema.parse` propagate unchanged.
   */
  async completeStructured(args) {
    const { modelName } = splitModelId(args.model);
    const body = buildBody(modelName, args.prompt, {
      temperature: args.temperature,
      maxTokens: args.maxTokens,
      system: STRUCTURED_SYSTEM,
    });
    const data = await this.callApi(body);
    const raw = extractText(data.content);
    if (raw === "") {
      throw new Error(`Anthropic returned empty structured completion for model ${args.model}`);
    }
    const stripped = stripJsonFence(raw);
    let parsed;
    try {
      parsed = JSON.parse(stripped);
    } catch (err) {
      throw new Error(
        `Anthropic structured completion returned invalid JSON for model ${args.model}: ${err instanceof Error ? err.message : String(err)}`,
        { cause: err },
      );
    }
    const value = args.schema.parse(parsed);
    const usage = extractUsage(data.usage);
    const cost = this.computeCost(modelName, usage);
    this.logTelemetry(args.context, args.model, usage, cost);
    return { value, usage, cost, model: args.model };
  }

  /**
   * POST `body` to the configured endpoint under the retry policy.
   *
   * Each attempt reports a non-2xx response as
   * `{ ok: false, status, body, retryAfterSeconds? }` and a parsed response
   * as `{ ok: true, value }`; what happens between attempts is decided by
   * `runWithRetry`.
   *
   * @param body Request payload; serialized with `JSON.stringify`.
   * @returns Whatever `runWithRetry` resolves to — callers read `content` and
   *   `usage` from it, so presumably the parsed response (confirm in retry.js).
   * @throws {Error} When a 2xx response carries an `error.message`.
   */
  async callApi(body) {
    return runWithRetry({
      policy: this.retryPolicy,
      // Only forward the overrides that were actually configured.
      ...(this.sleep ? { sleep: this.sleep } : {}),
      ...(this.rng ? { rng: this.rng } : {}),
      attempt: async () => {
        const response = await fetch(this.baseUrl, {
          method: "POST",
          headers: {
            "x-api-key": this.apiKey,
            "anthropic-version": this.apiVersion,
            "Content-Type": "application/json",
          },
          body: JSON.stringify(body),
        });
        if (!response.ok) {
          const text = await response.text();
          const retryAfter = parseRetryAfterSeconds(response.headers.get("retry-after"));
          return {
            ok: false,
            status: response.status,
            body: text,
            ...(retryAfter !== undefined ? { retryAfterSeconds: retryAfter } : {}),
          };
        }
        const json = await response.json();
        const data = AnthropicResponseSchema.parse(json);
        if (data.error?.message) {
          throw new Error(`Anthropic API error: ${data.error.message}`);
        }
        return { ok: true, value: data };
      },
    });
  }

  /**
   * Convert token usage into a cost using the pricing table.
   *
   * @param modelName Model name used as the pricing-table key.
   * @param usage `{ promptTokens, completionTokens }`.
   * @returns The cost, or 0 (with a logged warning) when the model has no
   *   pricing entry.
   */
  computeCost(modelName, usage) {
    // Own-property lookup only: a model name such as "toString" or
    // "constructor" must not resolve to an inherited Object.prototype member,
    // which would otherwise produce a NaN cost.
    const price = Object.hasOwn(this.pricing, modelName)
      ? this.pricing[modelName]
      : undefined;
    if (!price) {
      this.logger?.warn(`Anthropic cost unknown for model "${modelName}" — recording cost=0. Add it to AnthropicLLMClientOptions.pricing.`);
      return 0;
    }
    return (
      (usage.promptTokens / 1000) * price.inputPer1k +
      (usage.completionTokens / 1000) * price.outputPer1k
    );
  }

  /**
   * Emit one debug line describing a completed call. No-op without a logger.
   *
   * @param context Optional call context; `feature`, `runId`, and `cardId`
   *   are included as tags when present.
   * @param model Model id as supplied by the caller.
   * @param usage `{ promptTokens, completionTokens }`.
   * @param cost Computed cost; logged with six decimal places.
   */
  logTelemetry(context, model, usage, cost) {
    if (!this.logger) {
      return;
    }
    const tag = context ? ` feature=${context.feature}` : "";
    const runTag = context?.runId ? ` runId=${context.runId}` : "";
    const cardTag = context?.cardId ? ` cardId=${context.cardId}` : "";
    this.logger.debug(
      `LLM call (anthropic)${tag}${runTag}${cardTag} model=${model} ` +
        `prompt_tokens=${usage.promptTokens} completion_tokens=${usage.completionTokens} ` +
        `cost_usd=${cost.toFixed(6)}`,
    );
  }
}
159
/**
 * Assemble the JSON payload for a single-turn Messages API request.
 *
 * @param modelName Model name placed in the `model` field.
 * @param prompt Content of the single `user` message.
 * @param opts Optional settings: `maxTokens` (falls back to
 *   DEFAULT_MAX_TOKENS), `temperature`, `stop`, and `system`.
 * @returns The request body. `temperature` is included whenever it is not
 *   `undefined`; `stop_sequences` only for a non-empty `stop`; `system` only
 *   when truthy.
 */
function buildBody(modelName, prompt, opts) {
  const hasStop = Boolean(opts.stop && opts.stop.length > 0);
  return {
    model: modelName,
    max_tokens: opts.maxTokens ?? DEFAULT_MAX_TOKENS,
    messages: [{ role: "user", content: prompt }],
    ...(opts.temperature !== undefined ? { temperature: opts.temperature } : {}),
    ...(hasStop ? { stop_sequences: opts.stop } : {}),
    ...(opts.system ? { system: opts.system } : {}),
  };
}
173
/**
 * Join the text of every `text` content block, in order.
 *
 * Blocks of any other type (for example `tool_use`) and `text` blocks whose
 * `text` is not a string are skipped. A missing content list yields "".
 *
 * @param content List of response content blocks, or a falsy value.
 * @returns The concatenated text, possibly empty.
 */
function extractText(content) {
  if (!content) {
    return "";
  }
  let combined = "";
  for (const piece of content) {
    if (piece.type !== "text" || typeof piece.text !== "string") {
      continue;
    }
    combined += piece.text;
  }
  return combined;
}
189
/**
 * Map the API's usage counters onto `{ promptTokens, completionTokens }`.
 *
 * @param usage Usage object with `input_tokens` / `output_tokens`, or a
 *   nullish value.
 * @returns Token counts; any missing or nullish counter becomes 0.
 */
function extractUsage(usage) {
  const promptTokens = usage?.input_tokens ?? 0;
  const completionTokens = usage?.output_tokens ?? 0;
  return { promptTokens, completionTokens };
}
195
/**
 * Strip a single ```json ... ``` or ``` ... ``` fence wrapper if present.
 * Anthropic occasionally wraps JSON despite the system instruction.
 *
 * The `json` language tag is matched case-insensitively, so a reply fenced
 * as ```JSON is unwrapped too instead of leaving the tag inside the payload.
 *
 * @param text Raw model output.
 * @returns The trimmed fence contents when the whole (trimmed) text is one
 *   fenced block; otherwise the trimmed text unchanged.
 */
function stripJsonFence(text) {
  const trimmed = text.trim();
  const fenceMatch = trimmed.match(/^```(?:json)?\s*([\s\S]*?)\s*```$/i);
  if (fenceMatch) {
    return fenceMatch[1].trim();
  }
  return trimmed;
}