@sanity/ailf 4.5.0 → 5.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/canonical/grader-references/agent-harness-tools.yaml +42 -0
- package/canonical/grader-references/knowledge-probe-recall.yaml +36 -0
- package/canonical/grader-references/mcp-server-spec.yaml +51 -0
- package/canonical/grader-references/portable-text.yaml +48 -0
- package/config/rubrics.ts +38 -2
- package/dist/_vendor/ailf-core/artifact-registry.d.ts +197 -2
- package/dist/_vendor/ailf-core/artifact-registry.js +419 -5
- package/dist/_vendor/ailf-core/examples/index.d.ts +125 -26
- package/dist/_vendor/ailf-core/examples/index.js +146 -47
- package/dist/_vendor/ailf-core/ports/context.d.ts +26 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/ports/index.js +1 -0
- package/dist/_vendor/ailf-core/ports/llm-client.d.ts +112 -0
- package/dist/_vendor/ailf-core/ports/llm-client.js +68 -0
- package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +15 -0
- package/dist/_vendor/ailf-core/schemas/branded-string.d.ts +40 -0
- package/dist/_vendor/ailf-core/schemas/branded-string.js +45 -0
- package/dist/_vendor/ailf-core/schemas/confidence-schema.d.ts +36 -0
- package/dist/_vendor/ailf-core/schemas/confidence-schema.js +32 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -4
- package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/schemas/index.js +9 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +1 -0
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +34 -8
- package/dist/_vendor/ailf-core/schemas/pipeline.js +23 -1
- package/dist/_vendor/ailf-core/services/diagnosis/registry.d.ts +40 -0
- package/dist/_vendor/ailf-core/services/diagnosis/registry.js +25 -0
- package/dist/_vendor/ailf-core/services/diagnosis-runner.d.ts +19 -0
- package/dist/_vendor/ailf-core/services/diagnosis-runner.js +19 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/services/index.js +5 -0
- package/dist/_vendor/ailf-core/services/report-to-markdown.js +3 -2
- package/dist/_vendor/ailf-core/types/attribution.d.ts +82 -0
- package/dist/_vendor/ailf-core/types/attribution.js +18 -0
- package/dist/_vendor/ailf-core/types/branded-ids.d.ts +26 -1
- package/dist/_vendor/ailf-core/types/branded-ids.js +80 -4
- package/dist/_vendor/ailf-core/types/confidence.d.ts +68 -0
- package/dist/_vendor/ailf-core/types/confidence.js +56 -0
- package/dist/_vendor/ailf-core/types/diagnosis.d.ts +169 -0
- package/dist/_vendor/ailf-core/types/diagnosis.js +17 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +16 -1
- package/dist/_vendor/ailf-core/types/grader-judgment.d.ts +125 -0
- package/dist/_vendor/ailf-core/types/grader-judgment.js +30 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +82 -29
- package/dist/_vendor/ailf-core/types/index.js +16 -1
- package/dist/_vendor/ailf-core/types/legacy-grader-judgment.d.ts +55 -0
- package/dist/_vendor/ailf-core/types/legacy-grader-judgment.js +30 -0
- package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +1 -0
- package/dist/_vendor/ailf-core/types/repo-config.d.ts +8 -0
- package/dist/_vendor/ailf-shared/document-ref.d.ts +1 -1
- package/dist/adapters/api-client/build-request.d.ts +1 -0
- package/dist/adapters/api-client/build-request.js +3 -0
- package/dist/adapters/attribution/attribution-meta-writer.d.ts +35 -0
- package/dist/adapters/attribution/attribution-meta-writer.js +34 -0
- package/dist/adapters/attribution/index.d.ts +9 -0
- package/dist/adapters/attribution/index.js +8 -0
- package/dist/adapters/attribution/per-entry-attribution-writer.d.ts +56 -0
- package/dist/adapters/attribution/per-entry-attribution-writer.js +49 -0
- package/dist/adapters/config-sources/file-config-adapter.js +1 -0
- package/dist/adapters/grader-outputs/index.d.ts +10 -0
- package/dist/adapters/grader-outputs/index.js +8 -0
- package/dist/adapters/grader-outputs/legacy/index.d.ts +11 -0
- package/dist/adapters/grader-outputs/legacy/index.js +10 -0
- package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.d.ts +49 -0
- package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.js +48 -0
- package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +102 -0
- package/dist/adapters/grader-outputs/promptfoo-grader-output.js +93 -0
- package/dist/adapters/index.d.ts +3 -0
- package/dist/adapters/index.js +4 -0
- package/dist/adapters/llm/anthropic-llm-client.d.ts +48 -0
- package/dist/adapters/llm/anthropic-llm-client.js +205 -0
- package/dist/adapters/llm/fake-llm-client.d.ts +49 -0
- package/dist/adapters/llm/fake-llm-client.js +63 -0
- package/dist/adapters/llm/index.d.ts +9 -0
- package/dist/adapters/llm/index.js +4 -0
- package/dist/adapters/llm/openai-llm-client.d.ts +44 -0
- package/dist/adapters/llm/openai-llm-client.js +168 -0
- package/dist/adapters/llm/pricing.d.ts +12 -0
- package/dist/adapters/llm/pricing.js +8 -0
- package/dist/adapters/llm/retry.d.ts +56 -0
- package/dist/adapters/llm/retry.js +66 -0
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +5 -1
- package/dist/adapters/task-sources/content-lake-task-source.js +28 -2
- package/dist/adapters/task-sources/repo-schemas.d.ts +90 -22
- package/dist/adapters/task-sources/repo-schemas.js +19 -2
- package/dist/artifact-capture/api-gateway-artifact-writer.js +2 -1
- package/dist/artifact-capture/batching-api-gateway-artifact-writer.js +2 -1
- package/dist/artifact-capture/gcs-artifact-writer.js +3 -1
- package/dist/artifact-capture/local-fs-artifact-writer.js +3 -1
- package/dist/commands/calculate-scores.js +1 -1
- package/dist/commands/explain-handler.js +1 -1
- package/dist/commands/lookup-doc.d.ts +1 -1
- package/dist/commands/lookup-doc.js +3 -3
- package/dist/commands/pipeline-action.d.ts +6 -0
- package/dist/commands/pipeline-action.js +2 -0
- package/dist/commands/remote-pipeline.js +1 -0
- package/dist/composition-root.d.ts +59 -1
- package/dist/composition-root.js +95 -0
- package/dist/config/rubrics.ts +38 -2
- package/dist/grader/agent-harness.d.ts +14 -0
- package/dist/grader/agent-harness.js +17 -0
- package/dist/grader/common.d.ts +17 -0
- package/dist/grader/common.js +21 -0
- package/dist/grader/index.d.ts +38 -0
- package/dist/grader/index.js +75 -0
- package/dist/grader/knowledge-probe.d.ts +14 -0
- package/dist/grader/knowledge-probe.js +18 -0
- package/dist/grader/literacy.d.ts +13 -0
- package/dist/grader/literacy.js +17 -0
- package/dist/grader/mcp.d.ts +14 -0
- package/dist/grader/mcp.js +18 -0
- package/dist/orchestration/build-app-context.js +1 -0
- package/dist/orchestration/build-step-sequence.js +5 -0
- package/dist/orchestration/steps/calculate-scores-step.js +23 -1
- package/dist/orchestration/steps/compute-attribution-step.d.ts +44 -0
- package/dist/orchestration/steps/compute-attribution-step.js +279 -0
- package/dist/orchestration/steps/gap-analysis-step.js +35 -7
- package/dist/orchestration/steps/index.d.ts +1 -0
- package/dist/orchestration/steps/index.js +1 -0
- package/dist/pipeline/attribution.d.ts +15 -0
- package/dist/pipeline/attribution.js +18 -9
- package/dist/pipeline/borderline-consensus-runner.d.ts +63 -0
- package/dist/pipeline/borderline-consensus-runner.js +124 -0
- package/dist/pipeline/borderline-detector.d.ts +24 -0
- package/dist/pipeline/borderline-detector.js +26 -0
- package/dist/pipeline/calculate-scores.d.ts +114 -3
- package/dist/pipeline/calculate-scores.js +426 -24
- package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
- package/dist/pipeline/compiler/literacy-bridge.js +35 -17
- package/dist/pipeline/compiler/rubric-resolution.d.ts +15 -0
- package/dist/pipeline/compiler/rubric-resolution.js +9 -1
- package/dist/pipeline/compute-attribution.d.ts +80 -0
- package/dist/pipeline/compute-attribution.js +196 -0
- package/dist/pipeline/failure-modes.d.ts +52 -17
- package/dist/pipeline/failure-modes.js +178 -117
- package/dist/pipeline/map-request-to-config.js +1 -0
- package/package.json +6 -4
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* per-entry-attribution-writer.ts — Zod schema for the per-judgment
|
|
3
|
+
* attribution artifact (ATTR-01) emitted by Phase 4's
|
|
4
|
+
* `ComputeAttributionStep` and read back by Phase 5's diagnosis runner.
|
|
5
|
+
*
|
|
6
|
+
* The schema asserts `satisfies z.ZodType<JudgmentAttribution>` against
|
|
7
|
+
* the canonical domain type in `packages/core/src/types/attribution.ts`
|
|
8
|
+
* (D0045 / W0187) — drift between schema and type is a build error.
|
|
9
|
+
*
|
|
10
|
+
* Phase 1 lands the SHAPE only — no compute, no file I/O. Phase 4 wires
|
|
11
|
+
* the writer; Phase 5 wires the reader. Both `satisfies` against this
|
|
12
|
+
* single source-of-truth schema.
|
|
13
|
+
*
|
|
14
|
+
* `hallucinationCheckedAgainst` is REQUIRED (Pitfall #11): consumers
|
|
15
|
+
* must be able to audit citation grounding without re-deriving the
|
|
16
|
+
* resolvable-set. The canonical task field is `contextDocs`; do NOT
|
|
17
|
+
* invent `expectedDocs` / `usedDocs` synonyms.
|
|
18
|
+
*
|
|
19
|
+
* @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
|
|
20
|
+
* @see docs/decisions/D0049-shared-confidence-contract.md
|
|
21
|
+
* @see docs/decisions/D0052-judgment-ref-granularity.md
|
|
22
|
+
* @see docs/design-docs/actionability-ladder/04-per-document-attribution-ensemble.md
|
|
23
|
+
*/
|
|
24
|
+
import { z } from "zod";
|
|
25
|
+
import { ConfidenceSchema } from "../../_vendor/ailf-core/schemas/index.js";
|
|
26
|
+
const DocAttributionSchema = z.object({
|
|
27
|
+
documentId: z.string().min(1),
|
|
28
|
+
slug: z.string().optional(),
|
|
29
|
+
score: z.number().min(0).max(1),
|
|
30
|
+
signals: z.object({
|
|
31
|
+
citation: z.number().min(0).max(1).optional(),
|
|
32
|
+
canonical: z.number().min(0).max(1).optional(),
|
|
33
|
+
retrieved: z.number().min(0).max(1).optional(),
|
|
34
|
+
}),
|
|
35
|
+
confidence: ConfidenceSchema,
|
|
36
|
+
});
|
|
37
|
+
/**
|
|
38
|
+
* Canonical schema for {@link JudgmentAttribution}. Persisted at
|
|
39
|
+
* `runs/{runId}/attribution/{entryKey}.json` (Phase 4) and parsed by
|
|
40
|
+
* the diagnosis runner on read (Phase 5).
|
|
41
|
+
*/
|
|
42
|
+
export const JudgmentAttributionSchema = z.object({
|
|
43
|
+
judgmentRef: z.string().min(1),
|
|
44
|
+
taskId: z.string().min(1),
|
|
45
|
+
modelId: z.string().min(1),
|
|
46
|
+
dimension: z.string().min(1),
|
|
47
|
+
attributions: z.array(DocAttributionSchema),
|
|
48
|
+
hallucinationCheckedAgainst: z.array(z.string()),
|
|
49
|
+
});
|
|
@@ -125,6 +125,7 @@ function mapEvalConfigToResolvedConfig(config, rootDir) {
|
|
|
125
125
|
noCache: config.noCache ?? false,
|
|
126
126
|
noRemoteCache: config.noRemoteCache ?? false,
|
|
127
127
|
graderReplications: config.execution?.graderReplications,
|
|
128
|
+
borderlineReplications: config.execution?.borderlineReplications,
|
|
128
129
|
graderContext: config.grader?.context,
|
|
129
130
|
urls: config.urls,
|
|
130
131
|
headers: config.agentic?.headers,
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* grader-outputs adapter barrel — named re-exports only (W0124 / D0045).
|
|
3
|
+
*
|
|
4
|
+
* The grader-output schema lives here so it enters the D0045
|
|
5
|
+
* `pnpm check-trust-boundary-satisfies` SCAN_ROOTS gate.
|
|
6
|
+
*/
|
|
7
|
+
export { GraderJudgmentSchema, graderJudgmentsVersion, } from "./promptfoo-grader-output.js";
|
|
8
|
+
export type { GraderJudgment } from "../../_vendor/ailf-core/index.d.ts";
|
|
9
|
+
export { LegacyGraderJudgmentSchema } from "./legacy/index.js";
|
|
10
|
+
export type { LegacyGraderJudgment } from "../../_vendor/ailf-core/index.d.ts";
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* grader-outputs adapter barrel — named re-exports only (W0124 / D0045).
|
|
3
|
+
*
|
|
4
|
+
* The grader-output schema lives here so it enters the D0045
|
|
5
|
+
* `pnpm check-trust-boundary-satisfies` SCAN_ROOTS gate.
|
|
6
|
+
*/
|
|
7
|
+
export { GraderJudgmentSchema, graderJudgmentsVersion, } from "./promptfoo-grader-output.js";
|
|
8
|
+
export { LegacyGraderJudgmentSchema } from "./legacy/index.js";
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* legacy grader-outputs adapter sub-barrel — named re-exports only
|
|
3
|
+
* (W0124 / D0045).
|
|
4
|
+
*
|
|
5
|
+
* Read-only schema for the Phase 1 free-prose grader-output shape,
|
|
6
|
+
* invoked only by historical-report rendering paths through Phase 7
|
|
7
|
+
* (GRAD-06 cutover). The schema lives here so it enters the D0045
|
|
8
|
+
* `pnpm check-trust-boundary-satisfies` SCAN_ROOTS gate.
|
|
9
|
+
*/
|
|
10
|
+
export { LegacyGraderJudgmentSchema } from "./promptfoo-grader-output-legacy.js";
|
|
11
|
+
export type { LegacyGraderJudgment } from "../../../_vendor/ailf-core/index.d.ts";
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* legacy grader-outputs adapter sub-barrel — named re-exports only
|
|
3
|
+
* (W0124 / D0045).
|
|
4
|
+
*
|
|
5
|
+
* Read-only schema for the Phase 1 free-prose grader-output shape,
|
|
6
|
+
* invoked only by historical-report rendering paths through Phase 7
|
|
7
|
+
* (GRAD-06 cutover). The schema lives here so it enters the D0045
|
|
8
|
+
* `pnpm check-trust-boundary-satisfies` SCAN_ROOTS gate.
|
|
9
|
+
*/
|
|
10
|
+
export { LegacyGraderJudgmentSchema } from "./promptfoo-grader-output-legacy.js";
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* promptfoo-grader-output-legacy.ts — Zod schema for the Phase 1
|
|
3
|
+
* free-prose grader-output shape, used by historical-report rendering
|
|
4
|
+
* paths.
|
|
5
|
+
*
|
|
6
|
+
* READ-ONLY: invoked only by historical-report rendering paths through
|
|
7
|
+
* Phase 7 (GRAD-06 cutover). Reports are immutable events — once a
|
|
8
|
+
* Report is written to Content Lake, the structured grader-judgment
|
|
9
|
+
* shape it captures cannot be back-filled. The legacy schema exists so
|
|
10
|
+
* pre-Phase-3 reports continue to deserialize cleanly.
|
|
11
|
+
*
|
|
12
|
+
* Live grader output that fails the strict {@link GraderJudgmentSchema}
|
|
13
|
+
* parse must NOT fall back to this schema. Drop to
|
|
14
|
+
* `failureMode: "unclassified"` instead. Strict and legacy schemas are
|
|
15
|
+
* deliberate siblings, not a legacy/canonical pair to consolidate.
|
|
16
|
+
*
|
|
17
|
+
* The schema asserts `satisfies z.ZodType<LegacyGraderJudgment>` against
|
|
18
|
+
* the canonical domain type in
|
|
19
|
+
* `packages/core/src/types/legacy-grader-judgment.ts` (D0045 / W0187) —
|
|
20
|
+
* drift between schema and type is a build error. The domain type is
|
|
21
|
+
* authored independently in `@sanity/ailf-core`; this file authors ONLY
|
|
22
|
+
* the schema and never derives the domain type from the schema itself
|
|
23
|
+
* (no schema-derived self-reference allowed by D0045).
|
|
24
|
+
*
|
|
25
|
+
* @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
|
|
26
|
+
* @see ../promptfoo-grader-output.ts — the strict (live-path) sibling
|
|
27
|
+
* @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
|
|
28
|
+
* §"Backwards compatibility"
|
|
29
|
+
*/
|
|
30
|
+
import { z } from "zod";
|
|
31
|
+
/**
|
|
32
|
+
* Canonical schema for {@link LegacyGraderJudgment}. Mirrors the Phase 1
|
|
33
|
+
* superset core (`taskId`, `modelId`, `dimension`, `reason`, `score`,
|
|
34
|
+
* optional `outputFailure`). NO GRAD-02 additive fields — those are by
|
|
35
|
+
* construction absent on pre-Phase-3 output.
|
|
36
|
+
*
|
|
37
|
+
* Intentionally NOT `.strict()` — pre-Phase-3 reports may carry stray
|
|
38
|
+
* keys; the legacy parser tolerates them so historical-report rendering
|
|
39
|
+
* keeps working through the GRAD-06 cutover.
|
|
40
|
+
*/
|
|
41
|
+
export declare const LegacyGraderJudgmentSchema: z.ZodObject<{
|
|
42
|
+
taskId: z.ZodString;
|
|
43
|
+
modelId: z.ZodString;
|
|
44
|
+
dimension: z.ZodString;
|
|
45
|
+
reason: z.ZodString;
|
|
46
|
+
score: z.ZodNumber;
|
|
47
|
+
outputFailure: z.ZodOptional<z.ZodBoolean>;
|
|
48
|
+
}, z.core.$strip>;
|
|
49
|
+
export type { LegacyGraderJudgment } from "../../../_vendor/ailf-core/index.d.ts";
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* promptfoo-grader-output-legacy.ts — Zod schema for the Phase 1
|
|
3
|
+
* free-prose grader-output shape, used by historical-report rendering
|
|
4
|
+
* paths.
|
|
5
|
+
*
|
|
6
|
+
* READ-ONLY: invoked only by historical-report rendering paths through
|
|
7
|
+
* Phase 7 (GRAD-06 cutover). Reports are immutable events — once a
|
|
8
|
+
* Report is written to Content Lake, the structured grader-judgment
|
|
9
|
+
* shape it captures cannot be back-filled. The legacy schema exists so
|
|
10
|
+
* pre-Phase-3 reports continue to deserialize cleanly.
|
|
11
|
+
*
|
|
12
|
+
* Live grader output that fails the strict {@link GraderJudgmentSchema}
|
|
13
|
+
* parse must NOT fall back to this schema. Drop to
|
|
14
|
+
* `failureMode: "unclassified"` instead. Strict and legacy schemas are
|
|
15
|
+
* deliberate siblings, not a legacy/canonical pair to consolidate.
|
|
16
|
+
*
|
|
17
|
+
* The schema asserts `satisfies z.ZodType<LegacyGraderJudgment>` against
|
|
18
|
+
* the canonical domain type in
|
|
19
|
+
* `packages/core/src/types/legacy-grader-judgment.ts` (D0045 / W0187) —
|
|
20
|
+
* drift between schema and type is a build error. The domain type is
|
|
21
|
+
* authored independently in `@sanity/ailf-core`; this file authors ONLY
|
|
22
|
+
* the schema and never derives the domain type from the schema itself
|
|
23
|
+
* (no schema-derived self-reference allowed by D0045).
|
|
24
|
+
*
|
|
25
|
+
* @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
|
|
26
|
+
* @see ../promptfoo-grader-output.ts — the strict (live-path) sibling
|
|
27
|
+
* @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
|
|
28
|
+
* §"Backwards compatibility"
|
|
29
|
+
*/
|
|
30
|
+
import { z } from "zod";
|
|
31
|
+
/**
|
|
32
|
+
* Canonical schema for {@link LegacyGraderJudgment}. Mirrors the Phase 1
|
|
33
|
+
* superset core (`taskId`, `modelId`, `dimension`, `reason`, `score`,
|
|
34
|
+
* optional `outputFailure`). NO GRAD-02 additive fields — those are by
|
|
35
|
+
* construction absent on pre-Phase-3 output.
|
|
36
|
+
*
|
|
37
|
+
* Intentionally NOT `.strict()` — pre-Phase-3 reports may carry stray
|
|
38
|
+
* keys; the legacy parser tolerates them so historical-report rendering
|
|
39
|
+
* keeps working through the GRAD-06 cutover.
|
|
40
|
+
*/
|
|
41
|
+
export const LegacyGraderJudgmentSchema = z.object({
|
|
42
|
+
taskId: z.string().min(1),
|
|
43
|
+
modelId: z.string().min(1),
|
|
44
|
+
dimension: z.string().min(1),
|
|
45
|
+
reason: z.string(),
|
|
46
|
+
score: z.number(),
|
|
47
|
+
outputFailure: z.boolean().optional(),
|
|
48
|
+
});
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* promptfoo-grader-output.ts — Zod schema for the structured grader output
|
|
3
|
+
* (GRAD-02) emitted by the promptfoo grader process and consumed by the
|
|
4
|
+
* eval pipeline.
|
|
5
|
+
*
|
|
6
|
+
* The schema asserts `satisfies z.ZodType<GraderJudgment>` against the
|
|
7
|
+
* canonical domain type in `packages/core/src/types/grader-judgment.ts`
|
|
8
|
+
* (D0045 / W0187) — drift between schema and type is a build error.
|
|
9
|
+
* The domain type was authored independently in Plan 01-01; this file
|
|
10
|
+
* authors ONLY the schema and never derives the domain type from the
|
|
11
|
+
* schema itself (no schema-derived self-reference allowed by D0045).
|
|
12
|
+
*
|
|
13
|
+
* `graderJudgmentsVersion` is co-located with the schema (VER-01 D-02 —
|
|
14
|
+
* source-of-truth file owns its version constant). Bumped by hand when
|
|
15
|
+
* the grader rubric, prompt template, or judgment shape changes.
|
|
16
|
+
*
|
|
17
|
+
* Phase 3 will replace the inline `JSON.parse` at
|
|
18
|
+
* `pipeline/calculate-scores.ts:380-392` (Pitfall #4) so all grader
|
|
19
|
+
* output flows through this schema.
|
|
20
|
+
*
|
|
21
|
+
* @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
|
|
22
|
+
* @see docs/decisions/D0049-shared-confidence-contract.md
|
|
23
|
+
* @see docs/decisions/D0052-judgment-ref-granularity.md
|
|
24
|
+
* @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
|
|
25
|
+
*/
|
|
26
|
+
import { z } from "zod";
|
|
27
|
+
/**
|
|
28
|
+
* VER-01 D-02 — co-located version constant. Bumped by hand when the
|
|
29
|
+
* grader rubric, prompt template, or judgment shape changes in a way
|
|
30
|
+
* that should invalidate cached Diagnoses.
|
|
31
|
+
*
|
|
32
|
+
* Phase 3 GRAD-05 bumped this from `"0.1.0"` to `"1.0.0"` (semver
|
|
33
|
+
* major) — the additive GRAD-02 surface is now required + the schema
|
|
34
|
+
* is `.strict()`. AILF has no installed external base; the legacy
|
|
35
|
+
* parser at `./legacy/promptfoo-grader-output-legacy.ts` is the named
|
|
36
|
+
* consumer for already-stored historical reports.
|
|
37
|
+
*/
|
|
38
|
+
export declare const graderJudgmentsVersion = "1.0.0";
|
|
39
|
+
/**
|
|
40
|
+
* Canonical schema for {@link GraderJudgment}. Required fields mirror
|
|
41
|
+
* the existing pipeline core (Doc 03 §"existing, unchanged"):
|
|
42
|
+
* `taskId`, `modelId`, `dimension`, `reason`, `score`. Phase 3 GRAD-05
|
|
43
|
+
* has tightened the additive surface to required and added `.strict()`
|
|
44
|
+
* — the schema rejects unknown fields (defense-in-depth against future
|
|
45
|
+
* prompt-injection attempts that try to smuggle keys through the
|
|
46
|
+
* grader emission).
|
|
47
|
+
*
|
|
48
|
+
* Branded `JudgmentId` is represented at runtime by a non-empty string;
|
|
49
|
+
* the schema routes the brand through `brandedString<"JudgmentId">()`
|
|
50
|
+
* — the project's single audited cast site for branded-string
|
|
51
|
+
* schemas (project typescript rule: no `as` on `unknown`).
|
|
52
|
+
*/
|
|
53
|
+
export declare const GraderJudgmentSchema: z.ZodObject<{
|
|
54
|
+
taskId: z.ZodString;
|
|
55
|
+
modelId: z.ZodString;
|
|
56
|
+
dimension: z.ZodString;
|
|
57
|
+
reason: z.ZodString;
|
|
58
|
+
score: z.ZodNumber;
|
|
59
|
+
outputFailure: z.ZodOptional<z.ZodBoolean>;
|
|
60
|
+
judgmentId: z.ZodType<import("@sanity/ailf-core").Brand<string, "JudgmentId">, unknown, z.core.$ZodTypeInternals<import("@sanity/ailf-core").Brand<string, "JudgmentId">, unknown>>;
|
|
61
|
+
subJudgments: z.ZodArray<z.ZodObject<{
|
|
62
|
+
criterionId: z.ZodString;
|
|
63
|
+
met: z.ZodBoolean;
|
|
64
|
+
evidence: z.ZodString;
|
|
65
|
+
confidence: z.ZodObject<{
|
|
66
|
+
level: z.ZodEnum<{
|
|
67
|
+
low: "low";
|
|
68
|
+
medium: "medium";
|
|
69
|
+
high: "high";
|
|
70
|
+
}>;
|
|
71
|
+
signalsPresent: z.ZodNumber;
|
|
72
|
+
derivation: z.ZodString;
|
|
73
|
+
}, z.core.$strip>;
|
|
74
|
+
}, z.core.$strip>>;
|
|
75
|
+
docCitations: z.ZodArray<z.ZodObject<{
|
|
76
|
+
documentId: z.ZodString;
|
|
77
|
+
slug: z.ZodOptional<z.ZodString>;
|
|
78
|
+
role: z.ZodEnum<{
|
|
79
|
+
supports: "supports";
|
|
80
|
+
contradicts: "contradicts";
|
|
81
|
+
missing: "missing";
|
|
82
|
+
irrelevant: "irrelevant";
|
|
83
|
+
}>;
|
|
84
|
+
hallucinated: z.ZodOptional<z.ZodBoolean>;
|
|
85
|
+
}, z.core.$strip>>;
|
|
86
|
+
failureMode: z.ZodString;
|
|
87
|
+
confidence: z.ZodObject<{
|
|
88
|
+
level: z.ZodEnum<{
|
|
89
|
+
low: "low";
|
|
90
|
+
medium: "medium";
|
|
91
|
+
high: "high";
|
|
92
|
+
}>;
|
|
93
|
+
signalsPresent: z.ZodNumber;
|
|
94
|
+
derivation: z.ZodString;
|
|
95
|
+
}, z.core.$strip>;
|
|
96
|
+
hallucinationCheckedAgainst: z.ZodArray<z.ZodString>;
|
|
97
|
+
metadata: z.ZodObject<{
|
|
98
|
+
graderModel: z.ZodString;
|
|
99
|
+
graderJudgmentsVersion: z.ZodString;
|
|
100
|
+
}, z.core.$strip>;
|
|
101
|
+
}, z.core.$strict>;
|
|
102
|
+
export type { GraderJudgment } from "../../_vendor/ailf-core/index.d.ts";
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* promptfoo-grader-output.ts — Zod schema for the structured grader output
|
|
3
|
+
* (GRAD-02) emitted by the promptfoo grader process and consumed by the
|
|
4
|
+
* eval pipeline.
|
|
5
|
+
*
|
|
6
|
+
* The schema asserts `satisfies z.ZodType<GraderJudgment>` against the
|
|
7
|
+
* canonical domain type in `packages/core/src/types/grader-judgment.ts`
|
|
8
|
+
* (D0045 / W0187) — drift between schema and type is a build error.
|
|
9
|
+
* The domain type was authored independently in Plan 01-01; this file
|
|
10
|
+
* authors ONLY the schema and never derives the domain type from the
|
|
11
|
+
* schema itself (no schema-derived self-reference allowed by D0045).
|
|
12
|
+
*
|
|
13
|
+
* `graderJudgmentsVersion` is co-located with the schema (VER-01 D-02 —
|
|
14
|
+
* source-of-truth file owns its version constant). Bumped by hand when
|
|
15
|
+
* the grader rubric, prompt template, or judgment shape changes.
|
|
16
|
+
*
|
|
17
|
+
* Phase 3 will replace the inline `JSON.parse` at
|
|
18
|
+
* `pipeline/calculate-scores.ts:380-392` (Pitfall #4) so all grader
|
|
19
|
+
* output flows through this schema.
|
|
20
|
+
*
|
|
21
|
+
* @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
|
|
22
|
+
* @see docs/decisions/D0049-shared-confidence-contract.md
|
|
23
|
+
* @see docs/decisions/D0052-judgment-ref-granularity.md
|
|
24
|
+
* @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
|
|
25
|
+
*/
|
|
26
|
+
import { z } from "zod";
|
|
27
|
+
import { brandedString, ConfidenceSchema } from "../../_vendor/ailf-core/schemas/index.js";
|
|
28
|
+
/**
|
|
29
|
+
* VER-01 D-02 — co-located version constant. Bumped by hand when the
|
|
30
|
+
* grader rubric, prompt template, or judgment shape changes in a way
|
|
31
|
+
* that should invalidate cached Diagnoses.
|
|
32
|
+
*
|
|
33
|
+
* Phase 3 GRAD-05 bumped this from `"0.1.0"` to `"1.0.0"` (semver
|
|
34
|
+
* major) — the additive GRAD-02 surface is now required + the schema
|
|
35
|
+
* is `.strict()`. AILF has no installed external base; the legacy
|
|
36
|
+
* parser at `./legacy/promptfoo-grader-output-legacy.ts` is the named
|
|
37
|
+
* consumer for already-stored historical reports.
|
|
38
|
+
*/
|
|
39
|
+
export const graderJudgmentsVersion = "1.0.0";
|
|
40
|
+
const DocCitationRoleSchema = z.enum([
|
|
41
|
+
"supports",
|
|
42
|
+
"contradicts",
|
|
43
|
+
"missing",
|
|
44
|
+
"irrelevant",
|
|
45
|
+
]);
|
|
46
|
+
const DocCitationSchema = z.object({
|
|
47
|
+
documentId: z.string().min(1),
|
|
48
|
+
slug: z.string().optional(),
|
|
49
|
+
role: DocCitationRoleSchema,
|
|
50
|
+
hallucinated: z.boolean().optional(),
|
|
51
|
+
});
|
|
52
|
+
const CriterionSubJudgmentSchema = z.object({
|
|
53
|
+
criterionId: z.string().min(1),
|
|
54
|
+
met: z.boolean(),
|
|
55
|
+
evidence: z.string().max(280),
|
|
56
|
+
confidence: ConfidenceSchema,
|
|
57
|
+
});
|
|
58
|
+
/**
|
|
59
|
+
* Canonical schema for {@link GraderJudgment}. Required fields mirror
|
|
60
|
+
* the existing pipeline core (Doc 03 §"existing, unchanged"):
|
|
61
|
+
* `taskId`, `modelId`, `dimension`, `reason`, `score`. Phase 3 GRAD-05
|
|
62
|
+
* has tightened the additive surface to required and added `.strict()`
|
|
63
|
+
* — the schema rejects unknown fields (defense-in-depth against future
|
|
64
|
+
* prompt-injection attempts that try to smuggle keys through the
|
|
65
|
+
* grader emission).
|
|
66
|
+
*
|
|
67
|
+
* Branded `JudgmentId` is represented at runtime by a non-empty string;
|
|
68
|
+
* the schema routes the brand through `brandedString<"JudgmentId">()`
|
|
69
|
+
* — the project's single audited cast site for branded-string
|
|
70
|
+
* schemas (project typescript rule: no `as` on `unknown`).
|
|
71
|
+
*/
|
|
72
|
+
export const GraderJudgmentSchema = z
|
|
73
|
+
.object({
|
|
74
|
+
// ── Existing pipeline core (required — Doc 03 §"existing, unchanged") ─
|
|
75
|
+
taskId: z.string().min(1),
|
|
76
|
+
modelId: z.string().min(1),
|
|
77
|
+
dimension: z.string().min(1),
|
|
78
|
+
reason: z.string(),
|
|
79
|
+
score: z.number(),
|
|
80
|
+
outputFailure: z.boolean().optional(),
|
|
81
|
+
// ── GRAD-02 additive — required from Phase 3 GRAD-05 ───────────────
|
|
82
|
+
judgmentId: brandedString(),
|
|
83
|
+
subJudgments: z.array(CriterionSubJudgmentSchema),
|
|
84
|
+
docCitations: z.array(DocCitationSchema),
|
|
85
|
+
failureMode: z.string(),
|
|
86
|
+
confidence: ConfidenceSchema,
|
|
87
|
+
hallucinationCheckedAgainst: z.array(z.string()),
|
|
88
|
+
metadata: z.object({
|
|
89
|
+
graderModel: z.string().min(1),
|
|
90
|
+
graderJudgmentsVersion: z.string().min(1),
|
|
91
|
+
}),
|
|
92
|
+
})
|
|
93
|
+
.strict();
|
package/dist/adapters/index.d.ts
CHANGED
|
@@ -10,3 +10,6 @@ export { PromptfooEvalAdapter } from "./eval-runners/index.js";
|
|
|
10
10
|
export { ConsoleLogger, type ConsoleLoggerOptions, JsonLogger, QuietLogger, } from "./loggers/index.js";
|
|
11
11
|
export { CliConfigAdapter, FileConfigAdapter } from "./config-sources/index.js";
|
|
12
12
|
export { DtsPackageSurface, InMemoryPackageSurface, type DtsPackageSurfaceOptions, type PackageRootResolver, parseDtsExports, type ParsedDtsExports, } from "./package-surface/index.js";
|
|
13
|
+
export { GraderJudgmentSchema, graderJudgmentsVersion, } from "./grader-outputs/index.js";
|
|
14
|
+
export { AttributionMetaSchema, JudgmentAttributionSchema, } from "./attribution/index.js";
|
|
15
|
+
export type { AttributionMeta, DocAttribution, GraderJudgment, JudgmentAttribution, } from "../_vendor/ailf-core/index.d.ts";
|
package/dist/adapters/index.js
CHANGED
|
@@ -10,3 +10,7 @@ export { PromptfooEvalAdapter } from "./eval-runners/index.js";
|
|
|
10
10
|
export { ConsoleLogger, JsonLogger, QuietLogger, } from "./loggers/index.js";
|
|
11
11
|
export { CliConfigAdapter, FileConfigAdapter } from "./config-sources/index.js";
|
|
12
12
|
export { DtsPackageSurface, InMemoryPackageSurface, parseDtsExports, } from "./package-surface/index.js";
|
|
13
|
+
// Phase 1 Plan 02 — actionability-ladder adapter schemas (GRAD-02, ATTR-01).
|
|
14
|
+
// Named re-exports only (W0124 / D0045).
|
|
15
|
+
export { GraderJudgmentSchema, graderJudgmentsVersion, } from "./grader-outputs/index.js";
|
|
16
|
+
export { AttributionMetaSchema, JudgmentAttributionSchema, } from "./attribution/index.js";
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Anthropic adapter for the LLMClient port.
|
|
3
|
+
*
|
|
4
|
+
* Uses fetch() against the Messages API — same transport pattern as the
|
|
5
|
+
* existing grader. No SDK dependency. Centralizes retry, rate-limit handling,
|
|
6
|
+
* cost accounting, and per-call telemetry tagging via `context.feature`.
|
|
7
|
+
*
|
|
8
|
+
* Anthropic does not have a first-class JSON mode like OpenAI. For
|
|
9
|
+
* `completeStructured`, the adapter uses the API's top-level `system`
|
|
10
|
+
* field to instruct the model to return JSON only (top-level system is
|
|
11
|
+
* harder for user-controlled content in `prompt` to override than a
|
|
12
|
+
* user-turn prefix), then strips any surrounding ``` fences before
|
|
13
|
+
* parsing through the Zod schema (parse-don't-validate).
|
|
14
|
+
*
|
|
15
|
+
* Per `.claude/rules/eval-pipeline.md` and `.claude/rules/typescript.md`:
|
|
16
|
+
* the adapter never reads `process.env`. Typed constructor args only.
|
|
17
|
+
*/
|
|
18
|
+
import { type LLMClient, type LLMCompleteArgs, type LLMCompleteStructuredArgs, type LLMCompletion, type LLMStructuredCompletion, type Logger } from "../../_vendor/ailf-core/index.d.ts";
|
|
19
|
+
import type { ModelPricing } from "./pricing.js";
|
|
20
|
+
import { type RetryPolicy } from "./retry.js";
|
|
21
|
+
/**
 * Settings accepted by the `AnthropicLLMClient` constructor.
 *
 * Only `apiKey` is mandatory; every other member is optional and falls back
 * to the adapter's built-in default when omitted.
 */
export interface AnthropicLLMClientOptions {
    // --- Connection ---------------------------------------------------------
    /** Credential sent with every request. */
    apiKey: string;
    /** Endpoint the adapter POSTs to, replacing the built-in Messages API URL. */
    baseUrl?: string;
    /** API version header. Default "2023-06-01" — matches the existing grader. */
    apiVersion?: string;

    // --- Cost accounting ----------------------------------------------------
    /** Pricing keyed by canonical model id (without `anthropic:` prefix or `messages:` segment). */
    pricing?: {
        [modelId: string]: ModelPricing;
    };

    // --- Retry behaviour ----------------------------------------------------
    /** Partial override; unspecified fields keep the default retry policy's values. */
    retryPolicy?: Partial<RetryPolicy>;
    /** Replacement delay function handed to the retry helper (useful in tests). */
    sleep?: (ms: number) => Promise<void>;
    /** Replacement random source handed to the retry helper (useful in tests). */
    rng?: () => number;

    // --- Diagnostics --------------------------------------------------------
    /** Receives the unknown-pricing warning and the per-call telemetry line. */
    logger?: Logger;
}
|
|
33
|
+
/**
 * `LLMClient` implementation that calls the Anthropic Messages API.
 *
 * Public surface first, private implementation details afterwards.
 */
export declare class AnthropicLLMClient implements LLMClient {
    /** Builds a client from explicit, typed options. */
    constructor(options: AnthropicLLMClientOptions);
    /** Requests a free-text completion. */
    complete(args: LLMCompleteArgs): Promise<LLMCompletion>;
    /** Requests a completion whose JSON payload is parsed through `args.schema`. */
    completeStructured<T>(args: LLMCompleteStructuredArgs<T>): Promise<LLMStructuredCompletion<T>>;

    // Configuration captured at construction time.
    private readonly apiKey;
    private readonly baseUrl;
    private readonly apiVersion;
    private readonly pricing;
    private readonly retryPolicy;

    // Optional collaborators; absent unless supplied through the options.
    private readonly logger?;
    private readonly sleep?;
    private readonly rng?;

    // Internal helpers.
    private callApi;
    private computeCost;
    private logTelemetry;
}
|
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Anthropic adapter for the LLMClient port.
|
|
3
|
+
*
|
|
4
|
+
* Uses fetch() against the Messages API — same transport pattern as the
|
|
5
|
+
* existing grader. No SDK dependency. Centralizes retry, rate-limit handling,
|
|
6
|
+
* cost accounting, and per-call telemetry tagging via `context.feature`.
|
|
7
|
+
*
|
|
8
|
+
* Anthropic does not have a first-class JSON mode like OpenAI. For
|
|
9
|
+
* `completeStructured`, the adapter uses the API's top-level `system`
|
|
10
|
+
* field to instruct the model to return JSON only (top-level system is
|
|
11
|
+
* harder for user-controlled content in `prompt` to override than a
|
|
12
|
+
* user-turn prefix), then strips any surrounding ``` fences before
|
|
13
|
+
* parsing through the Zod schema (parse-don't-validate).
|
|
14
|
+
*
|
|
15
|
+
* Per `.claude/rules/eval-pipeline.md` and `.claude/rules/typescript.md`:
|
|
16
|
+
* the adapter never reads `process.env`. Typed constructor args only.
|
|
17
|
+
*/
|
|
18
|
+
import { AnthropicResponseSchema, splitModelId, } from "../../_vendor/ailf-core/index.js";
|
|
19
|
+
import { DEFAULT_RETRY_POLICY, parseRetryAfterSeconds, runWithRetry, } from "./retry.js";
|
|
20
|
+
/** Messages API endpoint used when `options.baseUrl` is not supplied. */
const DEFAULT_BASE_URL = "https://api.anthropic.com/v1/messages";
/** `anthropic-version` header value used when `options.apiVersion` is not supplied. */
const DEFAULT_API_VERSION = "2023-06-01";
/** `max_tokens` sent when the caller does not provide `maxTokens`. */
const DEFAULT_MAX_TOKENS = 4096;
/**
 * Built-in price table, in USD per 1,000 tokens, keyed by bare model name.
 * Entries supplied through `AnthropicLLMClientOptions.pricing` override these.
 *
 * Pricing reference: https://www.anthropic.com/pricing#api
 * Update when models or vendor pricing changes.
 *
 * NOTE(review): Claude Opus 4.5 and 4.6 are listed at $5 / $25 per million
 * tokens (0.005 / 0.025 per 1k). The previous 0.015 / 0.075 figures were the
 * older Opus 4 / 4.1 rate and overstated recorded cost threefold — confirm
 * against the pricing page when touching this table.
 */
const DEFAULT_PRICING = {
    "claude-opus-4-6": { inputPer1k: 0.005, outputPer1k: 0.025 },
    "claude-opus-4-5-20251101": { inputPer1k: 0.005, outputPer1k: 0.025 },
    "claude-sonnet-4-6": { inputPer1k: 0.003, outputPer1k: 0.015 },
};
/**
 * Top-level `system` prompt sent by `completeStructured` to ask the model for
 * a bare JSON object.
 */
const STRUCTURED_SYSTEM = "Respond with only a single JSON object that conforms to the requested schema. " +
    "Do not include any prose, commentary, or markdown code fences. " +
    "Return raw JSON only.";
|
|
35
|
+
/**
 * LLMClient adapter that calls the Anthropic Messages API through `fetch()`.
 *
 * Each public call builds a single-turn request, sends it via `runWithRetry`,
 * extracts the text blocks from the response, computes a USD cost from the
 * configured price table, and emits one debug telemetry line when a logger
 * is configured.
 */
export class AnthropicLLMClient {
    apiKey;
    baseUrl;
    apiVersion;
    pricing;
    retryPolicy;
    logger;
    sleep;
    rng;
    /**
     * @param options Typed configuration. `apiKey` is required; `baseUrl`,
     *   `apiVersion`, `pricing` and `retryPolicy` are merged over the module
     *   defaults; `logger`, `sleep` and `rng` stay undefined unless supplied.
     */
    constructor(options) {
        this.apiKey = options.apiKey;
        this.baseUrl = options.baseUrl ?? DEFAULT_BASE_URL;
        this.apiVersion = options.apiVersion ?? DEFAULT_API_VERSION;
        // Caller-supplied entries win over the built-in price table.
        this.pricing = { ...DEFAULT_PRICING, ...(options.pricing ?? {}) };
        this.retryPolicy = {
            ...DEFAULT_RETRY_POLICY,
            ...(options.retryPolicy ?? {}),
        };
        if (options.logger)
            this.logger = options.logger;
        if (options.sleep)
            this.sleep = options.sleep;
        if (options.rng)
            this.rng = options.rng;
    }
    /**
     * Request a free-text completion.
     *
     * @param args Reads `model`, `prompt`, and the optional `temperature`,
     *   `maxTokens`, `stop` and `context` fields.
     * @returns `{ text, usage, cost, model }`, where `model` echoes `args.model`.
     * @throws Error when the response contains no text.
     */
    async complete(args) {
        const { modelName } = splitModelId(args.model);
        const body = buildBody(modelName, args.prompt, {
            temperature: args.temperature,
            maxTokens: args.maxTokens,
            stop: args.stop,
        });
        const data = await this.callApi(body);
        const text = extractText(data.content);
        if (text === "") {
            throw new Error(`Anthropic returned empty completion for model ${args.model}`);
        }
        const usage = extractUsage(data.usage);
        const cost = this.computeCost(modelName, usage);
        this.logTelemetry(args.context, args.model, usage, cost);
        return { text, usage, cost, model: args.model };
    }
    /**
     * Request a completion whose payload is JSON and parse it with `args.schema`.
     *
     * The JSON-only instruction is sent in the request's top-level `system`
     * field; any surrounding code fence is stripped before `JSON.parse`.
     *
     * @param args Reads `model`, `prompt`, `schema`, and the optional
     *   `temperature`, `maxTokens` and `context` fields.
     * @returns `{ value, usage, cost, model }`, where `value` is the result of
     *   `args.schema.parse`.
     * @throws Error when the response contains no text or is not valid JSON
     *   (the JSON error is attached as `cause`); whatever `args.schema.parse`
     *   throws is propagated unchanged.
     */
    async completeStructured(args) {
        const { modelName } = splitModelId(args.model);
        const body = buildBody(modelName, args.prompt, {
            temperature: args.temperature,
            maxTokens: args.maxTokens,
            system: STRUCTURED_SYSTEM,
        });
        const data = await this.callApi(body);
        const raw = extractText(data.content);
        if (raw === "") {
            throw new Error(`Anthropic returned empty structured completion for model ${args.model}`);
        }
        const stripped = stripJsonFence(raw);
        let parsed;
        try {
            parsed = JSON.parse(stripped);
        }
        catch (err) {
            throw new Error(`Anthropic structured completion returned invalid JSON for model ${args.model}: ${err instanceof Error ? err.message : String(err)}`, { cause: err });
        }
        const value = args.schema.parse(parsed);
        const usage = extractUsage(data.usage);
        const cost = this.computeCost(modelName, usage);
        this.logTelemetry(args.context, args.model, usage, cost);
        return { value, usage, cost, model: args.model };
    }
    /**
     * POST `body` to the configured endpoint under the retry policy.
     *
     * A non-OK HTTP response is handed back to `runWithRetry` as
     * `{ ok: false, status, body, retryAfterSeconds? }`. A successful response
     * is parsed with `AnthropicResponseSchema`.
     *
     * @param body Request payload; serialized with `JSON.stringify`.
     * @returns Whatever `runWithRetry` resolves to for a successful attempt.
     * @throws Error when a successful HTTP response carries `error.message`.
     */
    async callApi(body) {
        return runWithRetry({
            policy: this.retryPolicy,
            // Only forward the overrides that were actually configured.
            ...(this.sleep ? { sleep: this.sleep } : {}),
            ...(this.rng ? { rng: this.rng } : {}),
            attempt: async () => {
                const response = await fetch(this.baseUrl, {
                    method: "POST",
                    headers: {
                        "x-api-key": this.apiKey,
                        "anthropic-version": this.apiVersion,
                        "Content-Type": "application/json",
                    },
                    body: JSON.stringify(body),
                });
                if (!response.ok) {
                    const text = await response.text();
                    const retryAfter = parseRetryAfterSeconds(response.headers.get("retry-after"));
                    return {
                        ok: false,
                        status: response.status,
                        body: text,
                        ...(retryAfter !== undefined
                            ? { retryAfterSeconds: retryAfter }
                            : {}),
                    };
                }
                const json = await response.json();
                const data = AnthropicResponseSchema.parse(json);
                if (data.error?.message) {
                    throw new Error(`Anthropic API error: ${data.error.message}`);
                }
                return { ok: true, value: data };
            },
        });
    }
    /**
     * Price a call in USD from its token usage.
     *
     * @param modelName Bare model name used as the price-table key.
     * @param usage `{ promptTokens, completionTokens }`.
     * @returns The computed cost, or 0 (with a logger warning) when the model
     *   has no entry in the price table.
     */
    computeCost(modelName, usage) {
        // Own-property lookup only: a model name such as "toString" or
        // "constructor" must not resolve to an inherited Object.prototype
        // member, which would skip the warning and yield a NaN cost.
        const price = Object.hasOwn(this.pricing, modelName)
            ? this.pricing[modelName]
            : undefined;
        if (!price) {
            this.logger?.warn(`Anthropic cost unknown for model "${modelName}" — recording cost=0. Add it to AnthropicLLMClientOptions.pricing.`);
            return 0;
        }
        return ((usage.promptTokens / 1000) * price.inputPer1k +
            (usage.completionTokens / 1000) * price.outputPer1k);
    }
    /**
     * Emit one debug line describing a completed call. No-op without a logger.
     *
     * @param context Optional call context; `feature`, `runId` and `cardId`
     *   are appended as tags when present.
     * @param model Model id exactly as the caller supplied it.
     * @param usage `{ promptTokens, completionTokens }`.
     * @param cost Cost in USD; printed with six decimal places.
     */
    logTelemetry(context, model, usage, cost) {
        if (!this.logger)
            return;
        const tag = context ? ` feature=${context.feature}` : "";
        const runTag = context?.runId ? ` runId=${context.runId}` : "";
        const cardTag = context?.cardId ? ` cardId=${context.cardId}` : "";
        this.logger.debug(`LLM call (anthropic)${tag}${runTag}${cardTag} model=${model} ` +
            `prompt_tokens=${usage.promptTokens} completion_tokens=${usage.completionTokens} ` +
            `cost_usd=${cost.toFixed(6)}`);
    }
}
|
|
159
|
+
/**
 * Assemble the JSON payload for a single-turn Messages API request.
 *
 * @param modelName Value placed in the payload's `model` field.
 * @param prompt Content of the single `user` message.
 * @param opts Optional `maxTokens`, `temperature`, `stop` and `system`.
 *   `max_tokens` falls back to `DEFAULT_MAX_TOKENS`; `temperature` is included
 *   whenever it is not `undefined` (so 0 is kept); `stop_sequences` only for a
 *   non-empty `stop`; `system` only when truthy.
 * @returns The request body object.
 */
function buildBody(modelName, prompt, opts) {
    const { maxTokens, temperature, stop, system } = opts;
    const hasStopSequences = Boolean(stop) && stop.length > 0;
    return {
        model: modelName,
        max_tokens: maxTokens ?? DEFAULT_MAX_TOKENS,
        messages: [{ role: "user", content: prompt }],
        ...(temperature !== undefined ? { temperature } : {}),
        ...(hasStopSequences ? { stop_sequences: stop } : {}),
        ...(system ? { system } : {}),
    };
}
|
|
173
|
+
/**
 * Join the text of every `text` content block, in order, with no separator.
 *
 * Blocks of any other type, and `text` blocks whose `text` is not a string,
 * are skipped. A missing or empty `content` yields the empty string.
 *
 * @param content Iterable of response content blocks, or a falsy value.
 * @returns The concatenated text.
 */
function extractText(content) {
    if (!content) {
        return "";
    }
    return [...content]
        .filter((block) => block.type === "text" && typeof block.text === "string")
        .map((block) => block.text)
        .join("");
}
|
|
189
|
+
/**
 * Map the response's `usage` object onto the port's token-count shape.
 *
 * @param usage Object carrying `input_tokens` / `output_tokens`, or nullish.
 * @returns `{ promptTokens, completionTokens }`; a count that is `null` or
 *   `undefined` (or a nullish `usage`) is reported as 0.
 */
function extractUsage(usage) {
    const inputTokens = usage?.input_tokens;
    const outputTokens = usage?.output_tokens;
    const promptTokens = inputTokens ?? 0;
    const completionTokens = outputTokens ?? 0;
    return { promptTokens, completionTokens };
}
|
|
195
|
+
/**
 * Strip a single ```json ... ``` or ``` ... ``` fence wrapper if present.
 * Anthropic occasionally wraps JSON despite the system instruction.
 *
 * The `json` language tag is matched case-insensitively, so a ```JSON fence
 * is unwrapped as well instead of leaving the tag inside the payload.
 *
 * @param text Raw model output.
 * @returns The fence's inner content, trimmed, when the whole (trimmed) input
 *   is one fenced block; otherwise the trimmed input unchanged.
 */
function stripJsonFence(text) {
    const trimmed = text.trim();
    // Anchored at both ends: only a fence that wraps the entire output counts.
    const fenceMatch = trimmed.match(/^```(?:json)?\s*([\s\S]*?)\s*```$/i);
    if (fenceMatch)
        return fenceMatch[1].trim();
    return trimmed;
}
|