@sanity/ailf 4.4.0 → 4.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_vendor/ailf-core/artifact-registry.d.ts +138 -1
- package/dist/_vendor/ailf-core/artifact-registry.js +137 -4
- package/dist/_vendor/ailf-core/ports/context.d.ts +18 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/ports/index.js +1 -0
- package/dist/_vendor/ailf-core/ports/llm-client.d.ts +112 -0
- package/dist/_vendor/ailf-core/ports/llm-client.js +68 -0
- package/dist/_vendor/ailf-core/types/confidence.d.ts +68 -0
- package/dist/_vendor/ailf-core/types/confidence.js +49 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/types/index.js +1 -0
- package/dist/adapters/llm/anthropic-llm-client.d.ts +48 -0
- package/dist/adapters/llm/anthropic-llm-client.js +205 -0
- package/dist/adapters/llm/fake-llm-client.d.ts +49 -0
- package/dist/adapters/llm/fake-llm-client.js +63 -0
- package/dist/adapters/llm/index.d.ts +9 -0
- package/dist/adapters/llm/index.js +4 -0
- package/dist/adapters/llm/openai-llm-client.d.ts +44 -0
- package/dist/adapters/llm/openai-llm-client.js +168 -0
- package/dist/adapters/llm/pricing.d.ts +12 -0
- package/dist/adapters/llm/pricing.js +8 -0
- package/dist/adapters/llm/retry.d.ts +56 -0
- package/dist/adapters/llm/retry.js +66 -0
- package/dist/adapters/task-sources/repo-schemas.d.ts +11 -11
- package/dist/artifact-capture/api-gateway-artifact-writer.js +2 -1
- package/dist/artifact-capture/batching-api-gateway-artifact-writer.js +2 -1
- package/dist/artifact-capture/gcs-artifact-writer.js +3 -1
- package/dist/artifact-capture/local-fs-artifact-writer.js +3 -1
- package/dist/commands/pipeline-action.js +7 -1
- package/dist/commands/run.d.ts +1 -0
- package/dist/commands/run.js +1 -0
- package/dist/composition-root.d.ts +23 -1
- package/dist/composition-root.js +47 -0
- package/package.json +3 -3
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared confidence contract for actionability-ladder emitters (D0049).
|
|
3
|
+
*
|
|
4
|
+
* Every confidence-emitting site in the actionability-ladder design set
|
|
5
|
+
* (per-document attribution ensemble, structured grader judgments,
|
|
6
|
+
* diagnosis cards, regression detection) emits the same abstract triple
|
|
7
|
+
* so consumers can reason about confidence uniformly across emitters.
|
|
8
|
+
*
|
|
9
|
+
* Bucket thresholds and the formula behind `level` are emitter-specific;
|
|
10
|
+
* the externally comparable behavior is the `level` enum. Consumers that
|
|
11
|
+
* need the underlying mechanic read `derivation` and can branch.
|
|
12
|
+
*/
|
|
13
|
+
/**
 * Conventional `derivation` identifiers for the seed set of emitters
 * named in D0049. Re-exported as a typed tuple so consumers and tests can
 * reference one source of truth instead of redeclaring the literals.
 *
 * Adding a new emitter does not require editing this list — `derivation`
 * is an open tag (see `ConfidenceDerivation`). The list is the
 * recommended starting set, not the universe.
 *
 * The array is frozen: it is shared by every importer, so an accidental
 * `push`/`sort` in one consumer must not silently change what the others
 * see. Copy it (`[...CONVENTIONAL_DERIVATIONS]`) before mutating.
 */
export const CONVENTIONAL_DERIVATIONS = Object.freeze([
    "ensemble-stdev",
    "ceiling-cross-check",
    "regression-gate",
    "card-type-specific",
]);
|
|
28
|
+
/**
 * Runtime shape check for a `Confidence` value.
 *
 * Intended for trust boundaries that cannot lean on a Zod schema: the
 * schema lives with each emitter (thresholds behind `level` are
 * emitter-specific), while the structural shape checked here is shared.
 *
 * @param {unknown} value - Candidate to inspect.
 * @returns {boolean} `true` when `value` is a non-null object whose `level`
 *   is "high", "medium" or "low", whose `signalsPresent` is a finite number,
 *   and whose `derivation` is a non-empty string.
 */
export function isConfidence(value) {
    if (value === null || typeof value !== "object") {
        return false;
    }
    const candidate = value;
    if (!["high", "medium", "low"].includes(candidate.level)) {
        return false;
    }
    // Number.isFinite is false for every non-number, NaN and ±Infinity.
    if (!Number.isFinite(candidate.signalsPresent)) {
        return false;
    }
    const { derivation } = candidate;
    return typeof derivation === "string" && derivation !== "";
}
|
|
@@ -30,6 +30,8 @@ export type { PipelineRequest, PipelineRequestCallback, PipelineRequestCallerExe
|
|
|
30
30
|
export type { PackageSurfaceConfig, PackageSurfaceEntry, } from "./package-surface.js";
|
|
31
31
|
export type { SymbolPreflightDeduction, SymbolPreflightFinding, SymbolPreflightReport, SymbolPreflightUnresolvedReason, } from "./symbol-preflight-report.js";
|
|
32
32
|
export { DEFAULT_PREFLIGHT_CODE_CORRECTNESS_WEIGHT, type PreflightRubricContext, type PreflightScoringConfig, } from "./preflight-scoring.js";
|
|
33
|
+
export type { Confidence, ConfidenceDerivation } from "./confidence.js";
|
|
34
|
+
export { CONVENTIONAL_DERIVATIONS, isConfidence } from "./confidence.js";
|
|
33
35
|
export type { ArtifactId, AssociationAxis, AssociationValues, Brand, EntryKey, Err, FixtureId, IdValidationError, NewReportId, Ok, ProviderId, PromptId, Result, ResultId, RubricId, RunFingerprint, RunId, SuiteId, TaskId, TaskSlug, TraceId, } from "./branded-ids.js";
|
|
34
36
|
export { err, fixtureId, generateRunId, ok, providerId, resultId, runId, suiteId, taskId, traceId, } from "./branded-ids.js";
|
|
35
37
|
export type { AgentHarnessTaskDefinition, ContentLakeAuthorableMode, ContentLakeAuthorableTask, CustomTaskDefinition, GeneralizedAssertionDefinition, GeneralizedDocRef, GeneralizedTaskDefinition, GeneralizedTemplatedAssertion, GeneralizedValueAssertion, IdDocRef, KnowledgeProbeTaskDefinition, LiteracyTaskDefinition, MCPServerTaskDefinition, PromptVars, PathDocRef, PerspectiveDocRef, ReservedPromptVarKey, RubricRef, SlugDocRef, TaskCommonFields, TaskDifficulty, TaskOptions, TaskProviderConfig, TaskStatus, } from "./generalized-task.js";
|
|
@@ -17,6 +17,7 @@ export { InMemoryPluginRegistry } from "./plugin-registry.js";
|
|
|
17
17
|
// the mode-specific version, they import from "./eval-mode-config.js".
|
|
18
18
|
export { evalModeType } from "./eval-mode-config.js";
|
|
19
19
|
export { DEFAULT_PREFLIGHT_CODE_CORRECTNESS_WEIGHT, } from "./preflight-scoring.js";
|
|
20
|
+
export { CONVENTIONAL_DERIVATIONS, isConfidence } from "./confidence.js";
|
|
20
21
|
export { err, fixtureId, generateRunId, ok, providerId, resultId, runId, suiteId, taskId, traceId, } from "./branded-ids.js";
|
|
21
22
|
// ---------------------------------------------------------------------------
|
|
22
23
|
// Comparison (Approach 2: structured comparison output)
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Anthropic adapter for the LLMClient port.
|
|
3
|
+
*
|
|
4
|
+
* Uses fetch() against the Messages API — same transport pattern as the
|
|
5
|
+
* existing grader. No SDK dependency. Centralizes retry, rate-limit handling,
|
|
6
|
+
* cost accounting, and per-call telemetry tagging via `context.feature`.
|
|
7
|
+
*
|
|
8
|
+
* Anthropic does not have a first-class JSON mode like OpenAI. For
|
|
9
|
+
* `completeStructured`, the adapter uses the API's top-level `system`
|
|
10
|
+
* field to instruct the model to return JSON only (top-level system is
|
|
11
|
+
* harder for user-controlled content in `prompt` to override than a
|
|
12
|
+
* user-turn prefix), then strips any surrounding ``` fences before
|
|
13
|
+
* parsing through the Zod schema (parse-don't-validate).
|
|
14
|
+
*
|
|
15
|
+
* Per `.claude/rules/eval-pipeline.md` and `.claude/rules/typescript.md`:
|
|
16
|
+
* the adapter never reads `process.env`. Typed constructor args only.
|
|
17
|
+
*/
|
|
18
|
+
import { type LLMClient, type LLMCompleteArgs, type LLMCompleteStructuredArgs, type LLMCompletion, type LLMStructuredCompletion, type Logger } from "../../_vendor/ailf-core/index.d.ts";
|
|
19
|
+
import type { ModelPricing } from "./pricing.js";
|
|
20
|
+
import { type RetryPolicy } from "./retry.js";
|
|
21
|
+
/** Constructor options for `AnthropicLLMClient`. Only `apiKey` is required. */
export interface AnthropicLLMClientOptions {
|
|
22
|
+
/** Anthropic API key; sent as the `x-api-key` request header. */
apiKey: string;
|
|
23
|
+
/** Optional override of the Messages endpoint URL that requests are POSTed to. */
baseUrl?: string;
|
|
24
|
+
/** Pricing keyed by canonical model id (without `anthropic:` prefix or `messages:` segment). */
|
|
25
|
+
pricing?: Record<string, ModelPricing>;
|
|
26
|
+
/** Partial retry policy; supplied fields override the adapter's default retry policy. */
retryPolicy?: Partial<RetryPolicy>;
|
|
27
|
+
/** Optional logger; receives per-call debug telemetry and unknown-pricing warnings. */
logger?: Logger;
|
|
28
|
+
/** Sleep function handed to the retry helper when provided (injectable for tests). */
sleep?: (ms: number) => Promise<void>;
|
|
29
|
+
/** Random source handed to the retry helper when provided (injectable for tests). */
rng?: () => number;
|
|
30
|
+
/** API version header. Default "2023-06-01" — matches the existing grader. */
|
|
31
|
+
apiVersion?: string;
|
|
32
|
+
}
|
|
33
|
+
export declare class AnthropicLLMClient implements LLMClient {
|
|
34
|
+
private readonly apiKey;
|
|
35
|
+
private readonly baseUrl;
|
|
36
|
+
private readonly apiVersion;
|
|
37
|
+
private readonly pricing;
|
|
38
|
+
private readonly retryPolicy;
|
|
39
|
+
private readonly logger?;
|
|
40
|
+
private readonly sleep?;
|
|
41
|
+
private readonly rng?;
|
|
42
|
+
constructor(options: AnthropicLLMClientOptions);
|
|
43
|
+
complete(args: LLMCompleteArgs): Promise<LLMCompletion>;
|
|
44
|
+
completeStructured<T>(args: LLMCompleteStructuredArgs<T>): Promise<LLMStructuredCompletion<T>>;
|
|
45
|
+
private callApi;
|
|
46
|
+
private computeCost;
|
|
47
|
+
private logTelemetry;
|
|
48
|
+
}
|
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Anthropic adapter for the LLMClient port.
|
|
3
|
+
*
|
|
4
|
+
* Uses fetch() against the Messages API — same transport pattern as the
|
|
5
|
+
* existing grader. No SDK dependency. Centralizes retry, rate-limit handling,
|
|
6
|
+
* cost accounting, and per-call telemetry tagging via `context.feature`.
|
|
7
|
+
*
|
|
8
|
+
* Anthropic does not have a first-class JSON mode like OpenAI. For
|
|
9
|
+
* `completeStructured`, the adapter uses the API's top-level `system`
|
|
10
|
+
* field to instruct the model to return JSON only (top-level system is
|
|
11
|
+
* harder for user-controlled content in `prompt` to override than a
|
|
12
|
+
* user-turn prefix), then strips any surrounding ``` fences before
|
|
13
|
+
* parsing through the Zod schema (parse-don't-validate).
|
|
14
|
+
*
|
|
15
|
+
* Per `.claude/rules/eval-pipeline.md` and `.claude/rules/typescript.md`:
|
|
16
|
+
* the adapter never reads `process.env`. Typed constructor args only.
|
|
17
|
+
*/
|
|
18
|
+
import { AnthropicResponseSchema, splitModelId, } from "../../_vendor/ailf-core/index.js";
|
|
19
|
+
import { DEFAULT_RETRY_POLICY, parseRetryAfterSeconds, runWithRetry, } from "./retry.js";
|
|
20
|
+
const DEFAULT_BASE_URL = "https://api.anthropic.com/v1/messages";
|
|
21
|
+
const DEFAULT_API_VERSION = "2023-06-01";
|
|
22
|
+
const DEFAULT_MAX_TOKENS = 4096;
|
|
23
|
+
/**
|
|
24
|
+
* Pricing reference: https://www.anthropic.com/pricing#api
|
|
25
|
+
* Update when models or vendor pricing changes.
|
|
26
|
+
*/
|
|
27
|
+
const DEFAULT_PRICING = {
|
|
28
|
+
"claude-opus-4-6": { inputPer1k: 0.015, outputPer1k: 0.075 },
|
|
29
|
+
"claude-opus-4-5-20251101": { inputPer1k: 0.015, outputPer1k: 0.075 },
|
|
30
|
+
"claude-sonnet-4-6": { inputPer1k: 0.003, outputPer1k: 0.015 },
|
|
31
|
+
};
|
|
32
|
+
const STRUCTURED_SYSTEM = "Respond with only a single JSON object that conforms to the requested schema. " +
|
|
33
|
+
"Do not include any prose, commentary, or markdown code fences. " +
|
|
34
|
+
"Return raw JSON only.";
|
|
35
|
+
/**
 * LLMClient adapter that talks to the Anthropic Messages API over `fetch`.
 *
 * All configuration arrives through the constructor; each call is wrapped in
 * `runWithRetry`, priced from the merged pricing table and, when a logger is
 * configured, reported as a debug telemetry line.
 */
export class AnthropicLLMClient {
    apiKey;
    baseUrl;
    apiVersion;
    pricing;
    retryPolicy;
    logger;
    sleep;
    rng;
    /**
     * @param options - See `AnthropicLLMClientOptions`. Caller-supplied
     *   `pricing` / `retryPolicy` entries override the built-in defaults.
     */
    constructor(options) {
        const { apiKey, baseUrl, apiVersion, pricing, retryPolicy, logger, sleep, rng } = options;
        this.apiKey = apiKey;
        this.baseUrl = baseUrl ?? DEFAULT_BASE_URL;
        this.apiVersion = apiVersion ?? DEFAULT_API_VERSION;
        // Spreading null/undefined contributes no keys, so defaults survive.
        this.pricing = { ...DEFAULT_PRICING, ...pricing };
        this.retryPolicy = { ...DEFAULT_RETRY_POLICY, ...retryPolicy };
        if (logger) {
            this.logger = logger;
        }
        if (sleep) {
            this.sleep = sleep;
        }
        if (rng) {
            this.rng = rng;
        }
    }
    /**
     * Free-text completion.
     *
     * @returns `{ text, usage, cost, model }` where `model` echoes `args.model`.
     * @throws {Error} When the response contains no text.
     */
    async complete(args) {
        const { modelName } = splitModelId(args.model);
        const requestBody = buildBody(modelName, args.prompt, {
            temperature: args.temperature,
            maxTokens: args.maxTokens,
            stop: args.stop,
        });
        const reply = await this.callApi(requestBody);
        const text = extractText(reply.content);
        if (text === "") {
            throw new Error(`Anthropic returned empty completion for model ${args.model}`);
        }
        const usage = extractUsage(reply.usage);
        const cost = this.computeCost(modelName, usage);
        this.logTelemetry(args.context, args.model, usage, cost);
        return { text, usage, cost, model: args.model };
    }
    /**
     * JSON completion validated by the caller's schema.
     *
     * The JSON-only instruction travels in the top-level `system` field; any
     * surrounding code fence is stripped before `JSON.parse`, and the parsed
     * value is then run through `args.schema.parse`.
     *
     * @returns `{ value, usage, cost, model }`.
     * @throws {Error} On an empty reply or unparseable JSON (original error
     *   attached as `cause`); whatever `args.schema.parse` throws propagates.
     */
    async completeStructured(args) {
        const { modelName } = splitModelId(args.model);
        const requestBody = buildBody(modelName, args.prompt, {
            temperature: args.temperature,
            maxTokens: args.maxTokens,
            system: STRUCTURED_SYSTEM,
        });
        const reply = await this.callApi(requestBody);
        const rawText = extractText(reply.content);
        if (rawText === "") {
            throw new Error(`Anthropic returned empty structured completion for model ${args.model}`);
        }
        const jsonText = stripJsonFence(rawText);
        let decoded;
        try {
            decoded = JSON.parse(jsonText);
        }
        catch (err) {
            const reason = err instanceof Error ? err.message : String(err);
            throw new Error(`Anthropic structured completion returned invalid JSON for model ${args.model}: ${reason}`, { cause: err });
        }
        const value = args.schema.parse(decoded);
        const usage = extractUsage(reply.usage);
        const cost = this.computeCost(modelName, usage);
        this.logTelemetry(args.context, args.model, usage, cost);
        return { value, usage, cost, model: args.model };
    }
    /**
     * POST `body` to the configured endpoint under the retry policy.
     *
     * A non-2xx response is reported to `runWithRetry` as a failure outcome
     * (status, body text, optional `retryAfterSeconds`); a 2xx response is
     * schema-parsed, and an embedded `error.message` is thrown.
     */
    async callApi(body) {
        const attempt = async () => {
            const response = await fetch(this.baseUrl, {
                method: "POST",
                headers: {
                    "x-api-key": this.apiKey,
                    "anthropic-version": this.apiVersion,
                    "Content-Type": "application/json",
                },
                body: JSON.stringify(body),
            });
            if (response.ok) {
                const payload = AnthropicResponseSchema.parse(await response.json());
                if (payload.error?.message) {
                    throw new Error(`Anthropic API error: ${payload.error.message}`);
                }
                return { ok: true, value: payload };
            }
            const failureText = await response.text();
            const retryAfter = parseRetryAfterSeconds(response.headers.get("retry-after"));
            const failure = {
                ok: false,
                status: response.status,
                body: failureText,
            };
            if (retryAfter !== undefined) {
                failure.retryAfterSeconds = retryAfter;
            }
            return failure;
        };
        // Only pass sleep/rng when injected so the retry helper keeps its own defaults.
        const retryArgs = { policy: this.retryPolicy };
        if (this.sleep) {
            retryArgs.sleep = this.sleep;
        }
        if (this.rng) {
            retryArgs.rng = this.rng;
        }
        retryArgs.attempt = attempt;
        return runWithRetry(retryArgs);
    }
    /**
     * Price a call in USD from the per-1K-token rates for `modelName`.
     * Unknown models are warned about (when a logger exists) and cost 0.
     */
    computeCost(modelName, usage) {
        const rate = this.pricing[modelName];
        if (rate) {
            const inputCost = (usage.promptTokens / 1000) * rate.inputPer1k;
            const outputCost = (usage.completionTokens / 1000) * rate.outputPer1k;
            return inputCost + outputCost;
        }
        this.logger?.warn(`Anthropic cost unknown for model "${modelName}" — recording cost=0. Add it to AnthropicLLMClientOptions.pricing.`);
        return 0;
    }
    /** Emit one debug line per call, tagged with whatever `context` fields are present. */
    logTelemetry(context, model, usage, cost) {
        if (!this.logger) {
            return;
        }
        const segments = ["LLM call (anthropic)"];
        if (context) {
            segments.push(` feature=${context.feature}`);
        }
        if (context?.runId) {
            segments.push(` runId=${context.runId}`);
        }
        if (context?.cardId) {
            segments.push(` cardId=${context.cardId}`);
        }
        segments.push(` model=${model}`, ` prompt_tokens=${usage.promptTokens}`, ` completion_tokens=${usage.completionTokens}`, ` cost_usd=${cost.toFixed(6)}`);
        this.logger.debug(segments.join(""));
    }
}
|
|
159
|
+
/**
 * Assemble the Messages API request payload for a single user-turn prompt.
 *
 * `max_tokens` is always sent (falling back to `DEFAULT_MAX_TOKENS`);
 * `temperature`, `stop_sequences` and `system` are only included when the
 * caller supplied a defined temperature, a non-empty stop list, or a truthy
 * system prompt respectively.
 *
 * @param modelName - Model id to put in the `model` field.
 * @param prompt - Content of the single user message.
 * @param opts - `{ temperature?, maxTokens?, stop?, system? }`.
 * @returns The request body object.
 */
function buildBody(modelName, prompt, opts) {
    const { temperature, maxTokens, stop, system } = opts;
    return {
        model: modelName,
        max_tokens: maxTokens ?? DEFAULT_MAX_TOKENS,
        messages: [{ role: "user", content: prompt }],
        // Spreading a falsy value adds no keys, so each field stays optional.
        ...(temperature !== undefined && { temperature }),
        ...(stop && stop.length > 0 && { stop_sequences: stop }),
        ...(system && { system }),
    };
}
|
|
173
|
+
/**
 * Join the `text` of every text-typed content block, in order.
 *
 * Responses may interleave `text` blocks with other block types (such as
 * `tool_use`); anything that is not a `text` block carrying a string is
 * skipped. A missing content list yields the empty string.
 *
 * @param content - Iterable of response content blocks, or a falsy value.
 * @returns The concatenated text ("" when there is none).
 */
function extractText(content) {
    if (!content) {
        return "";
    }
    let joined = "";
    for (const piece of content) {
        const isTextBlock = piece.type === "text" && typeof piece.text === "string";
        if (isTextBlock) {
            joined += piece.text;
        }
    }
    return joined;
}
|
|
189
|
+
/**
 * Map the Anthropic `usage` object onto the adapter's token-count shape.
 *
 * @param usage - Response usage with `input_tokens` / `output_tokens`; may be
 *   null or undefined.
 * @returns `{ promptTokens, completionTokens }`, each 0 when the source
 *   field is null or undefined.
 */
function extractUsage(usage) {
    const promptTokens = usage?.input_tokens ?? 0;
    const completionTokens = usage?.output_tokens ?? 0;
    return { promptTokens, completionTokens };
}
|
|
195
|
+
/**
 * Strip a single ```json ... ``` or ``` ... ``` fence wrapper if present.
 * Anthropic occasionally wraps JSON despite the system instruction.
 *
 * The `json` language tag is matched case-insensitively: with a
 * case-sensitive match a ```JSON fence would leave the literal tag at the
 * front of the payload and make the subsequent `JSON.parse` fail.
 *
 * @param {string} text - Raw model output.
 * @returns {string} The trimmed fence contents when the whole (trimmed)
 *   input is one fenced block; otherwise the trimmed input unchanged.
 */
function stripJsonFence(text) {
    const trimmed = text.trim();
    const fenceMatch = trimmed.match(/^```(?:json)?\s*([\s\S]*?)\s*```$/i);
    return fenceMatch ? fenceMatch[1].trim() : trimmed;
}
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Fake LLMClient for tests.
|
|
3
|
+
*
|
|
4
|
+
* Returns canned responses in insertion order and records each call so
|
|
5
|
+
* consumers' tests can assert on prompts, models, and telemetry context.
|
|
6
|
+
*
|
|
7
|
+
* Exported alongside the real adapters so consuming-feature tests can
|
|
8
|
+
* stub the LLM cleanly without a network round-trip.
|
|
9
|
+
*/
|
|
10
|
+
import type { LLMClient, LLMCompleteArgs, LLMCompleteStructuredArgs, LLMCompletion, LLMStructuredCompletion } from "../../_vendor/ailf-core/index.d.ts";
|
|
11
|
+
/** One entry of `FakeLLMClient.calls`, describing a single invocation of the fake. */
export interface FakeCallRecord {
|
|
12
|
+
/** Which client method was invoked. */
kind: "complete" | "completeStructured";
|
|
13
|
+
/** The `model` argument exactly as the caller passed it. */
model: string;
|
|
14
|
+
/** The `prompt` argument exactly as the caller passed it. */
prompt: string;
|
|
15
|
+
/** Telemetry context of the call; recorded only when the caller supplied one. */
context?: {
|
|
16
|
+
feature: string;
|
|
17
|
+
runId?: string;
|
|
18
|
+
cardId?: string;
|
|
19
|
+
};
|
|
20
|
+
}
|
|
21
|
+
/** Canned reply consumed by one `FakeLLMClient.complete` call. */
export interface FakeCompletionResponse {
|
|
22
|
+
/** Text returned to the caller as the completion. */
text: string;
|
|
23
|
+
/** Reported prompt token count; the fake reports 0 when omitted. */
promptTokens?: number;
|
|
24
|
+
/** Reported completion token count; the fake reports 0 when omitted. */
completionTokens?: number;
|
|
25
|
+
/** Reported cost; the fake reports 0 when omitted. */
cost?: number;
|
|
26
|
+
}
|
|
27
|
+
export interface FakeStructuredResponse {
|
|
28
|
+
/**
|
|
29
|
+
* Value returned to the consumer. Run through the consumer-supplied Zod
|
|
30
|
+
* schema in `completeStructured` so consumers exercise their parse-don't-
|
|
31
|
+
* validate path even with the fake; supply a value that matches the
|
|
32
|
+
* schema's shape (or supply a deliberately malformed one to test failure).
|
|
33
|
+
*/
|
|
34
|
+
value: unknown;
|
|
35
|
+
promptTokens?: number;
|
|
36
|
+
completionTokens?: number;
|
|
37
|
+
cost?: number;
|
|
38
|
+
}
|
|
39
|
+
export declare class FakeLLMClient implements LLMClient {
|
|
40
|
+
readonly calls: FakeCallRecord[];
|
|
41
|
+
private readonly completeQueue;
|
|
42
|
+
private readonly structuredQueue;
|
|
43
|
+
constructor(args?: {
|
|
44
|
+
completeResponses?: FakeCompletionResponse[];
|
|
45
|
+
structuredResponses?: FakeStructuredResponse[];
|
|
46
|
+
});
|
|
47
|
+
complete(args: LLMCompleteArgs): Promise<LLMCompletion>;
|
|
48
|
+
completeStructured<T>(args: LLMCompleteStructuredArgs<T>): Promise<LLMStructuredCompletion<T>>;
|
|
49
|
+
}
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
/**
 * In-memory LLMClient stand-in for tests.
 *
 * Canned responses are handed out first-in, first-out from two independent
 * queues (one per method), and every invocation is appended to `calls` so
 * tests can assert on prompts, models, and telemetry context without any
 * network round-trip. A call made after its queue is drained rejects.
 */
export class FakeLLMClient {
    calls = [];
    completeQueue;
    structuredQueue;
    /**
     * @param args - Optional `{ completeResponses?, structuredResponses? }`.
     *   Both lists are copied, so the caller's arrays are never mutated.
     */
    constructor(args = {}) {
        const { completeResponses, structuredResponses } = args;
        this.completeQueue = completeResponses == null ? [] : [...completeResponses];
        this.structuredQueue = structuredResponses == null ? [] : [...structuredResponses];
    }
    /**
     * Record the call, then return the next queued completion.
     *
     * @throws {Error} When no queued completion response remains (the call
     *   is still recorded).
     */
    async complete(args) {
        this.calls.push(toCallRecord("complete", args));
        const queued = this.completeQueue.shift();
        if (!queued) {
            throw new Error("FakeLLMClient: no more queued complete responses (call exceeded queue)");
        }
        return { text: queued.text, ...toAccounting(queued), model: args.model };
    }
    /**
     * Record the call, then return the next queued structured value after
     * passing it through `args.schema.parse`, so consumers exercise their
     * parse-don't-validate path even with the fake.
     *
     * @throws {Error} When no queued structured response remains (the call
     *   is still recorded); whatever `args.schema.parse` throws propagates.
     */
    async completeStructured(args) {
        this.calls.push(toCallRecord("completeStructured", args));
        const queued = this.structuredQueue.shift();
        if (!queued) {
            throw new Error("FakeLLMClient: no more queued structured responses (call exceeded queue)");
        }
        const value = args.schema.parse(queued.value);
        return { value, ...toAccounting(queued), model: args.model };
    }
}
/** Build the `calls` entry for one invocation; `context` is kept only when truthy. */
function toCallRecord(kind, callArgs) {
    const entry = { kind, model: callArgs.model, prompt: callArgs.prompt };
    if (callArgs.context) {
        entry.context = callArgs.context;
    }
    return entry;
}
/** Usage and cost fields of a canned response, defaulting omitted numbers to 0. */
function toAccounting(queued) {
    return {
        usage: {
            promptTokens: queued.promptTokens ?? 0,
            completionTokens: queued.completionTokens ?? 0,
        },
        cost: queued.cost ?? 0,
    };
}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
export { AnthropicLLMClient } from "./anthropic-llm-client.js";
|
|
2
|
+
export type { AnthropicLLMClientOptions } from "./anthropic-llm-client.js";
|
|
3
|
+
export { FakeLLMClient } from "./fake-llm-client.js";
|
|
4
|
+
export type { FakeCallRecord, FakeCompletionResponse, FakeStructuredResponse, } from "./fake-llm-client.js";
|
|
5
|
+
export { OpenAILLMClient } from "./openai-llm-client.js";
|
|
6
|
+
export type { OpenAILLMClientOptions } from "./openai-llm-client.js";
|
|
7
|
+
export type { ModelPricing } from "./pricing.js";
|
|
8
|
+
export { DEFAULT_RETRY_POLICY, LLMHttpError, isRetryableStatus, parseRetryAfterSeconds, runWithRetry, } from "./retry.js";
|
|
9
|
+
export type { RetryPolicy } from "./retry.js";
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
export { AnthropicLLMClient } from "./anthropic-llm-client.js";
|
|
2
|
+
export { FakeLLMClient } from "./fake-llm-client.js";
|
|
3
|
+
export { OpenAILLMClient } from "./openai-llm-client.js";
|
|
4
|
+
export { DEFAULT_RETRY_POLICY, LLMHttpError, isRetryableStatus, parseRetryAfterSeconds, runWithRetry, } from "./retry.js";
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* OpenAI adapter for the LLMClient port.
|
|
3
|
+
*
|
|
4
|
+
* Uses fetch() against the Chat Completions API — same transport pattern as
|
|
5
|
+
* the existing grader (`packages/eval/src/pipeline/grader-api.ts`). No SDK
|
|
6
|
+
* dependency. Centralizes retry, rate-limit handling, cost accounting, and
|
|
7
|
+
* per-call telemetry tagging via `context.feature`.
|
|
8
|
+
*
|
|
9
|
+
* Per `.claude/rules/eval-pipeline.md` and `.claude/rules/typescript.md`:
|
|
10
|
+
* the adapter never reads `process.env`. The composition root maps env vars
|
|
11
|
+
* to typed constructor args.
|
|
12
|
+
*/
|
|
13
|
+
import { type LLMClient, type LLMCompleteArgs, type LLMCompleteStructuredArgs, type LLMCompletion, type LLMStructuredCompletion, type Logger } from "../../_vendor/ailf-core/index.d.ts";
|
|
14
|
+
import type { ModelPricing } from "./pricing.js";
|
|
15
|
+
import { type RetryPolicy } from "./retry.js";
|
|
16
|
+
export interface OpenAILLMClientOptions {
|
|
17
|
+
/** OpenAI API key. */
|
|
18
|
+
apiKey: string;
|
|
19
|
+
/** Optional override of the chat-completions endpoint. */
|
|
20
|
+
baseUrl?: string;
|
|
21
|
+
/** Pricing table keyed by canonical model id (without the `openai:` prefix). */
|
|
22
|
+
pricing?: Record<string, ModelPricing>;
|
|
23
|
+
retryPolicy?: Partial<RetryPolicy>;
|
|
24
|
+
logger?: Logger;
|
|
25
|
+
/** Sleep injectable for tests. */
|
|
26
|
+
sleep?: (ms: number) => Promise<void>;
|
|
27
|
+
/** Random source for jitter — injectable for tests. */
|
|
28
|
+
rng?: () => number;
|
|
29
|
+
}
|
|
30
|
+
export declare class OpenAILLMClient implements LLMClient {
|
|
31
|
+
private readonly apiKey;
|
|
32
|
+
private readonly baseUrl;
|
|
33
|
+
private readonly pricing;
|
|
34
|
+
private readonly retryPolicy;
|
|
35
|
+
private readonly logger?;
|
|
36
|
+
private readonly sleep?;
|
|
37
|
+
private readonly rng?;
|
|
38
|
+
constructor(options: OpenAILLMClientOptions);
|
|
39
|
+
complete(args: LLMCompleteArgs): Promise<LLMCompletion>;
|
|
40
|
+
completeStructured<T>(args: LLMCompleteStructuredArgs<T>): Promise<LLMStructuredCompletion<T>>;
|
|
41
|
+
private callApi;
|
|
42
|
+
private computeCost;
|
|
43
|
+
private logTelemetry;
|
|
44
|
+
}
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* OpenAI adapter for the LLMClient port.
|
|
3
|
+
*
|
|
4
|
+
* Uses fetch() against the Chat Completions API — same transport pattern as
|
|
5
|
+
* the existing grader (`packages/eval/src/pipeline/grader-api.ts`). No SDK
|
|
6
|
+
* dependency. Centralizes retry, rate-limit handling, cost accounting, and
|
|
7
|
+
* per-call telemetry tagging via `context.feature`.
|
|
8
|
+
*
|
|
9
|
+
* Per `.claude/rules/eval-pipeline.md` and `.claude/rules/typescript.md`:
|
|
10
|
+
* the adapter never reads `process.env`. The composition root maps env vars
|
|
11
|
+
* to typed constructor args.
|
|
12
|
+
*/
|
|
13
|
+
import { OpenAIChatResponseSchema, splitModelId, } from "../../_vendor/ailf-core/index.js";
|
|
14
|
+
import { DEFAULT_RETRY_POLICY, parseRetryAfterSeconds, runWithRetry, } from "./retry.js";
|
|
15
|
+
const DEFAULT_BASE_URL = "https://api.openai.com/v1/chat/completions";
|
|
16
|
+
/**
|
|
17
|
+
* Conservative defaults for the models in `packages/eval/config/models.ts`.
|
|
18
|
+
* Update when models are added or vendor pricing changes. Unknown models
|
|
19
|
+
* fall through to `cost: 0` with a warning logged.
|
|
20
|
+
*
|
|
21
|
+
* Pricing reference: https://openai.com/api/pricing
|
|
22
|
+
*/
|
|
23
|
+
const DEFAULT_PRICING = {
|
|
24
|
+
"gpt-5.2": { inputPer1k: 0.005, outputPer1k: 0.015 },
|
|
25
|
+
"gpt-5": { inputPer1k: 0.005, outputPer1k: 0.015 },
|
|
26
|
+
"gpt-5.4": { inputPer1k: 0.005, outputPer1k: 0.015 },
|
|
27
|
+
};
|
|
28
|
+
/**
 * LLMClient adapter that talks to the OpenAI Chat Completions API over `fetch`.
 *
 * All configuration arrives through the constructor; each call is wrapped in
 * `runWithRetry`, priced from the merged pricing table and, when a logger is
 * configured, reported as a debug telemetry line.
 */
export class OpenAILLMClient {
    apiKey;
    baseUrl;
    pricing;
    retryPolicy;
    logger;
    sleep;
    rng;
    /**
     * @param options - See `OpenAILLMClientOptions`. Caller-supplied
     *   `pricing` / `retryPolicy` entries override the built-in defaults.
     */
    constructor(options) {
        const { apiKey, baseUrl, pricing, retryPolicy, logger, sleep, rng } = options;
        this.apiKey = apiKey;
        this.baseUrl = baseUrl ?? DEFAULT_BASE_URL;
        // Spreading null/undefined contributes no keys, so defaults survive.
        this.pricing = { ...DEFAULT_PRICING, ...pricing };
        this.retryPolicy = { ...DEFAULT_RETRY_POLICY, ...retryPolicy };
        if (logger) {
            this.logger = logger;
        }
        if (sleep) {
            this.sleep = sleep;
        }
        if (rng) {
            this.rng = rng;
        }
    }
    /**
     * Free-text completion taken from the first choice's message content.
     *
     * @returns `{ text, usage, cost, model }` where `model` echoes `args.model`.
     * @throws {Error} When the content is missing or empty.
     */
    async complete(args) {
        const { modelName } = splitModelId(args.model);
        const requestBody = buildBody(modelName, args.prompt, {
            temperature: args.temperature,
            maxTokens: args.maxTokens,
            stop: args.stop,
        });
        const reply = await this.callApi(requestBody);
        // Missing and empty content are treated alike.
        const text = reply.choices?.[0]?.message?.content ?? "";
        if (text === "") {
            throw new Error(`OpenAI returned empty completion for model ${args.model}`);
        }
        const usage = extractUsage(reply.usage);
        const cost = this.computeCost(modelName, usage);
        this.logTelemetry(args.context, args.model, usage, cost);
        return { text, usage, cost, model: args.model };
    }
    /**
     * JSON completion validated by the caller's schema.
     *
     * Requests `response_format: { type: "json_object" }`, parses the first
     * choice's content with `JSON.parse`, then runs it through
     * `args.schema.parse`.
     *
     * @returns `{ value, usage, cost, model }`.
     * @throws {Error} On missing/empty content or unparseable JSON (original
     *   error attached as `cause`); whatever `args.schema.parse` throws
     *   propagates.
     */
    async completeStructured(args) {
        const { modelName } = splitModelId(args.model);
        const requestBody = buildBody(modelName, args.prompt, {
            temperature: args.temperature,
            maxTokens: args.maxTokens,
            responseFormat: { type: "json_object" },
        });
        const reply = await this.callApi(requestBody);
        const rawText = reply.choices?.[0]?.message?.content ?? "";
        if (rawText === "") {
            throw new Error(`OpenAI returned empty structured completion for model ${args.model}`);
        }
        let decoded;
        try {
            decoded = JSON.parse(rawText);
        }
        catch (err) {
            const reason = err instanceof Error ? err.message : String(err);
            throw new Error(`OpenAI structured completion returned invalid JSON for model ${args.model}: ${reason}`, { cause: err });
        }
        const value = args.schema.parse(decoded);
        const usage = extractUsage(reply.usage);
        const cost = this.computeCost(modelName, usage);
        this.logTelemetry(args.context, args.model, usage, cost);
        return { value, usage, cost, model: args.model };
    }
    /**
     * POST `body` to the configured endpoint under the retry policy.
     *
     * A non-2xx response is reported to `runWithRetry` as a failure outcome
     * (status, body text, optional `retryAfterSeconds`); a 2xx response is
     * schema-parsed, and an embedded `error.message` is thrown.
     */
    async callApi(body) {
        const attempt = async () => {
            const response = await fetch(this.baseUrl, {
                method: "POST",
                headers: {
                    Authorization: `Bearer ${this.apiKey}`,
                    "Content-Type": "application/json",
                },
                body: JSON.stringify(body),
            });
            if (response.ok) {
                const payload = OpenAIChatResponseSchema.parse(await response.json());
                if (payload.error?.message) {
                    throw new Error(`OpenAI API error: ${payload.error.message}`);
                }
                return { ok: true, value: payload };
            }
            const failureText = await response.text();
            const retryAfter = parseRetryAfterSeconds(response.headers.get("retry-after"));
            const failure = {
                ok: false,
                status: response.status,
                body: failureText,
            };
            if (retryAfter !== undefined) {
                failure.retryAfterSeconds = retryAfter;
            }
            return failure;
        };
        // Only pass sleep/rng when injected so the retry helper keeps its own defaults.
        const retryArgs = { policy: this.retryPolicy };
        if (this.sleep) {
            retryArgs.sleep = this.sleep;
        }
        if (this.rng) {
            retryArgs.rng = this.rng;
        }
        retryArgs.attempt = attempt;
        return runWithRetry(retryArgs);
    }
    /**
     * Price a call in USD from the per-1K-token rates for `modelName`.
     * Unknown models are warned about (when a logger exists) and cost 0.
     */
    computeCost(modelName, usage) {
        const rate = this.pricing[modelName];
        if (rate) {
            const inputCost = (usage.promptTokens / 1000) * rate.inputPer1k;
            const outputCost = (usage.completionTokens / 1000) * rate.outputPer1k;
            return inputCost + outputCost;
        }
        this.logger?.warn(`OpenAI cost unknown for model "${modelName}" — recording cost=0. Add it to OpenAILLMClientOptions.pricing.`);
        return 0;
    }
    /** Emit one debug line per call, tagged with whatever `context` fields are present. */
    logTelemetry(context, model, usage, cost) {
        if (!this.logger) {
            return;
        }
        const segments = ["LLM call (openai)"];
        if (context) {
            segments.push(` feature=${context.feature}`);
        }
        if (context?.runId) {
            segments.push(` runId=${context.runId}`);
        }
        if (context?.cardId) {
            segments.push(` cardId=${context.cardId}`);
        }
        segments.push(` model=${model}`, ` prompt_tokens=${usage.promptTokens}`, ` completion_tokens=${usage.completionTokens}`, ` cost_usd=${cost.toFixed(6)}`);
        this.logger.debug(segments.join(""));
    }
}
|
|
148
|
+
/**
 * Assemble the Chat Completions request payload for a single user-turn prompt.
 *
 * Optional fields are only included when the caller supplied them: a defined
 * temperature, a defined token cap, a non-empty stop list, a response format.
 *
 * The token cap is sent as `max_completion_tokens`. The older `max_tokens`
 * field is deprecated by OpenAI and rejected by its newer model families
 * (including the gpt-5 models this adapter ships default pricing for), so a
 * caller passing `maxTokens` would otherwise get a 400 from the API.
 * NOTE(review): on reasoning models this cap also counts reasoning tokens;
 * a non-OpenAI endpoint configured via `baseUrl` may still expect
 * `max_tokens` — confirm if such endpoints are in use.
 *
 * @param modelName - Model id to put in the `model` field.
 * @param prompt - Content of the single user message.
 * @param opts - `{ temperature?, maxTokens?, stop?, responseFormat? }`.
 * @returns The request body object.
 */
function buildBody(modelName, prompt, opts) {
    const body = {
        model: modelName,
        messages: [{ role: "user", content: prompt }],
    };
    if (opts.temperature !== undefined) {
        body.temperature = opts.temperature;
    }
    if (opts.maxTokens !== undefined) {
        body.max_completion_tokens = opts.maxTokens;
    }
    if (opts.stop && opts.stop.length > 0) {
        body.stop = opts.stop;
    }
    if (opts.responseFormat) {
        body.response_format = opts.responseFormat;
    }
    return body;
}
|
|
163
|
+
/**
 * Map the OpenAI `usage` object onto the adapter's token-count shape.
 *
 * @param usage - Response usage with `prompt_tokens` / `completion_tokens`;
 *   may be null or undefined.
 * @returns `{ promptTokens, completionTokens }`, each 0 when the source
 *   field is null or undefined.
 */
function extractUsage(usage) {
    const promptTokens = usage?.prompt_tokens ?? 0;
    const completionTokens = usage?.completion_tokens ?? 0;
    return { promptTokens, completionTokens };
}
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Per-model pricing types shared by LLM adapters.
|
|
3
|
+
*
|
|
4
|
+
* Hard-coded vendor pricing drifts; treat the in-adapter defaults as a
|
|
5
|
+
* sensible starting point and override via constructor options when the
|
|
6
|
+
* vendor changes their rate card.
|
|
7
|
+
*/
|
|
8
|
+
/** USD per 1K tokens. */
|
|
9
|
+
export interface ModelPricing {
|
|
10
|
+
inputPer1k: number;
|
|
11
|
+
outputPer1k: number;
|
|
12
|
+
}
|