@clinebot/llms 0.0.10 → 0.0.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -28,7 +28,7 @@ type AiSdkUsageMetrics = {
28
28
  export type EmitAiSdkStreamOptions = {
29
29
  responseId: string;
30
30
  errorMessage: string;
31
- calculateCost: (inputTokens: number, outputTokens: number, cacheReadTokens: number) => number | undefined;
31
+ calculateCost: (inputTokens: number, outputTokens: number, cacheReadTokens: number, cacheWriteTokens?: number) => number | undefined;
32
32
  reasoningTypes?: string[];
33
33
  enableToolCalls?: boolean;
34
34
  toolCallArgsOrder?: Array<"args" | "input">;
@@ -3,7 +3,7 @@
3
3
  *
4
4
  * Abstract base class that provides common functionality for all handlers.
5
5
  */
6
- import type { ApiHandler, ApiStream, ApiStreamUsageChunk, HandlerModelInfo, ProviderConfig } from "../types";
6
+ import type { ApiHandler, ApiStream, ApiStreamUsageChunk, HandlerModelInfo, ModelInfo, ProviderConfig } from "../types";
7
7
  import type { Message, ToolDefinition } from "../types/messages";
8
8
  import type { ApiStreamChunk } from "../types/stream";
9
9
  export declare const DEFAULT_REQUEST_HEADERS: Record<string, string>;
@@ -23,7 +23,8 @@ export declare abstract class BaseHandler implements ApiHandler {
23
23
  abort(): void;
24
24
  setAbortSignal(signal: AbortSignal | undefined): void;
25
25
  private logAbort;
26
- protected calculateCost(inputTokens: number, outputTokens: number, cacheReadTokens?: number): number | undefined;
26
+ protected supportsPromptCache(modelInfo?: ModelInfo): boolean;
27
+ protected calculateCost(inputTokens: number, outputTokens: number, cacheReadTokens?: number, cacheWriteTokens?: number): number | undefined;
27
28
  protected createResponseId(): string;
28
29
  protected withResponseId<T extends ApiStreamChunk>(chunk: T, responseId: string): T;
29
30
  protected withResponseIdForAll(chunks: Iterable<ApiStreamChunk>, responseId: string): Generator<ApiStreamChunk>;
@@ -47,7 +47,7 @@ export interface ApiStreamReasoningChunk {
47
47
  */
48
48
  export interface ApiStreamUsageChunk {
49
49
  type: "usage";
50
- /** Number of input tokens (excluding cached) */
50
+ /** Total number of input tokens reported by the provider */
51
51
  inputTokens: number;
52
52
  /** Number of output tokens */
53
53
  outputTokens: number;
package/package.json CHANGED
@@ -1,11 +1,11 @@
1
1
  {
2
2
  "name": "@clinebot/llms",
3
- "version": "0.0.10",
3
+ "version": "0.0.11",
4
4
  "description": "Config-driven SDK for selecting, extending, and instantiating LLM providers and models",
5
5
  "main": "./dist/index.js",
6
6
  "module": "./dist/index.js",
7
7
  "dependencies": {
8
- "@clinebot/shared": "0.0.10",
8
+ "@clinebot/shared": "0.0.11",
9
9
  "@ai-sdk/amazon-bedrock": "^4.0.67",
10
10
  "@ai-sdk/google-vertex": "^4.0.74",
11
11
  "@ai-sdk/mistral": "^3.0.24",
@@ -41,6 +41,7 @@ export type EmitAiSdkStreamOptions = {
41
41
  inputTokens: number,
42
42
  outputTokens: number,
43
43
  cacheReadTokens: number,
44
+ cacheWriteTokens?: number,
44
45
  ) => number | undefined;
45
46
  reasoningTypes?: string[];
46
47
  enableToolCalls?: boolean;
@@ -168,10 +169,7 @@ export async function* emitAiSdkStream(
168
169
 
169
170
  yield {
170
171
  type: "usage",
171
- inputTokens: Math.max(
172
- 0,
173
- usageMetrics.inputTokens - usageMetrics.cacheReadTokens,
174
- ),
172
+ inputTokens: usageMetrics.inputTokens,
175
173
  outputTokens: usageMetrics.outputTokens,
176
174
  thoughtsTokenCount: usageMetrics.thoughtsTokenCount,
177
175
  cacheReadTokens: usageMetrics.cacheReadTokens,
@@ -180,6 +178,7 @@ export async function* emitAiSdkStream(
180
178
  usageMetrics.inputTokens,
181
179
  usageMetrics.outputTokens,
182
180
  usageMetrics.cacheReadTokens,
181
+ usageMetrics.cacheWriteTokens,
183
182
  ),
184
183
  id: responseId,
185
184
  };
@@ -205,10 +204,7 @@ export async function* emitAiSdkStream(
205
204
  const usageMetrics = resolveUsageMetrics(usage);
206
205
  yield {
207
206
  type: "usage",
208
- inputTokens: Math.max(
209
- 0,
210
- usageMetrics.inputTokens - usageMetrics.cacheReadTokens,
211
- ),
207
+ inputTokens: usageMetrics.inputTokens,
212
208
  outputTokens: usageMetrics.outputTokens,
213
209
  thoughtsTokenCount: usageMetrics.thoughtsTokenCount,
214
210
  cacheReadTokens: usageMetrics.cacheReadTokens,
@@ -217,6 +213,7 @@ export async function* emitAiSdkStream(
217
213
  usageMetrics.inputTokens,
218
214
  usageMetrics.outputTokens,
219
215
  usageMetrics.cacheReadTokens,
216
+ usageMetrics.cacheWriteTokens,
220
217
  ),
221
218
  id: responseId,
222
219
  };
@@ -185,8 +185,18 @@ export abstract class AiSdkProviderHandler extends BaseHandler {
185
185
  yield* emitAiSdkStream(stream, {
186
186
  responseId,
187
187
  errorMessage: this.getStreamErrorMessage(),
188
- calculateCost: (inputTokens, outputTokens, cacheReadTokens) =>
189
- this.calculateCost(inputTokens, outputTokens, cacheReadTokens),
188
+ calculateCost: (
189
+ inputTokens,
190
+ outputTokens,
191
+ cacheReadTokens,
192
+ cacheWriteTokens,
193
+ ) =>
194
+ this.calculateCost(
195
+ inputTokens,
196
+ outputTokens,
197
+ cacheReadTokens,
198
+ cacheWriteTokens,
199
+ ),
190
200
  ...this.getEmitStreamOptions(),
191
201
  });
192
202
  }
@@ -0,0 +1,30 @@
1
+ import { describe, expect, it } from "vitest";
2
+ import { AnthropicHandler } from "./anthropic-base";
3
+
4
+ describe("AnthropicHandler prompt cache detection", () => {
5
+ it("enables prompt caching when model pricing includes cache pricing", () => {
6
+ const handler = new AnthropicHandler({
7
+ providerId: "anthropic",
8
+ modelId: "claude-sonnet-4-6",
9
+ apiKey: "test-key",
10
+ modelInfo: {
11
+ id: "claude-sonnet-4-6",
12
+ pricing: {
13
+ input: 3,
14
+ output: 15,
15
+ cacheRead: 0.3,
16
+ cacheWrite: 3.75,
17
+ },
18
+ },
19
+ });
20
+
21
+ const messages = handler.getMessages("system", [
22
+ { role: "user", content: "Tell me about this repo" },
23
+ ]);
24
+ const userTextBlock = messages[0]?.content?.[0] as
25
+ | { cache_control?: { type: string } }
26
+ | undefined;
27
+
28
+ expect(userTextBlock?.cache_control).toEqual({ type: "ephemeral" });
29
+ });
30
+ });
@@ -17,7 +17,6 @@ import {
17
17
  import {
18
18
  type ApiStream,
19
19
  type HandlerModelInfo,
20
- hasModelCapability,
21
20
  type ProviderConfig,
22
21
  supportsModelThinking,
23
22
  } from "../types";
@@ -76,10 +75,7 @@ export class AnthropicHandler extends BaseHandler {
76
75
  _systemPrompt: string,
77
76
  messages: Message[],
78
77
  ): Anthropic.MessageParam[] {
79
- const supportsPromptCache = hasModelCapability(
80
- this.getModel().info,
81
- "prompt-cache",
82
- );
78
+ const supportsPromptCache = this.supportsPromptCache(this.getModel().info);
83
79
  return convertToAnthropicMessages(
84
80
  messages,
85
81
  supportsPromptCache,
@@ -113,7 +109,7 @@ export class AnthropicHandler extends BaseHandler {
113
109
  const budgetTokens =
114
110
  thinkingSupported && requestedBudget > 0 ? requestedBudget : 0;
115
111
  const nativeToolsOn = tools && tools.length > 0;
116
- const supportsPromptCache = hasModelCapability(model.info, "prompt-cache");
112
+ const supportsPromptCache = this.supportsPromptCache(model.info);
117
113
  const reasoningOn = thinkingSupported && budgetTokens > 0;
118
114
  const debugThinking = isThinkingDebugEnabled();
119
115
  const debugChunkCounts: Record<string, number> = {};
@@ -139,30 +135,34 @@ export class AnthropicHandler extends BaseHandler {
139
135
  const requestOptions = { signal: abortSignal };
140
136
 
141
137
  // Create the request
138
+ // Use top-level automatic caching so the entire prefix (system +
139
+ // messages) is cached and the breakpoint advances each turn.
140
+ const createParams: Record<string, unknown> &
141
+ Anthropic.MessageCreateParamsStreaming = {
142
+ model: model.id,
143
+ thinking: reasoningOn
144
+ ? { type: "enabled", budget_tokens: budgetTokens }
145
+ : undefined,
146
+ max_tokens:
147
+ model.info.maxTokens ?? this.config.maxOutputTokens ?? 128_000,
148
+ temperature: reasoningOn ? undefined : 0,
149
+ system: [
150
+ supportsPromptCache
151
+ ? {
152
+ text: systemPrompt,
153
+ type: "text",
154
+ cache_control: { type: "ephemeral" },
155
+ }
156
+ : { text: systemPrompt, type: "text" },
157
+ ],
158
+ messages: anthropicMessages as Anthropic.MessageParam[],
159
+ stream: true,
160
+ tools: anthropicTools,
161
+ tool_choice: nativeToolsOn && !reasoningOn ? { type: "auto" } : undefined,
162
+ };
163
+
142
164
  const stream = await client.messages.create(
143
- {
144
- model: model.id,
145
- thinking: reasoningOn
146
- ? { type: "enabled", budget_tokens: budgetTokens }
147
- : undefined,
148
- max_tokens:
149
- model.info.maxTokens ?? this.config.maxOutputTokens ?? 128_000,
150
- temperature: reasoningOn ? undefined : 0,
151
- system: supportsPromptCache
152
- ? [
153
- {
154
- text: systemPrompt,
155
- type: "text",
156
- cache_control: { type: "ephemeral" },
157
- },
158
- ]
159
- : [{ text: systemPrompt, type: "text" }],
160
- messages: anthropicMessages as Anthropic.MessageParam[],
161
- stream: true,
162
- tools: anthropicTools,
163
- tool_choice:
164
- nativeToolsOn && !reasoningOn ? { type: "auto" } : undefined,
165
- },
165
+ createParams as Anthropic.MessageCreateParamsStreaming,
166
166
  requestOptions,
167
167
  );
168
168
 
@@ -244,6 +244,7 @@ export class AnthropicHandler extends BaseHandler {
244
244
  usageSnapshot.inputTokens,
245
245
  usageSnapshot.outputTokens,
246
246
  usageSnapshot.cacheReadTokens,
247
+ usageSnapshot.cacheWriteTokens,
247
248
  ),
248
249
  id: responseId,
249
250
  };
@@ -263,6 +264,7 @@ export class AnthropicHandler extends BaseHandler {
263
264
  usageSnapshot.inputTokens,
264
265
  usageSnapshot.outputTokens,
265
266
  usageSnapshot.cacheReadTokens,
267
+ usageSnapshot.cacheWriteTokens,
266
268
  ),
267
269
  id: responseId,
268
270
  };
@@ -15,8 +15,14 @@ class TestHandler extends BaseHandler {
15
15
  inputTokens: number,
16
16
  outputTokens: number,
17
17
  cacheReadTokens = 0,
18
+ cacheWriteTokens = 0,
18
19
  ): number | undefined {
19
- return this.calculateCost(inputTokens, outputTokens, cacheReadTokens);
20
+ return this.calculateCost(
21
+ inputTokens,
22
+ outputTokens,
23
+ cacheReadTokens,
24
+ cacheWriteTokens,
25
+ );
20
26
  }
21
27
 
22
28
  public exposeAbortSignal(): AbortSignal {
@@ -45,7 +51,7 @@ describe("BaseHandler.calculateCost", () => {
45
51
 
46
52
  const cost = handler.computeCost(1_000_000, 1_000_000, 100_000);
47
53
 
48
- expect(cost).toBeCloseTo(17.73, 6);
54
+ expect(cost).toBeCloseTo(18.03, 6);
49
55
  });
50
56
  });
51
57
 
@@ -10,6 +10,7 @@ import type {
10
10
  ApiStream,
11
11
  ApiStreamUsageChunk,
12
12
  HandlerModelInfo,
13
+ ModelInfo,
13
14
  ProviderConfig,
14
15
  } from "../types";
15
16
  import type { Message, ToolDefinition } from "../types/messages";
@@ -146,10 +147,26 @@ export abstract class BaseHandler implements ApiHandler {
146
147
  });
147
148
  }
148
149
 
150
+ protected supportsPromptCache(modelInfo?: ModelInfo): boolean {
151
+ const resolvedModelInfo =
152
+ modelInfo ??
153
+ this.config.modelInfo ??
154
+ this.config.knownModels?.[this.config.modelId];
155
+ const pricing = resolvedModelInfo?.pricing;
156
+
157
+ return (
158
+ resolvedModelInfo?.capabilities?.includes("prompt-cache") === true ||
159
+ this.config.capabilities?.includes("prompt-cache") === true ||
160
+ typeof pricing?.cacheRead === "number" ||
161
+ typeof pricing?.cacheWrite === "number"
162
+ );
163
+ }
164
+
149
165
  protected calculateCost(
150
166
  inputTokens: number,
151
167
  outputTokens: number,
152
168
  cacheReadTokens = 0,
169
+ cacheWriteTokens = 0,
153
170
  ): number | undefined {
154
171
  const pricing = (
155
172
  this.config.modelInfo ?? this.config.knownModels?.[this.config.modelId]
@@ -159,10 +176,14 @@ export abstract class BaseHandler implements ApiHandler {
159
176
  }
160
177
 
161
178
  return (
162
- ((inputTokens - cacheReadTokens) / 1_000_000) * pricing.input +
179
+ (inputTokens / 1_000_000) * pricing.input +
163
180
  (outputTokens / 1_000_000) * pricing.output +
164
181
  (cacheReadTokens > 0
165
182
  ? (cacheReadTokens / 1_000_000) * (pricing.cacheRead ?? 0)
183
+ : 0) +
184
+ (cacheWriteTokens > 0
185
+ ? (cacheWriteTokens / 1_000_000) *
186
+ (pricing.cacheWrite ?? pricing.input * 1.25)
166
187
  : 0)
167
188
  );
168
189
  }
@@ -216,7 +216,7 @@ export class BedrockHandler extends BaseHandler {
216
216
 
217
217
  yield {
218
218
  type: "usage",
219
- inputTokens: inputTokens - cacheReadTokens,
219
+ inputTokens,
220
220
  outputTokens,
221
221
  thoughtsTokenCount,
222
222
  cacheReadTokens,
@@ -245,7 +245,7 @@ export class BedrockHandler extends BaseHandler {
245
245
 
246
246
  yield {
247
247
  type: "usage",
248
- inputTokens: inputTokens - cacheReadTokens,
248
+ inputTokens,
249
249
  outputTokens,
250
250
  thoughtsTokenCount,
251
251
  cacheReadTokens,
@@ -115,6 +115,39 @@ describe("Community SDK handlers", () => {
115
115
  expect(usageChunk?.outputTokens).toBe(3);
116
116
  });
117
117
 
118
+ it("keeps cached input tokens separate from total input tokens", async () => {
119
+ streamTextSpy.mockReturnValue({
120
+ fullStream: makeStreamParts([
121
+ {
122
+ type: "finish",
123
+ usage: { inputTokens: 10, outputTokens: 3, cachedInputTokens: 4 },
124
+ },
125
+ ]),
126
+ });
127
+
128
+ const handler = new ClaudeCodeHandler({
129
+ providerId: "claude-code",
130
+ modelId: "sonnet",
131
+ });
132
+
133
+ const chunks: ApiStreamChunk[] = [];
134
+ for await (const chunk of handler.createMessage("System", [
135
+ { role: "user", content: "Hi" },
136
+ ])) {
137
+ chunks.push(chunk);
138
+ }
139
+
140
+ const usageChunk = chunks.find(
141
+ (chunk): chunk is Extract<ApiStreamChunk, { type: "usage" }> =>
142
+ chunk.type === "usage",
143
+ );
144
+ expect(usageChunk).toMatchObject({
145
+ inputTokens: 10,
146
+ outputTokens: 3,
147
+ cacheReadTokens: 4,
148
+ });
149
+ });
150
+
118
151
  it("uses a fallback model id when model is missing", () => {
119
152
  const handler = new ClaudeCodeHandler({
120
153
  providerId: "claude-code",
@@ -18,7 +18,6 @@ import {
18
18
  import {
19
19
  type ApiStream,
20
20
  type HandlerModelInfo,
21
- type ModelInfo,
22
21
  type ProviderConfig,
23
22
  supportsModelThinking,
24
23
  } from "../types";
@@ -258,7 +257,6 @@ export class GeminiHandler extends BaseHandler {
258
257
 
259
258
  // Yield final usage
260
259
  const totalCost = this.calculateGeminiCost(
261
- info,
262
260
  promptTokens,
263
261
  outputTokens,
264
262
  thoughtsTokenCount,
@@ -267,7 +265,7 @@ export class GeminiHandler extends BaseHandler {
267
265
 
268
266
  yield {
269
267
  type: "usage",
270
- inputTokens: promptTokens - cacheReadTokens,
268
+ inputTokens: promptTokens,
271
269
  outputTokens,
272
270
  thoughtsTokenCount,
273
271
  cacheReadTokens,
@@ -288,27 +286,16 @@ export class GeminiHandler extends BaseHandler {
288
286
  }
289
287
 
290
288
  private calculateGeminiCost(
291
- info: ModelInfo,
292
289
  inputTokens: number,
293
290
  outputTokens: number,
294
291
  thoughtsTokenCount: number,
295
292
  cacheReadTokens: number,
296
293
  ): number | undefined {
297
- const pricing = info.pricing;
298
- if (!pricing?.input || !pricing?.output) {
299
- return undefined;
300
- }
301
-
302
- const uncachedInputTokens = inputTokens - cacheReadTokens;
303
- const inputCost = pricing.input * (uncachedInputTokens / 1_000_000);
304
- const outputCost =
305
- pricing.output * ((outputTokens + thoughtsTokenCount) / 1_000_000);
306
- const cacheReadCost =
307
- cacheReadTokens > 0
308
- ? (pricing.cacheRead ?? 0) * (cacheReadTokens / 1_000_000)
309
- : 0;
310
-
311
- return inputCost + outputCost + cacheReadCost;
294
+ return this.calculateCost(
295
+ inputTokens,
296
+ outputTokens + thoughtsTokenCount,
297
+ cacheReadTokens,
298
+ );
312
299
  }
313
300
  }
314
301
 
@@ -22,7 +22,6 @@ import type {
22
22
  ModelInfo,
23
23
  ProviderConfig,
24
24
  } from "../types";
25
- import { hasModelCapability } from "../types";
26
25
  import type { Message, ToolDefinition } from "../types/messages";
27
26
  import { retryStream } from "../utils/retry";
28
27
  import { ToolCallProcessor } from "../utils/tool-processor";
@@ -108,9 +107,7 @@ export class OpenAIBaseHandler extends BaseHandler {
108
107
  messages: Message[],
109
108
  ): OpenAI.Chat.ChatCompletionMessageParam[] {
110
109
  const model = this.getModel();
111
- const supportsPromptCache =
112
- hasModelCapability(model.info, "prompt-cache") ||
113
- this.config.capabilities?.includes("prompt-cache") === true;
110
+ const supportsPromptCache = this.supportsPromptCache(model.info);
114
111
  const systemMessage = supportsPromptCache
115
112
  ? ({
116
113
  role: "system",
@@ -156,7 +153,8 @@ export class OpenAIBaseHandler extends BaseHandler {
156
153
  const openAiMessages = this.getMessages(systemPrompt, messages);
157
154
 
158
155
  // Build request options
159
- const requestOptions: OpenAI.ChatCompletionCreateParamsStreaming = {
156
+ const requestOptions: Record<string, unknown> &
157
+ OpenAI.ChatCompletionCreateParamsStreaming = {
160
158
  model: modelId,
161
159
  messages: openAiMessages,
162
160
  stream: true,
@@ -167,6 +165,17 @@ export class OpenAIBaseHandler extends BaseHandler {
167
165
  }),
168
166
  };
169
167
 
168
+ // Add top-level cache_control for OpenRouter with Anthropic models.
169
+ // This enables automatic caching where the cache breakpoint advances
170
+ // as the conversation grows, rather than relying on explicit per-block
171
+ // breakpoints which are limited to 4.
172
+ if (
173
+ this.config.providerId === "openrouter" &&
174
+ modelId.startsWith("anthropic/")
175
+ ) {
176
+ requestOptions.cache_control = { type: "ephemeral" };
177
+ }
178
+
170
179
  // Add max tokens if configured
171
180
  const maxTokens = modelInfo.maxTokens ?? this.config.maxOutputTokens;
172
181
  if (maxTokens) {
@@ -286,15 +295,16 @@ export class OpenAIBaseHandler extends BaseHandler {
286
295
  cached_tokens?: number;
287
296
  cache_write_tokens?: number;
288
297
  };
289
- prompt_cache_miss_tokens?: number;
290
298
  cache_creation_input_tokens?: number;
291
299
  cache_read_input_tokens?: number;
292
300
  };
293
301
  const cacheReadTokens =
294
- usageWithCache.prompt_tokens_details?.cached_tokens ?? 0;
302
+ usageWithCache.prompt_tokens_details?.cached_tokens ??
303
+ usageWithCache.cache_read_input_tokens ??
304
+ 0;
295
305
  const cacheWriteTokens =
296
306
  usageWithCache.prompt_tokens_details?.cache_write_tokens ??
297
- usageWithCache.prompt_cache_miss_tokens ??
307
+ usageWithCache.cache_creation_input_tokens ??
298
308
  0;
299
309
 
300
310
  yield {
@@ -307,6 +317,7 @@ export class OpenAIBaseHandler extends BaseHandler {
307
317
  inputTokens,
308
318
  outputTokens,
309
319
  cacheReadTokens,
320
+ cacheWriteTokens,
310
321
  ),
311
322
  id: responseId,
312
323
  };
@@ -210,4 +210,50 @@ describe("OpenAIResponsesHandler", () => {
210
210
  },
211
211
  });
212
212
  });
213
+
214
+ it("keeps cached input tokens separate in usage chunks", () => {
215
+ const handler = new TestOpenAIResponsesHandler({
216
+ providerId: "openai-native",
217
+ modelId: "gpt-5.4",
218
+ apiKey: "test-key",
219
+ baseUrl: "https://example.com",
220
+ modelInfo: {
221
+ id: "gpt-5.4",
222
+ pricing: {
223
+ input: 1,
224
+ output: 2,
225
+ cacheRead: 0.5,
226
+ },
227
+ },
228
+ });
229
+
230
+ const chunks = handler.processChunkForTest({
231
+ type: "response.completed",
232
+ response: {
233
+ id: "resp_usage",
234
+ usage: {
235
+ input_tokens: 100,
236
+ output_tokens: 40,
237
+ input_tokens_details: {
238
+ cached_tokens: 25,
239
+ },
240
+ output_tokens_details: {
241
+ reasoning_tokens: 10,
242
+ },
243
+ },
244
+ },
245
+ });
246
+
247
+ expect(chunks[0]).toMatchObject({
248
+ type: "usage",
249
+ inputTokens: 100,
250
+ outputTokens: 40,
251
+ cacheReadTokens: 25,
252
+ cacheWriteTokens: 0,
253
+ });
254
+ expect(chunks[0]?.type).toBe("usage");
255
+ if (chunks[0]?.type === "usage") {
256
+ expect(chunks[0].totalCost).toBeCloseTo(0.0001925, 10);
257
+ }
258
+ });
213
259
  });
@@ -565,23 +565,19 @@ export class OpenAIResponsesHandler extends BaseHandler {
565
565
  const inputTokens = usage.input_tokens || 0;
566
566
  const outputTokens = usage.output_tokens || 0;
567
567
  const cacheReadTokens =
568
- usage.output_tokens_details?.reasoning_tokens || 0;
569
- const cacheWriteTokens =
570
568
  usage.input_tokens_details?.cached_tokens || 0;
569
+ const cacheWriteTokens = 0;
571
570
 
572
571
  const totalCost = this.calculateCost(
573
572
  inputTokens,
574
573
  outputTokens,
575
574
  cacheReadTokens,
576
- );
577
- const nonCachedInputTokens = Math.max(
578
- 0,
579
- inputTokens - cacheReadTokens - cacheWriteTokens,
575
+ cacheWriteTokens,
580
576
  );
581
577
 
582
578
  yield {
583
579
  type: "usage",
584
- inputTokens: nonCachedInputTokens,
580
+ inputTokens,
585
581
  outputTokens,
586
582
  cacheWriteTokens,
587
583
  cacheReadTokens,
@@ -255,19 +255,18 @@ export class R1BaseHandler extends BaseHandler {
255
255
  const cacheReadTokens = r1Usage.prompt_cache_hit_tokens ?? 0;
256
256
  const cacheWriteTokens = r1Usage.prompt_cache_miss_tokens ?? 0;
257
257
 
258
- // Calculate non-cached input tokens (will always be 0 for DeepSeek since input = read + write)
259
- const nonCachedInputTokens = Math.max(
260
- 0,
261
- inputTokens - cacheReadTokens - cacheWriteTokens,
262
- );
263
-
264
258
  yield {
265
259
  type: "usage",
266
- inputTokens: nonCachedInputTokens,
260
+ inputTokens,
267
261
  outputTokens,
268
262
  cacheReadTokens,
269
263
  cacheWriteTokens,
270
- totalCost: this.calculateCost(inputTokens, outputTokens, cacheReadTokens),
264
+ totalCost: this.calculateCost(
265
+ inputTokens,
266
+ outputTokens,
267
+ cacheReadTokens,
268
+ cacheWriteTokens,
269
+ ),
271
270
  id: responseId,
272
271
  };
273
272
  }
@@ -189,7 +189,7 @@ export class VertexHandler extends BaseHandler {
189
189
  if (!isClaudeModel(model.id)) {
190
190
  return this.ensureGeminiHandler().getMessages(systemPrompt, messages);
191
191
  }
192
- const supportsPromptCache = hasModelCapability(model.info, "prompt-cache");
192
+ const supportsPromptCache = this.supportsPromptCache(model.info);
193
193
  return convertToAnthropicMessages(messages, supportsPromptCache);
194
194
  }
195
195
 
@@ -226,7 +226,7 @@ export class VertexHandler extends BaseHandler {
226
226
  const budgetTokens = this.config.thinkingBudgetTokens ?? 0;
227
227
  const reasoningOn =
228
228
  hasModelCapability(model.info, "reasoning") && budgetTokens > 0;
229
- const promptCacheOn = hasModelCapability(model.info, "prompt-cache");
229
+ const promptCacheOn = this.supportsPromptCache(model.info);
230
230
 
231
231
  const providerOptions: Record<string, unknown> = {};
232
232
  if (reasoningOn) {
@@ -251,8 +251,18 @@ export class VertexHandler extends BaseHandler {
251
251
  yield* emitAiSdkStream(stream, {
252
252
  responseId,
253
253
  errorMessage: "Vertex Anthropic stream failed",
254
- calculateCost: (inputTokens, outputTokens, cacheReadTokens) =>
255
- this.calculateCost(inputTokens, outputTokens, cacheReadTokens),
254
+ calculateCost: (
255
+ inputTokens,
256
+ outputTokens,
257
+ cacheReadTokens,
258
+ cacheWriteTokens,
259
+ ) =>
260
+ this.calculateCost(
261
+ inputTokens,
262
+ outputTokens,
263
+ cacheReadTokens,
264
+ cacheWriteTokens,
265
+ ),
256
266
  reasoningTypes: ["reasoning-delta"],
257
267
  enableToolCalls: true,
258
268
  toolCallArgsOrder: ["input", "args"],