@clinebot/llms 0.0.7 → 0.0.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33):
  1. package/dist/index.browser.d.ts +2 -2
  2. package/dist/index.browser.js +40 -1
  3. package/dist/index.d.ts +2 -2
  4. package/dist/index.js +12 -12
  5. package/dist/providers/handlers/ai-sdk-community.d.ts +1 -1
  6. package/dist/providers/handlers/base.d.ts +5 -29
  7. package/dist/providers/transform/openai-format.d.ts +1 -1
  8. package/dist/providers/types/config.d.ts +6 -0
  9. package/dist/providers/types/stream.d.ts +1 -1
  10. package/package.json +2 -1
  11. package/src/index.browser.ts +2 -2
  12. package/src/index.ts +2 -2
  13. package/src/models/providers/vercel-ai-gateway.ts +1 -1
  14. package/src/providers/handlers/ai-sdk-community.ts +5 -8
  15. package/src/providers/handlers/ai-sdk-provider-base.ts +12 -2
  16. package/src/providers/handlers/anthropic-base.test.ts +30 -0
  17. package/src/providers/handlers/anthropic-base.ts +43 -30
  18. package/src/providers/handlers/base.test.ts +68 -3
  19. package/src/providers/handlers/base.ts +104 -54
  20. package/src/providers/handlers/bedrock-base.ts +3 -3
  21. package/src/providers/handlers/community-sdk.test.ts +33 -0
  22. package/src/providers/handlers/gemini-base.test.ts +40 -0
  23. package/src/providers/handlers/gemini-base.ts +22 -20
  24. package/src/providers/handlers/openai-base.ts +67 -12
  25. package/src/providers/handlers/openai-responses.test.ts +46 -0
  26. package/src/providers/handlers/openai-responses.ts +3 -7
  27. package/src/providers/handlers/r1-base.ts +7 -8
  28. package/src/providers/handlers/vertex.ts +15 -5
  29. package/src/providers/transform/anthropic-format.ts +14 -2
  30. package/src/providers/transform/format-conversion.test.ts +49 -0
  31. package/src/providers/transform/openai-format.ts +50 -7
  32. package/src/providers/types/config.ts +8 -0
  33. package/src/providers/types/stream.ts +1 -1
@@ -10,6 +10,7 @@ import type {
10
10
  ApiStream,
11
11
  ApiStreamUsageChunk,
12
12
  HandlerModelInfo,
13
+ ModelInfo,
13
14
  ProviderConfig,
14
15
  } from "../types";
15
16
  import type { Message, ToolDefinition } from "../types/messages";
@@ -22,37 +23,44 @@ export const DEFAULT_REQUEST_HEADERS: Record<string, string> = {
22
23
  "X-CLIENT-TYPE": "cline-sdk",
23
24
  };
24
25
 
26
+ const controllerIds = new WeakMap<AbortController, string>();
27
+ let controllerIdCounter = 0;
28
+
29
+ function getControllerId(controller: AbortController): string {
30
+ let id = controllerIds.get(controller);
31
+ if (!id) {
32
+ id = `abort_${++controllerIdCounter}`;
33
+ controllerIds.set(controller, id);
34
+ }
35
+ return id;
36
+ }
37
+
38
+ function serializeAbortReason(reason: unknown): unknown {
39
+ return reason instanceof Error
40
+ ? { name: reason.name, message: reason.message }
41
+ : reason;
42
+ }
43
+
25
44
  /**
26
45
  * Base handler class with common functionality
27
46
  */
28
47
  export abstract class BaseHandler implements ApiHandler {
29
48
  protected config: ProviderConfig;
30
49
  protected abortController: AbortController | undefined;
50
+ private abortSignalSequence = 0;
31
51
 
32
52
  constructor(config: ProviderConfig) {
33
53
  this.config = config;
34
54
  }
35
55
 
36
- /**
37
- * Convert Cline messages to provider-specific format
38
- * Must be implemented by subclasses
39
- */
40
56
  abstract getMessages(systemPrompt: string, messages: Message[]): unknown;
41
57
 
42
- /**
43
- * Create a streaming message completion
44
- * Must be implemented by subclasses
45
- */
46
58
  abstract createMessage(
47
59
  systemPrompt: string,
48
60
  messages: Message[],
49
61
  tools?: ToolDefinition[],
50
62
  ): ApiStream;
51
63
 
52
- /**
53
- * Get the current model configuration
54
- * Can be overridden by subclasses for provider-specific logic
55
- */
56
64
  getModel(): HandlerModelInfo {
57
65
  const modelId = this.config.modelId;
58
66
  return {
@@ -61,43 +69,55 @@ export abstract class BaseHandler implements ApiHandler {
61
69
  };
62
70
  }
63
71
 
64
- /**
65
- * Get usage information (optional)
66
- * Override in subclasses that support this
67
- */
68
72
  async getApiStreamUsage(): Promise<ApiStreamUsageChunk | undefined> {
69
73
  return undefined;
70
74
  }
71
75
 
72
- /**
73
- * Get the abort signal for the current request
74
- * Creates a new AbortController if one doesn't exist or was already aborted
75
- * Combines with config.abortSignal if provided
76
- */
77
76
  protected getAbortSignal(): AbortSignal {
78
- // Create a new controller if needed
79
- if (!this.abortController || this.abortController.signal.aborted) {
80
- this.abortController = new AbortController();
81
- }
82
-
83
- // If a signal was provided in config, chain it
84
- if (this.config.abortSignal) {
85
- const configSignal = this.config.abortSignal;
77
+ const controller = new AbortController();
78
+ this.abortController = controller;
79
+ controller.signal.addEventListener(
80
+ "abort",
81
+ () => {
82
+ if (this.abortController === controller) {
83
+ this.abortController = undefined;
84
+ }
85
+ },
86
+ { once: true },
87
+ );
88
+
89
+ const configSignal = this.config.abortSignal;
90
+ if (configSignal) {
86
91
  if (configSignal.aborted) {
87
- this.abortController.abort(configSignal.reason);
92
+ this.logAbort("debug", "Provider request inherited aborted signal", {
93
+ controllerId: getControllerId(controller),
94
+ reason: serializeAbortReason(configSignal.reason),
95
+ });
96
+ controller.abort(configSignal.reason);
88
97
  } else {
89
- configSignal.addEventListener("abort", () => {
90
- this.abortController?.abort(configSignal.reason);
98
+ const signalId = ++this.abortSignalSequence;
99
+ configSignal.addEventListener(
100
+ "abort",
101
+ () => {
102
+ this.logAbort("warn", "Provider request abort signal fired", {
103
+ controllerId: getControllerId(controller),
104
+ signalId,
105
+ reason: serializeAbortReason(configSignal.reason),
106
+ });
107
+ controller.abort(configSignal.reason);
108
+ },
109
+ { once: true },
110
+ );
111
+ this.logAbort("debug", "Provider request attached abort signal", {
112
+ controllerId: getControllerId(controller),
113
+ signalId,
91
114
  });
92
115
  }
93
116
  }
94
117
 
95
- return this.abortController.signal;
118
+ return controller.signal;
96
119
  }
97
120
 
98
- /**
99
- * Abort the current request
100
- */
101
121
  abort(): void {
102
122
  this.abortController?.abort();
103
123
  }
@@ -105,37 +125,67 @@ export abstract class BaseHandler implements ApiHandler {
105
125
  setAbortSignal(signal: AbortSignal | undefined): void {
106
126
  this.config.abortSignal = signal;
107
127
  if (signal?.aborted) {
128
+ this.logAbort("debug", "Provider handler received pre-aborted signal", {
129
+ controllerId: this.abortController
130
+ ? getControllerId(this.abortController)
131
+ : undefined,
132
+ reason: serializeAbortReason(signal.reason),
133
+ });
108
134
  this.abortController?.abort(signal.reason);
109
135
  }
110
136
  }
111
137
 
112
- /**
113
- * Helper to calculate cost from usage
114
- */
138
+ private logAbort(
139
+ level: "debug" | "warn",
140
+ message: string,
141
+ metadata?: Record<string, unknown>,
142
+ ): void {
143
+ this.config.logger?.[level]?.(message, {
144
+ providerId: this.config.providerId,
145
+ modelId: this.config.modelId,
146
+ ...metadata,
147
+ });
148
+ }
149
+
150
+ protected supportsPromptCache(modelInfo?: ModelInfo): boolean {
151
+ const resolvedModelInfo =
152
+ modelInfo ??
153
+ this.config.modelInfo ??
154
+ this.config.knownModels?.[this.config.modelId];
155
+ const pricing = resolvedModelInfo?.pricing;
156
+
157
+ return (
158
+ resolvedModelInfo?.capabilities?.includes("prompt-cache") === true ||
159
+ this.config.capabilities?.includes("prompt-cache") === true ||
160
+ typeof pricing?.cacheRead === "number" ||
161
+ typeof pricing?.cacheWrite === "number"
162
+ );
163
+ }
164
+
115
165
  protected calculateCost(
116
166
  inputTokens: number,
117
167
  outputTokens: number,
118
168
  cacheReadTokens = 0,
169
+ cacheWriteTokens = 0,
119
170
  ): number | undefined {
120
- const modelPricingSource =
121
- this.config.modelInfo ??
122
- (this.config.modelId
123
- ? this.config.knownModels?.[this.config.modelId]
124
- : undefined);
125
- const pricing = modelPricingSource?.pricing;
171
+ const pricing = (
172
+ this.config.modelInfo ?? this.config.knownModels?.[this.config.modelId]
173
+ )?.pricing;
126
174
  if (!pricing?.input || !pricing?.output) {
127
175
  return undefined;
128
176
  }
129
177
 
130
- const uncachedInputTokens = inputTokens - cacheReadTokens;
131
- const inputCost = (uncachedInputTokens / 1_000_000) * pricing.input;
132
- const outputCost = (outputTokens / 1_000_000) * pricing.output;
133
- const cacheReadCost =
134
- cacheReadTokens > 0
178
+ return (
179
+ (inputTokens / 1_000_000) * pricing.input +
180
+ (outputTokens / 1_000_000) * pricing.output +
181
+ (cacheReadTokens > 0
135
182
  ? (cacheReadTokens / 1_000_000) * (pricing.cacheRead ?? 0)
136
- : 0;
137
-
138
- return inputCost + outputCost + cacheReadCost;
183
+ : 0) +
184
+ (cacheWriteTokens > 0
185
+ ? (cacheWriteTokens / 1_000_000) *
186
+ (pricing.cacheWrite ?? pricing.input * 1.25)
187
+ : 0)
188
+ );
139
189
  }
140
190
 
141
191
  protected createResponseId(): string {
@@ -154,7 +204,7 @@ export abstract class BaseHandler implements ApiHandler {
154
204
  responseId: string,
155
205
  ): Generator<ApiStreamChunk> {
156
206
  for (const chunk of chunks) {
157
- yield this.withResponseId(chunk, responseId);
207
+ yield { ...chunk, id: responseId };
158
208
  }
159
209
  }
160
210
 
@@ -143,7 +143,7 @@ export class BedrockHandler extends BaseHandler {
143
143
  model: factory(modelId),
144
144
  messages: this.getMessages(systemPrompt, messages),
145
145
  tools: toAiSdkTools(tools),
146
- maxTokens: model.info.maxTokens ?? this.config.maxOutputTokens ?? 8192,
146
+ maxTokens: model.info.maxTokens ?? this.config.maxOutputTokens ?? 128_000,
147
147
  temperature: reasoningEnabled ? undefined : (model.info.temperature ?? 0),
148
148
  providerOptions:
149
149
  Object.keys(providerOptions).length > 0 ? providerOptions : undefined,
@@ -216,7 +216,7 @@ export class BedrockHandler extends BaseHandler {
216
216
 
217
217
  yield {
218
218
  type: "usage",
219
- inputTokens: inputTokens - cacheReadTokens,
219
+ inputTokens,
220
220
  outputTokens,
221
221
  thoughtsTokenCount,
222
222
  cacheReadTokens,
@@ -245,7 +245,7 @@ export class BedrockHandler extends BaseHandler {
245
245
 
246
246
  yield {
247
247
  type: "usage",
248
- inputTokens: inputTokens - cacheReadTokens,
248
+ inputTokens,
249
249
  outputTokens,
250
250
  thoughtsTokenCount,
251
251
  cacheReadTokens,
@@ -115,6 +115,39 @@ describe("Community SDK handlers", () => {
115
115
  expect(usageChunk?.outputTokens).toBe(3);
116
116
  });
117
117
 
118
+ it("keeps cached input tokens separate from total input tokens", async () => {
119
+ streamTextSpy.mockReturnValue({
120
+ fullStream: makeStreamParts([
121
+ {
122
+ type: "finish",
123
+ usage: { inputTokens: 10, outputTokens: 3, cachedInputTokens: 4 },
124
+ },
125
+ ]),
126
+ });
127
+
128
+ const handler = new ClaudeCodeHandler({
129
+ providerId: "claude-code",
130
+ modelId: "sonnet",
131
+ });
132
+
133
+ const chunks: ApiStreamChunk[] = [];
134
+ for await (const chunk of handler.createMessage("System", [
135
+ { role: "user", content: "Hi" },
136
+ ])) {
137
+ chunks.push(chunk);
138
+ }
139
+
140
+ const usageChunk = chunks.find(
141
+ (chunk): chunk is Extract<ApiStreamChunk, { type: "usage" }> =>
142
+ chunk.type === "usage",
143
+ );
144
+ expect(usageChunk).toMatchObject({
145
+ inputTokens: 10,
146
+ outputTokens: 3,
147
+ cacheReadTokens: 4,
148
+ });
149
+ });
150
+
118
151
  it("uses a fallback model id when model is missing", () => {
119
152
  const handler = new ClaudeCodeHandler({
120
153
  providerId: "claude-code",
@@ -218,4 +218,44 @@ describe("GeminiHandler", () => {
218
218
  expect(secondId).toBeTruthy();
219
219
  expect(firstId).not.toBe(secondId);
220
220
  });
221
+
222
+ it("defaults maxOutputTokens to 8192 for gemini-3-flash when no model or config limit is provided", async () => {
223
+ generateContentStreamSpy.mockResolvedValue(createAsyncIterable([]));
224
+
225
+ const handler = new GeminiHandler({
226
+ providerId: "gemini",
227
+ modelId: "gemini-3-flash",
228
+ apiKey: "test-key",
229
+ });
230
+
231
+ await collectChunks(
232
+ handler.createMessage("System", [{ role: "user", content: "go" }]),
233
+ );
234
+
235
+ expect(generateContentStreamSpy).toHaveBeenCalledTimes(1);
236
+ const request = generateContentStreamSpy.mock.calls[0]?.[0] as {
237
+ config?: { maxOutputTokens?: number };
238
+ };
239
+ expect(request.config?.maxOutputTokens).toBe(8192);
240
+ });
241
+
242
+ it("defaults maxOutputTokens to 128000 for non gemini-3-flash models when no model or config limit is provided", async () => {
243
+ generateContentStreamSpy.mockResolvedValue(createAsyncIterable([]));
244
+
245
+ const handler = new GeminiHandler({
246
+ providerId: "gemini",
247
+ modelId: "gemini-2.5-flash",
248
+ apiKey: "test-key",
249
+ });
250
+
251
+ await collectChunks(
252
+ handler.createMessage("System", [{ role: "user", content: "go" }]),
253
+ );
254
+
255
+ expect(generateContentStreamSpy).toHaveBeenCalledTimes(1);
256
+ const request = generateContentStreamSpy.mock.calls[0]?.[0] as {
257
+ config?: { maxOutputTokens?: number };
258
+ };
259
+ expect(request.config?.maxOutputTokens).toBe(128000);
260
+ });
221
261
  });
@@ -18,7 +18,6 @@ import {
18
18
  import {
19
19
  type ApiStream,
20
20
  type HandlerModelInfo,
21
- type ModelInfo,
22
21
  type ProviderConfig,
23
22
  supportsModelThinking,
24
23
  } from "../types";
@@ -27,6 +26,16 @@ import { RetriableError, retryStream } from "../utils/retry";
27
26
  import { BaseHandler } from "./base";
28
27
 
29
28
  const DEFAULT_THINKING_BUDGET_TOKENS = 1024;
29
+ const DEFAULT_MAX_OUTPUT_TOKENS = 128_000;
30
+ const GEMINI_3_FLASH_MAX_OUTPUT_TOKENS = 8192;
31
+
32
+ function isGemini3FlashModel(modelId: string): boolean {
33
+ const normalized = modelId.toLowerCase();
34
+ return (
35
+ normalized.includes("gemini-3-flash") ||
36
+ normalized.includes("gemini-3.0-flash")
37
+ );
38
+ }
30
39
 
31
40
  /**
32
41
  * Handler for Google's Gemini API
@@ -131,6 +140,11 @@ export class GeminiHandler extends BaseHandler {
131
140
  }
132
141
 
133
142
  // Build request config with abort signal
143
+ const fallbackMaxOutputTokens = isGemini3FlashModel(modelId)
144
+ ? GEMINI_3_FLASH_MAX_OUTPUT_TOKENS
145
+ : DEFAULT_MAX_OUTPUT_TOKENS;
146
+ const maxOutputTokens =
147
+ info.maxTokens ?? this.config.maxOutputTokens ?? fallbackMaxOutputTokens;
134
148
  const requestConfig: GenerateContentConfig = {
135
149
  httpOptions: this.config.baseUrl
136
150
  ? { baseUrl: this.config.baseUrl, headers: this.getRequestHeaders() }
@@ -138,7 +152,7 @@ export class GeminiHandler extends BaseHandler {
138
152
  abortSignal,
139
153
  systemInstruction: systemPrompt,
140
154
  temperature: info.temperature ?? 1,
141
- maxOutputTokens: info.maxTokens ?? this.config.maxOutputTokens,
155
+ maxOutputTokens,
142
156
  };
143
157
 
144
158
  // Add thinking config only when explicitly requested and supported.
@@ -243,7 +257,6 @@ export class GeminiHandler extends BaseHandler {
243
257
 
244
258
  // Yield final usage
245
259
  const totalCost = this.calculateGeminiCost(
246
- info,
247
260
  promptTokens,
248
261
  outputTokens,
249
262
  thoughtsTokenCount,
@@ -252,7 +265,7 @@ export class GeminiHandler extends BaseHandler {
252
265
 
253
266
  yield {
254
267
  type: "usage",
255
- inputTokens: promptTokens - cacheReadTokens,
268
+ inputTokens: promptTokens,
256
269
  outputTokens,
257
270
  thoughtsTokenCount,
258
271
  cacheReadTokens,
@@ -273,27 +286,16 @@ export class GeminiHandler extends BaseHandler {
273
286
  }
274
287
 
275
288
  private calculateGeminiCost(
276
- info: ModelInfo,
277
289
  inputTokens: number,
278
290
  outputTokens: number,
279
291
  thoughtsTokenCount: number,
280
292
  cacheReadTokens: number,
281
293
  ): number | undefined {
282
- const pricing = info.pricing;
283
- if (!pricing?.input || !pricing?.output) {
284
- return undefined;
285
- }
286
-
287
- const uncachedInputTokens = inputTokens - cacheReadTokens;
288
- const inputCost = pricing.input * (uncachedInputTokens / 1_000_000);
289
- const outputCost =
290
- pricing.output * ((outputTokens + thoughtsTokenCount) / 1_000_000);
291
- const cacheReadCost =
292
- cacheReadTokens > 0
293
- ? (pricing.cacheRead ?? 0) * (cacheReadTokens / 1_000_000)
294
- : 0;
295
-
296
- return inputCost + outputCost + cacheReadCost;
294
+ return this.calculateCost(
295
+ inputTokens,
296
+ outputTokens + thoughtsTokenCount,
297
+ cacheReadTokens,
298
+ );
297
299
  }
298
300
  }
299
301
 
@@ -106,9 +106,24 @@ export class OpenAIBaseHandler extends BaseHandler {
106
106
  systemPrompt: string,
107
107
  messages: Message[],
108
108
  ): OpenAI.Chat.ChatCompletionMessageParam[] {
109
+ const model = this.getModel();
110
+ const supportsPromptCache = this.supportsPromptCache(model.info);
111
+ const systemMessage = supportsPromptCache
112
+ ? ({
113
+ role: "system",
114
+ content: [
115
+ {
116
+ type: "text",
117
+ text: systemPrompt,
118
+ cache_control: { type: "ephemeral" },
119
+ },
120
+ ],
121
+ } as unknown as OpenAI.Chat.ChatCompletionMessageParam)
122
+ : { role: "system" as const, content: systemPrompt };
123
+
109
124
  return [
110
- { role: "system", content: systemPrompt },
111
- ...convertToOpenAIMessages(messages),
125
+ systemMessage,
126
+ ...convertToOpenAIMessages(messages, supportsPromptCache),
112
127
  ];
113
128
  }
114
129
 
@@ -138,7 +153,8 @@ export class OpenAIBaseHandler extends BaseHandler {
138
153
  const openAiMessages = this.getMessages(systemPrompt, messages);
139
154
 
140
155
  // Build request options
141
- const requestOptions: OpenAI.ChatCompletionCreateParamsStreaming = {
156
+ const requestOptions: Record<string, unknown> &
157
+ OpenAI.ChatCompletionCreateParamsStreaming = {
142
158
  model: modelId,
143
159
  messages: openAiMessages,
144
160
  stream: true,
@@ -149,6 +165,17 @@ export class OpenAIBaseHandler extends BaseHandler {
149
165
  }),
150
166
  };
151
167
 
168
+ // Add top-level cache_control for OpenRouter with Anthropic models.
169
+ // This enables automatic caching where the cache breakpoint advances
170
+ // as the conversation grows, rather than relying on explicit per-block
171
+ // breakpoints which are limited to 4.
172
+ if (
173
+ this.config.providerId === "openrouter" &&
174
+ modelId.startsWith("anthropic/")
175
+ ) {
176
+ requestOptions.cache_control = { type: "ephemeral" };
177
+ }
178
+
152
179
  // Add max tokens if configured
153
180
  const maxTokens = modelInfo.maxTokens ?? this.config.maxOutputTokens;
154
181
  if (maxTokens) {
@@ -171,7 +198,11 @@ export class OpenAIBaseHandler extends BaseHandler {
171
198
  this.config.reasoningEffort ??
172
199
  (this.config.thinking ? DEFAULT_REASONING_EFFORT : undefined);
173
200
  if (supportsReasoningEffort && effectiveReasoningEffort) {
174
- (requestOptions as any).reasoning_effort = effectiveReasoningEffort;
201
+ (
202
+ requestOptions as OpenAI.ChatCompletionCreateParamsStreaming & {
203
+ reasoning_effort?: string;
204
+ }
205
+ ).reasoning_effort = effectiveReasoningEffort;
175
206
  }
176
207
 
177
208
  const requestHeaders = this.getRequestHeaders();
@@ -191,16 +222,25 @@ export class OpenAIBaseHandler extends BaseHandler {
191
222
  headers: requestHeaders,
192
223
  });
193
224
  const toolCallProcessor = new ToolCallProcessor();
225
+ let finishReason: string | null = null;
194
226
 
195
227
  for await (const chunk of stream) {
228
+ const choice = chunk.choices?.[0];
229
+ if (choice?.finish_reason) {
230
+ finishReason = choice.finish_reason;
231
+ }
196
232
  yield* this.withResponseIdForAll(
197
233
  this.processChunk(chunk, toolCallProcessor, modelInfo, responseId),
198
234
  responseId,
199
235
  );
200
236
  }
201
237
 
202
- // Yield done chunk to indicate streaming completed successfully
203
- yield { type: "done", success: true, id: responseId };
238
+ yield {
239
+ type: "done",
240
+ success: true,
241
+ id: responseId,
242
+ incompleteReason: finishReason === "length" ? "max_tokens" : undefined,
243
+ };
204
244
  }
205
245
 
206
246
  /**
@@ -213,9 +253,11 @@ export class OpenAIBaseHandler extends BaseHandler {
213
253
  _modelInfo: ModelInfo,
214
254
  responseId: string,
215
255
  ): Generator<import("../types").ApiStreamChunk> {
216
- const delta = chunk.choices?.[0]?.delta && {
217
- ...chunk.choices[0].delta,
218
- reasoning_content: (chunk.choices[0].delta as any).reasoning_content,
256
+ const rawDelta = chunk.choices?.[0]?.delta;
257
+ const delta = rawDelta && {
258
+ ...rawDelta,
259
+ reasoning_content: (rawDelta as { reasoning_content?: string })
260
+ .reasoning_content,
219
261
  };
220
262
 
221
263
  // Handle text content
@@ -227,7 +269,7 @@ export class OpenAIBaseHandler extends BaseHandler {
227
269
  if (delta?.reasoning_content) {
228
270
  yield {
229
271
  type: "reasoning",
230
- reasoning: (delta as any).reasoning_content,
272
+ reasoning: delta.reasoning_content,
231
273
  id: responseId,
232
274
  };
233
275
  }
@@ -248,10 +290,22 @@ export class OpenAIBaseHandler extends BaseHandler {
248
290
  if (chunk.usage) {
249
291
  const inputTokens = chunk.usage.prompt_tokens ?? 0;
250
292
  const outputTokens = chunk.usage.completion_tokens ?? 0;
293
+ const usageWithCache = chunk.usage as typeof chunk.usage & {
294
+ prompt_tokens_details?: {
295
+ cached_tokens?: number;
296
+ cache_write_tokens?: number;
297
+ };
298
+ cache_creation_input_tokens?: number;
299
+ cache_read_input_tokens?: number;
300
+ };
251
301
  const cacheReadTokens =
252
- (chunk.usage as any).prompt_tokens_details?.cached_tokens ?? 0;
302
+ usageWithCache.prompt_tokens_details?.cached_tokens ??
303
+ usageWithCache.cache_read_input_tokens ??
304
+ 0;
253
305
  const cacheWriteTokens =
254
- (chunk.usage as any).prompt_cache_miss_tokens ?? 0;
306
+ usageWithCache.prompt_tokens_details?.cache_write_tokens ??
307
+ usageWithCache.cache_creation_input_tokens ??
308
+ 0;
255
309
 
256
310
  yield {
257
311
  type: "usage",
@@ -263,6 +317,7 @@ export class OpenAIBaseHandler extends BaseHandler {
263
317
  inputTokens,
264
318
  outputTokens,
265
319
  cacheReadTokens,
320
+ cacheWriteTokens,
266
321
  ),
267
322
  id: responseId,
268
323
  };
@@ -210,4 +210,50 @@ describe("OpenAIResponsesHandler", () => {
210
210
  },
211
211
  });
212
212
  });
213
+
214
+ it("keeps cached input tokens separate in usage chunks", () => {
215
+ const handler = new TestOpenAIResponsesHandler({
216
+ providerId: "openai-native",
217
+ modelId: "gpt-5.4",
218
+ apiKey: "test-key",
219
+ baseUrl: "https://example.com",
220
+ modelInfo: {
221
+ id: "gpt-5.4",
222
+ pricing: {
223
+ input: 1,
224
+ output: 2,
225
+ cacheRead: 0.5,
226
+ },
227
+ },
228
+ });
229
+
230
+ const chunks = handler.processChunkForTest({
231
+ type: "response.completed",
232
+ response: {
233
+ id: "resp_usage",
234
+ usage: {
235
+ input_tokens: 100,
236
+ output_tokens: 40,
237
+ input_tokens_details: {
238
+ cached_tokens: 25,
239
+ },
240
+ output_tokens_details: {
241
+ reasoning_tokens: 10,
242
+ },
243
+ },
244
+ },
245
+ });
246
+
247
+ expect(chunks[0]).toMatchObject({
248
+ type: "usage",
249
+ inputTokens: 100,
250
+ outputTokens: 40,
251
+ cacheReadTokens: 25,
252
+ cacheWriteTokens: 0,
253
+ });
254
+ expect(chunks[0]?.type).toBe("usage");
255
+ if (chunks[0]?.type === "usage") {
256
+ expect(chunks[0].totalCost).toBeCloseTo(0.0001925, 10);
257
+ }
258
+ });
213
259
  });
@@ -565,23 +565,19 @@ export class OpenAIResponsesHandler extends BaseHandler {
565
565
  const inputTokens = usage.input_tokens || 0;
566
566
  const outputTokens = usage.output_tokens || 0;
567
567
  const cacheReadTokens =
568
- usage.output_tokens_details?.reasoning_tokens || 0;
569
- const cacheWriteTokens =
570
568
  usage.input_tokens_details?.cached_tokens || 0;
569
+ const cacheWriteTokens = 0;
571
570
 
572
571
  const totalCost = this.calculateCost(
573
572
  inputTokens,
574
573
  outputTokens,
575
574
  cacheReadTokens,
576
- );
577
- const nonCachedInputTokens = Math.max(
578
- 0,
579
- inputTokens - cacheReadTokens - cacheWriteTokens,
575
+ cacheWriteTokens,
580
576
  );
581
577
 
582
578
  yield {
583
579
  type: "usage",
584
- inputTokens: nonCachedInputTokens,
580
+ inputTokens,
585
581
  outputTokens,
586
582
  cacheWriteTokens,
587
583
  cacheReadTokens,