@clinebot/llms 0.0.10 → 0.0.12

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the changes between those package versions exactly as they appear in their respective public registries.
@@ -22,7 +22,6 @@ import type {
22
22
  ModelInfo,
23
23
  ProviderConfig,
24
24
  } from "../types";
25
- import { hasModelCapability } from "../types";
26
25
  import type { Message, ToolDefinition } from "../types/messages";
27
26
  import { retryStream } from "../utils/retry";
28
27
  import { ToolCallProcessor } from "../utils/tool-processor";
@@ -108,9 +107,7 @@ export class OpenAIBaseHandler extends BaseHandler {
108
107
  messages: Message[],
109
108
  ): OpenAI.Chat.ChatCompletionMessageParam[] {
110
109
  const model = this.getModel();
111
- const supportsPromptCache =
112
- hasModelCapability(model.info, "prompt-cache") ||
113
- this.config.capabilities?.includes("prompt-cache") === true;
110
+ const supportsPromptCache = this.supportsPromptCache(model.info);
114
111
  const systemMessage = supportsPromptCache
115
112
  ? ({
116
113
  role: "system",
@@ -156,7 +153,8 @@ export class OpenAIBaseHandler extends BaseHandler {
156
153
  const openAiMessages = this.getMessages(systemPrompt, messages);
157
154
 
158
155
  // Build request options
159
- const requestOptions: OpenAI.ChatCompletionCreateParamsStreaming = {
156
+ const requestOptions: Record<string, unknown> &
157
+ OpenAI.ChatCompletionCreateParamsStreaming = {
160
158
  model: modelId,
161
159
  messages: openAiMessages,
162
160
  stream: true,
@@ -167,6 +165,17 @@ export class OpenAIBaseHandler extends BaseHandler {
167
165
  }),
168
166
  };
169
167
 
168
+ // Add top-level cache_control for OpenRouter with Anthropic models.
169
+ // This enables automatic caching where the cache breakpoint advances
170
+ // as the conversation grows, rather than relying on explicit per-block
171
+ // breakpoints which are limited to 4.
172
+ if (
173
+ this.config.providerId === "openrouter" &&
174
+ modelId.startsWith("anthropic/")
175
+ ) {
176
+ requestOptions.cache_control = { type: "ephemeral" };
177
+ }
178
+
170
179
  // Add max tokens if configured
171
180
  const maxTokens = modelInfo.maxTokens ?? this.config.maxOutputTokens;
172
181
  if (maxTokens) {
@@ -208,10 +217,15 @@ export class OpenAIBaseHandler extends BaseHandler {
208
217
  requestHeaders.Authorization = `Bearer ${apiKey}`;
209
218
  }
210
219
  const abortSignal = this.getAbortSignal();
211
- const stream = await client.chat.completions.create(requestOptions, {
212
- signal: abortSignal,
213
- headers: requestHeaders,
214
- });
220
+ let stream: AsyncIterable<ChatCompletionChunk>;
221
+ try {
222
+ stream = await client.chat.completions.create(requestOptions, {
223
+ signal: abortSignal,
224
+ headers: requestHeaders,
225
+ });
226
+ } catch (error) {
227
+ throw this.normalizeOpenAICompatibleBadRequest(error) ?? error;
228
+ }
215
229
  const toolCallProcessor = new ToolCallProcessor();
216
230
  let finishReason: string | null = null;
217
231
 
@@ -286,27 +300,32 @@ export class OpenAIBaseHandler extends BaseHandler {
286
300
  cached_tokens?: number;
287
301
  cache_write_tokens?: number;
288
302
  };
289
- prompt_cache_miss_tokens?: number;
290
303
  cache_creation_input_tokens?: number;
291
304
  cache_read_input_tokens?: number;
292
305
  };
293
306
  const cacheReadTokens =
294
- usageWithCache.prompt_tokens_details?.cached_tokens ?? 0;
307
+ usageWithCache.prompt_tokens_details?.cached_tokens ??
308
+ usageWithCache.cache_read_input_tokens ??
309
+ 0;
295
310
  const cacheWriteTokens =
296
311
  usageWithCache.prompt_tokens_details?.cache_write_tokens ??
297
- usageWithCache.prompt_cache_miss_tokens ??
312
+ usageWithCache.cache_creation_input_tokens ??
298
313
  0;
299
314
 
300
315
  yield {
301
316
  type: "usage",
302
- inputTokens,
317
+ inputTokens: Math.max(
318
+ 0,
319
+ inputTokens - cacheReadTokens - cacheWriteTokens,
320
+ ),
303
321
  outputTokens,
304
322
  cacheReadTokens,
305
323
  cacheWriteTokens,
306
- totalCost: this.calculateCost(
324
+ totalCost: this.calculateCostFromInclusiveInput(
307
325
  inputTokens,
308
326
  outputTokens,
309
327
  cacheReadTokens,
328
+ cacheWriteTokens,
310
329
  ),
311
330
  id: responseId,
312
331
  };
@@ -210,4 +210,50 @@ describe("OpenAIResponsesHandler", () => {
210
210
  },
211
211
  });
212
212
  });
213
+
214
+ it("keeps cached input tokens separate in usage chunks", () => {
215
+ const handler = new TestOpenAIResponsesHandler({
216
+ providerId: "openai-native",
217
+ modelId: "gpt-5.4",
218
+ apiKey: "test-key",
219
+ baseUrl: "https://example.com",
220
+ modelInfo: {
221
+ id: "gpt-5.4",
222
+ pricing: {
223
+ input: 1,
224
+ output: 2,
225
+ cacheRead: 0.5,
226
+ },
227
+ },
228
+ });
229
+
230
+ const chunks = handler.processChunkForTest({
231
+ type: "response.completed",
232
+ response: {
233
+ id: "resp_usage",
234
+ usage: {
235
+ input_tokens: 100,
236
+ output_tokens: 40,
237
+ input_tokens_details: {
238
+ cached_tokens: 25,
239
+ },
240
+ output_tokens_details: {
241
+ reasoning_tokens: 10,
242
+ },
243
+ },
244
+ },
245
+ });
246
+
247
+ expect(chunks[0]).toMatchObject({
248
+ type: "usage",
249
+ inputTokens: 75,
250
+ outputTokens: 40,
251
+ cacheReadTokens: 25,
252
+ cacheWriteTokens: 0,
253
+ });
254
+ expect(chunks[0]?.type).toBe("usage");
255
+ if (chunks[0]?.type === "usage") {
256
+ expect(chunks[0].totalCost).toBeCloseTo(0.0001675, 10);
257
+ }
258
+ });
213
259
  });
@@ -330,6 +330,11 @@ export class OpenAIResponsesHandler extends BaseHandler {
330
330
  { signal: abortSignal, headers: requestHeaders },
331
331
  );
332
332
  } catch (error) {
333
+ const normalizedBadRequest =
334
+ this.normalizeOpenAICompatibleBadRequest(error);
335
+ if (normalizedBadRequest) {
336
+ throw normalizedBadRequest;
337
+ }
333
338
  if (this.config.providerId === "openai-codex") {
334
339
  const rawError = error as
335
340
  | (Error & {
@@ -565,23 +570,22 @@ export class OpenAIResponsesHandler extends BaseHandler {
565
570
  const inputTokens = usage.input_tokens || 0;
566
571
  const outputTokens = usage.output_tokens || 0;
567
572
  const cacheReadTokens =
568
- usage.output_tokens_details?.reasoning_tokens || 0;
569
- const cacheWriteTokens =
570
573
  usage.input_tokens_details?.cached_tokens || 0;
574
+ const cacheWriteTokens = 0;
571
575
 
572
- const totalCost = this.calculateCost(
576
+ const totalCost = this.calculateCostFromInclusiveInput(
573
577
  inputTokens,
574
578
  outputTokens,
575
579
  cacheReadTokens,
576
- );
577
- const nonCachedInputTokens = Math.max(
578
- 0,
579
- inputTokens - cacheReadTokens - cacheWriteTokens,
580
+ cacheWriteTokens,
580
581
  );
581
582
 
582
583
  yield {
583
584
  type: "usage",
584
- inputTokens: nonCachedInputTokens,
585
+ inputTokens: Math.max(
586
+ 0,
587
+ inputTokens - cacheReadTokens - cacheWriteTokens,
588
+ ),
585
589
  outputTokens,
586
590
  cacheWriteTokens,
587
591
  cacheReadTokens,
@@ -255,19 +255,21 @@ export class R1BaseHandler extends BaseHandler {
255
255
  const cacheReadTokens = r1Usage.prompt_cache_hit_tokens ?? 0;
256
256
  const cacheWriteTokens = r1Usage.prompt_cache_miss_tokens ?? 0;
257
257
 
258
- // Calculate non-cached input tokens (will always be 0 for DeepSeek since input = read + write)
259
- const nonCachedInputTokens = Math.max(
260
- 0,
261
- inputTokens - cacheReadTokens - cacheWriteTokens,
262
- );
263
-
264
258
  yield {
265
259
  type: "usage",
266
- inputTokens: nonCachedInputTokens,
260
+ inputTokens: Math.max(
261
+ 0,
262
+ inputTokens - cacheReadTokens - cacheWriteTokens,
263
+ ),
267
264
  outputTokens,
268
265
  cacheReadTokens,
269
266
  cacheWriteTokens,
270
- totalCost: this.calculateCost(inputTokens, outputTokens, cacheReadTokens),
267
+ totalCost: this.calculateCostFromInclusiveInput(
268
+ inputTokens,
269
+ outputTokens,
270
+ cacheReadTokens,
271
+ cacheWriteTokens,
272
+ ),
271
273
  id: responseId,
272
274
  };
273
275
  }
@@ -189,7 +189,7 @@ export class VertexHandler extends BaseHandler {
189
189
  if (!isClaudeModel(model.id)) {
190
190
  return this.ensureGeminiHandler().getMessages(systemPrompt, messages);
191
191
  }
192
- const supportsPromptCache = hasModelCapability(model.info, "prompt-cache");
192
+ const supportsPromptCache = this.supportsPromptCache(model.info);
193
193
  return convertToAnthropicMessages(messages, supportsPromptCache);
194
194
  }
195
195
 
@@ -226,7 +226,7 @@ export class VertexHandler extends BaseHandler {
226
226
  const budgetTokens = this.config.thinkingBudgetTokens ?? 0;
227
227
  const reasoningOn =
228
228
  hasModelCapability(model.info, "reasoning") && budgetTokens > 0;
229
- const promptCacheOn = hasModelCapability(model.info, "prompt-cache");
229
+ const promptCacheOn = this.supportsPromptCache(model.info);
230
230
 
231
231
  const providerOptions: Record<string, unknown> = {};
232
232
  if (reasoningOn) {
@@ -251,8 +251,18 @@ export class VertexHandler extends BaseHandler {
251
251
  yield* emitAiSdkStream(stream, {
252
252
  responseId,
253
253
  errorMessage: "Vertex Anthropic stream failed",
254
- calculateCost: (inputTokens, outputTokens, cacheReadTokens) =>
255
- this.calculateCost(inputTokens, outputTokens, cacheReadTokens),
254
+ calculateCost: (
255
+ inputTokens,
256
+ outputTokens,
257
+ cacheReadTokens,
258
+ cacheWriteTokens,
259
+ ) =>
260
+ this.calculateCost(
261
+ inputTokens,
262
+ outputTokens,
263
+ cacheReadTokens,
264
+ cacheWriteTokens,
265
+ ),
256
266
  reasoningTypes: ["reasoning-delta"],
257
267
  enableToolCalls: true,
258
268
  toolCallArgsOrder: ["input", "args"],
@@ -32,12 +32,24 @@ export function convertToAnthropicMessages(
32
32
  messages: Message[],
33
33
  enableCaching = false,
34
34
  ): AnthropicMessage[] {
35
+ const userMessageIndices = messages.reduce<number[]>(
36
+ (indices, message, index) => {
37
+ if (message.role === "user") {
38
+ indices.push(index);
39
+ }
40
+ return indices;
41
+ },
42
+ [],
43
+ );
44
+ const cacheableMessageIndices = enableCaching
45
+ ? new Set(userMessageIndices.slice(-2))
46
+ : new Set<number>();
35
47
  const result: AnthropicMessage[] = [];
36
48
 
37
- for (const message of messages) {
49
+ for (const [index, message] of messages.entries()) {
38
50
  const converted = convertMessage(
39
51
  message,
40
- enableCaching && messages.indexOf(message) === messages.length - 1,
52
+ cacheableMessageIndices.has(index),
41
53
  );
42
54
  if (converted) {
43
55
  result.push(converted);
@@ -285,10 +285,33 @@ describe("format conversion", () => {
285
285
  ];
286
286
 
287
287
  const anthropic = convertToAnthropicMessages(messages, true) as any[];
288
+ expect(anthropic[0].content[0].cache_control).toEqual({
289
+ type: "ephemeral",
290
+ });
288
291
  expect(anthropic[1].content[0].type).toBe("thinking");
289
292
  expect(anthropic[1].content[0].signature).toBe("anthropic-sig");
290
293
  });
291
294
 
295
+ it("applies anthropic cache markers to the last two user messages", () => {
296
+ const messages: Message[] = [
297
+ { role: "user", content: "first prompt" },
298
+ { role: "assistant", content: "intermediate response" },
299
+ { role: "user", content: "second prompt" },
300
+ { role: "assistant", content: "another response" },
301
+ { role: "user", content: "third prompt" },
302
+ ];
303
+
304
+ const anthropic = convertToAnthropicMessages(messages, true) as any[];
305
+
306
+ expect(anthropic[0].content[0].cache_control).toBeUndefined();
307
+ expect(anthropic[2].content[0].cache_control).toEqual({
308
+ type: "ephemeral",
309
+ });
310
+ expect(anthropic[4].content[0].cache_control).toEqual({
311
+ type: "ephemeral",
312
+ });
313
+ });
314
+
292
315
  it("normalizes array-shaped tool_use input for anthropic replay", () => {
293
316
  const messages: Message[] = [
294
317
  { role: "user", content: "run these" },
@@ -172,6 +172,80 @@ function convertContentBlock(
172
172
  }
173
173
  }
174
174
 
175
+ /**
176
+ * Allowed JSON Schema properties per Gemini's supported subset.
177
+ * See: https://ai.google.dev/gemini-api/docs/structured-output
178
+ */
179
+ const GEMINI_ALLOWED_PROPERTIES = new Set([
180
+ // Common
181
+ "type",
182
+ "title",
183
+ "description",
184
+ "enum",
185
+ // Object
186
+ "properties",
187
+ "required",
188
+ "additionalProperties",
189
+ // String
190
+ "format",
191
+ // Number / Integer
192
+ "minimum",
193
+ "maximum",
194
+ // Array
195
+ "items",
196
+ "prefixItems",
197
+ "minItems",
198
+ "maxItems",
199
+ ]);
200
+
201
+ /**
202
+ * Recursively sanitize a JSON Schema to only include properties supported by Gemini.
203
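+ * Converts exclusiveMinimum/exclusiveMaximum to minimum/maximum as a best-effort fallback; the value is copied unchanged, so the bound becomes inclusive and the limit value itself is accepted.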
204
+ */
205
+ function sanitizeSchemaForGemini(schema: unknown): unknown {
206
+ if (!schema || typeof schema !== "object" || Array.isArray(schema)) {
207
+ return schema;
208
+ }
209
+
210
+ const input = schema as Record<string, unknown>;
211
+ const output: Record<string, unknown> = {};
212
+
213
+ for (const [key, value] of Object.entries(input)) {
214
+ if (!GEMINI_ALLOWED_PROPERTIES.has(key)) {
215
+ continue;
216
+ }
217
+
218
+ if (key === "properties" && value && typeof value === "object") {
219
+ const sanitized: Record<string, unknown> = {};
220
+ for (const [propName, propSchema] of Object.entries(
221
+ value as Record<string, unknown>,
222
+ )) {
223
+ sanitized[propName] = sanitizeSchemaForGemini(propSchema);
224
+ }
225
+ output[key] = sanitized;
226
+ } else if (key === "items" || key === "additionalProperties") {
227
+ output[key] =
228
+ typeof value === "object" && value !== null
229
+ ? sanitizeSchemaForGemini(value)
230
+ : value;
231
+ } else if (key === "prefixItems" && Array.isArray(value)) {
232
+ output[key] = value.map((item) => sanitizeSchemaForGemini(item));
233
+ } else {
234
+ output[key] = value;
235
+ }
236
+ }
237
+
238
+ // Convert exclusiveMinimum/exclusiveMaximum to minimum/maximum
239
+ if (input.exclusiveMinimum !== undefined && output.minimum === undefined) {
240
+ output.minimum = input.exclusiveMinimum;
241
+ }
242
+ if (input.exclusiveMaximum !== undefined && output.maximum === undefined) {
243
+ output.maximum = input.exclusiveMaximum;
244
+ }
245
+
246
+ return output;
247
+ }
248
+
175
249
  /**
176
250
  * Convert tool definitions to Gemini format
177
251
  */
@@ -181,6 +255,8 @@ export function convertToolsToGemini(
181
255
  return tools.map((tool) => ({
182
256
  name: tool.name,
183
257
  description: tool.description,
184
- parameters: tool.inputSchema as FunctionDeclaration["parameters"],
258
+ parameters: sanitizeSchemaForGemini(
259
+ tool.inputSchema,
260
+ ) as FunctionDeclaration["parameters"],
185
261
  }));
186
262
  }
@@ -55,7 +55,7 @@ export interface ApiStreamReasoningChunk {
55
55
  */
56
56
  export interface ApiStreamUsageChunk {
57
57
  type: "usage";
58
- /** Number of input tokens (excluding cached) */
58
+ /** Number of input tokens excluding cache reads and cache writes, which are reported separately in cacheReadTokens and cacheWriteTokens */
59
59
  inputTokens: number;
60
60
  /** Number of output tokens */
61
61
  outputTokens: number;