llmist 15.12.0 → 15.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -229,7 +229,8 @@ var init_execution_tree = __esm({
229
229
  response: llmNode.response,
230
230
  usage: llmNode.usage,
231
231
  finishReason: llmNode.finishReason,
232
- cost: llmNode.cost
232
+ cost: llmNode.cost,
233
+ thinkingContent: params.thinkingContent
233
234
  });
234
235
  }
235
236
  /**
@@ -4529,7 +4530,10 @@ var init_hook_presets = __esm({
4529
4530
  const costEstimate = modelRegistry.estimateCost(
4530
4531
  modelName,
4531
4532
  ctx.usage.inputTokens,
4532
- ctx.usage.outputTokens
4533
+ ctx.usage.outputTokens,
4534
+ ctx.usage.cachedInputTokens ?? 0,
4535
+ ctx.usage.cacheCreationInputTokens ?? 0,
4536
+ ctx.usage.reasoningTokens ?? 0
4533
4537
  );
4534
4538
  if (costEstimate) {
4535
4539
  totalCost += costEstimate.totalCost;
@@ -5026,10 +5030,10 @@ var init_anthropic_models = __esm({
5026
5030
  contextWindow: 2e5,
5027
5031
  maxOutputTokens: 64e3,
5028
5032
  pricing: {
5029
- input: 0.8,
5030
- output: 4,
5031
- cachedInput: 0.08,
5032
- cacheWriteInput: 1
5033
+ input: 1,
5034
+ output: 5,
5035
+ cachedInput: 0.1,
5036
+ cacheWriteInput: 1.25
5033
5037
  },
5034
5038
  knowledgeCutoff: "2025-02",
5035
5039
  features: {
@@ -5225,10 +5229,10 @@ var init_anthropic_models = __esm({
5225
5229
  contextWindow: 2e5,
5226
5230
  maxOutputTokens: 64e3,
5227
5231
  pricing: {
5228
- input: 0.8,
5229
- output: 4,
5230
- cachedInput: 0.08,
5231
- cacheWriteInput: 1
5232
+ input: 1,
5233
+ output: 5,
5234
+ cachedInput: 0.1,
5235
+ cacheWriteInput: 1.25
5232
5236
  },
5233
5237
  knowledgeCutoff: "2025-02",
5234
5238
  features: {
@@ -5371,10 +5375,15 @@ var init_utils = __esm({
5371
5375
  });
5372
5376
 
5373
5377
  // src/providers/anthropic.ts
5378
+ function resolveAnthropicThinking(reasoning) {
5379
+ if (!reasoning?.enabled) return void 0;
5380
+ const budget = reasoning.budgetTokens ? Math.max(1024, reasoning.budgetTokens) : ANTHROPIC_EFFORT_BUDGET[reasoning.effort ?? "medium"];
5381
+ return { type: "enabled", budget_tokens: budget };
5382
+ }
5374
5383
  function createAnthropicProviderFromEnv() {
5375
5384
  return createProviderFromEnv("ANTHROPIC_API_KEY", import_sdk.default, AnthropicMessagesProvider);
5376
5385
  }
5377
- var import_sdk, AnthropicMessagesProvider;
5386
+ var import_sdk, ANTHROPIC_EFFORT_BUDGET, AnthropicMessagesProvider;
5378
5387
  var init_anthropic = __esm({
5379
5388
  "src/providers/anthropic.ts"() {
5380
5389
  "use strict";
@@ -5384,6 +5393,14 @@ var init_anthropic = __esm({
5384
5393
  init_base_provider();
5385
5394
  init_constants2();
5386
5395
  init_utils();
5396
+ ANTHROPIC_EFFORT_BUDGET = {
5397
+ none: 1024,
5398
+ // Minimum allowed by Anthropic
5399
+ low: 2048,
5400
+ medium: 8192,
5401
+ high: 16384,
5402
+ maximum: 32768
5403
+ };
5387
5404
  AnthropicMessagesProvider = class extends BaseProviderAdapter {
5388
5405
  providerId = "anthropic";
5389
5406
  supports(descriptor) {
@@ -5415,12 +5432,13 @@ var init_anthropic = __esm({
5415
5432
  );
5416
5433
  }
5417
5434
  buildApiRequest(options, descriptor, spec, messages) {
5435
+ const cachingEnabled = options.caching?.enabled !== false;
5418
5436
  const systemMessages = messages.filter((message) => message.role === "system");
5419
5437
  const system = systemMessages.length > 0 ? systemMessages.map((m, index) => ({
5420
5438
  type: "text",
5421
5439
  text: extractMessageText(m.content),
5422
- // Add cache_control to the LAST system message block
5423
- ...index === systemMessages.length - 1 ? { cache_control: { type: "ephemeral" } } : {}
5440
+ // Add cache_control to the LAST system message block (only when caching is enabled)
5441
+ ...cachingEnabled && index === systemMessages.length - 1 ? { cache_control: { type: "ephemeral" } } : {}
5424
5442
  })) : void 0;
5425
5443
  const nonSystemMessages = messages.filter(
5426
5444
  (message) => message.role !== "system"
@@ -5433,19 +5451,22 @@ var init_anthropic = __esm({
5433
5451
  role: message.role,
5434
5452
  content: this.convertToAnthropicContent(
5435
5453
  message.content,
5436
- message.role === "user" && index === lastUserIndex
5454
+ cachingEnabled && message.role === "user" && index === lastUserIndex
5437
5455
  )
5438
5456
  }));
5439
5457
  const defaultMaxTokens = spec?.maxOutputTokens ?? ANTHROPIC_DEFAULT_MAX_OUTPUT_TOKENS;
5458
+ const thinking = resolveAnthropicThinking(options.reasoning);
5459
+ const temperature = thinking ? void 0 : options.temperature;
5440
5460
  const payload = {
5441
5461
  model: descriptor.name,
5442
5462
  system,
5443
5463
  messages: conversation,
5444
5464
  max_tokens: options.maxTokens ?? defaultMaxTokens,
5445
- temperature: options.temperature,
5465
+ temperature,
5446
5466
  top_p: options.topP,
5447
5467
  stop_sequences: options.stopSequences,
5448
5468
  stream: true,
5469
+ ...thinking ? { thinking } : {},
5449
5470
  ...options.extra
5450
5471
  };
5451
5472
  return payload;
@@ -5525,8 +5546,39 @@ var init_anthropic = __esm({
5525
5546
  };
5526
5547
  continue;
5527
5548
  }
5528
- if (event.type === "content_block_delta" && event.delta.type === "text_delta") {
5529
- yield { text: event.delta.text ?? "", rawEvent: event };
5549
+ if (event.type === "content_block_start") {
5550
+ const block = event.content_block;
5551
+ if (block.type === "thinking") {
5552
+ yield { text: "", thinking: { content: "", type: "thinking" }, rawEvent: event };
5553
+ continue;
5554
+ }
5555
+ if (block.type === "redacted_thinking") {
5556
+ yield { text: "", thinking: { content: "", type: "redacted" }, rawEvent: event };
5557
+ continue;
5558
+ }
5559
+ }
5560
+ if (event.type === "content_block_delta") {
5561
+ const delta = event.delta;
5562
+ if (delta.type === "thinking_delta" && delta.thinking) {
5563
+ yield {
5564
+ text: "",
5565
+ thinking: { content: delta.thinking, type: "thinking" },
5566
+ rawEvent: event
5567
+ };
5568
+ continue;
5569
+ }
5570
+ if (delta.type === "signature_delta" && delta.signature) {
5571
+ yield {
5572
+ text: "",
5573
+ thinking: { content: "", type: "thinking", signature: delta.signature },
5574
+ rawEvent: event
5575
+ };
5576
+ continue;
5577
+ }
5578
+ if (delta.type === "text_delta") {
5579
+ yield { text: delta.text ?? "", rawEvent: event };
5580
+ continue;
5581
+ }
5530
5582
  continue;
5531
5583
  }
5532
5584
  if (event.type === "message_delta") {
@@ -5616,6 +5668,177 @@ var init_anthropic = __esm({
5616
5668
  }
5617
5669
  });
5618
5670
 
5671
+ // src/providers/gemini-cache-manager.ts
5672
+ var import_node_crypto3, GeminiCacheManager;
5673
+ var init_gemini_cache_manager = __esm({
5674
+ "src/providers/gemini-cache-manager.ts"() {
5675
+ "use strict";
5676
+ import_node_crypto3 = require("crypto");
5677
+ GeminiCacheManager = class {
5678
+ constructor(client) {
5679
+ this.client = client;
5680
+ }
5681
+ activeCache = null;
5682
+ /**
5683
+ * Get or create a cache for the given content.
5684
+ *
5685
+ * Returns the cache name if a cache was created/reused, or `null` if caching
5686
+ * was skipped (disabled, below threshold, or API error).
5687
+ *
5688
+ * @param model - Gemini model name (e.g., "gemini-2.5-flash")
5689
+ * @param allContents - All Gemini-formatted contents (system + conversation)
5690
+ * @param config - Caching configuration from the user
5691
+ * @param lastUserMessageIndex - Index of the last user message (content after this is not cached)
5692
+ * @returns Cache name string or null
5693
+ */
5694
+ async getOrCreateCache(model, allContents, config, lastUserMessageIndex) {
5695
+ if (!config.enabled) return null;
5696
+ const scope = config.scope ?? "conversation";
5697
+ const ttl = config.ttl ?? "3600s";
5698
+ const minTokenThreshold = config.minTokenThreshold ?? 32768;
5699
+ const cacheableContents = this.selectCacheableContents(
5700
+ allContents,
5701
+ scope,
5702
+ lastUserMessageIndex
5703
+ );
5704
+ if (cacheableContents.length === 0) return null;
5705
+ const estimatedTokens = this.estimateTokenCount(cacheableContents);
5706
+ if (estimatedTokens < minTokenThreshold) return null;
5707
+ const contentHash = this.computeContentHash(cacheableContents, model);
5708
+ if (this.activeCache && this.canReuseCache(this.activeCache, model, contentHash)) {
5709
+ return {
5710
+ cacheName: this.activeCache.name,
5711
+ cachedContentCount: cacheableContents.length
5712
+ };
5713
+ }
5714
+ try {
5715
+ await this.cleanupActiveCache();
5716
+ const response = await this.client.caches.create({
5717
+ model,
5718
+ config: {
5719
+ contents: cacheableContents,
5720
+ ttl,
5721
+ displayName: `llmist-${scope}-${Date.now()}`
5722
+ }
5723
+ });
5724
+ if (!response.name) {
5725
+ return null;
5726
+ }
5727
+ this.activeCache = {
5728
+ name: response.name,
5729
+ model,
5730
+ contentHash,
5731
+ expireTime: response.expireTime ?? ""
5732
+ };
5733
+ return {
5734
+ cacheName: response.name,
5735
+ cachedContentCount: cacheableContents.length
5736
+ };
5737
+ } catch (error) {
5738
+ console.warn("Gemini cache creation failed, continuing without cache:", error);
5739
+ return null;
5740
+ }
5741
+ }
5742
+ /**
5743
+ * Clean up the active cache (best-effort).
5744
+ * Caches auto-expire via TTL, so failure is non-critical.
5745
+ */
5746
+ async dispose() {
5747
+ await this.cleanupActiveCache();
5748
+ }
5749
+ /**
5750
+ * Select which contents to cache based on scope.
5751
+ *
5752
+ * - "system": Only system-derived messages (the initial user+model exchanges
5753
+ * generated from system messages)
5754
+ * - "conversation": Everything except the last user message
5755
+ */
5756
+ selectCacheableContents(allContents, scope, lastUserMessageIndex) {
5757
+ if (scope === "system") {
5758
+ let systemEndIndex = 0;
5759
+ for (let i = 0; i < allContents.length; i++) {
5760
+ const content = allContents[i];
5761
+ if (content.role === "model" && content.parts.length === 1 && "text" in content.parts[0] && content.parts[0].text === "Understood.") {
5762
+ systemEndIndex = i + 1;
5763
+ } else if (content.role === "user") {
5764
+ const next = allContents[i + 1];
5765
+ if (next && next.role === "model" && next.parts.length === 1 && "text" in next.parts[0] && next.parts[0].text === "Understood.") {
5766
+ continue;
5767
+ }
5768
+ break;
5769
+ } else {
5770
+ break;
5771
+ }
5772
+ }
5773
+ return allContents.slice(0, systemEndIndex);
5774
+ }
5775
+ if (lastUserMessageIndex <= 0) return [];
5776
+ return allContents.slice(0, lastUserMessageIndex);
5777
+ }
5778
+ /**
5779
+ * Estimate token count from contents using character-based heuristic.
5780
+ * Uses ~4 characters per token (conservative estimate for English text).
5781
+ */
5782
+ estimateTokenCount(contents) {
5783
+ let totalChars = 0;
5784
+ for (const content of contents) {
5785
+ for (const part of content.parts) {
5786
+ if ("text" in part) {
5787
+ totalChars += part.text.length;
5788
+ } else if ("inlineData" in part) {
5789
+ totalChars += 258 * 4;
5790
+ }
5791
+ }
5792
+ }
5793
+ return Math.ceil(totalChars / 4);
5794
+ }
5795
+ /**
5796
+ * Compute a stable hash of the cacheable contents for change detection.
5797
+ */
5798
+ computeContentHash(contents, model) {
5799
+ const hash = (0, import_node_crypto3.createHash)("sha256");
5800
+ hash.update(model);
5801
+ for (const content of contents) {
5802
+ hash.update(content.role);
5803
+ for (const part of content.parts) {
5804
+ if ("text" in part) {
5805
+ hash.update(part.text);
5806
+ } else if ("inlineData" in part) {
5807
+ hash.update(part.inlineData.mimeType);
5808
+ hash.update(part.inlineData.data);
5809
+ }
5810
+ }
5811
+ }
5812
+ return hash.digest("hex");
5813
+ }
5814
+ /**
5815
+ * Check if an existing cache can be reused.
5816
+ */
5817
+ canReuseCache(cache, model, contentHash) {
5818
+ if (cache.model !== model) return false;
5819
+ if (cache.contentHash !== contentHash) return false;
5820
+ if (cache.expireTime) {
5821
+ const expiresAt = new Date(cache.expireTime).getTime();
5822
+ const now = Date.now();
5823
+ if (expiresAt - now < 6e4) return false;
5824
+ }
5825
+ return true;
5826
+ }
5827
+ /**
5828
+ * Delete the active cache (best-effort).
5829
+ */
5830
+ async cleanupActiveCache() {
5831
+ if (!this.activeCache) return;
5832
+ try {
5833
+ await this.client.caches.delete({ name: this.activeCache.name });
5834
+ } catch {
5835
+ }
5836
+ this.activeCache = null;
5837
+ }
5838
+ };
5839
+ }
5840
+ });
5841
+
5619
5842
  // src/providers/gemini-image-models.ts
5620
5843
  function getGeminiImageModelSpec(modelId) {
5621
5844
  return geminiImageModels.find((m) => m.modelId === modelId);
@@ -5835,10 +6058,10 @@ var init_gemini_models = __esm({
5835
6058
  contextWindow: 1048576,
5836
6059
  maxOutputTokens: 65536,
5837
6060
  pricing: {
5838
- input: 0.4,
5839
- // $0.40 for text/image/video
6061
+ input: 0.5,
6062
+ // $0.50 for text/image/video
5840
6063
  output: 3,
5841
- cachedInput: 0.04
6064
+ cachedInput: 0.05
5842
6065
  },
5843
6066
  knowledgeCutoff: "2025-01",
5844
6067
  features: {
@@ -6132,6 +6355,23 @@ var init_gemini_speech_models = __esm({
6132
6355
  });
6133
6356
 
6134
6357
  // src/providers/gemini.ts
6358
+ function resolveGeminiThinkingConfig(reasoning, modelName) {
6359
+ if (!reasoning?.enabled) return void 0;
6360
+ const isGemini3 = modelName.includes("gemini-3");
6361
+ if (isGemini3) {
6362
+ return {
6363
+ thinkingConfig: {
6364
+ thinkingLevel: GEMINI3_THINKING_LEVEL[reasoning.effort ?? "medium"]
6365
+ }
6366
+ };
6367
+ }
6368
+ const budget = reasoning.budgetTokens ?? GEMINI25_THINKING_BUDGET[reasoning.effort ?? "medium"];
6369
+ return {
6370
+ thinkingConfig: {
6371
+ thinkingBudget: budget
6372
+ }
6373
+ };
6374
+ }
6135
6375
  function wrapPcmInWav(pcmData, sampleRate, bitsPerSample, numChannels) {
6136
6376
  const byteRate = sampleRate * numChannels * bitsPerSample / 8;
6137
6377
  const blockAlign = numChannels * bitsPerSample / 8;
@@ -6160,7 +6400,7 @@ function wrapPcmInWav(pcmData, sampleRate, bitsPerSample, numChannels) {
6160
6400
  function createGeminiProviderFromEnv() {
6161
6401
  return createProviderFromEnv("GEMINI_API_KEY", import_genai.GoogleGenAI, GeminiGenerativeProvider);
6162
6402
  }
6163
- var import_genai, GEMINI_ROLE_MAP, GeminiGenerativeProvider;
6403
+ var import_genai, GEMINI3_THINKING_LEVEL, GEMINI25_THINKING_BUDGET, GEMINI_ROLE_MAP, GeminiGenerativeProvider;
6164
6404
  var init_gemini = __esm({
6165
6405
  "src/providers/gemini.ts"() {
6166
6406
  "use strict";
@@ -6168,10 +6408,25 @@ var init_gemini = __esm({
6168
6408
  init_messages();
6169
6409
  init_base_provider();
6170
6410
  init_constants2();
6411
+ init_gemini_cache_manager();
6171
6412
  init_gemini_image_models();
6172
6413
  init_gemini_models();
6173
6414
  init_gemini_speech_models();
6174
6415
  init_utils();
6416
+ GEMINI3_THINKING_LEVEL = {
6417
+ none: "minimal",
6418
+ low: "low",
6419
+ medium: "medium",
6420
+ high: "high",
6421
+ maximum: "high"
6422
+ };
6423
+ GEMINI25_THINKING_BUDGET = {
6424
+ none: 0,
6425
+ low: 2048,
6426
+ medium: 8192,
6427
+ high: 16384,
6428
+ maximum: 24576
6429
+ };
6175
6430
  GEMINI_ROLE_MAP = {
6176
6431
  system: "user",
6177
6432
  user: "user",
@@ -6179,12 +6434,62 @@ var init_gemini = __esm({
6179
6434
  };
6180
6435
  GeminiGenerativeProvider = class extends BaseProviderAdapter {
6181
6436
  providerId = "gemini";
6437
+ cacheManager;
6438
+ constructor(client) {
6439
+ super(client);
6440
+ this.cacheManager = new GeminiCacheManager(client);
6441
+ }
6182
6442
  supports(descriptor) {
6183
6443
  return descriptor.provider === this.providerId;
6184
6444
  }
6185
6445
  getModelSpecs() {
6186
6446
  return GEMINI_MODELS;
6187
6447
  }
6448
+ /**
6449
+ * Override the base stream method to inject cache logic.
6450
+ *
6451
+ * When caching is enabled, we:
6452
+ * 1. Prepare messages as usual
6453
+ * 2. Attempt to get/create a cache for the cacheable prefix
6454
+ * 3. If a cache is available, strip cached contents from the request and add cachedContent ref
6455
+ * 4. Otherwise, proceed normally (graceful degradation)
6456
+ */
6457
+ async *stream(options, descriptor, spec) {
6458
+ const preparedMessages = this.prepareMessages(options.messages);
6459
+ const contents = this.convertMessagesToContents(preparedMessages);
6460
+ const cachingConfig = options.caching;
6461
+ let cacheName = null;
6462
+ let cachedContentCount = 0;
6463
+ if (cachingConfig?.enabled) {
6464
+ let lastUserIndex = -1;
6465
+ for (let i = contents.length - 1; i >= 0; i--) {
6466
+ if (contents[i].role === "user") {
6467
+ lastUserIndex = i;
6468
+ break;
6469
+ }
6470
+ }
6471
+ const cacheResult = await this.cacheManager.getOrCreateCache(
6472
+ descriptor.name,
6473
+ contents,
6474
+ cachingConfig,
6475
+ lastUserIndex
6476
+ );
6477
+ if (cacheResult) {
6478
+ cacheName = cacheResult.cacheName;
6479
+ cachedContentCount = cacheResult.cachedContentCount;
6480
+ }
6481
+ }
6482
+ const payload = this.buildApiRequestFromContents(
6483
+ options,
6484
+ descriptor,
6485
+ spec,
6486
+ contents,
6487
+ cacheName,
6488
+ cachedContentCount
6489
+ );
6490
+ const rawStream = await this.executeStreamRequest(payload, options.signal);
6491
+ yield* this.normalizeProviderStream(rawStream);
6492
+ }
6188
6493
  // =========================================================================
6189
6494
  // Image Generation
6190
6495
  // =========================================================================
@@ -6320,7 +6625,19 @@ var init_gemini = __esm({
6320
6625
  }
6321
6626
  buildApiRequest(options, descriptor, _spec, messages) {
6322
6627
  const contents = this.convertMessagesToContents(messages);
6628
+ return this.buildApiRequestFromContents(options, descriptor, _spec, contents, null, 0);
6629
+ }
6630
+ /**
6631
+ * Build API request from pre-converted Gemini contents.
6632
+ *
6633
+ * When a cache name is provided, the cached prefix is stripped from contents
6634
+ * and the cache reference is added to the config. This tells Gemini to use
6635
+ * the pre-computed KV pairs instead of reprocessing the cached content.
6636
+ */
6637
+ buildApiRequestFromContents(options, descriptor, _spec, contents, cacheName, cachedContentCount) {
6638
+ const effectiveContents = cacheName ? contents.slice(cachedContentCount) : contents;
6323
6639
  const generationConfig = this.buildGenerationConfig(options);
6640
+ const thinkingConfig = resolveGeminiThinkingConfig(options.reasoning, descriptor.name);
6324
6641
  const config = {
6325
6642
  // Note: systemInstruction removed - it doesn't work with countTokens()
6326
6643
  // System messages are now included in contents as user+model exchanges
@@ -6331,11 +6648,14 @@ var init_gemini = __esm({
6331
6648
  mode: import_genai.FunctionCallingConfigMode.NONE
6332
6649
  }
6333
6650
  },
6651
+ ...thinkingConfig ?? {},
6652
+ // Add cache reference if available
6653
+ ...cacheName ? { cachedContent: cacheName } : {},
6334
6654
  ...options.extra
6335
6655
  };
6336
6656
  return {
6337
6657
  model: descriptor.name,
6338
- contents,
6658
+ contents: effectiveContents,
6339
6659
  config
6340
6660
  };
6341
6661
  }
@@ -6468,7 +6788,18 @@ var init_gemini = __esm({
6468
6788
  async *normalizeProviderStream(iterable) {
6469
6789
  const stream2 = iterable;
6470
6790
  for await (const chunk of stream2) {
6471
- const text3 = this.extractMessageText(chunk);
6791
+ const { text: text3, thinkingText, thinkingSignature } = this.extractTextAndThinking(chunk);
6792
+ if (thinkingText) {
6793
+ yield {
6794
+ text: "",
6795
+ thinking: {
6796
+ content: thinkingText,
6797
+ type: "thinking",
6798
+ signature: thinkingSignature
6799
+ },
6800
+ rawEvent: chunk
6801
+ };
6802
+ }
6472
6803
  if (text3) {
6473
6804
  yield { text: text3, rawEvent: chunk };
6474
6805
  }
@@ -6479,11 +6810,30 @@ var init_gemini = __esm({
6479
6810
  }
6480
6811
  }
6481
6812
  }
6482
- extractMessageText(chunk) {
6813
+ /**
6814
+ * Extract both regular text and thinking text from a chunk.
6815
+ * Gemini marks thinking parts with `thought: true`.
6816
+ */
6817
+ extractTextAndThinking(chunk) {
6483
6818
  if (!chunk?.candidates) {
6484
- return "";
6819
+ return { text: "", thinkingText: "" };
6820
+ }
6821
+ let text3 = "";
6822
+ let thinkingText = "";
6823
+ let thinkingSignature;
6824
+ for (const candidate of chunk.candidates) {
6825
+ for (const part of candidate.content?.parts ?? []) {
6826
+ if (part.thought) {
6827
+ thinkingText += part.text ?? "";
6828
+ if (part.thoughtSignature) {
6829
+ thinkingSignature = part.thoughtSignature;
6830
+ }
6831
+ } else {
6832
+ text3 += part.text ?? "";
6833
+ }
6834
+ }
6485
6835
  }
6486
- return chunk.candidates.flatMap((candidate) => candidate.content?.parts ?? []).map((part) => part.text ?? "").join("");
6836
+ return { text: text3, thinkingText, thinkingSignature };
6487
6837
  }
6488
6838
  extractFinishReason(chunk) {
6489
6839
  const candidate = chunk?.candidates?.find((item) => item.finishReason);
@@ -6499,7 +6849,9 @@ var init_gemini = __esm({
6499
6849
  outputTokens: usageMetadata.candidatesTokenCount ?? 0,
6500
6850
  totalTokens: usageMetadata.totalTokenCount ?? 0,
6501
6851
  // Gemini returns cached token count in cachedContentTokenCount
6502
- cachedInputTokens: usageMetadata.cachedContentTokenCount ?? 0
6852
+ cachedInputTokens: usageMetadata.cachedContentTokenCount ?? 0,
6853
+ // Gemini returns thinking tokens in thoughtsTokenCount
6854
+ reasoningTokens: usageMetadata.thoughtsTokenCount
6503
6855
  };
6504
6856
  }
6505
6857
  /**
@@ -7520,11 +7872,13 @@ var init_openai_compatible_provider = __esm({
7520
7872
  yield { text: text3, rawEvent: chunk };
7521
7873
  }
7522
7874
  const finishReason = chunk.choices.find((choice) => choice.finish_reason)?.finish_reason;
7875
+ const usageDetails = chunk.usage;
7523
7876
  const usage = chunk.usage ? {
7524
7877
  inputTokens: chunk.usage.prompt_tokens,
7525
7878
  outputTokens: chunk.usage.completion_tokens,
7526
7879
  totalTokens: chunk.usage.total_tokens,
7527
- cachedInputTokens: 0
7880
+ cachedInputTokens: 0,
7881
+ reasoningTokens: usageDetails?.completion_tokens_details?.reasoning_tokens
7528
7882
  } : void 0;
7529
7883
  if (finishReason || usage) {
7530
7884
  yield { text: "", finishReason, usage, rawEvent: chunk };
@@ -7600,6 +7954,21 @@ var init_huggingface = __esm({
7600
7954
  getModelSpecs() {
7601
7955
  return HUGGINGFACE_MODELS;
7602
7956
  }
7957
+ /**
7958
+ * Override buildApiRequest to inject DeepSeek-specific thinking parameters.
7959
+ * DeepSeek models use `extra_body: { thinking: { type: "enabled" } }` for reasoning.
7960
+ */
7961
+ buildApiRequest(options, descriptor, spec, messages) {
7962
+ const request = super.buildApiRequest(options, descriptor, spec, messages);
7963
+ if (options.reasoning?.enabled && descriptor.name.toLowerCase().includes("deepseek")) {
7964
+ const requestObj = request;
7965
+ requestObj.extra_body = {
7966
+ ...requestObj.extra_body,
7967
+ thinking: { type: "enabled" }
7968
+ };
7969
+ }
7970
+ return request;
7971
+ }
7603
7972
  /**
7604
7973
  * Enhance error messages with HuggingFace-specific guidance.
7605
7974
  */
@@ -8485,7 +8854,7 @@ function sanitizeExtra(extra, allowTemperature) {
8485
8854
  function createOpenAIProviderFromEnv() {
8486
8855
  return createProviderFromEnv("OPENAI_API_KEY", import_openai3.default, OpenAIChatProvider);
8487
8856
  }
8488
- var import_openai3, import_tiktoken, ROLE_MAP2, OpenAIChatProvider;
8857
+ var import_openai3, import_tiktoken, ROLE_MAP2, OPENAI_EFFORT_MAP, OpenAIChatProvider;
8489
8858
  var init_openai = __esm({
8490
8859
  "src/providers/openai.ts"() {
8491
8860
  "use strict";
@@ -8503,6 +8872,13 @@ var init_openai = __esm({
8503
8872
  user: "user",
8504
8873
  assistant: "assistant"
8505
8874
  };
8875
+ OPENAI_EFFORT_MAP = {
8876
+ none: "none",
8877
+ low: "low",
8878
+ medium: "medium",
8879
+ high: "high",
8880
+ maximum: "xhigh"
8881
+ };
8506
8882
  OpenAIChatProvider = class extends BaseProviderAdapter {
8507
8883
  providerId = "openai";
8508
8884
  supports(descriptor) {
@@ -8593,10 +8969,15 @@ var init_openai = __esm({
8593
8969
  };
8594
8970
  }
8595
8971
  buildApiRequest(options, descriptor, spec, messages) {
8596
- const { maxTokens, temperature, topP, stopSequences, extra } = options;
8972
+ const { maxTokens, temperature, topP, stopSequences, extra, reasoning } = options;
8597
8973
  const supportsTemperature = spec?.metadata?.supportsTemperature !== false;
8598
8974
  const shouldIncludeTemperature = typeof temperature === "number" && supportsTemperature;
8599
8975
  const sanitizedExtra = sanitizeExtra(extra, shouldIncludeTemperature);
8976
+ const reasoningParam = reasoning?.enabled !== void 0 ? {
8977
+ reasoning: {
8978
+ effort: OPENAI_EFFORT_MAP[reasoning.effort ?? "medium"]
8979
+ }
8980
+ } : {};
8600
8981
  return {
8601
8982
  model: descriptor.name,
8602
8983
  messages: messages.map((message) => this.convertToOpenAIMessage(message)),
@@ -8607,6 +8988,7 @@ var init_openai = __esm({
8607
8988
  stop: stopSequences,
8608
8989
  stream: true,
8609
8990
  stream_options: { include_usage: true },
8991
+ ...reasoningParam,
8610
8992
  ...sanitizedExtra ?? {},
8611
8993
  ...shouldIncludeTemperature ? { temperature } : {}
8612
8994
  };
@@ -8695,11 +9077,13 @@ var init_openai = __esm({
8695
9077
  yield { text: text3, rawEvent: chunk };
8696
9078
  }
8697
9079
  const finishReason = chunk.choices.find((choice) => choice.finish_reason)?.finish_reason;
9080
+ const usageDetails = chunk.usage;
8698
9081
  const usage = chunk.usage ? {
8699
9082
  inputTokens: chunk.usage.prompt_tokens,
8700
9083
  outputTokens: chunk.usage.completion_tokens,
8701
9084
  totalTokens: chunk.usage.total_tokens,
8702
- cachedInputTokens: chunk.usage.prompt_tokens_details?.cached_tokens ?? 0
9085
+ cachedInputTokens: usageDetails?.prompt_tokens_details?.cached_tokens ?? 0,
9086
+ reasoningTokens: usageDetails?.completion_tokens_details?.reasoning_tokens
8703
9087
  } : void 0;
8704
9088
  if (finishReason || usage) {
8705
9089
  yield { text: "", finishReason, usage, rawEvent: chunk };
@@ -9234,7 +9618,7 @@ function createOpenRouterProviderFromEnv() {
9234
9618
  });
9235
9619
  return new OpenRouterProvider(client, config);
9236
9620
  }
9237
- var import_openai4, OpenRouterProvider;
9621
+ var import_openai4, OPENROUTER_EFFORT_MAP, OpenRouterProvider;
9238
9622
  var init_openrouter = __esm({
9239
9623
  "src/providers/openrouter.ts"() {
9240
9624
  "use strict";
@@ -9242,6 +9626,13 @@ var init_openrouter = __esm({
9242
9626
  init_openai_compatible_provider();
9243
9627
  init_openrouter_models();
9244
9628
  init_utils();
9629
+ OPENROUTER_EFFORT_MAP = {
9630
+ none: "none",
9631
+ low: "low",
9632
+ medium: "medium",
9633
+ high: "high",
9634
+ maximum: "xhigh"
9635
+ };
9245
9636
  OpenRouterProvider = class extends OpenAICompatibleProvider {
9246
9637
  providerId = "openrouter";
9247
9638
  providerAlias = "or";
@@ -9251,6 +9642,20 @@ var init_openrouter = __esm({
9251
9642
  getModelSpecs() {
9252
9643
  return OPENROUTER_MODELS;
9253
9644
  }
9645
+ /**
9646
+ * Override buildApiRequest to inject reasoning parameters.
9647
+ * OpenRouter normalizes reasoning into the standard OpenAI format.
9648
+ */
9649
+ buildApiRequest(options, descriptor, spec, messages) {
9650
+ const request = super.buildApiRequest(options, descriptor, spec, messages);
9651
+ if (options.reasoning?.enabled !== void 0) {
9652
+ const requestObj = request;
9653
+ requestObj.reasoning = {
9654
+ effort: OPENROUTER_EFFORT_MAP[options.reasoning.effort ?? "medium"]
9655
+ };
9656
+ }
9657
+ return request;
9658
+ }
9254
9659
  /**
9255
9660
  * Get custom headers for OpenRouter analytics.
9256
9661
  */
@@ -9488,9 +9893,10 @@ var init_model_registry = __esm({
9488
9893
  * @param outputTokens - Number of output tokens
9489
9894
  * @param cachedInputTokens - Number of cached input tokens (subset of inputTokens)
9490
9895
  * @param cacheCreationInputTokens - Number of cache creation tokens (subset of inputTokens, Anthropic only)
9896
+ * @param reasoningTokens - Number of reasoning/thinking tokens (subset of outputTokens)
9491
9897
  * @returns CostEstimate if model found, undefined otherwise
9492
9898
  */
9493
- estimateCost(modelId, inputTokens, outputTokens, cachedInputTokens = 0, cacheCreationInputTokens = 0) {
9899
+ estimateCost(modelId, inputTokens, outputTokens, cachedInputTokens = 0, cacheCreationInputTokens = 0, reasoningTokens = 0) {
9494
9900
  const spec = this.getModelSpec(modelId);
9495
9901
  if (!spec) return void 0;
9496
9902
  const cachedRate = spec.pricing.cachedInput ?? spec.pricing.input;
@@ -9500,13 +9906,18 @@ var init_model_registry = __esm({
9500
9906
  const cachedInputCost = cachedInputTokens / 1e6 * cachedRate;
9501
9907
  const cacheCreationCost = cacheCreationInputTokens / 1e6 * cacheWriteRate;
9502
9908
  const inputCost = uncachedInputCost + cachedInputCost + cacheCreationCost;
9503
- const outputCost = outputTokens / 1e6 * spec.pricing.output;
9909
+ const reasoningRate = spec.pricing.reasoningOutput ?? spec.pricing.output;
9910
+ const nonReasoningOutputTokens = outputTokens - reasoningTokens;
9911
+ const reasoningCost = reasoningTokens / 1e6 * reasoningRate;
9912
+ const nonReasoningOutputCost = nonReasoningOutputTokens / 1e6 * spec.pricing.output;
9913
+ const outputCost = nonReasoningOutputCost + reasoningCost;
9504
9914
  const totalCost = inputCost + outputCost;
9505
9915
  return {
9506
9916
  inputCost,
9507
9917
  cachedInputCost,
9508
9918
  cacheCreationCost,
9509
9919
  outputCost,
9920
+ reasoningCost,
9510
9921
  totalCost,
9511
9922
  currency: "USD"
9512
9923
  };
@@ -10221,6 +10632,8 @@ var init_builder = __esm({
10221
10632
  // Shared retry config from parent for consistent backoff behavior
10222
10633
  // When a gadget calls withParentContext(ctx), this config is shared
10223
10634
  sharedRetryConfig;
10635
+ reasoningConfig;
10636
+ cachingConfig;
10224
10637
  constructor(client) {
10225
10638
  this.client = client;
10226
10639
  }
@@ -10806,6 +11219,116 @@ var init_builder = __esm({
10806
11219
  this.signal = signal;
10807
11220
  return this;
10808
11221
  }
11222
+ /**
11223
+ * Enable reasoning/thinking mode for reasoning-capable models.
11224
+ *
11225
+ * Can be called with:
11226
+ * - No args: enables reasoning at "medium" effort
11227
+ * - A string effort level: `withReasoning("high")`
11228
+ * - A full config object: `withReasoning({ enabled: true, budgetTokens: 10000 })`
11229
+ *
11230
+ * @param config - Optional effort level or full reasoning config
11231
+ * @returns This builder for chaining
11232
+ *
11233
+ * @example
11234
+ * ```typescript
11235
+ * // Simple — medium effort
11236
+ * LLMist.createAgent()
11237
+ * .withModel("o3")
11238
+ * .withReasoning()
11239
+ * .ask("Solve this logic puzzle...");
11240
+ *
11241
+ * // Explicit effort level
11242
+ * LLMist.createAgent()
11243
+ * .withModel("anthropic:claude-4-opus")
11244
+ * .withReasoning("high")
11245
+ * .ask("Analyze this complex problem");
11246
+ *
11247
+ * // Full config with explicit token budget
11248
+ * LLMist.createAgent()
11249
+ * .withModel("anthropic:claude-4-opus")
11250
+ * .withReasoning({ enabled: true, budgetTokens: 16000 })
11251
+ * .ask("Step through this proof");
11252
+ * ```
11253
+ */
11254
+ withReasoning(config) {
11255
+ if (typeof config === "string") {
11256
+ this.reasoningConfig = { enabled: true, effort: config };
11257
+ } else if (config === void 0) {
11258
+ this.reasoningConfig = { enabled: true, effort: "medium" };
11259
+ } else {
11260
+ this.reasoningConfig = config;
11261
+ }
11262
+ return this;
11263
+ }
11264
+ /**
11265
+ * Explicitly disable reasoning for this agent, even if the model supports it.
11266
+ *
11267
+ * By default, reasoning is auto-enabled at "medium" effort for models with
11268
+ * `features.reasoning: true`. Use this to opt out.
11269
+ *
11270
+ * @returns This builder for chaining
11271
+ */
11272
+ withoutReasoning() {
11273
+ this.reasoningConfig = { enabled: false };
11274
+ return this;
11275
+ }
11276
+ /**
11277
+ * Enable context caching for supported providers.
11278
+ *
11279
+ * Can be called with:
11280
+ * - No args: enables caching with defaults (`{ enabled: true }`)
11281
+ * - A full config object: `withCaching({ enabled: true, scope: "system", ttl: "7200s" })`
11282
+ *
11283
+ * Provider behavior:
11284
+ * - **Anthropic**: Caching is always-on by default via `cache_control` markers.
11285
+ * Calling `withCaching()` explicitly is a no-op (it's already enabled).
11286
+ * - **Gemini**: Creates an explicit cache via `caches.create()` for the configured scope.
11287
+ * - **OpenAI**: Server-side automatic caching (no-op).
11288
+ *
11289
+ * @param config - Optional caching configuration
11290
+ * @returns This builder for chaining
11291
+ *
11292
+ * @example
11293
+ * ```typescript
11294
+ * // Simple — enable with defaults
11295
+ * LLMist.createAgent()
11296
+ * .withModel("gemini:gemini-2.5-flash")
11297
+ * .withCaching()
11298
+ * .ask("Analyze this large codebase...");
11299
+ *
11300
+ * // Cache only system prompt with longer TTL
11301
+ * LLMist.createAgent()
11302
+ * .withModel("gemini:gemini-2.5-pro")
11303
+ * .withCaching({ enabled: true, scope: "system", ttl: "7200s" })
11304
+ * .ask("...");
11305
+ * ```
11306
+ */
11307
+ withCaching(config) {
11308
+ this.cachingConfig = config ?? { enabled: true };
11309
+ return this;
11310
+ }
11311
+ /**
11312
+ * Explicitly disable context caching.
11313
+ *
11314
+ * For Anthropic, this removes `cache_control` markers from requests,
11315
+ * opting out of prompt caching entirely.
11316
+ *
11317
+ * @returns This builder for chaining
11318
+ *
11319
+ * @example
11320
+ * ```typescript
11321
+ * // Disable Anthropic's automatic caching
11322
+ * LLMist.createAgent()
11323
+ * .withModel("sonnet")
11324
+ * .withoutCaching()
11325
+ * .ask("...");
11326
+ * ```
11327
+ */
11328
+ withoutCaching() {
11329
+ this.cachingConfig = { enabled: false };
11330
+ return this;
11331
+ }
10809
11332
  /**
10810
11333
  * Set subagent configuration overrides.
10811
11334
  *
@@ -11091,6 +11614,8 @@ ${endPrefix}`
11091
11614
  retryConfig: this.retryConfig,
11092
11615
  rateLimitConfig: this.rateLimitConfig,
11093
11616
  signal: this.signal,
11617
+ reasoning: this.reasoningConfig,
11618
+ caching: this.cachingConfig,
11094
11619
  subagentConfig: this.subagentConfig,
11095
11620
  // Tree context for shared tree model (subagents share parent's tree)
11096
11621
  parentTree: this.parentContext?.tree,
@@ -11278,6 +11803,8 @@ ${endPrefix}`
11278
11803
  retryConfig: this.retryConfig,
11279
11804
  rateLimitConfig: this.rateLimitConfig,
11280
11805
  signal: this.signal,
11806
+ reasoning: this.reasoningConfig,
11807
+ caching: this.cachingConfig,
11281
11808
  subagentConfig: this.subagentConfig,
11282
11809
  // Tree context for shared tree model (subagents share parent's tree)
11283
11810
  parentTree: this.parentContext?.tree,
@@ -11732,6 +12259,7 @@ var init_cost_reporting_client = __esm({
11732
12259
  let outputTokens = 0;
11733
12260
  let cachedInputTokens = 0;
11734
12261
  let cacheCreationInputTokens = 0;
12262
+ let reasoningTokens = 0;
11735
12263
  const messages = [
11736
12264
  ...options?.systemPrompt ? [{ role: "system", content: options.systemPrompt }] : [],
11737
12265
  { role: "user", content: prompt }
@@ -11748,6 +12276,7 @@ var init_cost_reporting_client = __esm({
11748
12276
  outputTokens = chunk.usage.outputTokens;
11749
12277
  cachedInputTokens = chunk.usage.cachedInputTokens ?? 0;
11750
12278
  cacheCreationInputTokens = chunk.usage.cacheCreationInputTokens ?? 0;
12279
+ reasoningTokens = chunk.usage.reasoningTokens ?? 0;
11751
12280
  }
11752
12281
  }
11753
12282
  this.reportCostFromUsage(
@@ -11755,7 +12284,8 @@ var init_cost_reporting_client = __esm({
11755
12284
  inputTokens,
11756
12285
  outputTokens,
11757
12286
  cachedInputTokens,
11758
- cacheCreationInputTokens
12287
+ cacheCreationInputTokens,
12288
+ reasoningTokens
11759
12289
  );
11760
12290
  return result;
11761
12291
  }
@@ -11774,6 +12304,7 @@ var init_cost_reporting_client = __esm({
11774
12304
  let outputTokens = 0;
11775
12305
  let cachedInputTokens = 0;
11776
12306
  let cacheCreationInputTokens = 0;
12307
+ let reasoningTokens = 0;
11777
12308
  const messages = [
11778
12309
  ...options?.systemPrompt ? [{ role: "system", content: options.systemPrompt }] : [],
11779
12310
  { role: "user", content: prompt }
@@ -11793,6 +12324,7 @@ var init_cost_reporting_client = __esm({
11793
12324
  outputTokens = chunk.usage.outputTokens;
11794
12325
  cachedInputTokens = chunk.usage.cachedInputTokens ?? 0;
11795
12326
  cacheCreationInputTokens = chunk.usage.cacheCreationInputTokens ?? 0;
12327
+ reasoningTokens = chunk.usage.reasoningTokens ?? 0;
11796
12328
  }
11797
12329
  }
11798
12330
  } finally {
@@ -11801,7 +12333,8 @@ var init_cost_reporting_client = __esm({
11801
12333
  inputTokens,
11802
12334
  outputTokens,
11803
12335
  cachedInputTokens,
11804
- cacheCreationInputTokens
12336
+ cacheCreationInputTokens,
12337
+ reasoningTokens
11805
12338
  );
11806
12339
  }
11807
12340
  }
@@ -11828,6 +12361,7 @@ var init_cost_reporting_client = __esm({
11828
12361
  let outputTokens = 0;
11829
12362
  let cachedInputTokens = 0;
11830
12363
  let cacheCreationInputTokens = 0;
12364
+ let reasoningTokens = 0;
11831
12365
  try {
11832
12366
  for await (const chunk of innerStream) {
11833
12367
  if (chunk.usage) {
@@ -11835,6 +12369,7 @@ var init_cost_reporting_client = __esm({
11835
12369
  outputTokens = chunk.usage.outputTokens;
11836
12370
  cachedInputTokens = chunk.usage.cachedInputTokens ?? 0;
11837
12371
  cacheCreationInputTokens = chunk.usage.cacheCreationInputTokens ?? 0;
12372
+ reasoningTokens = chunk.usage.reasoningTokens ?? 0;
11838
12373
  }
11839
12374
  yield chunk;
11840
12375
  }
@@ -11845,7 +12380,8 @@ var init_cost_reporting_client = __esm({
11845
12380
  inputTokens,
11846
12381
  outputTokens,
11847
12382
  cachedInputTokens,
11848
- cacheCreationInputTokens
12383
+ cacheCreationInputTokens,
12384
+ reasoningTokens
11849
12385
  );
11850
12386
  }
11851
12387
  }
@@ -11855,14 +12391,15 @@ var init_cost_reporting_client = __esm({
11855
12391
  /**
11856
12392
  * Calculates and reports cost from token usage.
11857
12393
  */
11858
- reportCostFromUsage(model, inputTokens, outputTokens, cachedInputTokens = 0, cacheCreationInputTokens = 0) {
12394
+ reportCostFromUsage(model, inputTokens, outputTokens, cachedInputTokens = 0, cacheCreationInputTokens = 0, reasoningTokens = 0) {
11859
12395
  if (inputTokens === 0 && outputTokens === 0) return;
11860
12396
  const estimate = this.client.modelRegistry.estimateCost(
11861
12397
  model,
11862
12398
  inputTokens,
11863
12399
  outputTokens,
11864
12400
  cachedInputTokens,
11865
- cacheCreationInputTokens
12401
+ cacheCreationInputTokens,
12402
+ reasoningTokens
11866
12403
  );
11867
12404
  if (estimate && estimate.totalCost > 0) {
11868
12405
  this.reportCost(estimate.totalCost);
@@ -12954,9 +13491,18 @@ var init_stream_processor = __esm({
12954
13491
  let usage;
12955
13492
  let didExecuteGadgets = false;
12956
13493
  let shouldBreakLoop = false;
13494
+ let thinkingContent = "";
12957
13495
  for await (const chunk of stream2) {
12958
13496
  if (chunk.finishReason) finishReason = chunk.finishReason;
12959
13497
  if (chunk.usage) usage = chunk.usage;
13498
+ if (chunk.thinking?.content) {
13499
+ thinkingContent += chunk.thinking.content;
13500
+ yield {
13501
+ type: "thinking",
13502
+ content: chunk.thinking.content,
13503
+ thinkingType: chunk.thinking.type
13504
+ };
13505
+ }
12960
13506
  let processedChunk = "";
12961
13507
  if (chunk.text) {
12962
13508
  processedChunk = chunk.text;
@@ -13070,7 +13616,8 @@ var init_stream_processor = __esm({
13070
13616
  finishReason,
13071
13617
  usage,
13072
13618
  rawResponse: this.responseText,
13073
- finalMessage
13619
+ finalMessage,
13620
+ thinkingContent: thinkingContent || void 0
13074
13621
  };
13075
13622
  yield completionEvent;
13076
13623
  }
@@ -13872,6 +14419,8 @@ var init_agent = __esm({
13872
14419
  mediaStore;
13873
14420
  // Cancellation
13874
14421
  signal;
14422
+ reasoning;
14423
+ caching;
13875
14424
  // Retry configuration
13876
14425
  retryConfig;
13877
14426
  // Rate limit tracker for proactive throttling
@@ -13963,6 +14512,8 @@ var init_agent = __esm({
13963
14512
  );
13964
14513
  }
13965
14514
  this.signal = options.signal;
14515
+ this.reasoning = options.reasoning;
14516
+ this.caching = options.caching;
13966
14517
  this.retryConfig = options.sharedRetryConfig ?? resolveRetryConfig(options.retryConfig);
13967
14518
  if (options.sharedRateLimitTracker) {
13968
14519
  this.rateLimitTracker = options.sharedRateLimitTracker;
@@ -14365,6 +14916,7 @@ var init_agent = __esm({
14365
14916
  usage: result.usage,
14366
14917
  rawResponse: result.rawResponse,
14367
14918
  finalMessage: result.finalMessage,
14919
+ thinkingContent: result.thinkingContent,
14368
14920
  logger: this.logger,
14369
14921
  subagentContext
14370
14922
  };
@@ -14665,17 +15217,49 @@ var init_agent = __esm({
14665
15217
  });
14666
15218
  return { type: "compaction", event: compactionEvent };
14667
15219
  }
15220
+ /**
15221
+ * Resolve reasoning configuration with auto-enable logic.
15222
+ *
15223
+ * Priority: explicit config > auto-enable for reasoning models > undefined
15224
+ * When a model has `features.reasoning: true` and no explicit config is set,
15225
+ * reasoning is automatically enabled at "medium" effort.
15226
+ */
15227
+ resolveReasoningConfig(spec) {
15228
+ if (this.reasoning !== void 0) return this.reasoning;
15229
+ if (spec?.features?.reasoning) {
15230
+ return { enabled: true, effort: "medium" };
15231
+ }
15232
+ return void 0;
15233
+ }
15234
+ /**
15235
+ * Resolve caching configuration.
15236
+ *
15237
+ * Priority: explicit config > default enabled (preserves Anthropic's existing behavior)
15238
+ * Default is `{ enabled: true }` which means:
15239
+ * - Anthropic: `cache_control` markers are added (existing behavior preserved)
15240
+ * - Gemini: Cache manager is consulted but skips if no explicit config was set
15241
+ * - OpenAI: No-op (server-side automatic)
15242
+ */
15243
+ resolveCachingConfig() {
15244
+ if (this.caching !== void 0) return this.caching;
15245
+ return { enabled: true };
15246
+ }
14668
15247
  /**
14669
15248
  * Prepare LLM call options, create tree node, and process beforeLLMCall controller.
14670
15249
  * @returns options, node ID, and optional skipWithSynthetic response if controller wants to skip
14671
15250
  */
14672
15251
  async prepareLLMCall(iteration) {
15252
+ const spec = this.client.modelRegistry?.getModelSpec?.(this.model);
15253
+ const reasoning = this.resolveReasoningConfig(spec);
15254
+ const caching = this.resolveCachingConfig();
14673
15255
  let llmOptions = {
14674
15256
  model: this.model,
14675
15257
  messages: this.conversation.getMessages(),
14676
15258
  temperature: this.temperature,
14677
15259
  maxTokens: this.defaultMaxTokens,
14678
- signal: this.signal
15260
+ signal: this.signal,
15261
+ reasoning,
15262
+ caching
14679
15263
  };
14680
15264
  const llmNode = this.tree.addLLMCall({
14681
15265
  iteration,
@@ -14745,13 +15329,15 @@ var init_agent = __esm({
14745
15329
  inputTokens,
14746
15330
  outputTokens,
14747
15331
  result.usage?.cachedInputTokens ?? 0,
14748
- result.usage?.cacheCreationInputTokens ?? 0
15332
+ result.usage?.cacheCreationInputTokens ?? 0,
15333
+ result.usage?.reasoningTokens ?? 0
14749
15334
  )?.totalCost;
14750
15335
  this.tree.completeLLMCall(nodeId, {
14751
15336
  response: result.rawResponse,
14752
15337
  usage: result.usage,
14753
15338
  finishReason: result.finishReason,
14754
- cost: llmCost
15339
+ cost: llmCost,
15340
+ thinkingContent: result.thinkingContent
14755
15341
  });
14756
15342
  }
14757
15343
  /**