npm - llmist - Versions diffs - 15.12.0 → 15.14.0 - Mend

llmist 15.12.0 → 15.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/dist/index.cjs CHANGED Viewed

@@ -229,7 +229,8 @@ var init_execution_tree = __esm({
           response: llmNode.response,
           usage: llmNode.usage,
           finishReason: llmNode.finishReason,
-          cost: llmNode.cost
+          cost: llmNode.cost,
+          thinkingContent: params.thinkingContent
         });
       }
       /**
@@ -4529,7 +4530,10 @@ var init_hook_presets = __esm({
                     const costEstimate = modelRegistry.estimateCost(
                       modelName,
                       ctx.usage.inputTokens,
-                      ctx.usage.outputTokens
+                      ctx.usage.outputTokens,
+                      ctx.usage.cachedInputTokens ?? 0,
+                      ctx.usage.cacheCreationInputTokens ?? 0,
+                      ctx.usage.reasoningTokens ?? 0
                     );
                     if (costEstimate) {
                       totalCost += costEstimate.totalCost;
@@ -5026,10 +5030,10 @@ var init_anthropic_models = __esm({
         contextWindow: 2e5,
         maxOutputTokens: 64e3,
         pricing: {
-          input: 0.8,
-          output: 4,
-          cachedInput: 0.08,
-          cacheWriteInput: 1
+          input: 1,
+          output: 5,
+          cachedInput: 0.1,
+          cacheWriteInput: 1.25
         },
         knowledgeCutoff: "2025-02",
         features: {
@@ -5225,10 +5229,10 @@ var init_anthropic_models = __esm({
         contextWindow: 2e5,
         maxOutputTokens: 64e3,
         pricing: {
-          input: 0.8,
-          output: 4,
-          cachedInput: 0.08,
-          cacheWriteInput: 1
+          input: 1,
+          output: 5,
+          cachedInput: 0.1,
+          cacheWriteInput: 1.25
         },
         knowledgeCutoff: "2025-02",
         features: {
@@ -5371,10 +5375,15 @@ var init_utils = __esm({
 });
 // src/providers/anthropic.ts
+function resolveAnthropicThinking(reasoning) {
+  if (!reasoning?.enabled) return void 0;
+  const budget = reasoning.budgetTokens ? Math.max(1024, reasoning.budgetTokens) : ANTHROPIC_EFFORT_BUDGET[reasoning.effort ?? "medium"];
+  return { type: "enabled", budget_tokens: budget };
+}
 function createAnthropicProviderFromEnv() {
   return createProviderFromEnv("ANTHROPIC_API_KEY", import_sdk.default, AnthropicMessagesProvider);
 }
-var import_sdk, AnthropicMessagesProvider;
+var import_sdk, ANTHROPIC_EFFORT_BUDGET, AnthropicMessagesProvider;
 var init_anthropic = __esm({
   "src/providers/anthropic.ts"() {
     "use strict";
@@ -5384,6 +5393,14 @@ var init_anthropic = __esm({
     init_base_provider();
     init_constants2();
     init_utils();
+    ANTHROPIC_EFFORT_BUDGET = {
+      none: 1024,
+      // Minimum allowed by Anthropic
+      low: 2048,
+      medium: 8192,
+      high: 16384,
+      maximum: 32768
+    };
     AnthropicMessagesProvider = class extends BaseProviderAdapter {
       providerId = "anthropic";
       supports(descriptor) {
@@ -5415,12 +5432,13 @@ var init_anthropic = __esm({
         );
       }
       buildApiRequest(options, descriptor, spec, messages) {
+        const cachingEnabled = options.caching?.enabled !== false;
         const systemMessages = messages.filter((message) => message.role === "system");
         const system = systemMessages.length > 0 ? systemMessages.map((m, index) => ({
           type: "text",
           text: extractMessageText(m.content),
-          // Add cache_control to the LAST system message block
-          ...index === systemMessages.length - 1 ? { cache_control: { type: "ephemeral" } } : {}
+          // Add cache_control to the LAST system message block (only when caching is enabled)
+          ...cachingEnabled && index === systemMessages.length - 1 ? { cache_control: { type: "ephemeral" } } : {}
         })) : void 0;
         const nonSystemMessages = messages.filter(
           (message) => message.role !== "system"
@@ -5433,19 +5451,22 @@ var init_anthropic = __esm({
           role: message.role,
           content: this.convertToAnthropicContent(
             message.content,
-            message.role === "user" && index === lastUserIndex
+            cachingEnabled && message.role === "user" && index === lastUserIndex
           )
         }));
         const defaultMaxTokens = spec?.maxOutputTokens ?? ANTHROPIC_DEFAULT_MAX_OUTPUT_TOKENS;
+        const thinking = resolveAnthropicThinking(options.reasoning);
+        const temperature = thinking ? void 0 : options.temperature;
         const payload = {
           model: descriptor.name,
           system,
           messages: conversation,
           max_tokens: options.maxTokens ?? defaultMaxTokens,
-          temperature: options.temperature,
+          temperature,
           top_p: options.topP,
           stop_sequences: options.stopSequences,
           stream: true,
+          ...thinking ? { thinking } : {},
           ...options.extra
         };
         return payload;
@@ -5525,8 +5546,39 @@ var init_anthropic = __esm({
             };
             continue;
           }
-          if (event.type === "content_block_delta" && event.delta.type === "text_delta") {
-            yield { text: event.delta.text ?? "", rawEvent: event };
+          if (event.type === "content_block_start") {
+            const block = event.content_block;
+            if (block.type === "thinking") {
+              yield { text: "", thinking: { content: "", type: "thinking" }, rawEvent: event };
+              continue;
+            }
+            if (block.type === "redacted_thinking") {
+              yield { text: "", thinking: { content: "", type: "redacted" }, rawEvent: event };
+              continue;
+            }
+          }
+          if (event.type === "content_block_delta") {
+            const delta = event.delta;
+            if (delta.type === "thinking_delta" && delta.thinking) {
+              yield {
+                text: "",
+                thinking: { content: delta.thinking, type: "thinking" },
+                rawEvent: event
+              };
+              continue;
+            }
+            if (delta.type === "signature_delta" && delta.signature) {
+              yield {
+                text: "",
+                thinking: { content: "", type: "thinking", signature: delta.signature },
+                rawEvent: event
+              };
+              continue;
+            }
+            if (delta.type === "text_delta") {
+              yield { text: delta.text ?? "", rawEvent: event };
+              continue;
+            }
             continue;
           }
           if (event.type === "message_delta") {
@@ -5616,6 +5668,177 @@ var init_anthropic = __esm({
   }
 });
+// src/providers/gemini-cache-manager.ts
+var import_node_crypto3, GeminiCacheManager;
+var init_gemini_cache_manager = __esm({
+  "src/providers/gemini-cache-manager.ts"() {
+    "use strict";
+    import_node_crypto3 = require("crypto");
+    GeminiCacheManager = class {
+      constructor(client) {
+        this.client = client;
+      }
+      activeCache = null;
+      /**
+       * Get or create a cache for the given content.
+       *
+       * Returns `{ cacheName, cachedContentCount }` if a cache was created/reused, or `null` if caching
+       * was skipped (disabled, nothing cacheable, below threshold, missing cache name, or API error).
+       *
+       * @param model - Gemini model name (e.g., "gemini-2.5-flash")
+       * @param allContents - All Gemini-formatted contents (system + conversation)
+       * @param config - Caching configuration from the user
+       * @param lastUserMessageIndex - Index of the last user message (content at or after this index is not cached in "conversation" scope)
+       * @returns `{ cacheName, cachedContentCount }` on success, otherwise null
+       */
+      async getOrCreateCache(model, allContents, config, lastUserMessageIndex) {
+        if (!config.enabled) return null;
+        const scope = config.scope ?? "conversation";
+        const ttl = config.ttl ?? "3600s";
+        const minTokenThreshold = config.minTokenThreshold ?? 32768;
+        const cacheableContents = this.selectCacheableContents(
+          allContents,
+          scope,
+          lastUserMessageIndex
+        );
+        if (cacheableContents.length === 0) return null;
+        const estimatedTokens = this.estimateTokenCount(cacheableContents);
+        if (estimatedTokens < minTokenThreshold) return null;
+        const contentHash = this.computeContentHash(cacheableContents, model);
+        if (this.activeCache && this.canReuseCache(this.activeCache, model, contentHash)) {
+          return {
+            cacheName: this.activeCache.name,
+            cachedContentCount: cacheableContents.length
+          };
+        }
+        try {
+          await this.cleanupActiveCache();
+          const response = await this.client.caches.create({
+            model,
+            config: {
+              contents: cacheableContents,
+              ttl,
+              displayName: `llmist-${scope}-${Date.now()}`
+            }
+          });
+          if (!response.name) {
+            return null;
+          }
+          this.activeCache = {
+            name: response.name,
+            model,
+            contentHash,
+            expireTime: response.expireTime ?? ""
+          };
+          return {
+            cacheName: response.name,
+            cachedContentCount: cacheableContents.length
+          };
+        } catch (error) {
+          console.warn("Gemini cache creation failed, continuing without cache:", error);
+          return null;
+        }
+      }
+      /**
+       * Clean up the active cache (best-effort).
+       * Caches auto-expire via TTL, so failure is non-critical.
+       */
+      async dispose() {
+        await this.cleanupActiveCache();
+      }
+      /**
+       * Select which contents to cache based on scope.
+       *
+       * - "system": Only system-derived messages (the initial user+model exchanges
+       *   generated from system messages)
+       * - "conversation": Everything before the last user message (empty when that index is <= 0)
+       */
+      selectCacheableContents(allContents, scope, lastUserMessageIndex) {
+        if (scope === "system") {
+          let systemEndIndex = 0;
+          for (let i = 0; i < allContents.length; i++) {
+            const content = allContents[i];
+            if (content.role === "model" && content.parts.length === 1 && "text" in content.parts[0] && content.parts[0].text === "Understood.") {
+              systemEndIndex = i + 1;
+            } else if (content.role === "user") {
+              const next = allContents[i + 1];
+              if (next && next.role === "model" && next.parts.length === 1 && "text" in next.parts[0] && next.parts[0].text === "Understood.") {
+                continue;
+              }
+              break;
+            } else {
+              break;
+            }
+          }
+          return allContents.slice(0, systemEndIndex);
+        }
+        if (lastUserMessageIndex <= 0) return [];
+        return allContents.slice(0, lastUserMessageIndex);
+      }
+      /**
+       * Estimate token count from contents using character-based heuristic.
+       * Uses ~4 characters per text token (conservative estimate for English text); each inlineData part counts as a flat 258 tokens.
+       */
+      estimateTokenCount(contents) {
+        let totalChars = 0;
+        for (const content of contents) {
+          for (const part of content.parts) {
+            if ("text" in part) {
+              totalChars += part.text.length;
+            } else if ("inlineData" in part) {
+              totalChars += 258 * 4;
+            }
+          }
+        }
+        return Math.ceil(totalChars / 4);
+      }
+      /**
+       * Compute a stable SHA-256 hex digest of the model name plus each content's role and text/inlineData parts, for change detection (other part kinds are not hashed).
+       */
+      computeContentHash(contents, model) {
+        const hash = (0, import_node_crypto3.createHash)("sha256");
+        hash.update(model);
+        for (const content of contents) {
+          hash.update(content.role);
+          for (const part of content.parts) {
+            if ("text" in part) {
+              hash.update(part.text);
+            } else if ("inlineData" in part) {
+              hash.update(part.inlineData.mimeType);
+              hash.update(part.inlineData.data);
+            }
+          }
+        }
+        return hash.digest("hex");
+      }
+      /**
+       * Check if an existing cache can be reused: same model, same content hash, and at least 60 seconds left before a known expiry.
+       */
+      canReuseCache(cache, model, contentHash) {
+        if (cache.model !== model) return false;
+        if (cache.contentHash !== contentHash) return false;
+        if (cache.expireTime) {
+          const expiresAt = new Date(cache.expireTime).getTime();
+          const now = Date.now();
+          if (expiresAt - now < 6e4) return false;
+        }
+        return true;
+      }
+      /**
+       * Delete the active cache (best-effort): deletion errors are ignored and the local reference is cleared either way.
+       */
+      async cleanupActiveCache() {
+        if (!this.activeCache) return;
+        try {
+          await this.client.caches.delete({ name: this.activeCache.name });
+        } catch {
+        }
+        this.activeCache = null;
+      }
+    };
+  }
+});
 // src/providers/gemini-image-models.ts
 function getGeminiImageModelSpec(modelId) {
   return geminiImageModels.find((m) => m.modelId === modelId);
@@ -5835,10 +6058,10 @@ var init_gemini_models = __esm({
         contextWindow: 1048576,
         maxOutputTokens: 65536,
         pricing: {
-          input: 0.4,
-          // $0.40 for text/image/video
+          input: 0.5,
+          // $0.50 for text/image/video
           output: 3,
-          cachedInput: 0.04
+          cachedInput: 0.05
         },
         knowledgeCutoff: "2025-01",
         features: {
@@ -6132,6 +6355,23 @@ var init_gemini_speech_models = __esm({
 });
 // src/providers/gemini.ts
+function resolveGeminiThinkingConfig(reasoning, modelName) {
+  if (!reasoning?.enabled) return void 0;
+  const isGemini3 = modelName.includes("gemini-3");
+  if (isGemini3) {
+    return {
+      thinkingConfig: {
+        thinkingLevel: GEMINI3_THINKING_LEVEL[reasoning.effort ?? "medium"]
+      }
+    };
+  }
+  const budget = reasoning.budgetTokens ?? GEMINI25_THINKING_BUDGET[reasoning.effort ?? "medium"];
+  return {
+    thinkingConfig: {
+      thinkingBudget: budget
+    }
+  };
+}
 function wrapPcmInWav(pcmData, sampleRate, bitsPerSample, numChannels) {
   const byteRate = sampleRate * numChannels * bitsPerSample / 8;
   const blockAlign = numChannels * bitsPerSample / 8;
@@ -6160,7 +6400,7 @@ function wrapPcmInWav(pcmData, sampleRate, bitsPerSample, numChannels) {
 function createGeminiProviderFromEnv() {
   return createProviderFromEnv("GEMINI_API_KEY", import_genai.GoogleGenAI, GeminiGenerativeProvider);
 }
-var import_genai, GEMINI_ROLE_MAP, GeminiGenerativeProvider;
+var import_genai, GEMINI3_THINKING_LEVEL, GEMINI25_THINKING_BUDGET, GEMINI_ROLE_MAP, GeminiGenerativeProvider;
 var init_gemini = __esm({
   "src/providers/gemini.ts"() {
     "use strict";
@@ -6168,10 +6408,25 @@ var init_gemini = __esm({
     init_messages();
     init_base_provider();
     init_constants2();
+    init_gemini_cache_manager();
     init_gemini_image_models();
     init_gemini_models();
     init_gemini_speech_models();
     init_utils();
+    GEMINI3_THINKING_LEVEL = {
+      none: "minimal",
+      low: "low",
+      medium: "medium",
+      high: "high",
+      maximum: "high"
+    };
+    GEMINI25_THINKING_BUDGET = {
+      none: 0,
+      low: 2048,
+      medium: 8192,
+      high: 16384,
+      maximum: 24576
+    };
     GEMINI_ROLE_MAP = {
       system: "user",
       user: "user",
@@ -6179,12 +6434,62 @@ var init_gemini = __esm({
     };
     GeminiGenerativeProvider = class extends BaseProviderAdapter {
       providerId = "gemini";
+      cacheManager;
+      constructor(client) {
+        super(client);
+        this.cacheManager = new GeminiCacheManager(client);
+      }
       supports(descriptor) {
         return descriptor.provider === this.providerId;
       }
       getModelSpecs() {
         return GEMINI_MODELS;
       }
+      /**
+       * Override the base stream method to inject cache logic.
+       *
+       * When `options.caching.enabled` is truthy (a missing caching config means no cache lookup), we:
+       * 1. Prepare messages as usual
+       * 2. Attempt to get/create a cache for the cacheable prefix
+       * 3. If a cache is available, strip cached contents from the request and add cachedContent ref
+       * 4. Otherwise, proceed normally (graceful degradation)
+       */
+      async *stream(options, descriptor, spec) {
+        const preparedMessages = this.prepareMessages(options.messages);
+        const contents = this.convertMessagesToContents(preparedMessages);
+        const cachingConfig = options.caching;
+        let cacheName = null;
+        let cachedContentCount = 0;
+        if (cachingConfig?.enabled) {
+          let lastUserIndex = -1;
+          for (let i = contents.length - 1; i >= 0; i--) {
+            if (contents[i].role === "user") {
+              lastUserIndex = i;
+              break;
+            }
+          }
+          const cacheResult = await this.cacheManager.getOrCreateCache(
+            descriptor.name,
+            contents,
+            cachingConfig,
+            lastUserIndex
+          );
+          if (cacheResult) {
+            cacheName = cacheResult.cacheName;
+            cachedContentCount = cacheResult.cachedContentCount;
+          }
+        }
+        const payload = this.buildApiRequestFromContents(
+          options,
+          descriptor,
+          spec,
+          contents,
+          cacheName,
+          cachedContentCount
+        );
+        const rawStream = await this.executeStreamRequest(payload, options.signal);
+        yield* this.normalizeProviderStream(rawStream);
+      }
       // =========================================================================
       // Image Generation
       // =========================================================================
@@ -6320,7 +6625,19 @@ var init_gemini = __esm({
       }
       buildApiRequest(options, descriptor, _spec, messages) {
         const contents = this.convertMessagesToContents(messages);
+        return this.buildApiRequestFromContents(options, descriptor, _spec, contents, null, 0);
+      }
+      /**
+       * Build API request from pre-converted Gemini contents.
+       *
+       * When a cache name is provided, the cached prefix is stripped from contents
+       * and the cache reference is added to the config. This tells Gemini to use
+       * the pre-computed KV pairs instead of reprocessing the cached content.
+       */
+      buildApiRequestFromContents(options, descriptor, _spec, contents, cacheName, cachedContentCount) {
+        const effectiveContents = cacheName ? contents.slice(cachedContentCount) : contents;
         const generationConfig = this.buildGenerationConfig(options);
+        const thinkingConfig = resolveGeminiThinkingConfig(options.reasoning, descriptor.name);
         const config = {
           // Note: systemInstruction removed - it doesn't work with countTokens()
           // System messages are now included in contents as user+model exchanges
@@ -6331,11 +6648,14 @@ var init_gemini = __esm({
               mode: import_genai.FunctionCallingConfigMode.NONE
             }
           },
+          ...thinkingConfig ?? {},
+          // Add cache reference if available
+          ...cacheName ? { cachedContent: cacheName } : {},
           ...options.extra
         };
         return {
           model: descriptor.name,
-          contents,
+          contents: effectiveContents,
           config
         };
       }
@@ -6468,7 +6788,18 @@ var init_gemini = __esm({
       async *normalizeProviderStream(iterable) {
         const stream2 = iterable;
         for await (const chunk of stream2) {
-          const text3 = this.extractMessageText(chunk);
+          const { text: text3, thinkingText, thinkingSignature } = this.extractTextAndThinking(chunk);
+          if (thinkingText) {
+            yield {
+              text: "",
+              thinking: {
+                content: thinkingText,
+                type: "thinking",
+                signature: thinkingSignature
+              },
+              rawEvent: chunk
+            };
+          }
           if (text3) {
             yield { text: text3, rawEvent: chunk };
           }
@@ -6479,11 +6810,30 @@ var init_gemini = __esm({
           }
         }
       }
-      extractMessageText(chunk) {
+      /**
+       * Extract both regular text and thinking text from a chunk.
+       * Gemini marks thinking parts with `thought: true`.
+       */
+      extractTextAndThinking(chunk) {
         if (!chunk?.candidates) {
-          return "";
+          return { text: "", thinkingText: "" };
+        }
+        let text3 = "";
+        let thinkingText = "";
+        let thinkingSignature;
+        for (const candidate of chunk.candidates) {
+          for (const part of candidate.content?.parts ?? []) {
+            if (part.thought) {
+              thinkingText += part.text ?? "";
+              if (part.thoughtSignature) {
+                thinkingSignature = part.thoughtSignature;
+              }
+            } else {
+              text3 += part.text ?? "";
+            }
+          }
         }
-        return chunk.candidates.flatMap((candidate) => candidate.content?.parts ?? []).map((part) => part.text ?? "").join("");
+        return { text: text3, thinkingText, thinkingSignature };
       }
       extractFinishReason(chunk) {
         const candidate = chunk?.candidates?.find((item) => item.finishReason);
@@ -6499,7 +6849,9 @@ var init_gemini = __esm({
           outputTokens: usageMetadata.candidatesTokenCount ?? 0,
           totalTokens: usageMetadata.totalTokenCount ?? 0,
           // Gemini returns cached token count in cachedContentTokenCount
-          cachedInputTokens: usageMetadata.cachedContentTokenCount ?? 0
+          cachedInputTokens: usageMetadata.cachedContentTokenCount ?? 0,
+          // Gemini returns thinking tokens in thoughtsTokenCount
+          reasoningTokens: usageMetadata.thoughtsTokenCount
         };
       }
       /**
@@ -7520,11 +7872,13 @@ var init_openai_compatible_provider = __esm({
             yield { text: text3, rawEvent: chunk };
           }
           const finishReason = chunk.choices.find((choice) => choice.finish_reason)?.finish_reason;
+          const usageDetails = chunk.usage;
           const usage = chunk.usage ? {
             inputTokens: chunk.usage.prompt_tokens,
             outputTokens: chunk.usage.completion_tokens,
             totalTokens: chunk.usage.total_tokens,
-            cachedInputTokens: 0
+            cachedInputTokens: 0,
+            reasoningTokens: usageDetails?.completion_tokens_details?.reasoning_tokens
           } : void 0;
           if (finishReason || usage) {
             yield { text: "", finishReason, usage, rawEvent: chunk };
@@ -7600,6 +7954,21 @@ var init_huggingface = __esm({
       getModelSpecs() {
         return HUGGINGFACE_MODELS;
       }
+      /**
+       * Override buildApiRequest to inject DeepSeek-specific thinking parameters when reasoning is enabled and the model name contains "deepseek" (case-insensitive).
+       * DeepSeek models use `extra_body: { thinking: { type: "enabled" } }` for reasoning.
+       */
+      buildApiRequest(options, descriptor, spec, messages) {
+        const request = super.buildApiRequest(options, descriptor, spec, messages);
+        if (options.reasoning?.enabled && descriptor.name.toLowerCase().includes("deepseek")) {
+          const requestObj = request;
+          requestObj.extra_body = {
+            ...requestObj.extra_body,
+            thinking: { type: "enabled" }
+          };
+        }
+        return request;
+      }
       /**
        * Enhance error messages with HuggingFace-specific guidance.
        */
@@ -8485,7 +8854,7 @@ function sanitizeExtra(extra, allowTemperature) {
 function createOpenAIProviderFromEnv() {
   return createProviderFromEnv("OPENAI_API_KEY", import_openai3.default, OpenAIChatProvider);
 }
-var import_openai3, import_tiktoken, ROLE_MAP2, OpenAIChatProvider;
+var import_openai3, import_tiktoken, ROLE_MAP2, OPENAI_EFFORT_MAP, OpenAIChatProvider;
 var init_openai = __esm({
   "src/providers/openai.ts"() {
     "use strict";
@@ -8503,6 +8872,13 @@ var init_openai = __esm({
       user: "user",
       assistant: "assistant"
     };
+    OPENAI_EFFORT_MAP = {
+      none: "none",
+      low: "low",
+      medium: "medium",
+      high: "high",
+      maximum: "xhigh"
+    };
     OpenAIChatProvider = class extends BaseProviderAdapter {
       providerId = "openai";
       supports(descriptor) {
@@ -8593,10 +8969,15 @@ var init_openai = __esm({
         };
       }
       buildApiRequest(options, descriptor, spec, messages) {
-        const { maxTokens, temperature, topP, stopSequences, extra } = options;
+        const { maxTokens, temperature, topP, stopSequences, extra, reasoning } = options;
         const supportsTemperature = spec?.metadata?.supportsTemperature !== false;
         const shouldIncludeTemperature = typeof temperature === "number" && supportsTemperature;
         const sanitizedExtra = sanitizeExtra(extra, shouldIncludeTemperature);
+        const reasoningParam = reasoning?.enabled !== void 0 ? {
+          reasoning: {
+            effort: OPENAI_EFFORT_MAP[reasoning.effort ?? "medium"]
+          }
+        } : {};
         return {
           model: descriptor.name,
           messages: messages.map((message) => this.convertToOpenAIMessage(message)),
@@ -8607,6 +8988,7 @@ var init_openai = __esm({
           stop: stopSequences,
           stream: true,
           stream_options: { include_usage: true },
+          ...reasoningParam,
           ...sanitizedExtra ?? {},
           ...shouldIncludeTemperature ? { temperature } : {}
         };
@@ -8695,11 +9077,13 @@ var init_openai = __esm({
             yield { text: text3, rawEvent: chunk };
           }
           const finishReason = chunk.choices.find((choice) => choice.finish_reason)?.finish_reason;
+          const usageDetails = chunk.usage;
           const usage = chunk.usage ? {
             inputTokens: chunk.usage.prompt_tokens,
             outputTokens: chunk.usage.completion_tokens,
             totalTokens: chunk.usage.total_tokens,
-            cachedInputTokens: chunk.usage.prompt_tokens_details?.cached_tokens ?? 0
+            cachedInputTokens: usageDetails?.prompt_tokens_details?.cached_tokens ?? 0,
+            reasoningTokens: usageDetails?.completion_tokens_details?.reasoning_tokens
           } : void 0;
           if (finishReason || usage) {
             yield { text: "", finishReason, usage, rawEvent: chunk };
@@ -9234,7 +9618,7 @@ function createOpenRouterProviderFromEnv() {
   });
   return new OpenRouterProvider(client, config);
 }
-var import_openai4, OpenRouterProvider;
+var import_openai4, OPENROUTER_EFFORT_MAP, OpenRouterProvider;
 var init_openrouter = __esm({
   "src/providers/openrouter.ts"() {
     "use strict";
@@ -9242,6 +9626,13 @@ var init_openrouter = __esm({
     init_openai_compatible_provider();
     init_openrouter_models();
     init_utils();
+    OPENROUTER_EFFORT_MAP = {
+      none: "none",
+      low: "low",
+      medium: "medium",
+      high: "high",
+      maximum: "xhigh"
+    };
     OpenRouterProvider = class extends OpenAICompatibleProvider {
       providerId = "openrouter";
       providerAlias = "or";
@@ -9251,6 +9642,20 @@ var init_openrouter = __esm({
       getModelSpecs() {
         return OPENROUTER_MODELS;
       }
+      /**
+       * Override buildApiRequest to inject `reasoning.effort` (default "medium") whenever `options.reasoning.enabled` is defined.
+       * OpenRouter normalizes reasoning into the standard OpenAI format. NOTE(review): this also fires for `enabled: false` — confirm that is intended.
+       */
+      buildApiRequest(options, descriptor, spec, messages) {
+        const request = super.buildApiRequest(options, descriptor, spec, messages);
+        if (options.reasoning?.enabled !== void 0) {
+          const requestObj = request;
+          requestObj.reasoning = {
+            effort: OPENROUTER_EFFORT_MAP[options.reasoning.effort ?? "medium"]
+          };
+        }
+        return request;
+      }
       /**
        * Get custom headers for OpenRouter analytics.
        */
@@ -9488,9 +9893,10 @@ var init_model_registry = __esm({
        * @param outputTokens - Number of output tokens
        * @param cachedInputTokens - Number of cached input tokens (subset of inputTokens)
        * @param cacheCreationInputTokens - Number of cache creation tokens (subset of inputTokens, Anthropic only)
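+       * @param reasoningTokens - Number of reasoning/thinking tokens (assumed to be a subset of outputTokens; NOTE(review): confirm each provider reports them that way, otherwise `outputTokens - reasoningTokens` can go negative)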
        * @returns CostEstimate if model found, undefined otherwise
        */
-      estimateCost(modelId, inputTokens, outputTokens, cachedInputTokens = 0, cacheCreationInputTokens = 0) {
+      estimateCost(modelId, inputTokens, outputTokens, cachedInputTokens = 0, cacheCreationInputTokens = 0, reasoningTokens = 0) {
         const spec = this.getModelSpec(modelId);
         if (!spec) return void 0;
         const cachedRate = spec.pricing.cachedInput ?? spec.pricing.input;
@@ -9500,13 +9906,18 @@ var init_model_registry = __esm({
         const cachedInputCost = cachedInputTokens / 1e6 * cachedRate;
         const cacheCreationCost = cacheCreationInputTokens / 1e6 * cacheWriteRate;
         const inputCost = uncachedInputCost + cachedInputCost + cacheCreationCost;
-        const outputCost = outputTokens / 1e6 * spec.pricing.output;
+        const reasoningRate = spec.pricing.reasoningOutput ?? spec.pricing.output;
+        const nonReasoningOutputTokens = outputTokens - reasoningTokens;
+        const reasoningCost = reasoningTokens / 1e6 * reasoningRate;
+        const nonReasoningOutputCost = nonReasoningOutputTokens / 1e6 * spec.pricing.output;
+        const outputCost = nonReasoningOutputCost + reasoningCost;
         const totalCost = inputCost + outputCost;
         return {
           inputCost,
           cachedInputCost,
           cacheCreationCost,
           outputCost,
+          reasoningCost,
           totalCost,
           currency: "USD"
         };
@@ -10221,6 +10632,8 @@ var init_builder = __esm({
       // Shared retry config from parent for consistent backoff behavior
       // When a gadget calls withParentContext(ctx), this config is shared
       sharedRetryConfig;
+      reasoningConfig;
+      cachingConfig;
       constructor(client) {
         this.client = client;
       }
@@ -10806,6 +11219,116 @@ var init_builder = __esm({
         this.signal = signal;
         return this;
       }
+      /**
+       * Enable reasoning/thinking mode for reasoning-capable models.
+       *
+       * Can be called with:
+       * - No args: enables reasoning at "medium" effort
+       * - A string effort level: `withReasoning("high")`
+       * - A full config object: `withReasoning({ enabled: true, budgetTokens: 10000 })`
+       *
+       * @param config - Optional effort level or full reasoning config
+       * @returns This builder for chaining
+       *
+       * @example
+       * ```typescript
+       * // Simple — medium effort
+       * LLMist.createAgent()
+       *   .withModel("o3")
+       *   .withReasoning()
+       *   .ask("Solve this logic puzzle...");
+       *
+       * // Explicit effort level
+       * LLMist.createAgent()
+       *   .withModel("anthropic:claude-4-opus")
+       *   .withReasoning("high")
+       *   .ask("Analyze this complex problem");
+       *
+       * // Full config with explicit token budget
+       * LLMist.createAgent()
+       *   .withModel("anthropic:claude-4-opus")
+       *   .withReasoning({ enabled: true, budgetTokens: 16000 })
+       *   .ask("Step through this proof");
+       * ```
+       */
+      withReasoning(config) {
+        if (typeof config === "string") {
+          this.reasoningConfig = { enabled: true, effort: config };
+        } else if (config === void 0) {
+          this.reasoningConfig = { enabled: true, effort: "medium" };
+        } else {
+          this.reasoningConfig = config;
+        }
+        return this;
+      }
+      /**
+       * Explicitly disable reasoning for this agent, even if the model supports it.
+       *
+       * By default, reasoning is auto-enabled at "medium" effort for models with
+       * `features.reasoning: true`. Use this to opt out.
+       *
+       * @returns This builder for chaining
+       */
+      withoutReasoning() {
+        this.reasoningConfig = { enabled: false };
+        return this;
+      }
+      /**
+       * Enable context caching for supported providers.
+       *
+       * Can be called with:
+       * - No args: enables caching with defaults (`{ enabled: true }`)
+       * - A full config object: `withCaching({ enabled: true, scope: "system", ttl: "7200s" })`
+       *
+       * Provider behavior:
+       * - **Anthropic**: Caching is always-on by default via `cache_control` markers.
+       *   Calling `withCaching()` explicitly is a no-op (it's already enabled).
+       * - **Gemini**: Creates an explicit cache via `caches.create()` for the configured scope.
+       * - **OpenAI**: Server-side automatic caching (no-op).
+       *
+       * @param config - Optional caching configuration
+       * @returns This builder for chaining
+       *
+       * @example
+       * ```typescript
+       * // Simple — enable with defaults
+       * LLMist.createAgent()
+       *   .withModel("gemini:gemini-2.5-flash")
+       *   .withCaching()
+       *   .ask("Analyze this large codebase...");
+       *
+       * // Cache only system prompt with longer TTL
+       * LLMist.createAgent()
+       *   .withModel("gemini:gemini-2.5-pro")
+       *   .withCaching({ enabled: true, scope: "system", ttl: "7200s" })
+       *   .ask("...");
+       * ```
+       */
+      withCaching(config) {
+        this.cachingConfig = config ?? { enabled: true };
+        return this;
+      }
+      /**
+       * Explicitly disable context caching.
+       *
+       * For Anthropic, this removes `cache_control` markers from requests,
+       * opting out of prompt caching entirely.
+       *
+       * @returns This builder for chaining
+       *
+       * @example
+       * ```typescript
+       * // Disable Anthropic's automatic caching
+       * LLMist.createAgent()
+       *   .withModel("sonnet")
+       *   .withoutCaching()
+       *   .ask("...");
+       * ```
+       */
+      withoutCaching() {
+        this.cachingConfig = { enabled: false };
+        return this;
+      }
       /**
        * Set subagent configuration overrides.
        *
@@ -11091,6 +11614,8 @@ ${endPrefix}`
           retryConfig: this.retryConfig,
           rateLimitConfig: this.rateLimitConfig,
           signal: this.signal,
+          reasoning: this.reasoningConfig,
+          caching: this.cachingConfig,
           subagentConfig: this.subagentConfig,
           // Tree context for shared tree model (subagents share parent's tree)
           parentTree: this.parentContext?.tree,
@@ -11278,6 +11803,8 @@ ${endPrefix}`
           retryConfig: this.retryConfig,
           rateLimitConfig: this.rateLimitConfig,
           signal: this.signal,
+          reasoning: this.reasoningConfig,
+          caching: this.cachingConfig,
           subagentConfig: this.subagentConfig,
           // Tree context for shared tree model (subagents share parent's tree)
           parentTree: this.parentContext?.tree,
@@ -11732,6 +12259,7 @@ var init_cost_reporting_client = __esm({
         let outputTokens = 0;
         let cachedInputTokens = 0;
         let cacheCreationInputTokens = 0;
+        let reasoningTokens = 0;
         const messages = [
           ...options?.systemPrompt ? [{ role: "system", content: options.systemPrompt }] : [],
           { role: "user", content: prompt }
@@ -11748,6 +12276,7 @@ var init_cost_reporting_client = __esm({
             outputTokens = chunk.usage.outputTokens;
             cachedInputTokens = chunk.usage.cachedInputTokens ?? 0;
             cacheCreationInputTokens = chunk.usage.cacheCreationInputTokens ?? 0;
+            reasoningTokens = chunk.usage.reasoningTokens ?? 0;
           }
         }
         this.reportCostFromUsage(
@@ -11755,7 +12284,8 @@ var init_cost_reporting_client = __esm({
           inputTokens,
           outputTokens,
           cachedInputTokens,
-          cacheCreationInputTokens
+          cacheCreationInputTokens,
+          reasoningTokens
         );
         return result;
       }
@@ -11774,6 +12304,7 @@ var init_cost_reporting_client = __esm({
         let outputTokens = 0;
         let cachedInputTokens = 0;
         let cacheCreationInputTokens = 0;
+        let reasoningTokens = 0;
         const messages = [
           ...options?.systemPrompt ? [{ role: "system", content: options.systemPrompt }] : [],
           { role: "user", content: prompt }
@@ -11793,6 +12324,7 @@ var init_cost_reporting_client = __esm({
               outputTokens = chunk.usage.outputTokens;
               cachedInputTokens = chunk.usage.cachedInputTokens ?? 0;
               cacheCreationInputTokens = chunk.usage.cacheCreationInputTokens ?? 0;
+              reasoningTokens = chunk.usage.reasoningTokens ?? 0;
             }
           }
         } finally {
@@ -11801,7 +12333,8 @@ var init_cost_reporting_client = __esm({
             inputTokens,
             outputTokens,
             cachedInputTokens,
-            cacheCreationInputTokens
+            cacheCreationInputTokens,
+            reasoningTokens
           );
         }
       }
@@ -11828,6 +12361,7 @@ var init_cost_reporting_client = __esm({
           let outputTokens = 0;
           let cachedInputTokens = 0;
           let cacheCreationInputTokens = 0;
+          let reasoningTokens = 0;
           try {
             for await (const chunk of innerStream) {
               if (chunk.usage) {
@@ -11835,6 +12369,7 @@ var init_cost_reporting_client = __esm({
                 outputTokens = chunk.usage.outputTokens;
                 cachedInputTokens = chunk.usage.cachedInputTokens ?? 0;
                 cacheCreationInputTokens = chunk.usage.cacheCreationInputTokens ?? 0;
+                reasoningTokens = chunk.usage.reasoningTokens ?? 0;
               }
               yield chunk;
             }
@@ -11845,7 +12380,8 @@ var init_cost_reporting_client = __esm({
                 inputTokens,
                 outputTokens,
                 cachedInputTokens,
-                cacheCreationInputTokens
+                cacheCreationInputTokens,
+                reasoningTokens
               );
             }
           }
@@ -11855,14 +12391,15 @@ var init_cost_reporting_client = __esm({
       /**
        * Calculates and reports cost from token usage.
        */
-      reportCostFromUsage(model, inputTokens, outputTokens, cachedInputTokens = 0, cacheCreationInputTokens = 0) {
+      reportCostFromUsage(model, inputTokens, outputTokens, cachedInputTokens = 0, cacheCreationInputTokens = 0, reasoningTokens = 0) {
         if (inputTokens === 0 && outputTokens === 0) return;
         const estimate = this.client.modelRegistry.estimateCost(
           model,
           inputTokens,
           outputTokens,
           cachedInputTokens,
-          cacheCreationInputTokens
+          cacheCreationInputTokens,
+          reasoningTokens
         );
         if (estimate && estimate.totalCost > 0) {
           this.reportCost(estimate.totalCost);
@@ -12954,9 +13491,18 @@ var init_stream_processor = __esm({
         let usage;
         let didExecuteGadgets = false;
         let shouldBreakLoop = false;
+        let thinkingContent = "";
         for await (const chunk of stream2) {
           if (chunk.finishReason) finishReason = chunk.finishReason;
           if (chunk.usage) usage = chunk.usage;
+          if (chunk.thinking?.content) {
+            thinkingContent += chunk.thinking.content;
+            yield {
+              type: "thinking",
+              content: chunk.thinking.content,
+              thinkingType: chunk.thinking.type
+            };
+          }
           let processedChunk = "";
           if (chunk.text) {
             processedChunk = chunk.text;
@@ -13070,7 +13616,8 @@ var init_stream_processor = __esm({
           finishReason,
           usage,
           rawResponse: this.responseText,
-          finalMessage
+          finalMessage,
+          thinkingContent: thinkingContent || void 0
         };
         yield completionEvent;
       }
@@ -13872,6 +14419,8 @@ var init_agent = __esm({
       mediaStore;
       // Cancellation
       signal;
+      reasoning;
+      caching;
       // Retry configuration
       retryConfig;
       // Rate limit tracker for proactive throttling
@@ -13963,6 +14512,8 @@ var init_agent = __esm({
           );
         }
         this.signal = options.signal;
+        this.reasoning = options.reasoning;
+        this.caching = options.caching;
         this.retryConfig = options.sharedRetryConfig ?? resolveRetryConfig(options.retryConfig);
         if (options.sharedRateLimitTracker) {
           this.rateLimitTracker = options.sharedRateLimitTracker;
@@ -14365,6 +14916,7 @@ var init_agent = __esm({
                     usage: result.usage,
                     rawResponse: result.rawResponse,
                     finalMessage: result.finalMessage,
+                    thinkingContent: result.thinkingContent,
                     logger: this.logger,
                     subagentContext
                   };
@@ -14665,17 +15217,49 @@ var init_agent = __esm({
         });
         return { type: "compaction", event: compactionEvent };
       }
+      /**
+       * Resolve reasoning configuration with auto-enable logic.
+       *
+       * Priority: explicit config > auto-enable for reasoning models > undefined
+       * When a model has `features.reasoning: true` and no explicit config is set,
+       * reasoning is automatically enabled at "medium" effort.
+       */
+      resolveReasoningConfig(spec) {
+        if (this.reasoning !== void 0) return this.reasoning;
+        if (spec?.features?.reasoning) {
+          return { enabled: true, effort: "medium" };
+        }
+        return void 0;
+      }
+      /**
+       * Resolve caching configuration.
+       *
+       * Priority: explicit config > default enabled (preserves Anthropic's existing behavior)
+       * Default is `{ enabled: true }` which means:
+       * - Anthropic: `cache_control` markers are added (existing behavior preserved)
+       * - Gemini: Cache manager is consulted but skips if no explicit config was set
+       * - OpenAI: No-op (server-side automatic)
+       */
+      resolveCachingConfig() {
+        if (this.caching !== void 0) return this.caching;
+        return { enabled: true };
+      }
       /**
        * Prepare LLM call options, create tree node, and process beforeLLMCall controller.
        * @returns options, node ID, and optional skipWithSynthetic response if controller wants to skip
        */
       async prepareLLMCall(iteration) {
+        const spec = this.client.modelRegistry?.getModelSpec?.(this.model);
+        const reasoning = this.resolveReasoningConfig(spec);
+        const caching = this.resolveCachingConfig();
         let llmOptions = {
           model: this.model,
           messages: this.conversation.getMessages(),
           temperature: this.temperature,
           maxTokens: this.defaultMaxTokens,
-          signal: this.signal
+          signal: this.signal,
+          reasoning,
+          caching
         };
         const llmNode = this.tree.addLLMCall({
           iteration,
@@ -14745,13 +15329,15 @@ var init_agent = __esm({
           inputTokens,
           outputTokens,
           result.usage?.cachedInputTokens ?? 0,
-          result.usage?.cacheCreationInputTokens ?? 0
+          result.usage?.cacheCreationInputTokens ?? 0,
+          result.usage?.reasoningTokens ?? 0
         )?.totalCost;
         this.tree.completeLLMCall(nodeId, {
           response: result.rawResponse,
           usage: result.usage,
           finishReason: result.finishReason,
-          cost: llmCost
+          cost: llmCost,
+          thinkingContent: result.thinkingContent
         });
       }
       /**