npm - llmist - Versions diffs - 15.13.0 → 15.14.0 - Mend

llmist 15.13.0 → 15.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/dist/index.cjs CHANGED Viewed

@@ -5432,12 +5432,13 @@ var init_anthropic = __esm({
         );
       }
       buildApiRequest(options, descriptor, spec, messages) {
+        const cachingEnabled = options.caching?.enabled !== false;
         const systemMessages = messages.filter((message) => message.role === "system");
         const system = systemMessages.length > 0 ? systemMessages.map((m, index) => ({
           type: "text",
           text: extractMessageText(m.content),
-          // Add cache_control to the LAST system message block
-          ...index === systemMessages.length - 1 ? { cache_control: { type: "ephemeral" } } : {}
+          // Add cache_control to the LAST system message block (only when caching is enabled)
+          ...cachingEnabled && index === systemMessages.length - 1 ? { cache_control: { type: "ephemeral" } } : {}
         })) : void 0;
         const nonSystemMessages = messages.filter(
           (message) => message.role !== "system"
@@ -5450,7 +5451,7 @@ var init_anthropic = __esm({
           role: message.role,
           content: this.convertToAnthropicContent(
             message.content,
-            message.role === "user" && index === lastUserIndex
+            cachingEnabled && message.role === "user" && index === lastUserIndex
           )
         }));
         const defaultMaxTokens = spec?.maxOutputTokens ?? ANTHROPIC_DEFAULT_MAX_OUTPUT_TOKENS;
@@ -5667,6 +5668,177 @@ var init_anthropic = __esm({
   }
 });
+// src/providers/gemini-cache-manager.ts
+var import_node_crypto3, GeminiCacheManager;
+var init_gemini_cache_manager = __esm({
+  "src/providers/gemini-cache-manager.ts"() {
+    "use strict";
+    import_node_crypto3 = require("crypto");
+    GeminiCacheManager = class {
+      constructor(client) {
+        this.client = client;
+      }
+      activeCache = null; // at most one cache is tracked: { name, model, contentHash, expireTime }
+      /**
+       * Get or create a cache for the given content.
+       *
+       * Returns `{ cacheName, cachedContentCount }` when a cache was created or
+       * reused, or `null` when caching was skipped: `config.enabled` is falsy,
+       * nothing is cacheable for the scope, the estimated token count is below
+       * the threshold, the create response has no `name`, or `caches.create`
+       * threw (the error is logged with `console.warn`, not rethrown).
+       *
+       * Defaults applied here: scope "conversation", ttl "3600s",
+       * minTokenThreshold 32768.
+       *
+       * @param model - Gemini model name (e.g., "gemini-2.5-flash")
+       * @param allContents - All Gemini-formatted contents (system + conversation)
+       * @param config - Caching configuration from the user
+       * @param lastUserMessageIndex - Index of the last user message; outside "system"
+       *   scope only the contents before this index are cached
+       * @returns `{ cacheName, cachedContentCount }` or `null`
+       */
+      async getOrCreateCache(model, allContents, config, lastUserMessageIndex) {
+        if (!config.enabled) return null;
+        const scope = config.scope ?? "conversation";
+        const ttl = config.ttl ?? "3600s";
+        const minTokenThreshold = config.minTokenThreshold ?? 32768;
+        const cacheableContents = this.selectCacheableContents(
+          allContents,
+          scope,
+          lastUserMessageIndex
+        );
+        if (cacheableContents.length === 0) return null;
+        const estimatedTokens = this.estimateTokenCount(cacheableContents);
+        if (estimatedTokens < minTokenThreshold) return null;
+        const contentHash = this.computeContentHash(cacheableContents, model);
+        if (this.activeCache && this.canReuseCache(this.activeCache, model, contentHash)) {
+          return {
+            cacheName: this.activeCache.name,
+            cachedContentCount: cacheableContents.length
+          };
+        }
+        try {
+          await this.cleanupActiveCache(); // drop the previous cache before creating its replacement
+          const response = await this.client.caches.create({
+            model,
+            config: {
+              contents: cacheableContents,
+              ttl,
+              displayName: `llmist-${scope}-${Date.now()}`
+            }
+          });
+          if (!response.name) {
+            return null;
+          }
+          this.activeCache = {
+            name: response.name,
+            model,
+            contentHash,
+            expireTime: response.expireTime ?? "" // "" makes canReuseCache skip its expiry check
+          };
+          return {
+            cacheName: response.name,
+            cachedContentCount: cacheableContents.length
+          };
+        } catch (error) {
+          console.warn("Gemini cache creation failed, continuing without cache:", error);
+          return null;
+        }
+      }
+      /**
+       * Clean up the active cache (best-effort).
+       * Caches auto-expire via TTL, so failure is non-critical.
+       * Delegates to `cleanupActiveCache`, which ignores deletion errors.
+       */
+      async dispose() {
+        await this.cleanupActiveCache();
+      }
+      /**
+       * Select which contents to cache based on scope.
+       *
+       * - "system": the leading run of contents that look system-derived, i.e.
+       *   user entries immediately followed by a model entry whose single text
+       *   part is exactly "Understood."; scanning stops at the first entry that
+       *   does not fit, and everything up to the last such model reply is returned
+       * - any other scope ("conversation"): everything before `lastUserMessageIndex`,
+       *   or an empty array when that index is <= 0
+       */
+      selectCacheableContents(allContents, scope, lastUserMessageIndex) {
+        if (scope === "system") {
+          let systemEndIndex = 0;
+          for (let i = 0; i < allContents.length; i++) {
+            const content = allContents[i];
+            if (content.role === "model" && content.parts.length === 1 && "text" in content.parts[0] && content.parts[0].text === "Understood.") {
+              systemEndIndex = i + 1;
+            } else if (content.role === "user") {
+              const next = allContents[i + 1];
+              if (next && next.role === "model" && next.parts.length === 1 && "text" in next.parts[0] && next.parts[0].text === "Understood.") {
+                continue;
+              }
+              break;
+            } else {
+              break;
+            }
+          }
+          return allContents.slice(0, systemEndIndex);
+        }
+        if (lastUserMessageIndex <= 0) return [];
+        return allContents.slice(0, lastUserMessageIndex);
+      }
+      /**
+       * Estimate token count from contents using character-based heuristic.
+       * Uses ~4 characters per token (conservative estimate for English text).
+       * Each `inlineData` part is counted as 258 * 4 characters (258 tokens after
+       * the division); parts with neither `text` nor `inlineData` add nothing.
+       */
+      estimateTokenCount(contents) {
+        let totalChars = 0;
+        for (const content of contents) {
+          for (const part of content.parts) {
+            if ("text" in part) {
+              totalChars += part.text.length;
+            } else if ("inlineData" in part) {
+              totalChars += 258 * 4;
+            }
+          }
+        }
+        return Math.ceil(totalChars / 4);
+      }
+      /**
+       * Compute a stable hash of the cacheable contents for change detection.
+       *
+       * SHA-256 (hex) over the model name, then each content's role and, per part,
+       * either its text or its inlineData mimeType + data.
+       * NOTE(review): values are fed to the hash back-to-back with no separators,
+       * and parts that have neither `text` nor `inlineData` are not hashed, so
+       * differences confined to those are not detected - confirm this is acceptable.
+       */
+      computeContentHash(contents, model) {
+        const hash = (0, import_node_crypto3.createHash)("sha256");
+        hash.update(model);
+        for (const content of contents) {
+          hash.update(content.role);
+          for (const part of content.parts) {
+            if ("text" in part) {
+              hash.update(part.text);
+            } else if ("inlineData" in part) {
+              hash.update(part.inlineData.mimeType);
+              hash.update(part.inlineData.data);
+            }
+          }
+        }
+        return hash.digest("hex");
+      }
+      /**
+       * Check if an existing cache can be reused.
+       *
+       * Requires the same model and content hash. When `expireTime` is non-empty the
+       * cache must also have at least 6e4 ms (60 seconds) left before it expires.
+       * NOTE(review): an unparsable `expireTime` yields NaN, the `<` comparison is
+       * then false, and the cache is treated as reusable - verify this is intended.
+       */
+      canReuseCache(cache, model, contentHash) {
+        if (cache.model !== model) return false;
+        if (cache.contentHash !== contentHash) return false;
+        if (cache.expireTime) {
+          const expiresAt = new Date(cache.expireTime).getTime();
+          const now = Date.now();
+          if (expiresAt - now < 6e4) return false;
+        }
+        return true;
+      }
+      /**
+       * Delete the active cache (best-effort).
+       * Errors from `caches.delete` are ignored and `activeCache` is reset to
+       * `null` either way; does nothing when no cache is tracked.
+       */
+      async cleanupActiveCache() {
+        if (!this.activeCache) return;
+        try {
+          await this.client.caches.delete({ name: this.activeCache.name });
+        } catch { // deletion errors are deliberately ignored
+        }
+        this.activeCache = null;
+      }
+    };
+  }
+});
 // src/providers/gemini-image-models.ts
 function getGeminiImageModelSpec(modelId) {
   return geminiImageModels.find((m) => m.modelId === modelId);
@@ -6236,6 +6408,7 @@ var init_gemini = __esm({
     init_messages();
     init_base_provider();
     init_constants2();
+    init_gemini_cache_manager();
     init_gemini_image_models();
     init_gemini_models();
     init_gemini_speech_models();
@@ -6261,12 +6434,62 @@ var init_gemini = __esm({
     };
     GeminiGenerativeProvider = class extends BaseProviderAdapter {
       providerId = "gemini";
+      cacheManager;
+      constructor(client) { // the same client is passed to the base adapter and to the cache manager
+        super(client);
+        this.cacheManager = new GeminiCacheManager(client);
+      }
       /** Returns true when the descriptor's provider equals this adapter's providerId ("gemini"). */
       supports(descriptor) {
         return descriptor.provider === this.providerId;
       }
       /** Returns the GEMINI_MODELS value (defined outside this view). */
       getModelSpecs() {
         return GEMINI_MODELS;
       }
+      /**
+       * Override the base stream method to inject cache logic.
+       *
+       * Messages are always prepared and converted to Gemini contents first.
+       * When caching is enabled (`options.caching?.enabled` is truthy), we:
+       * 1. Locate the last "user" entry by scanning the contents from the end
+       *    (-1 is passed on when there is none)
+       * 2. Attempt to get/create a cache for the cacheable prefix
+       * 3. If a cache is available, strip cached contents from the request and add cachedContent ref
+       * 4. Otherwise, proceed normally (graceful degradation)
+       *
+       * @param options - Call options; `messages`, `caching` and `signal` are read here
+       * @param descriptor - Model descriptor; `descriptor.name` is used as the cache model
+       * @param spec - Model spec, forwarded to the request builder
+       * @returns Async generator that yields the normalized provider stream
+       */
+      async *stream(options, descriptor, spec) {
+        const preparedMessages = this.prepareMessages(options.messages);
+        const contents = this.convertMessagesToContents(preparedMessages);
+        const cachingConfig = options.caching;
+        let cacheName = null;
+        let cachedContentCount = 0;
+        if (cachingConfig?.enabled) {
+          let lastUserIndex = -1;
+          for (let i = contents.length - 1; i >= 0; i--) {
+            if (contents[i].role === "user") {
+              lastUserIndex = i;
+              break;
+            }
+          }
+          const cacheResult = await this.cacheManager.getOrCreateCache(
+            descriptor.name,
+            contents,
+            cachingConfig,
+            lastUserIndex
+          );
+          if (cacheResult) {
+            cacheName = cacheResult.cacheName;
+            cachedContentCount = cacheResult.cachedContentCount;
+          }
+        }
+        const payload = this.buildApiRequestFromContents(
+          options,
+          descriptor,
+          spec,
+          contents,
+          cacheName,
+          cachedContentCount
+        );
+        const rawStream = await this.executeStreamRequest(payload, options.signal);
+        yield* this.normalizeProviderStream(rawStream);
+      }
       // =========================================================================
       // Image Generation
       // =========================================================================
@@ -6402,6 +6625,17 @@ var init_gemini = __esm({
       }
       /**
        * Build the API request from `messages`.
        * Converts them to Gemini contents and delegates to
        * buildApiRequestFromContents with no cache (null cache name, count 0).
        */
       buildApiRequest(options, descriptor, _spec, messages) {
         const contents = this.convertMessagesToContents(messages);
+        return this.buildApiRequestFromContents(options, descriptor, _spec, contents, null, 0);
+      }
+      /**
+       * Build API request from pre-converted Gemini contents.
+       *
+       * When a cache name is provided, the cached prefix is stripped from contents
+       * and the cache reference is added to the config. This tells Gemini to use
+       * the pre-computed KV pairs instead of reprocessing the cached content.
+       */
+      buildApiRequestFromContents(options, descriptor, _spec, contents, cacheName, cachedContentCount) {
+        const effectiveContents = cacheName ? contents.slice(cachedContentCount) : contents;
         const generationConfig = this.buildGenerationConfig(options);
         const thinkingConfig = resolveGeminiThinkingConfig(options.reasoning, descriptor.name);
         const config = {
@@ -6415,11 +6649,13 @@ var init_gemini = __esm({
             }
           },
           ...thinkingConfig ?? {},
+          // Add cache reference if available
+          ...cacheName ? { cachedContent: cacheName } : {},
           ...options.extra
         };
         return {
           model: descriptor.name,
-          contents,
+          contents: effectiveContents,
           config
         };
       }
@@ -10397,6 +10633,7 @@ var init_builder = __esm({
       // When a gadget calls withParentContext(ctx), this config is shared
       sharedRetryConfig;
       reasoningConfig;
+      cachingConfig;
       constructor(client) {
         this.client = client;
       }
@@ -11036,6 +11273,62 @@ var init_builder = __esm({
         this.reasoningConfig = { enabled: false };
         return this;
       }
+      /**
+       * Enable context caching for supported providers.
+       *
+       * Can be called with:
+       * - No args: enables caching with defaults (`{ enabled: true }`)
+       * - A full config object: `withCaching({ enabled: true, scope: "system", ttl: "7200s" })`
+       *
+       * The given object is stored as-is; it is not merged with any defaults here.
+       * NOTE(review): the Gemini provider's `stream` tests `caching.enabled` for
+       * truthiness, so a custom config without `enabled: true` would not enable
+       * Gemini caching - confirm against the options plumbing.
+       *
+       * Provider behavior:
+       * - **Anthropic**: Caching is always-on by default via `cache_control` markers.
+       *   Calling `withCaching()` explicitly is a no-op (it's already enabled).
+       * - **Gemini**: Creates an explicit cache via `caches.create()` for the configured scope.
+       * - **OpenAI**: Server-side automatic caching (no-op).
+       *
+       * @param config - Optional caching configuration
+       * @returns This builder for chaining
+       *
+       * @example
+       * ```typescript
+       * // Simple — enable with defaults
+       * LLMist.createAgent()
+       *   .withModel("gemini:gemini-2.5-flash")
+       *   .withCaching()
+       *   .ask("Analyze this large codebase...");
+       *
+       * // Cache only system prompt with longer TTL
+       * LLMist.createAgent()
+       *   .withModel("gemini:gemini-2.5-pro")
+       *   .withCaching({ enabled: true, scope: "system", ttl: "7200s" })
+       *   .ask("...");
+       * ```
+       */
+      withCaching(config) {
+        this.cachingConfig = config ?? { enabled: true };
+        return this;
+      }
+      /**
+       * Explicitly disable context caching.
+       *
+       * Sets the caching config to `{ enabled: false }`.
+       * For Anthropic, this removes `cache_control` markers from requests,
+       * opting out of prompt caching entirely.
+       *
+       * @returns This builder for chaining
+       *
+       * @example
+       * ```typescript
+       * // Disable Anthropic's automatic caching
+       * LLMist.createAgent()
+       *   .withModel("sonnet")
+       *   .withoutCaching()
+       *   .ask("...");
+       * ```
+       */
+      withoutCaching() {
+        this.cachingConfig = { enabled: false };
+        return this;
+      }
       /**
        * Set subagent configuration overrides.
        *
@@ -11322,6 +11615,7 @@ ${endPrefix}`
           rateLimitConfig: this.rateLimitConfig,
           signal: this.signal,
           reasoning: this.reasoningConfig,
+          caching: this.cachingConfig,
           subagentConfig: this.subagentConfig,
           // Tree context for shared tree model (subagents share parent's tree)
           parentTree: this.parentContext?.tree,
@@ -11510,6 +11804,7 @@ ${endPrefix}`
           rateLimitConfig: this.rateLimitConfig,
           signal: this.signal,
           reasoning: this.reasoningConfig,
+          caching: this.cachingConfig,
           subagentConfig: this.subagentConfig,
           // Tree context for shared tree model (subagents share parent's tree)
           parentTree: this.parentContext?.tree,
@@ -14125,6 +14420,7 @@ var init_agent = __esm({
       // Cancellation
       signal;
       reasoning;
+      caching;
       // Retry configuration
       retryConfig;
       // Rate limit tracker for proactive throttling
@@ -14217,6 +14513,7 @@ var init_agent = __esm({
         }
         this.signal = options.signal;
         this.reasoning = options.reasoning;
+        this.caching = options.caching;
         this.retryConfig = options.sharedRetryConfig ?? resolveRetryConfig(options.retryConfig);
         if (options.sharedRateLimitTracker) {
           this.rateLimitTracker = options.sharedRateLimitTracker;
@@ -14934,6 +15231,19 @@ var init_agent = __esm({
         }
         return void 0;
       }
+      /**
+       * Resolve caching configuration.
+       *
+       * Priority: explicit config > default enabled (preserves Anthropic's existing behavior)
+       * Default is `{ enabled: true }` which means:
+       * - Anthropic: `cache_control` markers are added (existing behavior preserved)
+       * - Gemini: NOTE(review) the provider's `stream` only checks that `enabled` is
+       *   truthy, so with this default the cache manager appears to be consulted even
+       *   when no explicit config was set; it then skips content whose estimated size
+       *   is below `minTokenThreshold`. Confirm this matches the intended default.
+       * - OpenAI: No-op (server-side automatic)
+       *
+       * @returns `this.caching` when it is defined, otherwise `{ enabled: true }`
+       */
+      resolveCachingConfig() {
+        if (this.caching !== void 0) return this.caching;
+        return { enabled: true };
+      }
       /**
        * Prepare LLM call options, create tree node, and process beforeLLMCall controller.
        * @returns options, node ID, and optional skipWithSynthetic response if controller wants to skip
@@ -14941,13 +15251,15 @@ var init_agent = __esm({
       async prepareLLMCall(iteration) {
         const spec = this.client.modelRegistry?.getModelSpec?.(this.model);
         const reasoning = this.resolveReasoningConfig(spec);
+        const caching = this.resolveCachingConfig();
         let llmOptions = {
           model: this.model,
           messages: this.conversation.getMessages(),
           temperature: this.temperature,
           maxTokens: this.defaultMaxTokens,
           signal: this.signal,
-          reasoning
+          reasoning,
+          caching
         };
         const llmNode = this.tree.addLLMCall({
           iteration,