llmist 16.2.4 → 17.1.0

This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
package/dist/index.cjs CHANGED
@@ -818,7 +818,7 @@ var init_constants = __esm({
  GADGET_ARG_PREFIX = "!!!ARG:";
  DEFAULT_GADGET_OUTPUT_LIMIT = true;
  DEFAULT_GADGET_OUTPUT_LIMIT_PERCENT = 15;
- CHARS_PER_TOKEN = 4;
+ CHARS_PER_TOKEN = 2;
  FALLBACK_CONTEXT_WINDOW = 128e3;
  }
  });
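Why this matters downstream: `CHARS_PER_TOKEN` is the divisor in the character-based fallback estimate (`Math.ceil(totalChars / CHARS_PER_TOKEN)`, used later in this diff). A minimal sketch of what halving the divisor changes, with illustrative numbers:

```typescript
// Sketch: the fallback estimate used elsewhere in this diff.
// Dense JSON/code content often tokenizes near 2 chars per token,
// so the old divisor of 4 could badly underestimate usage.
const estimateTokens = (totalChars: number, charsPerToken: number): number =>
  Math.ceil(totalChars / charsPerToken);

const chars = 10_000;
console.log(estimateTokens(chars, 4)); // 2500 (old, optimistic)
console.log(estimateTokens(chars, 2)); // 5000 (new, conservative)
```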
@@ -2839,6 +2839,7 @@ var CompactionManager;
  var init_manager = __esm({
  "src/agent/compaction/manager.ts"() {
  "use strict";
+ init_logger();
  init_config();
  init_strategies();
  CompactionManager = class {
@@ -2846,15 +2847,19 @@ var init_manager = __esm({
  model;
  config;
  strategy;
+ logger;
  modelLimits;
+ hasWarnedModelNotFound = false;
+ hasWarnedNoTokenCounting = false;
  // Statistics
  totalCompactions = 0;
  totalTokensSaved = 0;
  lastTokenCount = 0;
- constructor(client, model, config = {}) {
+ constructor(client, model, config = {}, logger2) {
  this.client = client;
  this.model = model;
  this.config = resolveCompactionConfig(config);
+ this.logger = logger2 ?? createLogger({ name: "llmist:compaction" });
  if (typeof config.strategy === "object" && "compact" in config.strategy) {
  this.strategy = config.strategy;
  } else {
@@ -2872,13 +2877,16 @@ var init_manager = __esm({
  if (!this.config.enabled) {
  return null;
  }
- if (!this.modelLimits) {
- this.modelLimits = this.client.modelRegistry.getModelLimits(this.model);
- if (!this.modelLimits) {
- return null;
- }
+ if (!this.resolveModelLimits()) {
+ return null;
  }
  if (!this.client.countTokens) {
+ if (!this.hasWarnedNoTokenCounting) {
+ this.hasWarnedNoTokenCounting = true;
+ this.logger.warn("Compaction skipped: client does not support token counting", {
+ model: this.model
+ });
+ }
  return null;
  }
  const messages = conversation.getMessages();
@@ -2909,11 +2917,8 @@ var init_manager = __esm({
  * @returns CompactionEvent with compaction details
  */
  async compact(conversation, iteration, precomputed) {
- if (!this.modelLimits) {
- this.modelLimits = this.client.modelRegistry.getModelLimits(this.model);
- if (!this.modelLimits) {
- return null;
- }
+ if (!this.resolveModelLimits()) {
+ return null;
  }
  const historyMessages = precomputed?.historyMessages ?? conversation.getHistoryMessages();
  const baseMessages = precomputed?.baseMessages ?? conversation.getBaseMessages();
@@ -2955,6 +2960,42 @@ var init_manager = __esm({
  }
  return event;
  }
+ /**
+ * Feed API-reported input token count for reactive threshold checking.
+ * Call this after each LLM response with the actual inputTokens from usage.
+ */
+ updateUsage(inputTokens) {
+ this.lastTokenCount = inputTokens;
+ }
+ /**
+ * Check if compaction should trigger based on API-reported usage.
+ * Unlike checkAndCompact() which uses estimated token counts,
+ * this uses the ground-truth token count from the last LLM response.
+ */
+ shouldCompactFromUsage() {
+ if (!this.config.enabled) return false;
+ if (!this.resolveModelLimits()) return false;
+ const usagePercent = this.lastTokenCount / this.modelLimits.contextWindow * 100;
+ return usagePercent >= this.config.triggerThresholdPercent;
+ }
+ /**
+ * Resolve and cache model limits from registry. Warns once if not found.
+ * @returns true if limits are available, false otherwise
+ */
+ resolveModelLimits() {
+ if (this.modelLimits) return true;
+ this.modelLimits = this.client.modelRegistry.getModelLimits(this.model);
+ if (!this.modelLimits) {
+ if (!this.hasWarnedModelNotFound) {
+ this.hasWarnedModelNotFound = true;
+ this.logger.warn("Compaction skipped: model not found in registry", {
+ model: this.model
+ });
+ }
+ return false;
+ }
+ return true;
+ }
  /**
  * Get compaction statistics.
  */
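Taken together, the new methods give callers a reactive path: feed the API-reported input token count after each response, then check whether the usage threshold was crossed. A minimal usage sketch, assuming a constructed `CompactionManager` and a `conversation`/`iteration` pair like the agent loop near the end of this diff provides (the `usage` shape with `inputTokens` mirrors that loop):

```typescript
// Sketch of the reactive compaction flow added in this version.
// `manager`, `conversation`, and `iteration` are assumed to come from
// the surrounding agent, as in the loop change later in this diff.
async function afterResponse(
  manager: CompactionManager,
  conversation: Conversation,
  iteration: number,
  usage?: { inputTokens?: number },
): Promise<void> {
  if (!usage?.inputTokens) return;
  manager.updateUsage(usage.inputTokens); // ground-truth count from the API
  if (manager.shouldCompactFromUsage()) { // usage% >= triggerThresholdPercent
    await manager.compact(conversation, iteration); // CompactionEvent or null
  }
}
```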
@@ -7358,7 +7399,7 @@ var init_constants2 = __esm({
  "src/providers/constants.ts"() {
  "use strict";
  ANTHROPIC_DEFAULT_MAX_OUTPUT_TOKENS = 4096;
- FALLBACK_CHARS_PER_TOKEN = 4;
+ FALLBACK_CHARS_PER_TOKEN = 2;
  OPENAI_MESSAGE_OVERHEAD_TOKENS = 4;
  OPENAI_REPLY_PRIMING_TOKENS = 2;
  OPENAI_NAME_FIELD_OVERHEAD_TOKENS = 1;
@@ -9712,11 +9753,12 @@ var init_huggingface_models = __esm({
  });

  // src/providers/openai-compatible-provider.ts
- var import_openai, ROLE_MAP, OpenAICompatibleProvider;
+ var import_openai, import_tiktoken, ROLE_MAP, OpenAICompatibleProvider;
  var init_openai_compatible_provider = __esm({
  "src/providers/openai-compatible-provider.ts"() {
  "use strict";
  import_openai = __toESM(require("openai"), 1);
+ import_tiktoken = require("tiktoken");
  init_messages();
  init_base_provider();
  init_constants2();
@@ -9917,11 +9959,38 @@ var init_openai_compatible_provider = __esm({
  }
  }
  /**
- * Count tokens using character-based fallback estimation.
- * Most meta-providers don't have a native token counting API.
+ * Count tokens using tiktoken o200k_base encoding.
+ *
+ * While o200k_base isn't model-exact for non-OpenAI models routed through
+ * meta-providers like OpenRouter, BPE tokenizers with 200K vocab produce
+ * counts within 10-20% of true values — far better than the character-based
+ * fallback which can be off by 250% for JSON/code-heavy content.
+ *
+ * Falls back to character-based estimation if tiktoken fails.
  */
  async countTokens(messages, descriptor, _spec) {
+ if (!messages || messages.length === 0) return 0;
  try {
+ const encoding = (0, import_tiktoken.get_encoding)("o200k_base");
+ try {
+ let tokenCount = 0;
+ for (const msg of messages) {
+ const parts = normalizeMessageContent(msg.content);
+ for (const part of parts) {
+ if (part.type === "text") {
+ tokenCount += encoding.encode(part.text).length;
+ }
+ }
+ }
+ return tokenCount;
+ } finally {
+ encoding.free();
+ }
+ } catch (error) {
+ console.warn(
+ `Token counting with tiktoken failed for ${descriptor.name}, using fallback estimation:`,
+ error
+ );
  let totalChars = 0;
  for (const msg of messages) {
  const parts = normalizeMessageContent(msg.content);
@@ -9932,9 +10001,6 @@ var init_openai_compatible_provider = __esm({
  }
  }
  return Math.ceil(totalChars / FALLBACK_CHARS_PER_TOKEN);
- } catch (error) {
- console.warn(`Token counting failed for ${descriptor.name}, using zero estimate:`, error);
- return 0;
  }
  }
  };
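The encoder returned by `get_encoding` is WASM-backed and must be released, hence the inner `try`/`finally` with `encoding.free()`. The same tiktoken calls in a self-contained sketch (plain strings only, no message normalization):

```typescript
import { get_encoding } from "tiktoken";

// Count tokens with o200k_base, mirroring the provider's approach above.
function countTextTokens(texts: string[]): number {
  const encoding = get_encoding("o200k_base");
  try {
    return texts.reduce((sum, text) => sum + encoding.encode(text).length, 0);
  } finally {
    encoding.free(); // release the WASM-backed encoder
  }
}

console.log(countTextTokens(['{"key": "value"}', "plain prose"]));
```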
@@ -10885,12 +10951,12 @@ function sanitizeExtra(extra, allowTemperature) {
  function createOpenAIProviderFromEnv() {
  return createProviderFromEnv("OPENAI_API_KEY", import_openai3.default, OpenAIChatProvider);
  }
- var import_openai3, import_tiktoken, ROLE_MAP2, OPENAI_EFFORT_MAP, OpenAIChatProvider;
+ var import_openai3, import_tiktoken2, ROLE_MAP2, OPENAI_EFFORT_MAP, OpenAIChatProvider;
  var init_openai = __esm({
  "src/providers/openai.ts"() {
  "use strict";
  import_openai3 = __toESM(require("openai"), 1);
- import_tiktoken = require("tiktoken");
+ import_tiktoken2 = require("tiktoken");
  init_messages();
  init_base_provider();
  init_constants2();
@@ -11149,9 +11215,9 @@ var init_openai = __esm({
  const modelName = descriptor.name;
  let encoding;
  try {
- encoding = (0, import_tiktoken.encoding_for_model)(modelName);
+ encoding = (0, import_tiktoken2.encoding_for_model)(modelName);
  } catch {
- encoding = (0, import_tiktoken.encoding_for_model)("gpt-4o");
+ encoding = (0, import_tiktoken2.encoding_for_model)("gpt-4o");
  }
  try {
  let tokenCount = 0;
@@ -11443,6 +11509,103 @@ var init_openrouter_models = __esm({
  }
  },
  // ============================================================
+ // Google Gemini 3.1 Models (via OpenRouter)
+ // ============================================================
+ {
+ provider: "openrouter",
+ modelId: "google/gemini-3.1-pro-preview",
+ displayName: "Gemini 3.1 Pro Preview (OpenRouter)",
+ contextWindow: 1048576,
+ maxOutputTokens: 65536,
+ pricing: {
+ input: 2,
+ output: 12
+ },
+ knowledgeCutoff: "2025-01",
+ features: {
+ streaming: true,
+ functionCalling: true,
+ vision: true,
+ reasoning: true,
+ structuredOutputs: true
+ },
+ metadata: {
+ family: "Gemini 3.1",
+ releaseDate: "2026-03",
+ notes: "Gemini 3.1 Pro Preview via OpenRouter. Frontier reasoning with enhanced software engineering performance."
+ }
+ },
+ {
+ provider: "openrouter",
+ modelId: "google/gemini-3.1-pro-preview-customtools",
+ displayName: "Gemini 3.1 Pro Preview Custom Tools (OpenRouter)",
+ contextWindow: 1048576,
+ maxOutputTokens: 65536,
+ pricing: {
+ input: 2,
+ output: 12
+ },
+ knowledgeCutoff: "2025-01",
+ features: {
+ streaming: true,
+ functionCalling: true,
+ vision: true,
+ reasoning: true,
+ structuredOutputs: true
+ },
+ metadata: {
+ family: "Gemini 3.1",
+ releaseDate: "2026-03",
+ notes: "Gemini 3.1 Pro Preview Custom Tools via OpenRouter. Improved tool selection to prevent overuse of general tools in agent workflows."
+ }
+ },
+ {
+ provider: "openrouter",
+ modelId: "google/gemini-3.1-flash-lite-preview",
+ displayName: "Gemini 3.1 Flash Lite Preview (OpenRouter)",
+ contextWindow: 1048576,
+ maxOutputTokens: 65536,
+ pricing: {
+ input: 0.25,
+ output: 1.5
+ },
+ knowledgeCutoff: "2025-01",
+ features: {
+ streaming: true,
+ functionCalling: true,
+ vision: true,
+ reasoning: true,
+ structuredOutputs: true
+ },
+ metadata: {
+ family: "Gemini 3.1",
+ releaseDate: "2026-03",
+ notes: "Gemini 3.1 Flash Lite Preview via OpenRouter. High-efficiency model with full thinking levels for cost/performance trade-offs."
+ }
+ },
+ {
+ provider: "openrouter",
+ modelId: "google/gemini-3.1-flash-image-preview",
+ displayName: "Gemini 3.1 Flash Image Preview (OpenRouter)",
+ contextWindow: 65536,
+ maxOutputTokens: 65536,
+ pricing: {
+ input: 0.5,
+ output: 3
+ },
+ knowledgeCutoff: "2025-01",
+ features: {
+ streaming: true,
+ functionCalling: false,
+ vision: true
+ },
+ metadata: {
+ family: "Gemini 3.1",
+ releaseDate: "2026-03",
+ notes: "Gemini 3.1 Flash Image Preview via OpenRouter. Pro-level image generation and editing at Flash speed."
+ }
+ },
+ // ============================================================
  // Meta Llama Models (via OpenRouter)
  // ============================================================
  {
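These registry entries feed directly into the compaction math: `shouldCompactFromUsage()` divides the API-reported token count by the entry's `contextWindow`. A worked example with the 1,048,576-token window above; the default `triggerThresholdPercent` is not shown in this diff, so 80 is an assumed illustrative value:

```typescript
// Worked example of the reactive trigger check for a Gemini 3.1 Pro window.
const contextWindow = 1_048_576;    // from the registry entry above
const triggerThresholdPercent = 80; // assumption; default not shown in this diff
const lastTokenCount = 900_000;     // API-reported inputTokens

const usagePercent = (lastTokenCount / contextWindow) * 100; // ≈ 85.8
console.log(usagePercent >= triggerThresholdPercent);        // true → compact
```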
@@ -12596,6 +12759,7 @@ var init_client = __esm({
  "use strict";
  init_builder();
  init_discovery();
+ init_constants();
  init_model_registry();
  init_image();
  init_speech();
@@ -12714,8 +12878,43 @@ var init_client = __esm({
  if (adapter.countTokens) {
  return adapter.countTokens(messages, descriptor, spec);
  }
- const totalChars = messages.reduce((sum, msg) => sum + (msg.content?.length ?? 0), 0);
- return Math.ceil(totalChars / 4);
+ try {
+ const { get_encoding: get_encoding2 } = await import("tiktoken");
+ const encoding = get_encoding2("o200k_base");
+ try {
+ let tokenCount = 0;
+ for (const msg of messages) {
+ const content = msg.content;
+ if (typeof content === "string") {
+ tokenCount += encoding.encode(content).length;
+ } else if (Array.isArray(content)) {
+ for (const part of content) {
+ if (part.type === "text") {
+ tokenCount += encoding.encode(part.text).length;
+ }
+ }
+ }
+ }
+ return tokenCount;
+ } finally {
+ encoding.free();
+ }
+ } catch {
+ let totalChars = 0;
+ for (const msg of messages) {
+ const content = msg.content;
+ if (typeof content === "string") {
+ totalChars += content.length;
+ } else if (Array.isArray(content)) {
+ for (const part of content) {
+ if (part.type === "text") {
+ totalChars += part.text.length;
+ }
+ }
+ }
+ }
+ return Math.ceil(totalChars / CHARS_PER_TOKEN);
+ }
  }
  resolveAdapter(descriptor) {
  const adapter = this.adapters.find((item) => item.supports(descriptor));
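Unlike the provider, the client loads tiktoken lazily with a dynamic `await import(...)`, so the dependency stays optional and any load or encode failure degrades to the `CHARS_PER_TOKEN` heuristic. The pattern in isolation, as a sketch:

```typescript
// Sketch of the lazy-import-with-fallback pattern used above.
async function countTokensWithFallback(text: string): Promise<number> {
  try {
    const { get_encoding } = await import("tiktoken");
    const encoding = get_encoding("o200k_base");
    try {
      return encoding.encode(text).length;
    } finally {
      encoding.free();
    }
  } catch {
    // tiktoken unavailable or failed: fall back to the character heuristic
    const CHARS_PER_TOKEN = 2; // matches the constant changed at the top of this diff
    return Math.ceil(text.length / CHARS_PER_TOKEN);
  }
}
```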
@@ -16380,7 +16579,8 @@ var init_agent = __esm({
  this.compactionManager = new CompactionManager(
  this.client,
  this.model,
- options.compactionConfig
+ options.compactionConfig,
+ this.logger
  );
  }
  this.signal = options.signal;
@@ -16726,6 +16926,22 @@ var init_agent = __esm({
  this.logger.info("Loop terminated by gadget or processor");
  break;
  }
+ if (this.compactionManager && result.usage?.inputTokens) {
+ this.compactionManager.updateUsage(result.usage.inputTokens);
+ if (this.compactionManager.shouldCompactFromUsage()) {
+ this.logger.info("Reactive compaction triggered from API-reported usage", {
+ inputTokens: result.usage.inputTokens,
+ iteration: currentIteration
+ });
+ const reactiveCompaction = await this.compactionManager.compact(
+ this.conversation,
+ currentIteration
+ );
+ if (reactiveCompaction) {
+ yield await this.emitCompactionEvent(reactiveCompaction, currentIteration);
+ }
+ }
+ }
  if (this.budget !== void 0) {
  const totalCost = this.tree.getTotalCost();
  if (totalCost >= this.budget) {