llmist 16.2.4 → 17.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -7328,11 +7328,14 @@ declare class CompactionManager {
  private readonly model;
  private readonly config;
  private readonly strategy;
+ private readonly logger;
  private modelLimits?;
+ private hasWarnedModelNotFound;
+ private hasWarnedNoTokenCounting;
  private totalCompactions;
  private totalTokensSaved;
  private lastTokenCount;
- constructor(client: LLMist, model: string, config?: CompactionConfig);
+ constructor(client: LLMist, model: string, config?: CompactionConfig, logger?: Logger<ILogObj>);
  /**
  * Check if compaction is needed and perform it if so.
  *
@@ -7350,6 +7353,22 @@ declare class CompactionManager {
  * @returns CompactionEvent with compaction details
  */
  compact(conversation: IConversationManager, iteration: number, precomputed?: PrecomputedTokens): Promise<CompactionEvent | null>;
+ /**
+ * Feed API-reported input token count for reactive threshold checking.
+ * Call this after each LLM response with the actual inputTokens from usage.
+ */
+ updateUsage(inputTokens: number): void;
+ /**
+ * Check if compaction should trigger based on API-reported usage.
+ * Unlike checkAndCompact() which uses estimated token counts,
+ * this uses the ground-truth token count from the last LLM response.
+ */
+ shouldCompactFromUsage(): boolean;
+ /**
+ * Resolve and cache model limits from registry. Warns once if not found.
+ * @returns true if limits are available, false otherwise
+ */
+ private resolveModelLimits;
  /**
  * Get compaction statistics.
  */
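For integrators calling these new hooks directly (rather than through the agent, which wires them up in dist/index.js further down), a minimal usage sketch; the callModel helper, the iteration counter, and the triggerThresholdPercent config key are assumptions for illustration, not part of the published surface shown in this diff:

// Sketch only: feed API-reported usage after each response, then compact reactively.
const manager = new CompactionManager(client, model, { triggerThresholdPercent: 80 }); // config key assumed
const result = await callModel(conversation);      // assumed helper returning { usage: { inputTokens } }
manager.updateUsage(result.usage.inputTokens);     // ground-truth input tokens from the provider
if (manager.shouldCompactFromUsage()) {
  const event = await manager.compact(conversation, iteration);
  if (event) {
    // a CompactionEvent was produced; log or forward it as needed
  }
}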
@@ -8613,20 +8632,16 @@ declare class GadgetCallParser {
  /**
  * Character-to-token ratio for fallback token estimation.
  *
- * Rationale: When native token counting APIs fail, we estimate tokens using
- * a rough heuristic of 4 characters per token. This is based on empirical
- * observations across multiple LLM providers:
- * - OpenAI's GPT models average ~4 chars/token for English text
- * - Anthropic's Claude models have similar characteristics
- * - Gemini models also approximate this ratio
- *
- * This is intentionally conservative to avoid underestimating token usage.
- * While not perfectly accurate, it provides a reasonable fallback when
- * precise tokenization is unavailable.
+ * Used only when tiktoken (the primary fallback) is unavailable. A value of 2
+ * errs on the side of overestimating token count, which is safer for
+ * compaction triggers and output limiting.
  *
- * Reference: https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
+ * Rationale: The previous value of 4 was based on English prose averages, but
+ * agentic sessions are dominated by JSON, code, and structured data where the
+ * real ratio is ~1.5-2.5 chars/token. A 4-char estimate underestimated tokens
+ * by up to 250%, causing compaction and output limiting to never trigger.
  */
- declare const FALLBACK_CHARS_PER_TOKEN = 4;
+ declare const FALLBACK_CHARS_PER_TOKEN = 2;

  /**
  * Subagent creation helper for gadget authors.
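A worked example of what the 4 → 2 change means in practice; the figures below are illustrative, not measured:

// Illustrative arithmetic only. A 10,000-character JSON tool result at a true ratio of
// ~2 chars/token is ~5,000 tokens. Estimating with 4 chars/token yields 2,500 (half the
// real count), so thresholds keyed to the estimate fire far too late; estimating with
// 2 chars/token yields 5,000 here and at worst overestimates plain prose.
const estimateTokens = (text: string, charsPerToken: number): number =>
  Math.ceil(text.length / charsPerToken);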
@@ -9554,8 +9569,14 @@ declare abstract class OpenAICompatibleProvider<TConfig extends OpenAICompatible
  protected executeStreamRequest(payload: Parameters<OpenAI["chat"]["completions"]["create"]>[0], signal?: AbortSignal): Promise<AsyncIterable<ChatCompletionChunk>>;
  protected normalizeProviderStream(iterable: AsyncIterable<unknown>): LLMStream;
  /**
- * Count tokens using character-based fallback estimation.
- * Most meta-providers don't have a native token counting API.
+ * Count tokens using tiktoken o200k_base encoding.
+ *
+ * While o200k_base isn't model-exact for non-OpenAI models routed through
+ * meta-providers like OpenRouter, BPE tokenizers with 200K vocab produce
+ * counts within 10-20% of true values — far better than the character-based
+ * fallback which can be off by 250% for JSON/code-heavy content.
+ *
+ * Falls back to character-based estimation if tiktoken fails.
  */
  countTokens(messages: LLMMessage[], descriptor: ModelDescriptor, _spec?: ModelSpec): Promise<number>;
  }
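A self-contained sketch of the o200k_base counting path this declaration documents, using only the tiktoken calls that appear in the dist/index.js hunks below; the message shape is simplified to plain strings:

import { get_encoding } from "tiktoken";

// Count tokens for plain-text messages with the o200k_base encoding, releasing the
// encoder's WASM memory afterwards (tiktoken encodings must be freed explicitly).
function countTextTokens(texts: string[]): number {
  const encoding = get_encoding("o200k_base");
  try {
    return texts.reduce((sum, text) => sum + encoding.encode(text).length, 0);
  } finally {
    encoding.free();
  }
}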
package/dist/index.d.ts CHANGED
@@ -7328,11 +7328,14 @@ declare class CompactionManager {
  private readonly model;
  private readonly config;
  private readonly strategy;
+ private readonly logger;
  private modelLimits?;
+ private hasWarnedModelNotFound;
+ private hasWarnedNoTokenCounting;
  private totalCompactions;
  private totalTokensSaved;
  private lastTokenCount;
- constructor(client: LLMist, model: string, config?: CompactionConfig);
+ constructor(client: LLMist, model: string, config?: CompactionConfig, logger?: Logger<ILogObj>);
  /**
  * Check if compaction is needed and perform it if so.
  *
@@ -7350,6 +7353,22 @@ declare class CompactionManager {
  * @returns CompactionEvent with compaction details
  */
  compact(conversation: IConversationManager, iteration: number, precomputed?: PrecomputedTokens): Promise<CompactionEvent | null>;
+ /**
+ * Feed API-reported input token count for reactive threshold checking.
+ * Call this after each LLM response with the actual inputTokens from usage.
+ */
+ updateUsage(inputTokens: number): void;
+ /**
+ * Check if compaction should trigger based on API-reported usage.
+ * Unlike checkAndCompact() which uses estimated token counts,
+ * this uses the ground-truth token count from the last LLM response.
+ */
+ shouldCompactFromUsage(): boolean;
+ /**
+ * Resolve and cache model limits from registry. Warns once if not found.
+ * @returns true if limits are available, false otherwise
+ */
+ private resolveModelLimits;
  /**
  * Get compaction statistics.
  */
@@ -8613,20 +8632,16 @@ declare class GadgetCallParser {
  /**
  * Character-to-token ratio for fallback token estimation.
  *
- * Rationale: When native token counting APIs fail, we estimate tokens using
- * a rough heuristic of 4 characters per token. This is based on empirical
- * observations across multiple LLM providers:
- * - OpenAI's GPT models average ~4 chars/token for English text
- * - Anthropic's Claude models have similar characteristics
- * - Gemini models also approximate this ratio
- *
- * This is intentionally conservative to avoid underestimating token usage.
- * While not perfectly accurate, it provides a reasonable fallback when
- * precise tokenization is unavailable.
+ * Used only when tiktoken (the primary fallback) is unavailable. A value of 2
+ * errs on the side of overestimating token count, which is safer for
+ * compaction triggers and output limiting.
  *
- * Reference: https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
+ * Rationale: The previous value of 4 was based on English prose averages, but
+ * agentic sessions are dominated by JSON, code, and structured data where the
+ * real ratio is ~1.5-2.5 chars/token. A 4-char estimate underestimated tokens
+ * by up to 250%, causing compaction and output limiting to never trigger.
  */
- declare const FALLBACK_CHARS_PER_TOKEN = 4;
+ declare const FALLBACK_CHARS_PER_TOKEN = 2;

  /**
  * Subagent creation helper for gadget authors.
@@ -9554,8 +9569,14 @@ declare abstract class OpenAICompatibleProvider<TConfig extends OpenAICompatible
  protected executeStreamRequest(payload: Parameters<OpenAI["chat"]["completions"]["create"]>[0], signal?: AbortSignal): Promise<AsyncIterable<ChatCompletionChunk>>;
  protected normalizeProviderStream(iterable: AsyncIterable<unknown>): LLMStream;
  /**
- * Count tokens using character-based fallback estimation.
- * Most meta-providers don't have a native token counting API.
+ * Count tokens using tiktoken o200k_base encoding.
+ *
+ * While o200k_base isn't model-exact for non-OpenAI models routed through
+ * meta-providers like OpenRouter, BPE tokenizers with 200K vocab produce
+ * counts within 10-20% of true values — far better than the character-based
+ * fallback which can be off by 250% for JSON/code-heavy content.
+ *
+ * Falls back to character-based estimation if tiktoken fails.
  */
  countTokens(messages: LLMMessage[], descriptor: ModelDescriptor, _spec?: ModelSpec): Promise<number>;
  }
package/dist/index.js CHANGED
@@ -813,7 +813,7 @@ var init_constants = __esm({
  GADGET_ARG_PREFIX = "!!!ARG:";
  DEFAULT_GADGET_OUTPUT_LIMIT = true;
  DEFAULT_GADGET_OUTPUT_LIMIT_PERCENT = 15;
- CHARS_PER_TOKEN = 4;
+ CHARS_PER_TOKEN = 2;
  FALLBACK_CONTEXT_WINDOW = 128e3;
  }
  });
@@ -2834,6 +2834,7 @@ var CompactionManager;
  var init_manager = __esm({
  "src/agent/compaction/manager.ts"() {
  "use strict";
+ init_logger();
  init_config();
  init_strategies();
  CompactionManager = class {
@@ -2841,15 +2842,19 @@ var init_manager = __esm({
  model;
  config;
  strategy;
+ logger;
  modelLimits;
+ hasWarnedModelNotFound = false;
+ hasWarnedNoTokenCounting = false;
  // Statistics
  totalCompactions = 0;
  totalTokensSaved = 0;
  lastTokenCount = 0;
- constructor(client, model, config = {}) {
+ constructor(client, model, config = {}, logger2) {
  this.client = client;
  this.model = model;
  this.config = resolveCompactionConfig(config);
+ this.logger = logger2 ?? createLogger({ name: "llmist:compaction" });
  if (typeof config.strategy === "object" && "compact" in config.strategy) {
  this.strategy = config.strategy;
  } else {
@@ -2867,13 +2872,16 @@ var init_manager = __esm({
  if (!this.config.enabled) {
  return null;
  }
- if (!this.modelLimits) {
- this.modelLimits = this.client.modelRegistry.getModelLimits(this.model);
- if (!this.modelLimits) {
- return null;
- }
+ if (!this.resolveModelLimits()) {
+ return null;
  }
  if (!this.client.countTokens) {
+ if (!this.hasWarnedNoTokenCounting) {
+ this.hasWarnedNoTokenCounting = true;
+ this.logger.warn("Compaction skipped: client does not support token counting", {
+ model: this.model
+ });
+ }
  return null;
  }
  const messages = conversation.getMessages();
@@ -2904,11 +2912,8 @@ var init_manager = __esm({
  * @returns CompactionEvent with compaction details
  */
  async compact(conversation, iteration, precomputed) {
- if (!this.modelLimits) {
- this.modelLimits = this.client.modelRegistry.getModelLimits(this.model);
- if (!this.modelLimits) {
- return null;
- }
+ if (!this.resolveModelLimits()) {
+ return null;
  }
  const historyMessages = precomputed?.historyMessages ?? conversation.getHistoryMessages();
  const baseMessages = precomputed?.baseMessages ?? conversation.getBaseMessages();
@@ -2950,6 +2955,42 @@ var init_manager = __esm({
  }
  return event;
  }
+ /**
+ * Feed API-reported input token count for reactive threshold checking.
+ * Call this after each LLM response with the actual inputTokens from usage.
+ */
+ updateUsage(inputTokens) {
+ this.lastTokenCount = inputTokens;
+ }
+ /**
+ * Check if compaction should trigger based on API-reported usage.
+ * Unlike checkAndCompact() which uses estimated token counts,
+ * this uses the ground-truth token count from the last LLM response.
+ */
+ shouldCompactFromUsage() {
+ if (!this.config.enabled) return false;
+ if (!this.resolveModelLimits()) return false;
+ const usagePercent = this.lastTokenCount / this.modelLimits.contextWindow * 100;
+ return usagePercent >= this.config.triggerThresholdPercent;
+ }
+ /**
+ * Resolve and cache model limits from registry. Warns once if not found.
+ * @returns true if limits are available, false otherwise
+ */
+ resolveModelLimits() {
+ if (this.modelLimits) return true;
+ this.modelLimits = this.client.modelRegistry.getModelLimits(this.model);
+ if (!this.modelLimits) {
+ if (!this.hasWarnedModelNotFound) {
+ this.hasWarnedModelNotFound = true;
+ this.logger.warn("Compaction skipped: model not found in registry", {
+ model: this.model
+ });
+ }
+ return false;
+ }
+ return true;
+ }
  /**
  * Get compaction statistics.
  */
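To make the threshold check above concrete (example figures only, not taken from the package): with a 1,048,576-token context window and an 80% trigger threshold, an API-reported inputTokens of 850,000 gives 850000 / 1048576 * 100 ≈ 81.1%, so shouldCompactFromUsage() returns true; 800,000 tokens is ≈ 76.3% and does not trigger.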
@@ -7350,7 +7391,7 @@ var init_constants2 = __esm({
  "src/providers/constants.ts"() {
  "use strict";
  ANTHROPIC_DEFAULT_MAX_OUTPUT_TOKENS = 4096;
- FALLBACK_CHARS_PER_TOKEN = 4;
+ FALLBACK_CHARS_PER_TOKEN = 2;
  OPENAI_MESSAGE_OVERHEAD_TOKENS = 4;
  OPENAI_REPLY_PRIMING_TOKENS = 2;
  OPENAI_NAME_FIELD_OVERHEAD_TOKENS = 1;
@@ -9705,6 +9746,7 @@ var init_huggingface_models = __esm({

  // src/providers/openai-compatible-provider.ts
  import OpenAI from "openai";
+ import { get_encoding } from "tiktoken";
  var ROLE_MAP, OpenAICompatibleProvider;
  var init_openai_compatible_provider = __esm({
  "src/providers/openai-compatible-provider.ts"() {
@@ -9909,11 +9951,38 @@ var init_openai_compatible_provider = __esm({
  }
  }
  /**
- * Count tokens using character-based fallback estimation.
- * Most meta-providers don't have a native token counting API.
+ * Count tokens using tiktoken o200k_base encoding.
+ *
+ * While o200k_base isn't model-exact for non-OpenAI models routed through
+ * meta-providers like OpenRouter, BPE tokenizers with 200K vocab produce
+ * counts within 10-20% of true values — far better than the character-based
+ * fallback which can be off by 250% for JSON/code-heavy content.
+ *
+ * Falls back to character-based estimation if tiktoken fails.
  */
  async countTokens(messages, descriptor, _spec) {
+ if (!messages || messages.length === 0) return 0;
  try {
+ const encoding = get_encoding("o200k_base");
+ try {
+ let tokenCount = 0;
+ for (const msg of messages) {
+ const parts = normalizeMessageContent(msg.content);
+ for (const part of parts) {
+ if (part.type === "text") {
+ tokenCount += encoding.encode(part.text).length;
+ }
+ }
+ }
+ return tokenCount;
+ } finally {
+ encoding.free();
+ }
+ } catch (error) {
+ console.warn(
+ `Token counting with tiktoken failed for ${descriptor.name}, using fallback estimation:`,
+ error
+ );
  let totalChars = 0;
  for (const msg of messages) {
  const parts = normalizeMessageContent(msg.content);
@@ -9924,9 +9993,6 @@ var init_openai_compatible_provider = __esm({
  }
  }
  return Math.ceil(totalChars / FALLBACK_CHARS_PER_TOKEN);
- } catch (error) {
- console.warn(`Token counting failed for ${descriptor.name}, using zero estimate:`, error);
- return 0;
  }
  }
  };
@@ -11435,6 +11501,103 @@ var init_openrouter_models = __esm({
  }
  },
  // ============================================================
+ // Google Gemini 3.1 Models (via OpenRouter)
+ // ============================================================
+ {
+ provider: "openrouter",
+ modelId: "google/gemini-3.1-pro-preview",
+ displayName: "Gemini 3.1 Pro Preview (OpenRouter)",
+ contextWindow: 1048576,
+ maxOutputTokens: 65536,
+ pricing: {
+ input: 2,
+ output: 12
+ },
+ knowledgeCutoff: "2025-01",
+ features: {
+ streaming: true,
+ functionCalling: true,
+ vision: true,
+ reasoning: true,
+ structuredOutputs: true
+ },
+ metadata: {
+ family: "Gemini 3.1",
+ releaseDate: "2026-03",
+ notes: "Gemini 3.1 Pro Preview via OpenRouter. Frontier reasoning with enhanced software engineering performance."
+ }
+ },
+ {
+ provider: "openrouter",
+ modelId: "google/gemini-3.1-pro-preview-customtools",
+ displayName: "Gemini 3.1 Pro Preview Custom Tools (OpenRouter)",
+ contextWindow: 1048576,
+ maxOutputTokens: 65536,
+ pricing: {
+ input: 2,
+ output: 12
+ },
+ knowledgeCutoff: "2025-01",
+ features: {
+ streaming: true,
+ functionCalling: true,
+ vision: true,
+ reasoning: true,
+ structuredOutputs: true
+ },
+ metadata: {
+ family: "Gemini 3.1",
+ releaseDate: "2026-03",
+ notes: "Gemini 3.1 Pro Preview Custom Tools via OpenRouter. Improved tool selection to prevent overuse of general tools in agent workflows."
+ }
+ },
+ {
+ provider: "openrouter",
+ modelId: "google/gemini-3.1-flash-lite-preview",
+ displayName: "Gemini 3.1 Flash Lite Preview (OpenRouter)",
+ contextWindow: 1048576,
+ maxOutputTokens: 65536,
+ pricing: {
+ input: 0.25,
+ output: 1.5
+ },
+ knowledgeCutoff: "2025-01",
+ features: {
+ streaming: true,
+ functionCalling: true,
+ vision: true,
+ reasoning: true,
+ structuredOutputs: true
+ },
+ metadata: {
+ family: "Gemini 3.1",
+ releaseDate: "2026-03",
+ notes: "Gemini 3.1 Flash Lite Preview via OpenRouter. High-efficiency model with full thinking levels for cost/performance trade-offs."
+ }
+ },
+ {
+ provider: "openrouter",
+ modelId: "google/gemini-3.1-flash-image-preview",
+ displayName: "Gemini 3.1 Flash Image Preview (OpenRouter)",
+ contextWindow: 65536,
+ maxOutputTokens: 65536,
+ pricing: {
+ input: 0.5,
+ output: 3
+ },
+ knowledgeCutoff: "2025-01",
+ features: {
+ streaming: true,
+ functionCalling: false,
+ vision: true
+ },
+ metadata: {
+ family: "Gemini 3.1",
+ releaseDate: "2026-03",
+ notes: "Gemini 3.1 Flash Image Preview via OpenRouter. Pro-level image generation and editing at Flash speed."
+ }
+ },
+ // ============================================================
  // Meta Llama Models (via OpenRouter)
  // ============================================================
  {
@@ -12588,6 +12751,7 @@ var init_client = __esm({
  "use strict";
  init_builder();
  init_discovery();
+ init_constants();
  init_model_registry();
  init_image();
  init_speech();
@@ -12706,8 +12870,43 @@ var init_client = __esm({
  if (adapter.countTokens) {
  return adapter.countTokens(messages, descriptor, spec);
  }
- const totalChars = messages.reduce((sum, msg) => sum + (msg.content?.length ?? 0), 0);
- return Math.ceil(totalChars / 4);
+ try {
+ const { get_encoding: get_encoding2 } = await import("tiktoken");
+ const encoding = get_encoding2("o200k_base");
+ try {
+ let tokenCount = 0;
+ for (const msg of messages) {
+ const content = msg.content;
+ if (typeof content === "string") {
+ tokenCount += encoding.encode(content).length;
+ } else if (Array.isArray(content)) {
+ for (const part of content) {
+ if (part.type === "text") {
+ tokenCount += encoding.encode(part.text).length;
+ }
+ }
+ }
+ }
+ return tokenCount;
+ } finally {
+ encoding.free();
+ }
+ } catch {
+ let totalChars = 0;
+ for (const msg of messages) {
+ const content = msg.content;
+ if (typeof content === "string") {
+ totalChars += content.length;
+ } else if (Array.isArray(content)) {
+ for (const part of content) {
+ if (part.type === "text") {
+ totalChars += part.text.length;
+ }
+ }
+ }
+ }
+ return Math.ceil(totalChars / CHARS_PER_TOKEN);
+ }
  }
  resolveAdapter(descriptor) {
  const adapter = this.adapters.find((item) => item.supports(descriptor));
@@ -16372,7 +16571,8 @@ var init_agent = __esm({
  this.compactionManager = new CompactionManager(
  this.client,
  this.model,
- options.compactionConfig
+ options.compactionConfig,
+ this.logger
  );
  }
  this.signal = options.signal;
@@ -16718,6 +16918,22 @@ var init_agent = __esm({
  this.logger.info("Loop terminated by gadget or processor");
  break;
  }
+ if (this.compactionManager && result.usage?.inputTokens) {
+ this.compactionManager.updateUsage(result.usage.inputTokens);
+ if (this.compactionManager.shouldCompactFromUsage()) {
+ this.logger.info("Reactive compaction triggered from API-reported usage", {
+ inputTokens: result.usage.inputTokens,
+ iteration: currentIteration
+ });
+ const reactiveCompaction = await this.compactionManager.compact(
+ this.conversation,
+ currentIteration
+ );
+ if (reactiveCompaction) {
+ yield await this.emitCompactionEvent(reactiveCompaction, currentIteration);
+ }
+ }
+ }
  if (this.budget !== void 0) {
  const totalCost = this.tree.getTotalCost();
  if (totalCost >= this.budget) {