llmist 16.2.4 → 17.1.0

This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
package/dist/index.cjs CHANGED
@@ -818,7 +818,7 @@ var init_constants = __esm({
  GADGET_ARG_PREFIX = "!!!ARG:";
  DEFAULT_GADGET_OUTPUT_LIMIT = true;
  DEFAULT_GADGET_OUTPUT_LIMIT_PERCENT = 15;
- CHARS_PER_TOKEN = 4;
+ CHARS_PER_TOKEN = 2;
  FALLBACK_CONTEXT_WINDOW = 128e3;
  }
  });
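Why this matters downstream: `CHARS_PER_TOKEN` is the divisor in the character-based fallback estimate (`Math.ceil(totalChars / CHARS_PER_TOKEN)`, used later in this diff). A minimal sketch of what halving the divisor changes, with illustrative numbers:

```typescript
// Sketch: the fallback estimate used elsewhere in this diff.
// Dense JSON/code content often tokenizes near 2 chars per token,
// so the old divisor of 4 could badly underestimate usage.
const estimateTokens = (totalChars: number, charsPerToken: number): number =>
  Math.ceil(totalChars / charsPerToken);

const chars = 10_000;
console.log(estimateTokens(chars, 4)); // 2500 (old, optimistic)
console.log(estimateTokens(chars, 2)); // 5000 (new, conservative)
```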
@@ -2839,6 +2839,7 @@ var CompactionManager;
  var init_manager = __esm({
  "src/agent/compaction/manager.ts"() {
  "use strict";
+ init_logger();
  init_config();
  init_strategies();
  CompactionManager = class {
@@ -2846,15 +2847,19 @@ var init_manager = __esm({
  model;
  config;
  strategy;
+ logger;
  modelLimits;
+ hasWarnedModelNotFound = false;
+ hasWarnedNoTokenCounting = false;
  // Statistics
  totalCompactions = 0;
  totalTokensSaved = 0;
  lastTokenCount = 0;
- constructor(client, model, config = {}) {
+ constructor(client, model, config = {}, logger2) {
  this.client = client;
  this.model = model;
  this.config = resolveCompactionConfig(config);
+ this.logger = logger2 ?? createLogger({ name: "llmist:compaction" });
  if (typeof config.strategy === "object" && "compact" in config.strategy) {
  this.strategy = config.strategy;
  } else {
@@ -2872,13 +2877,16 @@ var init_manager = __esm({
  if (!this.config.enabled) {
  return null;
  }
- if (!this.modelLimits) {
- this.modelLimits = this.client.modelRegistry.getModelLimits(this.model);
- if (!this.modelLimits) {
- return null;
- }
+ if (!this.resolveModelLimits()) {
+ return null;
  }
  if (!this.client.countTokens) {
+ if (!this.hasWarnedNoTokenCounting) {
+ this.hasWarnedNoTokenCounting = true;
+ this.logger.warn("Compaction skipped: client does not support token counting", {
+ model: this.model
+ });
+ }
  return null;
  }
  const messages = conversation.getMessages();
@@ -2909,11 +2917,8 @@ var init_manager = __esm({
  * @returns CompactionEvent with compaction details
  */
  async compact(conversation, iteration, precomputed) {
- if (!this.modelLimits) {
- this.modelLimits = this.client.modelRegistry.getModelLimits(this.model);
- if (!this.modelLimits) {
- return null;
- }
+ if (!this.resolveModelLimits()) {
+ return null;
  }
  const historyMessages = precomputed?.historyMessages ?? conversation.getHistoryMessages();
  const baseMessages = precomputed?.baseMessages ?? conversation.getBaseMessages();
@@ -2955,6 +2960,42 @@ var init_manager = __esm({
  }
  return event;
  }
+ /**
+ * Feed API-reported input token count for reactive threshold checking.
+ * Call this after each LLM response with the actual inputTokens from usage.
+ */
+ updateUsage(inputTokens) {
+ this.lastTokenCount = inputTokens;
+ }
+ /**
+ * Check if compaction should trigger based on API-reported usage.
+ * Unlike checkAndCompact() which uses estimated token counts,
+ * this uses the ground-truth token count from the last LLM response.
+ */
+ shouldCompactFromUsage() {
+ if (!this.config.enabled) return false;
+ if (!this.resolveModelLimits()) return false;
+ const usagePercent = this.lastTokenCount / this.modelLimits.contextWindow * 100;
+ return usagePercent >= this.config.triggerThresholdPercent;
+ }
+ /**
+ * Resolve and cache model limits from registry. Warns once if not found.
+ * @returns true if limits are available, false otherwise
+ */
+ resolveModelLimits() {
+ if (this.modelLimits) return true;
+ this.modelLimits = this.client.modelRegistry.getModelLimits(this.model);
+ if (!this.modelLimits) {
+ if (!this.hasWarnedModelNotFound) {
+ this.hasWarnedModelNotFound = true;
+ this.logger.warn("Compaction skipped: model not found in registry", {
+ model: this.model
+ });
+ }
+ return false;
+ }
+ return true;
+ }
  /**
  * Get compaction statistics.
  */
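Taken together, the new methods give callers a reactive path: feed the API-reported input token count after each response, then check whether the usage threshold was crossed. A minimal usage sketch, assuming a constructed `CompactionManager` and a `conversation`/`iteration` pair like the agent loop near the end of this diff provides (the `usage` shape with `inputTokens` mirrors that loop):

```typescript
// Sketch of the reactive compaction flow added in this version.
// `manager`, `conversation`, and `iteration` are assumed to come from
// the surrounding agent, as in the loop change later in this diff.
async function afterResponse(
  manager: CompactionManager,
  conversation: Conversation,
  iteration: number,
  usage?: { inputTokens?: number },
): Promise<void> {
  if (!usage?.inputTokens) return;
  manager.updateUsage(usage.inputTokens); // ground-truth count from the API
  if (manager.shouldCompactFromUsage()) { // usage% >= triggerThresholdPercent
    await manager.compact(conversation, iteration); // CompactionEvent or null
  }
}
```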
@@ -7358,7 +7399,7 @@ var init_constants2 = __esm({
  "src/providers/constants.ts"() {
  "use strict";
  ANTHROPIC_DEFAULT_MAX_OUTPUT_TOKENS = 4096;
- FALLBACK_CHARS_PER_TOKEN = 4;
+ FALLBACK_CHARS_PER_TOKEN = 2;
  OPENAI_MESSAGE_OVERHEAD_TOKENS = 4;
  OPENAI_REPLY_PRIMING_TOKENS = 2;
  OPENAI_NAME_FIELD_OVERHEAD_TOKENS = 1;
@@ -9712,11 +9753,12 @@ var init_huggingface_models = __esm({
  });

  // src/providers/openai-compatible-provider.ts
- var import_openai, ROLE_MAP, OpenAICompatibleProvider;
+ var import_openai, import_tiktoken, ROLE_MAP, OpenAICompatibleProvider;
  var init_openai_compatible_provider = __esm({
  "src/providers/openai-compatible-provider.ts"() {
  "use strict";
  import_openai = __toESM(require("openai"), 1);
+ import_tiktoken = require("tiktoken");
  init_messages();
  init_base_provider();
  init_constants2();
@@ -9917,11 +9959,38 @@ var init_openai_compatible_provider = __esm({
  }
  }
  /**
- * Count tokens using character-based fallback estimation.
- * Most meta-providers don't have a native token counting API.
+ * Count tokens using tiktoken o200k_base encoding.
+ *
+ * While o200k_base isn't model-exact for non-OpenAI models routed through
+ * meta-providers like OpenRouter, BPE tokenizers with 200K vocab produce
+ * counts within 10-20% of true values — far better than the character-based
+ * fallback which can be off by 250% for JSON/code-heavy content.
+ *
+ * Falls back to character-based estimation if tiktoken fails.
  */
  async countTokens(messages, descriptor, _spec) {
+ if (!messages || messages.length === 0) return 0;
  try {
+ const encoding = (0, import_tiktoken.get_encoding)("o200k_base");
+ try {
+ let tokenCount = 0;
+ for (const msg of messages) {
+ const parts = normalizeMessageContent(msg.content);
+ for (const part of parts) {
+ if (part.type === "text") {
+ tokenCount += encoding.encode(part.text).length;
+ }
+ }
+ }
+ return tokenCount;
+ } finally {
+ encoding.free();
+ }
+ } catch (error) {
+ console.warn(
+ `Token counting with tiktoken failed for ${descriptor.name}, using fallback estimation:`,
+ error
+ );
  let totalChars = 0;
  for (const msg of messages) {
  const parts = normalizeMessageContent(msg.content);
@@ -9932,9 +10001,6 @@ var init_openai_compatible_provider = __esm({
  }
  }
  return Math.ceil(totalChars / FALLBACK_CHARS_PER_TOKEN);
- } catch (error) {
- console.warn(`Token counting failed for ${descriptor.name}, using zero estimate:`, error);
- return 0;
  }
  }
  };
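The encoder returned by `get_encoding` is WASM-backed and must be released, hence the inner `try`/`finally` with `encoding.free()`. The same tiktoken calls in a self-contained sketch (plain strings only, no message normalization):

```typescript
import { get_encoding } from "tiktoken";

// Count tokens with o200k_base, mirroring the provider's approach above.
function countTextTokens(texts: string[]): number {
  const encoding = get_encoding("o200k_base");
  try {
    return texts.reduce((sum, text) => sum + encoding.encode(text).length, 0);
  } finally {
    encoding.free(); // release the WASM-backed encoder
  }
}

console.log(countTextTokens(['{"key": "value"}', "plain prose"]));
```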
@@ -10885,12 +10951,12 @@ function sanitizeExtra(extra, allowTemperature) {
  function createOpenAIProviderFromEnv() {
  return createProviderFromEnv("OPENAI_API_KEY", import_openai3.default, OpenAIChatProvider);
  }
- var import_openai3, import_tiktoken, ROLE_MAP2, OPENAI_EFFORT_MAP, OpenAIChatProvider;
+ var import_openai3, import_tiktoken2, ROLE_MAP2, OPENAI_EFFORT_MAP, OpenAIChatProvider;
  var init_openai = __esm({
  "src/providers/openai.ts"() {
  "use strict";
  import_openai3 = __toESM(require("openai"), 1);
- import_tiktoken = require("tiktoken");
+ import_tiktoken2 = require("tiktoken");
  init_messages();
  init_base_provider();
  init_constants2();
@@ -11149,9 +11215,9 @@ var init_openai = __esm({
  const modelName = descriptor.name;
  let encoding;
  try {
- encoding = (0, import_tiktoken.encoding_for_model)(modelName);
+ encoding = (0, import_tiktoken2.encoding_for_model)(modelName);
  } catch {
- encoding = (0, import_tiktoken.encoding_for_model)("gpt-4o");
+ encoding = (0, import_tiktoken2.encoding_for_model)("gpt-4o");
  }
  try {
  let tokenCount = 0;
@@ -11443,6 +11509,103 @@ var init_openrouter_models = __esm({
  }
  },
  // ============================================================
+ // Google Gemini 3.1 Models (via OpenRouter)
+ // ============================================================
+ {
+ provider: "openrouter",
+ modelId: "google/gemini-3.1-pro-preview",
+ displayName: "Gemini 3.1 Pro Preview (OpenRouter)",
+ contextWindow: 1048576,
+ maxOutputTokens: 65536,
+ pricing: {
+ input: 2,
+ output: 12
+ },
+ knowledgeCutoff: "2025-01",
+ features: {
+ streaming: true,
+ functionCalling: true,
+ vision: true,
+ reasoning: true,
+ structuredOutputs: true
+ },
+ metadata: {
+ family: "Gemini 3.1",
+ releaseDate: "2026-03",
+ notes: "Gemini 3.1 Pro Preview via OpenRouter. Frontier reasoning with enhanced software engineering performance."
+ }
+ },
+ {
+ provider: "openrouter",
+ modelId: "google/gemini-3.1-pro-preview-customtools",
+ displayName: "Gemini 3.1 Pro Preview Custom Tools (OpenRouter)",
+ contextWindow: 1048576,
+ maxOutputTokens: 65536,
+ pricing: {
+ input: 2,
+ output: 12
+ },
+ knowledgeCutoff: "2025-01",
+ features: {
+ streaming: true,
+ functionCalling: true,
+ vision: true,
+ reasoning: true,
+ structuredOutputs: true
+ },
+ metadata: {
+ family: "Gemini 3.1",
+ releaseDate: "2026-03",
+ notes: "Gemini 3.1 Pro Preview Custom Tools via OpenRouter. Improved tool selection to prevent overuse of general tools in agent workflows."
+ }
+ },
+ {
+ provider: "openrouter",
+ modelId: "google/gemini-3.1-flash-lite-preview",
+ displayName: "Gemini 3.1 Flash Lite Preview (OpenRouter)",
+ contextWindow: 1048576,
+ maxOutputTokens: 65536,
+ pricing: {
+ input: 0.25,
+ output: 1.5
+ },
+ knowledgeCutoff: "2025-01",
+ features: {
+ streaming: true,
+ functionCalling: true,
+ vision: true,
+ reasoning: true,
+ structuredOutputs: true
+ },
+ metadata: {
+ family: "Gemini 3.1",
+ releaseDate: "2026-03",
+ notes: "Gemini 3.1 Flash Lite Preview via OpenRouter. High-efficiency model with full thinking levels for cost/performance trade-offs."
+ }
+ },
+ {
+ provider: "openrouter",
+ modelId: "google/gemini-3.1-flash-image-preview",
+ displayName: "Gemini 3.1 Flash Image Preview (OpenRouter)",
+ contextWindow: 65536,
+ maxOutputTokens: 65536,
+ pricing: {
+ input: 0.5,
+ output: 3
+ },
+ knowledgeCutoff: "2025-01",
+ features: {
+ streaming: true,
+ functionCalling: false,
+ vision: true
+ },
+ metadata: {
+ family: "Gemini 3.1",
+ releaseDate: "2026-03",
+ notes: "Gemini 3.1 Flash Image Preview via OpenRouter. Pro-level image generation and editing at Flash speed."
+ }
+ },
+ // ============================================================
  // Meta Llama Models (via OpenRouter)
  // ============================================================
  {
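These registry entries feed directly into the compaction math: `shouldCompactFromUsage()` divides the API-reported token count by the entry's `contextWindow`. A worked example with the 1,048,576-token window above; the default `triggerThresholdPercent` is not shown in this diff, so 80 is an assumed illustrative value:

```typescript
// Worked example of the reactive trigger check for a Gemini 3.1 Pro window.
const contextWindow = 1_048_576;    // from the registry entry above
const triggerThresholdPercent = 80; // assumption; default not shown in this diff
const lastTokenCount = 900_000;     // API-reported inputTokens

const usagePercent = (lastTokenCount / contextWindow) * 100; // ≈ 85.8
console.log(usagePercent >= triggerThresholdPercent);        // true → compact
```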
@@ -12596,6 +12759,7 @@ var init_client = __esm({
  "use strict";
  init_builder();
  init_discovery();
+ init_constants();
  init_model_registry();
  init_image();
  init_speech();
@@ -12714,8 +12878,43 @@ var init_client = __esm({
  if (adapter.countTokens) {
  return adapter.countTokens(messages, descriptor, spec);
  }
- const totalChars = messages.reduce((sum, msg) => sum + (msg.content?.length ?? 0), 0);
- return Math.ceil(totalChars / 4);
+ try {
+ const { get_encoding: get_encoding2 } = await import("tiktoken");
+ const encoding = get_encoding2("o200k_base");
+ try {
+ let tokenCount = 0;
+ for (const msg of messages) {
+ const content = msg.content;
+ if (typeof content === "string") {
+ tokenCount += encoding.encode(content).length;
+ } else if (Array.isArray(content)) {
+ for (const part of content) {
+ if (part.type === "text") {
+ tokenCount += encoding.encode(part.text).length;
+ }
+ }
+ }
+ }
+ return tokenCount;
+ } finally {
+ encoding.free();
+ }
+ } catch {
+ let totalChars = 0;
+ for (const msg of messages) {
+ const content = msg.content;
+ if (typeof content === "string") {
+ totalChars += content.length;
+ } else if (Array.isArray(content)) {
+ for (const part of content) {
+ if (part.type === "text") {
+ totalChars += part.text.length;
+ }
+ }
+ }
+ }
+ return Math.ceil(totalChars / CHARS_PER_TOKEN);
+ }
  }
  resolveAdapter(descriptor) {
  const adapter = this.adapters.find((item) => item.supports(descriptor));
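Unlike the provider, the client loads tiktoken lazily with a dynamic `await import(...)`, so the dependency stays optional and any load or encode failure degrades to the `CHARS_PER_TOKEN` heuristic. The pattern in isolation, as a sketch:

```typescript
// Sketch of the lazy-import-with-fallback pattern used above.
async function countTokensWithFallback(text: string): Promise<number> {
  try {
    const { get_encoding } = await import("tiktoken");
    const encoding = get_encoding("o200k_base");
    try {
      return encoding.encode(text).length;
    } finally {
      encoding.free();
    }
  } catch {
    // tiktoken unavailable or failed: fall back to the character heuristic
    const CHARS_PER_TOKEN = 2; // matches the constant changed at the top of this diff
    return Math.ceil(text.length / CHARS_PER_TOKEN);
  }
}
```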
@@ -16380,7 +16579,8 @@ var init_agent = __esm({
  this.compactionManager = new CompactionManager(
  this.client,
  this.model,
- options.compactionConfig
+ options.compactionConfig,
+ this.logger
  );
  }
  this.signal = options.signal;
@@ -16726,6 +16926,22 @@ var init_agent = __esm({
  this.logger.info("Loop terminated by gadget or processor");
  break;
  }
+ if (this.compactionManager && result.usage?.inputTokens) {
+ this.compactionManager.updateUsage(result.usage.inputTokens);
+ if (this.compactionManager.shouldCompactFromUsage()) {
+ this.logger.info("Reactive compaction triggered from API-reported usage", {
+ inputTokens: result.usage.inputTokens,
+ iteration: currentIteration
+ });
+ const reactiveCompaction = await this.compactionManager.compact(
+ this.conversation,
+ currentIteration
+ );
+ if (reactiveCompaction) {
+ yield await this.emitCompactionEvent(reactiveCompaction, currentIteration);
+ }
+ }
+ }
  if (this.budget !== void 0) {
  const totalCost = this.tree.getTotalCost();
  if (totalCost >= this.budget) {