legion-llm 0.14.2 → 0.14.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8ab1822ba6aa5df945cd99b3bb2ee5e735080f97a517d4de694725e38f44bf71
4
- data.tar.gz: 276fd55c3fabce052c0c27a3cd1e84e050b502446a4b047ff8ec88fe829c7f47
3
+ metadata.gz: dac2aadf427ccf4c6e902242ed392c3f9f0f5a3e0ddaeeb6d6c262e3926c87b1
4
+ data.tar.gz: 48ccd139e47330107147d7745ec7e34c7a1de879cc8939d2d1fd33322d04b336
5
5
  SHA512:
6
- metadata.gz: f1a2ca486fe605683c14a8847ed209a6041c8a557d90b4e6e01676218d3bced83e776ae4352a6f19c09ab19e2f98e129cfb947dcd435a490a6df5c7b95240b31
7
- data.tar.gz: aa8cbe3c10d72b28d9a31a3e8f3114faaa2e016860e4520532d5872ab01a38e53084184cb22dbb0b7943691819291c17770fb7292da392460200057ba980e638
6
+ metadata.gz: 27c3269042cf0b060f0e402fad967f048319a2a98a425d4b68f39569d07e41842505bf947f32ae62a52125a0c81143e4d1fcf076bee53e247fd0cfe76a9e7f6a
7
+ data.tar.gz: 1fcd99c1a3007b858bc033f57a1cd3ada1fd96e2729d597fd65f8ac8eeb406295f1b73a2d4bcea31793ca6669b0bba545c32ac86474e2c9f1a5adcc60f9550f2
data/CHANGELOG.md CHANGED
@@ -1,5 +1,23 @@
1
1
  # Legion LLM Changelog
2
2
 
3
+ ## [0.14.3] - 2026-06-22
4
+
5
+ ### Fixed
6
+
7
+ - Context window enforcement now accounts for tool definition tokens when deciding whether to compact messages. Previously, 235 injected tools (~50k tokens) were invisible to the threshold check, causing context overflow on dispatch.
8
+ - Context overflow errors are no longer unconditionally terminal during escalation. When a lane with a larger context window exists, the executor retries on that lane instead of raising immediately.
9
+ - `X-Legion-Format: canonical` debug surface enabled by default (was gated to dev/lite mode only). The canonical format is a first-class output format, not a debug feature.
10
+ - Completion log now includes `context_tokens_saved=N` showing total tokens saved by curation, archival, thinking-strip, and context window compaction per request.
11
+ - Embedding dispatch now sends chunks individually instead of as an array, fixing intermittent `ContextOverflow` on RAG embedding queries with ollama.
12
+ - Truncated tool results now instruct the model to make smaller targeted requests instead of silently chopping content.
13
+ - `context_window_threshold` moved from hardcoded 0.90 to `context_curation_defaults` setting.
14
+
15
+ ### Changed
16
+
17
+ - `tool_result_max_dispatch_chars` lowered from 10k to 5k (dispatch-time truncation of oversized tool results before provider call).
18
+ - `context_curation.tool_result_max_chars` lowered from 10k to 2k (curator distills tool results earlier, reducing context growth).
19
+ - Default `tier_weights` rebalanced: direct=105, local=110, fleet=110, cloud=120, frontier=150 (prefer higher-capability tiers when lane_weight tie-breaks).
20
+
3
21
  ## [0.14.2] - 2026-06-20
4
22
 
5
23
  ### Fixed
@@ -179,6 +179,8 @@ module Legion
179
179
  provider_latency_ms = api_provider_latency_ms(pipeline_response, routing)
180
180
  conversation_id = pipeline_response.respond_to?(:conversation_id) ? pipeline_response.conversation_id : nil
181
181
 
182
+ context_stats = api_context_stats(pipeline_response)
183
+
182
184
  parts = {
183
185
  request_id: request_id || 'unknown',
184
186
  conversation_id: conversation_id || 'none',
@@ -199,6 +201,7 @@ module Legion
199
201
  tool_executions: api_tool_execution_count(pipeline_response),
200
202
  stop_reason: resolved_stop_reason
201
203
  }
204
+ parts.merge!(context_stats) if context_stats
202
205
 
203
206
  log.info("[llm][api][#{namespace}] action=completed " \
204
207
  "#{parts.map { |key, value| "#{key}=#{value}" }.join(' ')}")
@@ -228,6 +231,26 @@ module Legion
228
231
  nil
229
232
  end
230
233
 
234
+ def api_context_stats(pipeline_response)
235
+ return nil unless pipeline_response.respond_to?(:audit)
236
+
237
+ accounting = pipeline_response.audit&.dig(:context_accounting)
238
+ return nil unless accounting.is_a?(Hash)
239
+
240
+ tokens = accounting[:tokens] || {}
241
+ curated = tokens[:curation_saved_estimated_tokens].to_i
242
+ archived = tokens[:archive_saved_estimated_tokens].to_i
243
+ stripped = tokens[:stripped_thinking_estimated_tokens].to_i
244
+ compacted = tokens[:context_window_saved_estimated_tokens].to_i
245
+ total_saved = curated + archived + stripped + compacted
246
+ return nil unless total_saved.positive?
247
+
248
+ { context_tokens_saved: total_saved }
249
+ rescue StandardError => e
250
+ handle_exception(e, level: :warn, handled: true, operation: 'llm.api.context_stats')
251
+ nil
252
+ end
253
+
231
254
  def api_hash_value(hash, key)
232
255
  return nil unless hash.respond_to?(:[])
233
256
 
@@ -26,18 +26,21 @@ module Legion
26
26
  @context_accounting[:component_status][:context_window] = :observed
27
27
  return messages unless context_window&.positive?
28
28
 
29
- threshold = (context_window * 0.90).to_i
29
+ threshold = (context_window * Legion::Settings[:llm][:context_curation][:context_window_threshold]).to_i
30
+ tool_budget = estimate_tool_token_budget
31
+ available_for_messages = threshold - tool_budget
30
32
  estimated = estimate_message_tokens(messages)
31
- return messages if estimated <= threshold
33
+ return messages if estimated <= available_for_messages
32
34
 
33
35
  log.warn "[llm][executor] action=context_compaction request_id=#{@request.id} " \
34
- "estimated_tokens=#{estimated} context_window=#{context_window} threshold=#{threshold}"
36
+ "estimated_tokens=#{estimated} context_window=#{context_window} " \
37
+ "threshold=#{threshold} tool_budget=#{tool_budget} available=#{available_for_messages}"
35
38
 
36
39
  preserve_after = last_user_message_index(messages)
37
40
  recent = messages[preserve_after..]
38
41
  older = messages[0...preserve_after]
39
42
 
40
- target_tokens = threshold - estimate_message_tokens(recent)
43
+ target_tokens = available_for_messages - estimate_message_tokens(recent)
41
44
  compacted = compact_to_fit(older, target_tokens)
42
45
 
43
46
  result = compacted + recent
@@ -94,6 +97,17 @@ module Legion
94
97
  messages.sum { |m| ((m[:content] || m['content']).to_s.length / 4.0).ceil }
95
98
  end
96
99
 
100
+ def estimate_tool_token_budget
101
+ tools = @request.tools
102
+ return 0 if tools.nil? || tools.empty?
103
+
104
+ tool_list = tools.is_a?(Hash) ? tools.values : Array(tools)
105
+ tool_list.sum do |tool|
106
+ json_repr = tool.respond_to?(:to_h) ? Legion::JSON.dump(tool.to_h) : tool.to_s
107
+ (json_repr.length / 3.5).ceil
108
+ end
109
+ end
110
+
97
111
  def strip_thinking_from_history(messages)
98
112
  before_tokens = ContextAccounting.estimate_message_tokens(messages)
99
113
  preserve_after = last_user_message_index(messages)
@@ -158,7 +172,9 @@ module Legion
158
172
  next msg unless content.is_a?(String) && content.length > max_chars
159
173
 
160
174
  trimmed_count += 1
161
- msg.merge(content: "#{content[0, max_chars]}\n[truncated #{content.length} chars total]")
175
+ msg.merge(content: "#{content[0, max_chars]}\n\n[TRUNCATED: showing first #{max_chars} of #{content.length} chars. " \
176
+ 'If you need more content, make multiple smaller targeted requests ' \
177
+ '(e.g. read specific line ranges, grep for specific patterns, or request smaller sections).]')
162
178
  end
163
179
 
164
180
  if trimmed_count.positive?
@@ -419,6 +419,22 @@ module Legion
419
419
  err.is_a?(::NoMethodError) || err.is_a?(::ArgumentError)
420
420
  end
421
421
 
422
+ def larger_context_lane_available?(lane:, payload:, **)
423
+ current_context = lane.dig(:limits, :context_window).to_i
424
+ return false unless current_context.positive?
425
+
426
+ Legion::LLM::Inventory.lanes.any? do |candidate|
427
+ next false if payload[:tried_lanes].include?(candidate[:id])
428
+ next false if candidate[:lane_weight].to_i <= 0
429
+
430
+ candidate_context = candidate.dig(:limits, :context_window).to_i
431
+ candidate_context > current_context
432
+ end
433
+ rescue StandardError => e
434
+ handle_exception(e, level: :warn, operation: 'llm.pipeline.larger_context_lane_available')
435
+ false
436
+ end
437
+
422
438
  # Classify error for the while remaining.positive? loop (G26 / D-C).
423
439
  # internal_error MUST be checked BEFORE account_specific (G25 / B-H):
424
440
  # a daemon NoMethodError must never be treated as a retriable account-scoped failure.
@@ -439,7 +455,12 @@ module Legion
439
455
  # Terminal errors re-raise immediately so the caller's while loop stops.
440
456
  def classify_and_accumulate_exclusions(error:, lane:, payload:, **)
441
457
  case classify_error(error: error)
442
- when :internal_error, :context_overflow, :payload_error, :policy_denied
458
+ when :context_overflow
459
+ raise error unless larger_context_lane_available?(lane: lane, payload: payload)
460
+
461
+ payload[:tried_lanes] << lane[:id]
462
+
463
+ when :internal_error, :payload_error, :policy_denied
443
464
  raise error
444
465
  when :account_specific
445
466
  # Account/instance-scoped failure: trip the per-instance circuit.
@@ -30,7 +30,7 @@ module Legion
30
30
  max_tool_rounds: 200,
31
31
  max_tool_calls_per_turn: 100,
32
32
  tool_error_log_chars: 500,
33
- tool_result_max_dispatch_chars: 10_000,
33
+ tool_result_max_dispatch_chars: 5_000,
34
34
  default_model: model_override,
35
35
  default_temperature: 0.9,
36
36
  default_provider: nil,
@@ -218,7 +218,7 @@ module Legion
218
218
  # Multiplicative tier weights for lane_weight computation (P1 SSOT RANKING v2).
219
219
  # Defaults favor higher-capability tiers (see values below). Operators can override per-tier to bias routing.
220
220
  # Used by Inventory.write_lane to compute lane_weight = tier_w * provider_w * instance_w * model_w * health_mult.
221
- tier_weights: { direct: 100, local: 100, fleet: 100, cloud: 100, frontier: 100 },
221
+ tier_weights: { direct: 105, local: 110, fleet: 110, cloud: 120, frontier: 150 },
222
222
  max_attempts: 3,
223
223
  # Body-level routing hints are gated by this flag. Auto-routing aliases
224
224
  # like legionio/auto are still accepted as "you pick" intent.
@@ -399,19 +399,20 @@ module Legion
399
399
 
400
400
  def self.context_curation_defaults
401
401
  {
402
- enabled: true,
403
- mode: 'heuristic',
404
- llm_assisted: false,
405
- llm_model: nil,
406
- tool_result_max_chars: 10_000,
407
- thinking_eviction: true,
408
- exchange_folding: true,
409
- superseded_eviction: true,
410
- dedup_enabled: true,
411
- dedup_threshold: 0.85,
412
- target_context_tokens: 60_000,
413
- archive_dropped_turns: true,
414
- archive_preserve_recent: 10
402
+ enabled: true,
403
+ mode: 'heuristic',
404
+ llm_assisted: false,
405
+ llm_model: nil,
406
+ tool_result_max_chars: 2_000,
407
+ thinking_eviction: true,
408
+ exchange_folding: true,
409
+ superseded_eviction: true,
410
+ dedup_enabled: true,
411
+ dedup_threshold: 0.85,
412
+ target_context_tokens: 60_000,
413
+ context_window_threshold: 0.90,
414
+ archive_dropped_turns: true,
415
+ archive_preserve_recent: 10
415
416
  }
416
417
  end
417
418
 
@@ -482,10 +483,7 @@ module Legion
482
483
  end
483
484
 
484
485
  def self.debug_formats_default_enabled
485
- return true if defined?(Legion::Mode) && Legion::Mode.respond_to?(:lite?) && Legion::Mode.lite?
486
-
487
- env = (ENV.fetch('LEGION_ENV', nil) || ENV.fetch('RACK_ENV', nil)).to_s.downcase
488
- %w[development dev test].include?(env)
486
+ true
489
487
  end
490
488
 
491
489
  def self.streaming_defaults
@@ -2,6 +2,6 @@
2
2
 
3
3
  module Legion
4
4
  module LLM
5
- VERSION = '0.14.2'
5
+ VERSION = '0.14.3'
6
6
  end
7
7
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: legion-llm
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.14.2
4
+ version: 0.14.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Esity