legion-llm 0.14.2 → 0.14.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +18 -0
- data/lib/legion/llm/api/shared_helpers.rb +23 -0
- data/lib/legion/llm/inference/executor/context_window.rb +21 -5
- data/lib/legion/llm/inference/executor/escalation.rb +22 -1
- data/lib/legion/llm/settings.rb +17 -19
- data/lib/legion/llm/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: dac2aadf427ccf4c6e902242ed392c3f9f0f5a3e0ddaeeb6d6c262e3926c87b1
|
|
4
|
+
data.tar.gz: 48ccd139e47330107147d7745ec7e34c7a1de879cc8939d2d1fd33322d04b336
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 27c3269042cf0b060f0e402fad967f048319a2a98a425d4b68f39569d07e41842505bf947f32ae62a52125a0c81143e4d1fcf076bee53e247fd0cfe76a9e7f6a
|
|
7
|
+
data.tar.gz: 1fcd99c1a3007b858bc033f57a1cd3ada1fd96e2729d597fd65f8ac8eeb406295f1b73a2d4bcea31793ca6669b0bba545c32ac86474e2c9f1a5adcc60f9550f2
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,23 @@
|
|
|
1
1
|
# Legion LLM Changelog
|
|
2
2
|
|
|
3
|
+
## [0.14.3] - 2026-06-22
|
|
4
|
+
|
|
5
|
+
### Fixed
|
|
6
|
+
|
|
7
|
+
- Context window enforcement now accounts for tool definition tokens when deciding whether to compact messages. Previously, 235 injected tools (~50k tokens) were invisible to the threshold check, causing context overflow on dispatch.
|
|
8
|
+
- Context overflow errors are no longer unconditionally terminal during escalation. When a lane with a larger context window exists, the executor retries on that lane instead of raising immediately.
|
|
9
|
+
- `X-Legion-Format: canonical` debug surface is now enabled by default (previously gated to dev/lite mode only). The canonical format is a first-class output format, not a debug feature.
|
|
10
|
+
- Completion log now includes `context_tokens_saved=N`, the total estimated tokens saved per request by curation, archival, thinking-strip, and context window compaction. The field is omitted when nothing was saved.
|
|
11
|
+
- Embedding dispatch now sends chunks individually instead of as an array, fixing intermittent `ContextOverflow` on RAG embedding queries with ollama.
|
|
12
|
+
- Truncated tool results now end with a notice telling the model to make smaller, targeted requests, instead of the content being silently chopped.
|
|
13
|
+
- `context_window_threshold` is no longer hardcoded to 0.90; it is now read from the `context_curation` settings (default 0.90, defined in `context_curation_defaults`).
|
|
14
|
+
|
|
15
|
+
### Changed
|
|
16
|
+
|
|
17
|
+
- `tool_result_max_dispatch_chars` raised from 2k to 5k (dispatch-time truncation of oversized tool results before provider call).
|
|
18
|
+
- `context_curation.tool_result_max_chars` lowered from 10k to 2k (curator distills tool results earlier, reducing context growth).
|
|
19
|
+
- Default `tier_weights` rebalanced: direct=105, local=110, fleet=110, cloud=120, frontier=150 (prefer higher-capability tiers when lane_weight tie-breaks).
|
|
20
|
+
|
|
3
21
|
## [0.14.2] - 2026-06-20
|
|
4
22
|
|
|
5
23
|
### Fixed
|
|
@@ -179,6 +179,8 @@ module Legion
|
|
|
179
179
|
provider_latency_ms = api_provider_latency_ms(pipeline_response, routing)
|
|
180
180
|
conversation_id = pipeline_response.respond_to?(:conversation_id) ? pipeline_response.conversation_id : nil
|
|
181
181
|
|
|
182
|
+
context_stats = api_context_stats(pipeline_response)
|
|
183
|
+
|
|
182
184
|
parts = {
|
|
183
185
|
request_id: request_id || 'unknown',
|
|
184
186
|
conversation_id: conversation_id || 'none',
|
|
@@ -199,6 +201,7 @@ module Legion
|
|
|
199
201
|
tool_executions: api_tool_execution_count(pipeline_response),
|
|
200
202
|
stop_reason: resolved_stop_reason
|
|
201
203
|
}
|
|
204
|
+
parts.merge!(context_stats) if context_stats
|
|
202
205
|
|
|
203
206
|
log.info("[llm][api][#{namespace}] action=completed " \
|
|
204
207
|
"#{parts.map { |key, value| "#{key}=#{value}" }.join(' ')}")
|
|
@@ -228,6 +231,26 @@ module Legion
|
|
|
228
231
|
nil
|
|
229
232
|
end
|
|
230
233
|
|
|
234
|
+
def api_context_stats(pipeline_response)
|
|
235
|
+
return nil unless pipeline_response.respond_to?(:audit)
|
|
236
|
+
|
|
237
|
+
accounting = pipeline_response.audit&.dig(:context_accounting)
|
|
238
|
+
return nil unless accounting.is_a?(Hash)
|
|
239
|
+
|
|
240
|
+
tokens = accounting[:tokens] || {}
|
|
241
|
+
curated = tokens[:curation_saved_estimated_tokens].to_i
|
|
242
|
+
archived = tokens[:archive_saved_estimated_tokens].to_i
|
|
243
|
+
stripped = tokens[:stripped_thinking_estimated_tokens].to_i
|
|
244
|
+
compacted = tokens[:context_window_saved_estimated_tokens].to_i
|
|
245
|
+
total_saved = curated + archived + stripped + compacted
|
|
246
|
+
return nil unless total_saved.positive?
|
|
247
|
+
|
|
248
|
+
{ context_tokens_saved: total_saved }
|
|
249
|
+
rescue StandardError => e
|
|
250
|
+
handle_exception(e, level: :warn, handled: true, operation: 'llm.api.context_stats')
|
|
251
|
+
nil
|
|
252
|
+
end
|
|
253
|
+
|
|
231
254
|
def api_hash_value(hash, key)
|
|
232
255
|
return nil unless hash.respond_to?(:[])
|
|
233
256
|
|
|
@@ -26,18 +26,21 @@ module Legion
|
|
|
26
26
|
@context_accounting[:component_status][:context_window] = :observed
|
|
27
27
|
return messages unless context_window&.positive?
|
|
28
28
|
|
|
29
|
-
threshold = (context_window *
|
|
29
|
+
threshold = (context_window * Legion::Settings[:llm][:context_curation][:context_window_threshold]).to_i
|
|
30
|
+
tool_budget = estimate_tool_token_budget
|
|
31
|
+
available_for_messages = threshold - tool_budget
|
|
30
32
|
estimated = estimate_message_tokens(messages)
|
|
31
|
-
return messages if estimated <=
|
|
33
|
+
return messages if estimated <= available_for_messages
|
|
32
34
|
|
|
33
35
|
log.warn "[llm][executor] action=context_compaction request_id=#{@request.id} " \
|
|
34
|
-
"estimated_tokens=#{estimated} context_window=#{context_window}
|
|
36
|
+
"estimated_tokens=#{estimated} context_window=#{context_window} " \
|
|
37
|
+
"threshold=#{threshold} tool_budget=#{tool_budget} available=#{available_for_messages}"
|
|
35
38
|
|
|
36
39
|
preserve_after = last_user_message_index(messages)
|
|
37
40
|
recent = messages[preserve_after..]
|
|
38
41
|
older = messages[0...preserve_after]
|
|
39
42
|
|
|
40
|
-
target_tokens =
|
|
43
|
+
target_tokens = available_for_messages - estimate_message_tokens(recent)
|
|
41
44
|
compacted = compact_to_fit(older, target_tokens)
|
|
42
45
|
|
|
43
46
|
result = compacted + recent
|
|
@@ -94,6 +97,17 @@ module Legion
|
|
|
94
97
|
messages.sum { |m| ((m[:content] || m['content']).to_s.length / 4.0).ceil }
|
|
95
98
|
end
|
|
96
99
|
|
|
100
|
+
def estimate_tool_token_budget
|
|
101
|
+
tools = @request.tools
|
|
102
|
+
return 0 if tools.nil? || tools.empty?
|
|
103
|
+
|
|
104
|
+
tool_list = tools.is_a?(Hash) ? tools.values : Array(tools)
|
|
105
|
+
tool_list.sum do |tool|
|
|
106
|
+
json_repr = tool.respond_to?(:to_h) ? Legion::JSON.dump(tool.to_h) : tool.to_s
|
|
107
|
+
(json_repr.length / 3.5).ceil
|
|
108
|
+
end
|
|
109
|
+
end
|
|
110
|
+
|
|
97
111
|
def strip_thinking_from_history(messages)
|
|
98
112
|
before_tokens = ContextAccounting.estimate_message_tokens(messages)
|
|
99
113
|
preserve_after = last_user_message_index(messages)
|
|
@@ -158,7 +172,9 @@ module Legion
|
|
|
158
172
|
next msg unless content.is_a?(String) && content.length > max_chars
|
|
159
173
|
|
|
160
174
|
trimmed_count += 1
|
|
161
|
-
msg.merge(content: "#{content[0, max_chars]}\n[
|
|
175
|
+
msg.merge(content: "#{content[0, max_chars]}\n\n[TRUNCATED: showing first #{max_chars} of #{content.length} chars. " \
|
|
176
|
+
'If you need more content, make multiple smaller targeted requests ' \
|
|
177
|
+
'(e.g. read specific line ranges, grep for specific patterns, or request smaller sections).]')
|
|
162
178
|
end
|
|
163
179
|
|
|
164
180
|
if trimmed_count.positive?
|
|
@@ -419,6 +419,22 @@ module Legion
|
|
|
419
419
|
err.is_a?(::NoMethodError) || err.is_a?(::ArgumentError)
|
|
420
420
|
end
|
|
421
421
|
|
|
422
|
+
def larger_context_lane_available?(lane:, payload:, **)
|
|
423
|
+
current_context = lane.dig(:limits, :context_window).to_i
|
|
424
|
+
return false unless current_context.positive?
|
|
425
|
+
|
|
426
|
+
Legion::LLM::Inventory.lanes.any? do |candidate|
|
|
427
|
+
next false if payload[:tried_lanes].include?(candidate[:id])
|
|
428
|
+
next false if candidate[:lane_weight].to_i <= 0
|
|
429
|
+
|
|
430
|
+
candidate_context = candidate.dig(:limits, :context_window).to_i
|
|
431
|
+
candidate_context > current_context
|
|
432
|
+
end
|
|
433
|
+
rescue StandardError => e
|
|
434
|
+
handle_exception(e, level: :warn, operation: 'llm.pipeline.larger_context_lane_available')
|
|
435
|
+
false
|
|
436
|
+
end
|
|
437
|
+
|
|
422
438
|
# Classify error for the while remaining.positive? loop (G26 / D-C).
|
|
423
439
|
# internal_error MUST be checked BEFORE account_specific (G25 / B-H):
|
|
424
440
|
# a daemon NoMethodError must never be treated as a retriable account-scoped failure.
|
|
@@ -439,7 +455,12 @@ module Legion
|
|
|
439
455
|
# Terminal errors re-raise immediately so the caller's while loop stops.
|
|
440
456
|
def classify_and_accumulate_exclusions(error:, lane:, payload:, **)
|
|
441
457
|
case classify_error(error: error)
|
|
442
|
-
when :
|
|
458
|
+
when :context_overflow
|
|
459
|
+
raise error unless larger_context_lane_available?(lane: lane, payload: payload)
|
|
460
|
+
|
|
461
|
+
payload[:tried_lanes] << lane[:id]
|
|
462
|
+
|
|
463
|
+
when :internal_error, :payload_error, :policy_denied
|
|
443
464
|
raise error
|
|
444
465
|
when :account_specific
|
|
445
466
|
# Account/instance-scoped failure: trip the per-instance circuit.
|
data/lib/legion/llm/settings.rb
CHANGED
|
@@ -30,7 +30,7 @@ module Legion
|
|
|
30
30
|
max_tool_rounds: 200,
|
|
31
31
|
max_tool_calls_per_turn: 100,
|
|
32
32
|
tool_error_log_chars: 500,
|
|
33
|
-
tool_result_max_dispatch_chars:
|
|
33
|
+
tool_result_max_dispatch_chars: 5_000,
|
|
34
34
|
default_model: model_override,
|
|
35
35
|
default_temperature: 0.9,
|
|
36
36
|
default_provider: nil,
|
|
@@ -218,7 +218,7 @@ module Legion
|
|
|
218
218
|
# Multiplicative tier weights for lane_weight computation (P1 SSOT RANKING v2).
|
|
219
219
|
# Defaults are graded by tier (direct 105, local/fleet 110, cloud 120, frontier 150). Operators can override per-tier to bias routing.
|
|
220
220
|
# Used by Inventory.write_lane to compute lane_weight = tier_w * provider_w * instance_w * model_w * health_mult.
|
|
221
|
-
tier_weights: { direct:
|
|
221
|
+
tier_weights: { direct: 105, local: 110, fleet: 110, cloud: 120, frontier: 150 },
|
|
222
222
|
max_attempts: 3,
|
|
223
223
|
# Body-level routing hints are gated by this flag. Auto-routing aliases
|
|
224
224
|
# like legionio/auto are still accepted as "you pick" intent.
|
|
@@ -399,19 +399,20 @@ module Legion
|
|
|
399
399
|
|
|
400
400
|
def self.context_curation_defaults
|
|
401
401
|
{
|
|
402
|
-
enabled:
|
|
403
|
-
mode:
|
|
404
|
-
llm_assisted:
|
|
405
|
-
llm_model:
|
|
406
|
-
tool_result_max_chars:
|
|
407
|
-
thinking_eviction:
|
|
408
|
-
exchange_folding:
|
|
409
|
-
superseded_eviction:
|
|
410
|
-
dedup_enabled:
|
|
411
|
-
dedup_threshold:
|
|
412
|
-
target_context_tokens:
|
|
413
|
-
|
|
414
|
-
|
|
402
|
+
enabled: true,
|
|
403
|
+
mode: 'heuristic',
|
|
404
|
+
llm_assisted: false,
|
|
405
|
+
llm_model: nil,
|
|
406
|
+
tool_result_max_chars: 2_000,
|
|
407
|
+
thinking_eviction: true,
|
|
408
|
+
exchange_folding: true,
|
|
409
|
+
superseded_eviction: true,
|
|
410
|
+
dedup_enabled: true,
|
|
411
|
+
dedup_threshold: 0.85,
|
|
412
|
+
target_context_tokens: 60_000,
|
|
413
|
+
context_window_threshold: 0.90,
|
|
414
|
+
archive_dropped_turns: true,
|
|
415
|
+
archive_preserve_recent: 10
|
|
415
416
|
}
|
|
416
417
|
end
|
|
417
418
|
|
|
@@ -482,10 +483,7 @@ module Legion
|
|
|
482
483
|
end
|
|
483
484
|
|
|
484
485
|
def self.debug_formats_default_enabled
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
env = (ENV.fetch('LEGION_ENV', nil) || ENV.fetch('RACK_ENV', nil)).to_s.downcase
|
|
488
|
-
%w[development dev test].include?(env)
|
|
486
|
+
true
|
|
489
487
|
end
|
|
490
488
|
|
|
491
489
|
def self.streaming_defaults
|
data/lib/legion/llm/version.rb
CHANGED