legion-llm 0.10.2 → 0.11.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +120 -0
  3. data/Gemfile +3 -1
  4. data/lib/legion/llm/api/auth.rb +2 -2
  5. data/lib/legion/llm/api/namespaces/anthropic/files.rb +2 -1
  6. data/lib/legion/llm/api/namespaces/anthropic/messages.rb +17 -4
  7. data/lib/legion/llm/api/namespaces/helpers.rb +2 -1
  8. data/lib/legion/llm/api/namespaces/native/chat.rb +9 -1
  9. data/lib/legion/llm/api/namespaces/openai/audio/speech.rb +4 -2
  10. data/lib/legion/llm/api/namespaces/openai/audio/transcriptions.rb +2 -1
  11. data/lib/legion/llm/api/namespaces/openai/audio/translations.rb +2 -1
  12. data/lib/legion/llm/api/namespaces/openai/batches.rb +4 -3
  13. data/lib/legion/llm/api/namespaces/openai/chat/completions.rb +2 -3
  14. data/lib/legion/llm/api/namespaces/openai/completions.rb +1 -1
  15. data/lib/legion/llm/api/namespaces/openai/embeddings.rb +12 -2
  16. data/lib/legion/llm/api/namespaces/openai/files.rb +4 -2
  17. data/lib/legion/llm/api/namespaces/openai/images.rb +5 -3
  18. data/lib/legion/llm/api/namespaces/openai/models.rb +13 -5
  19. data/lib/legion/llm/api/namespaces/openai/responses.rb +1 -1
  20. data/lib/legion/llm/api/native/helpers.rb +17 -4
  21. data/lib/legion/llm/api/native/models.rb +3 -1
  22. data/lib/legion/llm/api/native/tiers.rb +2 -2
  23. data/lib/legion/llm/api/openai/chat_completions.rb +1 -1
  24. data/lib/legion/llm/api/openai/embeddings.rb +1 -1
  25. data/lib/legion/llm/api/openai/responses.rb +1 -1
  26. data/lib/legion/llm/api/shared_helpers.rb +2 -1
  27. data/lib/legion/llm/api/translators/anthropic_request.rb +17 -6
  28. data/lib/legion/llm/api/translators/anthropic_response.rb +7 -5
  29. data/lib/legion/llm/api/translators/openai_request.rb +20 -9
  30. data/lib/legion/llm/api/translators/openai_response.rb +10 -3
  31. data/lib/legion/llm/api.rb +1 -1
  32. data/lib/legion/llm/audit.rb +2 -2
  33. data/lib/legion/llm/cache/response.rb +3 -3
  34. data/lib/legion/llm/cache.rb +2 -2
  35. data/lib/legion/llm/call/daemon_client.rb +5 -7
  36. data/lib/legion/llm/call/embeddings.rb +25 -13
  37. data/lib/legion/llm/call/lex_llm_adapter.rb +24 -4
  38. data/lib/legion/llm/call/providers.rb +4 -4
  39. data/lib/legion/llm/call/structured_output.rb +3 -3
  40. data/lib/legion/llm/config.rb +7 -7
  41. data/lib/legion/llm/context/compressor.rb +17 -5
  42. data/lib/legion/llm/context/curator.rb +56 -41
  43. data/lib/legion/llm/discovery/memory_gate.rb +2 -2
  44. data/lib/legion/llm/discovery/rule_generator.rb +3 -3
  45. data/lib/legion/llm/discovery/system.rb +1 -1
  46. data/lib/legion/llm/discovery.rb +151 -83
  47. data/lib/legion/llm/fleet/dispatcher.rb +14 -20
  48. data/lib/legion/llm/fleet/handler.rb +7 -6
  49. data/lib/legion/llm/fleet/reply_dispatcher.rb +4 -3
  50. data/lib/legion/llm/fleet/token_issuer.rb +2 -6
  51. data/lib/legion/llm/helper.rb +3 -3
  52. data/lib/legion/llm/hooks/budget_guard.rb +1 -5
  53. data/lib/legion/llm/hooks/rag_guard.rb +2 -2
  54. data/lib/legion/llm/hooks/reflection.rb +2 -5
  55. data/lib/legion/llm/inference/audit_publisher.rb +40 -14
  56. data/lib/legion/llm/inference/conversation.rb +3 -3
  57. data/lib/legion/llm/inference/enrichment_injector.rb +2 -4
  58. data/lib/legion/llm/inference/executor.rb +354 -106
  59. data/lib/legion/llm/inference/native_tool_loop.rb +61 -12
  60. data/lib/legion/llm/inference/prompt.rb +2 -9
  61. data/lib/legion/llm/inference/request.rb +1 -4
  62. data/lib/legion/llm/inference/route_attempts.rb +5 -5
  63. data/lib/legion/llm/inference/steps/billing.rb +1 -1
  64. data/lib/legion/llm/inference/steps/classification.rb +9 -5
  65. data/lib/legion/llm/inference/steps/confidence_scoring.rb +10 -0
  66. data/lib/legion/llm/inference/steps/debate.rb +23 -16
  67. data/lib/legion/llm/inference/steps/gaia_advisory.rb +3 -1
  68. data/lib/legion/llm/inference/steps/knowledge_capture.rb +9 -3
  69. data/lib/legion/llm/inference/steps/logging.rb +2 -1
  70. data/lib/legion/llm/inference/steps/mcp_discovery.rb +1 -0
  71. data/lib/legion/llm/inference/steps/metering.rb +6 -1
  72. data/lib/legion/llm/inference/steps/post_response.rb +6 -1
  73. data/lib/legion/llm/inference/steps/prompt_cache.rb +4 -5
  74. data/lib/legion/llm/inference/steps/rag_context.rb +27 -22
  75. data/lib/legion/llm/inference/steps/rag_guard.rb +2 -2
  76. data/lib/legion/llm/inference/steps/rbac.rb +1 -1
  77. data/lib/legion/llm/inference/steps/skill_injector.rb +5 -6
  78. data/lib/legion/llm/inference/steps/sticky_helpers.rb +4 -5
  79. data/lib/legion/llm/inference/steps/tier_assigner.rb +7 -1
  80. data/lib/legion/llm/inference/steps/token_budget.rb +4 -1
  81. data/lib/legion/llm/inference/steps/tool_calls.rb +60 -21
  82. data/lib/legion/llm/inference/steps/tool_discovery.rb +4 -1
  83. data/lib/legion/llm/inference/steps/trigger_match.rb +7 -6
  84. data/lib/legion/llm/inference.rb +97 -43
  85. data/lib/legion/llm/inventory.rb +1 -1
  86. data/lib/legion/llm/metering/tokens.rb +11 -3
  87. data/lib/legion/llm/metering/tracker.rb +3 -3
  88. data/lib/legion/llm/metering.rb +117 -12
  89. data/lib/legion/llm/publisher_identity.rb +2 -1
  90. data/lib/legion/llm/quality/checker.rb +35 -8
  91. data/lib/legion/llm/quality/confidence/scorer.rb +31 -17
  92. data/lib/legion/llm/quality/shadow_eval.rb +2 -1
  93. data/lib/legion/llm/router/arbitrage.rb +3 -2
  94. data/lib/legion/llm/router/escalation/chain.rb +5 -2
  95. data/lib/legion/llm/router/health_tracker.rb +12 -27
  96. data/lib/legion/llm/router.rb +36 -63
  97. data/lib/legion/llm/scheduling/batch.rb +1 -1
  98. data/lib/legion/llm/scheduling.rb +5 -13
  99. data/lib/legion/llm/settings.rb +80 -179
  100. data/lib/legion/llm/skills/external_discovery.rb +2 -2
  101. data/lib/legion/llm/skills.rb +1 -4
  102. data/lib/legion/llm/tools/dispatcher.rb +16 -4
  103. data/lib/legion/llm/tools/interceptor.rb +10 -0
  104. data/lib/legion/llm/transport/messages/metering_event.rb +6 -2
  105. data/lib/legion/llm/transport/messages/prompt_event.rb +1 -1
  106. data/lib/legion/llm/transport/messages/skill_event.rb +1 -1
  107. data/lib/legion/llm/transport/messages/tool_event.rb +1 -1
  108. data/lib/legion/llm/types/tool_call.rb +43 -25
  109. data/lib/legion/llm/vector_store/storage.rb +2 -2
  110. data/lib/legion/llm/version.rb +1 -1
  111. data/lib/legion/llm.rb +6 -6
  112. metadata +1 -1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f5c1d01be312fb6e070b3e75236a8ecd6c6841653e42eae391cb5b2519a668b6
4
- data.tar.gz: f6703f5c706dd134e85993c11da4222fc68edced3ca88e09a841f04ca2fb2474
3
+ metadata.gz: 555bff51c05efea04f283dc4a0c005703fef2344d57b9692fdd70d6ab95d9646
4
+ data.tar.gz: 454c9cd0be750aec0d597c9e343a35fc8939c22a67af8d048d3887214c225cba
5
5
  SHA512:
6
- metadata.gz: 06aef6de56965e58876fa98bfcdf8b411996d00ea3faff44f9aa0f9035d385d9da3591300ac2663a31a2820cf56610572fff3b801ec417af20f20fd377ad0e8a
7
- data.tar.gz: 77b720ac764fbb899dbbea7d372edfb7a6a8c2671d2b8eef4f92680439d4572e9e8986519c68d4452232ede7c7c87ae74b3b38fbb6383bd9e3dcc1aa0f30670a
6
+ metadata.gz: c153ef24c678502b0bd9249c6ff8d39070e07b8e06eb950686b552bfdf8bcc5d03b060c333dff2c06418cfc4a1046c3a779274756fb784e821166b3605ee430d
7
+ data.tar.gz: b1ab367c71a7098292fecbcf2c67758de0734ecd7e70cb108d8ee08abc9fd3917e01adefe2a7bc2d0472a0d731e86b0bf73a9a2a7f566619502322ff0eb9d4ec
data/CHANGELOG.md CHANGED
@@ -1,5 +1,125 @@
1
1
  # Legion LLM Changelog
2
2
 
3
+ ## [0.11.2] - 2026-06-02
4
+
5
+ ### Removed
6
+ - **Legion::LLM::Settings abstraction layer** — Removed `value`, `config_value`, `global_value`, `set_value`, `transport_connected?`, `enterprise_privacy?`, `current_settings`, and `namespace` methods. All settings reads now go directly through `Legion::Settings[:llm]` or `Legion::Settings.dig(:llm, ...)`. The module retains only `default`, `register_defaults!`, `validate!`, and the `*_defaults` class methods.
7
+ - **Passthrough wrapper methods** — Removed `routing_settings`, `discovery_settings`, `default_settings_model`, `default_settings_provider`, `llm_settings` and similar indirection methods from Router, Discovery, and Inference modules.
8
+ - **String-key dual-lookup** — `config_value` previously tried both symbol and string keys on any hash. All settings are now symbol-keyed; string-key fallback is gone.
9
+ - **Deprecated "embeddings" (plural) settings key** — Use `embedding` (singular) only.
10
+
11
+ ### Added
12
+ - Default values for `compliance.encrypt_metering`, `compliance.audit_max_messages`, `budget.session_usd`, `rag_guard.evaluators`, `discovery.memory_overhead_factor`, and `structured_output` settings so all accessed keys have registered defaults.
13
+
14
+ ## [0.12.1] - 2026-06-02
15
+
16
+ ### Fixed
17
+ - **Provider-scoped discovery refresh** — `Discovery.refresh_discovered_models!` now accepts an optional `provider:` keyword argument. When filtered, only refreshes that provider's models and merges with the existing cache instead of re-querying all providers (discovery.rb)
18
+
19
+ ## [0.12.0] - 2026-06-01
20
+
21
+ ### Fixed
22
+ - **cacheable? treated nil temperature as zero** — Added `default_temperature` setting (default 1.0). Requests without an explicit temperature now resolve against the default instead of being treated as `0.0`, preventing non-deterministic responses from being served from cache (inference.rb, settings.rb)
23
+ - **ReDoS in infer_tool_name** — Possessive quantifier `\d++` replaced with `\d+` in search detection regex (context/curator.rb)
24
+ - **ReDoS in strip_thinking regex** — `[^#\n][^\n]*` double-character-class pattern created exponential backtracking on long non-heading lines (10k-char lines). Replaced with anchored negative-lookahead variant that matches only lines starting with `#+ Thinking` headings. Benchmarked: pathological input drops from potential timeout to <5ms (context/curator.rb)
25
+ - **Shell injection in dispatch_client_tool** — Added audit logging for all shell commands executed via native client tools (api/native/helpers.rb)
26
+ - **Path traversal in file operations** — Added `validate_client_tool_path` that constrains file_read/file_write/file_edit to working directory, rejecting paths that escape via `..` (api/native/helpers.rb)
27
+ - **Text block concatenation loss** — Anthropic translator now joins assistant text parts with a `\n\n` separator instead of an empty string (api/translators/anthropic_request.rb)
28
+ - **Raw part leak in responses_content_part** — LexLLMAdapter no longer returns unnormalized/unsanitized parts; unknown types are converted to `input_text` with serialized content (call/lex_llm_adapter.rb)
29
+ - **Metering failures silently dropped** — step_metering now attempts `Metering.spool_event` on publish failure so billing events are spooled to disk instead of lost (inference/executor.rb)
30
+ - **Error category extraction always nil** — `extract_error_category_from_attempt` now handles string failures and hash `:error` keys in addition to `:category` (inference.rb)
31
+ - **provider_scoped_instance false negatives** — Now checks `Registry.instances_for` before returning nil, only drops instance when provider has other registered instances (inference/executor.rb)
32
+ - **build_fallback_resolutions double-exclusion** — Merged two separate exclusion checks into single predicate; `exclude_instance: nil` now only excludes the specified provider+instance combo (inference/executor.rb)
33
+ - **find_fallback_provider hardcoded local exclusion** — ollama/vllm fallback exclusion is now configurable via `fallback.allow_local` setting instead of being permanently blocked (inference/executor.rb)
34
+ - **extract_content double transform_keys** — Normalizes block keys once up front instead of calling transform_keys 2-3 times per block (api/translators/openai_request.rb)
35
+ - **@pending_tool_history data race** — Tool history mutations in step_tool_calls now wrapped in `@pending_tool_history_mutex.synchronize` to match executor's async event emission (inference/steps/tool_calls.rb)
36
+ - **Client passthrough tool events never emitted** — `client_passthrough_tool_loop_result` now emits both `emit_tool_call_event` and `emit_tool_result_event` for passthrough tools so they appear in `@pending_tool_history`, fire `@tool_event_handler` callbacks, and generate tool audit events (inference/steps/tool_calls.rb)
37
+ - **Thinking tag pattern divergence** — Executor's `strip_thinking_from_history` only handled `<thinking>`/`<think(?:ing)?>` (Anthropic/long form) but not short `<think>` (DeepSeek, Qwen, Ollama, vLLM) or `<thought>` (various models). Added `THINKING_TAG_PATTERN_SHORT` and `THINKING_TAG_PATTERN_THOUGHT` constants, applied all three gsubs so all thinking block variants are stripped before dispatch on every turn (inference/executor.rb)
38
+ - **Thinking tag stripping corrupted passthrough content** — Unanchored regex in `strip_thinking_from_history` and `strip_thinking_tags` treated backtick-quoted or mid-content `<think>`/`</think>` references as real thinking blocks, deleting content between them. Replaced regex with string-based `start_with?`/`index` approach that only strips tags at the beginning of a message where providers actually emit them (inference/executor.rb, context/curator.rb)
39
+ - **ToolResultEvent unresolved constant** — `client_passthrough_tool_loop_result` referenced `ToolResultEvent` without namespace; Ruby's lexical constant lookup failed in the included module. Qualified as `Executor::ToolResultEvent` (inference/steps/tool_calls.rb)
40
+ - **client_tool_methods_spec sandbox failure** — Tests created temp files in `/tmp` which `validate_client_tool_path` correctly rejects. Moved to project-relative `tmp/` directory (spec/api/native/client_tool_methods_spec.rb)
41
+
42
+ ### Changed
43
+ - **Removed `llm_setting` abstraction** — All `llm_setting(:key)` calls replaced with direct `Legion::Settings[:llm][:key]` access. The indirection obscured that settings must flow through `Legion::Settings` to pick up dynamic user overrides. Affected: inference/executor.rb, inference.rb, inference/native_tool_loop.rb, inference/prompt.rb
44
+ - **`fallback.allow_local` defaults to true** — Local providers (ollama/vllm) are now allowed as fallback targets by default instead of being permanently excluded (settings.rb)
45
+ - **Text join separator** — OpenAI translator `extract_content` text blocks now join with `\n\n` instead of an empty string, for consistency
46
+
47
+ ## [0.11.1] - 2026-06-01
48
+
49
+ ### Fixed
50
+ - **Embedding instance selection honored** — Discovery honors a configured `embedding.instance` pin over a higher-tier-ranked empty instance (and skips empty candidates whose resolved model is absent), and `Embeddings.generate`/`generate_batch` resolve that configured instance on the dispatch path instead of falling back to the provider default
51
+
52
+ ## [0.11.0] - 2026-05-31
53
+
54
+ ### Added
55
+ - **Comprehensive diagnostic logging** — 28 files across executor, pipeline steps, tool loop, context, router, quality checker with structured `[llm][component] action=verb key=value` format at appropriate severity levels (debug/info/warn)
56
+ - **Context window enforcement** — Pre-dispatch compaction triggers at 90% of model's context window, preserving recent turns and aggressively compacting older history
57
+ - **Tool result trimming** — Oversized tool results from prior turns trimmed to 4000 chars before dispatch (current turn preserved in full)
58
+ - **Thinking block stripping** — Historical `<think>` blocks removed from prior assistant turns before dispatch
59
+ - **Empty response guard** — Streaming responses with no text and no tool calls emit `overloaded_error` instead of a valid empty message, triggering client retry
60
+ - **System prompt: no tool call limit** — Added instruction telling models there is no tool call limit per turn
61
+ - **Conversation ID always generated** — API handler generates conv_id when client doesn't provide one; returned via `X-Legion-Conversation-Id` header
62
+ - **Metering spool encryption** — Spool file encrypts via `Legion::Crypt` when `:compliance, :encrypt_spool` enabled
63
+ - **Audit publisher improvements** — Preserves caller identity, includes agent_id/node_id, extracts provider metrics, hashes truncated conversations
64
+
65
+ ### Fixed
66
+ - **Quality checker ignores tool-use responses** — No longer flags empty_response when model returns tool calls with no text content
67
+ - **Confidence scoring skips tool-use** — Score=0.0 no longer reported for valid tool call responses
68
+ - **Context overflow doesn't trip circuit breaker** — ContextLengthExceededError no longer reports `:error` signal to health tracker
69
+ - **HealthTracker deadlock prevention** — `Mutex` replaced with `Monitor` (reentrant) to prevent deadlock when custom handlers call back into report/adjustment
70
+ - **Thread pool fallback policy** — Chat/batch pools use `:caller_runs` instead of `:abort` (no silent request drops under load)
71
+ - **Bare Thread.new eliminated** — All async work uses managed `ASYNC_THREAD_POOL` with `at_exit` shutdown hooks
72
+ - **Conversation#replace preserves internal roles** — `__metadata__` and `__curated__` entries no longer wiped on replace
73
+ - **EscalationChain method naming** — `padded_resolutions` renamed to `capped_resolutions` (it truncates, not pads)
74
+ - **trigger_tool_limit default mismatch** — Fallback default fixed from 50 to 25 to match settings.rb
75
+ - **Debate extract_question string keys** — `m[:role] == :user` changed to `.to_s == 'user'` for mixed-key messages
76
+ - **EnrichmentInjector nil safety** — `enrichments` param defaults to `{}` when nil
77
+ - **Stop reason preserved from provider** — `message_response` and `chunk_response` extract actual `finish_reason` from raw provider response instead of discarding
78
+ - **OpenAI streaming usage stats** — Always included in final chunk (was gated behind `include_reasoning`)
79
+ - **Metering identity** — Uses caller identity from request, not process publisher identity
80
+ - **Metering request_type** — Derived from request metadata (image/audio/chat), not hardcoded 'chat'
81
+ - **Metering actual cost** — Prefers provider-reported cost over local estimate
82
+ - **Metering encryption** — `encrypt?` respects `:compliance, :encrypt_metering` setting
83
+ - **Audit identity clobbering** — `attributed_event` uses `||=` to preserve caller identity
84
+ - **Audit step order** — `post_response` (audit) now runs before `metering` (financial records need supporting evidence)
85
+ - **Audit tool spooling** — Failed tool audit events spool to disk instead of silent drop
86
+ - **Audit timeline** — Preserves RBAC, classification, billing, confidence decisions
87
+ - **Budget cap** — Pre-flight check estimates output tokens (assumes output ≈ input) instead of `output_tokens: 0`
88
+ - **Embeddings audit** — POST /v1/embeddings now emits audit event
89
+ - **Native chat audit** — Async chat path emits `Audit.emit_prompt` after completion
90
+ - **Knowledge capture embedding** — Truncates content to 2000 chars before embedding to prevent ContextLengthExceededError
91
+ - **Dedup performance** — O(n²) → O(n×20) via sliding window comparison
92
+ - **22 silent rescue swallows** — All `rescue StandardError` without variable capture now log at debug level
93
+
94
+ ### Changed
95
+ - **max_tool_calls_per_turn: 50** — New setting (was dead `MAX_TOOL_LOOPS = 10` constant); deferred tool calls get error result telling model to retry
96
+ - **max_tool_rounds** — Removed `MAX_NATIVE_TOOL_ROUNDS` constant; reads directly from settings
97
+ - **Settings-driven limits** — Redundant fallback defaults removed from `llm_setting` call sites
98
+
99
+ ## [0.10.4] - 2026-05-31
100
+
101
+ ### Fixed
102
+ - **TRANSLATION-BUG-01**: Anthropic `tool_result` content blocks preserved as arrays — multimodal tool results (images) no longer flattened to string.
103
+ - **TRANSLATION-BUG-03**: Anthropic `stop_reason` properly maps `content_filter`; distinguishes `stop` with/without `stop_sequence`.
104
+ - **TRANSLATION-BUG-04**: OpenAI `map_finish_reason` returns `error` for unknown stop reasons instead of `stop` (errors no longer disguised as success).
105
+ - **TRANSLATION-BUG-05**: OpenAI `extract_content` preserves `image_url` and non-text content parts — vision input no longer silently dropped.
106
+ - **TRANSLATION-BUG-06**: Anthropic streaming `content_block_start` includes tool arguments in `input` field (was empty `{}`).
107
+ - **TRANSLATION-BUG-09**: Anthropic system prompt `cache_control` metadata preserved when present — prompt caching no longer silently disabled.
108
+ - **TRANSLATION-BUG-10**: Stable `tool_call_id` generated when OpenAI client sends nil — multi-turn tool chains no longer break.
109
+ - **TRANSLATION-BUG-11**: OpenAI translator uses symbol roles (`:user`, `:assistant`) matching Anthropic — executor symbol comparisons now work.
110
+ - **TRANSLATION-BUG-12**: Unsupported OpenAI tool types (`code_interpreter`, `file_search`) logged at debug instead of silent drop.
111
+
112
+ ## [0.10.3] - 2026-05-31
113
+
114
+ ### Fixed
115
+ - **DaemonClient HTTPS support** — `http_get` and `http_post` now set `http.use_ssl = true` when the daemon URL scheme is `https://`. Previously, all daemon communication was plain HTTP, silently failing for HTTPS URLs or sending credentials in cleartext.
116
+ - **Context compression guard against preserve_recent: 0** — `auto_compact` now enforces a minimum `preserve_recent` of 1. A value of 0 would compact the entire conversation including the latest messages, producing empty context.
117
+ - **Context curator thread safety** — `curate_turn` and `curated_messages` now synchronize on a per-instance `@curation_mutex`. Concurrent turns could race on `@curated_messages`, causing stale or nil curation state.
118
+ - **Recursive compaction guard** — `maybe_compact_history` now uses `Thread.current[:legion_compacting]` to prevent infinite recursion when `Context::Compressor.auto_compact` triggers its own LLM summarization call, which would recursively trigger compaction again.
119
+ - **Metering::Tokens unbounded memory growth** — `TokenTracker#record` now evicts oldest entries when the store exceeds `MAX_ENTRIES` (10,000). Long-running high-throughput processes would leak memory.
120
+ - **Tool timeline index per-call resolution** — `build_tool_timeline_index` now tracks per-tool-name call counts and produces keys like `"read_file:2"` for repeated calls. `build_response_tool_calls` matches each tool call to its corresponding timeline entry, fixing wrong duration/status when the same tool is called multiple times in a round.
121
+ - **Streaming escalation quality bypass documented** — Added explicit comment noting that streaming escalation attempts always pass quality check because in-flight stream quality-checking is not supported.
122
+
3
123
  ## [0.10.2] - 2026-05-30
4
124
 
5
125
  ### Fixed
data/Gemfile CHANGED
@@ -12,7 +12,9 @@ group :test do
12
12
  if Dir.exist?(lex_llm_path)
13
13
  gem 'lex-llm', path: lex_llm_path
14
14
  else
15
- gem 'lex-llm'
15
+ # TEMP (revert to `gem 'lex-llm'` once 0.4.16 is published): track lex-llm PR #16, which
16
+ # adds the fleet TokenValidator verify_issuer + WorkerExecution policy-warn behavior these specs require.
17
+ gem 'lex-llm', git: 'https://github.com/LegionIO/lex-llm.git', branch: 'fix/audit-fleet-security'
16
18
  end
17
19
 
18
20
  %w[
@@ -37,7 +37,7 @@ module Legion
37
37
 
38
38
  app.helpers do
39
39
  define_method(:auth_enabled?) do
40
- Legion::LLM::Settings.value(:api, :auth, :enabled) == true
40
+ Legion::Settings.dig(:llm, :api, :auth, :enabled) == true
41
41
  end
42
42
 
43
43
  define_method(:extract_token) do |req|
@@ -56,7 +56,7 @@ module Legion
56
56
  return true unless auth_enabled?
57
57
  return false if token.nil? || token.empty?
58
58
 
59
- keys = Legion::LLM::Settings.value(:api, :auth, :api_keys, default: [])
59
+ keys = Legion::Settings.dig(:llm, :api, :auth, :api_keys) || []
60
60
  keys.include?(token)
61
61
  end
62
62
  end
@@ -201,7 +201,8 @@ module Legion
201
201
  def files_storage_path
202
202
  configured = begin
203
203
  Legion::Settings.dig(:llm, :files, :storage_path)
204
- rescue StandardError
204
+ rescue StandardError => e
205
+ log.debug "[llm][api][anthropic][files] action=files_storage_path_fallback error=#{e.class} message=#{e.message}"
205
206
  nil
206
207
  end
207
208
  base = configured.to_s.empty? ? ::File.join(Dir.home, '.legionio', 'data', 'files') : configured
@@ -34,7 +34,7 @@ module Legion
34
34
  tool_defs = build_tool_definitions(normalized[:tools] || [], executable: false)
35
35
  modality = detect_modality(normalized[:messages])
36
36
 
37
- conv_id = env['HTTP_X_LEGION_CONVERSATION_ID'] || body[:conversation_id]
37
+ conv_id = env['HTTP_X_LEGION_CONVERSATION_ID'] || body[:conversation_id] || "conv_#{SecureRandom.hex(8)}"
38
38
  ext_provider = env['HTTP_X_LEGION_PROVIDER'] || body[:provider]
39
39
  ext_tier = env['HTTP_X_LEGION_TIER'] || body[:tier]
40
40
  ext_instance = env['HTTP_X_LEGION_INSTANCE'] || body[:instance]
@@ -78,7 +78,8 @@ module Legion
78
78
 
79
79
  if streaming
80
80
  content_type 'text/event-stream'
81
- headers 'Cache-Control' => 'no-cache', 'Connection' => 'keep-alive', 'X-Accel-Buffering' => 'no'
81
+ headers 'Cache-Control' => 'no-cache', 'Connection' => 'keep-alive',
82
+ 'X-Accel-Buffering' => 'no', 'X-Legion-Conversation-Id' => conv_id
82
83
 
83
84
  stream do |out|
84
85
  full_text = +''
@@ -90,7 +91,7 @@ module Legion
90
91
  id: request_id, type: 'message', role: 'assistant',
91
92
  content: [], model: model.to_s,
92
93
  stop_reason: nil, stop_sequence: nil,
93
- usage: { input_tokens: 0, output_tokens: 0 }
94
+ usage: { input_tokens: est_tokens, output_tokens: 0 }
94
95
  }
95
96
  })}\n\n"
96
97
 
@@ -137,6 +138,16 @@ module Legion
137
138
  "tool_calls=#{tool_calls.size} stop_reason=#{stop_reason} " \
138
139
  "text_block_opened=#{text_block_opened} full_text_length=#{full_text.length}"
139
140
 
141
+ if tool_calls.empty? && full_text.empty?
142
+ log.warn "[llm][api][anthropic] action=empty_response request_id=#{request_id} " \
143
+ "model=#{model} text_block_opened=#{text_block_opened} — provider returned no content, signaling overloaded"
144
+ out << "event: error\ndata: #{Legion::JSON.dump({
145
+ type: 'error', error: { type: 'overloaded_error',
146
+ message: 'Model returned empty response. Please retry.' }
147
+ })}\n\n"
148
+ next
149
+ end
150
+
140
151
  if text_block_opened
141
152
  out << "event: content_block_stop\ndata: #{Legion::JSON.dump({ type: 'content_block_stop', index: 0 })}\n\n"
142
153
  content_index = 1
@@ -158,7 +169,8 @@ module Legion
158
169
  out << "event: message_delta\ndata: #{Legion::JSON.dump({
159
170
  type: 'message_delta',
160
171
  delta: { stop_reason: stop_reason, stop_sequence: nil },
161
- usage: { output_tokens: translator.token_count(tokens, :output) }
172
+ usage: { input_tokens: translator.token_count(tokens, :input),
173
+ output_tokens: translator.token_count(tokens, :output) }
162
174
  })}\n\n"
163
175
  out << "event: message_stop\ndata: #{Legion::JSON.dump({ type: 'message_stop' })}\n\n"
164
176
  log.info "[llm][api][anthropic] action=stream_complete request_id=#{request_id} stop_reason=#{stop_reason}"
@@ -172,6 +184,7 @@ module Legion
172
184
  pipeline_response, model: model, request_id: request_id
173
185
  )
174
186
 
187
+ headers 'X-Legion-Conversation-Id' => conv_id
175
188
  content_type :json
176
189
  status 200
177
190
  Legion::JSON.dump(formatted)
@@ -42,7 +42,8 @@ module Legion
42
42
 
43
43
  def data_subsystem_available?
44
44
  defined?(Legion::Data) && Legion::Data.respond_to?(:connected?) && Legion::Data.connected?
45
- rescue StandardError
45
+ rescue StandardError => e
46
+ log.debug "[llm][api][namespaces][helpers] action=data_subsystem_check_fallback error=#{e.class} message=#{e.message}"
46
47
  false
47
48
  end
48
49
  end
@@ -16,7 +16,7 @@ module Legion
16
16
 
17
17
  ASYNC_POOL = Concurrent::FixedThreadPool.new(
18
18
  [4, (Concurrent.processor_count / 2)].max,
19
- fallback_policy: :abort
19
+ fallback_policy: :caller_runs
20
20
  )
21
21
 
22
22
  # Ensure the thread pool is shut down cleanly when the process exits.
@@ -66,6 +66,14 @@ module Legion
66
66
  tokens_out: response.respond_to?(:output_tokens) ? response.output_tokens : nil
67
67
  }
68
68
  )
69
+ Legion::LLM::Audit.emit_prompt(
70
+ request_id: request_id,
71
+ caller: { requested_by: { identity: 'api:chat:async', type: :external } },
72
+ routing: { model: session.model.to_s, provider: provider },
73
+ tokens: { input_tokens: response.respond_to?(:input_tokens) ? response.input_tokens : 0,
74
+ output_tokens: response.respond_to?(:output_tokens) ? response.output_tokens : 0 },
75
+ timestamp: Time.now
76
+ )
69
77
  log.debug("[llm][api][namespaces][chat] action=async_complete request_id=#{request_id}")
70
78
  rescue StandardError => e
71
79
  handle_exception(e, level: :error, handled: true, operation: 'llm.api.chat.async', request_id: request_id)
@@ -36,7 +36,8 @@ module Legion
36
36
  def self.capable_provider_available?
37
37
  instances = begin
38
38
  Legion::LLM::Call::Registry.all_instances
39
- rescue StandardError
39
+ rescue StandardError => e
40
+ log.debug "[llm][api][openai][audio][speech] action=registry_fallback error=#{e.class} message=#{e.message}"
40
41
  []
41
42
  end
42
43
  instances.any? do |entry|
@@ -94,7 +95,8 @@ module Legion
94
95
  else
95
96
  begin
96
97
  Legion::JSON.load(raw)
97
- rescue StandardError
98
+ rescue StandardError => e
99
+ log.debug "[llm][api][openai][audio][speech] action=parse_body_fallback error=#{e.class} message=#{e.message}"
98
100
  {}
99
101
  end
100
102
  end
@@ -25,7 +25,8 @@ module Legion
25
25
  def self.capable_provider_available?
26
26
  instances = begin
27
27
  Legion::LLM::Call::Registry.all_instances
28
- rescue StandardError
28
+ rescue StandardError => e
29
+ log.debug "[llm][api][openai][audio][transcriptions] action=registry_fallback error=#{e.class} message=#{e.message}"
29
30
  []
30
31
  end
31
32
  instances.any? do |entry|
@@ -25,7 +25,8 @@ module Legion
25
25
  def self.capable_provider_available?
26
26
  instances = begin
27
27
  Legion::LLM::Call::Registry.all_instances
28
- rescue StandardError
28
+ rescue StandardError => e
29
+ log.debug "[llm][api][openai][audio][translations] action=registry_fallback error=#{e.class} message=#{e.message}"
29
30
  []
30
31
  end
31
32
  instances.any? do |entry|
@@ -23,8 +23,8 @@ module Legion
23
23
 
24
24
  BATCH_POOL_MUTEX.synchronize do
25
25
  @batch_pool ||= begin
26
- pool_size = Legion::LLM::Settings.value(:api, :batch_pool_size, default: 4)
27
- Concurrent::FixedThreadPool.new(pool_size, fallback_policy: :abort)
26
+ pool_size = Legion::Settings[:llm][:api][:batch_pool_size] || 4
27
+ Concurrent::FixedThreadPool.new(pool_size, fallback_policy: :caller_runs)
28
28
  end
29
29
  end
30
30
  end
@@ -237,7 +237,8 @@ module Legion
237
237
 
238
238
  ::File.readlines(file_path).filter_map do |line|
239
239
  Legion::JSON.load(line.strip)
240
- rescue StandardError
240
+ rescue StandardError => e
241
+ log.debug "[llm][api][openai][batches] action=load_batch_line_fallback file=#{file_id} error=#{e.class} message=#{e.message}"
241
242
  nil
242
243
  end
243
244
  end
@@ -31,7 +31,7 @@ module Legion
31
31
 
32
32
  request_id = body[:request_id] || SecureRandom.uuid
33
33
  normalized = Legion::LLM::API::Translators::OpenAIRequest.normalize(body)
34
- model = normalized[:model] || Legion::LLM::Settings.value(:default_model) || 'default'
34
+ model = normalized[:model] || Legion::Settings[:llm][:default_model] || 'default'
35
35
  streaming = normalized[:stream] == true
36
36
  include_reasoning = body[:include_reasoning] == true || body[:include_thinking] == true
37
37
  tool_decls = Completions.build_tool_declarations(normalized[:tools])
@@ -231,8 +231,7 @@ module Legion
231
231
  end
232
232
 
233
233
  def self.append_usage_stats(done_chunk, pipeline_response, include_reasoning)
234
- return unless include_reasoning
235
-
234
+ _ = include_reasoning
236
235
  tokens = pipeline_response.tokens || {}
237
236
  oai = Legion::LLM::API::Translators::OpenAIResponse
238
237
  input_count = oai.extract_token_count(tokens, :input).to_i
@@ -27,7 +27,7 @@ module Legion
27
27
  end
28
28
 
29
29
  request_id = SecureRandom.uuid
30
- model = body[:model] || Legion::LLM::Settings.value(:default_model) || 'default'
30
+ model = body[:model] || Legion::Settings[:llm][:default_model] || 'default'
31
31
  messages = [{ role: 'user', content: prompt.to_s }]
32
32
 
33
33
  log.info("[llm][api][namespaces][openai][completions] action=accepted request_id=#{request_id} model=#{model}")
@@ -15,11 +15,11 @@ module Legion
15
15
  def self.registered(app)
16
16
  log.debug('[llm][api][namespaces][openai][embeddings] registering routes')
17
17
 
18
- app.post '/v1/embeddings' do
18
+ app.post '/v1/embeddings' do # rubocop:disable Metrics/BlockLength
19
19
  require_llm!
20
20
  body = parse_request_body
21
21
  input = body[:input]
22
- model = body[:model] || Legion::LLM::Settings.value(:default_model)
22
+ model = body[:model] || Legion::Settings[:llm][:default_model]
23
23
 
24
24
  if input.nil? || (input.respond_to?(:empty?) && input.empty?)
25
25
  return openai_error('input is required', type: 'invalid_request_error',
@@ -43,6 +43,16 @@ module Legion
43
43
  )
44
44
 
45
45
  log.info("[llm][api][namespaces][openai][embeddings] action=complete model=#{model} dims=#{vector_array.size}")
46
+
47
+ Legion::LLM::Audit.emit_prompt(
48
+ request_id: SecureRandom.uuid,
49
+ caller: build_server_caller(source: 'openai_embeddings', path: request.path, env: env),
50
+ routing: { model: model, provider: 'embed' },
51
+ tokens: { input_tokens: (text.length / 4.0).ceil, output_tokens: 0 },
52
+ request_type: 'embedding',
53
+ timestamp: Time.now
54
+ )
55
+
46
56
  content_type :json
47
57
  Legion::JSON.dump(response_body)
48
58
  rescue Legion::LLM::AuthError => e
@@ -53,12 +53,14 @@ module Legion
53
53
  file_id = "file-#{SecureRandom.hex(16)}"
54
54
  filename = begin
55
55
  uploaded[:filename] || uploaded.original_filename
56
- rescue StandardError
56
+ rescue StandardError => e
57
+ log.debug "[llm][api][openai][files] action=filename_fallback error=#{e.class} message=#{e.message}"
57
58
  'upload.bin'
58
59
  end
59
60
  data = begin
60
61
  uploaded[:tempfile]&.read || uploaded.read
61
- rescue StandardError
62
+ rescue StandardError => e
63
+ log.debug "[llm][api][openai][files] action=file_read_fallback error=#{e.class} message=#{e.message}"
62
64
  ''
63
65
  end
64
66
 
@@ -23,7 +23,8 @@ module Legion
23
23
  def self.capable_provider_available?(capability)
24
24
  instances = begin
25
25
  Legion::LLM::Call::Registry.all_instances
26
- rescue StandardError
26
+ rescue StandardError => e
27
+ log.debug "[llm][api][openai][images] action=registry_fallback capability=#{capability} error=#{e.class} message=#{e.message}"
27
28
  []
28
29
  end
29
30
  instances.any? do |entry|
@@ -76,7 +77,8 @@ module Legion
76
77
 
77
78
  Legion::JSON.load(raw)
78
79
  end
79
- rescue StandardError
80
+ rescue StandardError => e
81
+ log.debug "[llm][api][openai][images] action=parse_media_body_fallback error=#{e.class} message=#{e.message}"
80
82
  {}
81
83
  end
82
84
 
@@ -116,7 +118,7 @@ module Legion
116
118
  Legion::JSON.dump({ error: { message: 'prompt is required', type: 'invalid_request_error', code: nil } })
117
119
  end
118
120
 
119
- model = (body[:model] || body['model'] || Legion::LLM::Settings.value(:default_model) || 'dall-e-3').to_s
121
+ model = (body[:model] || body['model'] || Legion::Settings[:llm][:default_model] || 'dall-e-3').to_s
120
122
  n = [(body[:n] || body['n'] || 1).to_i, 1].max
121
123
  size = (body[:size] || body['size'] || '1024x1024').to_s
122
124
  quality = (body[:quality] || body['quality'] || 'standard').to_s
@@ -3,6 +3,7 @@
3
3
  require 'time'
4
4
  require 'legion/logging/helper'
5
5
  require 'legion/llm/api/namespaces/helpers'
6
+ require 'legion/llm/api/native/models'
6
7
  require 'legion/llm/api/translators/openai_response'
7
8
 
8
9
  module Legion
@@ -81,10 +82,14 @@ module Legion
81
82
  end
82
83
 
83
84
  def self.build_openai_model_list
84
- models = Legion::LLM::Inventory.offerings(type: :inference).map do |offering|
85
+ offerings = Legion::LLM::Inventory.offerings(type: :inference)
86
+ offerings = Legion::LLM::API::Native::Models.with_auto_routing_offering(offerings, {})
87
+
88
+ models = offerings.map do |offering|
85
89
  Legion::LLM::API::Translators::OpenAIResponse.format_model_object(
86
90
  offering[:model],
87
- owned_by: offering[:provider_family]
91
+ owned_by: offering[:provider_family],
92
+ limits: offering[:limits]
88
93
  )
89
94
  end
90
95
  seen = {}
@@ -99,12 +104,15 @@ module Legion
99
104
  end
100
105
 
101
106
  def self.openai_to_anthropic_model(openai_model)
102
- {
107
+ model = {
108
+ type: 'model',
103
109
  id: openai_model[:id],
104
110
  display_name: openai_model[:id],
105
- created_at: Time.at(openai_model[:created] || Time.now.to_i).utc.strftime('%Y-%m-%dT%H:%M:%SZ'),
106
- type: 'model'
111
+ created_at: Time.at(openai_model[:created] || Time.now.to_i).utc.strftime('%Y-%m-%dT%H:%M:%SZ')
107
112
  }
113
+ model[:max_input_tokens] = openai_model[:context_window] if openai_model[:context_window]
114
+ model[:max_tokens] = openai_model[:max_output_tokens] if openai_model[:max_output_tokens]
115
+ model
108
116
  end
109
117
  end
110
118
  end
@@ -36,7 +36,7 @@ module Legion
36
36
 
37
37
  messages = [{ role: 'system', content: body[:instructions].to_s }] + messages if body[:instructions]
38
38
 
39
- model = body[:model] || Legion::LLM::Settings.value(:default_model) || 'default'
39
+ model = body[:model] || Legion::Settings[:llm][:default_model] || 'default'
40
40
  streaming = body[:stream] == true
41
41
  tool_decls = Responses.build_tool_declarations(body[:tools])
42
42
 
@@ -52,22 +52,34 @@ module Legion
52
52
  end
53
53
  end
54
54
 
55
+ def validate_client_tool_path(path)
56
+ return 'file operation error: path is required' if path.nil? || path.to_s.empty?
57
+
58
+ expanded = ::File.expand_path(path)
59
+ sandbox_root = ::File.expand_path(Dir.pwd)
60
+
61
+ return "file operation error: path '#{path}' escapes working directory #{sandbox_root}" unless expanded.start_with?(sandbox_root)
62
+
63
+ expanded
64
+ end
65
+
55
66
  def dispatch_client_tool(ref, **kwargs) # rubocop:disable Metrics/AbcSize,Metrics/CyclomaticComplexity,Metrics/MethodLength,Metrics/PerceivedComplexity
56
67
  case ref
57
68
  when 'sh'
58
69
  cmd = kwargs[:command] || kwargs[:cmd] || kwargs.values.first.to_s
70
+ log.warn("[llm][native] client_tool=sh command=#{cmd[0, 120]}")
59
71
  output, status = ::Open3.capture2e(cmd, chdir: Dir.pwd)
60
72
  "exit=#{status.exitstatus}\n#{output}"
61
73
  when 'file_read'
62
- path = kwargs[:path] || kwargs[:file_path] || kwargs.values.first.to_s
74
+ path = validate_client_tool_path(kwargs[:path] || kwargs[:file_path] || kwargs.values.first.to_s)
63
75
  read_client_file(path)
64
76
  when 'file_write'
65
- path = kwargs[:path] || kwargs[:file_path]
77
+ path = validate_client_tool_path(kwargs[:path] || kwargs[:file_path])
66
78
  content = kwargs[:content] || kwargs[:contents]
67
79
  ::File.write(path, content)
68
80
  "Written #{content.to_s.bytesize} bytes to #{path}"
69
81
  when 'file_edit'
70
- path = kwargs[:path] || kwargs[:file_path]
82
+ path = validate_client_tool_path(kwargs[:path] || kwargs[:file_path])
71
83
  old_text = kwargs[:old_text] || kwargs[:search]
72
84
  new_text = kwargs[:new_text] || kwargs[:replace]
73
85
  return 'file_edit error: old_text is required' if old_text.nil? || old_text.empty?
@@ -374,7 +386,8 @@ module Legion
374
386
  return raw_args unless raw_args.is_a?(String)
375
387
 
376
388
  Legion::JSON.parse(raw_args, symbolize_names: true)
377
- rescue StandardError
389
+ rescue StandardError => e
390
+ log.debug "[llm][api][native][helpers] action=openai_tool_call_arguments_fallback error=#{e.class} message=#{e.message}"
378
391
  raw_args
379
392
  end
380
393
 
@@ -133,6 +133,8 @@ module Legion
133
133
  end
134
134
 
135
135
  def self.auto_routing_offering
136
+ ctx = Legion::Settings[:llm][:context_window] || 262_144
137
+ max_out = Legion::Settings[:llm][:max_output_tokens] || 16_384
136
138
  {
137
139
  id: AUTO_ROUTING_OFFERING_ID,
138
140
  offering_id: AUTO_ROUTING_OFFERING_ID,
@@ -148,7 +150,7 @@ module Legion
148
150
  transport: :internal,
149
151
  enabled: true,
150
152
  capabilities: AUTO_ROUTING_CAPABILITIES,
151
- limits: {},
153
+ limits: { context_window: ctx, max_output_tokens: max_out },
152
154
  health: { circuit_state: 'available' },
153
155
  metadata: { auto_route: true, placeholder: true, display_name: AUTO_ROUTING_MODEL_DISPLAY },
154
156
  routing_metadata: { strategy: 'auto' },
@@ -162,8 +162,8 @@ module Legion
162
162
  def self.tier_priority
163
163
  return Legion::LLM::Router.tier_priority if defined?(Legion::LLM::Router)
164
164
 
165
- routing_config = Legion::LLM::Settings.value(:routing) || {}
166
- top_level = Legion::LLM::Settings.value(:tier_order, default: nil)
165
+ routing_config = Legion::Settings[:llm][:routing] || {}
166
+ top_level = Legion::Settings[:llm][:tier_order] || nil
167
167
  Array(top_level || routing_config[:tier_order] || routing_config[:tier_priority] ||
168
168
  %w[local direct fleet openai_compat cloud frontier])
169
169
  end
@@ -35,7 +35,7 @@ module Legion
35
35
 
36
36
  request_id = body[:request_id] || SecureRandom.uuid
37
37
  normalized = Legion::LLM::API::Translators::OpenAIRequest.normalize(body)
38
- model = normalized[:model] || Legion::LLM::Settings.value(:default_model) || 'default'
38
+ model = normalized[:model] || Legion::Settings[:llm][:default_model] || 'default'
39
39
  streaming = normalized[:stream] == true
40
40
  include_reasoning = body[:include_reasoning] == true || body[:include_thinking] == true
41
41
 
@@ -26,7 +26,7 @@ module Legion
26
26
  body = parse_request_body
27
27
 
28
28
  input = body[:input] || body['input']
29
- model = body[:model] || body['model'] || Legion::LLM::Settings.value(:default_model)
29
+ model = body[:model] || body['model'] || Legion::Settings[:llm][:default_model]
30
30
 
31
31
  if input.nil? || (input.respond_to?(:empty?) && input.empty?)
32
32
  halt 400, { 'Content-Type' => 'application/json' },
@@ -42,7 +42,7 @@ module Legion
42
42
 
43
43
  messages = [{ role: 'system', content: body[:instructions].to_s }] + messages if body[:instructions]
44
44
 
45
- model = body[:model] || Legion::LLM::Settings.value(:default_model) || 'default'
45
+ model = body[:model] || Legion::Settings[:llm][:default_model] || 'default'
46
46
  streaming = body[:stream] == true
47
47
 
48
48
  tool_declarations = Responses.build_tool_declarations(body[:tools])