legion-llm 0.12.3 → 0.12.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +8 -20
  3. data/AGENTS.md +62 -0
  4. data/CHANGELOG.md +141 -0
  5. data/lib/legion/llm/api/anthropic/messages.rb +15 -3
  6. data/lib/legion/llm/api/namespaces/anthropic/files.rb +0 -3
  7. data/lib/legion/llm/api/namespaces/anthropic/messages.rb +324 -70
  8. data/lib/legion/llm/api/namespaces/native/chat.rb +12 -3
  9. data/lib/legion/llm/api/namespaces/native/inference.rb +19 -12
  10. data/lib/legion/llm/api/namespaces/native/tiers.rb +1 -1
  11. data/lib/legion/llm/api/namespaces/openai/audio/speech.rb +0 -2
  12. data/lib/legion/llm/api/namespaces/openai/audio/transcriptions.rb +0 -2
  13. data/lib/legion/llm/api/namespaces/openai/audio/translations.rb +0 -2
  14. data/lib/legion/llm/api/namespaces/openai/batches.rb +2 -2
  15. data/lib/legion/llm/api/namespaces/openai/chat/completions.rb +21 -10
  16. data/lib/legion/llm/api/namespaces/openai/completions.rb +11 -5
  17. data/lib/legion/llm/api/namespaces/openai/conversations/items.rb +51 -2
  18. data/lib/legion/llm/api/namespaces/openai/conversations.rb +1 -1
  19. data/lib/legion/llm/api/namespaces/openai/embeddings.rb +1 -1
  20. data/lib/legion/llm/api/namespaces/openai/files.rb +2 -2
  21. data/lib/legion/llm/api/namespaces/openai/images.rb +0 -8
  22. data/lib/legion/llm/api/namespaces/openai/moderations.rb +0 -3
  23. data/lib/legion/llm/api/namespaces/openai/responses.rb +284 -48
  24. data/lib/legion/llm/api/namespaces/openai/uploads/parts.rb +1 -1
  25. data/lib/legion/llm/api/namespaces/openai/uploads.rb +2 -2
  26. data/lib/legion/llm/api/namespaces/openai/vector_stores/file_batches.rb +0 -3
  27. data/lib/legion/llm/api/namespaces/openai/vector_stores/files.rb +0 -3
  28. data/lib/legion/llm/api/namespaces/openai/vector_stores.rb +0 -3
  29. data/lib/legion/llm/api/native/chat.rb +2 -2
  30. data/lib/legion/llm/api/native/helpers.rb +1 -1
  31. data/lib/legion/llm/api/native/inference.rb +0 -2
  32. data/lib/legion/llm/api/native/tiers.rb +1 -1
  33. data/lib/legion/llm/api/openai/chat_completions.rb +20 -5
  34. data/lib/legion/llm/api/openai/responses.rb +14 -5
  35. data/lib/legion/llm/api/shared_helpers.rb +141 -4
  36. data/lib/legion/llm/api/translators/anthropic_response.rb +208 -33
  37. data/lib/legion/llm/api/translators/openai_response.rb +20 -1
  38. data/lib/legion/llm/call/dispatch.rb +38 -21
  39. data/lib/legion/llm/call/lex_llm_adapter.rb +173 -16
  40. data/lib/legion/llm/call/structured_output.rb +1 -1
  41. data/lib/legion/llm/compat.rb +1 -1
  42. data/lib/legion/llm/context/curator.rb +22 -5
  43. data/lib/legion/llm/inference/executor.rb +415 -66
  44. data/lib/legion/llm/inference/native_tool_loop.rb +217 -42
  45. data/lib/legion/llm/inference/prompt.rb +1 -1
  46. data/lib/legion/llm/inference/route_attempts.rb +2 -2
  47. data/lib/legion/llm/inference/steps/knowledge_capture.rb +40 -1
  48. data/lib/legion/llm/inference/steps/metering.rb +9 -1
  49. data/lib/legion/llm/inference/steps/post_response.rb +11 -4
  50. data/lib/legion/llm/inference/steps/rag_context.rb +2 -0
  51. data/lib/legion/llm/inference/steps/rbac.rb +2 -2
  52. data/lib/legion/llm/inference/steps/sticky_persist.rb +1 -1
  53. data/lib/legion/llm/inference/steps/tool_calls.rb +30 -8
  54. data/lib/legion/llm/inference/steps/tool_history.rb +62 -9
  55. data/lib/legion/llm/inference.rb +1 -3
  56. data/lib/legion/llm/metering/usage.rb +6 -3
  57. data/lib/legion/llm/quality/checker.rb +30 -2
  58. data/lib/legion/llm/router/health_tracker.rb +1 -1
  59. data/lib/legion/llm/router.rb +125 -33
  60. data/lib/legion/llm/settings.rb +34 -19
  61. data/lib/legion/llm/tools/dispatcher.rb +4 -9
  62. data/lib/legion/llm/transport/message.rb +43 -1
  63. data/lib/legion/llm/version.rb +1 -1
  64. data/lib/legion/llm.rb +5 -0
  65. metadata +1 -1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 733b0a06bf37557dfe7c661d6d63596e30ef5cde4b01c9ae12e801c5257b684e
4
- data.tar.gz: f24cfe5b32ed137a47efaac7adf57f348d911cb1f511fbdfc5a769ce54d7c32a
3
+ metadata.gz: 7d61b50d6573478325baba59ea7b05a8e7a6bce2c66c453d15eec40b1380b891
4
+ data.tar.gz: e14038bcac7c816169e31bc2f8a08fb76331e0bc7b18766f029fe217c3b57d2d
5
5
  SHA512:
6
- metadata.gz: c006671e6a1dc02bff77b18e3c31dc4cb771ffd1ba5a6b773e2d7a62a9e886b76e95fb1e9780b0c1285e0165a218112f239fe088ceae9296da440918bd682648
7
- data.tar.gz: 14eaff3d563eb80e058f699bca53c79308642f745710c1a546b7e00ecfde049129d90e91a52cea5e13a0e19c39afb1282daf1228640680b8f7b94db777676f81
6
+ metadata.gz: e1abe73f183b7b6e135db20bb2bc2b875b6bb40234630c5327a906d38892b6ef83f35c2c9f8a8b0ee193451d1609f1e30c1ebc4b0ebf28e844b3a9e8044ffefa
7
+ data.tar.gz: bf44ad26a524c018b042dda702b076c98ef7068aaf48e638305776532d809036778c86a0e4e385974c2d59bffbdf61da4a20067b89d09cbc85a2e5a6b34f5203
data/.rubocop.yml CHANGED
@@ -4,57 +4,45 @@ AllCops:
4
4
  SuggestExtensions: false
5
5
 
6
6
  Layout/LineLength:
7
- Max: 160
8
-
7
+ Max: 195
9
8
  Layout/SpaceAroundEqualsInParameterDefault:
10
9
  EnforcedStyle: space
11
-
12
10
  Layout/HashAlignment:
13
11
  EnforcedHashRocketStyle: table
14
12
  EnforcedColonStyle: table
15
-
16
13
  Metrics/MethodLength:
17
- Max: 60
18
-
14
+ Max: 150
19
15
  Metrics/ClassLength:
20
16
  Max: 1500
21
-
22
17
  Metrics/ModuleLength:
23
18
  Max: 1500
24
-
25
19
  Metrics/BlockLength:
26
- Max: 40
20
+ Max: 150
27
21
  Exclude:
28
22
  - 'spec/**/*'
29
23
 
30
24
  Metrics/AbcSize:
31
- Max: 85
32
-
25
+ Max: 110
26
+ Metrics/BlockNesting:
27
+ Max: 4
33
28
  Metrics/CyclomaticComplexity:
34
- Max: 35
29
+ Max: 50
35
30
 
36
31
  Metrics/PerceivedComplexity:
37
- Max: 35
38
-
32
+ Max: 50
39
33
  Style/Documentation:
40
34
  Enabled: false
41
-
42
35
  Style/SymbolArray:
43
36
  Enabled: true
44
-
45
37
  Style/FrozenStringLiteralComment:
46
38
  Enabled: true
47
39
  EnforcedStyle: always
48
-
49
40
  Naming/FileName:
50
41
  Enabled: false
51
-
52
42
  Naming/PredicateMethod:
53
43
  Enabled: false
54
-
55
44
  Metrics/ParameterLists:
56
45
  Max: 9
57
-
58
46
  Style/RedundantConstantBase:
59
47
  Exclude:
60
48
  - 'spec/**/*'
data/AGENTS.md CHANGED
@@ -35,3 +35,65 @@ bundle exec rubocop
35
35
 
36
36
  - Run targeted specs for modified router/pipeline/provider code.
37
37
  - Before handoff, run full `bundle exec rspec` and `bundle exec rubocop`.
38
+
39
+ ---
40
+
41
+ ## Client Request Headers Reference
42
+
43
+ Verified from source code (Claude Code binary + Codex `codex-rs` Rust source).
44
+
45
+ ### Claude Code → `POST /v1/messages`
46
+
47
+ | Header | Value | Always? |
48
+ |---|---|---|
49
+ | `X-Claude-Code-Session-Id` | Stable UUID for the CLI session | Yes |
50
+ | `x-app` | `"cli"` (foreground) or `"cli-bg"` (background) | Yes |
51
+ | `x-claude-remote-session-id` | Remote container session ID | Conditional |
52
+ | `x-claude-remote-container-id` | Remote container ID | Conditional |
53
+ | `x-claude-code-agent-id` | Agent UUID for multi-agent sessions | Conditional |
54
+ | `x-claude-code-parent-agent-id` | Parent agent UUID (spawned subagent) | Conditional |
55
+ | `x-client-app` | Additional client app identifier | Conditional |
56
+
57
+ Conversation threading is **stateless** — full `messages[]` history sent in the body on every request. No conversation ID, turn ID, or `x-client-request-id` header is sent.
58
+
59
+ In the Rack/Sinatra `env` hash, these headers arrive under the keys `HTTP_X_CLAUDE_CODE_SESSION_ID`, `HTTP_X_APP`, etc.
60
+
61
+ ### Codex → `POST /v1/responses`
62
+
63
+ | Header | Value | Always? |
64
+ |---|---|---|
65
+ | `session-id` | Stable UUID for the Codex session | Yes |
66
+ | `thread-id` | Stable UUID for the thread/conversation | Yes |
67
+ | `x-client-request-id` | Same value as `thread-id` | Yes |
68
+ | `x-codex-installation-id` | Installation-scoped UUID | Yes |
69
+ | `x-codex-window-id` | `"{thread_id}:{window_generation}"` | Yes |
70
+ | `x-codex-turn-state` | Sticky-routing token returned by server, replayed by client | After first response |
71
+ | `x-codex-turn-metadata` | Per-turn observability metadata | Conditional |
72
+ | `x-codex-parent-thread-id` | Parent thread UUID (sub-agents) | Conditional |
73
+ | `x-openai-subagent` | Sub-agent type (`"review"`, `"compact"`, `"memory_consolidation"`, etc.) | Conditional |
74
+ | `x-openai-memgen-request` | `"true"` for memory generation requests | Conditional |
75
+
76
+ In Rack/Sinatra env keys: `HTTP_SESSION_ID`, `HTTP_THREAD_ID`, `HTTP_X_CLIENT_REQUEST_ID`, `HTTP_X_CODEX_INSTALLATION_ID`, etc.
77
+
78
+ **`HTTP_THREAD_ID` is the stable Codex thread/conversation ID** — it is stable for the lifetime of a thread, not per-request. `HTTP_X_CLIENT_REQUEST_ID` equals `HTTP_THREAD_ID` (Codex sets them to the same value).
79
+
80
+ Conversation threading over HTTP sends the full input in the request body (stateless, like Anthropic). Over WebSocket, `previous_response_id` is sent in the request body to enable delta-only input.
81
+
82
+ ### Practical Usage in `/v1/messages` and `/v1/responses` Handlers
83
+
84
+ ```ruby
85
+ # Stable request ID (Codex sends x-client-request-id = thread-id; Claude Code sends no such header, so a fresh ID is generated)
86
+ request_id = env['HTTP_X_CLIENT_REQUEST_ID'] || "req_#{SecureRandom.hex(12)}"
87
+
88
+ # Stable conversation/thread ID
89
+ # Claude Code: no header — generate per-request or use Legion conversation tracking
90
+ # Codex: HTTP_THREAD_ID is stable for the thread lifetime
91
+ conversation_id = env['HTTP_THREAD_ID'] ||
92
+ env['HTTP_X_LEGION_CONVERSATION_ID'] ||
93
+ body[:conversation_id] ||
94
+ "conv_#{SecureRandom.hex(8)}"
95
+
96
+ # Identify the calling client
97
+ claude_code_session = env['HTTP_X_CLAUDE_CODE_SESSION_ID'] # present only for Claude Code
98
+ codex_installation = env['HTTP_X_CODEX_INSTALLATION_ID'] # present only for Codex
99
+ ```
data/CHANGELOG.md CHANGED
@@ -1,5 +1,146 @@
1
1
  # Legion LLM Changelog
2
2
 
3
+ ## [0.12.14] - 2026-06-10
4
+
5
+ ### Added
6
+ - **Hint-based router scoring** — `tier`, `provider`, and `model` are now preference hints that bias rule scoring (+50 per matching hint) instead of hard overrides that bypass rule evaluation. This allows the router to apply policy (cost, privacy, health) and fall back to a better local match when the hinted provider is unavailable (router.rb)
7
+ - **Context window filtering in router** — `estimated_tokens` is now computed from request messages + conversation history and passed to the router. Rules whose model's `context_length` cannot fit the estimated token count (at 90% threshold) are excluded from candidate selection (router.rb, executor.rb)
8
+ - **Model-provider mismatch detection** — When an explicitly specified provider differs from the model's natural provider (e.g. "claude-sonnet-4-6" routed to vllm), the model is swapped to the provider's default to prevent dispatch failures. Auto-resolved providers (from tier/defaults) trust the caller's model choice (router.rb, executor.rb)
9
+ - **Provider registry validation in explicit resolution** — Unregistered providers are cleared before tier-based fallback instead of committing to a dead-end resolution (router.rb)
10
+ - **Repeat tool call detection in native tool loop** — The tool loop now tracks `(name, args_hash)` pairs and returns early with client passthrough results when repeated calls are detected, preventing infinite loops from stuck tool cycles (native_tool_loop.rb)
11
+ - **Preserve recent turns in context curator** — `preserve_recent_turns` setting (default: 2) prevents tool result distillation from the most recent N turns so the model retains full context of recent work (context/curator.rb)
12
+ - **Large JSON result summarization** — `summarize_result` now extracts top-level JSON keys from large results (>2000 chars) without full parsing, avoiding ParseError noise from truncated JSON (steps/tool_history.rb)
13
+
14
+ ### Changed
15
+ - **Default settings adjustments** — `tool_result_max_dispatch_chars`: 4000→10000, `default_temperature`: 1.0→0.9, `context_curation.tool_result_max_chars`: 2000→10000, `thinking_eviction`: true→false, `exchange_folding`: true→false, `target_context_tokens`: 40000→60000, `conversation.summarize_threshold`: 50000→90000, `conversation.target_tokens`: 20000→60000, `structured_output.retry_on_parse_failure`: true→false (settings.rb)
16
+ - **Router no longer short-circuits on tier/provider** — Tier and provider hints flow through rule matching with scoring bonuses instead of bypassing rules entirely. Fallback chain: rule match → explicit resolution → arbitrage (router.rb)
17
+ - **Always inject LegionIO tools** — Removed `client_tools_only?` optimization; LegionIO tools (special + extension) are always injected regardless of client passthrough settings. Client passthrough is handled by the tool loop which executes LegionIO tools server-side (executor.rb)
18
+ - **Server-side LegionIO tool execution in tool loop** — Tool calls are partitioned into server (LegionIO) and client (passthrough). Server tools execute in-place; client tools are returned without results. LegionIO tool results are populated from `@pending_tool_history` so translators see completed results and avoid `pause_turn` stop reasons (native_tool_loop.rb, steps/tool_calls.rb)
19
+ - **Stop reason logic for completed LegionIO tools** — LegionIO tools with results no longer trigger `:pause_turn`; only tools without results (pending execution) pause the turn (steps/tool_calls.rb)
20
+
21
+ ### Fixed
22
+ - **merge_defaults crash on nil intent** — Added safe navigation (`&.`) for `transform_keys`/`transform_values` on nil intent in `merge_defaults` (router.rb)
23
+ - **structured_output retry_enabled? nil dereference** — Changed `[]` chain to `.dig()` so the setting check survives when the structured_output subtree is absent (call/structured_output.rb)
24
+ - **Spec helper provider registration isolation** — Standard providers (anthropic, test, bedrock, openai, ollama, vllm, azure_foundry, gemini, xai) are now re-registered in `before(:each)` after `Registry.reset!` so router resolution works in every test (spec_helper.rb)
25
+
26
+ ## [0.12.13] - 2026-06-05
27
+
28
+ ### Added
29
+ - **Runtime caller class detection via caller_locations** — `Transport::Message#encode_message` now walks the call stack to find the class/module that initiated an AMQP publish and injects `runtime_caller_class` into the caller hash. Works for any caller (Legion::LLM::API::Namespaces::Anthropic::Messages, Legion::Gaia::*, etc.) — the class name is derived from the file path so it always matches the actual calling module (lib/legion/llm/transport/message.rb)
30
+
31
+ ## [0.12.12] - 2026-06-05
32
+
33
+ ### Added
34
+ - **Caller class and client detection** — `build_server_caller` now emits `runtime_caller_class` (codex, claude-code), `runtime_caller_client` (user-agent), `parent_request_ref` (Codex turn_id for ledger correlation), and `codex_turn_metadata` (parsed X_CODEX_TURN_METADATA JSON) in the caller hash so the ledger can populate caller identity, client attribution, and turn-level request correlation without DB queries at emit time (api/shared_helpers.rb)
35
+
36
+ ## [0.12.11] - 2026-06-05
37
+
38
+ ### Fixed
39
+ - **Client request ID and conversation threading** — `/v1/messages` and `/v1/responses` now read `X-Client-Request-Id` from HTTP headers as the request ID instead of always generating new ones. `conversation_id` now uses `Thread-Id` (Codex) or `X-Claude-Code-Session-Id` (Claude Code) so the ledger groups messages from the same client session into one conversation row instead of random UUIDs (api/namespaces/anthropic/messages.rb, api/namespaces/openai/responses.rb)
40
+
41
+ ## [0.12.10] - 2026-06-05
42
+
43
+ ### Fixed
44
+ - **OpenAI role normalization** — `:developer`, `:critic`, `:discriminator` roles now map to `:system` for non-OpenAI providers (Anthropic, vLLM, Ollama, Bedrock) so they don't raise `InvalidRoleError`. OpenAI messages preserve the original role because OpenAI natively supports all four. `normalize_role` runs at the adapter boundary per-provider (issue #147) (lib/legion/llm/call/lex_llm_adapter.rb)
45
+
46
+ ## [0.12.9] - 2026-06-05
47
+
48
+ ### Fixed
49
+ - **Escalation attempts now individually recorded** — Every escalation attempt emits its own metering and audit events so each provider attempt gets a separate `inference_response` row and `metric` row. Per-attempt events include messages, response content, thinking, tokens, and cost. `emit_error_audit` now includes request messages so the ledger captures `request_json` even on total escalation exhaustion (issue #147) (lib/legion/llm/inference/executor.rb)
50
+
51
+ ## [0.12.8] - 2026-06-05
52
+
53
+ ### Fixed
54
+ - **Async thread pool graceful shutdown** — `Legion::LLM.shutdown` now calls `shutdown` + `wait_for_termination(5)` on the executor's `ASYNC_THREAD_POOL` so background curation, reflection, and knowledge capture threads drain cleanly instead of being killed mid-operation (issue #143) (lib/legion/llm.rb)
55
+
56
+ ## [0.12.7] - 2026-06-05
57
+
58
+ ### Fixed
59
+ - **Metering events now include request messages and response content** — Fixes 86% empty `request_json` rows in `llm_message_inference_requests` by including `messages`, `response_content`, and `response_thinking` in the metering event payload so the ledger captures complete data on the first synchronous write (issue #146) (inference/steps/metering.rb, inference/executor.rb)
60
+
61
+ ## [0.12.6] - 2026-06-05
62
+
63
+ ### Added
64
+ - **Full thinking/reasoning support end-to-end** — Thinking blocks now flow through the entire pipeline: provider adapter accumulates thinking deltas and completed thinking, LexLLMAdapter extracts reasoning from Responses API output items, Anthropic translator emits thinking_content_block and redacted_thinking blocks, and the Response struct carries `thinking` through to all API formats. Anthropic streaming now emits `thinking_delta` and `signature_delta` events with proper content_block_start/stop lifecycle (api/namespaces/anthropic/messages.rb, api/translators/anthropic_response.rb, call/lex_llm_adapter.rb, inference/executor.rb)
65
+ - **Responses API thinking + tool calls** — LexLLMAdapter now accumulates thinking (`response.reasoning_text.delta`, `response.reasoning_summary_text.delta`) and tool calls (`response.function_call_arguments.delta`) during Responses API streaming. Hash (non-streaming) responses also extract thinking from reasoning output items and top-level `reasoning.text`. `responses_usage` captures `output_tokens_details.reasoning_tokens` (call/lex_llm_adapter.rb)
66
+ - **OpenAI Responses API namespace overhaul** — Rebuilt `/v1/responses` handler with full reasoning, tool call, thinking config, and provider routing support. Extracts `provider`/`tier`/`instance` from headers and body, builds proper routing hash, forwards `thinking` config to Request.build, and emits thinking blocks in streaming responses. Non-streaming path now returns thinking in the response payload (api/namespaces/openai/responses.rb, api/openai/responses.rb)
67
+ - **Shared API completion summary logging** — New `log_api_completion_summary` helper in `api/shared_helpers.rb` emits structured completion logs across all API handlers (Anthropic, OpenAI chat, OpenAI Responses, native chat, native inference) with provider, model, tier, tokens (input/output/cache/thinking), latency, tool calls, and stop_reason. Replaces ad-hoc logging in each handler (api/shared_helpers.rb, api/namespaces/anthropic/messages.rb, api/namespaces/openai/responses.rb, api/namespaces/native/chat.rb, api/namespaces/native/inference.rb, api/native/chat.rb)
68
+ - **`output_tokens_details` token breakdown** — `Usage` struct now carries `output_tokens_details` (reasoning_tokens). `NativeResponseAdapter` extracts it from provider usage hashes, and `extract_tokens` preserves it in the response tokens hash so downstream metering and API responses can report reasoning token counts (metering/usage.rb, call/dispatch.rb, inference/executor.rb)
69
+ - **Tool loop message persistence** — Intermediate assistant/tool messages generated during the native tool loop are now persisted to the conversation store. `persist_tool_loop_messages` stores the intermediate exchanges (tool_use + tool_result pairs) between the original inputs and the final assistant response. `tool_loop_final_tool_calls` extracts tool_calls from the loop's final message for the assistant response record (inference/executor.rb)
70
+ - **Client stream error detection** — New `client_stream_error?` predicate detects client-side disconnects (Puma::ConnectionError, EPIPE, closed IOError, ECONNRESET, ECONNABORTED) so escalation avoids retrying when the HTTP client has already disconnected (inference/executor.rb)
71
+ - **Tool source resolution from registry** — Client-shaped tool declarations are now reclassified as registry/extension tools when a matching entry exists in `Legion::Settings::Extensions.find_tool`. `request_tool_source` and `resolve_registry_tool_source` look up tools by name (including `raw_name` from sanitized LegionIO dot-notation like `legion.microsoft_teams_create_chain`) and reclassify the source to `:registry` or `:extension` type with proper tool_class, runner, and function metadata. `client_tools_only?` was rewritten to only return true when ALL tools are truly client-side (none resolved to registry) (inference/executor.rb)
72
+
73
+ ### Changed
74
+ - **Anthropic Messages API model no longer required** — `validate_anthropic_required!` no longer rejects requests without an explicit `model` field; the executor will auto-select a default. This matches the behavior expected by Claude Code and other clients that rely on server-side model resolution (api/namespaces/anthropic/messages.rb)
75
+ - **Settings defaults** — `gaia.advisory_enabled` defaults to `false`, `fleet.enabled` to `false`, `routing.escalation.enabled` to `false`, `routing.arbitrage.enabled` to `false`, `rag.enabled` to `false`, `knowledge_capture.enabled` to `false`. Context window reduced to `128000`, local tool limit reduced to `50` (settings.rb)
76
+ - **Empty response detection accounts for thinking** — Anthropic streaming no longer emits an overloaded_error when the provider returns thinking-only content (internal reasoning with no client-visible text). The error check now requires all three conditions: `tool_calls.empty? && full_text.empty? && full_thinking.empty?` (api/namespaces/anthropic/messages.rb)
77
+ - **Stream error resilience** — Anthropic and OpenAI Responses streaming handlers now track `stream_closed` state and handle EPIPE/Puma::ConnectionError gracefully, exiting the stream loop without blowing up Puma (api/namespaces/anthropic/messages.rb, api/namespaces/openai/responses.rb)
78
+ - **`provider_supports_responses?` fallback** — When the provider is not yet resolved, falls back to the request's routing hint so the decision can be made before pre-provider steps run (inference/executor.rb)
79
+ - **Rubocop directive cleanup** — Removed unnecessary `Metrics/BlockLength` and `Metrics/MethodLength` disable comments throughout API files that no longer trigger violations (api/namespaces/*/). Line-length directives removed where guard-clause style fixes the violation (inference/executor.rb)
80
+
81
+ ### Fixed
82
+ - **`Faraday::SSLError` not caught as provider_down** — SSL/TLS errors (certificate failures, handshake errors) were not handled in the provider_down rescue chain. Now `Faraday::SSLError` is caught alongside `ConnectionFailed` and `TimeoutError` in both `execute_provider_request_native` and `execute_provider_request_stream_native`, with audit status `provider_down` (inference/executor.rb)
83
+ - **`client_tools_only?` returned wrong result** — The original implementation checked if any tool had `type: :client`, which returned true for mixed client+registry tool sets. Rewritten to check that ALL tools are client-side passthrough (none resolved to registry/extension tool classes). This fixes registry tool injection being suppressed when client and server tools are mixed (inference/executor.rb)
84
+ - **Model resolution pins to unregistered remote providers** — `resolve_model_to_local_provider` now requires `Call::Registry.registered?(provider, instance: instance)` to pass before selecting a discovered model as healthy. Prevents pinning to providers like Anthropic when they only exist in the discovery cache (e.g. on a vLLM-only node) (inference/executor.rb)
85
+ - **Quality checker thinking handling** — `Quality::Checker` now properly handles responses where content is nil but thinking is present, avoiding false quality failures (quality/checker.rb)
86
+ - **RBAC step log level** — Downgraded RBAC step log from info to debug to reduce noise in production logs (inference/steps/rbac.rb)
87
+ - **Compatibility alias** — `NativeResponseAdapter` alias now points to the correct nested class under `Call::Dispatch` (compat.rb)
88
+ - **Knowledge capture step** — Handles nil thinking gracefully and respects the disabled default setting (inference/steps/knowledge_capture.rb)
89
+ - **Post response step** — Properly passes thinking through post-response processing (inference/steps/post_response.rb)
90
+ - **Trigger match step** — Now checks `ThinkingExtractor` availability before attempting extraction (inference/steps/trigger_match.rb)
91
+ - **Sticky persist step** — Fixed tool call extraction for responses with thinking (inference/steps/sticky_persist.rb)
92
+ - **Health tracker** — Removed unnecessary `Lint/DuplicateBranch` rubocop directive (router/health_tracker.rb)
93
+ - **Route attempts** — Fixed `route_attempts` tracking for SSLError failures (inference/route_attempts.rb)
94
+
95
+ ## [0.12.6] - 2026-06-05
96
+
97
+ ### Added
98
+ - **Full thinking/reasoning support end-to-end** — Thinking blocks now flow through the entire pipeline: provider adapters accumulate thinking deltas and completed thinking, LexLLMAdapter extracts reasoning from Responses API output items, Anthropic translator emits thinking_content_block and redacted_thinking blocks, and the Response struct carries `thinking` through to all API formats. Anthropic streaming now emits `thinking_delta` and `signature_delta` events with proper content_block_start/stop lifecycle (api/namespaces/anthropic/messages.rb, api/translators/anthropic_response.rb, call/lex_llm_adapter.rb, inference/executor.rb)
99
+ - **Responses API thinking + tool calls in streaming** — LexLLMAdapter now accumulates thinking (`response.reasoning_text.delta`, `response.reasoning_summary_text.delta`) and tool calls (`response.function_call_arguments.delta`) during Responses API streaming. Hash (non-streaming) responses also extract thinking from reasoning output items and top-level `reasoning.text`. `responses_usage` captures `output_tokens_details.reasoning_tokens` (call/lex_llm_adapter.rb)
100
+ - **OpenAI Responses API namespace overhaul** — Rebuilt `/v1/responses` handler with full reasoning, tool call, thinking config, and provider routing support. Extracts `provider`/`tier`/`instance` from headers and body, builds proper routing hash, forwards `thinking` config to Request.build, and emits thinking blocks in streaming responses. Non-streaming path now returns thinking in the response payload (api/namespaces/openai/responses.rb, api/openai/responses.rb)
101
+ - **Shared API completion summary logging** — New `log_api_completion_summary` helper in `api/shared_helpers.rb` emits structured completion logs across all API handlers (Anthropic, OpenAI chat, OpenAI Responses, native chat, native inference) with provider, model, tier, tokens (input/output/cache/thinking), latency, tool calls, and stop_reason. Replaces ad-hoc logging in each handler (api/shared_helpers.rb, api/namespaces/anthropic/messages.rb, api/namespaces/openai/responses.rb, api/namespaces/native/chat.rb, api/namespaces/native/inference.rb, api/native/chat.rb)
102
+ - **`output_tokens_details` token breakdown** — `Usage` struct now carries `output_tokens_details` (reasoning_tokens). `NativeResponseAdapter` extracts it from provider usage hashes, and `extract_tokens` preserves it in the response tokens hash so downstream metering and API responses can report reasoning token counts (metering/usage.rb, call/dispatch.rb, inference/executor.rb)
103
+ - **Tool loop message persistence** — Intermediate assistant/tool messages generated during the native tool loop are now persisted to the conversation store. `persist_tool_loop_messages` stores the intermediate exchanges (tool_use + tool_result pairs) between the original inputs and the final assistant response. `tool_loop_final_tool_calls` extracts tool_calls from the loop's final message for the assistant response record (inference/executor.rb)
104
+ - **Client stream error detection** — New `client_stream_error?` predicate detects client-side disconnects (Puma::ConnectionError, EPIPE, closed IOError, ECONNRESET, ECONNABORTED) so escalation avoids retrying when the HTTP client has already disconnected (inference/executor.rb)
105
+ - **Tool source resolution from registry** — Client-shaped tool declarations are now reclassified as registry/extension tools when a matching entry exists in `Legion::Settings::Extensions.find_tool`. `request_tool_source` and `resolve_registry_tool_source` look up tools by name (including `raw_name` from sanitized LegionIO dot-notation like `legion.microsoft_teams_create_chain`) and reclassify the source to `:registry` or `:extension` type with proper tool_class, runner, and function metadata. `client_tools_only?` was rewritten to only return true when ALL tools are truly client-side (none resolved to registry) (inference/executor.rb)
106
+
107
+ ### Changed
108
+ - **Anthropic Messages API model no longer required** — `validate_anthropic_required!` no longer rejects requests without an explicit `model` field; the executor will auto-select a default. This matches the behavior expected by Claude Code and other clients that rely on server-side model resolution (api/namespaces/anthropic/messages.rb)
109
+ - **Settings defaults** — `gaia.advisory_enabled` defaults to `false`, `fleet.enabled` to `false`, `routing.escalation.enabled` to `false`, `routing.arbitrage.enabled` to `false`, `rag.enabled` to `false`, `knowledge_capture.enabled` to `false`. Context window reduced to `128000`, local tool limit reduced to `50` (settings.rb)
110
+ - **Empty response detection accounts for thinking** — Anthropic streaming no longer emits an overloaded_error when the provider returns thinking-only content (internal reasoning with no client-visible text). The error check now requires `tool_calls.empty? && full_text.empty? && full_thinking.empty?` (api/namespaces/anthropic/messages.rb)
111
+ - **Stream error resilience** — Anthropic and OpenAI Responses streaming handlers now track `stream_closed` state and handle EPIPE/Puma::ConnectionError gracefully, exiting the stream loop without blowing up Puma (api/namespaces/anthropic/messages.rb, api/namespaces/openai/responses.rb)
112
+ - **`provider_supports_responses?` fallback** — When the provider is not yet resolved, falls back to the request's routing hint so the decision can be made before pre-provider steps run (inference/executor.rb)
113
+ - **Rubocop directive cleanup** — Removed unnecessary `Metrics/BlockLength` and `Metrics/MethodLength` disable comments throughout API files that no longer trigger violations (api/namespaces/*/). Line-length directives removed where guard-clause style fixes the violation (inference/executor.rb)
114
+
115
+ ### Fixed
116
+ - **`Faraday::SSLError` not caught as provider_down** — SSL/TLS errors (certificate failures, handshake errors) were not handled in the provider_down rescue chain. Now `Faraday::SSLError` is caught alongside `ConnectionFailed` and `TimeoutError` in both `execute_provider_request_native` and `execute_provider_request_stream_native`, with audit status `provider_down` (inference/executor.rb)
117
+ - **`client_tools_only?` returned wrong result** — The original implementation checked if any tool had `type: :client`, which returned true for mixed client+registry tool sets. Rewritten to check that ALL tools are client-side passthrough (none resolved to registry/extension tool classes). This fixes registry tool injection being suppressed when client and server tools are mixed (inference/executor.rb)
118
+ - **Model resolution pins to unregistered remote providers** — `resolve_model_to_local_provider` now requires `Call::Registry.registered?(provider, instance: instance)` to pass before selecting a discovered model as healthy. Prevents pinning to providers like Anthropic when they only exist in the discovery cache (e.g. on a vLLM-only node) (inference/executor.rb)
119
+ - **Quality checker thinking handling** — `Quality::Checker` now properly handles responses where content is nil but thinking is present, avoiding false quality failures (quality/checker.rb)
120
+ - **RBAC step log level** — Downgraded RBAC step log from info to debug to reduce noise in production logs (inference/steps/rbac.rb)
121
+ - **Compatibility alias** — `NativeResponseAdapter` alias now points to the correct nested class under `Call::Dispatch` (compat.rb)
122
+ - **Knowledge capture step** — Handles nil thinking gracefully and respects the disabled default setting (inference/steps/knowledge_capture.rb)
123
+ - **Post response step** — Properly passes thinking through post-response processing (inference/steps/post_response.rb)
124
+ - **Trigger match step** — Now checks `ThinkingExtractor` availability before attempting extraction (inference/steps/trigger_match.rb)
125
+ - **Sticky persist step** — Fixed tool call extraction for responses with thinking (inference/steps/sticky_persist.rb)
126
+ - **Route attempts** — Fixed `route_attempts` tracking for SSLError failures (inference/route_attempts.rb)
127
+ - **Health tracker** — Removed unnecessary `Lint/DuplicateBranch` rubocop directive (router/health_tracker.rb)
128
+
129
+ ## [0.12.5] - 2026-06-04
130
+
131
+ ### Fixed
132
+ - **dispatch_extension fails when source uses :extension key** — `dispatch_extension` only checked `source[:lex]`, but `check_registry_override` and other callers sometimes set `:extension` instead. Now falls back to `source[:extension]` when `:lex` is absent (tools/dispatcher.rb)
133
+ - **dispatch_client attempted server-side execution of client tools** — Client tools (Bash, Read, etc.) now always return `:passthrough` status instead of attempting server-side execution via `ClientToolMethods`. LegionIO should never execute client tools; that's the client's responsibility (tools/dispatcher.rb)
134
+ - **OpenAI Responses API ignores explicit provider routing** — Both namespace (`api/namespaces/openai/responses.rb`) and legacy (`api/openai/responses.rb`) handlers only passed `{ model: model }` in the routing hash, dropping `HTTP_X_LEGION_PROVIDER`, `HTTP_X_LEGION_TIER`, and `HTTP_X_LEGION_INSTANCE` headers. The handlers now extract these headers and body fields, build a proper routing hash with provider/instance/model, and pass tier via `Request.extra[:tier]` to match the Anthropic Messages handler behavior.
135
+ - **Anthropic Messages API drops thinking config** — Both namespace (`api/namespaces/anthropic/messages.rb`) and legacy (`api/anthropic/messages.rb`) handlers never forwarded `body[:thinking]` to `Request.build`. The value is now passed through unchanged so all providers (Anthropic, Bedrock, etc.) receive the original thinking config. The Anthropic provider now handles both `:budget_tokens` and `:budget` keys for compatibility.
136
+
137
+ ## [0.12.4] - 2026-06-04
138
+
139
+ ### Fixed
140
+ - **Forced tool choice arguments emitted as text instead of structured tool_use** — When a tool choice is forced (e.g. vLLM/qwen with explicit tool name in user text), the provider may output tool arguments as plain text JSON (`{"file_path": "..."}`) in `delta['content']` instead of structured `delta['tool_calls']`. Added `maybe_synthesize_tool_call_from_content` to both `execute_native_tool_loop` and `execute_native_streaming_tool_loop`: when tool calls are empty but content is valid JSON and a tool choice is forced, parse the text and create a proper tool call. Also added `text_looks_like_tool_json?` guard to Anthropic and OpenAI response translators to skip emitting JSON blobs as text deltas. Client sees native tool_use/function_call blocks instead of raw JSON text (inference/native_tool_loop.rb, api/translators/anthropic_response.rb, api/translators/openai_response.rb, api/anthropic/messages.rb, api/openai/chat_completions.rb)
141
+ - **Model discovery pins to unregistered remote providers** — `resolve_model_to_local_provider` found a model in the discovery cache and pinned it to a provider that isn't registered locally (e.g. `claude-haiku-4-5-20251001` → Anthropic on a vLLM-only node). Added `Call::Registry.registered?(provider, instance: instance)` check to the healthy candidate filter so unregistered providers are skipped and the request falls through to `auto_route` (inference/executor.rb)
142
+ - **`client_tools_only?` method missing** — Restored `client_tools_only?` in executor to guard explicit tool choice forcing for client passthrough requests (inference/executor.rb, inference/native_tool_loop.rb)
143
+
3
144
  ## [0.12.3] - 2026-06-02
4
145
 
5
146
  ### Fixed
@@ -13,10 +13,10 @@ module Legion
13
13
  module Messages
14
14
  extend Legion::Logging::Helper
15
15
 
16
- def self.registered(app) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize
16
+ def self.registered(app) # rubocop:disable Metrics/AbcSize
17
17
  log.debug('[llm][api][anthropic][messages] registering POST /v1/messages')
18
18
 
19
- app.post '/v1/messages' do # rubocop:disable Metrics/BlockLength
19
+ app.post '/v1/messages' do
20
20
  require_llm!
21
21
 
22
22
  body = parse_request_body
@@ -40,6 +40,7 @@ module Legion
40
40
  tools: build_tool_classes(normalized[:tools] || []),
41
41
  caller: build_server_caller(source: 'anthropic_compat', path: request.path, env: env),
42
42
  stream: streaming,
43
+ thinking: body[:thinking],
43
44
  cache: { strategy: :default, cacheable: true }
44
45
  )
45
46
 
@@ -54,6 +55,7 @@ module Legion
54
55
 
55
56
  stream do |out|
56
57
  full_text = +''
58
+ text_delta_lines = [] # buffer in-stream deltas until we know if tool calls exist
57
59
 
58
60
  pipeline_response = executor.call_stream do |chunk|
59
61
  text = chunk.respond_to?(:content) ? chunk.content.to_s : chunk.to_s
@@ -61,7 +63,7 @@ module Legion
61
63
 
62
64
  full_text << text
63
65
  delta_event = Legion::LLM::API::Translators::AnthropicResponse.format_chunk(text)
64
- out << "event: content_block_delta\ndata: #{Legion::JSON.dump(delta_event)}\n\n"
66
+ text_delta_lines << "event: content_block_delta\ndata: #{Legion::JSON.dump(delta_event)}\n\n"
65
67
  end
66
68
 
67
69
  events = Legion::LLM::API::Translators::AnthropicResponse.streaming_events(
@@ -71,6 +73,16 @@ module Legion
71
73
  full_text: full_text
72
74
  )
73
75
 
76
+ # If tool calls are present and the text is just JSON arguments,
77
+ # suppress the in-stream text deltas so the client only sees
78
+ # tool_use content blocks — not text deltas of raw JSON.
79
+ if Legion::LLM::API::Translators::AnthropicResponse.text_looks_like_tool_json?(full_text)
80
+ pipeline_tools = pipeline_response.respond_to?(:tools) ? Array(pipeline_response.tools) : []
81
+ text_delta_lines.clear if pipeline_tools.any?
82
+ end
83
+
84
+ text_delta_lines.each { |line| out << line }
85
+
74
86
  events.each do |event_name, payload|
75
87
  next if event_name == 'content_block_delta'
76
88
 
@@ -167,7 +167,6 @@ module Legion
167
167
  anthropic_error('api_error', e.message, status_code: 500)
168
168
  end
169
169
 
170
- # rubocop:disable Metrics/BlockLength
171
170
  helpers do
172
171
  # ── Filename / MIME extraction ──────────────────────────────────
173
172
 
@@ -285,8 +284,6 @@ module Legion
285
284
  sorted_list
286
285
  end
287
286
  end
288
- # rubocop:enable Metrics/BlockLength
289
-
290
287
  # Thread-safe metadata store, re-initialized cleanly on each Sinatra::Extension register.
291
288
  def self.metadata_store
292
289
  @metadata_store ||= Concurrent::Map.new