legion-llm 0.8.49 → 0.9.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +117 -0
  3. data/README.md +17 -14
  4. data/legion-llm.gemspec +2 -1
  5. data/lib/legion/llm/api/native/helpers.rb +1 -1
  6. data/lib/legion/llm/api/native/inference.rb +25 -18
  7. data/lib/legion/llm/api/native/providers.rb +2 -1
  8. data/lib/legion/llm/api/openai/embeddings.rb +2 -1
  9. data/lib/legion/llm/api/translators/anthropic_request.rb +1 -1
  10. data/lib/legion/llm/api/translators/openai_response.rb +12 -2
  11. data/lib/legion/llm/audit.rb +6 -6
  12. data/lib/legion/llm/call/daemon_client.rb +29 -16
  13. data/lib/legion/llm/call/dispatch.rb +85 -6
  14. data/lib/legion/llm/call/lex_llm_adapter.rb +103 -6
  15. data/lib/legion/llm/call/providers.rb +69 -9
  16. data/lib/legion/llm/call/structured_output.rb +32 -2
  17. data/lib/legion/llm/caller_identity.rb +92 -0
  18. data/lib/legion/llm/context/compressor.rb +15 -8
  19. data/lib/legion/llm/context/curator.rb +219 -11
  20. data/lib/legion/llm/discovery/rule_generator.rb +2 -1
  21. data/lib/legion/llm/discovery.rb +35 -9
  22. data/lib/legion/llm/fleet/dispatcher.rb +164 -93
  23. data/lib/legion/llm/fleet/handler.rb +187 -220
  24. data/lib/legion/llm/fleet/provider_responder.rb +12 -0
  25. data/lib/legion/llm/fleet/reply_dispatcher.rb +79 -31
  26. data/lib/legion/llm/fleet/token_issuer.rb +122 -0
  27. data/lib/legion/llm/fleet/token_validator.rb +13 -0
  28. data/lib/legion/llm/fleet/worker_execution.rb +11 -0
  29. data/lib/legion/llm/fleet.rb +13 -6
  30. data/lib/legion/llm/hooks/metering.rb +2 -10
  31. data/lib/legion/llm/inference/audit_publisher.rb +24 -31
  32. data/lib/legion/llm/inference/conversation.rb +63 -9
  33. data/lib/legion/llm/inference/enrichment_injector.rb +15 -0
  34. data/lib/legion/llm/inference/executor.rb +312 -82
  35. data/lib/legion/llm/inference/profile.rb +5 -4
  36. data/lib/legion/llm/inference/route_attempts.rb +185 -0
  37. data/lib/legion/llm/inference/steps/billing.rb +23 -2
  38. data/lib/legion/llm/inference/steps/classification.rb +64 -10
  39. data/lib/legion/llm/inference/steps/confidence_scoring.rb +61 -1
  40. data/lib/legion/llm/inference/steps/debate.rb +138 -49
  41. data/lib/legion/llm/inference/steps/gaia_advisory.rb +114 -9
  42. data/lib/legion/llm/inference/steps/knowledge_capture.rb +18 -2
  43. data/lib/legion/llm/inference/steps/logging.rb +73 -0
  44. data/lib/legion/llm/inference/steps/mcp_discovery.rb +19 -5
  45. data/lib/legion/llm/inference/steps/metering.rb +38 -12
  46. data/lib/legion/llm/inference/steps/post_response.rb +23 -1
  47. data/lib/legion/llm/inference/steps/prompt_cache.rb +32 -8
  48. data/lib/legion/llm/inference/steps/rag_context.rb +137 -7
  49. data/lib/legion/llm/inference/steps/rag_guard.rb +61 -6
  50. data/lib/legion/llm/inference/steps/skill_injector.rb +56 -11
  51. data/lib/legion/llm/inference/steps/span_annotator.rb +5 -1
  52. data/lib/legion/llm/inference/steps/sticky_persist.rb +36 -2
  53. data/lib/legion/llm/inference/steps/sticky_runners.rb +14 -1
  54. data/lib/legion/llm/inference/steps/tier_assigner.rb +16 -8
  55. data/lib/legion/llm/inference/steps/token_budget.rb +29 -4
  56. data/lib/legion/llm/inference/steps/tool_calls.rb +46 -18
  57. data/lib/legion/llm/inference/steps/tool_discovery.rb +21 -5
  58. data/lib/legion/llm/inference/steps/tool_history.rb +15 -2
  59. data/lib/legion/llm/inference/steps/trigger_match.rb +25 -5
  60. data/lib/legion/llm/inference/steps.rb +1 -0
  61. data/lib/legion/llm/inference.rb +14 -7
  62. data/lib/legion/llm/inventory.rb +24 -12
  63. data/lib/legion/llm/metering.rb +29 -4
  64. data/lib/legion/llm/router/arbitrage.rb +32 -9
  65. data/lib/legion/llm/router/health_tracker.rb +5 -3
  66. data/lib/legion/llm/router.rb +30 -41
  67. data/lib/legion/llm/settings.rb +125 -34
  68. data/lib/legion/llm/tools/confidence.rb +23 -8
  69. data/lib/legion/llm/tools/dispatcher.rb +3 -3
  70. data/lib/legion/llm/transport/message.rb +104 -7
  71. data/lib/legion/llm/transport/messages/escalation_event.rb +4 -0
  72. data/lib/legion/llm/transport/messages/fleet_error.rb +5 -59
  73. data/lib/legion/llm/transport/messages/fleet_request.rb +5 -63
  74. data/lib/legion/llm/transport/messages/fleet_response.rb +5 -45
  75. data/lib/legion/llm/transport/messages/prompt_event.rb +11 -3
  76. data/lib/legion/llm/types/message.rb +34 -3
  77. data/lib/legion/llm/version.rb +1 -1
  78. data/lib/legion/llm.rb +1 -0
  79. metadata +24 -4
  80. data/lib/legion/llm/router/gateway_interceptor.rb +0 -68
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 88524bf330ed22e6ac366d35ddccdbad4557b115c447d87d238ab2670ca8da7e
4
- data.tar.gz: 68c0470f4c10ee885bd28d0486f487777ae3c9dc35ab0fea3cc4ab860c68a0c6
3
+ metadata.gz: 6a7ea4bbe972c340ca3ccaa0cf2478724c90bb8385f1d8129a21fc74981a4380
4
+ data.tar.gz: a7a8250e8c84835b3bf2ffefe9cdc25494b9be7deae4cb16375bd8bfa2d8dcf2
5
5
  SHA512:
6
- metadata.gz: ba9ad2293c9e65db838aca3921ba291ef3bfd71798fc044bfdcdb90f3a04e0536fa5c562f00ea3e06eda88c41ee1659d23ef46382c69424940b4ff058f5cd978
7
- data.tar.gz: c7feaf54a2a618eb5d45741269ba65d855bb478142a3b90702d29de9d23f3bbceadca26b5eb765bedf6c2476b97901dc59cc597242051d9500df9733174532f4
6
+ metadata.gz: be0a896d160b6824760d0361bb82e8e320f312fc23e8e9156cf9dfd924f3d264a48ff71bce0531b29f13913bf367c7eeca732cff477adc1c54310a4f4cbcce38
7
+ data.tar.gz: 210706b370e355606f342e506108b4ac13cb0e663c8203fe7b09afdeb643dba4e1ade3e5cb8e13b0efe4238b33656a73f7f57e74687413a6b0914821a31b0039
data/CHANGELOG.md CHANGED
@@ -1,5 +1,122 @@
1
1
  # Legion LLM Changelog
2
2
 
3
+ ## [0.9.9] - 2026-05-07
4
+
5
+ ### Fixed
6
+ - Initialized sticky-state persistence reads explicitly on cache misses to satisfy static analysis without changing runtime behavior.
7
+ - Consume GAIA advisory `tool_hint`, `suppress`, and `context_window` data when building native tool definitions and sizing RAG retrieval, so GAIA advisory outputs affect provider calls instead of only appearing in enrichment summaries.
8
+
9
+ ## [0.9.8] - 2026-05-06
10
+
11
+ ### Fixed
12
+ - Fixed Anthropic Messages request normalization for multi-block content.
13
+ - Marked API-submitted client tools as non-executable server-side while preserving trusted registry/deferred tool injection.
14
+ - Preferred explicit settings overrides over registry MCP overrides.
15
+ - Added structured-output parse retries through alternate routes when an escalation route is available.
16
+ - Added bounded metering spool writes, disabled-vs-dropped audit results, and warning logs for invalid settings paths and invalid inventory offerings.
17
+ - Preserved non-text message content for audit/persistence and extracted string-keyed text blocks correctly.
18
+ - Added trackable unknown caller identity defaults for audit/transport envelopes.
19
+ - Used provider embedding usage tokens when formatting OpenAI-compatible embedding responses.
20
+ - Serialized daemon-client cached state behind a shared mutex.
21
+ - Added escalation transport exchange binding for escalation events.
22
+
23
+ ## [0.9.7] - 2026-05-06
24
+
25
+ ### Fixed
26
+ - Enabled arbitrage by default, added zero-cost local/fleet offering coverage, and removed the dead `Quality::Checker.model_score` dependency from arbitrage eligibility.
27
+ - Skipped debate when fewer than two distinct models are available and captured judge evaluation/confidence separately from the final answer.
28
+ - Reused the shared context token estimator for billing preflight cost checks.
29
+ - Logged dropped metering emissions at the inference metering call site.
30
+ - Added resolved model metadata to routing span attributes.
31
+ - Stopped treating standalone email addresses as PII by default while preserving contextual and opt-in email detection.
32
+ - Allowed context curation to choose vLLM/MLX local or fleet models for LLM-assisted summarization.
33
+ - Routed compressor summarization through standard low-cost/basic intent when no explicit compressor model is configured.
34
+
35
+ ## [0.9.6] - 2026-05-06
36
+
37
+ ### Fixed
38
+ - Enforced privacy classification as a forced local routing constraint, even when callers request a cloud tier.
39
+ - Made RAG faithfulness failures block caller-visible responses by default and record structured audit data.
40
+ - Archived dropped conversation turns into Apollo with conversation-scoped tags and retrieved archived history during RAG context loading.
41
+ - Reported escalation quality failures and low-confidence responses to `Router::HealthTracker` as quality signals.
42
+ - Persisted conversation sticky state through the database path and restored it after in-memory LRU eviction.
43
+ - Counted pending conversation-history and RAG enrichments in token-budget checks before provider dispatch.
44
+ - Averaged provider health priority adjustments across instances while preserving worst-state circuit reporting.
45
+
46
+ ## [0.9.5] - 2026-05-06
47
+
48
+ ### Fixed
49
+ - Fixed context curator cache invalidation and stored curated-summary replay so compacted messages are used on later turns.
50
+ - Persisted curation marker records even when a pass does not rewrite individual messages, allowing structural curation to run for short-message turns.
51
+ - Fixed compressor LLM summarization to call `Legion::LLM.chat_direct` with a prompt message instead of the obsolete session-style API.
52
+ - Warn and omit misleading zero-dollar cost estimates when provider usage metadata collapses to zero tokens for a known model.
53
+
54
+ ## [0.9.4] - 2026-05-06
55
+
56
+ ### Changed
57
+ - Added shared inference step logging helpers and debug-level step enter/complete/failure logs.
58
+ - Added safe debug/info instrumentation across inference steps for routing actions, enrichment decisions, tool handling, RAG, skill injection, sticky runner state, billing, classification, debate, post-response audit, and metering emission.
59
+
60
+ ## [0.9.3] - 2026-05-06
61
+
62
+ ### Changed
63
+ - Delegated responder-side fleet provider execution, token validation, and provider response publishing to the shared `lex-llm` fleet helpers.
64
+ - Kept `legion-llm` as the request-side fleet dispatcher and token issuer while retaining compatibility aliases for old responder constants.
65
+ - Bumped the `lex-llm` dependency floor to `>= 0.4.3` for shared responder execution helpers.
66
+
67
+ ## [0.9.2] - 2026-05-06
68
+
69
+ ### Fixed
70
+ - Prefer namespaced caller ids over ambiguous display identities when publishing audit, metering, and transport identity metadata.
71
+
72
+ ## [0.9.1] - 2026-05-06
73
+
74
+ ### Changed
75
+ - `legion-llm` now owns lex-llm provider registration by scanning loaded provider modules, constructing `LexLLMAdapter` instances, and writing `Call::Registry`.
76
+ - Provider rediscovery now rebuilds registry entries after `Call::Registry.reset!`, supporting LegionIO reload/hot-update flows without relying on provider require-time side effects.
77
+ - Bumped the `lex-llm` dependency floor to `>= 0.4.1` for pure provider discovery and alias metadata.
78
+
79
+ ### Fixed
80
+ - Preserve streaming thinking chunks that providers emit as plain strings.
81
+ - Normalize discovered provider instance ids from provider offerings before memory-gate and availability checks.
82
+ - Use canonical offering aliases for metering cost estimates and fall back to caller agent metadata for fleet context.
83
+ - Preserve extension, string, and top-level caller identity metadata in audit events and transport headers.
84
+
85
+ ## [0.9.0] - 2026-05-06
86
+
87
+ ### Changed
88
+ - Added shared provider-owned fleet responder execution support for lex-llm provider gems.
89
+ - Moved fleet dispatch defaults to top-level `fleet.dispatch`, removed legacy gateway defaults, and rejected `routing.use_fleet` / `openai_compat.gateways` settings during validation.
90
+ - OpenAI-compatible routing now resolves through registered `lex-llm-openai` provider instances instead of gateway interceptor configuration.
91
+ - Native dispatch now routes chat, stream, embed, image, and health calls through the canonical lex-llm provider-instance adapter contract.
92
+ - Native inference now records direct/fleet route attempts with dispatch path, idempotency key, selected lane, failure reason, and escalation context.
93
+ - Native inference and `/api/llm/inference` now strip provider thinking from caller-visible content and expose thinking only through explicit diagnostic fields/events.
94
+ - Inventory and provider/model API reads now use cached discovery and non-live provider offerings, so explicit discovery refresh remains the only path that probes provider endpoints.
95
+ - Fleet dispatch now publishes shared lex-llm protocol-v2 envelopes with canonical `operation`, `request_id`, `correlation_id`, `idempotency_key`, signed tokens, and strict reply matching.
96
+ - Fleet worker handling now validates protocol-v2 envelopes, enforces token/idempotency policy, dispatches local providers through canonical lex-llm methods, and publishes shared lex-llm response/error envelopes.
97
+ - Bumped dependency floors to `lex-llm >= 0.4.0` and `legion-transport >= 1.4.14` for shared provider contracts and fleet envelopes.
98
+
99
+ ### Removed
100
+ - Removed the gateway interceptor runtime path and gateway metering fallback.
101
+ - Retired `Legion::LLM::Transport::Messages::FleetRequest`, `FleetResponse`, and `FleetError` as fleet message authorities in favor of `Legion::Extensions::Llm::Transport::Messages::*`.
102
+
103
+ ## [0.8.51] - 2026-05-03
104
+
105
+ ### Changed
106
+ - Native `/api/llm/inference` streaming hides provider thinking deltas by default, with `include_thinking: true` as the explicit diagnostic opt-in.
107
+ - Pipeline metering events now carry wall-clock latency, estimated cost, conversation/correlation ids, billing, task, agent, identity, and routing context.
108
+ - LLM transport messages now preserve caller identity, credential, and caller type headers from nested caller metadata, top-level identity metadata, and extension callers.
109
+
110
+ ### Fixed
111
+ - Prompt audit events now include provider response thinking separately from assistant response content.
112
+ - Native lex-llm dispatch now carries provider thinking separately from response content before API responses are emitted.
113
+ - Native discovery now normalizes provider offering objects before generating routing candidates, preserving provider instance, tier, capabilities, context length, and parameter metadata.
114
+
115
+ ## [0.8.50] - 2026-05-03
116
+
117
+ ### Fixed
118
+ - Native discovery now normalizes lex-llm `ModelOffering` objects before generating routing candidates, allowing auto-rules to populate from provider adapters again.
119
+
3
120
  ## [0.8.49] - 2026-04-29
4
121
 
5
122
  ### Changed
data/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  LLM routing and provider orchestration for the [LegionIO](https://github.com/LegionIO/LegionIO) framework. Routes chat, embeddings, tool use, fleet dispatch, auditing, and provider metadata through Legion-native `lex-llm-*` provider extensions.
4
4
 
5
- **Version**: 0.8.49
5
+ **Version**: 0.9.0
6
6
 
7
7
  ## Installation
8
8
 
@@ -60,7 +60,7 @@ Requests flow through the full Inference pipeline — routing, metering, audit,
60
60
  Both formats supported with correct SSE shapes:
61
61
  - **OpenAI**: `data: {"choices":[{"delta":{"content":"..."}}]}` chunks, terminated by `data: [DONE]`
62
62
  - **Anthropic**: Typed events — `message_start`, `content_block_start`, `content_block_delta`, `content_block_stop`, `message_delta`, `message_stop`
63
- - **Native**: `/api/llm/inference` streams `text-delta`, `thinking-delta`, tool lifecycle events, and a final `done` event. Structured provider content blocks are flattened to plain text in both streaming and non-streaming native responses so `content` remains a string for daemon clients.
63
+ - **Native**: `/api/llm/inference` streams `text-delta`, optional `thinking-delta` events when `include_thinking: true`, tool lifecycle events, and a final `done` event. Structured provider content blocks are flattened to plain text in both streaming and non-streaming native responses so `content` remains a string for daemon clients.
64
64
 
65
65
  ### API Authentication
66
66
 
@@ -122,7 +122,7 @@ Credentials are resolved automatically by the universal secret resolver in `legi
122
122
 
123
123
  ### Provider Extensions (lex-llm-*)
124
124
 
125
- Each provider is a standalone `lex-llm-*` gem that ships its own `default_settings`, model catalog, and capability declarations. The provider registers itself with `legion-llm` at load time. Provider gems implement:
125
+ Each provider is a standalone `lex-llm-*` gem that ships its own `default_settings`, model catalog, capability declarations, and optional provider-owned fleet worker actor. When a provider gem is loaded, `legion-llm` discovers it through the shared `lex-llm` provider contract and registers provider instances for routing. Provider gems implement:
126
126
 
127
127
  - **`default_settings`** -- Connection defaults (base_url, region, API key env vars)
128
128
  - **`model_allowed?(model_name)`** -- Provider-level model filtering
@@ -370,9 +370,9 @@ Legion::LLM (lib/legion/llm.rb) # Thin facade — delegates to Inferenc
370
370
  │ ├── Arbitrage # Cost-aware model selection when no rules match
371
371
  │ └── Escalation/
372
372
  │ └── History # EscalationHistory mixin
373
- ├── Fleet # Fleet RPC dispatch over AMQP (built-in)
373
+ ├── Fleet # Fleet dispatch over AMQP; provider responders live in lex-llm-* gems
374
374
  │ ├── Dispatcher # Fleet RPC dispatch with routing key building, per-type timeouts
375
- │ ├── Handler # Fleet request handler for GPU worker nodes
375
+ │ ├── TokenIssuer # Request-side JWT minting for provider-owned responders
376
376
  │ └── ReplyDispatcher # Correlation-based reply routing
377
377
  ├── API # All external HTTP interfaces
378
378
  │ ├── Auth # Config-driven Bearer/x-api-key auth for /v1/ routes
@@ -392,10 +392,10 @@ Legion::LLM (lib/legion/llm.rb) # Thin facade — delegates to Inferenc
392
392
  │ ├── OpenAIRequest / OpenAIResponse
393
393
  │ └── AnthropicRequest / AnthropicResponse
394
394
  ├── Audit # Prompt, tool, and skill audit event emission
395
- ├── Transport # Centralized AMQP exchange and message definitions
395
+ ├── Transport # Centralized AMQP exchange and non-fleet message definitions
396
396
  │ ├── Message # LLM base message: context propagation, LLM headers
397
397
  │ ├── Exchanges/ # Fleet, Metering, Audit, Escalation
398
- │ └── Messages/ # FleetRequest, FleetResponse, FleetError, MeteringEvent, etc.
398
+ │ └── Messages/ # MeteringEvent, prompt/tool audit, escalation, and compatibility wrappers
399
399
  ├── Scheduling # Deferred execution
400
400
  │ ├── Batch # Non-urgent request batching with priority queue and auto-flush
401
401
  │ └── OffPeak # Peak-hour deferral
@@ -497,8 +497,8 @@ legion-llm includes a dynamic weighted routing engine that dispatches requests a
497
497
  │ Tier 1: LOCAL → Ollama on this machine (direct HTTP) │
498
498
  │ Zero network overhead, no Transport │
499
499
  │ │
500
- │ Tier 2: FLEET → vLLM/Ollama on GPU workers
501
- Built-in Fleet RPC over AMQP
500
+ │ Tier 2: FLEET → provider-owned lex-llm-* responders
501
+ Shared lex-llm fleet envelopes over AMQP
502
502
  │ │
503
503
  │ Tier 3: CLOUD → Bedrock / Azure / Gemini │
504
504
  │ Tier 4: FRONTIER → Anthropic / OpenAI │
@@ -509,12 +509,12 @@ legion-llm includes a dynamic weighted routing engine that dispatches requests a
509
509
  | Tier | Target | Use Case |
510
510
  |------|--------|----------|
511
511
  | `local` | Ollama on localhost | Privacy-sensitive, offline, or low-latency workloads |
512
- | `fleet` | Shared hardware via built-in Fleet dispatcher (AMQP) | Larger vLLM/Ollama models on dedicated GPU servers |
513
- | `openai_compat` | OpenAI-compatible gateways | Self-hosted or proxy endpoints with OpenAI-compatible APIs |
512
+ | `fleet` | Shared hardware via provider-owned lex-llm responders over AMQP | Larger vLLM/Ollama models on dedicated GPU servers |
513
+ | `openai_compat` | OpenAI-compatible provider instances | Self-hosted or proxy endpoints with OpenAI-compatible APIs |
514
514
  | `cloud` | API providers (Bedrock, Azure, Gemini) | Managed cloud inference |
515
515
  | `frontier` | API providers (Anthropic, OpenAI) | Frontier models, full-capability inference |
516
516
 
517
- Fleet dispatch is built into legion-llm. The `Fleet::Dispatcher` publishes shared-lane requests to keys such as `llm.fleet.inference.qwen3-6-27b.ctx32000` or `llm.fleet.embed.nomic-embed-text`; `Fleet::Handler` processes them on GPU worker nodes and replies through correlated live responses. Keep `routing.tiers.fleet.routing_style` set to `shared_lane` for the default pooled model lanes, set it to `offering_lane` for exact provider-instance lanes such as `llm.fleet.offering.vllm-gpu-01.qwen3-6.inference`, or use any other value only for the legacy `llm.request.{provider}.{type}.{model}` keys.
517
+ Fleet dispatch is built into `legion-llm`, but fleet consumption is provider-owned. `Fleet::Dispatcher` publishes shared `lex-llm` protocol-v2 `FleetRequest` envelopes to keys such as `llm.fleet.inference.qwen3-6-27b.ctx32000` or `llm.fleet.embed.nomic-embed-text`; the enabled provider gem actor consumes the request, validates the signed token and idempotency key through `Legion::Extensions::Llm::Fleet::ProviderResponder`, calls its local provider instance through the canonical `lex-llm` provider methods, and replies with shared `FleetResponse` or `FleetError` envelopes. Keep `routing.tiers.fleet.routing_style` set to `shared_lane` for the default pooled model lanes, or set it to `offering_lane` for exact provider-instance lanes such as `llm.fleet.offering.vllm-gpu-01.qwen3-6.inference`.
518
518
 
519
519
  #### Intent-Based Dispatch
520
520
 
@@ -585,7 +585,7 @@ Add routing configuration under the `llm` key:
585
585
  "timeout_seconds": 30,
586
586
  "timeouts": { "embed": 10, "chat": 30, "generate": 30, "default": 30 }
587
587
  },
588
- "openai_compat": { "gateways": [] },
588
+ "openai_compat": { "providers": ["openai"] },
589
589
  "cloud": { "providers": ["bedrock", "azure", "gemini"] },
590
590
  "frontier": { "providers": ["anthropic", "openai"] }
591
591
  },
@@ -853,6 +853,8 @@ Legion::Service#initialize
853
853
  load_extensions # LEX extensions (can use LLM if available)
854
854
  ```
855
855
 
856
+ LegionIO hosts these routes through `mount_library_routes('llm', Routes::Llm, 'Legion::LLM::Routes')`. The route modules remain owned by `legion-llm`; LegionIO no longer registers provider gateway fallback routes when the library is available.
857
+
856
858
  - **Service**: `setup_llm` called between data and supervision in startup sequence
857
859
  - **Extensions**: `llm_required?` method on extension module, checked at load time
858
860
  - **Helpers**: `Legion::Extensions::Helpers::LLM` auto-loaded when gem is present
@@ -887,8 +889,9 @@ bundle exec rubocop -A
887
889
  | `legion-json` | Legion JSON serialization |
888
890
  | `legion-logging` | Logging |
889
891
  | `legion-settings` | Configuration defaults and file overrides |
892
+ | `legion-transport` (>= 1.4.14) | AMQP transport for fleet dispatch, metering, and audit |
890
893
  | `lex-knowledge` | Optional knowledge chunking integration when loaded |
891
- | `lex-llm` (>= 0.1.6) | Provider-neutral model offering and adapter base |
894
+ | `lex-llm` (>= 0.4.3) | Provider-neutral contract, model offerings, response normalization, fleet envelopes, and responder-side fleet execution helpers |
892
895
  | `pdf-reader` | PDF extraction support |
893
896
  | `tzinfo` (>= 2.0) | IANA timezone conversion for schedule windows |
894
897
 
data/legion-llm.gemspec CHANGED
@@ -31,8 +31,9 @@ Gem::Specification.new do |spec|
31
31
  spec.add_dependency 'legion-json', '>= 1.2.0'
32
32
  spec.add_dependency 'legion-logging', '>= 1.2.8'
33
33
  spec.add_dependency 'legion-settings', '>= 1.4.0'
34
+ spec.add_dependency 'legion-transport', '>= 1.4.14'
34
35
  spec.add_dependency 'lex-knowledge'
35
- spec.add_dependency 'lex-llm', '>= 0.1.6'
36
+ spec.add_dependency 'lex-llm', '>= 0.4.3'
36
37
  spec.add_dependency 'pdf-reader'
37
38
  spec.add_dependency 'tzinfo', '>= 2.0'
38
39
  end
data/lib/legion/llm/api/native/helpers.rb CHANGED
@@ -301,7 +301,7 @@ module Legion
301
301
  name: tname,
302
302
  description: tdesc,
303
303
  parameters: tschema || {},
304
- source: { type: :client, executable: true }
304
+ source: { type: :client, executable: false }
305
305
  )
306
306
  rescue StandardError => e
307
307
  handle_exception(e, level: :warn, handled: true, operation: "llm.api.build_client_tool_class.#{tname}")
data/lib/legion/llm/api/native/inference.rb CHANGED
@@ -26,6 +26,7 @@ module Legion
26
26
  caller_context = body[:caller]
27
27
  conversation_id = body[:conversation_id]
28
28
  request_id = body[:request_id] || SecureRandom.uuid
29
+ include_thinking = body[:include_thinking] == true
29
30
 
30
31
  unless messages.is_a?(Array)
31
32
  halt 400, { 'Content-Type' => 'application/json' },
@@ -145,7 +146,7 @@ module Legion
145
146
 
146
147
  pipeline_response = executor.call_stream do |chunk|
147
148
  thinking = extract_text_content(chunk.thinking) if chunk.respond_to?(:thinking)
148
- emit_sse_event(out, 'thinking-delta', { delta: thinking }) unless thinking.to_s.empty?
149
+ emit_sse_event(out, 'thinking-delta', { delta: thinking }) if include_thinking && !thinking.to_s.empty?
149
150
 
150
151
  text = extract_text_content(chunk.respond_to?(:content) ? chunk.content : chunk)
151
152
  next if text.empty?
@@ -161,14 +162,18 @@ module Legion
161
162
 
162
163
  routing = pipeline_response.routing || {}
163
164
  tokens = pipeline_response.tokens || {}
165
+ done_payload = {
166
+ request_id: request_id,
167
+ content: full_text,
168
+ model: (routing[:model] || routing['model']).to_s,
169
+ input_tokens: token_value(tokens, :input),
170
+ output_tokens: token_value(tokens, :output),
171
+ tool_calls: extract_tool_calls(pipeline_response),
172
+ conversation_id: pipeline_response.conversation_id
173
+ }
174
+ done_payload[:thinking] = pipeline_response.thinking if include_thinking && pipeline_response.thinking
164
175
  emit_sse_event(out, 'done', {
165
- request_id: request_id,
166
- content: full_text,
167
- model: (routing[:model] || routing['model']).to_s,
168
- input_tokens: token_value(tokens, :input),
169
- output_tokens: token_value(tokens, :output),
170
- tool_calls: extract_tool_calls(pipeline_response),
171
- conversation_id: pipeline_response.conversation_id
176
+ **done_payload
172
177
  })
173
178
 
174
179
  log.info(
@@ -208,16 +213,18 @@ module Legion
208
213
  "stop_reason=#{pipeline_response.stop&.dig(:reason) || 'unknown'} stream=false"
209
214
  )
210
215
 
211
- json_response({
212
- request_id: request_id,
213
- content: content,
214
- tool_calls: tool_calls,
215
- stop_reason: pipeline_response.stop&.dig(:reason)&.to_s,
216
- model: (routing[:model] || routing['model']).to_s,
217
- input_tokens: token_value(tokens, :input),
218
- output_tokens: token_value(tokens, :output),
219
- conversation_id: pipeline_response.conversation_id
220
- }, status_code: 200)
216
+ payload = {
217
+ request_id: request_id,
218
+ content: content,
219
+ tool_calls: tool_calls,
220
+ stop_reason: pipeline_response.stop&.dig(:reason)&.to_s,
221
+ model: (routing[:model] || routing['model']).to_s,
222
+ input_tokens: token_value(tokens, :input),
223
+ output_tokens: token_value(tokens, :output),
224
+ conversation_id: pipeline_response.conversation_id
225
+ }
226
+ payload[:thinking] = pipeline_response.thinking if include_thinking && pipeline_response.thinking
227
+ json_response(payload, status_code: 200)
221
228
  end
222
229
  rescue Legion::LLM::AuthError => e
223
230
  handle_exception(e, level: :error, handled: true, operation: 'llm.api.inference.auth', request_id: request_id)
data/lib/legion/llm/api/native/providers.rb CHANGED
@@ -80,7 +80,8 @@ module Legion
80
80
  def self.instance_to_hash(entry)
81
81
  health = begin
82
82
  Legion::LLM::Router.health_tracker
83
- rescue StandardError
83
+ rescue StandardError => e
84
+ handle_exception(e, level: :debug, handled: true, operation: 'api.providers.health_tracker')
84
85
  nil
85
86
  end
86
87
  provider_key = entry[:provider].to_sym
data/lib/legion/llm/api/openai/embeddings.rb CHANGED
@@ -35,9 +35,10 @@ module Legion
35
35
  when Hash then vector[:vector] || vector['vector'] || vector[:embedding] || vector['embedding'] || []
36
36
  else []
37
37
  end
38
+ usage = vector.is_a?(Hash) ? (vector[:usage] || vector['usage'] || vector) : nil
38
39
 
39
40
  response_body = Legion::LLM::API::Translators::OpenAIResponse.format_embeddings(
40
- vector_array, model: model, input_text: text
41
+ vector_array, model: model, input_text: text, usage: usage
41
42
  )
42
43
 
43
44
  log.info("[llm][api][openai][embeddings] action=complete model=#{model} dims=#{vector_array.size}")
data/lib/legion/llm/api/translators/anthropic_request.rb CHANGED
@@ -84,7 +84,7 @@ module Legion
84
84
  return content if content.is_a?(String)
85
85
  return content unless content.is_a?(Array)
86
86
 
87
- content.map do |block|
87
+ parts = content.map do |block|
88
88
  bs = block.respond_to?(:transform_keys) ? block.transform_keys(&:to_sym) : block
89
89
  type = bs[:type].to_s
90
90
  case type
data/lib/legion/llm/api/translators/openai_response.rb CHANGED
@@ -70,8 +70,8 @@ module Legion
70
70
  }
71
71
  end
72
72
 
73
- def format_embeddings(vector, model:, input_text:)
74
- tokens = input_text.to_s.split.size
73
+ def format_embeddings(vector, model:, input_text:, usage: nil)
74
+ tokens = embedding_token_count(usage, input_text)
75
75
 
76
76
  {
77
77
  object: 'list',
@@ -134,6 +134,16 @@ module Legion
134
134
 
135
135
  nil
136
136
  end
137
+
138
+ def embedding_token_count(usage, input_text)
139
+ usage_hash = usage.respond_to?(:key?) ? usage : {}
140
+ token_count = usage_hash[:prompt_tokens] || usage_hash['prompt_tokens'] ||
141
+ usage_hash[:input_tokens] || usage_hash['input_tokens'] ||
142
+ usage_hash[:total_tokens] || usage_hash['total_tokens']
143
+ return token_count.to_i if token_count
144
+
145
+ input_text.to_s.split.size
146
+ end
137
147
  end
138
148
  end
139
149
  end
data/lib/legion/llm/audit.rb CHANGED
@@ -24,8 +24,8 @@ module Legion
24
24
  log.info('[llm][audit] published prompt audit')
25
25
  :published
26
26
  else
27
- log.warn('[llm][audit] dropped prompt audit: transport unavailable')
28
- :dropped
27
+ log.warn('[llm][audit] disabled prompt audit: transport unavailable')
28
+ :disabled
29
29
  end
30
30
  rescue StandardError => e
31
31
  handle_exception(e, level: :warn, operation: 'llm.audit.emit_prompt')
@@ -38,8 +38,8 @@ module Legion
38
38
  log.info('[llm][audit] published tool audit')
39
39
  :published
40
40
  else
41
- log.warn('[llm][audit] dropped tool audit: transport unavailable')
42
- :dropped
41
+ log.warn('[llm][audit] disabled tool audit: transport unavailable')
42
+ :disabled
43
43
  end
44
44
  rescue StandardError => e
45
45
  handle_exception(e, level: :warn, operation: 'llm.audit.emit_tools')
@@ -52,8 +52,8 @@ module Legion
52
52
  log.info('[llm][audit] published skill audit')
53
53
  :published
54
54
  else
55
- log.warn('[llm][audit] dropped skill audit: transport unavailable')
56
- :dropped
55
+ log.warn('[llm][audit] disabled skill audit: transport unavailable')
56
+ :disabled
57
57
  end
58
58
  rescue StandardError => e
59
59
  handle_exception(e, level: :warn, operation: 'llm.audit.emit_skill')
data/lib/legion/llm/call/daemon_client.rb CHANGED
@@ -17,6 +17,10 @@ module Legion
17
17
 
18
18
  module_function
19
19
 
20
+ def state_mutex
21
+ @state_mutex ||= Mutex.new
22
+ end
23
+
20
24
  # Returns true if the daemon is reachable and healthy.
21
25
  # Returns false immediately if daemon_url is nil.
22
26
  # Caches a positive health check for HEALTH_CACHE_TTL seconds.
@@ -25,14 +29,13 @@ module Legion
25
29
  return false if daemon_url.nil?
26
30
 
27
31
  now = ::Process.clock_gettime(::Process::CLOCK_MONOTONIC)
28
-
29
- return true if @healthy == true && @health_checked_at && (now - @health_checked_at) < HEALTH_CACHE_TTL
32
+ cached_healthy = state_mutex.synchronize do
33
+ @healthy == true && @health_checked_at && (now - @health_checked_at) < HEALTH_CACHE_TTL
34
+ end
35
+ return true if cached_healthy
30
36
 
31
37
  result = check_health
32
- if result
33
- @healthy = true
34
- @health_checked_at = ::Process.clock_gettime(::Process::CLOCK_MONOTONIC)
35
- end
38
+ record_health(result) if result
36
39
  result
37
40
  end
38
41
 
@@ -61,16 +64,20 @@ module Legion
61
64
  # Returns the daemon URL from settings, cached after first read.
62
65
  # Returns nil if settings are unavailable or the key is missing.
63
66
  def daemon_url
64
- return @daemon_url if defined?(@daemon_url)
67
+ state_mutex.synchronize do
68
+ return @daemon_url if defined?(@daemon_url)
65
69
 
66
- @daemon_url = fetch_daemon_url
70
+ @daemon_url = fetch_daemon_url
71
+ end
67
72
  end
68
73
 
69
74
  # Clears all cached state. Returns self for chaining.
70
75
  def reset!
71
- remove_instance_variable(:@daemon_url) if defined?(@daemon_url)
72
- @healthy = nil
73
- @health_checked_at = nil
76
+ state_mutex.synchronize do
77
+ remove_instance_variable(:@daemon_url) if defined?(@daemon_url)
78
+ @healthy = nil
79
+ @health_checked_at = nil
80
+ end
74
81
  self
75
82
  end
76
83
 
@@ -79,8 +86,7 @@ module Legion
79
86
  def check_health
80
87
  response = http_get('/api/health')
81
88
  healthy = response.code == '200'
82
- @healthy = healthy
83
- @health_checked_at = ::Process.clock_gettime(::Process::CLOCK_MONOTONIC)
89
+ record_health(healthy)
84
90
  log.info("Daemon health check result=#{healthy ? 'healthy' : 'unhealthy'} url=#{daemon_url}")
85
91
  healthy
86
92
  rescue StandardError => e
@@ -92,8 +98,7 @@ module Legion
92
98
  # Marks the daemon as unhealthy and records the timestamp.
93
99
  def mark_unhealthy
94
100
  log.warn("Daemon marked unhealthy url=#{daemon_url}")
95
- @healthy = false
96
- @health_checked_at = ::Process.clock_gettime(::Process::CLOCK_MONOTONIC)
101
+ record_health(false)
97
102
  end
98
103
 
99
104
  # Builds and sends a GET request. Returns Net::HTTPResponse.
@@ -185,6 +190,13 @@ module Legion
185
190
  nil
186
191
  end
187
192
 
193
+ def record_health(healthy)
194
+ state_mutex.synchronize do
195
+ @healthy = healthy == true
196
+ @health_checked_at = ::Process.clock_gettime(::Process::CLOCK_MONOTONIC)
197
+ end
198
+ end
199
+
188
200
  def safe_parse(body)
189
201
  return {} if body.nil? || body.strip.empty?
190
202
 
@@ -242,7 +254,8 @@ module Legion
242
254
  end
243
255
  end
244
256
 
245
- private_class_method :fetch_daemon_url, :safe_parse, :extract_retry_after, :interpret_inference_response
257
+ private_class_method :state_mutex, :fetch_daemon_url, :record_health, :safe_parse, :extract_retry_after,
258
+ :interpret_inference_response
246
259
  end
247
260
  end
248
261
  end