legion-llm 0.8.47 → 0.9.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +3 -0
- data/CHANGELOG.md +135 -0
- data/CLAUDE.md +20 -17
- data/Gemfile +3 -0
- data/README.md +61 -60
- data/legion-llm.gemspec +3 -2
- data/lib/legion/llm/api/native/helpers.rb +6 -4
- data/lib/legion/llm/api/native/inference.rb +25 -18
- data/lib/legion/llm/api/native/instances.rb +32 -35
- data/lib/legion/llm/api/native/providers.rb +52 -75
- data/lib/legion/llm/api/native/routing.rb +55 -0
- data/lib/legion/llm/api/openai/embeddings.rb +2 -1
- data/lib/legion/llm/api/translators/anthropic_request.rb +1 -1
- data/lib/legion/llm/api/translators/openai_response.rb +12 -2
- data/lib/legion/llm/api.rb +2 -0
- data/lib/legion/llm/audit.rb +6 -6
- data/lib/legion/llm/cache/response.rb +2 -1
- data/lib/legion/llm/cache.rb +4 -2
- data/lib/legion/llm/call/daemon_client.rb +31 -18
- data/lib/legion/llm/call/dispatch.rb +172 -54
- data/lib/legion/llm/call/embeddings.rb +113 -503
- data/lib/legion/llm/call/lex_llm_adapter.rb +146 -21
- data/lib/legion/llm/call/providers.rb +62 -674
- data/lib/legion/llm/call/registry.rb +94 -10
- data/lib/legion/llm/call/structured_output.rb +33 -3
- data/lib/legion/llm/call.rb +0 -2
- data/lib/legion/llm/caller_identity.rb +92 -0
- data/lib/legion/llm/compat.rb +2 -7
- data/lib/legion/llm/config.rb +11 -1
- data/lib/legion/llm/context/compressor.rb +15 -8
- data/lib/legion/llm/context/curator.rb +221 -12
- data/lib/legion/llm/discovery/memory_gate.rb +53 -0
- data/lib/legion/llm/discovery/rule_generator.rb +148 -0
- data/lib/legion/llm/discovery.rb +228 -51
- data/lib/legion/llm/fleet/dispatcher.rb +167 -93
- data/lib/legion/llm/fleet/handler.rb +188 -217
- data/lib/legion/llm/fleet/lane.rb +1 -1
- data/lib/legion/llm/fleet/provider_responder.rb +12 -0
- data/lib/legion/llm/fleet/reply_dispatcher.rb +80 -37
- data/lib/legion/llm/fleet/token_issuer.rb +122 -0
- data/lib/legion/llm/fleet/token_validator.rb +13 -0
- data/lib/legion/llm/fleet/worker_execution.rb +11 -0
- data/lib/legion/llm/fleet.rb +13 -6
- data/lib/legion/llm/hooks/metering.rb +2 -10
- data/lib/legion/llm/hooks.rb +2 -2
- data/lib/legion/llm/inference/audit_publisher.rb +24 -31
- data/lib/legion/llm/inference/conversation.rb +66 -11
- data/lib/legion/llm/inference/enrichment_injector.rb +15 -0
- data/lib/legion/llm/inference/executor.rb +471 -182
- data/lib/legion/llm/inference/profile.rb +5 -4
- data/lib/legion/llm/inference/prompt.rb +0 -12
- data/lib/legion/llm/inference/route_attempts.rb +185 -0
- data/lib/legion/llm/inference/steps/billing.rb +23 -2
- data/lib/legion/llm/inference/steps/classification.rb +67 -22
- data/lib/legion/llm/inference/steps/confidence_scoring.rb +61 -1
- data/lib/legion/llm/inference/steps/debate.rb +156 -70
- data/lib/legion/llm/inference/steps/gaia_advisory.rb +116 -11
- data/lib/legion/llm/inference/steps/knowledge_capture.rb +18 -2
- data/lib/legion/llm/inference/steps/logging.rb +73 -0
- data/lib/legion/llm/inference/steps/mcp_discovery.rb +19 -5
- data/lib/legion/llm/inference/steps/metering.rb +38 -12
- data/lib/legion/llm/inference/steps/post_response.rb +23 -1
- data/lib/legion/llm/inference/steps/prompt_cache.rb +33 -21
- data/lib/legion/llm/inference/steps/rag_context.rb +144 -26
- data/lib/legion/llm/inference/steps/rag_guard.rb +61 -6
- data/lib/legion/llm/inference/steps/skill_injector.rb +57 -24
- data/lib/legion/llm/inference/steps/span_annotator.rb +5 -1
- data/lib/legion/llm/inference/steps/sticky_helpers.rb +1 -13
- data/lib/legion/llm/inference/steps/sticky_persist.rb +65 -10
- data/lib/legion/llm/inference/steps/sticky_runners.rb +20 -7
- data/lib/legion/llm/inference/steps/tier_assigner.rb +16 -8
- data/lib/legion/llm/inference/steps/token_budget.rb +29 -4
- data/lib/legion/llm/inference/steps/tool_calls.rb +46 -18
- data/lib/legion/llm/inference/steps/tool_discovery.rb +45 -15
- data/lib/legion/llm/inference/steps/tool_history.rb +15 -2
- data/lib/legion/llm/inference/steps/trigger_match.rb +27 -20
- data/lib/legion/llm/inference/steps.rb +1 -0
- data/lib/legion/llm/inference.rb +20 -29
- data/lib/legion/llm/inventory.rb +55 -49
- data/lib/legion/llm/metering.rb +29 -4
- data/lib/legion/llm/quality/checker.rb +2 -2
- data/lib/legion/llm/quality/confidence/scorer.rb +3 -2
- data/lib/legion/llm/router/arbitrage.rb +32 -9
- data/lib/legion/llm/router/health_tracker.rb +146 -57
- data/lib/legion/llm/router/resolution.rb +9 -3
- data/lib/legion/llm/router/rule.rb +6 -3
- data/lib/legion/llm/router.rb +190 -70
- data/lib/legion/llm/scheduling.rb +2 -2
- data/lib/legion/llm/settings.rb +135 -99
- data/lib/legion/llm/skills/base.rb +4 -1
- data/lib/legion/llm/tools/confidence.rb +30 -13
- data/lib/legion/llm/tools/dispatcher.rb +25 -41
- data/lib/legion/llm/transport/message.rb +134 -8
- data/lib/legion/llm/transport/messages/escalation_event.rb +4 -0
- data/lib/legion/llm/transport/messages/fleet_error.rb +5 -59
- data/lib/legion/llm/transport/messages/fleet_request.rb +5 -63
- data/lib/legion/llm/transport/messages/fleet_response.rb +5 -45
- data/lib/legion/llm/transport/messages/prompt_event.rb +11 -3
- data/lib/legion/llm/types/message.rb +34 -3
- data/lib/legion/llm/types/tool_definition.rb +17 -0
- data/lib/legion/llm/version.rb +1 -1
- data/lib/legion/llm.rb +7 -4
- metadata +29 -10
- data/lib/legion/llm/call/claude_config_loader.rb +0 -182
- data/lib/legion/llm/call/codex_config_loader.rb +0 -137
- data/lib/legion/llm/discovery/ollama.rb +0 -116
- data/lib/legion/llm/discovery/vllm.rb +0 -134
- data/lib/legion/llm/router/gateway_interceptor.rb +0 -68
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 6a7ea4bbe972c340ca3ccaa0cf2478724c90bb8385f1d8129a21fc74981a4380
|
|
4
|
+
data.tar.gz: a7a8250e8c84835b3bf2ffefe9cdc25494b9be7deae4cb16375bd8bfa2d8dcf2
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: be0a896d160b6824760d0361bb82e8e320f312fc23e8e9156cf9dfd924f3d264a48ff71bce0531b29f13913bf367c7eeca732cff477adc1c54310a4f4cbcce38
|
|
7
|
+
data.tar.gz: 210706b370e355606f342e506108b4ac13cb0e663c8203fe7b09afdeb643dba4e1ade3e5cb8e13b0efe4238b33656a73f7f57e74687413a6b0914821a31b0039
|
data/.gitignore
CHANGED
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,140 @@
|
|
|
1
1
|
# Legion LLM Changelog
|
|
2
2
|
|
|
3
|
+
## [0.9.9] - 2026-05-07
|
|
4
|
+
|
|
5
|
+
### Fixed
|
|
6
|
+
- Initialized sticky-state persistence reads explicitly on cache misses to satisfy static analysis without changing runtime behavior.
|
|
7
|
+
- Consume GAIA advisory `tool_hint`, `suppress`, and `context_window` data when building native tool definitions and sizing RAG retrieval, so GAIA advisory outputs affect provider calls instead of only appearing in enrichment summaries.
|
|
8
|
+
|
|
9
|
+
## [0.9.8] - 2026-05-06
|
|
10
|
+
|
|
11
|
+
### Fixed
|
|
12
|
+
- Fixed Anthropic Messages request normalization for multi-block content.
|
|
13
|
+
- Marked API-submitted client tools as non-executable server-side while preserving trusted registry/deferred tool injection.
|
|
14
|
+
- Preferred explicit settings overrides over registry MCP overrides.
|
|
15
|
+
- Added structured-output parse retries through alternate routes when an escalation route is available.
|
|
16
|
+
- Added bounded metering spool writes, disabled-vs-dropped audit results, and warning logs for invalid settings paths and invalid inventory offerings.
|
|
17
|
+
- Preserved non-text message content for audit/persistence and extracted string-keyed text blocks correctly.
|
|
18
|
+
- Added trackable unknown caller identity defaults for audit/transport envelopes.
|
|
19
|
+
- Used provider embedding usage tokens when formatting OpenAI-compatible embedding responses.
|
|
20
|
+
- Serialized daemon-client cached state behind a shared mutex.
|
|
21
|
+
- Added escalation transport exchange binding for escalation events.
|
|
22
|
+
|
|
23
|
+
## [0.9.7] - 2026-05-06
|
|
24
|
+
|
|
25
|
+
### Fixed
|
|
26
|
+
- Enabled arbitrage by default, added zero-cost local/fleet offering coverage, and removed the dead `Quality::Checker.model_score` dependency from arbitrage eligibility.
|
|
27
|
+
- Skipped debate when fewer than two distinct models are available and captured judge evaluation/confidence separately from the final answer.
|
|
28
|
+
- Reused the shared context token estimator for billing preflight cost checks.
|
|
29
|
+
- Logged dropped metering emissions at the inference metering call site.
|
|
30
|
+
- Added resolved model metadata to routing span attributes.
|
|
31
|
+
- Stopped treating standalone email addresses as PII by default while preserving contextual and opt-in email detection.
|
|
32
|
+
- Allowed context curation to choose vLLM/MLX local or fleet models for LLM-assisted summarization.
|
|
33
|
+
- Routed compressor summarization through standard low-cost/basic intent when no explicit compressor model is configured.
|
|
34
|
+
|
|
35
|
+
## [0.9.6] - 2026-05-06
|
|
36
|
+
|
|
37
|
+
### Fixed
|
|
38
|
+
- Enforced privacy classification as a forced local routing constraint, even when callers request a cloud tier.
|
|
39
|
+
- Made RAG faithfulness failures block caller-visible responses by default and record structured audit data.
|
|
40
|
+
- Archived dropped conversation turns into Apollo with conversation-scoped tags and retrieved archived history during RAG context loading.
|
|
41
|
+
- Reported escalation quality failures and low-confidence responses to `Router::HealthTracker` as quality signals.
|
|
42
|
+
- Persisted conversation sticky state through the database path and restored it after in-memory LRU eviction.
|
|
43
|
+
- Counted pending conversation-history and RAG enrichments in token-budget checks before provider dispatch.
|
|
44
|
+
- Averaged provider health priority adjustments across instances while preserving worst-state circuit reporting.
|
|
45
|
+
|
|
46
|
+
## [0.9.5] - 2026-05-06
|
|
47
|
+
|
|
48
|
+
### Fixed
|
|
49
|
+
- Fixed context curator cache invalidation and stored curated-summary replay so compacted messages are used on later turns.
|
|
50
|
+
- Persisted curation marker records even when a pass does not rewrite individual messages, allowing structural curation to run for short-message turns.
|
|
51
|
+
- Fixed compressor LLM summarization to call `Legion::LLM.chat_direct` with a prompt message instead of the obsolete session-style API.
|
|
52
|
+
- Warned about and omitted misleading zero-dollar cost estimates when provider usage metadata collapses to zero tokens for a known model.
|
|
53
|
+
|
|
54
|
+
## [0.9.4] - 2026-05-06
|
|
55
|
+
|
|
56
|
+
### Changed
|
|
57
|
+
- Added shared inference step logging helpers and debug-level step enter/complete/failure logs.
|
|
58
|
+
- Added safe debug/info instrumentation across inference steps for routing actions, enrichment decisions, tool handling, RAG, skill injection, sticky runner state, billing, classification, debate, post-response audit, and metering emission.
|
|
59
|
+
|
|
60
|
+
## [0.9.3] - 2026-05-06
|
|
61
|
+
|
|
62
|
+
### Changed
|
|
63
|
+
- Delegated responder-side fleet provider execution, token validation, and provider response publishing to the shared `lex-llm` fleet helpers.
|
|
64
|
+
- Kept `legion-llm` as the request-side fleet dispatcher and token issuer while retaining compatibility aliases for old responder constants.
|
|
65
|
+
- Bumped the `lex-llm` dependency floor to `>= 0.4.3` for shared responder execution helpers.
|
|
66
|
+
|
|
67
|
+
## [0.9.2] - 2026-05-06
|
|
68
|
+
|
|
69
|
+
### Fixed
|
|
70
|
+
- Prefer namespaced caller ids over ambiguous display identities when publishing audit, metering, and transport identity metadata.
|
|
71
|
+
|
|
72
|
+
## [0.9.1] - 2026-05-06
|
|
73
|
+
|
|
74
|
+
### Changed
|
|
75
|
+
- `legion-llm` now owns lex-llm provider registration by scanning loaded provider modules, constructing `LexLLMAdapter` instances, and writing `Call::Registry`.
|
|
76
|
+
- Provider rediscovery now rebuilds registry entries after `Call::Registry.reset!`, supporting LegionIO reload/hot-update flows without relying on provider require-time side effects.
|
|
77
|
+
- Bumped the `lex-llm` dependency floor to `>= 0.4.1` for pure provider discovery and alias metadata.
|
|
78
|
+
|
|
79
|
+
### Fixed
|
|
80
|
+
- Preserve streaming thinking chunks that providers emit as plain strings.
|
|
81
|
+
- Normalize discovered provider instance ids from provider offerings before memory-gate and availability checks.
|
|
82
|
+
- Use canonical offering aliases for metering cost estimates and fall back to caller agent metadata for fleet context.
|
|
83
|
+
- Preserve extension, string, and top-level caller identity metadata in audit events and transport headers.
|
|
84
|
+
|
|
85
|
+
## [0.9.0] - 2026-05-06
|
|
86
|
+
|
|
87
|
+
### Changed
|
|
88
|
+
- Added shared provider-owned fleet responder execution support for lex-llm provider gems.
|
|
89
|
+
- Moved fleet dispatch defaults to top-level `fleet.dispatch`, removed legacy gateway defaults, and rejected `routing.use_fleet` / `openai_compat.gateways` settings during validation.
|
|
90
|
+
- OpenAI-compatible routing now resolves through registered `lex-llm-openai` provider instances instead of gateway interceptor configuration.
|
|
91
|
+
- Native dispatch now routes chat, stream, embed, image, and health calls through the canonical lex-llm provider-instance adapter contract.
|
|
92
|
+
- Native inference now records direct/fleet route attempts with dispatch path, idempotency key, selected lane, failure reason, and escalation context.
|
|
93
|
+
- Native inference and `/api/llm/inference` now strip provider thinking from caller-visible content and expose thinking only through explicit diagnostic fields/events.
|
|
94
|
+
- Inventory and provider/model API reads now use cached discovery and non-live provider offerings, so explicit discovery refresh remains the only path that probes provider endpoints.
|
|
95
|
+
- Fleet dispatch now publishes shared lex-llm protocol-v2 envelopes with canonical `operation`, `request_id`, `correlation_id`, `idempotency_key`, signed tokens, and strict reply matching.
|
|
96
|
+
- Fleet worker handling now validates protocol-v2 envelopes, enforces token/idempotency policy, dispatches local providers through canonical lex-llm methods, and publishes shared lex-llm response/error envelopes.
|
|
97
|
+
- Bumped dependency floors to `lex-llm >= 0.4.0` and `legion-transport >= 1.4.14` for shared provider contracts and fleet envelopes.
|
|
98
|
+
|
|
99
|
+
### Removed
|
|
100
|
+
- Removed the gateway interceptor runtime path and gateway metering fallback.
|
|
101
|
+
- Retired `Legion::LLM::Transport::Messages::FleetRequest`, `FleetResponse`, and `FleetError` as fleet message authorities in favor of `Legion::Extensions::Llm::Transport::Messages::*`.
|
|
102
|
+
|
|
103
|
+
## [0.8.51] - 2026-05-03
|
|
104
|
+
|
|
105
|
+
### Changed
|
|
106
|
+
- Native `/api/llm/inference` streaming hides provider thinking deltas by default, with `include_thinking: true` as the explicit diagnostic opt-in.
|
|
107
|
+
- Pipeline metering events now carry wall-clock latency, estimated cost, conversation/correlation ids, billing, task, agent, identity, and routing context.
|
|
108
|
+
- LLM transport messages now preserve caller identity, credential, and caller type headers from nested caller metadata, top-level identity metadata, and extension callers.
|
|
109
|
+
|
|
110
|
+
### Fixed
|
|
111
|
+
- Prompt audit events now include provider response thinking separately from assistant response content.
|
|
112
|
+
- Native lex-llm dispatch now carries provider thinking separately from response content before API responses are emitted.
|
|
113
|
+
- Native discovery now normalizes provider offering objects before generating routing candidates, preserving provider instance, tier, capabilities, context length, and parameter metadata.
|
|
114
|
+
|
|
115
|
+
## [0.8.50] - 2026-05-03
|
|
116
|
+
|
|
117
|
+
### Fixed
|
|
118
|
+
- Native discovery now normalizes lex-llm `ModelOffering` objects before generating routing candidates, allowing auto-rules to populate from provider adapters again.
|
|
119
|
+
|
|
120
|
+
## [0.8.49] - 2026-04-29
|
|
121
|
+
|
|
122
|
+
### Changed
|
|
123
|
+
- `Settings.register_defaults!` now calls `Legion::Settings.register_library` instead of `merge_settings`, using the idempotent legion-settings 1.4.0 API that prevents double-registration.
|
|
124
|
+
- Bumped `legion-settings` dependency floor to `>= 1.4.0`.
|
|
125
|
+
- Test stub `Legion::Settings` now exposes `register_library` matching the real 1.4.0 API.
|
|
126
|
+
|
|
127
|
+
## [0.8.48] - 2026-04-29
|
|
128
|
+
|
|
129
|
+
### Added
|
|
130
|
+
- `ToolDefinition.from_registry_entry` builds tool definitions from `Legion::Settings::Extensions` registry entries.
|
|
131
|
+
- `Dispatcher` checks `Settings::Extensions` for tool override resolution; when no matching entry is found it falls back to settings-based MCP overrides (no `Tools::Registry` or `Catalog::Registry` fallback).
|
|
132
|
+
- `Executor#add_registry_tool_definitions` reads from `Settings::Extensions` when available, falling back to `Legion::Tools::Registry` for backward compatibility.
|
|
133
|
+
- `Steps::ToolDiscovery` discovers tools from `Settings::Extensions` when available, falling back to `Legion::Tools::Registry`.
|
|
134
|
+
|
|
135
|
+
### Changed
|
|
136
|
+
- Bumped `legion-settings` dependency floor to `>= 1.4.0` (requires `Settings::Extensions` module).
|
|
137
|
+
|
|
3
138
|
## [0.8.47] - 2026-04-29
|
|
4
139
|
|
|
5
140
|
### Fixed
|
data/CLAUDE.md
CHANGED
|
@@ -8,7 +8,7 @@
|
|
|
8
8
|
Core LegionIO gem providing LLM capabilities to all extensions through Legion-native provider dispatch. Includes a dynamic weighted routing engine that dispatches requests across local, fleet, and cloud tiers based on caller intent, priority rules, time schedules, cost multipliers, and real-time provider health.
|
|
9
9
|
|
|
10
10
|
**GitHub**: https://github.com/LegionIO/legion-llm
|
|
11
|
-
**Version**: 0.8.
|
|
11
|
+
**Version**: 0.8.49
|
|
12
12
|
**License**: Apache-2.0
|
|
13
13
|
|
|
14
14
|
## Architecture
|
|
@@ -61,7 +61,10 @@ Legion::LLM (lib/legion/llm.rb) # Thin facade — delegates to Inferenc
|
|
|
61
61
|
│ ├── Compressor # Deterministic prompt compression (3 levels, code-block-aware)
|
|
62
62
|
│ └── Curator # Async conversation curation: strip thinking, distill tools, fold resolved exchanges
|
|
63
63
|
├── Discovery # Runtime introspection
|
|
64
|
-
│ ├── Ollama #
|
|
64
|
+
│ ├── Ollama # Multi-instance Ollama /api/tags + /api/show discovery (TTL-cached)
|
|
65
|
+
│ ├── Vllm # Multi-instance vLLM /v1/models + /health discovery (TTL-cached)
|
|
66
|
+
│ ├── RuleGenerator # Auto-generates routing rules from discovered instances/models
|
|
67
|
+
│ ├── MemoryGate # Checks available RAM before routing to local models
|
|
65
68
|
│ └── System # Queries OS memory: macOS (vm_stat/sysctl), Linux (/proc/meminfo)
|
|
66
69
|
├── Quality # Response quality evaluation
|
|
67
70
|
│ ├── Checker # Quality heuristics (empty, too_short, repetition, json_parse) + pluggable (was QualityChecker)
|
|
@@ -364,24 +367,21 @@ Settings read from `Legion::Settings[:llm]`:
|
|
|
364
367
|
|
|
365
368
|
### Provider Settings
|
|
366
369
|
|
|
367
|
-
Each provider has: `enabled`, `api_key`,
|
|
370
|
+
Provider defaults now live in each `lex-llm-*` provider extension's `default_settings`. The `providers:` key in `Settings.default` ships as an empty hash; settings files and extension registrations populate it at runtime. Each provider has: `enabled`, `api_key`, plus provider-specific keys.
|
|
368
371
|
|
|
369
|
-
|
|
372
|
+
Local/fleet providers (Ollama, vLLM, MLX) support multi-instance configs via an `instances:` hash. Discovery scans all instances in parallel, enriches models with real capability metadata, and generates per-instance routing rules.
|
|
370
373
|
|
|
371
|
-
|
|
372
|
-
- **SigV4** (default): `api_key` + `secret_key` (+ optional `session_token`)
|
|
373
|
-
- **Bearer token**: `bearer_token` for AWS Identity Center/SSO. Native Bedrock providers consume it through lex-llm configuration.
|
|
374
|
+
### Capability-Aware Routing
|
|
374
375
|
|
|
375
|
-
|
|
376
|
+
Routing rules carry `model_capabilities`, `context_length`, and `parameter_count` from provider-supplied `Model::Info`. The `RuleGenerator` creates rules from discovered instances without a static capability map -- each provider supplies real metadata.
|
|
377
|
+
|
|
378
|
+
### Memory Gate
|
|
376
379
|
|
|
377
|
-
|
|
380
|
+
`Discovery::MemoryGate` checks available system memory before routing to local models. Models that exceed available RAM minus `discovery.memory_floor_mb` are silently skipped.
|
|
381
|
+
|
|
382
|
+
### Auto-Detection Priority
|
|
378
383
|
|
|
379
|
-
|
|
380
|
-
2. Anthropic -> `claude-sonnet-4-6`
|
|
381
|
-
3. OpenAI -> `gpt-4o`
|
|
382
|
-
4. Gemini -> `gemini-2.0-flash`
|
|
383
|
-
5. Azure -> (endpoint-specific, from `api_base`)
|
|
384
|
-
6. Ollama -> `llama3`
|
|
384
|
+
When no defaults are configured, the first enabled provider is used. Detection order and default models are defined by each `lex-llm-*` provider extension.
|
|
385
385
|
|
|
386
386
|
### Routing Settings
|
|
387
387
|
|
|
@@ -501,7 +501,10 @@ In-memory signal consumer with pluggable handlers. Adjusts effective priorities
|
|
|
501
501
|
| `lib/legion/llm/context/compressor.rb` | Deterministic prompt compression: 3 levels, code-block-aware, stopword removal |
|
|
502
502
|
| `lib/legion/llm/context/curator.rb` | Async heuristic conversation curation (was ContextCurator) |
|
|
503
503
|
| `lib/legion/llm/discovery.rb` | Discovery entry point: run, detect_embedding_capability, can_embed? |
|
|
504
|
-
| `lib/legion/llm/discovery/ollama.rb` | Ollama /api/tags discovery with TTL cache |
|
|
504
|
+
| `lib/legion/llm/discovery/ollama.rb` | Multi-instance Ollama /api/tags + /api/show discovery with TTL cache |
|
|
505
|
+
| `lib/legion/llm/discovery/vllm.rb` | Multi-instance vLLM /v1/models + /health discovery with TTL cache |
|
|
506
|
+
| `lib/legion/llm/discovery/rule_generator.rb` | Auto-generates routing rules from discovered instances/models |
|
|
507
|
+
| `lib/legion/llm/discovery/memory_gate.rb` | Checks available RAM vs model size before routing to local models |
|
|
505
508
|
| `lib/legion/llm/discovery/system.rb` | OS memory introspection (macOS + Linux) with TTL cache |
|
|
506
509
|
| `lib/legion/llm/quality.rb` | Quality entry point |
|
|
507
510
|
| `lib/legion/llm/quality/checker.rb` | Quality heuristics + pluggable callable (was QualityChecker) |
|
|
@@ -715,7 +718,7 @@ The legacy `vault_path` per-provider setting was removed in v0.3.1.
|
|
|
715
718
|
Tests run without the full LegionIO stack. `spec/spec_helper.rb` uses real `Legion::Logging` and `Legion::Settings` (no stubs — hard dependencies are always present). Each test resets settings to defaults via `before(:each)`.
|
|
716
719
|
|
|
717
720
|
```bash
|
|
718
|
-
bundle exec rspec #
|
|
721
|
+
bundle exec rspec # 2379 examples, 0 failures
|
|
719
722
|
bundle exec rubocop # 0 offenses
|
|
720
723
|
```
|
|
721
724
|
|
data/Gemfile
CHANGED
|
@@ -4,6 +4,9 @@ source 'https://rubygems.org'
|
|
|
4
4
|
|
|
5
5
|
gemspec
|
|
6
6
|
|
|
7
|
+
legion_settings_path = File.expand_path('../legion-settings', __dir__)
|
|
8
|
+
gem 'legion-settings', path: legion_settings_path if Dir.exist?(legion_settings_path)
|
|
9
|
+
|
|
7
10
|
group :test do
|
|
8
11
|
lex_llm_path = File.expand_path('../extensions-ai/lex-llm', __dir__)
|
|
9
12
|
if Dir.exist?(lex_llm_path)
|
data/README.md
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
LLM routing and provider orchestration for the [LegionIO](https://github.com/LegionIO/LegionIO) framework. Routes chat, embeddings, tool use, fleet dispatch, auditing, and provider metadata through Legion-native `lex-llm-*` provider extensions.
|
|
4
4
|
|
|
5
|
-
**Version**: 0.
|
|
5
|
+
**Version**: 0.9.0
|
|
6
6
|
|
|
7
7
|
## Installation
|
|
8
8
|
|
|
@@ -60,7 +60,7 @@ Requests flow through the full Inference pipeline — routing, metering, audit,
|
|
|
60
60
|
Both formats supported with correct SSE shapes:
|
|
61
61
|
- **OpenAI**: `data: {"choices":[{"delta":{"content":"..."}}]}` chunks, terminated by `data: [DONE]`
|
|
62
62
|
- **Anthropic**: Typed events — `message_start`, `content_block_start`, `content_block_delta`, `content_block_stop`, `message_delta`, `message_stop`
|
|
63
|
-
- **Native**: `/api/llm/inference` streams `text-delta`, `thinking-delta`, tool lifecycle events, and a final `done` event. Structured provider content blocks are flattened to plain text in both streaming and non-streaming native responses so `content` remains a string for daemon clients.
|
|
63
|
+
- **Native**: `/api/llm/inference` streams `text-delta`, optional `thinking-delta` events when `include_thinking: true`, tool lifecycle events, and a final `done` event. Structured provider content blocks are flattened to plain text in both streaming and non-streaming native responses so `content` remains a string for daemon clients.
|
|
64
64
|
|
|
65
65
|
### API Authentication
|
|
66
66
|
|
|
@@ -90,6 +90,8 @@ When enabled, validates `Authorization: Bearer <token>` or `x-api-key` headers a
|
|
|
90
90
|
|
|
91
91
|
## Configuration
|
|
92
92
|
|
|
93
|
+
Provider defaults now live in each `lex-llm-*` provider extension. `legion-llm` ships an empty `providers: {}` hash; settings files and extension registrations populate it at runtime.
|
|
94
|
+
|
|
93
95
|
Add to your LegionIO settings directory (e.g. `~/.legionio/settings/llm.json`):
|
|
94
96
|
|
|
95
97
|
```json
|
|
@@ -103,53 +105,60 @@ Add to your LegionIO settings directory (e.g. `~/.legionio/settings/llm.json`):
|
|
|
103
105
|
"region": "us-east-2",
|
|
104
106
|
"bearer_token": ["vault://secret/data/llm/bedrock#bearer_token", "env://AWS_BEARER_TOKEN"]
|
|
105
107
|
},
|
|
106
|
-
"anthropic": {
|
|
107
|
-
"enabled": false,
|
|
108
|
-
"api_key": "env://ANTHROPIC_API_KEY"
|
|
109
|
-
},
|
|
110
|
-
"openai": {
|
|
111
|
-
"enabled": false,
|
|
112
|
-
"api_key": "env://OPENAI_API_KEY"
|
|
113
|
-
},
|
|
114
108
|
"ollama": {
|
|
115
|
-
"enabled":
|
|
116
|
-
"base_url": "http://localhost:11434"
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
"default_model": "qwen3.6-27b",
|
|
122
|
-
"enable_thinking": true
|
|
123
|
-
},
|
|
124
|
-
"mlx": {
|
|
125
|
-
"enabled": false,
|
|
126
|
-
"base_url": "http://localhost:8000"
|
|
109
|
+
"enabled": true,
|
|
110
|
+
"base_url": "http://localhost:11434",
|
|
111
|
+
"instances": {
|
|
112
|
+
"default": { "base_url": "http://localhost:11434" },
|
|
113
|
+
"gpu_server": { "base_url": "http://gpu-server:11434" }
|
|
114
|
+
}
|
|
127
115
|
}
|
|
128
116
|
}
|
|
129
117
|
}
|
|
130
118
|
}
|
|
131
119
|
```
|
|
132
120
|
|
|
133
|
-
Credentials are resolved automatically by the universal secret resolver in `legion-settings` (v1.3.0+). Use `vault://` URIs for Vault secrets, `env://` for environment variables, or plain strings for static values. Array values act as fallback chains
|
|
121
|
+
Credentials are resolved automatically by the universal secret resolver in `legion-settings` (v1.3.0+). Use `vault://` URIs for Vault secrets, `env://` for environment variables, or plain strings for static values. Array values act as fallback chains -- the first non-nil result wins.
|
|
122
|
+
|
|
123
|
+
### Provider Extensions (lex-llm-*)
|
|
124
|
+
|
|
125
|
+
Each provider is a standalone `lex-llm-*` gem that ships its own `default_settings`, model catalog, capability declarations, and optional provider-owned fleet worker actor. When a provider gem is loaded, `legion-llm` discovers it through the shared `lex-llm` provider contract and registers provider instances for routing. Provider gems implement:
|
|
126
|
+
|
|
127
|
+
- **`default_settings`** -- Connection defaults (base_url, region, API key env vars)
|
|
128
|
+
- **`model_allowed?(model_name)`** -- Provider-level model filtering
|
|
129
|
+
- **`Model::Info`** -- Real capabilities, context lengths, and parameter counts for each model
|
|
130
|
+
|
|
131
|
+
The routing layer only sees models the provider has already filtered and annotated.
|
|
132
|
+
|
|
133
|
+
### Multi-Instance Providers
|
|
134
|
+
|
|
135
|
+
Local and fleet providers (Ollama, vLLM, MLX) support multiple named instances:
|
|
136
|
+
|
|
137
|
+
```json
|
|
138
|
+
{
|
|
139
|
+
"ollama": {
|
|
140
|
+
"enabled": true,
|
|
141
|
+
"instances": {
|
|
142
|
+
"macbook": { "base_url": "http://localhost:11434" },
|
|
143
|
+
"gpu_server": { "base_url": "http://gpu-server:11434" }
|
|
144
|
+
}
|
|
145
|
+
}
|
|
146
|
+
}
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
Discovery scans all instances in parallel, enriches models with `/api/show` metadata, and generates per-instance routing rules. Each instance appears independently in the routing table so the router can target the exact hardware.
|
|
134
150
|
|
|
135
|
-
###
|
|
151
|
+
### Capability-Aware Routing
|
|
136
152
|
|
|
137
|
-
|
|
153
|
+
Routing rules and auto-generated rules carry `model_capabilities`, `context_length`, and `parameter_count` from provider-supplied `Model::Info`. The router uses these to match capability requirements (e.g., `thinking`, `vision`, `tools`) without a static lookup table.
|
|
138
154
|
|
|
139
|
-
|
|
140
|
-
|-------|------|-------------|
|
|
141
|
-
| `enabled` | Boolean | Enable this provider (default: `false`) |
|
|
142
|
-
| `api_key` | String | API key (supports `vault://`, `env://`, or plain string) |
|
|
155
|
+
### Generic Dispatch
|
|
143
156
|
|
|
144
|
-
|
|
157
|
+
`Call::Dispatch.call` accepts a `capability:` parameter (`:chat`, `:stream`, `:embed`) and routes to the registered `lex-llm-*` adapter. This replaces the old provider-specific dispatch paths.
|
|
145
158
|
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
| **Azure** | `api_base` (Azure OpenAI endpoint URL, required), `auth_token` (bearer token alternative to `api_key`) |
|
|
150
|
-
| **Ollama** | `base_url` (default: `http://localhost:11434`) |
|
|
151
|
-
| **vLLM** | `base_url` (default: `http://localhost:8000/v1`), `api_key`, `enable_thinking` |
|
|
152
|
-
| **MLX** | `base_url` (default: `http://localhost:8000`), `api_key` |
|
|
159
|
+
### Memory Gate
|
|
160
|
+
|
|
161
|
+
Discovery checks available system memory (macOS `vm_stat`/`sysctl`, Linux `/proc/meminfo`) before routing to local models. Models that exceed available RAM minus `discovery.memory_floor_mb` are silently skipped.
|
|
153
162
|
|
|
154
163
|
### Credential Resolution
|
|
155
164
|
|
|
@@ -171,18 +180,7 @@ By the time `Legion::LLM.start` runs, all `vault://` and `env://` references hav
|
|
|
171
180
|
|
|
172
181
|
### Auto-Detection
|
|
173
182
|
|
|
174
|
-
If no `default_model` or `default_provider` is set, legion-llm auto-detects from the first enabled provider
|
|
175
|
-
|
|
176
|
-
| Priority | Provider | Default Model |
|
|
177
|
-
|----------|----------|---------------|
|
|
178
|
-
| 1 | Bedrock | `us.anthropic.claude-sonnet-4-6-v1` |
|
|
179
|
-
| 2 | Anthropic | `claude-sonnet-4-6` |
|
|
180
|
-
| 3 | OpenAI | `gpt-4o` |
|
|
181
|
-
| 4 | Gemini | `gemini-2.0-flash` |
|
|
182
|
-
| 5 | Azure | (endpoint-specific) |
|
|
183
|
-
| 6 | Ollama | `qwen3.5:latest` |
|
|
184
|
-
| 7 | vLLM | `qwen3.6-27b` |
|
|
185
|
-
| 8 | MLX | (configured model) |
|
|
183
|
+
If no `default_model` or `default_provider` is set, legion-llm auto-detects from the first enabled provider. The detection order and default models are defined by each `lex-llm-*` provider extension's `default_settings`.
|
|
186
184
|
|
|
187
185
|
## Core API
|
|
188
186
|
|
|
@@ -372,9 +370,9 @@ Legion::LLM (lib/legion/llm.rb) # Thin facade — delegates to Inferenc
|
|
|
372
370
|
│ ├── Arbitrage # Cost-aware model selection when no rules match
|
|
373
371
|
│ └── Escalation/
|
|
374
372
|
│ └── History # EscalationHistory mixin
|
|
375
|
-
├── Fleet # Fleet
|
|
373
|
+
├── Fleet # Fleet dispatch over AMQP; provider responders live in lex-llm-* gems
|
|
376
374
|
│ ├── Dispatcher # Fleet RPC dispatch with routing key building, per-type timeouts
|
|
377
|
-
│ ├──
|
|
375
|
+
│ ├── TokenIssuer # Request-side JWT minting for provider-owned responders
|
|
378
376
|
│ └── ReplyDispatcher # Correlation-based reply routing
|
|
379
377
|
├── API # All external HTTP interfaces
|
|
380
378
|
│ ├── Auth # Config-driven Bearer/x-api-key auth for /v1/ routes
|
|
@@ -394,10 +392,10 @@ Legion::LLM (lib/legion/llm.rb) # Thin facade — delegates to Inferenc
|
|
|
394
392
|
│ ├── OpenAIRequest / OpenAIResponse
|
|
395
393
|
│ └── AnthropicRequest / AnthropicResponse
|
|
396
394
|
├── Audit # Prompt, tool, and skill audit event emission
|
|
397
|
-
├── Transport # Centralized AMQP exchange and message definitions
|
|
395
|
+
├── Transport # Centralized AMQP exchange and non-fleet message definitions
|
|
398
396
|
│ ├── Message # LLM base message: context propagation, LLM headers
|
|
399
397
|
│ ├── Exchanges/ # Fleet, Metering, Audit, Escalation
|
|
400
|
-
│ └── Messages/ #
|
|
398
|
+
│ └── Messages/ # MeteringEvent, prompt/tool audit, escalation, and compatibility wrappers
|
|
401
399
|
├── Scheduling # Deferred execution
|
|
402
400
|
│ ├── Batch # Non-urgent request batching with priority queue and auto-flush
|
|
403
401
|
│ └── OffPeak # Peak-hour deferral
|
|
@@ -499,8 +497,8 @@ legion-llm includes a dynamic weighted routing engine that dispatches requests a
|
|
|
499
497
|
│ Tier 1: LOCAL → Ollama on this machine (direct HTTP) │
|
|
500
498
|
│ Zero network overhead, no Transport │
|
|
501
499
|
│ │
|
|
502
|
-
│ Tier 2: FLEET →
|
|
503
|
-
│
|
|
500
|
+
│ Tier 2: FLEET → provider-owned lex-llm-* responders │
|
|
501
|
+
│ Shared lex-llm fleet envelopes over AMQP │
|
|
504
502
|
│ │
|
|
505
503
|
│ Tier 3: CLOUD → Bedrock / Azure / Gemini │
|
|
506
504
|
│ Tier 4: FRONTIER → Anthropic / OpenAI │
|
|
@@ -511,12 +509,12 @@ legion-llm includes a dynamic weighted routing engine that dispatches requests a
|
|
|
511
509
|
| Tier | Target | Use Case |
|
|
512
510
|
|------|--------|----------|
|
|
513
511
|
| `local` | Ollama on localhost | Privacy-sensitive, offline, or low-latency workloads |
|
|
514
|
-
| `fleet` | Shared hardware via
|
|
515
|
-
| `openai_compat` | OpenAI-compatible
|
|
512
|
+
| `fleet` | Shared hardware via provider-owned lex-llm responders over AMQP | Larger vLLM/Ollama models on dedicated GPU servers |
|
|
513
|
+
| `openai_compat` | OpenAI-compatible provider instances | Self-hosted or proxy endpoints with OpenAI-compatible APIs |
|
|
516
514
|
| `cloud` | API providers (Bedrock, Azure, Gemini) | Managed cloud inference |
|
|
517
515
|
| `frontier` | API providers (Anthropic, OpenAI) | Frontier models, full-capability inference |
|
|
518
516
|
|
|
519
|
-
Fleet dispatch is built into legion-llm.
|
|
517
|
+
Fleet dispatch is built into `legion-llm`, but fleet consumption is provider-owned. `Fleet::Dispatcher` publishes shared `lex-llm` protocol-v2 `FleetRequest` envelopes to keys such as `llm.fleet.inference.qwen3-6-27b.ctx32000` or `llm.fleet.embed.nomic-embed-text`; the enabled provider gem actor consumes the request, validates the signed token and idempotency key through `Legion::Extensions::Llm::Fleet::ProviderResponder`, calls its local provider instance through the canonical `lex-llm` provider methods, and replies with shared `FleetResponse` or `FleetError` envelopes. Keep `routing.tiers.fleet.routing_style` set to `shared_lane` for the default pooled model lanes, or set it to `offering_lane` for exact provider-instance lanes such as `llm.fleet.offering.vllm-gpu-01.qwen3-6.inference`.
|
|
520
518
|
|
|
521
519
|
#### Intent-Based Dispatch
|
|
522
520
|
|
|
@@ -587,7 +585,7 @@ Add routing configuration under the `llm` key:
|
|
|
587
585
|
"timeout_seconds": 30,
|
|
588
586
|
"timeouts": { "embed": 10, "chat": 30, "generate": 30, "default": 30 }
|
|
589
587
|
},
|
|
590
|
-
"openai_compat": { "
|
|
588
|
+
"openai_compat": { "providers": ["openai"] },
|
|
591
589
|
"cloud": { "providers": ["bedrock", "azure", "gemini"] },
|
|
592
590
|
"frontier": { "providers": ["anthropic", "openai"] }
|
|
593
591
|
},
|
|
@@ -855,6 +853,8 @@ Legion::Service#initialize
|
|
|
855
853
|
load_extensions # LEX extensions (can use LLM if available)
|
|
856
854
|
```
|
|
857
855
|
|
|
856
|
+
LegionIO hosts these routes through `mount_library_routes('llm', Routes::Llm, 'Legion::LLM::Routes')`. The route modules remain owned by `legion-llm`; LegionIO no longer registers provider gateway fallback routes when the library is available.
|
|
857
|
+
|
|
858
858
|
- **Service**: `setup_llm` called between data and supervision in startup sequence
|
|
859
859
|
- **Extensions**: `llm_required?` method on extension module, checked at load time
|
|
860
860
|
- **Helpers**: `Legion::Extensions::Helpers::LLM` auto-loaded when gem is present
|
|
@@ -889,8 +889,9 @@ bundle exec rubocop -A
|
|
|
889
889
|
| `legion-json` | Legion JSON serialization |
|
|
890
890
|
| `legion-logging` | Logging |
|
|
891
891
|
| `legion-settings` | Configuration defaults and file overrides |
|
|
892
|
+
| `legion-transport` (>= 1.4.14) | AMQP transport for fleet dispatch, metering, and audit |
|
|
892
893
|
| `lex-knowledge` | Optional knowledge chunking integration when loaded |
|
|
893
|
-
| `lex-llm` (>= 0.
|
|
894
|
+
| `lex-llm` (>= 0.4.3) | Provider-neutral contract, model offerings, response normalization, fleet envelopes, and responder-side fleet execution helpers |
|
|
894
895
|
| `pdf-reader` | PDF extraction support |
|
|
895
896
|
| `tzinfo` (>= 2.0) | IANA timezone conversion for schedule windows |
|
|
896
897
|
|
data/legion-llm.gemspec
CHANGED
|
@@ -30,9 +30,10 @@ Gem::Specification.new do |spec|
|
|
|
30
30
|
spec.add_dependency 'legion-cache', '>= 1.4.2'
|
|
31
31
|
spec.add_dependency 'legion-json', '>= 1.2.0'
|
|
32
32
|
spec.add_dependency 'legion-logging', '>= 1.2.8'
|
|
33
|
-
spec.add_dependency 'legion-settings', '>= 1.
|
|
33
|
+
spec.add_dependency 'legion-settings', '>= 1.4.0'
|
|
34
|
+
spec.add_dependency 'legion-transport', '>= 1.4.14'
|
|
34
35
|
spec.add_dependency 'lex-knowledge'
|
|
35
|
-
spec.add_dependency 'lex-llm', '>= 0.
|
|
36
|
+
spec.add_dependency 'lex-llm', '>= 0.4.3'
|
|
36
37
|
spec.add_dependency 'pdf-reader'
|
|
37
38
|
spec.add_dependency 'tzinfo', '>= 2.0'
|
|
38
39
|
end
|
|
@@ -10,8 +10,10 @@ require 'legion/llm/types'
|
|
|
10
10
|
begin
|
|
11
11
|
require 'legion/identity/request'
|
|
12
12
|
require 'legion/identity/process'
|
|
13
|
-
rescue LoadError
|
|
14
|
-
|
|
13
|
+
rescue LoadError => e
|
|
14
|
+
Object.new.extend(Legion::Logging::Helper).handle_exception(
|
|
15
|
+
e, level: :debug, handled: true, operation: 'llm.api.native.helpers.optional_identity_require'
|
|
16
|
+
)
|
|
15
17
|
end
|
|
16
18
|
|
|
17
19
|
module Legion
|
|
@@ -299,7 +301,7 @@ module Legion
|
|
|
299
301
|
name: tname,
|
|
300
302
|
description: tdesc,
|
|
301
303
|
parameters: tschema || {},
|
|
302
|
-
source: { type: :client, executable:
|
|
304
|
+
source: { type: :client, executable: false }
|
|
303
305
|
)
|
|
304
306
|
rescue StandardError => e
|
|
305
307
|
handle_exception(e, level: :warn, handled: true, operation: "llm.api.build_client_tool_class.#{tname}")
|
|
@@ -334,7 +336,7 @@ module Legion
|
|
|
334
336
|
text = content.key?(:text) || content.key?('text') ? (content[:text] || content['text']) : (content[:content] || content['content'])
|
|
335
337
|
extract_text_content(text)
|
|
336
338
|
else
|
|
337
|
-
content.to_s
|
|
339
|
+
content.respond_to?(:text) ? content.text.to_s : content.to_s
|
|
338
340
|
end
|
|
339
341
|
end
|
|
340
342
|
|
|
@@ -26,6 +26,7 @@ module Legion
|
|
|
26
26
|
caller_context = body[:caller]
|
|
27
27
|
conversation_id = body[:conversation_id]
|
|
28
28
|
request_id = body[:request_id] || SecureRandom.uuid
|
|
29
|
+
include_thinking = body[:include_thinking] == true
|
|
29
30
|
|
|
30
31
|
unless messages.is_a?(Array)
|
|
31
32
|
halt 400, { 'Content-Type' => 'application/json' },
|
|
@@ -145,7 +146,7 @@ module Legion
|
|
|
145
146
|
|
|
146
147
|
pipeline_response = executor.call_stream do |chunk|
|
|
147
148
|
thinking = extract_text_content(chunk.thinking) if chunk.respond_to?(:thinking)
|
|
148
|
-
emit_sse_event(out, 'thinking-delta', { delta: thinking })
|
|
149
|
+
emit_sse_event(out, 'thinking-delta', { delta: thinking }) if include_thinking && !thinking.to_s.empty?
|
|
149
150
|
|
|
150
151
|
text = extract_text_content(chunk.respond_to?(:content) ? chunk.content : chunk)
|
|
151
152
|
next if text.empty?
|
|
@@ -161,14 +162,18 @@ module Legion
|
|
|
161
162
|
|
|
162
163
|
routing = pipeline_response.routing || {}
|
|
163
164
|
tokens = pipeline_response.tokens || {}
|
|
165
|
+
done_payload = {
|
|
166
|
+
request_id: request_id,
|
|
167
|
+
content: full_text,
|
|
168
|
+
model: (routing[:model] || routing['model']).to_s,
|
|
169
|
+
input_tokens: token_value(tokens, :input),
|
|
170
|
+
output_tokens: token_value(tokens, :output),
|
|
171
|
+
tool_calls: extract_tool_calls(pipeline_response),
|
|
172
|
+
conversation_id: pipeline_response.conversation_id
|
|
173
|
+
}
|
|
174
|
+
done_payload[:thinking] = pipeline_response.thinking if include_thinking && pipeline_response.thinking
|
|
164
175
|
emit_sse_event(out, 'done', {
|
|
165
|
-
|
|
166
|
-
content: full_text,
|
|
167
|
-
model: (routing[:model] || routing['model']).to_s,
|
|
168
|
-
input_tokens: token_value(tokens, :input),
|
|
169
|
-
output_tokens: token_value(tokens, :output),
|
|
170
|
-
tool_calls: extract_tool_calls(pipeline_response),
|
|
171
|
-
conversation_id: pipeline_response.conversation_id
|
|
176
|
+
**done_payload
|
|
172
177
|
})
|
|
173
178
|
|
|
174
179
|
log.info(
|
|
@@ -208,16 +213,18 @@ module Legion
|
|
|
208
213
|
"stop_reason=#{pipeline_response.stop&.dig(:reason) || 'unknown'} stream=false"
|
|
209
214
|
)
|
|
210
215
|
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
216
|
+
payload = {
|
|
217
|
+
request_id: request_id,
|
|
218
|
+
content: content,
|
|
219
|
+
tool_calls: tool_calls,
|
|
220
|
+
stop_reason: pipeline_response.stop&.dig(:reason)&.to_s,
|
|
221
|
+
model: (routing[:model] || routing['model']).to_s,
|
|
222
|
+
input_tokens: token_value(tokens, :input),
|
|
223
|
+
output_tokens: token_value(tokens, :output),
|
|
224
|
+
conversation_id: pipeline_response.conversation_id
|
|
225
|
+
}
|
|
226
|
+
payload[:thinking] = pipeline_response.thinking if include_thinking && pipeline_response.thinking
|
|
227
|
+
json_response(payload, status_code: 200)
|
|
221
228
|
end
|
|
222
229
|
rescue Legion::LLM::AuthError => e
|
|
223
230
|
handle_exception(e, level: :error, handled: true, operation: 'llm.api.inference.auth', request_id: request_id)
|