lex-llm-vllm 0.2.12 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +9 -0
- data/CHANGELOG.md +18 -0
- data/Gemfile +1 -2
- data/README.md +213 -31
- data/lex-llm-vllm.gemspec +1 -1
- data/lib/legion/extensions/llm/vllm/actors/discovery_refresh.rb +48 -0
- data/lib/legion/extensions/llm/vllm/provider.rb +173 -6
- data/lib/legion/extensions/llm/vllm/translator.rb +696 -0
- data/lib/legion/extensions/llm/vllm/version.rb +1 -1
- data/lib/legion/extensions/llm/vllm.rb +1 -0
- metadata +5 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 172c35debe332979f48575e43bd59c04828449a41a195f3d899bc15afa18bdb2
|
|
4
|
+
data.tar.gz: c423c24ff7a5e4b33f1b6e562b50c196d2870b347bbcad61b38cd228d54ee318
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: dbf9166b8302c7dc786562b7e2e17381b4cf33b570825570cc153dc438bfbef4991e53b0e86811821f08b2a4a00e3b6522bac903764a6bfe3e90f04be4d556ea
|
|
7
|
+
data.tar.gz: 5ee6cda495f98e9f68c4b3ea79b1a0fa28ab833113a00c3aa9a43e9a67e94c4aa79995faf8d8745e063d7c480d77dde246940d8d6b9c3570d678c39be19b496f
|
data/.rubocop.yml
CHANGED
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,23 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## 0.3.0 - 2026-06-10
|
|
4
|
+
|
|
5
|
+
- Add canonical provider translator (`Translator`) implementing `render_request`,
|
|
6
|
+
`parse_response`, `parse_chunk`, and `capabilities` per N×N routing design
|
|
7
|
+
- Wire provider `render_payload`, `parse_completion_response`, `build_chunk` to
|
|
8
|
+
delegate to translator with legacy Message/Chunk bridge for backward compat
|
|
9
|
+
- Declare vLLM quirks: `tool_calls_as_text`, `forced_tool_choice`, `thinking_tags`,
|
|
10
|
+
`streaming_token_usage`
|
|
11
|
+
- G18 parameter mapping: max_tokens, temperature, top_p, top_k, stop_sequences,
|
|
12
|
+
seed, frequency_penalty, presence_penalty, response_format
|
|
13
|
+
- Qwen-style </think> tag extraction and tool-call synthesis from content text
|
|
14
|
+
- Adopt conformance kit (`it_behaves_like 'a canonical provider translator'`)
|
|
15
|
+
- Bump lex-llm dependency floor to >= 0.5.0
|
|
16
|
+
|
|
17
|
+
## 0.2.13 - 2026-06-05
|
|
18
|
+
|
|
19
|
+
- Fix missing documentation comment on `DiscoveryRefresh` actor (RuboCop Style/Documentation)
|
|
20
|
+
|
|
3
21
|
## 0.2.12 - 2026-05-29
|
|
4
22
|
|
|
5
23
|
- Add capabilities `[:completion, :streaming, :vision, :tools]` to `DEFAULT_INSTANCE_TIER` so routing can match vLLM instances by required capability without live discovery
|
data/Gemfile
CHANGED
|
@@ -3,10 +3,9 @@
|
|
|
3
3
|
source 'https://rubygems.org'
|
|
4
4
|
|
|
5
5
|
group :test do
|
|
6
|
-
llm_base_path = ENV.fetch('LEX_LLM_PATH', File.expand_path('../lex-llm', __dir__))
|
|
7
6
|
transport_path = ENV.fetch('LEGION_TRANSPORT_PATH', File.expand_path('../../legion-transport', __dir__))
|
|
8
7
|
gem 'legion-transport', path: transport_path if File.directory?(transport_path)
|
|
9
|
-
|
|
8
|
+
# lex-llm >= 0.5.0 carries canonical types + conformance kit (released on rubygems.org)
|
|
10
9
|
end
|
|
11
10
|
|
|
12
11
|
gemspec
|
data/README.md
CHANGED
|
@@ -2,24 +2,138 @@
|
|
|
2
2
|
|
|
3
3
|
LegionIO LLM provider extension for [vLLM](https://docs.vllm.ai/).
|
|
4
4
|
|
|
5
|
-
This gem
|
|
5
|
+
This gem provides a complete vLLM adapter for the LegionIO LLM routing layer. It speaks the OpenAI-compatible API, discovers models at runtime, publishes availability events, and supports vLLM-specific features like thinking mode and server lifecycle management.
|
|
6
6
|
|
|
7
|
-
|
|
7
|
+
**Namespace:** `Legion::Extensions::Llm::Vllm`
|
|
8
|
+
**Provider slug:** `:vllm`
|
|
9
|
+
**Dependency:** `lex-llm >= 0.5.0`
|
|
8
10
|
|
|
9
|
-
|
|
11
|
+
Load with:
|
|
10
12
|
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
-
|
|
22
|
-
|
|
13
|
+
```ruby
|
|
14
|
+
require 'legion/extensions/llm/vllm'
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
---
|
|
18
|
+
|
|
19
|
+
## Architecture at a Glance
|
|
20
|
+
|
|
21
|
+
```
|
|
22
|
+
Legion::Extensions::Llm::Vllm # Root module (namespace, discovery, defaults)
|
|
23
|
+
|-- Provider # Per-instance provider (chat, models, management)
|
|
24
|
+
| |-- OpenAICompatible (mixin) # Shared request/response handling
|
|
25
|
+
| |-- Capabilities (module) # Capability predicates for offerings
|
|
26
|
+
|
|
|
27
|
+
|-- Actor::DiscoveryRefresh # Periodic actor: refreshes discovered model list
|
|
28
|
+
|-- Actor::FleetWorker # Subscription actor: consumes fleet requests
|
|
29
|
+
|
|
|
30
|
+
|-- Runners::FleetWorker # Runner: delegates to Fleet::ProviderResponder
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
### File Map
|
|
34
|
+
|
|
35
|
+
| File | What |
|
|
36
|
+
|------|------|
|
|
37
|
+
| `lib/legion/extensions/llm/vllm.rb` | Root module, `discover_instances`, `default_settings`, alias normalization |
|
|
38
|
+
| `lib/legion/extensions/llm/vllm/version.rb` | `VERSION` constant |
|
|
39
|
+
| `lib/legion/extensions/llm/vllm/provider.rb` | Provider class, chat/embeddings/model discovery, management endpoints |
|
|
40
|
+
| `lib/legion/extensions/llm/vllm/actors/discovery_refresh.rb` | Periodic actor to refresh model discovery cache |
|
|
41
|
+
| `lib/legion/extensions/llm/vllm/actors/fleet_worker.rb` | Subscription actor for fleet request consumption |
|
|
42
|
+
| `lib/legion/extensions/llm/vllm/runners/fleet_worker.rb` | Runner entrypoint that delegates to `Fleet::ProviderResponder` |
|
|
43
|
+
|
|
44
|
+
---
|
|
45
|
+
|
|
46
|
+
## Key Classes
|
|
47
|
+
|
|
48
|
+
### `Legion::Extensions::Llm::Vllm` (Root Module)
|
|
49
|
+
|
|
50
|
+
The top-level module. It handles auto-registration via `Legion::Extensions::Llm::AutoRegistration`, instance discovery, and configuration normalization.
|
|
51
|
+
|
|
52
|
+
**Constants:**
|
|
53
|
+
- `PROVIDER_FAMILY` — `:vllm`
|
|
54
|
+
- `DEFAULT_INSTANCE_TIER` — `{ tier: :direct, capabilities: [:completion, :streaming, :vision, :tools] }`
|
|
55
|
+
|
|
56
|
+
**Class methods:**
|
|
57
|
+
|
|
58
|
+
| Method | Description |
|
|
59
|
+
|--------|-------------|
|
|
60
|
+
| `default_settings` | Returns the full default settings hash (endpoint, fleet, thinking, etc.) |
|
|
61
|
+
| `provider_class` | Returns `Provider` |
|
|
62
|
+
| `registry_publisher` | Memoized `Legion::Extensions::Llm::RegistryPublisher` instance |
|
|
63
|
+
| `discover_instances` | Probes `localhost:8000` health endpoint, merges configured instances from `Legion::Settings` |
|
|
64
|
+
| `normalize_instance_config(config)` | Normalizes config keys (`base_url`/`api_base`/`endpoint` -> `vllm_api_base`), infers tier |
|
|
65
|
+
| `normalize_api_base(url)` | Strips trailing `/v1` from URLs |
|
|
66
|
+
| `infer_tier_from_endpoint(url)` | Returns `:local` for localhost addresses, `:direct` otherwise |
|
|
67
|
+
|
|
68
|
+
**Instance discovery sources:**
|
|
69
|
+
1. HTTP health probe against `http://localhost:8000` (0.1s timeout) -> `:local` tier
|
|
70
|
+
2. Configured instances under `Legion::Settings[:extensions][:llm][:vllm][:instances]`
|
|
71
|
+
|
|
72
|
+
### `Legion::Extensions::Llm::Vllm::Provider`
|
|
73
|
+
|
|
74
|
+
The per-instance provider class. Inherits from `Legion::Extensions::Llm::Provider` and mixes in `OpenAICompatible` for shared HTTP request/response handling.
|
|
75
|
+
|
|
76
|
+
**Class methods:**
|
|
77
|
+
|
|
78
|
+
| Method | Returns |
|
|
79
|
+
|--------|---------|
|
|
80
|
+
| `slug` | `'vllm'` |
|
|
81
|
+
| `local?` | `false` |
|
|
82
|
+
| `default_transport` | `:http` |
|
|
83
|
+
| `default_tier` | `:direct` |
|
|
84
|
+
| `configuration_options` | `[:vllm_api_base, :vllm_api_key]` |
|
|
85
|
+
| `configuration_requirements` | `[]` (no required fields) |
|
|
86
|
+
| `capabilities` | `Capabilities` module |
|
|
87
|
+
| `registry_publisher` | Delegates to `Vllm.registry_publisher` |
|
|
88
|
+
|
|
89
|
+
**Instance methods:**
|
|
90
|
+
|
|
91
|
+
| Method | Description |
|
|
92
|
+
|--------|-------------|
|
|
93
|
+
| `api_base` | Normalized API root from config, settings, or `http://localhost:8000` |
|
|
94
|
+
| `headers` | Identity headers + optional Bearer token |
|
|
95
|
+
| `settings` | Returns `Vllm.default_settings` |
|
|
96
|
+
| `health(live:)` | `GET /health` |
|
|
97
|
+
| `readiness(live:)` | Checks readiness, publishes async readiness event when `live: true` |
|
|
98
|
+
| `list_models` | `GET /v1/models`, publishes async model availability events |
|
|
99
|
+
| `discover_offerings(live:, **)` | Builds `ModelOffering` instances from discovered models (uses cache when not live) |
|
|
100
|
+
| `version` | `GET /version` |
|
|
101
|
+
| `fetch_model_detail(model_name)` | Re-fetches `/v1/models` to resolve `context_window` on cache miss |
|
|
102
|
+
| `stream_usage_supported?` | Always `true` for vLLM |
|
|
103
|
+
| `reset_prefix_cache(reset_running_requests:, reset_external:)` | `POST /reset_prefix_cache` |
|
|
104
|
+
| `reset_mm_cache` | `POST /reset_mm_cache` |
|
|
105
|
+
| `sleep(level:)` | `POST /sleep` |
|
|
106
|
+
| `wake_up(tags:)` | `POST /wake_up` |
|
|
107
|
+
|
|
108
|
+
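**Payload rendering:** Overrides `render_payload` to build a canonical request and delegate wire-format rendering to `Translator#render_request`. vLLM thinking mode is supported via `chat_template_kwargs`, and the OpenAI-specific `reasoning_effort` key is stripped.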
|
|
109
|
+
|
|
110
|
+
### `Provider::Capabilities` (Module)
|
|
111
|
+
|
|
112
|
+
Predicate methods for model capability detection. All return `true` for vLLM by default:
|
|
113
|
+
|
|
114
|
+
- `chat?(model)`, `streaming?(model)`, `vision?(model)`, `functions?(model)`, `embeddings?(model)`
|
|
115
|
+
- `critical_capabilities_for(model)` — returns array of active capability names
|
|
116
|
+
|
|
117
|
+
### `Actor::DiscoveryRefresh`
|
|
118
|
+
|
|
119
|
+
Periodic actor (extends `Legion::Extensions::Actors::Every`) that refreshes the vLLM discovered model list.
|
|
120
|
+
|
|
121
|
+
- **Default interval:** 1800 seconds (30 minutes)
|
|
122
|
+
- **Configurable via:** `Legion::Settings[:extensions][:llm][:vllm][:discovery_interval]`
|
|
123
|
+
- **Action:** Calls `Legion::LLM::Discovery.refresh_discovered_models!(provider: :vllm)`
|
|
124
|
+
|
|
125
|
+
### `Actor::FleetWorker`
|
|
126
|
+
|
|
127
|
+
Subscription actor (extends `Legion::Extensions::Actors::Subscription`) that consumes LLM fleet requests routed to vLLM.
|
|
128
|
+
|
|
129
|
+
- Only activates when `Fleet::ProviderResponder.enabled_for?` returns true for discovered instances
|
|
130
|
+
- Delegates execution to `Runners::FleetWorker.handle_fleet_request`
|
|
131
|
+
|
|
132
|
+
### `Runners::FleetWorker`
|
|
133
|
+
|
|
134
|
+
Runner module that dispatches fleet requests to `Legion::Extensions::Llm::Fleet::ProviderResponder` with vLLM-specific context (provider family, class, instance discovery callback).
|
|
135
|
+
|
|
136
|
+
---
|
|
23
137
|
|
|
24
138
|
## Defaults
|
|
25
139
|
|
|
@@ -49,8 +163,12 @@ Legion::Extensions::Llm::Vllm.default_settings
|
|
|
49
163
|
# }
|
|
50
164
|
```
|
|
51
165
|
|
|
166
|
+
---
|
|
167
|
+
|
|
52
168
|
## Configuration
|
|
53
169
|
|
|
170
|
+
### Per-instance via Legion::Extensions::Llm.configure
|
|
171
|
+
|
|
54
172
|
```ruby
|
|
55
173
|
Legion::Extensions::Llm.configure do |config|
|
|
56
174
|
config.vllm_api_base = "http://localhost:8000"
|
|
@@ -60,9 +178,36 @@ Legion::Extensions::Llm.configure do |config|
|
|
|
60
178
|
end
|
|
61
179
|
```
|
|
62
180
|
|
|
181
|
+
### Multi-instance via Legion::Settings
|
|
182
|
+
|
|
183
|
+
```yaml
|
|
184
|
+
extensions:
|
|
185
|
+
llm:
|
|
186
|
+
vllm:
|
|
187
|
+
discovery_interval: 1800 # seconds between model list refreshes
|
|
188
|
+
instances:
|
|
189
|
+
production:
|
|
190
|
+
vllm_api_base: "https://vllm.example.com"
|
|
191
|
+
tier: :direct
|
|
192
|
+
local:
|
|
193
|
+
vllm_api_base: "http://localhost:8000"
|
|
194
|
+
tier: :local
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
### Endpoint alias normalization
|
|
198
|
+
|
|
199
|
+
The following keys are all resolved to `vllm_api_base` during instance config normalization:
|
|
200
|
+
- `base_url`
|
|
201
|
+
- `api_base`
|
|
202
|
+
- `endpoint`
|
|
203
|
+
|
|
204
|
+
Trailing `/v1` is stripped automatically.
|
|
205
|
+
|
|
206
|
+
---
|
|
207
|
+
|
|
63
208
|
## Fleet Responder
|
|
64
209
|
|
|
65
|
-
Provider instances can opt in to consuming Legion LLM fleet requests. The
|
|
210
|
+
Provider instances can opt in to consuming Legion LLM fleet requests. The fleet actor only starts when at least one configured instance enables `respond_to_requests`.
|
|
66
211
|
|
|
67
212
|
```yaml
|
|
68
213
|
extensions:
|
|
@@ -79,29 +224,51 @@ extensions:
|
|
|
79
224
|
- embed
|
|
80
225
|
```
|
|
81
226
|
|
|
82
|
-
|
|
227
|
+
Execution flows: `Actor::FleetWorker` (receives message) -> `Runners::FleetWorker.handle_fleet_request` -> `Fleet::ProviderResponder.call`.
|
|
228
|
+
|
|
229
|
+
---
|
|
230
|
+
|
|
231
|
+
## Thinking Mode
|
|
83
232
|
|
|
84
|
-
|
|
233
|
+
vLLM supports a "thinking" mode that enables extended reasoning. Enable via:
|
|
85
234
|
|
|
235
|
+
**Instance-level:**
|
|
236
|
+
```yaml
|
|
237
|
+
extensions:
|
|
238
|
+
llm:
|
|
239
|
+
vllm:
|
|
240
|
+
instances:
|
|
241
|
+
default:
|
|
242
|
+
enable_thinking: true
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
**Global:**
|
|
86
246
|
```ruby
|
|
87
|
-
#
|
|
247
|
+
# Legion::Settings or settings JSON
|
|
88
248
|
{ llm: { providers: { vllm: { enable_thinking: true } } } }
|
|
89
249
|
```
|
|
90
250
|
|
|
91
|
-
|
|
251
|
+
**Per-request:**
|
|
252
|
+
```ruby
|
|
253
|
+
# Pass thinking: { enabled: true } in the chat kwargs
|
|
254
|
+
```
|
|
255
|
+
|
|
256
|
+
When enabled, the provider adds `chat_template_kwargs: { enable_thinking: true }` to the chat payload and strips the OpenAI-specific `reasoning_effort` key.
|
|
257
|
+
|
|
258
|
+
---
|
|
92
259
|
|
|
93
260
|
## Management Endpoints
|
|
94
261
|
|
|
95
|
-
|
|
262
|
+
| Method | Endpoint | Kwargs | Description |
|
|
263
|
+
|--------|----------|--------|-------------|
|
|
264
|
+
| `health(live:)` | `GET /health` | `live:` | Server health check |
|
|
265
|
+
| `version` | `GET /version` | none | Server version info |
|
|
266
|
+
| `reset_prefix_cache` | `POST /reset_prefix_cache` | `reset_running_requests:`, `reset_external:` | Clear prefix cache |
|
|
267
|
+
| `reset_mm_cache` | `POST /reset_mm_cache` | none | Clear multimodal cache |
|
|
268
|
+
| `sleep(level:)` | `POST /sleep` | `level:` (default: 1) | Put worker to sleep |
|
|
269
|
+
| `wake_up(tags:)` | `POST /wake_up` | `tags:` | Wake worker up |
|
|
96
270
|
|
|
97
|
-
|
|
98
|
-
|--------|----------|-------------|
|
|
99
|
-
| `health` | `GET /health` | Server health check |
|
|
100
|
-
| `version` | `GET /version` | Server version info |
|
|
101
|
-
| `reset_prefix_cache` | `POST /reset_prefix_cache` | Clear prefix cache |
|
|
102
|
-
| `reset_mm_cache` | `POST /reset_mm_cache` | Clear multimodal cache |
|
|
103
|
-
| `sleep(level:)` | `POST /sleep` | Put server to sleep |
|
|
104
|
-
| `wake_up(tags:)` | `POST /wake_up` | Wake server up |
|
|
271
|
+
---
|
|
105
272
|
|
|
106
273
|
## Registry Publishing
|
|
107
274
|
|
|
@@ -110,16 +277,31 @@ When `lex-llm` routing and Legion transport are available, the provider publishe
|
|
|
110
277
|
- **Readiness events** on `readiness(live: true)` calls
|
|
111
278
|
- **Model availability events** on `list_models` discovery
|
|
112
279
|
|
|
113
|
-
|
|
280
|
+
All publishing is async (background threads) and never blocks the caller. Failures are logged via `handle_exception`.
|
|
281
|
+
|
|
282
|
+
---
|
|
283
|
+
|
|
284
|
+
## Model Discovery & Offerings
|
|
285
|
+
|
|
286
|
+
On `list_models`, vLLM returns `max_model_len`, which is mapped to `context_length`. This value is:
|
|
287
|
+
1. Attached to `Model::Info` objects
|
|
288
|
+
2. Cached via `cache_set` with 86400s TTL keyed by `model_detail_cache_key`
|
|
289
|
+
3. Available in routing offerings via `limits: { context_window: ctx }`
|
|
290
|
+
|
|
291
|
+
`discover_offerings(live: false)` serves from the cached model list without hitting the network.
|
|
292
|
+
|
|
293
|
+
---
|
|
114
294
|
|
|
115
295
|
## Development
|
|
116
296
|
|
|
117
297
|
```bash
|
|
118
298
|
bundle install
|
|
119
|
-
bundle exec rspec
|
|
299
|
+
bundle exec rspec
|
|
120
300
|
bundle exec rubocop -A
|
|
121
301
|
```
|
|
122
302
|
|
|
303
|
+
---
|
|
304
|
+
|
|
123
305
|
## License
|
|
124
306
|
|
|
125
307
|
MIT
|
data/lex-llm-vllm.gemspec
CHANGED
|
@@ -27,5 +27,5 @@ Gem::Specification.new do |spec|
|
|
|
27
27
|
spec.add_dependency 'legion-logging', '>= 1.3.2'
|
|
28
28
|
spec.add_dependency 'legion-settings', '>= 1.3.14'
|
|
29
29
|
spec.add_dependency 'legion-transport', '>= 1.4.14'
|
|
30
|
-
spec.add_dependency 'lex-llm', '>= 0.
|
|
30
|
+
spec.add_dependency 'lex-llm', '>= 0.5.0'
|
|
31
31
|
end
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
begin
|
|
4
|
+
require 'legion/extensions/actors/every'
|
|
5
|
+
rescue LoadError => e
|
|
6
|
+
warn(e.message) if $VERBOSE
|
|
7
|
+
end
|
|
8
|
+
|
|
9
|
+
return unless defined?(Legion::Extensions::Actors::Every)
|
|
10
|
+
|
|
11
|
+
module Legion
  module Extensions
    module Llm
      module Vllm
        module Actor
          # Periodic actor that asks Legion::LLM::Discovery to refresh the
          # discovered model list for the :vllm provider.
          #
          # The actor runs its own #manual method instead of a separate runner
          # (see #runner_class, #runner_function and #use_runner?).
          class DiscoveryRefresh < Legion::Extensions::Actors::Every
            include Legion::Logging::Helper

            # Fallback refresh interval, used when no interval is configured.
            REFRESH_INTERVAL = 1800

            # The actor class itself acts as the runner.
            def runner_class
              self.class
            end

            # Name of the method invoked on each tick.
            def runner_function
              'manual'
            end

            def run_now?
              true
            end

            def use_runner?
              false
            end

            def check_subtask?
              false
            end

            def generate_task?
              false
            end

            # Interval between refreshes: the value stored under
            # extensions.llm.vllm.discovery_interval in Legion::Settings when
            # that is available and set, otherwise REFRESH_INTERVAL.
            def time
              configured = nil
              if defined?(Legion::Settings)
                configured = Legion::Settings.dig(:extensions, :llm, :vllm, :discovery_interval)
              end
              configured || REFRESH_INTERVAL
            end

            # Logs the refresh and triggers it when Legion::LLM::Discovery is
            # loaded. Any StandardError is passed to handle_exception
            # (presumably provided by Legion::Logging::Helper -- confirm)
            # rather than propagated.
            def manual
              log.debug('[vllm][discovery_refresh] refreshing model list')
              Legion::LLM::Discovery.refresh_discovered_models!(provider: :vllm) if defined?(Legion::LLM::Discovery)
            rescue StandardError => e
              handle_exception(e, level: :warn, handled: true, operation: 'vllm.actor.discovery_refresh')
            end
          end
        end
      end
    end
  end
end
|
|
@@ -53,6 +53,11 @@ module Legion
|
|
|
53
53
|
Vllm.default_settings
|
|
54
54
|
end
|
|
55
55
|
|
|
56
|
+
# Canonical translator instance — renders requests, parses responses/chunks.
|
|
57
|
+
# Returns the memoised Translator for this provider, building it from the
# provider's config on first use.
def translator
  return @translator if @translator

  @translator = Translator.new(config: config)
end
|
|
60
|
+
|
|
56
61
|
def api_base
|
|
57
62
|
normalize_url(config.vllm_api_base || settings[:endpoint] || 'http://localhost:8000')
|
|
58
63
|
end
|
|
@@ -173,15 +178,159 @@ module Legion
|
|
|
173
178
|
)
|
|
174
179
|
end
|
|
175
180
|
|
|
181
|
+
# ── Canonical bridge: legacy provider API → Canonical::Request ──
|
|
182
|
+
|
|
183
|
+
# rubocop:disable Metrics/ParameterLists, Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity -- bridge method can be complex
|
|
184
|
+
# Bridges the legacy provider call parameters into a Canonical::Request.
#
# messages    - enumerable of message objects; entries that respond to #to_h are
#               converted with Canonical::Message.from_hash, others are dropped.
# tools       - nil or something convertible with #to_h; values that are not
#               already Canonical::ToolDefinition are converted via from_hash.
# temperature - forwarded as the :temperature param.
# model       - object responding to #id, or anything stringifiable.
# stream      - forwarded unchanged.
# schema      - forwarded as :response_format when truthy.
# thinking    - see #build_canonical_thinking.
# tool_prefs  - passed to #format_tool_choice_from_prefs.
#
# Returns whatever Canonical::Request.build returns.
def build_canonical_request(
  messages:, tools:, temperature:, model:, stream:, schema:, thinking:, tool_prefs:
)
  model_id = model.respond_to?(:id) ? model.id : model.to_s

  canonical_messages = messages.filter_map do |msg|
    Canonical::Message.from_hash(msg.to_h) if msg.respond_to?(:to_h)
  end

  canonical_tools = tools.to_h.transform_values do |tool|
    next tool if tool.is_a?(Canonical::ToolDefinition)

    Canonical::ToolDefinition.from_hash(tool.respond_to?(:to_h) ? tool.to_h : tool)
  end

  params_hash = { temperature: temperature }
  params_hash[:response_format] = schema if schema

  Canonical::Request.build(
    messages: canonical_messages,
    system: extract_system_prompt(messages),
    tools: canonical_tools,
    tool_choice: format_tool_choice_from_prefs(tool_prefs),
    params: Canonical::Params.from_hash(params_hash),
    thinking: build_canonical_thinking(thinking),
    stream: stream,
    metadata: { model: model_id }
  )
end

# Converts a legacy thinking option into a Canonical::Thinking::Config, or nil
# when thinking is not requested.
#
# * Hash input: an explicit `enabled: false` (symbol or string key) yields nil so
#   that a caller can switch thinking off; otherwise effort/budget are read from
#   symbol or string keys.
# * Object input: a config is built only when the object reports `enabled?`;
#   effort and budget are forwarded when the object exposes them.
def build_canonical_thinking(thinking)
  if thinking.is_a?(Hash)
    enabled = thinking.fetch(:enabled) { thinking['enabled'] }
    return nil if enabled == false

    return Canonical::Thinking::Config.new(
      effort: thinking[:effort] || thinking['effort'],
      budget: thinking[:budget] || thinking['budget']
    )
  end

  return nil unless thinking.respond_to?(:enabled?) && thinking.enabled?

  Canonical::Thinking::Config.new(
    effort: thinking.respond_to?(:effort) ? thinking.effort : nil,
    budget: thinking.respond_to?(:budget) ? thinking.budget : nil
  )
end
|
|
230
|
+
# rubocop:enable Metrics/ParameterLists, Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
|
231
|
+
|
|
232
|
+
# ── Canonical bridge: Canonical→legacy Message/Chunk ──
|
|
233
|
+
|
|
234
|
+
# rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity -- verbose bridge
|
|
235
|
+
# Converts a canonical response into the legacy assistant Message.
#
# canonical     - object exposing #thinking, #tool_calls, #usage, #text and #model.
# raw_body      - stored on the message as :raw.
# _raw_response - accepted for signature compatibility; unused.
#
# Tool calls are stored in a Hash keyed by the tool name (falling back to the
# call id). When a later call would reuse an existing key -- e.g. two parallel
# calls to the same tool -- its key is suffixed with the call id (or its
# position) so that no call is silently overwritten.
def to_legacy_message(canonical, raw_body, _raw_response)
  thinking = nil
  if canonical.thinking
    thinking = Thinking.build(
      text: canonical.thinking.content,
      signature: canonical.thinking.signature
    )
  end

  tool_calls = {}
  # Array() tolerates a response that carries no tool_calls collection at all.
  Array(canonical.tool_calls).each_with_index do |tc, index|
    key = (tc.name || tc.id).to_s.to_sym
    key = :"#{key}_#{tc.id || index}" if tool_calls.key?(key)
    tool_calls[key] = Legion::Extensions::Llm::ToolCall.new(
      id: tc.id,
      name: tc.name,
      arguments: tc.arguments
    )
  end

  # An empty Hash responds to none of the token readers, so every count is nil.
  usage = canonical.usage || {}

  Legion::Extensions::Llm::Message.new(
    role: :assistant,
    content: canonical.text,
    model_id: canonical.model,
    tool_calls: tool_calls.empty? ? nil : tool_calls,
    thinking: thinking,
    input_tokens: usage.respond_to?(:input_tokens) ? usage.input_tokens : nil,
    output_tokens: usage.respond_to?(:output_tokens) ? usage.output_tokens : nil,
    reasoning_tokens: usage.respond_to?(:thinking_tokens) ? usage.thinking_tokens : nil,
    raw: raw_body
  )
end
|
|
268
|
+
# rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
|
269
|
+
|
|
270
|
+
# Converts a canonical stream chunk into the legacy assistant Chunk.
#
# canonical - object exposing #type, #delta and #usage, or nil.
# raw_data  - parsed chunk payload; its 'model' entry becomes model_id and the
#             whole payload is stored as :raw.
#
# Returns nil for a nil canonical chunk. A :thinking_delta chunk carries its
# delta as thinking text and no content; every other type carries the delta as
# content. Tool calls are never populated here (tool_calls is always nil).
def to_legacy_chunk(canonical, raw_data)
  return nil if canonical.nil?

  # An empty Hash responds to none of the token readers, so counts become nil.
  usage = canonical.usage || {}

  content = canonical.delta
  thinking = nil
  if canonical.type == :thinking_delta
    thinking = Thinking.build(text: canonical.delta)
    content = nil
  end

  Legion::Extensions::Llm::Chunk.new(
    role: :assistant,
    content: content,
    model_id: raw_data['model'],
    tool_calls: nil,
    thinking: thinking,
    input_tokens: usage.respond_to?(:input_tokens) ? usage.input_tokens : nil,
    output_tokens: usage.respond_to?(:output_tokens) ? usage.output_tokens : nil,
    raw: raw_data
  )
end
|
|
291
|
+
|
|
292
|
+
# ── Tool choice helpers ──
|
|
293
|
+
|
|
294
|
+
# Derives the canonical tool_choice from legacy tool preferences.
#
# tool_prefs - nil, or a container indexable by :choice / 'choice'.
#
# Returns nil when there are no preferences or no choice; the Symbol :auto,
# :none or :required for those modes; otherwise `{ name: "<tool>" }`.
# A Hash-shaped choice that carries a :name / 'name' entry is honoured as a
# named tool instead of being stringified wholesale.
def format_tool_choice_from_prefs(tool_prefs)
  return nil unless tool_prefs

  choice = tool_prefs[:choice] || tool_prefs['choice']
  return nil unless choice

  if choice.is_a?(Hash)
    named = choice[:name] || choice['name']
    return { name: named.to_s } if named
  end

  label = choice.to_s
  return label.to_sym if %w[auto none required].include?(label)

  { name: label }
end
|
|
303
|
+
|
|
304
|
+
# rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity -- multibranch guard chain for system parsing
|
|
305
|
+
# Returns the text of a leading system message, or nil.
#
# messages - expected to be an Array whose entries are message objects
#            (responding to #role / #content) or Hashes with symbol or string
#            keys. Anything else yields nil.
#
# Only the first entry is inspected. nil is returned when its role is not
# "system" (Symbol or String) or when its content is not a String. Entries with
# a nil role, or entries that are neither message objects nor Hashes, yield nil
# instead of raising.
def extract_system_prompt(messages)
  return nil unless messages.is_a?(Array)

  first = messages.first
  return nil unless first

  # Reads a field from a message object, or from a symbol/string-keyed Hash.
  read = lambda do |key|
    if first.respond_to?(key)
      first.public_send(key)
    elsif first.is_a?(Hash)
      first[key] || first[key.to_s]
    end
  end

  # to_s normalises :system / 'system' and turns a missing role into ''.
  return nil unless read.call(:role).to_s == 'system'

  content = read.call(:content)
  content.is_a?(String) ? content : nil
end
|
|
318
|
+
# rubocop:enable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
|
319
|
+
|
|
176
320
|
def render_payload(messages, tools:, temperature:, model:, stream:, schema:, thinking:, tool_prefs:) # rubocop:disable Metrics/ParameterLists
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
321
|
+
# Build a canonical request from provider call parameters,
|
|
322
|
+
# then delegate to the translator for wire-format rendering.
|
|
323
|
+
canonical_req = build_canonical_request(
|
|
324
|
+
messages:, tools:, temperature:, model:, stream:,
|
|
325
|
+
schema:, thinking:, tool_prefs:
|
|
326
|
+
)
|
|
327
|
+
wire = translator.render_request(canonical_req)
|
|
328
|
+
|
|
180
329
|
log.debug do
|
|
181
|
-
"rendered
|
|
182
|
-
"
|
|
330
|
+
"vLLM provider rendered wire payload model=#{wire[:model]} stream=#{wire[:stream]} " \
|
|
331
|
+
"messages=#{(wire[:messages] || []).size} keys=#{wire.keys.join(', ')}"
|
|
183
332
|
end
|
|
184
|
-
|
|
333
|
+
wire
|
|
185
334
|
end
|
|
186
335
|
|
|
187
336
|
def thinking_enabled?(thinking)
|
|
@@ -214,6 +363,24 @@ module Legion
|
|
|
214
363
|
vllm[:enable_thinking] == true || vllm['enable_thinking'] == true
|
|
215
364
|
end
|
|
216
365
|
|
|
366
|
+
# Override: delegate completion response parsing to the canonical translator.
|
|
367
|
+
# Parses a completion response through the canonical translator and converts
# the result back into the legacy Message shape via #to_legacy_message.
#
# response - object exposing #body; the body is handed to the translator and
#            kept as the raw payload of the resulting message.
def parse_completion_response(response)
  raw_body = response.body
  to_legacy_message(translator.parse_response(raw_body), raw_body, response)
end
|
|
375
|
+
|
|
376
|
+
# Override: delegate SSE chunk parsing to the canonical translator.
|
|
377
|
+
# Parses one streamed chunk through the canonical translator and converts it
# into the legacy Chunk shape via #to_legacy_chunk.
#
# Returns nil when the translator yields no chunk for the given data.
def build_chunk(data)
  parsed = translator.parse_chunk(data)
  parsed.nil? ? nil : to_legacy_chunk(parsed, data)
end
|
|
383
|
+
|
|
217
384
|
def parse_list_models_response(response, provider, capabilities)
|
|
218
385
|
response.body.fetch('data', []).map do |model|
|
|
219
386
|
critical_capabilities = critical_capabilities_for(capabilities, model)
|