lex-llm-vllm 0.2.8 → 0.2.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/lib/legion/extensions/llm/vllm/actors/fleet_worker.rb +11 -2
- data/lib/legion/extensions/llm/vllm/provider.rb +36 -3
- data/lib/legion/extensions/llm/vllm/runners/fleet_worker.rb +19 -0
- data/lib/legion/extensions/llm/vllm/version.rb +1 -1
- data/lib/legion/extensions/llm/vllm.rb +4 -1
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: a3082ce3d1b3d61f220aed833a0b87138ad199cce52cfb455683c04d57db4f10
|
|
4
|
+
data.tar.gz: a542a059e10c1a12a58fa68f611706e3c6007c0642ba57777ddf2e7b31829e5e
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 864d11b0394b30b9df44c5e5ffb97dcba939311c30b87c9588ca8dea73b92f2087c5a6af6c052606f97c63e93201e964d5a358b95ca6b32905f9a087d860bf80
|
|
7
|
+
data.tar.gz: 22df5b5ea5c9dedabe193cd2caa83944eef6f0b107f98605216587eda77ce53efb9fe4fea596fd796415d9c1e5ea7fda2d4079890ef805fd054ba78d7f398d9f
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,15 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## 0.2.10 - 2026-05-13
|
|
4
|
+
|
|
5
|
+
- Add `fetch_model_detail` to re-fetch `/v1/models` for `context_window` on a cache miss.
|
|
6
|
+
- Pre-warm the model detail cache during offering discovery via `cache_set` using `model_detail_cache_key`.
|
|
7
|
+
|
|
8
|
+
## 0.2.9 - 2026-05-12
|
|
9
|
+
|
|
10
|
+
- Route fleet actor load failures through `Legion::Logging::Helper` instead of direct warnings.
|
|
11
|
+
- Add debug logging around vLLM instance discovery, fleet worker dispatch, offering construction, payload rendering, and management endpoints.
|
|
12
|
+
|
|
3
13
|
## 0.2.8 - 2026-05-07
|
|
4
14
|
|
|
5
15
|
- Read vLLM thinking defaults from the active provider instance config so per-instance `enable_thinking` settings affect chat payloads.
|
|
@@ -3,7 +3,11 @@
|
|
|
3
3
|
begin
|
|
4
4
|
require 'legion/extensions/actors/subscription'
|
|
5
5
|
rescue LoadError => e
|
|
6
|
-
|
|
6
|
+
require 'legion/extensions/llm/vllm'
|
|
7
|
+
unless defined?(Legion::Extensions::Actors::Subscription)
|
|
8
|
+
Legion::Extensions::Llm::Vllm.handle_exception(e, level: :warn, handled: false,
|
|
9
|
+
operation: 'vllm.fleet_worker.load_actor_runtime')
|
|
10
|
+
end
|
|
7
11
|
end
|
|
8
12
|
|
|
9
13
|
unless defined?(Legion::Extensions::Actors::Subscription)
|
|
@@ -12,6 +16,7 @@ end
|
|
|
12
16
|
|
|
13
17
|
require 'legion/extensions/llm/vllm'
|
|
14
18
|
require 'legion/extensions/llm/fleet/provider_responder'
|
|
19
|
+
require 'legion/logging'
|
|
15
20
|
|
|
16
21
|
module Legion
|
|
17
22
|
module Extensions
|
|
@@ -20,6 +25,8 @@ module Legion
|
|
|
20
25
|
module Actor
|
|
21
26
|
# Subscription actor for vLLM fleet request consumption.
|
|
22
27
|
class FleetWorker < Legion::Extensions::Actors::Subscription
|
|
28
|
+
include Legion::Logging::Helper
|
|
29
|
+
|
|
23
30
|
def runner_class
|
|
24
31
|
'Legion::Extensions::Llm::Vllm::Runners::FleetWorker'
|
|
25
32
|
end
|
|
@@ -33,7 +40,9 @@ module Legion
|
|
|
33
40
|
end
|
|
34
41
|
|
|
35
42
|
def enabled?
|
|
36
|
-
Legion::Extensions::Llm::Fleet::ProviderResponder.enabled_for?(Vllm.discover_instances)
|
|
43
|
+
Legion::Extensions::Llm::Fleet::ProviderResponder.enabled_for?(Vllm.discover_instances).tap do |enabled|
|
|
44
|
+
log.debug { "vLLM fleet worker enabled=#{enabled}" }
|
|
45
|
+
end
|
|
37
46
|
end
|
|
38
47
|
end
|
|
39
48
|
end
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require 'legion/extensions/llm'
|
|
4
|
+
require 'legion/logging'
|
|
4
5
|
require 'uri'
|
|
5
6
|
|
|
6
7
|
module Legion
|
|
@@ -94,7 +95,9 @@ module Legion
|
|
|
94
95
|
else
|
|
95
96
|
Array(@cached_models)
|
|
96
97
|
end
|
|
97
|
-
models.map { |model_info| offering_from_model(model_info) }
|
|
98
|
+
models.map { |model_info| offering_from_model(model_info) }.tap do |offerings|
|
|
99
|
+
log.debug { "built #{offerings.size} vLLM offering(s) live=#{live}" }
|
|
100
|
+
end
|
|
98
101
|
rescue StandardError => e
|
|
99
102
|
handle_exception(e, level: :warn, handled: true, operation: 'vllm.discover_offerings')
|
|
100
103
|
[]
|
|
@@ -106,25 +109,51 @@ module Legion
|
|
|
106
109
|
end
|
|
107
110
|
|
|
108
111
|
def reset_prefix_cache(reset_running_requests: nil, reset_external: nil)
|
|
112
|
+
log.debug do
|
|
113
|
+
"resetting vLLM prefix cache reset_running_requests=#{reset_running_requests.inspect} " \
|
|
114
|
+
"reset_external=#{reset_external.inspect}"
|
|
115
|
+
end
|
|
109
116
|
connection.post(with_query(reset_prefix_cache_url, reset_running_requests:, reset_external:), {}).body
|
|
110
117
|
end
|
|
111
118
|
|
|
112
119
|
def reset_mm_cache
|
|
120
|
+
log.debug { 'resetting vLLM multimodal cache' }
|
|
113
121
|
connection.post(reset_mm_cache_url, {}).body
|
|
114
122
|
end
|
|
115
123
|
|
|
116
124
|
def sleep(level: 1)
|
|
125
|
+
log.debug { "putting vLLM worker to sleep level=#{level.inspect}" }
|
|
117
126
|
connection.post(with_query(sleep_url, level:), {}).body
|
|
118
127
|
end
|
|
119
128
|
|
|
120
129
|
def wake_up(tags: nil)
|
|
130
|
+
log.debug { "waking vLLM worker tags=#{Array(tags).inspect}" }
|
|
121
131
|
query = Array(tags).map { |tag| ['tags', tag] }
|
|
122
132
|
connection.post(with_query(wake_up_url, query), {}).body
|
|
123
133
|
end
|
|
124
134
|
|
|
135
|
+
def fetch_model_detail(model_name)
|
|
136
|
+
# vLLM reports the context length as `max_model_len` in /v1/models, which discovery also reads.
|
|
137
|
+
# Re-fetch from the models endpoint if we need it outside discovery.
|
|
138
|
+
response = @connection.get(models_url)
|
|
139
|
+
models = response.body.fetch('data', [])
|
|
140
|
+
entry = models.find { |m| m['id'] == model_name.to_s }
|
|
141
|
+
return nil unless entry
|
|
142
|
+
|
|
143
|
+
ctx = entry['max_model_len']
|
|
144
|
+
ctx ? { context_window: ctx } : nil
|
|
145
|
+
rescue StandardError => e
|
|
146
|
+
handle_exception(e, level: :warn, handled: true, operation: 'vllm.fetch_model_detail',
|
|
147
|
+
model: model_name)
|
|
148
|
+
nil
|
|
149
|
+
end
|
|
150
|
+
|
|
125
151
|
private
|
|
126
152
|
|
|
127
153
|
def offering_from_model(model_info)
|
|
154
|
+
ctx = model_info.context_length
|
|
155
|
+
cache_set(model_detail_cache_key(model_info.id), { context_window: ctx }, ttl: 86_400) if ctx
|
|
156
|
+
|
|
128
157
|
Legion::Extensions::Llm::Routing::ModelOffering.new(
|
|
129
158
|
provider_family: :vllm,
|
|
130
159
|
instance_id: config.respond_to?(:instance_id) ? config.instance_id : :default,
|
|
@@ -133,8 +162,8 @@ module Legion
|
|
|
133
162
|
model: model_info.id,
|
|
134
163
|
usage_type: model_info.embedding? ? :embedding : :inference,
|
|
135
164
|
capabilities: model_info.capabilities.map(&:to_s),
|
|
136
|
-
limits: { context_window:
|
|
137
|
-
metadata: { context_length:
|
|
165
|
+
limits: { context_window: ctx }.compact,
|
|
166
|
+
metadata: { context_length: ctx }
|
|
138
167
|
)
|
|
139
168
|
end
|
|
140
169
|
|
|
@@ -150,6 +179,10 @@ module Legion
|
|
|
150
179
|
payload = super
|
|
151
180
|
payload.delete(:reasoning_effort)
|
|
152
181
|
payload[:chat_template_kwargs] = { enable_thinking: true } if thinking_enabled?(thinking)
|
|
182
|
+
log.debug do
|
|
183
|
+
"rendered vLLM payload model=#{model.respond_to?(:id) ? model.id : model} stream=#{stream} " \
|
|
184
|
+
"tools=#{tools.respond_to?(:size) ? tools.size : 0} thinking=#{payload.key?(:chat_template_kwargs)}"
|
|
185
|
+
end
|
|
153
186
|
payload
|
|
154
187
|
end
|
|
155
188
|
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
require 'legion/extensions/llm/fleet/provider_responder'
|
|
4
4
|
require 'legion/extensions/llm/vllm'
|
|
5
|
+
require 'legion/logging'
|
|
5
6
|
|
|
6
7
|
module Legion
|
|
7
8
|
module Extensions
|
|
@@ -10,9 +11,17 @@ module Legion
|
|
|
10
11
|
module Runners
|
|
11
12
|
# Runner entrypoint for vLLM fleet request execution.
|
|
12
13
|
module FleetWorker
|
|
14
|
+
include Legion::Logging::Helper
|
|
15
|
+
extend Legion::Logging::Helper
|
|
16
|
+
|
|
13
17
|
module_function
|
|
14
18
|
|
|
15
19
|
def handle_fleet_request(payload, delivery: nil, properties: nil)
|
|
20
|
+
log.debug do
|
|
21
|
+
"handling vLLM fleet request request_id=#{payload_field(payload, :request_id).inspect} " \
|
|
22
|
+
"provider_instance=#{payload_field(payload, :provider_instance).inspect} " \
|
|
23
|
+
"operation=#{payload_field(payload, :operation).inspect}"
|
|
24
|
+
end
|
|
16
25
|
Legion::Extensions::Llm::Fleet::ProviderResponder.call(
|
|
17
26
|
payload: payload,
|
|
18
27
|
provider_family: Vllm::PROVIDER_FAMILY,
|
|
@@ -22,6 +31,16 @@ module Legion
|
|
|
22
31
|
properties: properties
|
|
23
32
|
)
|
|
24
33
|
end
|
|
34
|
+
|
|
35
|
+
def payload_field(payload, key)
|
|
36
|
+
return unless payload.respond_to?(:[])
|
|
37
|
+
|
|
38
|
+
payload[key] || payload[key.to_s]
|
|
39
|
+
rescue StandardError => e
|
|
40
|
+
handle_exception(e, level: :debug, handled: true, operation: 'vllm.fleet_worker.payload_field',
|
|
41
|
+
field: key)
|
|
42
|
+
nil
|
|
43
|
+
end
|
|
25
44
|
end
|
|
26
45
|
end
|
|
27
46
|
end
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
require 'legion/extensions/llm'
|
|
4
4
|
require 'legion/extensions/llm/vllm/provider'
|
|
5
5
|
require 'legion/extensions/llm/vllm/version'
|
|
6
|
+
require 'legion/logging'
|
|
6
7
|
|
|
7
8
|
module Legion
|
|
8
9
|
module Extensions
|
|
@@ -65,6 +66,7 @@ module Legion
|
|
|
65
66
|
end
|
|
66
67
|
end
|
|
67
68
|
|
|
69
|
+
log.debug { "discovered #{instances.size} vLLM instance(s): #{instances.keys.join(', ')}" }
|
|
68
70
|
instances
|
|
69
71
|
end
|
|
70
72
|
|
|
@@ -92,7 +94,8 @@ module Legion
|
|
|
92
94
|
require 'uri'
|
|
93
95
|
host = URI.parse(url.to_s).host.to_s.downcase
|
|
94
96
|
%w[localhost 127.0.0.1 ::1].include?(host) ? :local : :direct
|
|
95
|
-
rescue URI::InvalidURIError
|
|
97
|
+
rescue URI::InvalidURIError => e
|
|
98
|
+
handle_exception(e, level: :debug, handled: true, operation: 'vllm.infer_tier_from_endpoint')
|
|
96
99
|
:direct
|
|
97
100
|
end
|
|
98
101
|
end
|