lex-llm-vllm 0.3.5 → 0.3.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +37 -0
- data/lex-llm-vllm.gemspec +1 -1
- data/lib/legion/extensions/llm/vllm/actors/discovery_refresh.rb +117 -12
- data/lib/legion/extensions/llm/vllm/provider.rb +56 -69
- data/lib/legion/extensions/llm/vllm/runners/fleet_worker.rb +1 -1
- data/lib/legion/extensions/llm/vllm/translator.rb +13 -0
- data/lib/legion/extensions/llm/vllm/version.rb +1 -1
- data/lib/legion/extensions/llm/vllm.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 66c6939650ded871419a3833d9d0a89e5c31a104f3acccbb065af86af17709cb
|
|
4
|
+
data.tar.gz: e639e39a60ff4c03838100bd361048997fecfe38e92fdd9f1763809958a43f8e
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: cf346327446a5e3c7b8666c8ea04269d7fea32b7bbe89b9a0c11b7578fe3ca1a8f948cf4fc639265bfcfa6f0a65bca7007f24ce90146f6234f1589fcb6fe5c31
|
|
7
|
+
data.tar.gz: 1e324da7463bd1a63b14b5fde9b3f6030be0f8cdd933d802d350066beaf40fe8208998a298a848b00126bf5d1d68868b8384c2ed719fb863c825179aa8c92ced
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,42 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [0.3.11] - 2026-06-20
|
|
4
|
+
|
|
5
|
+
### Fixed
|
|
6
|
+
- Stub shared registry publishing through `RegistryPublisher#schedule` in specs so async availability-event coverage stays stable after the shared publisher moved off raw `Thread.new`.
|
|
7
|
+
|
|
8
|
+
## [0.3.10] - 2026-06-20
|
|
9
|
+
|
|
10
|
+
### Fixed
|
|
11
|
+
- Stop bulk-publishing vLLM model availability from `list_models`; discovery now emits one registry event per seen model from the shared `lex-llm` policy-filter path so blocked models stay observable without duplicate publishes.
|
|
12
|
+
|
|
13
|
+
## [0.3.9] - 2026-06-20
|
|
14
|
+
|
|
15
|
+
### Changed
|
|
16
|
+
- Slow the live discovery refresh cadence from 60 seconds to 300 seconds for vLLM instances; `extensions.llm.vllm.discovery_interval` still overrides the default.
|
|
17
|
+
|
|
18
|
+
## [0.3.8] - 2026-06-20
|
|
19
|
+
|
|
20
|
+
### Fixed
|
|
21
|
+
- Use the shared `lex-llm` capability override contract for provider, instance, and model settings, with canonical capability normalization for embedding/tool/thinking routing.
|
|
22
|
+
|
|
23
|
+
## [0.3.7] - 2026-06-19
|
|
24
|
+
|
|
25
|
+
### Changed
|
|
26
|
+
- Adopt `Legion::Extensions::Llm::Inventory::ScopedRefresher` mixin (lex-llm 0.6.0). Discovery
|
|
27
|
+
refresh actors now write directly to the live `Inventory` catalog via `Inventory.write_lane`.
|
|
28
|
+
- Pin `lex-llm >= 0.6.0` and `legion-llm >= 0.14.0` in gemspec.
|
|
29
|
+
- Standard `weight: 100` default added to provider instance settings schema.
|
|
30
|
+
|
|
31
|
+
## 0.3.6 - 2026-06-18
|
|
32
|
+
|
|
33
|
+
- **Streaming token usage** — request `stream_options: { include_usage: true }` on streaming chat so
|
|
34
|
+
vLLM emits the final usage-only chunk. Streaming responses now carry input/output token counts;
|
|
35
|
+
previously every streamed response reported zero tokens, which blinded metering/cost. Overridable
|
|
36
|
+
per-instance via `config[:stream_token_usage] = false` for a non-conforming OpenAI-compatible
|
|
37
|
+
backend that rejects the field. The chunk parser already handled the trailing `choices: []` usage
|
|
38
|
+
chunk; the gap was only that the request never asked for it.
|
|
39
|
+
|
|
3
40
|
## 0.3.5 - 2026-06-16
|
|
4
41
|
|
|
5
42
|
- Extract `vllm_api_key` from `credentials: { api_key: ... }` in instance settings so Bearer auth works with the standard settings layout.
|
data/lex-llm-vllm.gemspec
CHANGED
|
@@ -27,5 +27,5 @@ Gem::Specification.new do |spec|
|
|
|
27
27
|
spec.add_dependency 'legion-logging', '>= 1.3.2'
|
|
28
28
|
spec.add_dependency 'legion-settings', '>= 1.3.14'
|
|
29
29
|
spec.add_dependency 'legion-transport', '>= 1.4.14'
|
|
30
|
-
spec.add_dependency 'lex-llm', '>= 0.
|
|
30
|
+
spec.add_dependency 'lex-llm', '>= 0.6.0'
|
|
31
31
|
end
|
|
@@ -1,11 +1,19 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require 'digest'
|
|
4
|
+
|
|
3
5
|
begin
|
|
4
6
|
require 'legion/extensions/actors/every'
|
|
5
7
|
rescue LoadError => e
|
|
6
8
|
warn(e.message) if $VERBOSE
|
|
7
9
|
end
|
|
8
10
|
|
|
11
|
+
begin
|
|
12
|
+
require 'legion/extensions/llm/inventory/scoped_refresher'
|
|
13
|
+
rescue LoadError => e
|
|
14
|
+
warn(e.message) if $VERBOSE
|
|
15
|
+
end
|
|
16
|
+
|
|
9
17
|
return unless defined?(Legion::Extensions::Actors::Every)
|
|
10
18
|
|
|
11
19
|
module Legion
|
|
@@ -17,7 +25,11 @@ module Legion
|
|
|
17
25
|
class DiscoveryRefresh < Legion::Extensions::Actors::Every
|
|
18
26
|
include Legion::Logging::Helper
|
|
19
27
|
|
|
20
|
-
|
|
28
|
+
if defined?(Legion::Extensions::Llm::Inventory::ScopedRefresher)
|
|
29
|
+
include Legion::Extensions::Llm::Inventory::ScopedRefresher
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
def self.every_seconds = 300
|
|
21
33
|
|
|
22
34
|
def runner_class = self.class
|
|
23
35
|
def runner_function = 'manual'
|
|
@@ -27,24 +39,117 @@ module Legion
|
|
|
27
39
|
def generate_task? = false
|
|
28
40
|
|
|
29
41
|
def time
|
|
30
|
-
return
|
|
42
|
+
return self.class.every_seconds unless defined?(Legion::Settings)
|
|
43
|
+
|
|
44
|
+
Legion::Settings.dig(:extensions, :llm, :vllm, :discovery_interval) || self.class.every_seconds
|
|
45
|
+
end
|
|
31
46
|
|
|
32
|
-
|
|
47
|
+
def scope_key = { provider: :vllm }
|
|
48
|
+
def offering_type(raw_type) = %i[embed embedding].include?(raw_type) ? :embedding : :inference
|
|
49
|
+
|
|
50
|
+
def vllm_cfg
|
|
51
|
+
return unless defined?(Legion::Settings)
|
|
52
|
+
|
|
53
|
+
Legion::Settings.dig(:extensions, :llm,
|
|
54
|
+
:vllm)
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
def compute_lanes_for_scope(**)
|
|
58
|
+
return [] unless defined?(Legion::LLM::Call::Registry)
|
|
59
|
+
|
|
60
|
+
vllm_instances.flat_map { |entry| lanes_from_instance(entry) }
|
|
61
|
+
rescue StandardError => e
|
|
62
|
+
handle_exception(e, level: :warn, handled: true, operation: 'vllm.actor.compute_lanes_for_scope')
|
|
63
|
+
[]
|
|
64
|
+
end
|
|
65
|
+
|
|
66
|
+
def credential_hash(**)
|
|
67
|
+
cfg = vllm_cfg
|
|
68
|
+
Digest::SHA256.hexdigest(cfg&.dig(:api_key).to_s + cfg&.dig(:instances).to_s)[0, 16]
|
|
69
|
+
rescue StandardError
|
|
70
|
+
'unknown'
|
|
33
71
|
end
|
|
34
72
|
|
|
35
73
|
def manual
|
|
36
|
-
|
|
37
|
-
|
|
74
|
+
run_scoped_tick
|
|
75
|
+
rescue StandardError => e
|
|
76
|
+
handle_exception(e, level: :warn, handled: true, operation: 'vllm.actor.discovery_refresh')
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
private
|
|
80
|
+
|
|
81
|
+
def run_scoped_tick
|
|
82
|
+
return unless defined?(Legion::Extensions::Llm::Inventory::ScopedRefresher)
|
|
83
|
+
return unless self.class.ancestors.include?(Legion::Extensions::Llm::Inventory::ScopedRefresher)
|
|
84
|
+
|
|
85
|
+
tick
|
|
86
|
+
end
|
|
38
87
|
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
88
|
+
def vllm_instances
|
|
89
|
+
Legion::LLM::Call::Registry.all_instances.select { |e| (e[:provider] || '').to_sym == :vllm }
|
|
90
|
+
end
|
|
91
|
+
|
|
92
|
+
def lanes_from_instance(instance_entry)
|
|
93
|
+
adapter = instance_entry[:adapter]
|
|
94
|
+
return [] unless adapter.respond_to?(:discover_offerings)
|
|
95
|
+
|
|
96
|
+
Array(adapter.discover_offerings(live: true)).flat_map do |offering|
|
|
97
|
+
raw = offering_to_hash(offering)
|
|
98
|
+
lane = build_lane(raw, instance_entry)
|
|
99
|
+
fleet = maybe_fleet_lane(lane)
|
|
100
|
+
fleet ? [lane, fleet] : [lane]
|
|
42
101
|
end
|
|
43
|
-
|
|
44
|
-
|
|
102
|
+
end
|
|
103
|
+
|
|
104
|
+
# ModelOffering objects do not implement `[]`; normalize to a Hash so the
|
|
105
|
+
# rest of the writer stays Hash-shaped. Hash inputs pass through untouched.
|
|
106
|
+
def offering_to_hash(offering)
|
|
107
|
+
return offering if offering.is_a?(Hash)
|
|
108
|
+
|
|
109
|
+
hash = offering.to_h
|
|
110
|
+
hash[:type] ||= hash[:usage_type]
|
|
111
|
+
hash[:enabled] = offering.respond_to?(:enabled?) ? offering.enabled? : true
|
|
112
|
+
hash
|
|
113
|
+
end
|
|
114
|
+
|
|
115
|
+
def build_lane(offering, instance_entry) # rubocop:disable Metrics/AbcSize,Metrics/CyclomaticComplexity
|
|
116
|
+
tier = offering[:tier] || :direct
|
|
117
|
+
type = offering_type(offering[:type])
|
|
118
|
+
instance_id = offering[:instance_id] ||
|
|
119
|
+
instance_entry[:instance] ||
|
|
120
|
+
instance_entry[:instance_id] ||
|
|
121
|
+
instance_entry[:id]
|
|
122
|
+
provider_family = offering[:provider_family] || :vllm
|
|
123
|
+
model = offering[:model]
|
|
124
|
+
lane_id = Legion::Extensions::Llm::Inventory::ScopedRefresher.compose_id(
|
|
125
|
+
tier: tier, provider_family: provider_family, instance_id: instance_id, type: type, model: model
|
|
126
|
+
)
|
|
127
|
+
{ id: lane_id, tier: tier, provider_family: provider_family, instance_id: instance_id,
|
|
128
|
+
model: model, canonical_model_alias: offering[:canonical_model_alias], type: type,
|
|
129
|
+
capabilities: normalize_caps(offering[:capabilities]),
|
|
130
|
+
limits: offering[:limits] || {}, enabled: offering.fetch(:enabled, true), cost: offering[:cost] || {} }
|
|
131
|
+
end
|
|
132
|
+
|
|
133
|
+
def maybe_fleet_lane(lane)
|
|
134
|
+
return unless lane[:type] == :inference && vllm_cfg&.dig(:fleet, :dispatch, :enabled)
|
|
135
|
+
|
|
136
|
+
fleet_id = Legion::Extensions::Llm::Inventory::ScopedRefresher.compose_id(
|
|
137
|
+
tier: :fleet, provider_family: lane[:provider_family],
|
|
138
|
+
instance_id: lane[:instance_id], type: lane[:type], model: lane[:model]
|
|
139
|
+
)
|
|
140
|
+
lane.merge(id: fleet_id, tier: :fleet)
|
|
141
|
+
end
|
|
142
|
+
|
|
143
|
+
def normalize_caps(caps)
|
|
144
|
+
# Inventory::Capabilities lives in lex-llm; the previous fallback (`return []
|
|
145
|
+
# unless defined?(...)`) silently swallowed every capability the operator
|
|
146
|
+
# declared via enable_thinking/enable_tools when the constant wasn't loaded.
|
|
147
|
+
# Always normalize through the shared vocabulary so aliases collapse.
|
|
148
|
+
if defined?(Legion::Extensions::Llm::Inventory::Capabilities)
|
|
149
|
+
Legion::Extensions::Llm::Inventory::Capabilities.normalize(caps)
|
|
150
|
+
else
|
|
151
|
+
Array(caps).compact.map(&:to_sym).uniq
|
|
45
152
|
end
|
|
46
|
-
rescue StandardError => e
|
|
47
|
-
handle_exception(e, level: :warn, handled: true, operation: 'vllm.actor.discovery_refresh')
|
|
48
153
|
end
|
|
49
154
|
end
|
|
50
155
|
end
|
|
@@ -33,9 +33,9 @@ module Legion
|
|
|
33
33
|
|
|
34
34
|
def chat?(_model) = true
|
|
35
35
|
def streaming?(_model) = true
|
|
36
|
-
def vision?(_model) =
|
|
37
|
-
def functions?(_model) =
|
|
38
|
-
def embeddings?(_model) =
|
|
36
|
+
def vision?(_model) = false
|
|
37
|
+
def functions?(_model) = false
|
|
38
|
+
def embeddings?(_model) = false
|
|
39
39
|
|
|
40
40
|
def critical_capabilities_for(model)
|
|
41
41
|
[
|
|
@@ -78,7 +78,7 @@ module Legion
|
|
|
78
78
|
|
|
79
79
|
def health(live: false)
|
|
80
80
|
log.info { "checking health live=#{live} at #{api_base}#{health_url}" }
|
|
81
|
-
|
|
81
|
+
super
|
|
82
82
|
end
|
|
83
83
|
|
|
84
84
|
def readiness(live: false)
|
|
@@ -88,27 +88,20 @@ module Legion
|
|
|
88
88
|
end
|
|
89
89
|
end
|
|
90
90
|
|
|
91
|
-
def list_models
|
|
91
|
+
def list_models(live: false, **filters)
|
|
92
92
|
log.info { "discovering models from #{api_base}#{models_url}" }
|
|
93
93
|
super.tap do |models|
|
|
94
94
|
log.info { "discovered #{models.size} model(s) from vLLM" }
|
|
95
|
-
self.class.registry_publisher.publish_models_async(models, readiness: readiness(live: false))
|
|
96
95
|
end
|
|
97
96
|
end
|
|
98
97
|
|
|
99
|
-
def discover_offerings(live: false, **)
|
|
100
|
-
|
|
101
|
-
@cached_models = list_models
|
|
102
|
-
else
|
|
103
|
-
Array(@cached_models)
|
|
104
|
-
end
|
|
105
|
-
offerings = models.filter_map do |model_info|
|
|
106
|
-
next unless model_allowed?(model_info.id)
|
|
98
|
+
def discover_offerings(live: false, **filters)
|
|
99
|
+
return filter_cached_offerings(Array(@cached_offerings), filters) unless live
|
|
107
100
|
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
101
|
+
provider_health = health(live:)
|
|
102
|
+
@cached_offerings = discover_live_offerings(filters, provider_health, live:)
|
|
103
|
+
log_discover_complete(@cached_offerings)
|
|
104
|
+
@cached_offerings
|
|
112
105
|
rescue StandardError => e
|
|
113
106
|
handle_exception(e, level: :warn, handled: true, operation: 'vllm.discover_offerings')
|
|
114
107
|
[]
|
|
@@ -161,13 +154,49 @@ module Legion
|
|
|
161
154
|
|
|
162
155
|
private
|
|
163
156
|
|
|
164
|
-
def
|
|
157
|
+
def discovery_registry_readiness(provider_health, live:)
|
|
158
|
+
{
|
|
159
|
+
provider: slug.to_sym,
|
|
160
|
+
configured: configured?,
|
|
161
|
+
ready: provider_health[:ready] == true,
|
|
162
|
+
live: live,
|
|
163
|
+
health: provider_health
|
|
164
|
+
}
|
|
165
|
+
end
|
|
166
|
+
|
|
167
|
+
def discover_live_offerings(filters, provider_health, live:)
|
|
168
|
+
readiness = discovery_registry_readiness(provider_health, live:)
|
|
169
|
+
Array(list_models(live:, **filters)).filter_map do |model|
|
|
170
|
+
self.class.registry_publisher.publish_models_async([model], readiness:)
|
|
171
|
+
next unless model_matches_filters?(model, filters)
|
|
172
|
+
next unless model_allowed?(model.id)
|
|
173
|
+
|
|
174
|
+
log_model_discovered(model)
|
|
175
|
+
offering_from_model(model, health: provider_health)
|
|
176
|
+
end
|
|
177
|
+
end
|
|
178
|
+
|
|
179
|
+
def log_model_discovered(model)
|
|
180
|
+
log.debug(
|
|
181
|
+
"[#{slug}] instance=#{provider_instance_id} action=model_discovered " \
|
|
182
|
+
"model=#{model.id} family=#{model.family}"
|
|
183
|
+
)
|
|
184
|
+
end
|
|
185
|
+
|
|
186
|
+
def log_discover_complete(offerings)
|
|
187
|
+
log.info(
|
|
188
|
+
"[#{slug}] instance=#{provider_instance_id} action=discover_complete " \
|
|
189
|
+
"model_count=#{Array(offerings).size}"
|
|
190
|
+
)
|
|
191
|
+
end
|
|
192
|
+
|
|
193
|
+
def offering_from_model(model_info, health: {})
|
|
165
194
|
ctx = model_info.context_length
|
|
166
195
|
if ctx
|
|
167
196
|
begin
|
|
168
197
|
cache_set(model_detail_cache_key(model_info.id), { context_window: ctx }, ttl: 86_400)
|
|
169
198
|
rescue StandardError => e
|
|
170
|
-
handle_exception(e, level: :
|
|
199
|
+
handle_exception(e, level: :warn, handled: true, operation: 'vllm.cache_model_detail')
|
|
171
200
|
end
|
|
172
201
|
end
|
|
173
202
|
|
|
@@ -181,11 +210,12 @@ module Legion
|
|
|
181
210
|
model_config: model_capability_config(model_info.id)
|
|
182
211
|
)
|
|
183
212
|
|
|
184
|
-
build_offering(model_info, policy, ctx)
|
|
213
|
+
build_offering(model_info, policy, ctx, health)
|
|
185
214
|
end
|
|
186
215
|
|
|
187
|
-
def build_offering(model_info, policy, ctx) # rubocop:disable Metrics/AbcSize
|
|
216
|
+
def build_offering(model_info, policy, ctx, health) # rubocop:disable Metrics/AbcSize
|
|
188
217
|
max_out = model_info.respond_to?(:max_output_tokens) ? model_info.max_output_tokens : nil
|
|
218
|
+
usage_type = policy[:capabilities].include?(:embedding) ? :embedding : :inference
|
|
189
219
|
|
|
190
220
|
Legion::Extensions::Llm::Routing::ModelOffering.new(
|
|
191
221
|
provider_family: :vllm,
|
|
@@ -195,10 +225,11 @@ module Legion
|
|
|
195
225
|
model: model_info.id,
|
|
196
226
|
canonical_model_alias: model_info.respond_to?(:name) ? model_info.name : nil,
|
|
197
227
|
model_family: model_info.respond_to?(:family) ? model_info.family : nil,
|
|
198
|
-
usage_type:
|
|
228
|
+
usage_type: usage_type,
|
|
199
229
|
capabilities: policy[:capabilities],
|
|
200
230
|
capability_sources: policy[:sources],
|
|
201
231
|
limits: { context_window: ctx, max_output_tokens: max_out }.compact,
|
|
232
|
+
health: health,
|
|
202
233
|
metadata: offering_metadata_for(model_info).merge(capability_sources: policy[:sources])
|
|
203
234
|
)
|
|
204
235
|
end
|
|
@@ -212,51 +243,7 @@ module Legion
|
|
|
212
243
|
end
|
|
213
244
|
|
|
214
245
|
def provider_envelope_capabilities
|
|
215
|
-
{ streaming: true }
|
|
216
|
-
end
|
|
217
|
-
|
|
218
|
-
def provider_capability_config
|
|
219
|
-
return {} unless defined?(Legion::Extensions::Llm::CredentialSources)
|
|
220
|
-
|
|
221
|
-
conf = Legion::Extensions::Llm::CredentialSources.setting(:extensions, :llm, :vllm)
|
|
222
|
-
conf.is_a?(Hash) ? conf.to_h.except(:instances, 'instances') : {}
|
|
223
|
-
rescue StandardError => e
|
|
224
|
-
handle_exception(e, level: :debug, handled: true, operation: 'vllm.provider_capability_config')
|
|
225
|
-
{}
|
|
226
|
-
end
|
|
227
|
-
|
|
228
|
-
def instance_capability_config
|
|
229
|
-
cfg = config
|
|
230
|
-
result = {}
|
|
231
|
-
%i[capabilities enable_thinking enable_tools enable_streaming enable_vision enable_embeddings
|
|
232
|
-
thinking_flag tools_flag streaming_flag vision_flag embedding_flag embeddings_flag
|
|
233
|
-
tool_flag images_flag image_flag].each do |key|
|
|
234
|
-
next unless cfg.respond_to?(key)
|
|
235
|
-
|
|
236
|
-
val = cfg.send(key)
|
|
237
|
-
result[key] = val unless val.nil?
|
|
238
|
-
rescue StandardError
|
|
239
|
-
next
|
|
240
|
-
end
|
|
241
|
-
result
|
|
242
|
-
end
|
|
243
|
-
|
|
244
|
-
def model_capability_config(model_id)
|
|
245
|
-
models_conf = resolve_models_config
|
|
246
|
-
return {} unless models_conf.respond_to?(:to_h)
|
|
247
|
-
|
|
248
|
-
hash = models_conf.to_h
|
|
249
|
-
hash[model_id.to_s] || hash[model_id.to_sym] || {}
|
|
250
|
-
rescue StandardError => e
|
|
251
|
-
handle_exception(e, level: :debug, handled: true, operation: 'vllm.model_capability_config')
|
|
252
|
-
{}
|
|
253
|
-
end
|
|
254
|
-
|
|
255
|
-
def resolve_models_config
|
|
256
|
-
return config.models if config.respond_to?(:models)
|
|
257
|
-
return config[:models] if config.respond_to?(:[])
|
|
258
|
-
|
|
259
|
-
nil
|
|
246
|
+
{ completion: true, streaming: true }
|
|
260
247
|
end
|
|
261
248
|
|
|
262
249
|
def offering_metadata_for(model_info)
|
|
@@ -453,7 +440,7 @@ module Legion
|
|
|
453
440
|
def vllm_thinking_setting
|
|
454
441
|
instance_thinking_enabled? || global_thinking_enabled?
|
|
455
442
|
rescue StandardError => e
|
|
456
|
-
handle_exception(e, level: :
|
|
443
|
+
handle_exception(e, level: :warn, handled: true, operation: 'vllm.thinking_setting')
|
|
457
444
|
false
|
|
458
445
|
end
|
|
459
446
|
|
|
@@ -37,7 +37,7 @@ module Legion
|
|
|
37
37
|
|
|
38
38
|
payload[key] || payload[key.to_s]
|
|
39
39
|
rescue StandardError => e
|
|
40
|
-
handle_exception(e, level: :
|
|
40
|
+
handle_exception(e, level: :warn, handled: true, operation: 'vllm.fleet_worker.payload_field',
|
|
41
41
|
field: key)
|
|
42
42
|
nil
|
|
43
43
|
end
|
|
@@ -70,6 +70,7 @@ module Legion
|
|
|
70
70
|
payload[:tools] = format_tools(request.tools) unless request.tools.to_h.empty?
|
|
71
71
|
payload[:tool_choice] = format_tool_choice(request.tool_choice) if request.tool_choice
|
|
72
72
|
payload.merge!(map_params_to_wire(request.params)) if request.params
|
|
73
|
+
payload[:stream_options] = { include_usage: true } if request.stream && stream_token_usage?
|
|
73
74
|
apply_thinking_config(payload, request)
|
|
74
75
|
if formatted_response_format?(request.params)
|
|
75
76
|
payload[:response_format] =
|
|
@@ -208,6 +209,18 @@ module Legion
|
|
|
208
209
|
|
|
209
210
|
attr_reader :config
|
|
210
211
|
|
|
212
|
+
# OpenAI `stream_options.include_usage` asks the server to emit a final
|
|
213
|
+
# usage-only chunk (choices:[]) so streaming responses carry token counts.
|
|
214
|
+
# vLLM supports it (capability streaming_token_usage); defaults on, but a
|
|
215
|
+
# non-conforming OpenAI-compatible backend that rejects the field can opt
|
|
216
|
+
# out per-instance via config[:stream_token_usage] = false.
|
|
217
|
+
def stream_token_usage?
|
|
218
|
+
override = config.respond_to?(:[]) ? config[:stream_token_usage] : nil
|
|
219
|
+
return override != false unless override.nil?
|
|
220
|
+
|
|
221
|
+
capabilities[:streaming_token_usage] == true
|
|
222
|
+
end
|
|
223
|
+
|
|
211
224
|
# ── Message formatting ──
|
|
212
225
|
|
|
213
226
|
def format_messages(request)
|
|
@@ -103,7 +103,7 @@ module Legion
|
|
|
103
103
|
host = URI.parse(url.to_s).host.to_s.downcase
|
|
104
104
|
%w[localhost 127.0.0.1 ::1].include?(host) ? :local : :direct
|
|
105
105
|
rescue URI::InvalidURIError => e
|
|
106
|
-
handle_exception(e, level: :
|
|
106
|
+
handle_exception(e, level: :warn, handled: true, operation: 'vllm.infer_tier_from_endpoint')
|
|
107
107
|
:direct
|
|
108
108
|
end
|
|
109
109
|
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: lex-llm-vllm
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.3.
|
|
4
|
+
version: 0.3.11
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- LegionIO
|
|
@@ -71,14 +71,14 @@ dependencies:
|
|
|
71
71
|
requirements:
|
|
72
72
|
- - ">="
|
|
73
73
|
- !ruby/object:Gem::Version
|
|
74
|
-
version: 0.
|
|
74
|
+
version: 0.6.0
|
|
75
75
|
type: :runtime
|
|
76
76
|
prerelease: false
|
|
77
77
|
version_requirements: !ruby/object:Gem::Requirement
|
|
78
78
|
requirements:
|
|
79
79
|
- - ">="
|
|
80
80
|
- !ruby/object:Gem::Version
|
|
81
|
-
version: 0.
|
|
81
|
+
version: 0.6.0
|
|
82
82
|
description: vLLM provider integration for the LegionIO LLM routing framework.
|
|
83
83
|
email:
|
|
84
84
|
- matthewdiverson@gmail.com
|