lex-llm-vllm 0.3.5 → 0.3.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 20aa357958da5d294b132bdb5d3bea065d0192b42051deae734a3e1b8af592e5
4
- data.tar.gz: 203a81b2aa087bc2cdabbe26cd71ea88e1bd7b1aaa9e44a95a5b43c35ab2c344
3
+ metadata.gz: 66c6939650ded871419a3833d9d0a89e5c31a104f3acccbb065af86af17709cb
4
+ data.tar.gz: e639e39a60ff4c03838100bd361048997fecfe38e92fdd9f1763809958a43f8e
5
5
  SHA512:
6
- metadata.gz: 4ab5021f00c1f6652147297c4f9fc56a2f501eaa1468c0f098aff78d83426706069e4cca7184c96647135ff951dc4965d871352144826d4cf6c8961e60616862
7
- data.tar.gz: 1e52f17e28c52ddae6317c3fd6ea6fd58b54a71a46d77c80f00544fb35333bf0340c1498b2f9db6419c811765c9926e0bd20228c7bc44badae04dd68a9fd178c
6
+ metadata.gz: cf346327446a5e3c7b8666c8ea04269d7fea32b7bbe89b9a0c11b7578fe3ca1a8f948cf4fc639265bfcfa6f0a65bca7007f24ce90146f6234f1589fcb6fe5c31
7
+ data.tar.gz: 1e324da7463bd1a63b14b5fde9b3f6030be0f8cdd933d802d350066beaf40fe8208998a298a848b00126bf5d1d68868b8384c2ed719fb863c825179aa8c92ced
data/CHANGELOG.md CHANGED
@@ -1,5 +1,42 @@
1
1
  # Changelog
2
2
 
3
+ ## [0.3.11] - 2026-06-20
4
+
5
+ ### Fixed
6
+ - Stub shared registry publishing through `RegistryPublisher#schedule` in specs so async availability-event coverage stays stable after the shared publisher moved off raw `Thread.new`.
7
+
8
+ ## [0.3.10] - 2026-06-20
9
+
10
+ ### Fixed
11
+ - Stop bulk-publishing vLLM model availability from `list_models`; discovery now emits one registry event per seen model from the shared `lex-llm` policy-filter path so blocked models stay observable without duplicate publishes.
12
+
13
+ ## [0.3.9] - 2026-06-20
14
+
15
+ ### Changed
16
+ - Slow the live discovery refresh cadence from 60 seconds to 300 seconds for vLLM instances; `extensions.llm.vllm.discovery_interval` still overrides the default.
17
+
18
+ ## [0.3.8] - 2026-06-20
19
+
20
+ ### Fixed
21
+ - Use the shared `lex-llm` capability override contract for provider, instance, and model settings, with canonical capability normalization for embedding/tool/thinking routing.
22
+
23
+ ## [0.3.7] - 2026-06-19
24
+
25
+ ### Changed
26
+ - Adopt `Legion::Extensions::Llm::Inventory::ScopedRefresher` mixin (lex-llm 0.6.0). Discovery
27
+ refresh actors now write directly to the live `Inventory` catalog via `Inventory.write_lane`.
28
+ - Pin `lex-llm >= 0.6.0` and `legion-llm >= 0.14.0` in gemspec.
29
+ - Standard `weight: 100` default added to provider instance settings schema.
30
+
31
+ ## 0.3.6 - 2026-06-18
32
+
33
+ - **Streaming token usage** — request `stream_options: { include_usage: true }` on streaming chat so
34
+ vLLM emits the final usage-only chunk. Streaming responses now carry input/output token counts;
35
+ previously every streamed response reported zero tokens, which blinded metering/cost. Overridable
36
+ per-instance via `config[:stream_token_usage] = false` for a non-conforming OpenAI-compatible
37
+ backend that rejects the field. The chunk parser already handled the trailing `choices: []` usage
38
+ chunk; the gap was only that the request never asked for it.
39
+
3
40
  ## 0.3.5 - 2026-06-16
4
41
 
5
42
  - Extract `vllm_api_key` from `credentials: { api_key: ... }` in instance settings so Bearer auth works with the standard settings layout.
data/lex-llm-vllm.gemspec CHANGED
@@ -27,5 +27,5 @@ Gem::Specification.new do |spec|
27
27
  spec.add_dependency 'legion-logging', '>= 1.3.2'
28
28
  spec.add_dependency 'legion-settings', '>= 1.3.14'
29
29
  spec.add_dependency 'legion-transport', '>= 1.4.14'
30
- spec.add_dependency 'lex-llm', '>= 0.5.0'
30
+ spec.add_dependency 'lex-llm', '>= 0.6.0'
31
31
  end
@@ -1,11 +1,19 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require 'digest'
4
+
3
5
  begin
4
6
  require 'legion/extensions/actors/every'
5
7
  rescue LoadError => e
6
8
  warn(e.message) if $VERBOSE
7
9
  end
8
10
 
11
+ begin
12
+ require 'legion/extensions/llm/inventory/scoped_refresher'
13
+ rescue LoadError => e
14
+ warn(e.message) if $VERBOSE
15
+ end
16
+
9
17
  return unless defined?(Legion::Extensions::Actors::Every)
10
18
 
11
19
  module Legion
@@ -17,7 +25,11 @@ module Legion
17
25
  class DiscoveryRefresh < Legion::Extensions::Actors::Every
18
26
  include Legion::Logging::Helper
19
27
 
20
- REFRESH_INTERVAL = 1800
28
+ if defined?(Legion::Extensions::Llm::Inventory::ScopedRefresher)
29
+ include Legion::Extensions::Llm::Inventory::ScopedRefresher
30
+ end
31
+
32
+ def self.every_seconds = 300
21
33
 
22
34
  def runner_class = self.class
23
35
  def runner_function = 'manual'
@@ -27,24 +39,117 @@ module Legion
27
39
  def generate_task? = false
28
40
 
29
41
  def time
30
- return REFRESH_INTERVAL unless defined?(Legion::Settings)
42
+ return self.class.every_seconds unless defined?(Legion::Settings)
43
+
44
+ Legion::Settings.dig(:extensions, :llm, :vllm, :discovery_interval) || self.class.every_seconds
45
+ end
31
46
 
32
- Legion::Settings.dig(:extensions, :llm, :vllm, :discovery_interval) || REFRESH_INTERVAL
47
+ def scope_key = { provider: :vllm }
48
+ def offering_type(raw_type) = %i[embed embedding].include?(raw_type) ? :embedding : :inference
49
+
50
+ def vllm_cfg
51
+ return unless defined?(Legion::Settings)
52
+
53
+ Legion::Settings.dig(:extensions, :llm,
54
+ :vllm)
55
+ end
56
+
57
+ def compute_lanes_for_scope(**)
58
+ return [] unless defined?(Legion::LLM::Call::Registry)
59
+
60
+ vllm_instances.flat_map { |entry| lanes_from_instance(entry) }
61
+ rescue StandardError => e
62
+ handle_exception(e, level: :warn, handled: true, operation: 'vllm.actor.compute_lanes_for_scope')
63
+ []
64
+ end
65
+
66
+ def credential_hash(**)
67
+ cfg = vllm_cfg
68
+ Digest::SHA256.hexdigest(cfg&.dig(:api_key).to_s + cfg&.dig(:instances).to_s)[0, 16]
69
+ rescue StandardError
70
+ 'unknown'
33
71
  end
34
72
 
35
73
  def manual
36
- log.debug('[vllm][discovery_refresh] refreshing model list')
37
- return unless defined?(Legion::LLM::Discovery)
74
+ run_scoped_tick
75
+ rescue StandardError => e
76
+ handle_exception(e, level: :warn, handled: true, operation: 'vllm.actor.discovery_refresh')
77
+ end
78
+
79
+ private
80
+
81
+ def run_scoped_tick
82
+ return unless defined?(Legion::Extensions::Llm::Inventory::ScopedRefresher)
83
+ return unless self.class.ancestors.include?(Legion::Extensions::Llm::Inventory::ScopedRefresher)
84
+
85
+ tick
86
+ end
38
87
 
39
- Legion::LLM::Discovery.refresh_discovered_models!(provider: :vllm)
40
- if defined?(Legion::LLM::Router) && Legion::LLM::Router.respond_to?(:populate_auto_rules)
41
- Legion::LLM::Router.populate_auto_rules(Legion::LLM::Discovery.discovered_instances)
88
+ def vllm_instances
89
+ Legion::LLM::Call::Registry.all_instances.select { |e| (e[:provider] || '').to_sym == :vllm }
90
+ end
91
+
92
+ def lanes_from_instance(instance_entry)
93
+ adapter = instance_entry[:adapter]
94
+ return [] unless adapter.respond_to?(:discover_offerings)
95
+
96
+ Array(adapter.discover_offerings(live: true)).flat_map do |offering|
97
+ raw = offering_to_hash(offering)
98
+ lane = build_lane(raw, instance_entry)
99
+ fleet = maybe_fleet_lane(lane)
100
+ fleet ? [lane, fleet] : [lane]
42
101
  end
43
- if defined?(Legion::LLM::Inventory) && Legion::LLM::Inventory.respond_to?(:invalidate_offerings_cache!)
44
- Legion::LLM::Inventory.invalidate_offerings_cache!
102
+ end
103
+
104
+ # ModelOffering objects do not implement `[]`; normalize to a Hash so the
105
+ # rest of the writer stays Hash-shaped. Hash inputs pass through untouched.
106
+ def offering_to_hash(offering)
107
+ return offering if offering.is_a?(Hash)
108
+
109
+ hash = offering.to_h
110
+ hash[:type] ||= hash[:usage_type]
111
+ hash[:enabled] = offering.respond_to?(:enabled?) ? offering.enabled? : true
112
+ hash
113
+ end
114
+
115
+ def build_lane(offering, instance_entry) # rubocop:disable Metrics/AbcSize,Metrics/CyclomaticComplexity
116
+ tier = offering[:tier] || :direct
117
+ type = offering_type(offering[:type])
118
+ instance_id = offering[:instance_id] ||
119
+ instance_entry[:instance] ||
120
+ instance_entry[:instance_id] ||
121
+ instance_entry[:id]
122
+ provider_family = offering[:provider_family] || :vllm
123
+ model = offering[:model]
124
+ lane_id = Legion::Extensions::Llm::Inventory::ScopedRefresher.compose_id(
125
+ tier: tier, provider_family: provider_family, instance_id: instance_id, type: type, model: model
126
+ )
127
+ { id: lane_id, tier: tier, provider_family: provider_family, instance_id: instance_id,
128
+ model: model, canonical_model_alias: offering[:canonical_model_alias], type: type,
129
+ capabilities: normalize_caps(offering[:capabilities]),
130
+ limits: offering[:limits] || {}, enabled: offering.fetch(:enabled, true), cost: offering[:cost] || {} }
131
+ end
132
+
133
+ def maybe_fleet_lane(lane)
134
+ return unless lane[:type] == :inference && vllm_cfg&.dig(:fleet, :dispatch, :enabled)
135
+
136
+ fleet_id = Legion::Extensions::Llm::Inventory::ScopedRefresher.compose_id(
137
+ tier: :fleet, provider_family: lane[:provider_family],
138
+ instance_id: lane[:instance_id], type: lane[:type], model: lane[:model]
139
+ )
140
+ lane.merge(id: fleet_id, tier: :fleet)
141
+ end
142
+
143
+ def normalize_caps(caps)
144
+ # Inventory::Capabilities lives in lex-llm; the previous fallback (`return []
145
+ # unless defined?(...)`) silently swallowed every capability the operator
146
+ # declared via enable_thinking/enable_tools when the constant wasn't loaded.
147
+ # Always normalize through the shared vocabulary so aliases collapse.
148
+ if defined?(Legion::Extensions::Llm::Inventory::Capabilities)
149
+ Legion::Extensions::Llm::Inventory::Capabilities.normalize(caps)
150
+ else
151
+ Array(caps).compact.map(&:to_sym).uniq
45
152
  end
46
- rescue StandardError => e
47
- handle_exception(e, level: :warn, handled: true, operation: 'vllm.actor.discovery_refresh')
48
153
  end
49
154
  end
50
155
  end
@@ -33,9 +33,9 @@ module Legion
33
33
 
34
34
  def chat?(_model) = true
35
35
  def streaming?(_model) = true
36
- def vision?(_model) = true
37
- def functions?(_model) = true
38
- def embeddings?(_model) = true
36
+ def vision?(_model) = false
37
+ def functions?(_model) = false
38
+ def embeddings?(_model) = false
39
39
 
40
40
  def critical_capabilities_for(model)
41
41
  [
@@ -78,7 +78,7 @@ module Legion
78
78
 
79
79
  def health(live: false)
80
80
  log.info { "checking health live=#{live} at #{api_base}#{health_url}" }
81
- connection.get(health_url).body
81
+ super
82
82
  end
83
83
 
84
84
  def readiness(live: false)
@@ -88,27 +88,20 @@ module Legion
88
88
  end
89
89
  end
90
90
 
91
- def list_models
91
+ def list_models(live: false, **filters)
92
92
  log.info { "discovering models from #{api_base}#{models_url}" }
93
93
  super.tap do |models|
94
94
  log.info { "discovered #{models.size} model(s) from vLLM" }
95
- self.class.registry_publisher.publish_models_async(models, readiness: readiness(live: false))
96
95
  end
97
96
  end
98
97
 
99
- def discover_offerings(live: false, **)
100
- models = if live
101
- @cached_models = list_models
102
- else
103
- Array(@cached_models)
104
- end
105
- offerings = models.filter_map do |model_info|
106
- next unless model_allowed?(model_info.id)
98
+ def discover_offerings(live: false, **filters)
99
+ return filter_cached_offerings(Array(@cached_offerings), filters) unless live
107
100
 
108
- offering_from_model(model_info)
109
- end
110
- log.debug { "built #{offerings.size} vLLM offering(s) live=#{live}" }
111
- offerings
101
+ provider_health = health(live:)
102
+ @cached_offerings = discover_live_offerings(filters, provider_health, live:)
103
+ log_discover_complete(@cached_offerings)
104
+ @cached_offerings
112
105
  rescue StandardError => e
113
106
  handle_exception(e, level: :warn, handled: true, operation: 'vllm.discover_offerings')
114
107
  []
@@ -161,13 +154,49 @@ module Legion
161
154
 
162
155
  private
163
156
 
164
- def offering_from_model(model_info)
157
+ def discovery_registry_readiness(provider_health, live:)
158
+ {
159
+ provider: slug.to_sym,
160
+ configured: configured?,
161
+ ready: provider_health[:ready] == true,
162
+ live: live,
163
+ health: provider_health
164
+ }
165
+ end
166
+
167
+ def discover_live_offerings(filters, provider_health, live:)
168
+ readiness = discovery_registry_readiness(provider_health, live:)
169
+ Array(list_models(live:, **filters)).filter_map do |model|
170
+ self.class.registry_publisher.publish_models_async([model], readiness:)
171
+ next unless model_matches_filters?(model, filters)
172
+ next unless model_allowed?(model.id)
173
+
174
+ log_model_discovered(model)
175
+ offering_from_model(model, health: provider_health)
176
+ end
177
+ end
178
+
179
+ def log_model_discovered(model)
180
+ log.debug(
181
+ "[#{slug}] instance=#{provider_instance_id} action=model_discovered " \
182
+ "model=#{model.id} family=#{model.family}"
183
+ )
184
+ end
185
+
186
+ def log_discover_complete(offerings)
187
+ log.info(
188
+ "[#{slug}] instance=#{provider_instance_id} action=discover_complete " \
189
+ "model_count=#{Array(offerings).size}"
190
+ )
191
+ end
192
+
193
+ def offering_from_model(model_info, health: {})
165
194
  ctx = model_info.context_length
166
195
  if ctx
167
196
  begin
168
197
  cache_set(model_detail_cache_key(model_info.id), { context_window: ctx }, ttl: 86_400)
169
198
  rescue StandardError => e
170
- handle_exception(e, level: :debug, handled: true, operation: 'vllm.cache_model_detail')
199
+ handle_exception(e, level: :warn, handled: true, operation: 'vllm.cache_model_detail')
171
200
  end
172
201
  end
173
202
 
@@ -181,11 +210,12 @@ module Legion
181
210
  model_config: model_capability_config(model_info.id)
182
211
  )
183
212
 
184
- build_offering(model_info, policy, ctx)
213
+ build_offering(model_info, policy, ctx, health)
185
214
  end
186
215
 
187
- def build_offering(model_info, policy, ctx) # rubocop:disable Metrics/AbcSize
216
+ def build_offering(model_info, policy, ctx, health) # rubocop:disable Metrics/AbcSize
188
217
  max_out = model_info.respond_to?(:max_output_tokens) ? model_info.max_output_tokens : nil
218
+ usage_type = policy[:capabilities].include?(:embedding) ? :embedding : :inference
189
219
 
190
220
  Legion::Extensions::Llm::Routing::ModelOffering.new(
191
221
  provider_family: :vllm,
@@ -195,10 +225,11 @@ module Legion
195
225
  model: model_info.id,
196
226
  canonical_model_alias: model_info.respond_to?(:name) ? model_info.name : nil,
197
227
  model_family: model_info.respond_to?(:family) ? model_info.family : nil,
198
- usage_type: model_info.embedding? ? :embedding : :inference,
228
+ usage_type: usage_type,
199
229
  capabilities: policy[:capabilities],
200
230
  capability_sources: policy[:sources],
201
231
  limits: { context_window: ctx, max_output_tokens: max_out }.compact,
232
+ health: health,
202
233
  metadata: offering_metadata_for(model_info).merge(capability_sources: policy[:sources])
203
234
  )
204
235
  end
@@ -212,51 +243,7 @@ module Legion
212
243
  end
213
244
 
214
245
  def provider_envelope_capabilities
215
- { streaming: true }
216
- end
217
-
218
- def provider_capability_config
219
- return {} unless defined?(Legion::Extensions::Llm::CredentialSources)
220
-
221
- conf = Legion::Extensions::Llm::CredentialSources.setting(:extensions, :llm, :vllm)
222
- conf.is_a?(Hash) ? conf.to_h.except(:instances, 'instances') : {}
223
- rescue StandardError => e
224
- handle_exception(e, level: :debug, handled: true, operation: 'vllm.provider_capability_config')
225
- {}
226
- end
227
-
228
- def instance_capability_config
229
- cfg = config
230
- result = {}
231
- %i[capabilities enable_thinking enable_tools enable_streaming enable_vision enable_embeddings
232
- thinking_flag tools_flag streaming_flag vision_flag embedding_flag embeddings_flag
233
- tool_flag images_flag image_flag].each do |key|
234
- next unless cfg.respond_to?(key)
235
-
236
- val = cfg.send(key)
237
- result[key] = val unless val.nil?
238
- rescue StandardError
239
- next
240
- end
241
- result
242
- end
243
-
244
- def model_capability_config(model_id)
245
- models_conf = resolve_models_config
246
- return {} unless models_conf.respond_to?(:to_h)
247
-
248
- hash = models_conf.to_h
249
- hash[model_id.to_s] || hash[model_id.to_sym] || {}
250
- rescue StandardError => e
251
- handle_exception(e, level: :debug, handled: true, operation: 'vllm.model_capability_config')
252
- {}
253
- end
254
-
255
- def resolve_models_config
256
- return config.models if config.respond_to?(:models)
257
- return config[:models] if config.respond_to?(:[])
258
-
259
- nil
246
+ { completion: true, streaming: true }
260
247
  end
261
248
 
262
249
  def offering_metadata_for(model_info)
@@ -453,7 +440,7 @@ module Legion
453
440
  def vllm_thinking_setting
454
441
  instance_thinking_enabled? || global_thinking_enabled?
455
442
  rescue StandardError => e
456
- handle_exception(e, level: :debug, handled: true, operation: 'vllm.thinking_setting')
443
+ handle_exception(e, level: :warn, handled: true, operation: 'vllm.thinking_setting')
457
444
  false
458
445
  end
459
446
 
@@ -37,7 +37,7 @@ module Legion
37
37
 
38
38
  payload[key] || payload[key.to_s]
39
39
  rescue StandardError => e
40
- handle_exception(e, level: :debug, handled: true, operation: 'vllm.fleet_worker.payload_field',
40
+ handle_exception(e, level: :warn, handled: true, operation: 'vllm.fleet_worker.payload_field',
41
41
  field: key)
42
42
  nil
43
43
  end
@@ -70,6 +70,7 @@ module Legion
70
70
  payload[:tools] = format_tools(request.tools) unless request.tools.to_h.empty?
71
71
  payload[:tool_choice] = format_tool_choice(request.tool_choice) if request.tool_choice
72
72
  payload.merge!(map_params_to_wire(request.params)) if request.params
73
+ payload[:stream_options] = { include_usage: true } if request.stream && stream_token_usage?
73
74
  apply_thinking_config(payload, request)
74
75
  if formatted_response_format?(request.params)
75
76
  payload[:response_format] =
@@ -208,6 +209,18 @@ module Legion
208
209
 
209
210
  attr_reader :config
210
211
 
212
+ # OpenAI `stream_options.include_usage` asks the server to emit a final
213
+ # usage-only chunk (choices:[]) so streaming responses carry token counts.
214
+ # vLLM supports it (capability streaming_token_usage); defaults on, but a
215
+ # non-conforming OpenAI-compatible backend that rejects the field can opt
216
+ # out per-instance via config[:stream_token_usage] = false.
217
+ def stream_token_usage?
218
+ override = config.respond_to?(:[]) ? config[:stream_token_usage] : nil
219
+ return override != false unless override.nil?
220
+
221
+ capabilities[:streaming_token_usage] == true
222
+ end
223
+
211
224
  # ── Message formatting ──
212
225
 
213
226
  def format_messages(request)
@@ -4,7 +4,7 @@ module Legion
4
4
  module Extensions
5
5
  module Llm
6
6
  module Vllm
7
- VERSION = '0.3.5'
7
+ VERSION = '0.3.11'
8
8
  end
9
9
  end
10
10
  end
@@ -103,7 +103,7 @@ module Legion
103
103
  host = URI.parse(url.to_s).host.to_s.downcase
104
104
  %w[localhost 127.0.0.1 ::1].include?(host) ? :local : :direct
105
105
  rescue URI::InvalidURIError => e
106
- handle_exception(e, level: :debug, handled: true, operation: 'vllm.infer_tier_from_endpoint')
106
+ handle_exception(e, level: :warn, handled: true, operation: 'vllm.infer_tier_from_endpoint')
107
107
  :direct
108
108
  end
109
109
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lex-llm-vllm
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.5
4
+ version: 0.3.11
5
5
  platform: ruby
6
6
  authors:
7
7
  - LegionIO
@@ -71,14 +71,14 @@ dependencies:
71
71
  requirements:
72
72
  - - ">="
73
73
  - !ruby/object:Gem::Version
74
- version: 0.5.0
74
+ version: 0.6.0
75
75
  type: :runtime
76
76
  prerelease: false
77
77
  version_requirements: !ruby/object:Gem::Requirement
78
78
  requirements:
79
79
  - - ">="
80
80
  - !ruby/object:Gem::Version
81
- version: 0.5.0
81
+ version: 0.6.0
82
82
  description: vLLM provider integration for the LegionIO LLM routing framework.
83
83
  email:
84
84
  - matthewdiverson@gmail.com