lex-llm-vllm 0.3.0 → 0.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 172c35debe332979f48575e43bd59c04828449a41a195f3d899bc15afa18bdb2
4
- data.tar.gz: c423c24ff7a5e4b33f1b6e562b50c196d2870b347bbcad61b38cd228d54ee318
3
+ metadata.gz: 20aa357958da5d294b132bdb5d3bea065d0192b42051deae734a3e1b8af592e5
4
+ data.tar.gz: 203a81b2aa087bc2cdabbe26cd71ea88e1bd7b1aaa9e44a95a5b43c35ab2c344
5
5
  SHA512:
6
- metadata.gz: dbf9166b8302c7dc786562b7e2e17381b4cf33b570825570cc153dc438bfbef4991e53b0e86811821f08b2a4a00e3b6522bac903764a6bfe3e90f04be4d556ea
7
- data.tar.gz: 5ee6cda495f98e9f68c4b3ea79b1a0fa28ab833113a00c3aa9a43e9a67e94c4aa79995faf8d8745e063d7c480d77dde246940d8d6b9c3570d678c39be19b496f
6
+ metadata.gz: 4ab5021f00c1f6652147297c4f9fc56a2f501eaa1468c0f098aff78d83426706069e4cca7184c96647135ff951dc4965d871352144826d4cf6c8961e60616862
7
+ data.tar.gz: 1e52f17e28c52ddae6317c3fd6ea6fd58b54a71a46d77c80f00544fb35333bf0340c1498b2f9db6419c811765c9926e0bd20228c7bc44badae04dd68a9fd178c
data/CHANGELOG.md CHANGED
@@ -1,5 +1,25 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.3.5 - 2026-06-16
4
+
5
+ - Extract `vllm_api_key` from `credentials: { api_key: ... }` in instance settings so Bearer auth works with the standard settings layout.
6
+ - Fix `Dalli::RingError` crash in `offering_from_model` when cache server is unavailable; cache write is now best-effort.
7
+
8
+ ## 0.3.3 - 2026-06-16
9
+
10
+ - Dependency updates (concurrent-ruby 1.3.7, faraday 2.14.3, rubocop 1.88.0) and code quality improvements.
11
+
12
+ ## 0.3.2 - 2026-06-15
13
+
14
+ - **CapabilityPolicy integration** — Optional capabilities default false; use `CapabilityPolicy.resolve` for offerings. Static all-true predicates no longer used for routing truth. Settings overrides at provider/instance/model level supported.
15
+
16
+ ## 0.3.1 - 2026-06-13
17
+
18
+ - **Gemfile cleanup** — Remove local path overrides; dependencies resolve from gemspec via rubygems.
19
+ - **Bug fix** — Restore vLLM streaming; private `ThinkingExtractor` call was killing every text delta.
20
+ - **Canonical tool normalization** — Use canonical normalization for tool parameter schemas.
21
+ - 155 examples, 0 failures; 17 files, 0 rubocop offenses.
22
+
3
23
  ## 0.3.0 - 2026-06-10
4
24
 
5
25
  - Add canonical provider translator (`Translator`) implementing `render_request`,
data/Gemfile CHANGED
@@ -2,12 +2,6 @@
2
2
 
3
3
  source 'https://rubygems.org'
4
4
 
5
- group :test do
6
- transport_path = ENV.fetch('LEGION_TRANSPORT_PATH', File.expand_path('../../legion-transport', __dir__))
7
- gem 'legion-transport', path: transport_path if File.directory?(transport_path)
8
- # lex-llm >= 0.5.0 carries canonical types + conformance kit (released on rubygems.org)
9
- end
10
-
11
5
  gemspec
12
6
 
13
7
  group :development do
@@ -37,6 +37,12 @@ module Legion
37
37
  return unless defined?(Legion::LLM::Discovery)
38
38
 
39
39
  Legion::LLM::Discovery.refresh_discovered_models!(provider: :vllm)
40
+ if defined?(Legion::LLM::Router) && Legion::LLM::Router.respond_to?(:populate_auto_rules)
41
+ Legion::LLM::Router.populate_auto_rules(Legion::LLM::Discovery.discovered_instances)
42
+ end
43
+ if defined?(Legion::LLM::Inventory) && Legion::LLM::Inventory.respond_to?(:invalidate_offerings_cache!)
44
+ Legion::LLM::Inventory.invalidate_offerings_cache!
45
+ end
40
46
  rescue StandardError => e
41
47
  handle_exception(e, level: :warn, handled: true, operation: 'vllm.actor.discovery_refresh')
42
48
  end
@@ -163,7 +163,29 @@ module Legion
163
163
 
164
164
  def offering_from_model(model_info)
165
165
  ctx = model_info.context_length
166
- cache_set(model_detail_cache_key(model_info.id), { context_window: ctx }, ttl: 86_400) if ctx
166
+ if ctx
167
+ begin
168
+ cache_set(model_detail_cache_key(model_info.id), { context_window: ctx }, ttl: 86_400)
169
+ rescue StandardError => e
170
+ handle_exception(e, level: :debug, handled: true, operation: 'vllm.cache_model_detail')
171
+ end
172
+ end
173
+
174
+ policy = Legion::Extensions::Llm::CapabilityPolicy.resolve(
175
+ real: extract_real_capabilities(model_info),
176
+ provider_catalog: {},
177
+ probe: {},
178
+ provider_envelope: provider_envelope_capabilities,
179
+ provider_config: provider_capability_config,
180
+ instance_config: instance_capability_config,
181
+ model_config: model_capability_config(model_info.id)
182
+ )
183
+
184
+ build_offering(model_info, policy, ctx)
185
+ end
186
+
187
+ def build_offering(model_info, policy, ctx) # rubocop:disable Metrics/AbcSize
188
+ max_out = model_info.respond_to?(:max_output_tokens) ? model_info.max_output_tokens : nil
167
189
 
168
190
  Legion::Extensions::Llm::Routing::ModelOffering.new(
169
191
  provider_family: :vllm,
@@ -171,13 +193,82 @@ module Legion
171
193
  transport: offering_transport,
172
194
  tier: offering_tier,
173
195
  model: model_info.id,
196
+ canonical_model_alias: model_info.respond_to?(:name) ? model_info.name : nil,
197
+ model_family: model_info.respond_to?(:family) ? model_info.family : nil,
174
198
  usage_type: model_info.embedding? ? :embedding : :inference,
175
- capabilities: model_info.capabilities.map(&:to_s),
176
- limits: { context_window: ctx }.compact,
177
- metadata: { context_length: ctx }
199
+ capabilities: policy[:capabilities],
200
+ capability_sources: policy[:sources],
201
+ limits: { context_window: ctx, max_output_tokens: max_out }.compact,
202
+ metadata: offering_metadata_for(model_info).merge(capability_sources: policy[:sources])
178
203
  )
179
204
  end
180
205
 
206
+ def extract_real_capabilities(model_info)
207
+ return {} unless model_info.respond_to?(:metadata)
208
+
209
+ meta = model_info.metadata
210
+ meta_caps = meta.is_a?(Hash) ? meta[:capabilities] : nil
211
+ meta_caps.is_a?(Hash) ? meta_caps : {}
212
+ end
213
+
214
+ def provider_envelope_capabilities
215
+ { streaming: true }
216
+ end
217
+
218
+ def provider_capability_config
219
+ return {} unless defined?(Legion::Extensions::Llm::CredentialSources)
220
+
221
+ conf = Legion::Extensions::Llm::CredentialSources.setting(:extensions, :llm, :vllm)
222
+ conf.is_a?(Hash) ? conf.to_h.except(:instances, 'instances') : {}
223
+ rescue StandardError => e
224
+ handle_exception(e, level: :debug, handled: true, operation: 'vllm.provider_capability_config')
225
+ {}
226
+ end
227
+
228
+ def instance_capability_config
229
+ cfg = config
230
+ result = {}
231
+ %i[capabilities enable_thinking enable_tools enable_streaming enable_vision enable_embeddings
232
+ thinking_flag tools_flag streaming_flag vision_flag embedding_flag embeddings_flag
233
+ tool_flag images_flag image_flag].each do |key|
234
+ next unless cfg.respond_to?(key)
235
+
236
+ val = cfg.send(key)
237
+ result[key] = val unless val.nil?
238
+ rescue StandardError
239
+ next
240
+ end
241
+ result
242
+ end
243
+
244
+ def model_capability_config(model_id)
245
+ models_conf = resolve_models_config
246
+ return {} unless models_conf.respond_to?(:to_h)
247
+
248
+ hash = models_conf.to_h
249
+ hash[model_id.to_s] || hash[model_id.to_sym] || {}
250
+ rescue StandardError => e
251
+ handle_exception(e, level: :debug, handled: true, operation: 'vllm.model_capability_config')
252
+ {}
253
+ end
254
+
255
+ def resolve_models_config
256
+ return config.models if config.respond_to?(:models)
257
+ return config[:models] if config.respond_to?(:[])
258
+
259
+ nil
260
+ end
261
+
262
+ def offering_metadata_for(model_info)
263
+ {
264
+ raw_model: model_info.id,
265
+ parameter_count: model_info.respond_to?(:parameter_count) ? model_info.parameter_count : nil,
266
+ parameter_size: model_info.respond_to?(:parameter_size) ? model_info.parameter_size : nil,
267
+ quantization: model_info.respond_to?(:quantization) ? model_info.quantization : nil,
268
+ size_bytes: model_info.respond_to?(:size_bytes) ? model_info.size_bytes : nil
269
+ }.compact
270
+ end
271
+
181
272
  # ── Canonical bridge: legacy provider API → Canonical::Request ──
182
273
 
183
274
  # rubocop:disable Metrics/ParameterLists, Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity -- bridge method can be complex
@@ -281,7 +372,7 @@ module Legion
281
372
  role: :assistant,
282
373
  content: content,
283
374
  model_id: raw_data['model'],
284
- tool_calls: nil,
375
+ tool_calls: legacy_chunk_tool_calls(canonical),
285
376
  thinking: thinking,
286
377
  input_tokens: usage.respond_to?(:input_tokens) ? usage.input_tokens : nil,
287
378
  output_tokens: usage.respond_to?(:output_tokens) ? usage.output_tokens : nil,
@@ -289,6 +380,24 @@ module Legion
289
380
  )
290
381
  end
291
382
 
383
+ # Map a canonical tool_call_delta onto the legacy chunk tool_calls hash.
384
+ # Fragment semantics matter: an entry with a non-nil id starts a new tool
385
+ # call in the StreamAccumulator; a nil id appends the raw arguments
386
+ # fragment to the most recently started call.
387
+ def legacy_chunk_tool_calls(canonical)
388
+ return nil unless canonical.type == :tool_call_delta && canonical.tool_call
389
+
390
+ tc = canonical.tool_call
391
+ key = (tc.id || tc.name || :fragment).to_s.to_sym
392
+ {
393
+ key => Legion::Extensions::Llm::ToolCall.new(
394
+ id: tc.id,
395
+ name: tc.name,
396
+ arguments: tc.arguments
397
+ )
398
+ }
399
+ end
400
+
292
401
  # ── Tool choice helpers ──
293
402
 
294
403
  def format_tool_choice_from_prefs(tool_prefs)
@@ -164,24 +164,8 @@ module Legion
164
164
  )
165
165
  end
166
166
 
167
- tool_calls = delta['tool_calls']
168
- unless Array(tool_calls).empty?
169
- first_call = tool_calls.first
170
- function = first_call.fetch('function', {})
171
-
172
- tc = Canonical::ToolCall.build(
173
- id: (first_call['id'] || function['name'] || 'synthesized').to_s,
174
- name: function['name'].to_s,
175
- arguments: parse_tool_arguments(function['arguments']),
176
- source: :client
177
- )
178
-
179
- return Canonical::Chunk.tool_call_delta(
180
- tool_call: tc,
181
- request_id: request_id,
182
- block_index: first_call['index']
183
- )
184
- end
167
+ tool_calls = Array(delta['tool_calls'])
168
+ return build_tool_call_delta_chunk(tool_calls.first, request_id) unless tool_calls.empty?
185
169
 
186
170
  # Thinking delta from reasoning_content
187
171
  reasoning_content = delta['reasoning_content'] || delta['reasoning']
@@ -227,7 +211,8 @@ module Legion
227
211
  # ── Message formatting ──
228
212
 
229
213
  def format_messages(request)
230
- messages = format_request_messages(request.messages)
214
+ non_system = request.messages&.reject { |m| m.role.to_s == 'system' } || []
215
+ messages = format_request_messages(non_system)
231
216
 
232
217
  if request.system.to_s.strip.empty?
233
218
  messages
@@ -345,7 +330,8 @@ module Legion
345
330
  def format_message_tool_calls(tool_calls)
346
331
  return [] if tool_calls.empty?
347
332
 
348
- tool_calls.map { |tc| format_tool_call_for_history(tc) }
333
+ tc_array = tool_calls.is_a?(Hash) ? tool_calls.values : Array(tool_calls)
334
+ tc_array.map { |tc| format_tool_call_for_history(tc) }
349
335
  end
350
336
 
351
337
  def format_tool_call_for_history(tool_call_entry)
@@ -387,10 +373,9 @@ module Legion
387
373
 
388
374
  name = tool_hash[:name] || tool_hash['name']
389
375
  description = (tool_hash[:description] || tool_hash['description'] || '').to_s
390
- parameters = tool_hash[:parameters] || tool_hash[:input_schema] ||
391
- { type: 'object', properties: {} }
392
- parameters = parameters.to_h if parameters.respond_to?(:to_h) && !parameters.is_a?(Hash)
393
- parameters = { type: 'object', properties: {} } unless parameters.is_a?(Hash)
376
+ raw_params = tool_hash[:parameters] || tool_hash[:input_schema]
377
+ raw_params = raw_params.to_h if raw_params.respond_to?(:to_h) && !raw_params.is_a?(Hash)
378
+ parameters = Legion::Extensions::Llm::Canonical::ToolDefinition.normalize_parameters(raw_params)
394
379
 
395
380
  {
396
381
  type: 'function',
@@ -633,26 +618,48 @@ module Legion
633
618
  )
634
619
  end
635
620
 
621
+ # Build a tool_call_delta chunk preserving OpenAI streaming fragment
622
+ # semantics: the opening fragment carries id + name; continuation
623
+ # fragments carry id: nil and a raw partial-JSON arguments string.
624
+ # The StreamAccumulator keys off a nil id to append fragments to the
625
+ # current tool call, so the id must NOT be synthesized here.
626
+ def build_tool_call_delta_chunk(first_call, request_id)
627
+ function = first_call.fetch('function', {})
628
+
629
+ tc = Canonical::ToolCall.new(
630
+ id: first_call['id'], exchange_id: nil,
631
+ name: function['name'], arguments: function['arguments'].to_s,
632
+ source: :client, status: nil, duration_ms: nil, result: nil,
633
+ error: nil, started_at: nil, finished_at: nil, category: nil,
634
+ data_handling_classification: nil, policy_decision: nil
635
+ )
636
+
637
+ Canonical::Chunk.tool_call_delta(
638
+ tool_call: tc,
639
+ request_id: request_id,
640
+ block_index: first_call['index']
641
+ )
642
+ end
643
+
636
644
  def empty_delta?(delta)
637
645
  (delta['content'].nil? || delta['content'].to_s.empty?) &&
638
646
  (delta['tool_calls'].nil? || Array(delta['tool_calls']).empty?) &&
639
647
  (delta['reasoning_content'].nil? || delta['reasoning_content'].to_s.empty?)
640
648
  end
641
649
 
650
+ # Per-chunk think-tag extraction is structurally impossible while streaming:
651
+ # tags arrive split across SSE chunks, and ThinkingExtractor strips per-chunk
652
+ # whitespace, corrupting reassembled text. Emit the raw delta unmodified —
653
+ # the StreamAccumulator extracts think tags statefully across deltas.
654
+ # (Previously called ThinkingExtractor.extract_from_content, which is
655
+ # private_class_method in lex-llm >= 0.5.0 and raised NoMethodError on
656
+ # every streamed text delta, silently killing all vLLM streaming.)
642
657
  def parse_text_delta_with_thinking(content, request_id, data)
643
- extraction = Responses::ThinkingExtractor.extract_from_content(content)
644
- clean_text = extraction[0]
645
- thinking_text = extraction[1]
646
-
647
- if thinking_text && !thinking_text.empty?
648
- Canonical::Chunk.thinking_delta(delta: thinking_text, request_id: request_id)
649
- else
650
- Canonical::Chunk.text_delta(
651
- delta: clean_text || content,
652
- request_id: request_id,
653
- index: data['index']
654
- )
655
- end
658
+ Canonical::Chunk.text_delta(
659
+ delta: content,
660
+ request_id: request_id,
661
+ index: data['index']
662
+ )
656
663
  end
657
664
 
658
665
  # Parse a canonical-form chunk (from conformance kit fixtures).
@@ -4,7 +4,7 @@ module Legion
4
4
  module Extensions
5
5
  module Llm
6
6
  module Vllm
7
- VERSION = '0.3.0'
7
+ VERSION = '0.3.5'
8
8
  end
9
9
  end
10
10
  end
@@ -16,7 +16,7 @@ module Legion
16
16
  extend Legion::Extensions::Llm::AutoRegistration
17
17
 
18
18
  PROVIDER_FAMILY = :vllm
19
- DEFAULT_INSTANCE_TIER = { tier: :direct, capabilities: %i[completion streaming vision tools] }.freeze
19
+ DEFAULT_INSTANCE_TIER = { tier: :direct, capabilities: {}, provider_capabilities: { streaming: true } }.freeze
20
20
 
21
21
  def self.default_settings
22
22
  ::Legion::Extensions::Llm.provider_settings(
@@ -32,10 +32,7 @@ module Legion
32
32
  fleet: {
33
33
  enabled: false,
34
34
  respond_to_requests: false,
35
- capabilities: %i[chat stream_chat embed],
36
- lanes: [],
37
- concurrency: 1,
38
- queue_suffix: nil
35
+ capabilities: %i[chat stream_chat embed]
39
36
  }
40
37
  }
41
38
  )
@@ -74,10 +71,19 @@ module Legion
74
71
  def self.normalize_instance_config(config)
75
72
  normalized = config.to_h.transform_keys(&:to_sym)
76
73
  resolve_api_base_aliases(normalized)
74
+ resolve_credentials(normalized)
77
75
  normalized[:tier] ||= infer_tier_from_endpoint(normalized[:vllm_api_base])
78
76
  normalized
79
77
  end
80
78
 
79
+ def self.resolve_credentials(normalized)
80
+ creds = normalized.delete(:credentials)
81
+ return unless creds.is_a?(Hash)
82
+
83
+ creds = creds.transform_keys(&:to_sym)
84
+ normalized[:vllm_api_key] ||= creds[:api_key]
85
+ end
86
+
81
87
  def self.resolve_api_base_aliases(normalized)
82
88
  normalized[:vllm_api_base] ||= normalized.delete(:base_url)
83
89
  normalized[:vllm_api_base] ||= normalized.delete(:api_base)
@@ -93,12 +99,15 @@ module Legion
93
99
  return :direct if url.nil? || url.to_s.empty?
94
100
 
95
101
  require 'uri'
102
+ require_relative 'vllm/actors/discovery_refresh'
96
103
  host = URI.parse(url.to_s).host.to_s.downcase
97
104
  %w[localhost 127.0.0.1 ::1].include?(host) ? :local : :direct
98
105
  rescue URI::InvalidURIError => e
99
106
  handle_exception(e, level: :debug, handled: true, operation: 'vllm.infer_tier_from_endpoint')
100
107
  :direct
101
108
  end
109
+
110
+ Legion::Extensions::Llm::Configuration.register_provider_options(Provider.configuration_options)
102
111
  end
103
112
  end
104
113
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lex-llm-vllm
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.3.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - LegionIO