lex-llm-vllm 0.2.13 → 0.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6adc86b9d3286821c0efa59e4c820f3d99ee0acb5327f133a96010383d154505
4
- data.tar.gz: 73ecff7ccc309eb0469a79edc3970fdf3d766199a6df31edde4cbaf2016dc970
3
+ metadata.gz: 20aa357958da5d294b132bdb5d3bea065d0192b42051deae734a3e1b8af592e5
4
+ data.tar.gz: 203a81b2aa087bc2cdabbe26cd71ea88e1bd7b1aaa9e44a95a5b43c35ab2c344
5
5
  SHA512:
6
- metadata.gz: 4b2c498e26f09fa27edfa7abf08bf6fae656313cf6e2ce625772a9ce809ff1fcfae55a8746261b943b6b41111a46784fa97d9d5004f4be69f58761de05c6383d
7
- data.tar.gz: 94b867bd099f8e062f23aee550be30d4549bbc3d4f10d40b6e4e9b3dcabca7c8837e246198a6ea9a208035da3dcfaaf41be626709de9c8e642e3a4035f6681b0
6
+ metadata.gz: 4ab5021f00c1f6652147297c4f9fc56a2f501eaa1468c0f098aff78d83426706069e4cca7184c96647135ff951dc4965d871352144826d4cf6c8961e60616862
7
+ data.tar.gz: 1e52f17e28c52ddae6317c3fd6ea6fd58b54a71a46d77c80f00544fb35333bf0340c1498b2f9db6419c811765c9926e0bd20228c7bc44badae04dd68a9fd178c
data/.rubocop.yml CHANGED
@@ -21,3 +21,12 @@ RSpec/ExampleLength:
21
21
  Max: 8
22
22
  RSpec/MultipleExpectations:
23
23
  Enabled: false
24
+
25
+ RSpec/ExampleLength:
26
+ Max: 10
27
+
28
+
29
+ Layout/LineLength:
30
+ Exclude:
31
+ - spec/**/*
32
+
data/CHANGELOG.md CHANGED
@@ -1,5 +1,39 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.3.5 - 2026-06-16
4
+
5
+ - Extract `vllm_api_key` from `credentials: { api_key: ... }` in instance settings so Bearer auth works with the standard settings layout.
6
+ - Fix `Dalli::RingError` crash in `offering_from_model` when cache server is unavailable; cache write is now best-effort.
7
+
8
+ ## 0.3.3 - 2026-06-16
9
+
10
+ - Dependency updates (concurrent-ruby 1.3.7, faraday 2.14.3, rubocop 1.88.0) and code quality improvements.
11
+
12
+ ## 0.3.2 - 2026-06-15
13
+
14
+ - **CapabilityPolicy integration** — Optional capabilities default false; use `CapabilityPolicy.resolve` for offerings. Static all-true predicates no longer used for routing truth. Settings overrides at provider/instance/model level supported.
15
+
16
+ ## 0.3.1 - 2026-06-13
17
+
18
+ - **Gemfile cleanup** — Remove local path overrides; dependencies resolve from gemspec via rubygems.
19
+ - **Bug fix** — Restore vLLM streaming; private `ThinkingExtractor` call was killing every text delta.
20
+ - **Canonical tool normalization** — Use canonical normalization for tool parameter schemas.
21
+ - 155 examples, 0 failures; 17 files, 0 rubocop offenses.
22
+
23
+ ## 0.3.0 - 2026-06-10
24
+
25
+ - Add canonical provider translator (`Translator`) implementing `render_request`,
26
+ `parse_response`, `parse_chunk`, and `capabilities` per N×N routing design
27
+ - Wire provider `render_payload`, `parse_completion_response`, `build_chunk` to
28
+ delegate to translator with legacy Message/Chunk bridge for backward compat
29
+ - Declare vLLM quirks: `tool_calls_as_text`, `forced_tool_choice`, `thinking_tags`,
30
+ `streaming_token_usage`
31
+ - G18 parameter mapping: max_tokens, temperature, top_p, top_k, stop_sequences,
32
+ seed, frequency_penalty, presence_penalty, response_format
33
+ - Qwen-style </think> tag extraction and tool-call synthesis from content text
34
+ - Adopt conformance kit (`it_behaves_like 'a canonical provider translator'`)
35
+ - Bump lex-llm dependency floor to >= 0.5.0
36
+
3
37
  ## 0.2.13 - 2026-06-05
4
38
 
5
39
  - Fix missing documentation comment on `DiscoveryRefresh` actor (RuboCop Style/Documentation)
data/Gemfile CHANGED
@@ -2,13 +2,6 @@
2
2
 
3
3
  source 'https://rubygems.org'
4
4
 
5
- group :test do
6
- llm_base_path = ENV.fetch('LEX_LLM_PATH', File.expand_path('../lex-llm', __dir__))
7
- transport_path = ENV.fetch('LEGION_TRANSPORT_PATH', File.expand_path('../../legion-transport', __dir__))
8
- gem 'legion-transport', path: transport_path if File.directory?(transport_path)
9
- gem 'lex-llm', path: llm_base_path if File.directory?(llm_base_path)
10
- end
11
-
12
5
  gemspec
13
6
 
14
7
  group :development do
data/lex-llm-vllm.gemspec CHANGED
@@ -27,5 +27,5 @@ Gem::Specification.new do |spec|
27
27
  spec.add_dependency 'legion-logging', '>= 1.3.2'
28
28
  spec.add_dependency 'legion-settings', '>= 1.3.14'
29
29
  spec.add_dependency 'legion-transport', '>= 1.4.14'
30
- spec.add_dependency 'lex-llm', '>= 0.4.3'
30
+ spec.add_dependency 'lex-llm', '>= 0.5.0'
31
31
  end
@@ -37,6 +37,12 @@ module Legion
37
37
  return unless defined?(Legion::LLM::Discovery)
38
38
 
39
39
  Legion::LLM::Discovery.refresh_discovered_models!(provider: :vllm)
40
+ if defined?(Legion::LLM::Router) && Legion::LLM::Router.respond_to?(:populate_auto_rules)
41
+ Legion::LLM::Router.populate_auto_rules(Legion::LLM::Discovery.discovered_instances)
42
+ end
43
+ if defined?(Legion::LLM::Inventory) && Legion::LLM::Inventory.respond_to?(:invalidate_offerings_cache!)
44
+ Legion::LLM::Inventory.invalidate_offerings_cache!
45
+ end
40
46
  rescue StandardError => e
41
47
  handle_exception(e, level: :warn, handled: true, operation: 'vllm.actor.discovery_refresh')
42
48
  end
@@ -53,6 +53,11 @@ module Legion
53
53
  Vllm.default_settings
54
54
  end
55
55
 
56
+ # Canonical translator instance — renders requests, parses responses/chunks.
57
+ def translator
58
+ @translator ||= Translator.new(config: config)
59
+ end
60
+
56
61
  def api_base
57
62
  normalize_url(config.vllm_api_base || settings[:endpoint] || 'http://localhost:8000')
58
63
  end
@@ -158,7 +163,29 @@ module Legion
158
163
 
159
164
  def offering_from_model(model_info)
160
165
  ctx = model_info.context_length
161
- cache_set(model_detail_cache_key(model_info.id), { context_window: ctx }, ttl: 86_400) if ctx
166
+ if ctx
167
+ begin
168
+ cache_set(model_detail_cache_key(model_info.id), { context_window: ctx }, ttl: 86_400)
169
+ rescue StandardError => e
170
+ handle_exception(e, level: :debug, handled: true, operation: 'vllm.cache_model_detail')
171
+ end
172
+ end
173
+
174
+ policy = Legion::Extensions::Llm::CapabilityPolicy.resolve(
175
+ real: extract_real_capabilities(model_info),
176
+ provider_catalog: {},
177
+ probe: {},
178
+ provider_envelope: provider_envelope_capabilities,
179
+ provider_config: provider_capability_config,
180
+ instance_config: instance_capability_config,
181
+ model_config: model_capability_config(model_info.id)
182
+ )
183
+
184
+ build_offering(model_info, policy, ctx)
185
+ end
186
+
187
+ def build_offering(model_info, policy, ctx) # rubocop:disable Metrics/AbcSize
188
+ max_out = model_info.respond_to?(:max_output_tokens) ? model_info.max_output_tokens : nil
162
189
 
163
190
  Legion::Extensions::Llm::Routing::ModelOffering.new(
164
191
  provider_family: :vllm,
@@ -166,22 +193,253 @@ module Legion
166
193
  transport: offering_transport,
167
194
  tier: offering_tier,
168
195
  model: model_info.id,
196
+ canonical_model_alias: model_info.respond_to?(:name) ? model_info.name : nil,
197
+ model_family: model_info.respond_to?(:family) ? model_info.family : nil,
169
198
  usage_type: model_info.embedding? ? :embedding : :inference,
170
- capabilities: model_info.capabilities.map(&:to_s),
171
- limits: { context_window: ctx }.compact,
172
- metadata: { context_length: ctx }
199
+ capabilities: policy[:capabilities],
200
+ capability_sources: policy[:sources],
201
+ limits: { context_window: ctx, max_output_tokens: max_out }.compact,
202
+ metadata: offering_metadata_for(model_info).merge(capability_sources: policy[:sources])
203
+ )
204
+ end
205
+
206
+ def extract_real_capabilities(model_info)
207
+ return {} unless model_info.respond_to?(:metadata)
208
+
209
+ meta = model_info.metadata
210
+ meta_caps = meta.is_a?(Hash) ? meta[:capabilities] : nil
211
+ meta_caps.is_a?(Hash) ? meta_caps : {}
212
+ end
213
+
214
+ def provider_envelope_capabilities
215
+ { streaming: true }
216
+ end
217
+
218
+ def provider_capability_config
219
+ return {} unless defined?(Legion::Extensions::Llm::CredentialSources)
220
+
221
+ conf = Legion::Extensions::Llm::CredentialSources.setting(:extensions, :llm, :vllm)
222
+ conf.is_a?(Hash) ? conf.to_h.except(:instances, 'instances') : {}
223
+ rescue StandardError => e
224
+ handle_exception(e, level: :debug, handled: true, operation: 'vllm.provider_capability_config')
225
+ {}
226
+ end
227
+
228
+ def instance_capability_config
229
+ cfg = config
230
+ result = {}
231
+ %i[capabilities enable_thinking enable_tools enable_streaming enable_vision enable_embeddings
232
+ thinking_flag tools_flag streaming_flag vision_flag embedding_flag embeddings_flag
233
+ tool_flag images_flag image_flag].each do |key|
234
+ next unless cfg.respond_to?(key)
235
+
236
+ val = cfg.send(key)
237
+ result[key] = val unless val.nil?
238
+ rescue StandardError
239
+ next
240
+ end
241
+ result
242
+ end
243
+
244
+ def model_capability_config(model_id)
245
+ models_conf = resolve_models_config
246
+ return {} unless models_conf.respond_to?(:to_h)
247
+
248
+ hash = models_conf.to_h
249
+ hash[model_id.to_s] || hash[model_id.to_sym] || {}
250
+ rescue StandardError => e
251
+ handle_exception(e, level: :debug, handled: true, operation: 'vllm.model_capability_config')
252
+ {}
253
+ end
254
+
255
+ def resolve_models_config
256
+ return config.models if config.respond_to?(:models)
257
+ return config[:models] if config.respond_to?(:[])
258
+
259
+ nil
260
+ end
261
+
262
+ def offering_metadata_for(model_info)
263
+ {
264
+ raw_model: model_info.id,
265
+ parameter_count: model_info.respond_to?(:parameter_count) ? model_info.parameter_count : nil,
266
+ parameter_size: model_info.respond_to?(:parameter_size) ? model_info.parameter_size : nil,
267
+ quantization: model_info.respond_to?(:quantization) ? model_info.quantization : nil,
268
+ size_bytes: model_info.respond_to?(:size_bytes) ? model_info.size_bytes : nil
269
+ }.compact
270
+ end
271
+
272
+ # ── Canonical bridge: legacy provider API → Canonical::Request ──
273
+
274
+ # rubocop:disable Metrics/ParameterLists, Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity -- bridge method can be complex
275
+ def build_canonical_request(
276
+ messages:, tools:, temperature:, model:, stream:, schema:, thinking:, tool_prefs:
277
+ )
278
+ model_id = model.respond_to?(:id) ? model.id : model.to_s
279
+
280
+ canonical_messages = messages.filter_map do |msg|
281
+ Canonical::Message.from_hash(msg.to_h) if msg.respond_to?(:to_h)
282
+ end
283
+
284
+ canonical_tools = tools.to_h.transform_values do |tool|
285
+ if tool.is_a?(Canonical::ToolDefinition)
286
+ tool
287
+ else
288
+ Canonical::ToolDefinition.from_hash(tool.respond_to?(:to_h) ? tool.to_h : tool)
289
+ end
290
+ end
291
+
292
+ params_hash = { temperature: temperature }
293
+ params_hash[:response_format] = schema if schema
294
+ canonical_params = Canonical::Params.from_hash(params_hash)
295
+
296
+ canonical_thinking = if thinking.respond_to?(:enabled?) && thinking.enabled?
297
+ Canonical::Thinking::Config.new(
298
+ effort: thinking.respond_to?(:effort) ? thinking.effort : nil
299
+ )
300
+ elsif thinking.is_a?(Hash)
301
+ Canonical::Thinking::Config.new(
302
+ effort: thinking[:effort] || thinking['effort'],
303
+ budget: thinking[:budget] || thinking['budget']
304
+ )
305
+ end
306
+
307
+ # Tool choice from tool_prefs
308
+ tool_choice = format_tool_choice_from_prefs(tool_prefs)
309
+
310
+ Canonical::Request.build(
311
+ messages: canonical_messages,
312
+ system: extract_system_prompt(messages),
313
+ tools: canonical_tools,
314
+ tool_choice: tool_choice,
315
+ params: canonical_params,
316
+ thinking: canonical_thinking,
317
+ stream: stream,
318
+ metadata: { model: model_id }
173
319
  )
174
320
  end
321
+ # rubocop:enable Metrics/ParameterLists, Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
322
+
323
+ # ── Canonical bridge: Canonical→legacy Message/Chunk ──
324
+
325
+ # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity -- verbose bridge
326
+ def to_legacy_message(canonical, raw_body, _raw_response)
327
+ thinking = nil
328
+ if canonical.thinking
329
+ thinking = Thinking.build(
330
+ text: canonical.thinking.content,
331
+ signature: canonical.thinking.signature
332
+ )
333
+ end
334
+
335
+ tool_calls = {}
336
+ canonical.tool_calls.each do |tc|
337
+ key = (tc.name || tc.id).to_s.to_sym
338
+ tool_calls[key] = Legion::Extensions::Llm::ToolCall.new(
339
+ id: tc.id,
340
+ name: tc.name,
341
+ arguments: tc.arguments
342
+ )
343
+ end
344
+
345
+ usage = canonical.usage || {}
346
+
347
+ Legion::Extensions::Llm::Message.new(
348
+ role: :assistant,
349
+ content: canonical.text,
350
+ model_id: canonical.model,
351
+ tool_calls: tool_calls.empty? ? nil : tool_calls,
352
+ thinking: thinking,
353
+ input_tokens: usage.respond_to?(:input_tokens) ? usage.input_tokens : nil,
354
+ output_tokens: usage.respond_to?(:output_tokens) ? usage.output_tokens : nil,
355
+ reasoning_tokens: usage.respond_to?(:thinking_tokens) ? usage.thinking_tokens : nil,
356
+ raw: raw_body
357
+ )
358
+ end
359
+ # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
360
+
361
+ def to_legacy_chunk(canonical, raw_data)
362
+ usage = canonical&.usage || {}
363
+
364
+ content = canonical.delta
365
+ thinking = nil
366
+ if canonical.type == :thinking_delta
367
+ thinking = Thinking.build(text: canonical.delta)
368
+ content = nil
369
+ end
370
+
371
+ Legion::Extensions::Llm::Chunk.new(
372
+ role: :assistant,
373
+ content: content,
374
+ model_id: raw_data['model'],
375
+ tool_calls: legacy_chunk_tool_calls(canonical),
376
+ thinking: thinking,
377
+ input_tokens: usage.respond_to?(:input_tokens) ? usage.input_tokens : nil,
378
+ output_tokens: usage.respond_to?(:output_tokens) ? usage.output_tokens : nil,
379
+ raw: raw_data
380
+ )
381
+ end
382
+
383
+ # Map a canonical tool_call_delta onto the legacy chunk tool_calls hash.
384
+ # Fragment semantics matter: an entry with a non-nil id starts a new tool
385
+ # call in the StreamAccumulator; a nil id appends the raw arguments
386
+ # fragment to the most recently started call.
387
+ def legacy_chunk_tool_calls(canonical)
388
+ return nil unless canonical.type == :tool_call_delta && canonical.tool_call
389
+
390
+ tc = canonical.tool_call
391
+ key = (tc.id || tc.name || :fragment).to_s.to_sym
392
+ {
393
+ key => Legion::Extensions::Llm::ToolCall.new(
394
+ id: tc.id,
395
+ name: tc.name,
396
+ arguments: tc.arguments
397
+ )
398
+ }
399
+ end
400
+
401
+ # ── Tool choice helpers ──
402
+
403
+ def format_tool_choice_from_prefs(tool_prefs)
404
+ return nil unless tool_prefs
405
+
406
+ choice = tool_prefs[:choice] || tool_prefs['choice']
407
+ return nil unless choice
408
+ return choice.to_sym if %w[auto none required].include?(choice.to_s)
409
+
410
+ { name: choice.to_s }
411
+ end
412
+
413
+ # rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity -- multibranch guard chain for system parsing
414
+ def extract_system_prompt(messages)
415
+ return nil unless messages.is_a?(Array)
416
+ return nil if messages.empty?
417
+
418
+ first = messages.first
419
+ return nil unless first
420
+
421
+ role = first.respond_to?(:role) ? first.role.to_sym : (first[:role] || first['role'])
422
+ return nil unless [:system, 'system'].include?(role)
423
+
424
+ content = first.respond_to?(:content) ? first.content : (first[:content] || first['content'])
425
+ content.is_a?(String) ? content : nil
426
+ end
427
+ # rubocop:enable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
175
428
 
176
429
  def render_payload(messages, tools:, temperature:, model:, stream:, schema:, thinking:, tool_prefs:) # rubocop:disable Metrics/ParameterLists
177
- payload = super
178
- payload.delete(:reasoning_effort)
179
- payload[:chat_template_kwargs] = { enable_thinking: true } if thinking_enabled?(thinking)
430
+ # Build a canonical request from provider call parameters,
431
+ # then delegate to the translator for wire-format rendering.
432
+ canonical_req = build_canonical_request(
433
+ messages:, tools:, temperature:, model:, stream:,
434
+ schema:, thinking:, tool_prefs:
435
+ )
436
+ wire = translator.render_request(canonical_req)
437
+
180
438
  log.debug do
181
- "rendered vLLM payload model=#{model.respond_to?(:id) ? model.id : model} stream=#{stream} " \
182
- "tools=#{tools.respond_to?(:size) ? tools.size : 0} thinking=#{payload.key?(:chat_template_kwargs)}"
439
+ "vLLM provider rendered wire payload model=#{wire[:model]} stream=#{wire[:stream]} " \
440
+ "messages=#{(wire[:messages] || []).size} keys=#{wire.keys.join(', ')}"
183
441
  end
184
- payload
442
+ wire
185
443
  end
186
444
 
187
445
  def thinking_enabled?(thinking)
@@ -214,6 +472,24 @@ module Legion
214
472
  vllm[:enable_thinking] == true || vllm['enable_thinking'] == true
215
473
  end
216
474
 
475
+ # Override: delegate completion response parsing to the canonical translator.
476
+ def parse_completion_response(response)
477
+ body = response.body
478
+ canonical = translator.parse_response(body)
479
+
480
+ # Convert Canonical::Response back to the legacy Message/Chunk shape
481
+ # that the Provider base class expects (backward compat with existing callers).
482
+ to_legacy_message(canonical, body, response)
483
+ end
484
+
485
+ # Override: delegate SSE chunk parsing to the canonical translator.
486
+ def build_chunk(data)
487
+ canonical_chunk = translator.parse_chunk(data)
488
+ return nil if canonical_chunk.nil?
489
+
490
+ to_legacy_chunk(canonical_chunk, data)
491
+ end
492
+
217
493
  def parse_list_models_response(response, provider, capabilities)
218
494
  response.body.fetch('data', []).map do |model|
219
495
  critical_capabilities = critical_capabilities_for(capabilities, model)
@@ -0,0 +1,703 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'legion/extensions/llm/canonical'
4
+ require 'legion/extensions/llm/responses/thinking_extractor'
5
+ require 'legion/json'
6
+ require 'legion/logging'
7
+
8
+ module Legion
9
+ module Extensions
10
+ module Llm
11
+ module Vllm
12
+ # Canonical provider translator for vLLM (OpenAI-compatible wire format).
13
+ #
14
+ # Implements render_request, parse_response, parse_chunk, and capabilities.
15
+ # Extracted from existing format_openai_*/parse_* methods in OpenAICompatible mixin
16
+ # and vLLM-specific render_payload override in Provider.
17
+ #
18
+ # vLLM quirks (declared in capabilities):
19
+ # - tool_calls_as_text: true — some model configurations output tool calls
20
+ # as JSON text in the content field rather than structured tool_calls.
21
+ # - forced_tool_choice: true — vLLM's tool_choice handling is strict;
22
+ # named tool choices must be explicit function references.
23
+ # - thinking_tags: ['think', 'thinking'] — Qwen-style models emit reasoning
24
+ # in <think> or <thinking> tags within content text.
25
+ # rubocop:disable Metrics/ClassLength, Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity -- translator implementation
26
+ class Translator
27
+ include Legion::Logging::Helper
28
+
29
+ # vLLM-specific stop_reason mapping (per conformance fixture stop_reason_matrix).
30
+ VLLM_STOP_REASON_MAP = {
31
+ 'stop' => :end_turn,
32
+ 'tool_use' => :tool_use,
33
+ 'length' => :max_tokens
34
+ }.freeze
35
+ FALLBACK_STOP_REASON = :end_turn
36
+
37
+ # G18 parameter mapping: supported canonical params.
38
+ SUPPORTED_PARAMS = %i[
39
+ max_tokens temperature top_p top_k stop_sequences
40
+ seed frequency_penalty presence_penalty response_format
41
+ ].freeze
42
+
43
+ # vLLM wire keys for supported params (most are 1:1 with canonical names).
44
+ PARAM_WIRE_KEYS = {
45
+ max_tokens: :max_tokens,
46
+ temperature: :temperature,
47
+ top_p: :top_p,
48
+ top_k: :top_k,
49
+ stop_sequences: :stop,
50
+ seed: :seed,
51
+ frequency_penalty: :frequency_penalty,
52
+ presence_penalty: :presence_penalty,
53
+ response_format: :response_format
54
+ }.freeze
55
+
56
+ def initialize(config: nil)
57
+ @config = config
58
+ end
59
+
60
+ # Render a canonical request into an OpenAI-compatible wire payload for vLLM.
61
+ def render_request(request)
62
+ model = request.metadata&.dig(:model) || 'default'
63
+ messages = format_messages(request)
64
+ payload = {
65
+ model: model,
66
+ messages: messages,
67
+ stream: request.stream
68
+ }
69
+
70
+ payload[:tools] = format_tools(request.tools) unless request.tools.to_h.empty?
71
+ payload[:tool_choice] = format_tool_choice(request.tool_choice) if request.tool_choice
72
+ payload.merge!(map_params_to_wire(request.params)) if request.params
73
+ apply_thinking_config(payload, request)
74
+ if formatted_response_format?(request.params)
75
+ payload[:response_format] =
76
+ format_response_format(request.params)
77
+ end
78
+
79
+ log.debug do
80
+ "vLLM translator rendered request model=#{model} stream=#{request.stream} " \
81
+ "messages=#{messages.size} tools=#{request.tools&.size || 0} params=#{payload.keys.size}"
82
+ end
83
+
84
+ payload
85
+ end
86
+
87
+ # Parse a vLLM/OpenAI-compatible completion response into a Canonical::Response.
88
+ def parse_response(wire)
89
+ return canonical_error_response(wire) unless wire.is_a?(Hash)
90
+ # Canonical-form response (from conformance kit): already in canonical shape
91
+ return Canonical::Response.from_hash(wire) if canonical_response?(wire)
92
+
93
+ choice = Array(wire['choices']).first || {}
94
+ message = choice['message'] || {}
95
+ usage = wire['usage'] || {}
96
+ finish_reason = choice['finish_reason']
97
+ model = wire['model']
98
+
99
+ content = message['content'] || ''
100
+ thinking_meta = extract_thinking_metadata(message)
101
+ extraction = Responses::ThinkingExtractor.extract(content, metadata: thinking_meta)
102
+
103
+ text = extraction.content || ''
104
+ thinking = build_canonical_thinking(extraction)
105
+
106
+ tool_calls = parse_tool_calls(message['tool_calls'])
107
+
108
+ # vLLM quirk: tool_calls_as_text — synthesize from content if none found.
109
+ if tool_calls.empty?
110
+ synthesized = synthesize_tool_calls_from_content(extraction.content, message)
111
+ tool_calls.concat(synthesized) unless synthesized.empty?
112
+ end
113
+
114
+ stop_reason = map_stop_reason(finish_reason)
115
+
116
+ Canonical::Response.build(
117
+ text: text.to_s,
118
+ thinking: thinking,
119
+ tool_calls: tool_calls,
120
+ usage: Canonical::Usage.from_hash(usage),
121
+ stop_reason: stop_reason,
122
+ model: model,
123
+ metadata: wire_metadata(wire, message, thinking_meta)
124
+ )
125
+ rescue Legion::JSON::ParseError => e
126
+ handle_exception(e, level: :warn, handled: true, operation: 'vllm.translator.parse_response')
127
+ canonical_error_response(wire)
128
+ rescue StandardError => e
129
+ handle_exception(e, level: :error, handled: false, operation: 'vllm.translator.parse_response')
130
+ raise
131
+ end
132
+
133
+ # Parse a single SSE chunk into a Canonical::Chunk or nil.
134
+ def parse_chunk(raw)
135
+ return nil if raw.nil?
136
+ return nil if raw.is_a?(String) && (raw == '[DONE]' || raw.strip.empty?)
137
+
138
+ data = raw.is_a?(Hash) ? raw : parse_json_safely(raw)
139
+ return nil if data.nil?
140
+
141
+ # Handle canonical-form chunks (from conformance fixtures or other translators)
142
+ return handle_canonical_chunk(data) if data['type']
143
+
144
+ if data['error']
145
+ return Canonical::Chunk.error_chunk(
146
+ error: data['error'],
147
+ request_id: data['id']
148
+ )
149
+ end
150
+
151
+ choice = Array(data['choices']).first
152
+ return build_done_chunk(data) if choice.nil? && data['usage']
153
+ return nil unless choice
154
+
155
+ delta = choice['delta'] || {}
156
+ finish_reason = choice['finish_reason']
157
+ request_id = data['request_id'] || data['id']
158
+
159
+ if finish_reason && empty_delta?(delta)
160
+ return Canonical::Chunk.done(
161
+ request_id: request_id,
162
+ usage: Canonical::Usage.from_hash(data['usage']),
163
+ stop_reason: map_stop_reason(finish_reason)
164
+ )
165
+ end
166
+
167
+ tool_calls = Array(delta['tool_calls'])
168
+ return build_tool_call_delta_chunk(tool_calls.first, request_id) unless tool_calls.empty?
169
+
170
+ # Thinking delta from reasoning_content
171
+ reasoning_content = delta['reasoning_content'] || delta['reasoning']
172
+ unless reasoning_content.to_s.empty?
173
+ return Canonical::Chunk.thinking_delta(
174
+ delta: reasoning_content,
175
+ request_id: request_id,
176
+ block_index: delta.dig('content_block', 'index'),
177
+ item_id: delta['content_block_start']&.dig('id')
178
+ )
179
+ end
180
+
181
+ # Text delta — check for embedded think tags
182
+ content = delta['content']
183
+ return parse_text_delta_with_thinking(content, request_id, data) unless content.to_s.empty?
184
+
185
+ nil
186
+ rescue Legion::JSON::ParseError => e
187
+ handle_exception(e, level: :warn, handled: true, operation: 'vllm.translator.parse_chunk')
188
+ nil
189
+ rescue StandardError => e
190
+ handle_exception(e, level: :error, handled: false, operation: 'vllm.translator.parse_chunk')
191
+ raise
192
+ end
193
+
194
+ # Declared capabilities for the vLLM provider.
195
+ def capabilities
196
+ {
197
+ provider: 'vllm',
198
+ wire_format: 'openai_compatible',
199
+ tool_calls_as_text: true,
200
+ forced_tool_choice: true,
201
+ thinking_tags: %w[think thinking],
202
+ stop_reason_map: VLLM_STOP_REASON_MAP,
203
+ streaming_token_usage: true
204
+ }.freeze
205
+ end
206
+
207
+ private
208
+
209
+ attr_reader :config
210
+
211
+ # ── Message formatting ──
212
+
213
+ def format_messages(request)
214
+ non_system = request.messages&.reject { |m| m.role.to_s == 'system' } || []
215
+ messages = format_request_messages(non_system)
216
+
217
+ if request.system.to_s.strip.empty?
218
+ messages
219
+ else
220
+ [{ role: 'system', content: request.system.strip }] + messages
221
+ end
222
+ end
223
+
224
+ def format_request_messages(messages)
225
+ return [] if messages.nil? || messages.empty?
226
+
227
+ messages.map { |msg| format_message(msg) }
228
+ end
229
+
230
+ def format_message(msg)
231
+ role = msg.role.to_s
232
+ content = format_message_content(msg)
233
+ tool_calls = format_message_tool_calls(msg.tool_calls) if msg.tool_calls&.any?
234
+ tool_call_id = msg.tool_call_id
235
+ name = msg.name
236
+
237
+ {
238
+ role: role,
239
+ content: content,
240
+ tool_call_id: tool_call_id,
241
+ tool_calls: tool_calls,
242
+ name: name
243
+ }.compact.reject { |k, v| k == :name && (v.nil? || v.to_s.empty?) }
244
+ end
245
+
246
+ def format_message_content(msg)
247
+ content = msg.content
248
+ return content if content.is_a?(String) && !content.empty?
249
+
250
+ case content
251
+ when Array
252
+ format_content_blocks(content)
253
+ when Canonical::ContentBlock
254
+ format_content_blocks([content])
255
+ when Hash
256
+ format_content_blocks_from_hash(content)
257
+ else
258
+ content&.to_s
259
+ end
260
+ end
261
+
262
+ def format_content_blocks(blocks)
263
+ parts = blocks.map do |block|
264
+ if block.is_a?(Canonical::ContentBlock)
265
+ format_content_block(block)
266
+ elsif block.is_a?(Hash)
267
+ format_content_block_from_hash(block)
268
+ else
269
+ { type: 'text', text: block.to_s }
270
+ end
271
+ end
272
+ parts.empty? ? '' : parts
273
+ end
274
+
275
+ # rubocop:disable Lint/DuplicateBranch -- multiple block types intentionally normalize to text in OpenAI wire format
276
+ def format_content_block(block)
277
+ case block.type
278
+ when :text, :thinking, :tool_result
279
+ { type: 'text', text: block.text.to_s }
280
+ when :tool_use
281
+ { type: 'text', text: Legion::JSON.generate(block.input || {}) }
282
+ when :image
283
+ build_image_block(block)
284
+ else
285
+ { type: 'text', text: block.text.to_s }
286
+ end
287
+ end
288
+ # rubocop:enable Lint/DuplicateBranch
289
+
290
+ def format_content_blocks_from_hash(hash_input)
291
+ case hash_input
292
+ when Hash
293
+ [format_content_block_from_hash(hash_input)]
294
+ when Array
295
+ hash_input.map { |h| format_content_block_from_hash(h) }
296
+ else
297
+ []
298
+ end
299
+ end
300
+
301
+ # rubocop:disable Lint/DuplicateBranch -- multiple block types intentionally normalize to text in OpenAI wire format
302
+ def format_content_block_from_hash(block_hash)
303
+ h = block_hash.transform_keys(&:to_sym)
304
+ type = (h[:type] || :text).to_sym
305
+
306
+ case type
307
+ when :text, :thinking, :tool_result
308
+ { type: 'text', text: h[:text].to_s }
309
+ when :tool_use
310
+ { type: 'text', text: Legion::JSON.generate(h[:input] || {}) }
311
+ when :image, :image_url
312
+ { type: 'image_url', image_url: { url: h[:data] || h[:url] || '' } }
313
+ else
314
+ { type: 'text', text: h[:text].to_s }
315
+ end
316
+ end
317
+ # rubocop:enable Lint/DuplicateBranch
318
+
319
+ def build_image_block(block)
320
+ return {} unless block.data || block.source_type
321
+
322
+ url = if block.source_type == :base64 && block.media_type
323
+ "data:#{block.media_type};base64,#{block.data}"
324
+ else
325
+ block.data
326
+ end
327
+ { type: 'image_url', image_url: { url: url } }
328
+ end
329
+
330
+ def format_message_tool_calls(tool_calls)
331
+ return [] if tool_calls.empty?
332
+
333
+ tc_array = tool_calls.is_a?(Hash) ? tool_calls.values : Array(tool_calls)
334
+ tc_array.map { |tc| format_tool_call_for_history(tc) }
335
+ end
336
+
337
+ def format_tool_call_for_history(tool_call_entry)
338
+ tc_hash = case tool_call_entry
339
+ when Canonical::ToolCall
340
+ { name: tool_call_entry&.name&.to_s, id: tool_call_entry&.id&.to_s,
341
+ arguments: tool_call_entry&.arguments || {} }
342
+ when Hash
343
+ tool_call_entry.transform_keys(&:to_sym)
344
+ else
345
+ tool_call_entry
346
+ end
347
+
348
+ name = tc_hash[:name] || tc_hash['name']
349
+ id = tc_hash[:id] || tc_hash['id']
350
+ args = tc_hash[:arguments] || tc_hash['arguments'] || {}
351
+ args = args.is_a?(Hash) ? Legion::JSON.generate(args) : args.to_s
352
+
353
+ {
354
+ id: id.to_s,
355
+ type: 'function',
356
+ function: { name: name.to_s, arguments: args }
357
+ }
358
+ end
359
+
360
+ # ── Tool formatting ──
361
+
362
+ def format_tools(tools)
363
+ return [] if tools.to_h.empty?
364
+
365
+ tools.to_h.values.map do |tool|
366
+ tool_hash = if tool.is_a?(Canonical::ToolDefinition)
367
+ { name: tool.name, description: tool.description, parameters: tool.parameters }
368
+ elsif tool.is_a?(Hash)
369
+ tool.transform_keys(&:to_sym)
370
+ else
371
+ tool
372
+ end
373
+
374
+ name = tool_hash[:name] || tool_hash['name']
375
+ description = (tool_hash[:description] || tool_hash['description'] || '').to_s
376
+ raw_params = tool_hash[:parameters] || tool_hash[:input_schema]
377
+ raw_params = raw_params.to_h if raw_params.respond_to?(:to_h) && !raw_params.is_a?(Hash)
378
+ parameters = Legion::Extensions::Llm::Canonical::ToolDefinition.normalize_parameters(raw_params)
379
+
380
+ {
381
+ type: 'function',
382
+ function: {
383
+ name: name.to_s,
384
+ description: description,
385
+ parameters: parameters
386
+ }
387
+ }
388
+ end
389
+ end
390
+
391
+ def format_tool_choice(choice)
392
+ return nil unless choice
393
+
394
+ case choice
395
+ when :auto, 'auto'
396
+ 'auto'
397
+ when :none, 'none'
398
+ 'none'
399
+ when :required, 'required'
400
+ 'required'
401
+ when Hash
402
+ name = choice[:name] || choice['name']
403
+ { type: 'function', function: { name: name.to_s } }
404
+ when Symbol, String
405
+ { type: 'function', function: { name: choice.to_s } }
406
+ end
407
+ end
408
+
409
+ # ── Parameter mapping (G18) ──
410
+
411
# Maps canonical params onto wire keys.
#
# Returns {} unless `params` is a Canonical::Params. For every entry in
# SUPPORTED_PARAMS with a non-nil value, the value is stored under the key
# given by PARAM_WIRE_KEYS; stop sequences and response format get their
# dedicated formatting. `max_thinking_tokens` is not written to the result
# and is only reported in a debug log line.
def map_params_to_wire(params)
  return {} unless params.is_a?(Canonical::Params)

  wire = SUPPORTED_PARAMS.each_with_object({}) do |key, acc|
    raw = params.public_send(key)
    next if raw.nil?

    acc[PARAM_WIRE_KEYS[key]] =
      if key == :stop_sequences
        format_stop_sequences(raw)
      elsif key == :response_format
        format_response_format_value(raw)
      else
        raw
      end
  end

  dropped = []
  dropped << :max_thinking_tokens if params.max_thinking_tokens

  unless dropped.empty?
    log.debug do
      "vLLM translator dropping unsupported params: #{dropped.join(', ')} " \
        '(handled via vLLM-specific render paths)'
    end
  end

  wire
end
442
+
443
# Returns `sequences` unchanged when it is already an Array; any other
# value (including nil) is wrapped in a one-element Array.
def format_stop_sequences(sequences)
  return sequences if sequences.is_a?(Array)

  [sequences]
end
446
+
447
# Returns the formatted response format for `params`, or nil when
# `formatted_response_format?` rejects them.
def format_response_format(params)
  format_response_format_value(params.response_format) if formatted_response_format?(params)
end
452
+
453
# Truthy only when `params` is a Canonical::Params carrying a response
# format. Note the truthy value is the response format itself, not `true`.
def formatted_response_format?(params)
  return false unless params.is_a?(Canonical::Params)

  params.response_format
end
456
+
457
# Converts a response-format value into its wire form.
#
# Non-Hash values (Strings, nil, anything else) are returned untouched.
# Hash keys are symbolized before inspection, so string- and symbol-keyed
# input behave the same:
# - type 'json_schema' -> { type: 'json_schema', json_schema: <schema> },
#   where the schema comes from `schema` or, failing that, `json_schema`
# - type 'json_object' -> { type: 'json_object' }
# - any other type     -> the original value, unchanged
def format_response_format_value(value)
  return value unless value.is_a?(Hash)

  # After symbolizing, string-key lookups can never hit, so only symbol
  # keys are consulted.
  spec = value.transform_keys(&:to_sym)

  case spec[:type]&.to_s
  when 'json_schema'
    { type: 'json_schema', json_schema: spec[:schema] || spec[:json_schema] }
  when 'json_object'
    { type: 'json_object' }
  else
    value
  end
end
473
+
474
+ # ── Thinking configuration ──
475
+
476
# Switches thinking on in the payload when `enable_thinking?` says so, by
# setting `chat_template_kwargs` to `{ enable_thinking: true }`.
#
# NOTE(review): a positive `max_thinking_tokens` is only written to the
# debug log here; this method does not add it to the payload.
def apply_thinking_config(payload, request)
  return unless enable_thinking?(request)

  payload[:chat_template_kwargs] = { enable_thinking: true }

  thinking_budget = request.params&.max_thinking_tokens
  if thinking_budget&.positive?
    log.debug { "vLLM translator thinking max_thinking_tokens=#{thinking_budget} via chat template" }
  end
end
485
+
486
# Decides whether thinking should be enabled for `request`.
#
# - Hash `thinking`: enabled unless its `enabled` entry is exactly false.
#   Both `:enabled` and `'enabled'` are honoured, so string-keyed hashes
#   can switch thinking off.
# - Canonical::Thinking::Config: enabled when `enabled?` is true.
# - nil `thinking`: falls back to the provider `config`, read through an
#   `enable_thinking` accessor or `[:enable_thinking]`; only a literal
#   `true` enables it.
# Everything else disables thinking.
def enable_thinking?(request)
  thinking = request.thinking

  if thinking.is_a?(Hash)
    flag = thinking.key?(:enabled) ? thinking[:enabled] : thinking['enabled']
    # A missing/nil flag still counts as enabled; only explicit false opts out.
    return flag != false
  end

  return true if thinking.is_a?(Canonical::Thinking::Config) && thinking.enabled?
  return false unless thinking.nil? && config

  config_thinking = if config.respond_to?(:enable_thinking)
                      config.enable_thinking
                    elsif config.respond_to?(:[])
                      config[:enable_thinking]
                    end

  config_thinking == true
end
501
+
502
+ # ── Response parsing ──
503
+
504
# Builds a canonical response with stop_reason :error from a wire body.
#
# A non-Hash `wire` is treated as an empty body. The wire `error` entry is
# copied into metadata; when it is absent a generic parse_error is used.
def canonical_error_response(wire)
  payload = case wire
            when Hash then wire
            else {}
            end

  error_info = payload['error']
  error_info ||= { type: 'parse_error', message: 'Failed to parse response' }
  usage = Canonical::Usage.from_hash(payload['usage'] || {})

  Canonical::Response.build(
    text: '',
    tool_calls: [],
    usage: usage,
    stop_reason: :error,
    model: payload['model'],
    metadata: { error: error_info }
  )
end
517
+
518
# Collects the reasoning/thinking fields present on a wire message.
#
# Reads each known string key from `message` and returns a symbol-keyed
# Hash containing only the entries whose value is not nil.
def extract_thinking_metadata(message)
  fields = %w[
    reasoning_content reasoning thinking thinking_text
    thinking_signature reasoning_signature
  ]

  fields.each_with_object({}) do |field, meta|
    found = message[field]
    meta[field.to_sym] = found unless found.nil?
  end
end
528
+
529
# Wraps an extraction's thinking text and signature in a
# Canonical::Thinking. Returns nil when both are absent.
def build_canonical_thinking(extraction)
  thought = extraction.thinking
  sig = extraction.signature
  return nil unless thought || sig

  Canonical::Thinking.new(content: thought, signature: sig)
end
537
+
538
# Converts wire `tool_calls` into canonical tool calls.
#
# Returns [] unless given a non-empty Array. The id falls back to the
# function name and then to the call's `index` when no `id` is present.
# A call that raises while being converted is reported through
# `handle_exception` and left out of the result.
def parse_tool_calls(tool_calls)
  return [] if !tool_calls.is_a?(Array) || tool_calls.empty?

  tool_calls.each_with_object([]) do |raw_call, parsed|
    fn = raw_call.fetch('function', {})
    fn_name = fn['name']
    call_id = raw_call['id'] || fn_name || raw_call['index']
    arguments = parse_tool_arguments(fn['arguments'])

    built = Canonical::ToolCall.build(
      id: call_id.to_s,
      name: fn_name.to_s,
      arguments: arguments,
      source: :client
    )
    parsed << built if built
  rescue StandardError => e
    handle_exception(e, level: :warn, handled: true, operation: 'vllm.translator.parse_tool_call')
  end
end
558
+
559
# Normalizes a tool call's `arguments` field into a Hash.
#
# - nil or '' -> {}
# - Hash      -> returned as-is
# - otherwise -> decoded with Legion::JSON.load
# Unparsable JSON yields {}. JSON that decodes to something other than an
# object (e.g. `null`, an array, a bare string) also yields {}, matching
# how try_parse_tool_call_from_text treats non-Hash arguments.
def parse_tool_arguments(arguments)
  return {} if arguments.nil? || arguments == ''
  return arguments if arguments.is_a?(Hash)

  decoded = Legion::JSON.load(arguments)
  decoded.is_a?(Hash) ? decoded : {}
rescue Legion::JSON::ParseError
  {}
end
567
+
568
# vLLM quirk: synthesize tool calls from JSON found in the content text.
#
# First tries the whole content as a tool call. Failing that, it takes the
# first brace-delimited fragment (no nested braces) that mentions one of
# tool/function/name/arguments and tries that. Returns an Array holding
# zero or one tool call.
def synthesize_tool_calls_from_content(content, _message)
  return [] unless content.is_a?(String) && !content.empty?

  whole = try_parse_tool_call_from_text(content)
  return [whole] if whole

  fragment = content[/\{[^{}]*(?:tool|function|name|arguments)[^{}]*\}/m]
  return [] unless fragment

  embedded = try_parse_tool_call_from_text(fragment)
  return [] unless embedded

  [embedded]
end
581
+
582
# Tries to read `text` as a JSON-encoded tool call.
#
# The name comes from `:name` or `:function_name`; arguments come from the
# first truthy of `:arguments`, `:parameters`, `:input` (String arguments
# are decoded as JSON). Returns nil when the text is not a JSON object,
# when no name is present, or when any JSON decoding fails.
# Lookups use symbol keys — presumably Legion::JSON.load symbolizes keys;
# confirm against that helper.
def try_parse_tool_call_from_text(text)
  payload = Legion::JSON.load(text)
  return nil unless payload.is_a?(Hash)

  tool_name = payload[:name] || payload[:function_name]
  tool_args = payload.values_at(:arguments, :parameters, :input).find(&:itself) || {}
  tool_args = Legion::JSON.load(tool_args) if tool_args.is_a?(String)

  return nil if tool_name.nil? || tool_name.to_s.empty?

  tool_args = {} unless tool_args.is_a?(Hash)
  Canonical::ToolCall.build(name: tool_name.to_s, arguments: tool_args, source: :client)
rescue Legion::JSON::ParseError
  nil
end
600
+
601
# Gathers response metadata from the wire body and message.
#
# Includes `:reasoning_content` when the message has a truthy value for it
# and `:completion_tokens_details` when the wire usage Hash carries one.
# The third argument is unused.
def wire_metadata(wire, message, _thinking_meta)
  reasoning = message['reasoning_content']
  usage = wire['usage']
  details = usage['completion_tokens_details'] if usage.is_a?(Hash)

  {}.tap do |meta|
    meta[:reasoning_content] = reasoning if reasoning
    meta[:completion_tokens_details] = details if details
  end
end
610
+
611
+ # ── Chunk helpers ──
612
+
613
# Builds the terminal "done" chunk from a wire event.
#
# The request id is taken from `request_id`, falling back to `id`; usage
# is built from the event's `usage` entry. The stop reason is always nil.
def build_done_chunk(data)
  request_id = data['request_id'] || data['id']
  usage = Canonical::Usage.from_hash(data['usage'])

  Canonical::Chunk.done(request_id: request_id, usage: usage, stop_reason: nil)
end
620
+
621
# Builds a tool_call_delta chunk that keeps OpenAI streaming fragment
# semantics intact: the opening fragment carries id + name, while
# continuation fragments carry id: nil and a raw partial-JSON arguments
# string. The StreamAccumulator relies on a nil id to append fragments to
# the current tool call, so the id must NOT be synthesized here.
def build_tool_call_delta_chunk(first_call, request_id)
  fn = first_call.fetch('function', {})

  fragment = Canonical::ToolCall.new(
    id: first_call['id'],
    exchange_id: nil,
    name: fn['name'],
    arguments: fn['arguments'].to_s, # raw partial JSON; nil becomes ''
    source: :client,
    status: nil,
    duration_ms: nil,
    result: nil,
    error: nil,
    started_at: nil,
    finished_at: nil,
    category: nil,
    data_handling_classification: nil,
    policy_decision: nil
  )

  Canonical::Chunk.tool_call_delta(
    tool_call: fragment,
    request_id: request_id,
    block_index: first_call['index']
  )
end
643
+
644
# True when a streamed delta carries no content, no tool calls and no
# reasoning content (each may be missing, nil or empty).
def empty_delta?(delta)
  return false unless delta['content'].to_s.empty?
  return false unless Array(delta['tool_calls']).empty?

  delta['reasoning_content'].to_s.empty?
end
649
+
650
# Per-chunk think-tag extraction cannot work while streaming: tags arrive
# split across SSE chunks, and ThinkingExtractor strips per-chunk
# whitespace, which corrupts the reassembled text. The raw delta is
# therefore emitted unmodified and the StreamAccumulator extracts think
# tags statefully across deltas.
# (This previously called ThinkingExtractor.extract_from_content, which is
# a private_class_method in lex-llm >= 0.5.0 and raised NoMethodError on
# every streamed text delta, silently killing all vLLM streaming.)
def parse_text_delta_with_thinking(content, request_id, data)
  position = data['index']

  Canonical::Chunk.text_delta(delta: content, request_id: request_id, index: position)
end
664
+
665
+ # Parse a canonical-form chunk (from conformance kit fixtures).
666
+
667
# Detect canonical-form response (from conformance fixtures).
#
# A body counts as canonical when it has a `text` or `stop_reason` key,
# given either as a String or as a Symbol. Always returns true or false;
# input that does not answer `key?` (e.g. nil) is not canonical.
def canonical_response?(wire)
  return false unless wire.respond_to?(:key?)

  %w[text stop_reason].any? { |field| wire.key?(field) || wire.key?(field.to_sym) }
end
671
+
672
# Builds a canonical chunk from an already-canonical hash.
# Any StandardError is logged at debug level and nil is returned.
def handle_canonical_chunk(data)
  begin
    Canonical::Chunk.from_hash(data)
  rescue StandardError => e
    log.debug { "vLLM translator canonical chunk parse error: #{e.message}" }
    nil
  end
end
678
+
679
+ # ── Stop reason mapping ──
680
+
681
# Looks up the stringified raw stop reason in VLLM_STOP_REASON_MAP.
# nil, empty and unknown reasons all map to FALLBACK_STOP_REASON.
def map_stop_reason(raw)
  reason = raw.to_s
  return FALLBACK_STOP_REASON if reason.empty?

  VLLM_STOP_REASON_MAP.fetch(reason, FALLBACK_STOP_REASON)
end
686
+
687
+ # ── JSON helpers ──
688
+ # Never use bare ::JSON inside the Legion namespace.
689
+
690
# Decodes a JSON String with Legion::JSON.
# Returns nil for non-String input, and nil (after a debug log line) when
# the text cannot be parsed.
def parse_json_safely(raw)
  return nil unless raw.is_a?(String)

  begin
    Legion::JSON.load(raw)
  rescue Legion::JSON::ParseError => e
    log.debug { "vLLM translator chunk parse error: #{e.message}" }
    nil
  end
end
698
+ end
699
+ # rubocop:enable Metrics/ClassLength, Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
700
+ end
701
+ end
702
+ end
703
+ end
@@ -4,7 +4,7 @@ module Legion
4
4
  module Extensions
5
5
  module Llm
6
6
  module Vllm
7
- VERSION = '0.2.13'
7
+ VERSION = '0.3.5'
8
8
  end
9
9
  end
10
10
  end
@@ -1,6 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'legion/extensions/llm'
4
+ require 'legion/extensions/llm/vllm/translator'
4
5
  require 'legion/extensions/llm/vllm/provider'
5
6
  require 'legion/extensions/llm/vllm/version'
6
7
  require 'legion/logging'
@@ -15,7 +16,7 @@ module Legion
15
16
  extend Legion::Extensions::Llm::AutoRegistration
16
17
 
17
18
  PROVIDER_FAMILY = :vllm
18
- DEFAULT_INSTANCE_TIER = { tier: :direct, capabilities: %i[completion streaming vision tools] }.freeze
19
+ DEFAULT_INSTANCE_TIER = { tier: :direct, capabilities: {}, provider_capabilities: { streaming: true } }.freeze
19
20
 
20
21
  def self.default_settings
21
22
  ::Legion::Extensions::Llm.provider_settings(
@@ -31,10 +32,7 @@ module Legion
31
32
  fleet: {
32
33
  enabled: false,
33
34
  respond_to_requests: false,
34
- capabilities: %i[chat stream_chat embed],
35
- lanes: [],
36
- concurrency: 1,
37
- queue_suffix: nil
35
+ capabilities: %i[chat stream_chat embed]
38
36
  }
39
37
  }
40
38
  )
@@ -73,10 +71,19 @@ module Legion
73
71
# Returns a symbol-keyed copy of `config` with API base aliases and
# credentials resolved, and `:tier` inferred from the endpoint when it is
# not already set.
def self.normalize_instance_config(config)
  config.to_h.transform_keys(&:to_sym).tap do |settings|
    resolve_api_base_aliases(settings)
    resolve_credentials(settings)
    settings[:tier] ||= infer_tier_from_endpoint(settings[:vllm_api_base])
  end
end
79
78
 
79
# Moves `credentials[:api_key]` into `:vllm_api_key`.
#
# The `:credentials` entry is always removed from `normalized`. When it is
# a Hash, its `api_key` (String or Symbol key) fills `:vllm_api_key`
# unless that is already set. Mutates `normalized` in place.
def self.resolve_credentials(normalized)
  credentials = normalized.delete(:credentials)
  return unless credentials.is_a?(Hash)

  normalized[:vllm_api_key] ||= credentials.transform_keys(&:to_sym)[:api_key]
end
86
+
80
87
  def self.resolve_api_base_aliases(normalized)
81
88
  normalized[:vllm_api_base] ||= normalized.delete(:base_url)
82
89
  normalized[:vllm_api_base] ||= normalized.delete(:api_base)
@@ -92,12 +99,15 @@ module Legion
92
99
  return :direct if url.nil? || url.to_s.empty?
93
100
 
94
101
  require 'uri'
102
+ require_relative 'vllm/actors/discovery_refresh'
95
103
  host = URI.parse(url.to_s).host.to_s.downcase
96
104
  %w[localhost 127.0.0.1 ::1].include?(host) ? :local : :direct
97
105
  rescue URI::InvalidURIError => e
98
106
  handle_exception(e, level: :debug, handled: true, operation: 'vllm.infer_tier_from_endpoint')
99
107
  :direct
100
108
  end
109
+
110
+ Legion::Extensions::Llm::Configuration.register_provider_options(Provider.configuration_options)
101
111
  end
102
112
  end
103
113
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lex-llm-vllm
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.13
4
+ version: 0.3.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - LegionIO
@@ -71,14 +71,14 @@ dependencies:
71
71
  requirements:
72
72
  - - ">="
73
73
  - !ruby/object:Gem::Version
74
- version: 0.4.3
74
+ version: 0.5.0
75
75
  type: :runtime
76
76
  prerelease: false
77
77
  version_requirements: !ruby/object:Gem::Requirement
78
78
  requirements:
79
79
  - - ">="
80
80
  - !ruby/object:Gem::Version
81
- version: 0.4.3
81
+ version: 0.5.0
82
82
  description: vLLM provider integration for the LegionIO LLM routing framework.
83
83
  email:
84
84
  - matthewdiverson@gmail.com
@@ -101,6 +101,7 @@ files:
101
101
  - lib/legion/extensions/llm/vllm/actors/fleet_worker.rb
102
102
  - lib/legion/extensions/llm/vllm/provider.rb
103
103
  - lib/legion/extensions/llm/vllm/runners/fleet_worker.rb
104
+ - lib/legion/extensions/llm/vllm/translator.rb
104
105
  - lib/legion/extensions/llm/vllm/version.rb
105
106
  homepage: https://github.com/LegionIO/lex-llm-vllm
106
107
  licenses: