lex-llm-vllm 0.2.13 → 0.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +9 -0
- data/CHANGELOG.md +34 -0
- data/Gemfile +0 -7
- data/lex-llm-vllm.gemspec +1 -1
- data/lib/legion/extensions/llm/vllm/actors/discovery_refresh.rb +6 -0
- data/lib/legion/extensions/llm/vllm/provider.rb +286 -10
- data/lib/legion/extensions/llm/vllm/translator.rb +703 -0
- data/lib/legion/extensions/llm/vllm/version.rb +1 -1
- data/lib/legion/extensions/llm/vllm.rb +15 -5
- metadata +4 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 20aa357958da5d294b132bdb5d3bea065d0192b42051deae734a3e1b8af592e5
|
|
4
|
+
data.tar.gz: 203a81b2aa087bc2cdabbe26cd71ea88e1bd7b1aaa9e44a95a5b43c35ab2c344
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 4ab5021f00c1f6652147297c4f9fc56a2f501eaa1468c0f098aff78d83426706069e4cca7184c96647135ff951dc4965d871352144826d4cf6c8961e60616862
|
|
7
|
+
data.tar.gz: 1e52f17e28c52ddae6317c3fd6ea6fd58b54a71a46d77c80f00544fb35333bf0340c1498b2f9db6419c811765c9926e0bd20228c7bc44badae04dd68a9fd178c
|
data/.rubocop.yml
CHANGED
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,39 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## 0.3.5 - 2026-06-16
|
|
4
|
+
|
|
5
|
+
- Extract `vllm_api_key` from `credentials: { api_key: ... }` in instance settings so Bearer auth works with the standard settings layout.
|
|
6
|
+
- Fix `Dalli::RingError` crash in `offering_from_model` when cache server is unavailable; cache write is now best-effort.
|
|
7
|
+
|
|
8
|
+
## 0.3.3 - 2026-06-16
|
|
9
|
+
|
|
10
|
+
- Dependency updates (concurrent-ruby 1.3.7, faraday 2.14.3, rubocop 1.88.0) and code quality improvements.
|
|
11
|
+
|
|
12
|
+
## 0.3.2 - 2026-06-15
|
|
13
|
+
|
|
14
|
+
- **CapabilityPolicy integration** — Optional capabilities default false; use `CapabilityPolicy.resolve` for offerings. Static all-true predicates no longer used for routing truth. Settings overrides at provider/instance/model level supported.
|
|
15
|
+
|
|
16
|
+
## 0.3.1 - 2026-06-13
|
|
17
|
+
|
|
18
|
+
- **Gemfile cleanup** — Remove local path overrides; dependencies resolve from gemspec via rubygems.
|
|
19
|
+
- **Bug fix** — Restore vLLM streaming; private `ThinkingExtractor` call was killing every text delta.
|
|
20
|
+
- **Canonical tool normalization** — Use canonical normalization for tool parameter schemas.
|
|
21
|
+
- 155 examples, 0 failures; 17 files, 0 rubocop offenses.
|
|
22
|
+
|
|
23
|
+
## 0.3.0 - 2026-06-10
|
|
24
|
+
|
|
25
|
+
- Add canonical provider translator (`Translator`) implementing `render_request`,
|
|
26
|
+
`parse_response`, `parse_chunk`, and `capabilities` per N×N routing design
|
|
27
|
+
- Wire provider `render_payload`, `parse_completion_response`, `build_chunk` to
|
|
28
|
+
delegate to translator with legacy Message/Chunk bridge for backward compat
|
|
29
|
+
- Declare vLLM quirks: `tool_calls_as_text`, `forced_tool_choice`, `thinking_tags`,
|
|
30
|
+
`streaming_token_usage`
|
|
31
|
+
- G18 parameter mapping: max_tokens, temperature, top_p, top_k, stop_sequences,
|
|
32
|
+
seed, frequency_penalty, presence_penalty, response_format
|
|
33
|
+
- Qwen-style </think> tag extraction and tool-call synthesis from content text
|
|
34
|
+
- Adopt conformance kit (`it_behaves_like 'a canonical provider translator'`)
|
|
35
|
+
- Bump lex-llm dependency floor to >= 0.5.0
|
|
36
|
+
|
|
3
37
|
## 0.2.13 - 2026-06-05
|
|
4
38
|
|
|
5
39
|
- Fix missing documentation comment on `DiscoveryRefresh` actor (RuboCop Style/Documentation)
|
data/Gemfile
CHANGED
|
@@ -2,13 +2,6 @@
|
|
|
2
2
|
|
|
3
3
|
source 'https://rubygems.org'
|
|
4
4
|
|
|
5
|
-
group :test do
|
|
6
|
-
llm_base_path = ENV.fetch('LEX_LLM_PATH', File.expand_path('../lex-llm', __dir__))
|
|
7
|
-
transport_path = ENV.fetch('LEGION_TRANSPORT_PATH', File.expand_path('../../legion-transport', __dir__))
|
|
8
|
-
gem 'legion-transport', path: transport_path if File.directory?(transport_path)
|
|
9
|
-
gem 'lex-llm', path: llm_base_path if File.directory?(llm_base_path)
|
|
10
|
-
end
|
|
11
|
-
|
|
12
5
|
gemspec
|
|
13
6
|
|
|
14
7
|
group :development do
|
data/lex-llm-vllm.gemspec
CHANGED
|
@@ -27,5 +27,5 @@ Gem::Specification.new do |spec|
|
|
|
27
27
|
spec.add_dependency 'legion-logging', '>= 1.3.2'
|
|
28
28
|
spec.add_dependency 'legion-settings', '>= 1.3.14'
|
|
29
29
|
spec.add_dependency 'legion-transport', '>= 1.4.14'
|
|
30
|
-
spec.add_dependency 'lex-llm', '>= 0.
|
|
30
|
+
spec.add_dependency 'lex-llm', '>= 0.5.0'
|
|
31
31
|
end
|
|
@@ -37,6 +37,12 @@ module Legion
|
|
|
37
37
|
return unless defined?(Legion::LLM::Discovery)
|
|
38
38
|
|
|
39
39
|
Legion::LLM::Discovery.refresh_discovered_models!(provider: :vllm)
|
|
40
|
+
if defined?(Legion::LLM::Router) && Legion::LLM::Router.respond_to?(:populate_auto_rules)
|
|
41
|
+
Legion::LLM::Router.populate_auto_rules(Legion::LLM::Discovery.discovered_instances)
|
|
42
|
+
end
|
|
43
|
+
if defined?(Legion::LLM::Inventory) && Legion::LLM::Inventory.respond_to?(:invalidate_offerings_cache!)
|
|
44
|
+
Legion::LLM::Inventory.invalidate_offerings_cache!
|
|
45
|
+
end
|
|
40
46
|
rescue StandardError => e
|
|
41
47
|
handle_exception(e, level: :warn, handled: true, operation: 'vllm.actor.discovery_refresh')
|
|
42
48
|
end
|
|
@@ -53,6 +53,11 @@ module Legion
|
|
|
53
53
|
Vllm.default_settings
|
|
54
54
|
end
|
|
55
55
|
|
|
56
|
+
# Canonical translator instance — renders requests, parses responses/chunks.
|
|
57
|
+
def translator
|
|
58
|
+
@translator ||= Translator.new(config: config)
|
|
59
|
+
end
|
|
60
|
+
|
|
56
61
|
def api_base
|
|
57
62
|
normalize_url(config.vllm_api_base || settings[:endpoint] || 'http://localhost:8000')
|
|
58
63
|
end
|
|
@@ -158,7 +163,29 @@ module Legion
|
|
|
158
163
|
|
|
159
164
|
def offering_from_model(model_info)
|
|
160
165
|
ctx = model_info.context_length
|
|
161
|
-
|
|
166
|
+
if ctx
|
|
167
|
+
begin
|
|
168
|
+
cache_set(model_detail_cache_key(model_info.id), { context_window: ctx }, ttl: 86_400)
|
|
169
|
+
rescue StandardError => e
|
|
170
|
+
handle_exception(e, level: :debug, handled: true, operation: 'vllm.cache_model_detail')
|
|
171
|
+
end
|
|
172
|
+
end
|
|
173
|
+
|
|
174
|
+
policy = Legion::Extensions::Llm::CapabilityPolicy.resolve(
|
|
175
|
+
real: extract_real_capabilities(model_info),
|
|
176
|
+
provider_catalog: {},
|
|
177
|
+
probe: {},
|
|
178
|
+
provider_envelope: provider_envelope_capabilities,
|
|
179
|
+
provider_config: provider_capability_config,
|
|
180
|
+
instance_config: instance_capability_config,
|
|
181
|
+
model_config: model_capability_config(model_info.id)
|
|
182
|
+
)
|
|
183
|
+
|
|
184
|
+
build_offering(model_info, policy, ctx)
|
|
185
|
+
end
|
|
186
|
+
|
|
187
|
+
def build_offering(model_info, policy, ctx) # rubocop:disable Metrics/AbcSize
|
|
188
|
+
max_out = model_info.respond_to?(:max_output_tokens) ? model_info.max_output_tokens : nil
|
|
162
189
|
|
|
163
190
|
Legion::Extensions::Llm::Routing::ModelOffering.new(
|
|
164
191
|
provider_family: :vllm,
|
|
@@ -166,22 +193,253 @@ module Legion
|
|
|
166
193
|
transport: offering_transport,
|
|
167
194
|
tier: offering_tier,
|
|
168
195
|
model: model_info.id,
|
|
196
|
+
canonical_model_alias: model_info.respond_to?(:name) ? model_info.name : nil,
|
|
197
|
+
model_family: model_info.respond_to?(:family) ? model_info.family : nil,
|
|
169
198
|
usage_type: model_info.embedding? ? :embedding : :inference,
|
|
170
|
-
capabilities:
|
|
171
|
-
|
|
172
|
-
|
|
199
|
+
capabilities: policy[:capabilities],
|
|
200
|
+
capability_sources: policy[:sources],
|
|
201
|
+
limits: { context_window: ctx, max_output_tokens: max_out }.compact,
|
|
202
|
+
metadata: offering_metadata_for(model_info).merge(capability_sources: policy[:sources])
|
|
203
|
+
)
|
|
204
|
+
end
|
|
205
|
+
|
|
206
|
+
# Pull the capability flags a model reports about itself out of its metadata.
#
# @param model_info [Object] discovered model; consulted only when it responds to +metadata+
# @return [Hash] the metadata's capabilities entry, or an empty Hash when the
#   metadata is missing, is not a Hash, or holds no Hash under that key
def extract_real_capabilities(model_info)
  return {} unless model_info.respond_to?(:metadata)

  meta = model_info.metadata
  return {} unless meta.is_a?(Hash)

  # Accept both key spellings: other lookups in this file tolerate string and
  # symbol keys, and metadata decoded from JSON arrives string-keyed.
  [meta[:capabilities], meta['capabilities']].find { |caps| caps.is_a?(Hash) } || {}
end
|
|
213
|
+
|
|
214
|
+
def provider_envelope_capabilities
|
|
215
|
+
{ streaming: true }
|
|
216
|
+
end
|
|
217
|
+
|
|
218
|
+
def provider_capability_config
|
|
219
|
+
return {} unless defined?(Legion::Extensions::Llm::CredentialSources)
|
|
220
|
+
|
|
221
|
+
conf = Legion::Extensions::Llm::CredentialSources.setting(:extensions, :llm, :vllm)
|
|
222
|
+
conf.is_a?(Hash) ? conf.to_h.except(:instances, 'instances') : {}
|
|
223
|
+
rescue StandardError => e
|
|
224
|
+
handle_exception(e, level: :debug, handled: true, operation: 'vllm.provider_capability_config')
|
|
225
|
+
{}
|
|
226
|
+
end
|
|
227
|
+
|
|
228
|
+
# Collect capability-related overrides exposed by this instance's config object.
#
# Each known key is read only when the config responds to it, and nil values
# are left out. A failing accessor is reported through +handle_exception+ at
# debug level (matching the sibling *_capability_config methods) instead of
# being silently discarded, and the remaining keys are still collected.
#
# @return [Hash{Symbol=>Object}] non-nil override values keyed by setting name
def instance_capability_config
  cfg = config
  keys = %i[capabilities enable_thinking enable_tools enable_streaming enable_vision enable_embeddings
            thinking_flag tools_flag streaming_flag vision_flag embedding_flag embeddings_flag
            tool_flag images_flag image_flag]

  keys.each_with_object({}) do |key, result|
    next unless cfg.respond_to?(key)

    # respond_to? only vouches for public methods, so call through public_send.
    val = cfg.public_send(key)
    result[key] = val unless val.nil?
  rescue StandardError => e
    handle_exception(e, level: :debug, handled: true, operation: 'vllm.instance_capability_config')
  end
end
|
|
243
|
+
|
|
244
|
+
def model_capability_config(model_id)
|
|
245
|
+
models_conf = resolve_models_config
|
|
246
|
+
return {} unless models_conf.respond_to?(:to_h)
|
|
247
|
+
|
|
248
|
+
hash = models_conf.to_h
|
|
249
|
+
hash[model_id.to_s] || hash[model_id.to_sym] || {}
|
|
250
|
+
rescue StandardError => e
|
|
251
|
+
handle_exception(e, level: :debug, handled: true, operation: 'vllm.model_capability_config')
|
|
252
|
+
{}
|
|
253
|
+
end
|
|
254
|
+
|
|
255
|
+
def resolve_models_config
|
|
256
|
+
return config.models if config.respond_to?(:models)
|
|
257
|
+
return config[:models] if config.respond_to?(:[])
|
|
258
|
+
|
|
259
|
+
nil
|
|
260
|
+
end
|
|
261
|
+
|
|
262
|
+
def offering_metadata_for(model_info)
|
|
263
|
+
{
|
|
264
|
+
raw_model: model_info.id,
|
|
265
|
+
parameter_count: model_info.respond_to?(:parameter_count) ? model_info.parameter_count : nil,
|
|
266
|
+
parameter_size: model_info.respond_to?(:parameter_size) ? model_info.parameter_size : nil,
|
|
267
|
+
quantization: model_info.respond_to?(:quantization) ? model_info.quantization : nil,
|
|
268
|
+
size_bytes: model_info.respond_to?(:size_bytes) ? model_info.size_bytes : nil
|
|
269
|
+
}.compact
|
|
270
|
+
end
|
|
271
|
+
|
|
272
|
+
# ── Canonical bridge: legacy provider API → Canonical::Request ──
|
|
273
|
+
|
|
274
|
+
# rubocop:disable Metrics/ParameterLists, Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity -- bridge method can be complex
|
|
275
|
+
def build_canonical_request(
|
|
276
|
+
messages:, tools:, temperature:, model:, stream:, schema:, thinking:, tool_prefs:
|
|
277
|
+
)
|
|
278
|
+
model_id = model.respond_to?(:id) ? model.id : model.to_s
|
|
279
|
+
|
|
280
|
+
canonical_messages = messages.filter_map do |msg|
|
|
281
|
+
Canonical::Message.from_hash(msg.to_h) if msg.respond_to?(:to_h)
|
|
282
|
+
end
|
|
283
|
+
|
|
284
|
+
canonical_tools = tools.to_h.transform_values do |tool|
|
|
285
|
+
if tool.is_a?(Canonical::ToolDefinition)
|
|
286
|
+
tool
|
|
287
|
+
else
|
|
288
|
+
Canonical::ToolDefinition.from_hash(tool.respond_to?(:to_h) ? tool.to_h : tool)
|
|
289
|
+
end
|
|
290
|
+
end
|
|
291
|
+
|
|
292
|
+
params_hash = { temperature: temperature }
|
|
293
|
+
params_hash[:response_format] = schema if schema
|
|
294
|
+
canonical_params = Canonical::Params.from_hash(params_hash)
|
|
295
|
+
|
|
296
|
+
canonical_thinking = if thinking.respond_to?(:enabled?) && thinking.enabled?
|
|
297
|
+
Canonical::Thinking::Config.new(
|
|
298
|
+
effort: thinking.respond_to?(:effort) ? thinking.effort : nil
|
|
299
|
+
)
|
|
300
|
+
elsif thinking.is_a?(Hash)
|
|
301
|
+
Canonical::Thinking::Config.new(
|
|
302
|
+
effort: thinking[:effort] || thinking['effort'],
|
|
303
|
+
budget: thinking[:budget] || thinking['budget']
|
|
304
|
+
)
|
|
305
|
+
end
|
|
306
|
+
|
|
307
|
+
# Tool choice from tool_prefs
|
|
308
|
+
tool_choice = format_tool_choice_from_prefs(tool_prefs)
|
|
309
|
+
|
|
310
|
+
Canonical::Request.build(
|
|
311
|
+
messages: canonical_messages,
|
|
312
|
+
system: extract_system_prompt(messages),
|
|
313
|
+
tools: canonical_tools,
|
|
314
|
+
tool_choice: tool_choice,
|
|
315
|
+
params: canonical_params,
|
|
316
|
+
thinking: canonical_thinking,
|
|
317
|
+
stream: stream,
|
|
318
|
+
metadata: { model: model_id }
|
|
173
319
|
)
|
|
174
320
|
end
|
|
321
|
+
# rubocop:enable Metrics/ParameterLists, Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
|
322
|
+
|
|
323
|
+
# ── Canonical bridge: Canonical→legacy Message/Chunk ──
|
|
324
|
+
|
|
325
|
+
# rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity -- verbose bridge
|
|
326
|
+
# Convert a canonical response into the legacy Message shape.
#
# Tool calls are keyed by name (falling back to id), as before. When that key
# is already taken — e.g. the model called the same tool twice in one turn —
# the call id, then a positional suffix, is used instead so that no call is
# overwritten and lost.
#
# @param canonical [#thinking, #tool_calls, #usage, #text, #model] parsed response
# @param raw_body [Object] provider body, stored untouched on the message as +raw+
# @param _raw_response [Object] unused; kept for interface compatibility
# @return [Legion::Extensions::Llm::Message]
def to_legacy_message(canonical, raw_body, _raw_response)
  thinking = if canonical.thinking
               Thinking.build(
                 text: canonical.thinking.content,
                 signature: canonical.thinking.signature
               )
             end

  tool_calls = {}
  Array(canonical.tool_calls).each_with_index do |tc, index|
    primary = (tc.name || tc.id).to_s
    candidates = [primary, tc.id&.to_s, "#{primary}_#{index}"].compact
    # First candidate not already used; the positional one is the last resort.
    key = (candidates.find { |candidate| !tool_calls.key?(candidate.to_sym) } || candidates.last).to_sym

    tool_calls[key] = Legion::Extensions::Llm::ToolCall.new(
      id: tc.id,
      name: tc.name,
      arguments: tc.arguments
    )
  end

  usage = canonical.usage || {}
  # A missing usage object (or the {} placeholder) yields nil for every counter.
  read_usage = ->(field) { usage.respond_to?(field) ? usage.public_send(field) : nil }

  Legion::Extensions::Llm::Message.new(
    role: :assistant,
    content: canonical.text,
    model_id: canonical.model,
    tool_calls: tool_calls.empty? ? nil : tool_calls,
    thinking: thinking,
    input_tokens: read_usage.call(:input_tokens),
    output_tokens: read_usage.call(:output_tokens),
    reasoning_tokens: read_usage.call(:thinking_tokens),
    raw: raw_body
  )
end
|
|
359
|
+
# rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
|
360
|
+
|
|
361
|
+
# Convert a canonical stream chunk into the legacy Chunk shape.
#
# A +:thinking_delta+ chunk carries its delta as thinking text with nil
# content; every other chunk type carries the delta as content.
#
# @param canonical [#type, #delta, #usage] parsed chunk; must not be nil
#   (+build_chunk+ returns early for nil chunks)
# @param raw_data [Hash, String] the raw SSE payload, stored untouched as +raw+
# @return [Legion::Extensions::Llm::Chunk]
def to_legacy_chunk(canonical, raw_data)
  usage = canonical.usage || {}
  thinking_delta = canonical.type == :thinking_delta

  # Only read the model from a Hash payload: String#[] with a String argument
  # is a substring lookup and would return the literal "model".
  model_id = raw_data.is_a?(Hash) ? raw_data['model'] : nil

  Legion::Extensions::Llm::Chunk.new(
    role: :assistant,
    content: thinking_delta ? nil : canonical.delta,
    model_id: model_id,
    tool_calls: legacy_chunk_tool_calls(canonical),
    thinking: thinking_delta ? Thinking.build(text: canonical.delta) : nil,
    input_tokens: usage.respond_to?(:input_tokens) ? usage.input_tokens : nil,
    output_tokens: usage.respond_to?(:output_tokens) ? usage.output_tokens : nil,
    raw: raw_data
  )
end
|
|
382
|
+
|
|
383
|
+
# Map a canonical tool_call_delta onto the legacy chunk tool_calls hash.
|
|
384
|
+
# Fragment semantics matter: an entry with a non-nil id starts a new tool
|
|
385
|
+
# call in the StreamAccumulator; a nil id appends the raw arguments
|
|
386
|
+
# fragment to the most recently started call.
|
|
387
|
+
def legacy_chunk_tool_calls(canonical)
|
|
388
|
+
return nil unless canonical.type == :tool_call_delta && canonical.tool_call
|
|
389
|
+
|
|
390
|
+
tc = canonical.tool_call
|
|
391
|
+
key = (tc.id || tc.name || :fragment).to_s.to_sym
|
|
392
|
+
{
|
|
393
|
+
key => Legion::Extensions::Llm::ToolCall.new(
|
|
394
|
+
id: tc.id,
|
|
395
|
+
name: tc.name,
|
|
396
|
+
arguments: tc.arguments
|
|
397
|
+
)
|
|
398
|
+
}
|
|
399
|
+
end
|
|
400
|
+
|
|
401
|
+
# ── Tool choice helpers ──
|
|
402
|
+
|
|
403
|
+
# Derive the canonical tool choice from the caller's tool preferences.
#
# @param tool_prefs [Hash, nil] preferences; the choice is read from
#   +:choice+ or +'choice'+
# @return [Symbol, Hash, nil] +:auto+, +:none+ or +:required+ for the built-in
#   modes, +{ name: "<tool>" }+ for a specific tool, or nil when no usable
#   choice is present
def format_tool_choice_from_prefs(tool_prefs)
  return nil unless tool_prefs

  choice = tool_prefs[:choice] || tool_prefs['choice']
  return nil unless choice

  if choice.is_a?(Hash)
    # Accept { name: } and the OpenAI-style { function: { name: } } instead of
    # stringifying the whole Hash into a bogus tool name.
    function = choice[:function] || choice['function']
    source = function.is_a?(Hash) ? function : choice
    choice = source[:name] || source['name']
  end

  # A blank name would force a tool that cannot exist; treat it as no preference.
  return nil if choice.nil? || choice.to_s.strip.empty?
  return choice.to_sym if %w[auto none required].include?(choice.to_s)

  { name: choice.to_s }
end
|
|
412
|
+
|
|
413
|
+
# rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity -- multibranch guard chain for system parsing
|
|
414
|
+
# Return the system prompt carried by the first message, if any.
#
# Only the first element is inspected. It may be an object exposing +role+ and
# +content+, or a Hash with symbol or string keys. A missing or nil role is
# treated as "not a system message" rather than raising.
#
# @param messages [Array, Object] conversation messages; non-Array input yields nil
# @return [String, nil] the content when the first message has the system role
#   and String content, otherwise nil
def extract_system_prompt(messages)
  return nil unless messages.is_a?(Array)

  first = messages.first
  return nil unless first

  role = first.respond_to?(:role) ? first.role : (first[:role] || first['role'])
  # to_s makes :system and 'system' equivalent and is safe for a nil role.
  return nil unless role.to_s == 'system'

  content = first.respond_to?(:content) ? first.content : (first[:content] || first['content'])
  content.is_a?(String) ? content : nil
end
|
|
427
|
+
# rubocop:enable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
|
175
428
|
|
|
176
429
|
def render_payload(messages, tools:, temperature:, model:, stream:, schema:, thinking:, tool_prefs:) # rubocop:disable Metrics/ParameterLists
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
430
|
+
# Build a canonical request from provider call parameters,
|
|
431
|
+
# then delegate to the translator for wire-format rendering.
|
|
432
|
+
canonical_req = build_canonical_request(
|
|
433
|
+
messages:, tools:, temperature:, model:, stream:,
|
|
434
|
+
schema:, thinking:, tool_prefs:
|
|
435
|
+
)
|
|
436
|
+
wire = translator.render_request(canonical_req)
|
|
437
|
+
|
|
180
438
|
log.debug do
|
|
181
|
-
"rendered
|
|
182
|
-
"
|
|
439
|
+
"vLLM provider rendered wire payload model=#{wire[:model]} stream=#{wire[:stream]} " \
|
|
440
|
+
"messages=#{(wire[:messages] || []).size} keys=#{wire.keys.join(', ')}"
|
|
183
441
|
end
|
|
184
|
-
|
|
442
|
+
wire
|
|
185
443
|
end
|
|
186
444
|
|
|
187
445
|
def thinking_enabled?(thinking)
|
|
@@ -214,6 +472,24 @@ module Legion
|
|
|
214
472
|
vllm[:enable_thinking] == true || vllm['enable_thinking'] == true
|
|
215
473
|
end
|
|
216
474
|
|
|
475
|
+
# Override: delegate completion response parsing to the canonical translator.
|
|
476
|
+
def parse_completion_response(response)
|
|
477
|
+
body = response.body
|
|
478
|
+
canonical = translator.parse_response(body)
|
|
479
|
+
|
|
480
|
+
# Convert Canonical::Response back to the legacy Message/Chunk shape
|
|
481
|
+
# that the Provider base class expects (backward compat with existing callers).
|
|
482
|
+
to_legacy_message(canonical, body, response)
|
|
483
|
+
end
|
|
484
|
+
|
|
485
|
+
# Override: delegate SSE chunk parsing to the canonical translator.
|
|
486
|
+
def build_chunk(data)
|
|
487
|
+
canonical_chunk = translator.parse_chunk(data)
|
|
488
|
+
return nil if canonical_chunk.nil?
|
|
489
|
+
|
|
490
|
+
to_legacy_chunk(canonical_chunk, data)
|
|
491
|
+
end
|
|
492
|
+
|
|
217
493
|
def parse_list_models_response(response, provider, capabilities)
|
|
218
494
|
response.body.fetch('data', []).map do |model|
|
|
219
495
|
critical_capabilities = critical_capabilities_for(capabilities, model)
|
|
@@ -0,0 +1,703 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'legion/extensions/llm/canonical'
|
|
4
|
+
require 'legion/extensions/llm/responses/thinking_extractor'
|
|
5
|
+
require 'legion/json'
|
|
6
|
+
require 'legion/logging'
|
|
7
|
+
|
|
8
|
+
module Legion
|
|
9
|
+
module Extensions
|
|
10
|
+
module Llm
|
|
11
|
+
module Vllm
|
|
12
|
+
# Canonical provider translator for vLLM (OpenAI-compatible wire format).
|
|
13
|
+
#
|
|
14
|
+
# Implements render_request, parse_response, parse_chunk, and capabilities.
|
|
15
|
+
# Extracted from existing format_openai_*/parse_* methods in OpenAICompatible mixin
|
|
16
|
+
# and vLLM-specific render_payload override in Provider.
|
|
17
|
+
#
|
|
18
|
+
# vLLM quirks (declared in capabilities):
|
|
19
|
+
# - tool_calls_as_text: true — some model configurations output tool calls
|
|
20
|
+
# as JSON text in the content field rather than structured tool_calls.
|
|
21
|
+
# - forced_tool_choice: true — vLLM's tool_choice handling is strict;
|
|
22
|
+
# named tool choices must be explicit function references.
|
|
23
|
+
# - thinking_tags: ['think', 'thinking'] — Qwen-style models emit reasoning
|
|
24
|
+
# in <think> or <thinking> tags within content text.
|
|
25
|
+
# rubocop:disable Metrics/ClassLength, Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity -- translator implementation
|
|
26
|
+
class Translator
|
|
27
|
+
include Legion::Logging::Helper
|
|
28
|
+
|
|
29
|
+
# vLLM-specific stop_reason mapping (per conformance fixture stop_reason_matrix).
|
|
30
|
+
VLLM_STOP_REASON_MAP = {
|
|
31
|
+
'stop' => :end_turn,
|
|
32
|
+
'tool_use' => :tool_use,
|
|
33
|
+
'length' => :max_tokens
|
|
34
|
+
}.freeze
|
|
35
|
+
FALLBACK_STOP_REASON = :end_turn
|
|
36
|
+
|
|
37
|
+
# G18 parameter mapping: supported canonical params.
|
|
38
|
+
SUPPORTED_PARAMS = %i[
|
|
39
|
+
max_tokens temperature top_p top_k stop_sequences
|
|
40
|
+
seed frequency_penalty presence_penalty response_format
|
|
41
|
+
].freeze
|
|
42
|
+
|
|
43
|
+
# vLLM wire keys for supported params (most are 1:1 with canonical names).
|
|
44
|
+
PARAM_WIRE_KEYS = {
|
|
45
|
+
max_tokens: :max_tokens,
|
|
46
|
+
temperature: :temperature,
|
|
47
|
+
top_p: :top_p,
|
|
48
|
+
top_k: :top_k,
|
|
49
|
+
stop_sequences: :stop,
|
|
50
|
+
seed: :seed,
|
|
51
|
+
frequency_penalty: :frequency_penalty,
|
|
52
|
+
presence_penalty: :presence_penalty,
|
|
53
|
+
response_format: :response_format
|
|
54
|
+
}.freeze
|
|
55
|
+
|
|
56
|
+
def initialize(config: nil)
|
|
57
|
+
@config = config
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
# Render a canonical request into an OpenAI-compatible wire payload for vLLM.
|
|
61
|
+
def render_request(request)
|
|
62
|
+
model = request.metadata&.dig(:model) || 'default'
|
|
63
|
+
messages = format_messages(request)
|
|
64
|
+
payload = {
|
|
65
|
+
model: model,
|
|
66
|
+
messages: messages,
|
|
67
|
+
stream: request.stream
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
payload[:tools] = format_tools(request.tools) unless request.tools.to_h.empty?
|
|
71
|
+
payload[:tool_choice] = format_tool_choice(request.tool_choice) if request.tool_choice
|
|
72
|
+
payload.merge!(map_params_to_wire(request.params)) if request.params
|
|
73
|
+
apply_thinking_config(payload, request)
|
|
74
|
+
if formatted_response_format?(request.params)
|
|
75
|
+
payload[:response_format] =
|
|
76
|
+
format_response_format(request.params)
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
log.debug do
|
|
80
|
+
"vLLM translator rendered request model=#{model} stream=#{request.stream} " \
|
|
81
|
+
"messages=#{messages.size} tools=#{request.tools&.size || 0} params=#{payload.keys.size}"
|
|
82
|
+
end
|
|
83
|
+
|
|
84
|
+
payload
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
# Parse a vLLM/OpenAI-compatible completion response into a Canonical::Response.
|
|
88
|
+
def parse_response(wire)
|
|
89
|
+
return canonical_error_response(wire) unless wire.is_a?(Hash)
|
|
90
|
+
# Canonical-form response (from conformance kit): already in canonical shape
|
|
91
|
+
return Canonical::Response.from_hash(wire) if canonical_response?(wire)
|
|
92
|
+
|
|
93
|
+
choice = Array(wire['choices']).first || {}
|
|
94
|
+
message = choice['message'] || {}
|
|
95
|
+
usage = wire['usage'] || {}
|
|
96
|
+
finish_reason = choice['finish_reason']
|
|
97
|
+
model = wire['model']
|
|
98
|
+
|
|
99
|
+
content = message['content'] || ''
|
|
100
|
+
thinking_meta = extract_thinking_metadata(message)
|
|
101
|
+
extraction = Responses::ThinkingExtractor.extract(content, metadata: thinking_meta)
|
|
102
|
+
|
|
103
|
+
text = extraction.content || ''
|
|
104
|
+
thinking = build_canonical_thinking(extraction)
|
|
105
|
+
|
|
106
|
+
tool_calls = parse_tool_calls(message['tool_calls'])
|
|
107
|
+
|
|
108
|
+
# vLLM quirk: tool_calls_as_text — synthesize from content if none found.
|
|
109
|
+
if tool_calls.empty?
|
|
110
|
+
synthesized = synthesize_tool_calls_from_content(extraction.content, message)
|
|
111
|
+
tool_calls.concat(synthesized) unless synthesized.empty?
|
|
112
|
+
end
|
|
113
|
+
|
|
114
|
+
stop_reason = map_stop_reason(finish_reason)
|
|
115
|
+
|
|
116
|
+
Canonical::Response.build(
|
|
117
|
+
text: text.to_s,
|
|
118
|
+
thinking: thinking,
|
|
119
|
+
tool_calls: tool_calls,
|
|
120
|
+
usage: Canonical::Usage.from_hash(usage),
|
|
121
|
+
stop_reason: stop_reason,
|
|
122
|
+
model: model,
|
|
123
|
+
metadata: wire_metadata(wire, message, thinking_meta)
|
|
124
|
+
)
|
|
125
|
+
rescue Legion::JSON::ParseError => e
|
|
126
|
+
handle_exception(e, level: :warn, handled: true, operation: 'vllm.translator.parse_response')
|
|
127
|
+
canonical_error_response(wire)
|
|
128
|
+
rescue StandardError => e
|
|
129
|
+
handle_exception(e, level: :error, handled: false, operation: 'vllm.translator.parse_response')
|
|
130
|
+
raise
|
|
131
|
+
end
|
|
132
|
+
|
|
133
|
+
# Parse a single SSE chunk into a Canonical::Chunk or nil.
|
|
134
|
+
# Parse a single SSE chunk into a Canonical::Chunk, or nil when the chunk
# carries nothing to emit.
#
# Checks run in this order: blank / "[DONE]" sentinel, canonical-form chunk
# (has a 'type' key), error payload, usage-only final chunk, finish chunk with
# an empty delta, tool-call delta, reasoning delta, text delta.
#
# @param raw [Hash, String, nil] decoded chunk Hash or raw JSON text
# @return [Canonical::Chunk, nil]
# @raise [StandardError] any unexpected failure is reported and re-raised;
#   Legion::JSON::ParseError is reported and turned into nil
def parse_chunk(raw)
  return nil if raw.nil?

  if raw.is_a?(String)
    stripped = raw.strip
    # Recognise the end-of-stream sentinel even with surrounding whitespace.
    return nil if stripped.empty? || stripped == '[DONE]'
  end

  # parse_json_safely is defined outside this view — presumably nil on failure; verify.
  data = raw.is_a?(Hash) ? raw : parse_json_safely(raw)
  # Valid JSON that is not an object (array, number, string) cannot be a chunk;
  # indexing it with a String key below would raise.
  return nil unless data.is_a?(Hash)

  # Handle canonical-form chunks (from conformance fixtures or other translators)
  return handle_canonical_chunk(data) if data['type']

  if data['error']
    return Canonical::Chunk.error_chunk(
      error: data['error'],
      request_id: data['id']
    )
  end

  choice = Array(data['choices']).first
  return build_done_chunk(data) if choice.nil? && data['usage']
  return nil unless choice

  delta = choice['delta'] || {}
  finish_reason = choice['finish_reason']
  request_id = data['request_id'] || data['id']

  if finish_reason && empty_delta?(delta)
    return Canonical::Chunk.done(
      request_id: request_id,
      usage: Canonical::Usage.from_hash(data['usage']),
      stop_reason: map_stop_reason(finish_reason)
    )
  end

  # Only the first tool-call entry of a delta is translated.
  tool_calls = Array(delta['tool_calls'])
  return build_tool_call_delta_chunk(tool_calls.first, request_id) unless tool_calls.empty?

  # Thinking delta from reasoning_content
  reasoning_content = delta['reasoning_content'] || delta['reasoning']
  unless reasoning_content.to_s.empty?
    return Canonical::Chunk.thinking_delta(
      delta: reasoning_content,
      request_id: request_id,
      block_index: delta.dig('content_block', 'index'),
      item_id: delta['content_block_start']&.dig('id')
    )
  end

  # Text delta — check for embedded think tags
  content = delta['content']
  return parse_text_delta_with_thinking(content, request_id, data) unless content.to_s.empty?

  nil
rescue Legion::JSON::ParseError => e
  handle_exception(e, level: :warn, handled: true, operation: 'vllm.translator.parse_chunk')
  nil
rescue StandardError => e
  handle_exception(e, level: :error, handled: false, operation: 'vllm.translator.parse_chunk')
  raise
end
|
|
193
|
+
|
|
194
|
+
# Declared capabilities for the vLLM provider.
|
|
195
|
+
def capabilities
|
|
196
|
+
{
|
|
197
|
+
provider: 'vllm',
|
|
198
|
+
wire_format: 'openai_compatible',
|
|
199
|
+
tool_calls_as_text: true,
|
|
200
|
+
forced_tool_choice: true,
|
|
201
|
+
thinking_tags: %w[think thinking],
|
|
202
|
+
stop_reason_map: VLLM_STOP_REASON_MAP,
|
|
203
|
+
streaming_token_usage: true
|
|
204
|
+
}.freeze
|
|
205
|
+
end
|
|
206
|
+
|
|
207
|
+
private
|
|
208
|
+
|
|
209
|
+
attr_reader :config
|
|
210
|
+
|
|
211
|
+
# ── Message formatting ──
|
|
212
|
+
|
|
213
|
+
# Build the wire message list for a request.
#
# System-role messages are removed from the conversation; when the request
# carries a non-blank +system+ value, it is emitted once as the leading system
# message. The value is coerced with +to_s+ before stripping, so the emptiness
# check and the emitted content agree and a non-String value cannot raise.
#
# @param request [#messages, #system] canonical request
# @return [Array<Hash>] wire-format messages
def format_messages(request)
  non_system = request.messages&.reject { |m| m.role.to_s == 'system' } || []
  messages = format_request_messages(non_system)

  system_text = request.system.to_s.strip
  return messages if system_text.empty?

  [{ role: 'system', content: system_text }] + messages
end
|
|
223
|
+
|
|
224
|
+
def format_request_messages(messages)
|
|
225
|
+
return [] if messages.nil? || messages.empty?
|
|
226
|
+
|
|
227
|
+
messages.map { |msg| format_message(msg) }
|
|
228
|
+
end
|
|
229
|
+
|
|
230
|
+
def format_message(msg)
|
|
231
|
+
role = msg.role.to_s
|
|
232
|
+
content = format_message_content(msg)
|
|
233
|
+
tool_calls = format_message_tool_calls(msg.tool_calls) if msg.tool_calls&.any?
|
|
234
|
+
tool_call_id = msg.tool_call_id
|
|
235
|
+
name = msg.name
|
|
236
|
+
|
|
237
|
+
{
|
|
238
|
+
role: role,
|
|
239
|
+
content: content,
|
|
240
|
+
tool_call_id: tool_call_id,
|
|
241
|
+
tool_calls: tool_calls,
|
|
242
|
+
name: name
|
|
243
|
+
}.compact.reject { |k, v| k == :name && (v.nil? || v.to_s.empty?) }
|
|
244
|
+
end
|
|
245
|
+
|
|
246
|
+
def format_message_content(msg)
|
|
247
|
+
content = msg.content
|
|
248
|
+
return content if content.is_a?(String) && !content.empty?
|
|
249
|
+
|
|
250
|
+
case content
|
|
251
|
+
when Array
|
|
252
|
+
format_content_blocks(content)
|
|
253
|
+
when Canonical::ContentBlock
|
|
254
|
+
format_content_blocks([content])
|
|
255
|
+
when Hash
|
|
256
|
+
format_content_blocks_from_hash(content)
|
|
257
|
+
else
|
|
258
|
+
content&.to_s
|
|
259
|
+
end
|
|
260
|
+
end
|
|
261
|
+
|
|
262
|
+
def format_content_blocks(blocks)
|
|
263
|
+
parts = blocks.map do |block|
|
|
264
|
+
if block.is_a?(Canonical::ContentBlock)
|
|
265
|
+
format_content_block(block)
|
|
266
|
+
elsif block.is_a?(Hash)
|
|
267
|
+
format_content_block_from_hash(block)
|
|
268
|
+
else
|
|
269
|
+
{ type: 'text', text: block.to_s }
|
|
270
|
+
end
|
|
271
|
+
end
|
|
272
|
+
parts.empty? ? '' : parts
|
|
273
|
+
end
|
|
274
|
+
|
|
275
|
+
# rubocop:disable Lint/DuplicateBranch -- multiple block types intentionally normalize to text in OpenAI wire format
|
|
276
|
+
def format_content_block(block)
|
|
277
|
+
case block.type
|
|
278
|
+
when :text, :thinking, :tool_result
|
|
279
|
+
{ type: 'text', text: block.text.to_s }
|
|
280
|
+
when :tool_use
|
|
281
|
+
{ type: 'text', text: Legion::JSON.generate(block.input || {}) }
|
|
282
|
+
when :image
|
|
283
|
+
build_image_block(block)
|
|
284
|
+
else
|
|
285
|
+
{ type: 'text', text: block.text.to_s }
|
|
286
|
+
end
|
|
287
|
+
end
|
|
288
|
+
# rubocop:enable Lint/DuplicateBranch
|
|
289
|
+
|
|
290
|
+
def format_content_blocks_from_hash(hash_input)
|
|
291
|
+
case hash_input
|
|
292
|
+
when Hash
|
|
293
|
+
[format_content_block_from_hash(hash_input)]
|
|
294
|
+
when Array
|
|
295
|
+
hash_input.map { |h| format_content_block_from_hash(h) }
|
|
296
|
+
else
|
|
297
|
+
[]
|
|
298
|
+
end
|
|
299
|
+
end
|
|
300
|
+
|
|
301
|
+
# rubocop:disable Lint/DuplicateBranch -- multiple block types intentionally normalize to text in OpenAI wire format
|
|
302
|
+
# Convert one Hash-shaped content block into a content part.
#
# Keys may be Strings or Symbols; a missing :type is treated as :text.
# Image handling:
# * base64 data with a media type is wrapped in a data URI, mirroring
#   build_image_block (data that already is a data URI is left alone);
# * otherwise the URL is taken from :data, then :url, then a nested
#   `image_url: { url: ... }` (or `image_url: "..."`), so blocks that
#   already use the nested image_url shape keep their URL instead of
#   collapsing to ''.
def format_content_block_from_hash(block_hash)
  h = block_hash.transform_keys(&:to_sym)
  type = (h[:type] || :text).to_sym

  case type
  when :text, :thinking, :tool_result
    { type: 'text', text: h[:text].to_s }
  when :tool_use
    { type: 'text', text: Legion::JSON.generate(h[:input] || {}) }
  when :image, :image_url
    data = h[:data]
    wrap_base64 = data && h[:media_type] && h[:source_type].to_s == 'base64' &&
                  !data.to_s.start_with?('data:')

    # Nested form: the URL lives one level down and may use either key type.
    nested = h[:image_url]
    nested = nested[:url] || nested['url'] if nested.is_a?(Hash)
    nested = nil unless nested.is_a?(String)

    url = if wrap_base64
            "data:#{h[:media_type]};base64,#{data}"
          else
            data || h[:url] || nested || ''
          end
    { type: 'image_url', image_url: { url: url } }
  else
    { type: 'text', text: h[:text].to_s }
  end
end
|
|
317
|
+
# rubocop:enable Lint/DuplicateBranch
|
|
318
|
+
|
|
319
|
+
# Build an image_url content part from a canonical image block.
#
# Returns {} when the block carries no data. The guard checks the data
# alone: a block that only has a source_type has nothing to send and would
# otherwise yield a nil URL or a data URI with an empty payload.
# Base64 data with a media type is wrapped in a data URI; any other data is
# used as the URL as-is.
def build_image_block(block)
  return {} unless block.data

  url = if block.source_type == :base64 && block.media_type
          "data:#{block.media_type};base64,#{block.data}"
        else
          block.data
        end
  { type: 'image_url', image_url: { url: url } }
end
|
|
329
|
+
|
|
330
|
+
# Format a message's tool calls for inclusion in conversation history.
#
# Accepts nil, a Hash (its values are used), an Array, or a single entry.
# nil and empty collections yield []. The explicit nil guard replaces the
# former `tool_calls.empty?` check, which raised NoMethodError for nil and
# for single entries that do not respond to #empty?.
def format_message_tool_calls(tool_calls)
  return [] if tool_calls.nil?

  tc_array = tool_calls.is_a?(Hash) ? tool_calls.values : Array(tool_calls)
  tc_array.map { |tc| format_tool_call_for_history(tc) }
end
|
|
336
|
+
|
|
337
|
+
# Render one tool call as a `type: 'function'` history entry.
#
# Canonical::ToolCall objects and Hashes (String or Symbol keys) are
# reduced to name / id / arguments; other objects are indexed directly.
# Hash arguments are serialized to JSON, anything else is stringified.
def format_tool_call_for_history(tool_call_entry)
  fields =
    if tool_call_entry.is_a?(Canonical::ToolCall)
      { name: tool_call_entry.name&.to_s,
        id: tool_call_entry.id&.to_s,
        arguments: tool_call_entry.arguments || {} }
    elsif tool_call_entry.is_a?(Hash)
      tool_call_entry.transform_keys(&:to_sym)
    else
      tool_call_entry
    end

  call_name = fields[:name] || fields['name']
  call_id = fields[:id] || fields['id']
  raw_args = fields[:arguments] || fields['arguments'] || {}
  serialized = raw_args.is_a?(Hash) ? Legion::JSON.generate(raw_args) : raw_args.to_s

  { id: call_id.to_s, type: 'function', function: { name: call_name.to_s, arguments: serialized } }
end
|
|
359
|
+
|
|
360
|
+
# ── Tool formatting ──
|
|
361
|
+
|
|
362
|
+
# Convert the tool registry into a list of `type: 'function'` definitions.
#
# `tools` is coerced with #to_h and only its values are used. Each tool may
# be a Canonical::ToolDefinition, a Hash (String or Symbol keys) or another
# indexable object. Parameters come from :parameters or :input_schema and
# are normalized through ToolDefinition.normalize_parameters.
def format_tools(tools)
  registry = tools.to_h
  return [] if registry.empty?

  registry.each_value.map do |entry|
    spec = case entry
           when Canonical::ToolDefinition
             { name: entry.name, description: entry.description, parameters: entry.parameters }
           when Hash
             entry.transform_keys(&:to_sym)
           else
             entry
           end

    schema = spec[:parameters] || spec[:input_schema]
    # Schema objects that are not plain Hashes are flattened first.
    schema = schema.to_h if !schema.is_a?(Hash) && schema.respond_to?(:to_h)

    {
      type: 'function',
      function: {
        name: (spec[:name] || spec['name']).to_s,
        description: (spec[:description] || spec['description'] || '').to_s,
        parameters: Legion::Extensions::Llm::Canonical::ToolDefinition.normalize_parameters(schema)
      }
    }
  end
end
|
|
390
|
+
|
|
391
|
+
# Translate a tool-choice setting into its wire value.
#
# * nil / false                 -> nil
# * auto / none / required      -> the same word as a String
# * Hash                        -> function selector using its :name / 'name'
# * any other Symbol or String  -> function selector for that name
# * anything else               -> nil
def format_tool_choice(choice)
  return nil unless choice

  target =
    if choice.is_a?(Hash)
      choice[:name] || choice['name']
    elsif choice.is_a?(Symbol) || choice.is_a?(String)
      keyword = choice.to_s
      return keyword if %w[auto none required].include?(keyword)

      choice
    else
      return nil
    end

  { type: 'function', function: { name: target.to_s } }
end
|
|
408
|
+
|
|
409
|
+
# ── Parameter mapping (G18) ──
|
|
410
|
+
|
|
411
|
+
# Map canonical request params onto their wire keys.
#
# Returns {} unless `params` is a Canonical::Params. Each entry of
# SUPPORTED_PARAMS with a non-nil value is copied under its
# PARAM_WIRE_KEYS name; stop sequences and response formats are converted
# by their dedicated formatters. max_thinking_tokens is never written to
# the result here; its presence is only reported at debug level.
def map_params_to_wire(params)
  return {} unless params.is_a?(Canonical::Params)

  wire = SUPPORTED_PARAMS.each_with_object({}) do |key, acc|
    raw = params.public_send(key)
    next if raw.nil?

    acc[PARAM_WIRE_KEYS[key]] =
      if key == :stop_sequences
        format_stop_sequences(raw)
      elsif key == :response_format
        format_response_format_value(raw)
      else
        raw
      end
  end

  dropped = []
  dropped << :max_thinking_tokens if params.max_thinking_tokens

  unless dropped.empty?
    log.debug do
      "vLLM translator dropping unsupported params: #{dropped.join(', ')} " \
        '(handled via vLLM-specific render paths)'
    end
  end

  wire
end
|
|
442
|
+
|
|
443
|
+
# Ensure stop sequences are an Array: Arrays pass through unchanged, any
# other value (including nil) is wrapped in a one-element Array.
def format_stop_sequences(sequences)
  return sequences if sequences.is_a?(Array)

  [sequences]
end
|
|
446
|
+
|
|
447
|
+
# Wire-format the response_format carried by `params`, or return nil when
# formatted_response_format? reports there is none to format.
def format_response_format(params)
  format_response_format_value(params.response_format) if formatted_response_format?(params)
end
|
|
452
|
+
|
|
453
|
+
# Truthy when `params` is a Canonical::Params carrying a response_format.
# Returns false for any other object, otherwise the response_format value
# itself (which may be nil).
def formatted_response_format?(params)
  return false unless params.is_a?(Canonical::Params)

  params.response_format
end
|
|
456
|
+
|
|
457
|
+
# Convert a response_format value into its wire shape.
#
# Strings pass through. Hashes (String or Symbol keys) are inspected by
# their type:
# * 'json_schema' -> { type: 'json_schema', json_schema: <schema> }, taking
#   the schema from :schema first, then :json_schema
# * 'json_object' -> { type: 'json_object' }
# Every other input, including unknown types and non-Hash values, is
# returned unchanged.
def format_response_format_value(value)
  return value if value.is_a?(String)

  spec = value.is_a?(Hash) ? value.transform_keys(&:to_sym) : {}
  kind = (spec[:type] || spec['type'])&.to_s

  if kind == 'json_schema'
    schema = spec.values_at(:schema, 'schema', :json_schema, 'json_schema').find(&:itself)
    { type: 'json_schema', json_schema: schema }
  elsif kind == 'json_object'
    { type: 'json_object' }
  else
    value
  end
end
|
|
473
|
+
|
|
474
|
+
# ── Thinking configuration ──
|
|
475
|
+
|
|
476
|
+
# Switch on thinking in the payload when enable_thinking? says so.
#
# Sets payload[:chat_template_kwargs] to { enable_thinking: true }. A
# positive max_thinking_tokens on the request params is only reported at
# debug level; this method does not write it into the payload.
def apply_thinking_config(payload, request)
  return unless enable_thinking?(request)

  payload[:chat_template_kwargs] = { enable_thinking: true }

  token_budget = request.params&.max_thinking_tokens
  return if token_budget.nil? || !token_budget.positive?

  log.debug { "vLLM translator thinking max_thinking_tokens=#{token_budget} via chat template" }
end
|
|
485
|
+
|
|
486
|
+
# Decide whether thinking should be enabled for this request.
#
# * A Canonical::Thinking::Config enables thinking when it reports enabled?.
# * A Hash enables thinking unless its enabled flag is exactly false. The
#   flag is read from :enabled or, when that key is absent, from 'enabled',
#   so a String-keyed `{ 'enabled' => false }` is honoured as well.
# * With no request-level setting, the translator config is consulted and
#   thinking is enabled only when its enable_thinking value is exactly true.
def enable_thinking?(request)
  thinking = request.thinking
  return true if thinking.is_a?(Canonical::Thinking::Config) && thinking.enabled?

  if thinking.is_a?(Hash)
    flag = thinking.key?(:enabled) ? thinking[:enabled] : thinking['enabled']
    return flag != false
  end

  if thinking.nil? && config
    config_thinking = if config.respond_to?(:enable_thinking)
                        config.enable_thinking
                      else
                        config.respond_to?(:[]) ? config[:enable_thinking] : nil
                      end
    return true if config_thinking == true
  end

  false
end
|
|
501
|
+
|
|
502
|
+
# ── Response parsing ──
|
|
503
|
+
|
|
504
|
+
# Build a canonical error response from a (possibly malformed) wire body.
#
# Non-Hash input is treated as an empty body. The 'error' entry of the body
# is stored under metadata[:error]; when it is missing a generic
# parse_error descriptor is used instead.
def canonical_error_response(wire)
  payload = wire.is_a?(Hash) ? wire : {}
  failure = payload['error'] || { type: 'parse_error', message: 'Failed to parse response' }
  usage = Canonical::Usage.from_hash(payload['usage'] || {})

  Canonical::Response.build(
    text: '',
    tool_calls: [],
    usage: usage,
    stop_reason: :error,
    model: payload['model'],
    metadata: { error: failure }
  )
end
|
|
517
|
+
|
|
518
|
+
# Collect the reasoning/thinking fields present on a wire message.
# Returns a Symbol-keyed Hash containing only the fields whose value is
# not nil.
def extract_thinking_metadata(message)
  %w[reasoning_content reasoning thinking thinking_text thinking_signature reasoning_signature]
    .to_h { |field| [field.to_sym, message[field]] }
    .compact
end
|
|
528
|
+
|
|
529
|
+
# Wrap an extraction result in a Canonical::Thinking, or return nil when it
# has neither thinking text nor a signature.
def build_canonical_thinking(extraction)
  reasoning = extraction.thinking
  signature = extraction.signature
  return nil if !reasoning && !signature

  Canonical::Thinking.new(content: reasoning, signature: signature)
end
|
|
537
|
+
|
|
538
|
+
# Convert wire tool calls into canonical tool calls.
#
# Returns [] unless given a non-empty Array. The call id falls back to the
# function name and then to the call's index. An entry that raises while
# being converted is reported through handle_exception and left out of the
# result.
def parse_tool_calls(tool_calls)
  return [] if !tool_calls.is_a?(Array) || tool_calls.empty?

  tool_calls.filter_map do |entry|
    fn = entry.fetch('function', {})
    fn_name = fn['name']
    call_id = entry['id'] || fn_name || entry['index']
    parsed_args = parse_tool_arguments(fn['arguments'])

    Canonical::ToolCall.build(id: call_id.to_s, name: fn_name.to_s, arguments: parsed_args, source: :client)
  rescue StandardError => e
    handle_exception(e, level: :warn, handled: true, operation: 'vllm.translator.parse_tool_call')
    nil
  end
end
|
|
558
|
+
|
|
559
|
+
# Decode tool-call arguments.
#
# Hashes are returned as-is, nil and '' become {}, and anything else is
# parsed with Legion::JSON.load. A Legion::JSON::ParseError yields {}.
def parse_tool_arguments(arguments)
  return arguments if arguments.is_a?(Hash)

  blank = arguments.nil? || arguments == ''
  blank ? {} : Legion::JSON.load(arguments)
rescue Legion::JSON::ParseError
  {}
end
|
|
567
|
+
|
|
568
|
+
# vLLM quirk: synthesize tool calls from content text JSON.
|
|
569
|
+
# Try to recover a tool call that was written as JSON in the content text.
#
# The whole content is tried first; failing that, the first brace-delimited
# fragment (without nested braces) that mentions tool / function / name /
# arguments is tried. Returns an Array with zero or one tool call.
def synthesize_tool_calls_from_content(content, _message)
  return [] if !content.is_a?(String) || content.empty?

  whole = try_parse_tool_call_from_text(content)
  return [whole] if whole

  fragment = content[/\{[^{}]*(?:tool|function|name|arguments)[^{}]*\}/m]
  return [] unless fragment

  embedded = try_parse_tool_call_from_text(fragment)
  embedded ? [embedded] : []
end
|
|
581
|
+
|
|
582
|
+
# Parse `text` as JSON and build a canonical tool call from it.
#
# The name is read from :name or :function_name; the arguments from
# :arguments, :parameters or :input (String arguments are parsed as JSON
# once more). Returns nil when the text is not a JSON object, has no name,
# or raises Legion::JSON::ParseError. Non-Hash arguments become {}.
def try_parse_tool_call_from_text(text)
  decoded = Legion::JSON.load(text)
  return nil unless decoded.is_a?(Hash)

  tool_name = decoded[:name] || decoded[:function_name]
  payload = decoded[:arguments] || decoded[:parameters] || decoded[:input] || {}
  # Arguments are decoded before the name check, so a malformed argument
  # string rejects the candidate through the rescue below.
  payload = Legion::JSON.load(payload) if payload.is_a?(String)

  return nil if tool_name.nil? || tool_name.to_s.empty?

  payload = {} unless payload.is_a?(Hash)
  Canonical::ToolCall.build(name: tool_name.to_s, arguments: payload, source: :client)
rescue Legion::JSON::ParseError
  nil
end
|
|
600
|
+
|
|
601
|
+
# Collect response metadata from the wire body and message:
# the message's reasoning_content and the usage's
# completion_tokens_details, each only when present (truthy).
def wire_metadata(wire, message, _thinking_meta)
  reasoning = message['reasoning_content']
  usage = wire['usage']
  details = usage['completion_tokens_details'] if usage.is_a?(Hash)

  { reasoning_content: reasoning, completion_tokens_details: details }.select { |_key, value| value }
end
|
|
610
|
+
|
|
611
|
+
# ── Chunk helpers ──
|
|
612
|
+
|
|
613
|
+
# Build the terminal "done" chunk for a stream.
# The request id comes from 'request_id', falling back to 'id'; usage is
# built from the chunk's 'usage' entry and the stop reason is left nil.
def build_done_chunk(data)
  rid = data['request_id'] || data['id']
  usage = Canonical::Usage.from_hash(data['usage'])

  Canonical::Chunk.done(request_id: rid, usage: usage, stop_reason: nil)
end
|
|
620
|
+
|
|
621
|
+
# Build a tool_call_delta chunk preserving OpenAI streaming fragment
|
|
622
|
+
# semantics: the opening fragment carries id + name; continuation
|
|
623
|
+
# fragments carry id: nil and a raw partial-JSON arguments string.
|
|
624
|
+
# The StreamAccumulator keys off a nil id to append fragments to the
|
|
625
|
+
# current tool call, so the id must NOT be synthesized here.
|
|
626
|
+
# Build a tool_call_delta chunk from one streamed tool-call fragment.
#
# The fragment's id and name are passed through exactly as received (nil
# when absent) and its arguments are kept as a raw String rather than
# parsed. The fragment's 'index' becomes the chunk's block_index.
def build_tool_call_delta_chunk(first_call, request_id)
  fn = first_call.fetch('function', {})

  fragment = Canonical::ToolCall.new(
    id: first_call['id'],
    exchange_id: nil,
    name: fn['name'],
    arguments: fn['arguments'].to_s,
    source: :client,
    status: nil,
    duration_ms: nil,
    result: nil,
    error: nil,
    started_at: nil,
    finished_at: nil,
    category: nil,
    data_handling_classification: nil,
    policy_decision: nil
  )

  Canonical::Chunk.tool_call_delta(tool_call: fragment, request_id: request_id, block_index: first_call['index'])
end
|
|
643
|
+
|
|
644
|
+
# True when a streamed delta carries no content, no tool calls and no
# reasoning content (missing and empty values are treated alike).
def empty_delta?(delta)
  return false unless delta['content'].to_s.empty?
  return false unless Array(delta['tool_calls']).empty?

  delta['reasoning_content'].to_s.empty?
end
|
|
649
|
+
|
|
650
|
+
# Per-chunk think-tag extraction is structurally impossible while streaming:
|
|
651
|
+
# tags arrive split across SSE chunks, and ThinkingExtractor strips per-chunk
|
|
652
|
+
# whitespace, corrupting reassembled text. Emit the raw delta unmodified —
|
|
653
|
+
# the StreamAccumulator extracts think tags statefully across deltas.
|
|
654
|
+
# (Previously called ThinkingExtractor.extract_from_content, which is
|
|
655
|
+
# private_class_method in lex-llm >= 0.5.0 and raised NoMethodError on
|
|
656
|
+
# every streamed text delta, silently killing all vLLM streaming.)
|
|
657
|
+
# Emit the streamed text as a text_delta chunk without modifying it; no
# think-tag processing happens here. The chunk index is data['index'].
def parse_text_delta_with_thinking(content, request_id, data)
  position = data['index']
  Canonical::Chunk.text_delta(delta: content, request_id: request_id, index: position)
end
|
|
664
|
+
|
|
665
|
+
# Parse a canonical-form chunk (from conformance kit fixtures).
|
|
666
|
+
|
|
667
|
+
# Detect canonical-form response (from conformance fixtures).
|
|
668
|
+
# Detect a canonical-form response (from conformance fixtures).
#
# True when `wire` has a text or stop_reason key, as either a Symbol or a
# String. The former second operand, `wire['text']`, could never add
# anything after `wire.key?('text')`, which left Symbol-keyed :text
# undetected while :stop_reason was covered; both keys are now checked in
# both forms. Objects without #key? (e.g. nil) are not canonical responses.
def canonical_response?(wire)
  return false unless wire.respond_to?(:key?)

  %i[text stop_reason].any? { |name| wire.key?(name) || wire.key?(name.to_s) }
end
|
|
671
|
+
|
|
672
|
+
# Parse a canonical-form chunk via Canonical::Chunk.from_hash.
# Any StandardError is logged at debug level and turned into nil.
def handle_canonical_chunk(data)
  begin
    Canonical::Chunk.from_hash(data)
  rescue StandardError => e
    log.debug { "vLLM translator canonical chunk parse error: #{e.message}" }
    nil
  end
end
|
|
678
|
+
|
|
679
|
+
# ── Stop reason mapping ──
|
|
680
|
+
|
|
681
|
+
# Map a raw finish reason onto its canonical stop reason using
# VLLM_STOP_REASON_MAP. nil, blank and unknown values map to
# FALLBACK_STOP_REASON.
def map_stop_reason(raw)
  key = raw.to_s
  key.empty? ? FALLBACK_STOP_REASON : VLLM_STOP_REASON_MAP.fetch(key, FALLBACK_STOP_REASON)
end
|
|
686
|
+
|
|
687
|
+
# ── JSON helpers ──
|
|
688
|
+
# Never use bare ::JSON inside the Legion namespace.
|
|
689
|
+
|
|
690
|
+
# Parse a JSON String with Legion::JSON.load.
# Returns nil for non-String input and, after a debug log entry, for
# strings that raise Legion::JSON::ParseError.
def parse_json_safely(raw)
  raw.is_a?(String) ? Legion::JSON.load(raw) : nil
rescue Legion::JSON::ParseError => e
  log.debug { "vLLM translator chunk parse error: #{e.message}" }
  nil
end
|
|
698
|
+
end
|
|
699
|
+
# rubocop:enable Metrics/ClassLength, Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
|
700
|
+
end
|
|
701
|
+
end
|
|
702
|
+
end
|
|
703
|
+
end
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require 'legion/extensions/llm'
|
|
4
|
+
require 'legion/extensions/llm/vllm/translator'
|
|
4
5
|
require 'legion/extensions/llm/vllm/provider'
|
|
5
6
|
require 'legion/extensions/llm/vllm/version'
|
|
6
7
|
require 'legion/logging'
|
|
@@ -15,7 +16,7 @@ module Legion
|
|
|
15
16
|
extend Legion::Extensions::Llm::AutoRegistration
|
|
16
17
|
|
|
17
18
|
PROVIDER_FAMILY = :vllm
|
|
18
|
-
DEFAULT_INSTANCE_TIER = { tier: :direct, capabilities:
|
|
19
|
+
DEFAULT_INSTANCE_TIER = { tier: :direct, capabilities: {}, provider_capabilities: { streaming: true } }.freeze
|
|
19
20
|
|
|
20
21
|
def self.default_settings
|
|
21
22
|
::Legion::Extensions::Llm.provider_settings(
|
|
@@ -31,10 +32,7 @@ module Legion
|
|
|
31
32
|
fleet: {
|
|
32
33
|
enabled: false,
|
|
33
34
|
respond_to_requests: false,
|
|
34
|
-
capabilities: %i[chat stream_chat embed]
|
|
35
|
-
lanes: [],
|
|
36
|
-
concurrency: 1,
|
|
37
|
-
queue_suffix: nil
|
|
35
|
+
capabilities: %i[chat stream_chat embed]
|
|
38
36
|
}
|
|
39
37
|
}
|
|
40
38
|
)
|
|
@@ -73,10 +71,19 @@ module Legion
|
|
|
73
71
|
# Normalize one instance's settings: symbolize the keys, resolve API base
# aliases and credentials, and set :tier from the endpoint when the
# settings do not already carry one. Returns the normalized Hash.
def self.normalize_instance_config(config)
  config.to_h.transform_keys(&:to_sym).tap do |settings|
    resolve_api_base_aliases(settings)
    resolve_credentials(settings)
    settings[:tier] ||= infer_tier_from_endpoint(settings[:vllm_api_base])
  end
end
|
|
79
78
|
|
|
79
|
+
# Move `credentials: { api_key: ... }` into :vllm_api_key.
#
# The :credentials entry is always removed from `normalized`. When it is a
# Hash (String or Symbol keys), its api_key fills :vllm_api_key unless one
# is already set.
def self.resolve_credentials(normalized)
  supplied = normalized.delete(:credentials)
  return nil unless supplied.is_a?(Hash)

  normalized[:vllm_api_key] ||= supplied.transform_keys(&:to_sym)[:api_key]
end
|
|
86
|
+
|
|
80
87
|
def self.resolve_api_base_aliases(normalized)
|
|
81
88
|
normalized[:vllm_api_base] ||= normalized.delete(:base_url)
|
|
82
89
|
normalized[:vllm_api_base] ||= normalized.delete(:api_base)
|
|
@@ -92,12 +99,15 @@ module Legion
|
|
|
92
99
|
return :direct if url.nil? || url.to_s.empty?
|
|
93
100
|
|
|
94
101
|
require 'uri'
|
|
102
|
+
require_relative 'vllm/actors/discovery_refresh'
|
|
95
103
|
host = URI.parse(url.to_s).host.to_s.downcase
|
|
96
104
|
%w[localhost 127.0.0.1 ::1].include?(host) ? :local : :direct
|
|
97
105
|
rescue URI::InvalidURIError => e
|
|
98
106
|
handle_exception(e, level: :debug, handled: true, operation: 'vllm.infer_tier_from_endpoint')
|
|
99
107
|
:direct
|
|
100
108
|
end
|
|
109
|
+
|
|
110
|
+
Legion::Extensions::Llm::Configuration.register_provider_options(Provider.configuration_options)
|
|
101
111
|
end
|
|
102
112
|
end
|
|
103
113
|
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: lex-llm-vllm
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.3.5
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- LegionIO
|
|
@@ -71,14 +71,14 @@ dependencies:
|
|
|
71
71
|
requirements:
|
|
72
72
|
- - ">="
|
|
73
73
|
- !ruby/object:Gem::Version
|
|
74
|
-
version: 0.
|
|
74
|
+
version: 0.5.0
|
|
75
75
|
type: :runtime
|
|
76
76
|
prerelease: false
|
|
77
77
|
version_requirements: !ruby/object:Gem::Requirement
|
|
78
78
|
requirements:
|
|
79
79
|
- - ">="
|
|
80
80
|
- !ruby/object:Gem::Version
|
|
81
|
-
version: 0.
|
|
81
|
+
version: 0.5.0
|
|
82
82
|
description: vLLM provider integration for the LegionIO LLM routing framework.
|
|
83
83
|
email:
|
|
84
84
|
- matthewdiverson@gmail.com
|
|
@@ -101,6 +101,7 @@ files:
|
|
|
101
101
|
- lib/legion/extensions/llm/vllm/actors/fleet_worker.rb
|
|
102
102
|
- lib/legion/extensions/llm/vllm/provider.rb
|
|
103
103
|
- lib/legion/extensions/llm/vllm/runners/fleet_worker.rb
|
|
104
|
+
- lib/legion/extensions/llm/vllm/translator.rb
|
|
104
105
|
- lib/legion/extensions/llm/vllm/version.rb
|
|
105
106
|
homepage: https://github.com/LegionIO/lex-llm-vllm
|
|
106
107
|
licenses:
|