lex-llm-vllm 0.3.0 → 0.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +20 -0
- data/Gemfile +0 -6
- data/lib/legion/extensions/llm/vllm/actors/discovery_refresh.rb +6 -0
- data/lib/legion/extensions/llm/vllm/provider.rb +114 -5
- data/lib/legion/extensions/llm/vllm/translator.rb +44 -37
- data/lib/legion/extensions/llm/vllm/version.rb +1 -1
- data/lib/legion/extensions/llm/vllm.rb +14 -5
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 20aa357958da5d294b132bdb5d3bea065d0192b42051deae734a3e1b8af592e5
|
|
4
|
+
data.tar.gz: 203a81b2aa087bc2cdabbe26cd71ea88e1bd7b1aaa9e44a95a5b43c35ab2c344
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 4ab5021f00c1f6652147297c4f9fc56a2f501eaa1468c0f098aff78d83426706069e4cca7184c96647135ff951dc4965d871352144826d4cf6c8961e60616862
|
|
7
|
+
data.tar.gz: 1e52f17e28c52ddae6317c3fd6ea6fd58b54a71a46d77c80f00544fb35333bf0340c1498b2f9db6419c811765c9926e0bd20228c7bc44badae04dd68a9fd178c
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,25 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## 0.3.5 - 2026-06-16
|
|
4
|
+
|
|
5
|
+
- Extract `vllm_api_key` from `credentials: { api_key: ... }` in instance settings so Bearer auth works with the standard settings layout.
|
|
6
|
+
- Fix `Dalli::RingError` crash in `offering_from_model` when cache server is unavailable; cache write is now best-effort.
|
|
7
|
+
|
|
8
|
+
## 0.3.3 - 2026-06-16
|
|
9
|
+
|
|
10
|
+
- Dependency updates (concurrent-ruby 1.3.7, faraday 2.14.3, rubocop 1.88.0) and code quality improvements.
|
|
11
|
+
|
|
12
|
+
## 0.3.2 - 2026-06-15
|
|
13
|
+
|
|
14
|
+
- **CapabilityPolicy integration** — Optional capabilities default false; use `CapabilityPolicy.resolve` for offerings. Static all-true predicates no longer used for routing truth. Settings overrides at provider/instance/model level supported.
|
|
15
|
+
|
|
16
|
+
## 0.3.1 - 2026-06-13
|
|
17
|
+
|
|
18
|
+
- **Gemfile cleanup** — Remove local path overrides; dependencies resolve from gemspec via rubygems.
|
|
19
|
+
- **Bug fix** — Restore vLLM streaming; private `ThinkingExtractor` call was killing every text delta.
|
|
20
|
+
- **Canonical tool normalization** — Use canonical normalization for tool parameter schemas.
|
|
21
|
+
- 155 examples, 0 failures; 17 files, 0 rubocop offenses.
|
|
22
|
+
|
|
3
23
|
## 0.3.0 - 2026-06-10
|
|
4
24
|
|
|
5
25
|
- Add canonical provider translator (`Translator`) implementing `render_request`,
|
data/Gemfile
CHANGED
|
@@ -2,12 +2,6 @@
|
|
|
2
2
|
|
|
3
3
|
source 'https://rubygems.org'
|
|
4
4
|
|
|
5
|
-
group :test do
|
|
6
|
-
transport_path = ENV.fetch('LEGION_TRANSPORT_PATH', File.expand_path('../../legion-transport', __dir__))
|
|
7
|
-
gem 'legion-transport', path: transport_path if File.directory?(transport_path)
|
|
8
|
-
# lex-llm >= 0.5.0 carries canonical types + conformance kit (released on rubygems.org)
|
|
9
|
-
end
|
|
10
|
-
|
|
11
5
|
gemspec
|
|
12
6
|
|
|
13
7
|
group :development do
|
data/lib/legion/extensions/llm/vllm/actors/discovery_refresh.rb
CHANGED
|
@@ -37,6 +37,12 @@ module Legion
|
|
|
37
37
|
return unless defined?(Legion::LLM::Discovery)
|
|
38
38
|
|
|
39
39
|
Legion::LLM::Discovery.refresh_discovered_models!(provider: :vllm)
|
|
40
|
+
if defined?(Legion::LLM::Router) && Legion::LLM::Router.respond_to?(:populate_auto_rules)
|
|
41
|
+
Legion::LLM::Router.populate_auto_rules(Legion::LLM::Discovery.discovered_instances)
|
|
42
|
+
end
|
|
43
|
+
if defined?(Legion::LLM::Inventory) && Legion::LLM::Inventory.respond_to?(:invalidate_offerings_cache!)
|
|
44
|
+
Legion::LLM::Inventory.invalidate_offerings_cache!
|
|
45
|
+
end
|
|
40
46
|
rescue StandardError => e
|
|
41
47
|
handle_exception(e, level: :warn, handled: true, operation: 'vllm.actor.discovery_refresh')
|
|
42
48
|
end
|
data/lib/legion/extensions/llm/vllm/provider.rb
CHANGED
|
@@ -163,7 +163,29 @@ module Legion
|
|
|
163
163
|
|
|
164
164
|
def offering_from_model(model_info)
|
|
165
165
|
ctx = model_info.context_length
|
|
166
|
-
|
|
166
|
+
if ctx
|
|
167
|
+
begin
|
|
168
|
+
cache_set(model_detail_cache_key(model_info.id), { context_window: ctx }, ttl: 86_400)
|
|
169
|
+
rescue StandardError => e
|
|
170
|
+
handle_exception(e, level: :debug, handled: true, operation: 'vllm.cache_model_detail')
|
|
171
|
+
end
|
|
172
|
+
end
|
|
173
|
+
|
|
174
|
+
policy = Legion::Extensions::Llm::CapabilityPolicy.resolve(
|
|
175
|
+
real: extract_real_capabilities(model_info),
|
|
176
|
+
provider_catalog: {},
|
|
177
|
+
probe: {},
|
|
178
|
+
provider_envelope: provider_envelope_capabilities,
|
|
179
|
+
provider_config: provider_capability_config,
|
|
180
|
+
instance_config: instance_capability_config,
|
|
181
|
+
model_config: model_capability_config(model_info.id)
|
|
182
|
+
)
|
|
183
|
+
|
|
184
|
+
build_offering(model_info, policy, ctx)
|
|
185
|
+
end
|
|
186
|
+
|
|
187
|
+
def build_offering(model_info, policy, ctx) # rubocop:disable Metrics/AbcSize
|
|
188
|
+
max_out = model_info.respond_to?(:max_output_tokens) ? model_info.max_output_tokens : nil
|
|
167
189
|
|
|
168
190
|
Legion::Extensions::Llm::Routing::ModelOffering.new(
|
|
169
191
|
provider_family: :vllm,
|
|
@@ -171,13 +193,82 @@ module Legion
|
|
|
171
193
|
transport: offering_transport,
|
|
172
194
|
tier: offering_tier,
|
|
173
195
|
model: model_info.id,
|
|
196
|
+
canonical_model_alias: model_info.respond_to?(:name) ? model_info.name : nil,
|
|
197
|
+
model_family: model_info.respond_to?(:family) ? model_info.family : nil,
|
|
174
198
|
usage_type: model_info.embedding? ? :embedding : :inference,
|
|
175
|
-
capabilities:
|
|
176
|
-
|
|
177
|
-
|
|
199
|
+
capabilities: policy[:capabilities],
|
|
200
|
+
capability_sources: policy[:sources],
|
|
201
|
+
limits: { context_window: ctx, max_output_tokens: max_out }.compact,
|
|
202
|
+
metadata: offering_metadata_for(model_info).merge(capability_sources: policy[:sources])
|
|
178
203
|
)
|
|
179
204
|
end
|
|
180
205
|
|
|
206
|
+
def extract_real_capabilities(model_info)
|
|
207
|
+
return {} unless model_info.respond_to?(:metadata)
|
|
208
|
+
|
|
209
|
+
meta = model_info.metadata
|
|
210
|
+
meta_caps = meta.is_a?(Hash) ? meta[:capabilities] : nil
|
|
211
|
+
meta_caps.is_a?(Hash) ? meta_caps : {}
|
|
212
|
+
end
|
|
213
|
+
|
|
214
|
+
def provider_envelope_capabilities
|
|
215
|
+
{ streaming: true }
|
|
216
|
+
end
|
|
217
|
+
|
|
218
|
+
def provider_capability_config
|
|
219
|
+
return {} unless defined?(Legion::Extensions::Llm::CredentialSources)
|
|
220
|
+
|
|
221
|
+
conf = Legion::Extensions::Llm::CredentialSources.setting(:extensions, :llm, :vllm)
|
|
222
|
+
conf.is_a?(Hash) ? conf.to_h.except(:instances, 'instances') : {}
|
|
223
|
+
rescue StandardError => e
|
|
224
|
+
handle_exception(e, level: :debug, handled: true, operation: 'vllm.provider_capability_config')
|
|
225
|
+
{}
|
|
226
|
+
end
|
|
227
|
+
|
|
228
|
+
def instance_capability_config
|
|
229
|
+
cfg = config
|
|
230
|
+
result = {}
|
|
231
|
+
%i[capabilities enable_thinking enable_tools enable_streaming enable_vision enable_embeddings
|
|
232
|
+
thinking_flag tools_flag streaming_flag vision_flag embedding_flag embeddings_flag
|
|
233
|
+
tool_flag images_flag image_flag].each do |key|
|
|
234
|
+
next unless cfg.respond_to?(key)
|
|
235
|
+
|
|
236
|
+
val = cfg.send(key)
|
|
237
|
+
result[key] = val unless val.nil?
|
|
238
|
+
rescue StandardError
|
|
239
|
+
next
|
|
240
|
+
end
|
|
241
|
+
result
|
|
242
|
+
end
|
|
243
|
+
|
|
244
|
+
def model_capability_config(model_id)
|
|
245
|
+
models_conf = resolve_models_config
|
|
246
|
+
return {} unless models_conf.respond_to?(:to_h)
|
|
247
|
+
|
|
248
|
+
hash = models_conf.to_h
|
|
249
|
+
hash[model_id.to_s] || hash[model_id.to_sym] || {}
|
|
250
|
+
rescue StandardError => e
|
|
251
|
+
handle_exception(e, level: :debug, handled: true, operation: 'vllm.model_capability_config')
|
|
252
|
+
{}
|
|
253
|
+
end
|
|
254
|
+
|
|
255
|
+
def resolve_models_config
|
|
256
|
+
return config.models if config.respond_to?(:models)
|
|
257
|
+
return config[:models] if config.respond_to?(:[])
|
|
258
|
+
|
|
259
|
+
nil
|
|
260
|
+
end
|
|
261
|
+
|
|
262
|
+
def offering_metadata_for(model_info)
|
|
263
|
+
{
|
|
264
|
+
raw_model: model_info.id,
|
|
265
|
+
parameter_count: model_info.respond_to?(:parameter_count) ? model_info.parameter_count : nil,
|
|
266
|
+
parameter_size: model_info.respond_to?(:parameter_size) ? model_info.parameter_size : nil,
|
|
267
|
+
quantization: model_info.respond_to?(:quantization) ? model_info.quantization : nil,
|
|
268
|
+
size_bytes: model_info.respond_to?(:size_bytes) ? model_info.size_bytes : nil
|
|
269
|
+
}.compact
|
|
270
|
+
end
|
|
271
|
+
|
|
181
272
|
# ── Canonical bridge: legacy provider API → Canonical::Request ──
|
|
182
273
|
|
|
183
274
|
# rubocop:disable Metrics/ParameterLists, Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity -- bridge method can be complex
|
|
@@ -281,7 +372,7 @@ module Legion
|
|
|
281
372
|
role: :assistant,
|
|
282
373
|
content: content,
|
|
283
374
|
model_id: raw_data['model'],
|
|
284
|
-
tool_calls:
|
|
375
|
+
tool_calls: legacy_chunk_tool_calls(canonical),
|
|
285
376
|
thinking: thinking,
|
|
286
377
|
input_tokens: usage.respond_to?(:input_tokens) ? usage.input_tokens : nil,
|
|
287
378
|
output_tokens: usage.respond_to?(:output_tokens) ? usage.output_tokens : nil,
|
|
@@ -289,6 +380,24 @@ module Legion
|
|
|
289
380
|
)
|
|
290
381
|
end
|
|
291
382
|
|
|
383
|
+
# Map a canonical tool_call_delta onto the legacy chunk tool_calls hash.
|
|
384
|
+
# Fragment semantics matter: an entry with a non-nil id starts a new tool
|
|
385
|
+
# call in the StreamAccumulator; a nil id appends the raw arguments
|
|
386
|
+
# fragment to the most recently started call.
|
|
387
|
+
def legacy_chunk_tool_calls(canonical)
|
|
388
|
+
return nil unless canonical.type == :tool_call_delta && canonical.tool_call
|
|
389
|
+
|
|
390
|
+
tc = canonical.tool_call
|
|
391
|
+
key = (tc.id || tc.name || :fragment).to_s.to_sym
|
|
392
|
+
{
|
|
393
|
+
key => Legion::Extensions::Llm::ToolCall.new(
|
|
394
|
+
id: tc.id,
|
|
395
|
+
name: tc.name,
|
|
396
|
+
arguments: tc.arguments
|
|
397
|
+
)
|
|
398
|
+
}
|
|
399
|
+
end
|
|
400
|
+
|
|
292
401
|
# ── Tool choice helpers ──
|
|
293
402
|
|
|
294
403
|
def format_tool_choice_from_prefs(tool_prefs)
|
data/lib/legion/extensions/llm/vllm/translator.rb
CHANGED
|
@@ -164,24 +164,8 @@ module Legion
|
|
|
164
164
|
)
|
|
165
165
|
end
|
|
166
166
|
|
|
167
|
-
tool_calls = delta['tool_calls']
|
|
168
|
-
|
|
169
|
-
first_call = tool_calls.first
|
|
170
|
-
function = first_call.fetch('function', {})
|
|
171
|
-
|
|
172
|
-
tc = Canonical::ToolCall.build(
|
|
173
|
-
id: (first_call['id'] || function['name'] || 'synthesized').to_s,
|
|
174
|
-
name: function['name'].to_s,
|
|
175
|
-
arguments: parse_tool_arguments(function['arguments']),
|
|
176
|
-
source: :client
|
|
177
|
-
)
|
|
178
|
-
|
|
179
|
-
return Canonical::Chunk.tool_call_delta(
|
|
180
|
-
tool_call: tc,
|
|
181
|
-
request_id: request_id,
|
|
182
|
-
block_index: first_call['index']
|
|
183
|
-
)
|
|
184
|
-
end
|
|
167
|
+
tool_calls = Array(delta['tool_calls'])
|
|
168
|
+
return build_tool_call_delta_chunk(tool_calls.first, request_id) unless tool_calls.empty?
|
|
185
169
|
|
|
186
170
|
# Thinking delta from reasoning_content
|
|
187
171
|
reasoning_content = delta['reasoning_content'] || delta['reasoning']
|
|
@@ -227,7 +211,8 @@ module Legion
|
|
|
227
211
|
# ── Message formatting ──
|
|
228
212
|
|
|
229
213
|
def format_messages(request)
|
|
230
|
-
|
|
214
|
+
non_system = request.messages&.reject { |m| m.role.to_s == 'system' } || []
|
|
215
|
+
messages = format_request_messages(non_system)
|
|
231
216
|
|
|
232
217
|
if request.system.to_s.strip.empty?
|
|
233
218
|
messages
|
|
@@ -345,7 +330,8 @@ module Legion
|
|
|
345
330
|
def format_message_tool_calls(tool_calls)
|
|
346
331
|
return [] if tool_calls.empty?
|
|
347
332
|
|
|
348
|
-
tool_calls.
|
|
333
|
+
tc_array = tool_calls.is_a?(Hash) ? tool_calls.values : Array(tool_calls)
|
|
334
|
+
tc_array.map { |tc| format_tool_call_for_history(tc) }
|
|
349
335
|
end
|
|
350
336
|
|
|
351
337
|
def format_tool_call_for_history(tool_call_entry)
|
|
@@ -387,10 +373,9 @@ module Legion
|
|
|
387
373
|
|
|
388
374
|
name = tool_hash[:name] || tool_hash['name']
|
|
389
375
|
description = (tool_hash[:description] || tool_hash['description'] || '').to_s
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
parameters =
|
|
393
|
-
parameters = { type: 'object', properties: {} } unless parameters.is_a?(Hash)
|
|
376
|
+
raw_params = tool_hash[:parameters] || tool_hash[:input_schema]
|
|
377
|
+
raw_params = raw_params.to_h if raw_params.respond_to?(:to_h) && !raw_params.is_a?(Hash)
|
|
378
|
+
parameters = Legion::Extensions::Llm::Canonical::ToolDefinition.normalize_parameters(raw_params)
|
|
394
379
|
|
|
395
380
|
{
|
|
396
381
|
type: 'function',
|
|
@@ -633,26 +618,48 @@ module Legion
|
|
|
633
618
|
)
|
|
634
619
|
end
|
|
635
620
|
|
|
621
|
+
# Build a tool_call_delta chunk preserving OpenAI streaming fragment
|
|
622
|
+
# semantics: the opening fragment carries id + name; continuation
|
|
623
|
+
# fragments carry id: nil and a raw partial-JSON arguments string.
|
|
624
|
+
# The StreamAccumulator keys off a nil id to append fragments to the
|
|
625
|
+
# current tool call, so the id must NOT be synthesized here.
|
|
626
|
+
def build_tool_call_delta_chunk(first_call, request_id)
|
|
627
|
+
function = first_call.fetch('function', {})
|
|
628
|
+
|
|
629
|
+
tc = Canonical::ToolCall.new(
|
|
630
|
+
id: first_call['id'], exchange_id: nil,
|
|
631
|
+
name: function['name'], arguments: function['arguments'].to_s,
|
|
632
|
+
source: :client, status: nil, duration_ms: nil, result: nil,
|
|
633
|
+
error: nil, started_at: nil, finished_at: nil, category: nil,
|
|
634
|
+
data_handling_classification: nil, policy_decision: nil
|
|
635
|
+
)
|
|
636
|
+
|
|
637
|
+
Canonical::Chunk.tool_call_delta(
|
|
638
|
+
tool_call: tc,
|
|
639
|
+
request_id: request_id,
|
|
640
|
+
block_index: first_call['index']
|
|
641
|
+
)
|
|
642
|
+
end
|
|
643
|
+
|
|
636
644
|
def empty_delta?(delta)
|
|
637
645
|
(delta['content'].nil? || delta['content'].to_s.empty?) &&
|
|
638
646
|
(delta['tool_calls'].nil? || Array(delta['tool_calls']).empty?) &&
|
|
639
647
|
(delta['reasoning_content'].nil? || delta['reasoning_content'].to_s.empty?)
|
|
640
648
|
end
|
|
641
649
|
|
|
650
|
+
# Per-chunk think-tag extraction is structurally impossible while streaming:
|
|
651
|
+
# tags arrive split across SSE chunks, and ThinkingExtractor strips per-chunk
|
|
652
|
+
# whitespace, corrupting reassembled text. Emit the raw delta unmodified —
|
|
653
|
+
# the StreamAccumulator extracts think tags statefully across deltas.
|
|
654
|
+
# (Previously called ThinkingExtractor.extract_from_content, which is
|
|
655
|
+
# private_class_method in lex-llm >= 0.5.0 and raised NoMethodError on
|
|
656
|
+
# every streamed text delta, silently killing all vLLM streaming.)
|
|
642
657
|
def parse_text_delta_with_thinking(content, request_id, data)
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
Canonical::Chunk.thinking_delta(delta: thinking_text, request_id: request_id)
|
|
649
|
-
else
|
|
650
|
-
Canonical::Chunk.text_delta(
|
|
651
|
-
delta: clean_text || content,
|
|
652
|
-
request_id: request_id,
|
|
653
|
-
index: data['index']
|
|
654
|
-
)
|
|
655
|
-
end
|
|
658
|
+
Canonical::Chunk.text_delta(
|
|
659
|
+
delta: content,
|
|
660
|
+
request_id: request_id,
|
|
661
|
+
index: data['index']
|
|
662
|
+
)
|
|
656
663
|
end
|
|
657
664
|
|
|
658
665
|
# Parse a canonical-form chunk (from conformance kit fixtures).
|
data/lib/legion/extensions/llm/vllm.rb
CHANGED
|
@@ -16,7 +16,7 @@ module Legion
|
|
|
16
16
|
extend Legion::Extensions::Llm::AutoRegistration
|
|
17
17
|
|
|
18
18
|
PROVIDER_FAMILY = :vllm
|
|
19
|
-
DEFAULT_INSTANCE_TIER = { tier: :direct, capabilities:
|
|
19
|
+
DEFAULT_INSTANCE_TIER = { tier: :direct, capabilities: {}, provider_capabilities: { streaming: true } }.freeze
|
|
20
20
|
|
|
21
21
|
def self.default_settings
|
|
22
22
|
::Legion::Extensions::Llm.provider_settings(
|
|
@@ -32,10 +32,7 @@ module Legion
|
|
|
32
32
|
fleet: {
|
|
33
33
|
enabled: false,
|
|
34
34
|
respond_to_requests: false,
|
|
35
|
-
capabilities: %i[chat stream_chat embed]
|
|
36
|
-
lanes: [],
|
|
37
|
-
concurrency: 1,
|
|
38
|
-
queue_suffix: nil
|
|
35
|
+
capabilities: %i[chat stream_chat embed]
|
|
39
36
|
}
|
|
40
37
|
}
|
|
41
38
|
)
|
|
@@ -74,10 +71,19 @@ module Legion
|
|
|
74
71
|
def self.normalize_instance_config(config)
|
|
75
72
|
normalized = config.to_h.transform_keys(&:to_sym)
|
|
76
73
|
resolve_api_base_aliases(normalized)
|
|
74
|
+
resolve_credentials(normalized)
|
|
77
75
|
normalized[:tier] ||= infer_tier_from_endpoint(normalized[:vllm_api_base])
|
|
78
76
|
normalized
|
|
79
77
|
end
|
|
80
78
|
|
|
79
|
+
def self.resolve_credentials(normalized)
|
|
80
|
+
creds = normalized.delete(:credentials)
|
|
81
|
+
return unless creds.is_a?(Hash)
|
|
82
|
+
|
|
83
|
+
creds = creds.transform_keys(&:to_sym)
|
|
84
|
+
normalized[:vllm_api_key] ||= creds[:api_key]
|
|
85
|
+
end
|
|
86
|
+
|
|
81
87
|
def self.resolve_api_base_aliases(normalized)
|
|
82
88
|
normalized[:vllm_api_base] ||= normalized.delete(:base_url)
|
|
83
89
|
normalized[:vllm_api_base] ||= normalized.delete(:api_base)
|
|
@@ -93,12 +99,15 @@ module Legion
|
|
|
93
99
|
return :direct if url.nil? || url.to_s.empty?
|
|
94
100
|
|
|
95
101
|
require 'uri'
|
|
102
|
+
require_relative 'vllm/actors/discovery_refresh'
|
|
96
103
|
host = URI.parse(url.to_s).host.to_s.downcase
|
|
97
104
|
%w[localhost 127.0.0.1 ::1].include?(host) ? :local : :direct
|
|
98
105
|
rescue URI::InvalidURIError => e
|
|
99
106
|
handle_exception(e, level: :debug, handled: true, operation: 'vllm.infer_tier_from_endpoint')
|
|
100
107
|
:direct
|
|
101
108
|
end
|
|
109
|
+
|
|
110
|
+
Legion::Extensions::Llm::Configuration.register_provider_options(Provider.configuration_options)
|
|
102
111
|
end
|
|
103
112
|
end
|
|
104
113
|
end
|