lex-llm-vllm 0.3.0 → 0.3.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +57 -0
- data/Gemfile +0 -6
- data/lex-llm-vllm.gemspec +1 -1
- data/lib/legion/extensions/llm/vllm/actors/discovery_refresh.rb +118 -7
- data/lib/legion/extensions/llm/vllm/provider.rb +122 -26
- data/lib/legion/extensions/llm/vllm/runners/fleet_worker.rb +1 -1
- data/lib/legion/extensions/llm/vllm/translator.rb +57 -37
- data/lib/legion/extensions/llm/vllm/version.rb +1 -1
- data/lib/legion/extensions/llm/vllm.rb +15 -6
- metadata +3 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 66c6939650ded871419a3833d9d0a89e5c31a104f3acccbb065af86af17709cb
|
|
4
|
+
data.tar.gz: e639e39a60ff4c03838100bd361048997fecfe38e92fdd9f1763809958a43f8e
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: cf346327446a5e3c7b8666c8ea04269d7fea32b7bbe89b9a0c11b7578fe3ca1a8f948cf4fc639265bfcfa6f0a65bca7007f24ce90146f6234f1589fcb6fe5c31
|
|
7
|
+
data.tar.gz: 1e324da7463bd1a63b14b5fde9b3f6030be0f8cdd933d802d350066beaf40fe8208998a298a848b00126bf5d1d68868b8384c2ed719fb863c825179aa8c92ced
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,62 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [0.3.11] - 2026-06-20
|
|
4
|
+
|
|
5
|
+
### Fixed
|
|
6
|
+
- Stub shared registry publishing through `RegistryPublisher#schedule` in specs so async availability-event coverage stays stable after the shared publisher moved off raw `Thread.new`.
|
|
7
|
+
|
|
8
|
+
## [0.3.10] - 2026-06-20
|
|
9
|
+
|
|
10
|
+
### Fixed
|
|
11
|
+
- Stop bulk-publishing vLLM model availability from `list_models`; discovery now emits one registry event per seen model from the shared `lex-llm` policy-filter path so blocked models stay observable without duplicate publishes.
|
|
12
|
+
|
|
13
|
+
## [0.3.9] - 2026-06-20
|
|
14
|
+
|
|
15
|
+
### Changed
|
|
16
|
+
- Slow the live discovery refresh cadence from 60 seconds to 300 seconds for vLLM instances; `extensions.llm.vllm.discovery_interval` still overrides the default.
|
|
17
|
+
|
|
18
|
+
## [0.3.8] - 2026-06-20
|
|
19
|
+
|
|
20
|
+
### Fixed
|
|
21
|
+
- Use the shared `lex-llm` capability override contract for provider, instance, and model settings, with canonical capability normalization for embedding/tool/thinking routing.
|
|
22
|
+
|
|
23
|
+
## [0.3.7] - 2026-06-19
|
|
24
|
+
|
|
25
|
+
### Changed
|
|
26
|
+
- Adopt `Legion::Extensions::Llm::Inventory::ScopedRefresher` mixin (lex-llm 0.6.0). Discovery
|
|
27
|
+
refresh actors now write directly to the live `Inventory` catalog via `Inventory.write_lane`.
|
|
28
|
+
- Pin `lex-llm >= 0.6.0` and `legion-llm >= 0.14.0` in gemspec.
|
|
29
|
+
- Standard `weight: 100` default added to provider instance settings schema.
|
|
30
|
+
|
|
31
|
+
## 0.3.6 - 2026-06-18
|
|
32
|
+
|
|
33
|
+
- **Streaming token usage** — request `stream_options: { include_usage: true }` on streaming chat so
|
|
34
|
+
vLLM emits the final usage-only chunk. Streaming responses now carry input/output token counts;
|
|
35
|
+
previously every streamed response reported zero tokens, which left metering and cost tracking blind. Overridable
|
|
36
|
+
per-instance via `config[:stream_token_usage] = false` for a non-conforming OpenAI-compatible
|
|
37
|
+
backend that rejects the field. The chunk parser already handled the trailing `choices: []` usage
|
|
38
|
+
chunk; the gap was only that the request never asked for it.
|
|
39
|
+
|
|
40
|
+
## 0.3.5 - 2026-06-16
|
|
41
|
+
|
|
42
|
+
- Extract `vllm_api_key` from `credentials: { api_key: ... }` in instance settings so Bearer auth works with the standard settings layout.
|
|
43
|
+
- Fix `Dalli::RingError` crash in `offering_from_model` when cache server is unavailable; cache write is now best-effort.
|
|
44
|
+
|
|
45
|
+
## 0.3.3 - 2026-06-16
|
|
46
|
+
|
|
47
|
+
- Dependency updates (concurrent-ruby 1.3.7, faraday 2.14.3, rubocop 1.88.0) and code quality improvements.
|
|
48
|
+
|
|
49
|
+
## 0.3.2 - 2026-06-15
|
|
50
|
+
|
|
51
|
+
- **CapabilityPolicy integration** — Optional capabilities now default to false, and offerings are resolved through `CapabilityPolicy.resolve`. Static all-true predicates are no longer used as routing truth. Settings overrides are supported at the provider, instance, and model levels.
|
|
52
|
+
|
|
53
|
+
## 0.3.1 - 2026-06-13
|
|
54
|
+
|
|
55
|
+
- **Gemfile cleanup** — Remove local path overrides; dependencies resolve from gemspec via rubygems.
|
|
56
|
+
- **Bug fix** — Restore vLLM streaming; private `ThinkingExtractor` call was killing every text delta.
|
|
57
|
+
- **Canonical tool normalization** — Use canonical normalization for tool parameter schemas.
|
|
58
|
+
- 155 examples, 0 failures; 17 files, 0 rubocop offenses.
|
|
59
|
+
|
|
3
60
|
## 0.3.0 - 2026-06-10
|
|
4
61
|
|
|
5
62
|
- Add canonical provider translator (`Translator`) implementing `render_request`,
|
data/Gemfile
CHANGED
|
@@ -2,12 +2,6 @@
|
|
|
2
2
|
|
|
3
3
|
source 'https://rubygems.org'
|
|
4
4
|
|
|
5
|
-
group :test do
|
|
6
|
-
transport_path = ENV.fetch('LEGION_TRANSPORT_PATH', File.expand_path('../../legion-transport', __dir__))
|
|
7
|
-
gem 'legion-transport', path: transport_path if File.directory?(transport_path)
|
|
8
|
-
# lex-llm >= 0.5.0 carries canonical types + conformance kit (released on rubygems.org)
|
|
9
|
-
end
|
|
10
|
-
|
|
11
5
|
gemspec
|
|
12
6
|
|
|
13
7
|
group :development do
|
data/lex-llm-vllm.gemspec
CHANGED
|
@@ -27,5 +27,5 @@ Gem::Specification.new do |spec|
|
|
|
27
27
|
spec.add_dependency 'legion-logging', '>= 1.3.2'
|
|
28
28
|
spec.add_dependency 'legion-settings', '>= 1.3.14'
|
|
29
29
|
spec.add_dependency 'legion-transport', '>= 1.4.14'
|
|
30
|
-
spec.add_dependency 'lex-llm', '>= 0.
|
|
30
|
+
spec.add_dependency 'lex-llm', '>= 0.6.0'
|
|
31
31
|
end
|
|
@@ -1,11 +1,19 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require 'digest'
|
|
4
|
+
|
|
3
5
|
begin
|
|
4
6
|
require 'legion/extensions/actors/every'
|
|
5
7
|
rescue LoadError => e
|
|
6
8
|
warn(e.message) if $VERBOSE
|
|
7
9
|
end
|
|
8
10
|
|
|
11
|
+
begin
|
|
12
|
+
require 'legion/extensions/llm/inventory/scoped_refresher'
|
|
13
|
+
rescue LoadError => e
|
|
14
|
+
warn(e.message) if $VERBOSE
|
|
15
|
+
end
|
|
16
|
+
|
|
9
17
|
return unless defined?(Legion::Extensions::Actors::Every)
|
|
10
18
|
|
|
11
19
|
module Legion
|
|
@@ -17,7 +25,11 @@ module Legion
|
|
|
17
25
|
class DiscoveryRefresh < Legion::Extensions::Actors::Every
|
|
18
26
|
include Legion::Logging::Helper
|
|
19
27
|
|
|
20
|
-
|
|
28
|
+
if defined?(Legion::Extensions::Llm::Inventory::ScopedRefresher)
|
|
29
|
+
include Legion::Extensions::Llm::Inventory::ScopedRefresher
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
def self.every_seconds = 300
|
|
21
33
|
|
|
22
34
|
def runner_class = self.class
|
|
23
35
|
def runner_function = 'manual'
|
|
@@ -27,19 +39,118 @@ module Legion
|
|
|
27
39
|
def generate_task? = false
|
|
28
40
|
|
|
29
41
|
def time
|
|
30
|
-
return
|
|
42
|
+
return self.class.every_seconds unless defined?(Legion::Settings)
|
|
31
43
|
|
|
32
|
-
Legion::Settings.dig(:extensions, :llm, :vllm, :discovery_interval) ||
|
|
44
|
+
Legion::Settings.dig(:extensions, :llm, :vllm, :discovery_interval) || self.class.every_seconds
|
|
33
45
|
end
|
|
34
46
|
|
|
35
|
-
def
|
|
36
|
-
|
|
37
|
-
|
|
47
|
+
def scope_key = { provider: :vllm }
|
|
48
|
+
def offering_type(raw_type) = %i[embed embedding].include?(raw_type) ? :embedding : :inference
|
|
49
|
+
|
|
50
|
+
def vllm_cfg
|
|
51
|
+
return unless defined?(Legion::Settings)
|
|
38
52
|
|
|
39
|
-
Legion::
|
|
53
|
+
Legion::Settings.dig(:extensions, :llm,
|
|
54
|
+
:vllm)
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
def compute_lanes_for_scope(**)
|
|
58
|
+
return [] unless defined?(Legion::LLM::Call::Registry)
|
|
59
|
+
|
|
60
|
+
vllm_instances.flat_map { |entry| lanes_from_instance(entry) }
|
|
61
|
+
rescue StandardError => e
|
|
62
|
+
handle_exception(e, level: :warn, handled: true, operation: 'vllm.actor.compute_lanes_for_scope')
|
|
63
|
+
[]
|
|
64
|
+
end
|
|
65
|
+
|
|
66
|
+
def credential_hash(**)
|
|
67
|
+
cfg = vllm_cfg
|
|
68
|
+
Digest::SHA256.hexdigest(cfg&.dig(:api_key).to_s + cfg&.dig(:instances).to_s)[0, 16]
|
|
69
|
+
rescue StandardError
|
|
70
|
+
'unknown'
|
|
71
|
+
end
|
|
72
|
+
|
|
73
|
+
def manual
|
|
74
|
+
run_scoped_tick
|
|
40
75
|
rescue StandardError => e
|
|
41
76
|
handle_exception(e, level: :warn, handled: true, operation: 'vllm.actor.discovery_refresh')
|
|
42
77
|
end
|
|
78
|
+
|
|
79
|
+
private
|
|
80
|
+
|
|
81
|
+
def run_scoped_tick
|
|
82
|
+
return unless defined?(Legion::Extensions::Llm::Inventory::ScopedRefresher)
|
|
83
|
+
return unless self.class.ancestors.include?(Legion::Extensions::Llm::Inventory::ScopedRefresher)
|
|
84
|
+
|
|
85
|
+
tick
|
|
86
|
+
end
|
|
87
|
+
|
|
88
|
+
def vllm_instances
|
|
89
|
+
Legion::LLM::Call::Registry.all_instances.select { |e| (e[:provider] || '').to_sym == :vllm }
|
|
90
|
+
end
|
|
91
|
+
|
|
92
|
+
def lanes_from_instance(instance_entry)
|
|
93
|
+
adapter = instance_entry[:adapter]
|
|
94
|
+
return [] unless adapter.respond_to?(:discover_offerings)
|
|
95
|
+
|
|
96
|
+
Array(adapter.discover_offerings(live: true)).flat_map do |offering|
|
|
97
|
+
raw = offering_to_hash(offering)
|
|
98
|
+
lane = build_lane(raw, instance_entry)
|
|
99
|
+
fleet = maybe_fleet_lane(lane)
|
|
100
|
+
fleet ? [lane, fleet] : [lane]
|
|
101
|
+
end
|
|
102
|
+
end
|
|
103
|
+
|
|
104
|
+
# ModelOffering objects do not implement `[]`; normalize to a Hash so the
|
|
105
|
+
# rest of the writer stays Hash-shaped. Hash inputs pass through untouched.
|
|
106
|
+
def offering_to_hash(offering)
|
|
107
|
+
return offering if offering.is_a?(Hash)
|
|
108
|
+
|
|
109
|
+
hash = offering.to_h
|
|
110
|
+
hash[:type] ||= hash[:usage_type]
|
|
111
|
+
hash[:enabled] = offering.respond_to?(:enabled?) ? offering.enabled? : true
|
|
112
|
+
hash
|
|
113
|
+
end
|
|
114
|
+
|
|
115
|
+
def build_lane(offering, instance_entry) # rubocop:disable Metrics/AbcSize,Metrics/CyclomaticComplexity
|
|
116
|
+
tier = offering[:tier] || :direct
|
|
117
|
+
type = offering_type(offering[:type])
|
|
118
|
+
instance_id = offering[:instance_id] ||
|
|
119
|
+
instance_entry[:instance] ||
|
|
120
|
+
instance_entry[:instance_id] ||
|
|
121
|
+
instance_entry[:id]
|
|
122
|
+
provider_family = offering[:provider_family] || :vllm
|
|
123
|
+
model = offering[:model]
|
|
124
|
+
lane_id = Legion::Extensions::Llm::Inventory::ScopedRefresher.compose_id(
|
|
125
|
+
tier: tier, provider_family: provider_family, instance_id: instance_id, type: type, model: model
|
|
126
|
+
)
|
|
127
|
+
{ id: lane_id, tier: tier, provider_family: provider_family, instance_id: instance_id,
|
|
128
|
+
model: model, canonical_model_alias: offering[:canonical_model_alias], type: type,
|
|
129
|
+
capabilities: normalize_caps(offering[:capabilities]),
|
|
130
|
+
limits: offering[:limits] || {}, enabled: offering.fetch(:enabled, true), cost: offering[:cost] || {} }
|
|
131
|
+
end
|
|
132
|
+
|
|
133
|
+
def maybe_fleet_lane(lane)
|
|
134
|
+
return unless lane[:type] == :inference && vllm_cfg&.dig(:fleet, :dispatch, :enabled)
|
|
135
|
+
|
|
136
|
+
fleet_id = Legion::Extensions::Llm::Inventory::ScopedRefresher.compose_id(
|
|
137
|
+
tier: :fleet, provider_family: lane[:provider_family],
|
|
138
|
+
instance_id: lane[:instance_id], type: lane[:type], model: lane[:model]
|
|
139
|
+
)
|
|
140
|
+
lane.merge(id: fleet_id, tier: :fleet)
|
|
141
|
+
end
|
|
142
|
+
|
|
143
|
+
def normalize_caps(caps)
|
|
144
|
+
# Inventory::Capabilities lives in lex-llm; the previous fallback (`return []
|
|
145
|
+
# unless defined?(...)`) silently swallowed every capability the operator
|
|
146
|
+
# declared via enable_thinking/enable_tools when the constant wasn't loaded.
|
|
147
|
+
# Always normalize through the shared vocabulary so aliases collapse.
|
|
148
|
+
if defined?(Legion::Extensions::Llm::Inventory::Capabilities)
|
|
149
|
+
Legion::Extensions::Llm::Inventory::Capabilities.normalize(caps)
|
|
150
|
+
else
|
|
151
|
+
Array(caps).compact.map(&:to_sym).uniq
|
|
152
|
+
end
|
|
153
|
+
end
|
|
43
154
|
end
|
|
44
155
|
end
|
|
45
156
|
end
|
|
@@ -33,9 +33,9 @@ module Legion
|
|
|
33
33
|
|
|
34
34
|
def chat?(_model) = true
|
|
35
35
|
def streaming?(_model) = true
|
|
36
|
-
def vision?(_model) =
|
|
37
|
-
def functions?(_model) =
|
|
38
|
-
def embeddings?(_model) =
|
|
36
|
+
def vision?(_model) = false
|
|
37
|
+
def functions?(_model) = false
|
|
38
|
+
def embeddings?(_model) = false
|
|
39
39
|
|
|
40
40
|
def critical_capabilities_for(model)
|
|
41
41
|
[
|
|
@@ -78,7 +78,7 @@ module Legion
|
|
|
78
78
|
|
|
79
79
|
def health(live: false)
|
|
80
80
|
log.info { "checking health live=#{live} at #{api_base}#{health_url}" }
|
|
81
|
-
|
|
81
|
+
super
|
|
82
82
|
end
|
|
83
83
|
|
|
84
84
|
def readiness(live: false)
|
|
@@ -88,27 +88,20 @@ module Legion
|
|
|
88
88
|
end
|
|
89
89
|
end
|
|
90
90
|
|
|
91
|
-
def list_models
|
|
91
|
+
def list_models(live: false, **filters)
|
|
92
92
|
log.info { "discovering models from #{api_base}#{models_url}" }
|
|
93
93
|
super.tap do |models|
|
|
94
94
|
log.info { "discovered #{models.size} model(s) from vLLM" }
|
|
95
|
-
self.class.registry_publisher.publish_models_async(models, readiness: readiness(live: false))
|
|
96
95
|
end
|
|
97
96
|
end
|
|
98
97
|
|
|
99
|
-
def discover_offerings(live: false, **)
|
|
100
|
-
|
|
101
|
-
@cached_models = list_models
|
|
102
|
-
else
|
|
103
|
-
Array(@cached_models)
|
|
104
|
-
end
|
|
105
|
-
offerings = models.filter_map do |model_info|
|
|
106
|
-
next unless model_allowed?(model_info.id)
|
|
98
|
+
def discover_offerings(live: false, **filters)
|
|
99
|
+
return filter_cached_offerings(Array(@cached_offerings), filters) unless live
|
|
107
100
|
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
101
|
+
provider_health = health(live:)
|
|
102
|
+
@cached_offerings = discover_live_offerings(filters, provider_health, live:)
|
|
103
|
+
log_discover_complete(@cached_offerings)
|
|
104
|
+
@cached_offerings
|
|
112
105
|
rescue StandardError => e
|
|
113
106
|
handle_exception(e, level: :warn, handled: true, operation: 'vllm.discover_offerings')
|
|
114
107
|
[]
|
|
@@ -161,9 +154,68 @@ module Legion
|
|
|
161
154
|
|
|
162
155
|
private
|
|
163
156
|
|
|
164
|
-
def
|
|
157
|
+
def discovery_registry_readiness(provider_health, live:)
|
|
158
|
+
{
|
|
159
|
+
provider: slug.to_sym,
|
|
160
|
+
configured: configured?,
|
|
161
|
+
ready: provider_health[:ready] == true,
|
|
162
|
+
live: live,
|
|
163
|
+
health: provider_health
|
|
164
|
+
}
|
|
165
|
+
end
|
|
166
|
+
|
|
167
|
+
def discover_live_offerings(filters, provider_health, live:)
|
|
168
|
+
readiness = discovery_registry_readiness(provider_health, live:)
|
|
169
|
+
Array(list_models(live:, **filters)).filter_map do |model|
|
|
170
|
+
self.class.registry_publisher.publish_models_async([model], readiness:)
|
|
171
|
+
next unless model_matches_filters?(model, filters)
|
|
172
|
+
next unless model_allowed?(model.id)
|
|
173
|
+
|
|
174
|
+
log_model_discovered(model)
|
|
175
|
+
offering_from_model(model, health: provider_health)
|
|
176
|
+
end
|
|
177
|
+
end
|
|
178
|
+
|
|
179
|
+
def log_model_discovered(model)
|
|
180
|
+
log.debug(
|
|
181
|
+
"[#{slug}] instance=#{provider_instance_id} action=model_discovered " \
|
|
182
|
+
"model=#{model.id} family=#{model.family}"
|
|
183
|
+
)
|
|
184
|
+
end
|
|
185
|
+
|
|
186
|
+
def log_discover_complete(offerings)
|
|
187
|
+
log.info(
|
|
188
|
+
"[#{slug}] instance=#{provider_instance_id} action=discover_complete " \
|
|
189
|
+
"model_count=#{Array(offerings).size}"
|
|
190
|
+
)
|
|
191
|
+
end
|
|
192
|
+
|
|
193
|
+
def offering_from_model(model_info, health: {})
|
|
165
194
|
ctx = model_info.context_length
|
|
166
|
-
|
|
195
|
+
if ctx
|
|
196
|
+
begin
|
|
197
|
+
cache_set(model_detail_cache_key(model_info.id), { context_window: ctx }, ttl: 86_400)
|
|
198
|
+
rescue StandardError => e
|
|
199
|
+
handle_exception(e, level: :warn, handled: true, operation: 'vllm.cache_model_detail')
|
|
200
|
+
end
|
|
201
|
+
end
|
|
202
|
+
|
|
203
|
+
policy = Legion::Extensions::Llm::CapabilityPolicy.resolve(
|
|
204
|
+
real: extract_real_capabilities(model_info),
|
|
205
|
+
provider_catalog: {},
|
|
206
|
+
probe: {},
|
|
207
|
+
provider_envelope: provider_envelope_capabilities,
|
|
208
|
+
provider_config: provider_capability_config,
|
|
209
|
+
instance_config: instance_capability_config,
|
|
210
|
+
model_config: model_capability_config(model_info.id)
|
|
211
|
+
)
|
|
212
|
+
|
|
213
|
+
build_offering(model_info, policy, ctx, health)
|
|
214
|
+
end
|
|
215
|
+
|
|
216
|
+
def build_offering(model_info, policy, ctx, health) # rubocop:disable Metrics/AbcSize
|
|
217
|
+
max_out = model_info.respond_to?(:max_output_tokens) ? model_info.max_output_tokens : nil
|
|
218
|
+
usage_type = policy[:capabilities].include?(:embedding) ? :embedding : :inference
|
|
167
219
|
|
|
168
220
|
Legion::Extensions::Llm::Routing::ModelOffering.new(
|
|
169
221
|
provider_family: :vllm,
|
|
@@ -171,13 +223,39 @@ module Legion
|
|
|
171
223
|
transport: offering_transport,
|
|
172
224
|
tier: offering_tier,
|
|
173
225
|
model: model_info.id,
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
226
|
+
canonical_model_alias: model_info.respond_to?(:name) ? model_info.name : nil,
|
|
227
|
+
model_family: model_info.respond_to?(:family) ? model_info.family : nil,
|
|
228
|
+
usage_type: usage_type,
|
|
229
|
+
capabilities: policy[:capabilities],
|
|
230
|
+
capability_sources: policy[:sources],
|
|
231
|
+
limits: { context_window: ctx, max_output_tokens: max_out }.compact,
|
|
232
|
+
health: health,
|
|
233
|
+
metadata: offering_metadata_for(model_info).merge(capability_sources: policy[:sources])
|
|
178
234
|
)
|
|
179
235
|
end
|
|
180
236
|
|
|
237
|
+
def extract_real_capabilities(model_info)
|
|
238
|
+
return {} unless model_info.respond_to?(:metadata)
|
|
239
|
+
|
|
240
|
+
meta = model_info.metadata
|
|
241
|
+
meta_caps = meta.is_a?(Hash) ? meta[:capabilities] : nil
|
|
242
|
+
meta_caps.is_a?(Hash) ? meta_caps : {}
|
|
243
|
+
end
|
|
244
|
+
|
|
245
|
+
def provider_envelope_capabilities
|
|
246
|
+
{ completion: true, streaming: true }
|
|
247
|
+
end
|
|
248
|
+
|
|
249
|
+
def offering_metadata_for(model_info)
|
|
250
|
+
{
|
|
251
|
+
raw_model: model_info.id,
|
|
252
|
+
parameter_count: model_info.respond_to?(:parameter_count) ? model_info.parameter_count : nil,
|
|
253
|
+
parameter_size: model_info.respond_to?(:parameter_size) ? model_info.parameter_size : nil,
|
|
254
|
+
quantization: model_info.respond_to?(:quantization) ? model_info.quantization : nil,
|
|
255
|
+
size_bytes: model_info.respond_to?(:size_bytes) ? model_info.size_bytes : nil
|
|
256
|
+
}.compact
|
|
257
|
+
end
|
|
258
|
+
|
|
181
259
|
# ── Canonical bridge: legacy provider API → Canonical::Request ──
|
|
182
260
|
|
|
183
261
|
# rubocop:disable Metrics/ParameterLists, Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity -- bridge method can be complex
|
|
@@ -281,7 +359,7 @@ module Legion
|
|
|
281
359
|
role: :assistant,
|
|
282
360
|
content: content,
|
|
283
361
|
model_id: raw_data['model'],
|
|
284
|
-
tool_calls:
|
|
362
|
+
tool_calls: legacy_chunk_tool_calls(canonical),
|
|
285
363
|
thinking: thinking,
|
|
286
364
|
input_tokens: usage.respond_to?(:input_tokens) ? usage.input_tokens : nil,
|
|
287
365
|
output_tokens: usage.respond_to?(:output_tokens) ? usage.output_tokens : nil,
|
|
@@ -289,6 +367,24 @@ module Legion
|
|
|
289
367
|
)
|
|
290
368
|
end
|
|
291
369
|
|
|
370
|
+
# Map a canonical tool_call_delta onto the legacy chunk tool_calls hash.
|
|
371
|
+
# Fragment semantics matter: an entry with a non-nil id starts a new tool
|
|
372
|
+
# call in the StreamAccumulator; a nil id appends the raw arguments
|
|
373
|
+
# fragment to the most recently started call.
|
|
374
|
+
def legacy_chunk_tool_calls(canonical)
|
|
375
|
+
return nil unless canonical.type == :tool_call_delta && canonical.tool_call
|
|
376
|
+
|
|
377
|
+
tc = canonical.tool_call
|
|
378
|
+
key = (tc.id || tc.name || :fragment).to_s.to_sym
|
|
379
|
+
{
|
|
380
|
+
key => Legion::Extensions::Llm::ToolCall.new(
|
|
381
|
+
id: tc.id,
|
|
382
|
+
name: tc.name,
|
|
383
|
+
arguments: tc.arguments
|
|
384
|
+
)
|
|
385
|
+
}
|
|
386
|
+
end
|
|
387
|
+
|
|
292
388
|
# ── Tool choice helpers ──
|
|
293
389
|
|
|
294
390
|
def format_tool_choice_from_prefs(tool_prefs)
|
|
@@ -344,7 +440,7 @@ module Legion
|
|
|
344
440
|
def vllm_thinking_setting
|
|
345
441
|
instance_thinking_enabled? || global_thinking_enabled?
|
|
346
442
|
rescue StandardError => e
|
|
347
|
-
handle_exception(e, level: :
|
|
443
|
+
handle_exception(e, level: :warn, handled: true, operation: 'vllm.thinking_setting')
|
|
348
444
|
false
|
|
349
445
|
end
|
|
350
446
|
|
|
@@ -37,7 +37,7 @@ module Legion
|
|
|
37
37
|
|
|
38
38
|
payload[key] || payload[key.to_s]
|
|
39
39
|
rescue StandardError => e
|
|
40
|
-
handle_exception(e, level: :
|
|
40
|
+
handle_exception(e, level: :warn, handled: true, operation: 'vllm.fleet_worker.payload_field',
|
|
41
41
|
field: key)
|
|
42
42
|
nil
|
|
43
43
|
end
|
|
@@ -70,6 +70,7 @@ module Legion
|
|
|
70
70
|
payload[:tools] = format_tools(request.tools) unless request.tools.to_h.empty?
|
|
71
71
|
payload[:tool_choice] = format_tool_choice(request.tool_choice) if request.tool_choice
|
|
72
72
|
payload.merge!(map_params_to_wire(request.params)) if request.params
|
|
73
|
+
payload[:stream_options] = { include_usage: true } if request.stream && stream_token_usage?
|
|
73
74
|
apply_thinking_config(payload, request)
|
|
74
75
|
if formatted_response_format?(request.params)
|
|
75
76
|
payload[:response_format] =
|
|
@@ -164,24 +165,8 @@ module Legion
|
|
|
164
165
|
)
|
|
165
166
|
end
|
|
166
167
|
|
|
167
|
-
tool_calls = delta['tool_calls']
|
|
168
|
-
|
|
169
|
-
first_call = tool_calls.first
|
|
170
|
-
function = first_call.fetch('function', {})
|
|
171
|
-
|
|
172
|
-
tc = Canonical::ToolCall.build(
|
|
173
|
-
id: (first_call['id'] || function['name'] || 'synthesized').to_s,
|
|
174
|
-
name: function['name'].to_s,
|
|
175
|
-
arguments: parse_tool_arguments(function['arguments']),
|
|
176
|
-
source: :client
|
|
177
|
-
)
|
|
178
|
-
|
|
179
|
-
return Canonical::Chunk.tool_call_delta(
|
|
180
|
-
tool_call: tc,
|
|
181
|
-
request_id: request_id,
|
|
182
|
-
block_index: first_call['index']
|
|
183
|
-
)
|
|
184
|
-
end
|
|
168
|
+
tool_calls = Array(delta['tool_calls'])
|
|
169
|
+
return build_tool_call_delta_chunk(tool_calls.first, request_id) unless tool_calls.empty?
|
|
185
170
|
|
|
186
171
|
# Thinking delta from reasoning_content
|
|
187
172
|
reasoning_content = delta['reasoning_content'] || delta['reasoning']
|
|
@@ -224,10 +209,23 @@ module Legion
|
|
|
224
209
|
|
|
225
210
|
attr_reader :config
|
|
226
211
|
|
|
212
|
+
# OpenAI `stream_options.include_usage` asks the server to emit a final
|
|
213
|
+
# usage-only chunk (choices:[]) so streaming responses carry token counts.
|
|
214
|
+
# vLLM supports it (capability streaming_token_usage); defaults on, but a
|
|
215
|
+
# non-conforming OpenAI-compatible backend that rejects the field can opt
|
|
216
|
+
# out per-instance via config[:stream_token_usage] = false.
|
|
217
|
+
def stream_token_usage?
|
|
218
|
+
override = config.respond_to?(:[]) ? config[:stream_token_usage] : nil
|
|
219
|
+
return override != false unless override.nil?
|
|
220
|
+
|
|
221
|
+
capabilities[:streaming_token_usage] == true
|
|
222
|
+
end
|
|
223
|
+
|
|
227
224
|
# ── Message formatting ──
|
|
228
225
|
|
|
229
226
|
def format_messages(request)
|
|
230
|
-
|
|
227
|
+
non_system = request.messages&.reject { |m| m.role.to_s == 'system' } || []
|
|
228
|
+
messages = format_request_messages(non_system)
|
|
231
229
|
|
|
232
230
|
if request.system.to_s.strip.empty?
|
|
233
231
|
messages
|
|
@@ -345,7 +343,8 @@ module Legion
|
|
|
345
343
|
def format_message_tool_calls(tool_calls)
|
|
346
344
|
return [] if tool_calls.empty?
|
|
347
345
|
|
|
348
|
-
tool_calls.
|
|
346
|
+
tc_array = tool_calls.is_a?(Hash) ? tool_calls.values : Array(tool_calls)
|
|
347
|
+
tc_array.map { |tc| format_tool_call_for_history(tc) }
|
|
349
348
|
end
|
|
350
349
|
|
|
351
350
|
def format_tool_call_for_history(tool_call_entry)
|
|
@@ -387,10 +386,9 @@ module Legion
|
|
|
387
386
|
|
|
388
387
|
name = tool_hash[:name] || tool_hash['name']
|
|
389
388
|
description = (tool_hash[:description] || tool_hash['description'] || '').to_s
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
parameters =
|
|
393
|
-
parameters = { type: 'object', properties: {} } unless parameters.is_a?(Hash)
|
|
389
|
+
raw_params = tool_hash[:parameters] || tool_hash[:input_schema]
|
|
390
|
+
raw_params = raw_params.to_h if raw_params.respond_to?(:to_h) && !raw_params.is_a?(Hash)
|
|
391
|
+
parameters = Legion::Extensions::Llm::Canonical::ToolDefinition.normalize_parameters(raw_params)
|
|
394
392
|
|
|
395
393
|
{
|
|
396
394
|
type: 'function',
|
|
@@ -633,26 +631,48 @@ module Legion
|
|
|
633
631
|
)
|
|
634
632
|
end
|
|
635
633
|
|
|
634
|
+
# Build a tool_call_delta chunk preserving OpenAI streaming fragment
|
|
635
|
+
# semantics: the opening fragment carries id + name; continuation
|
|
636
|
+
# fragments carry id: nil and a raw partial-JSON arguments string.
|
|
637
|
+
# The StreamAccumulator keys off a nil id to append fragments to the
|
|
638
|
+
# current tool call, so the id must NOT be synthesized here.
|
|
639
|
+
def build_tool_call_delta_chunk(first_call, request_id)
|
|
640
|
+
function = first_call.fetch('function', {})
|
|
641
|
+
|
|
642
|
+
tc = Canonical::ToolCall.new(
|
|
643
|
+
id: first_call['id'], exchange_id: nil,
|
|
644
|
+
name: function['name'], arguments: function['arguments'].to_s,
|
|
645
|
+
source: :client, status: nil, duration_ms: nil, result: nil,
|
|
646
|
+
error: nil, started_at: nil, finished_at: nil, category: nil,
|
|
647
|
+
data_handling_classification: nil, policy_decision: nil
|
|
648
|
+
)
|
|
649
|
+
|
|
650
|
+
Canonical::Chunk.tool_call_delta(
|
|
651
|
+
tool_call: tc,
|
|
652
|
+
request_id: request_id,
|
|
653
|
+
block_index: first_call['index']
|
|
654
|
+
)
|
|
655
|
+
end
|
|
656
|
+
|
|
636
657
|
def empty_delta?(delta)
|
|
637
658
|
(delta['content'].nil? || delta['content'].to_s.empty?) &&
|
|
638
659
|
(delta['tool_calls'].nil? || Array(delta['tool_calls']).empty?) &&
|
|
639
660
|
(delta['reasoning_content'].nil? || delta['reasoning_content'].to_s.empty?)
|
|
640
661
|
end
|
|
641
662
|
|
|
663
|
+
# Per-chunk think-tag extraction is structurally impossible while streaming:
|
|
664
|
+
# tags arrive split across SSE chunks, and ThinkingExtractor strips per-chunk
|
|
665
|
+
# whitespace, corrupting reassembled text. Emit the raw delta unmodified —
|
|
666
|
+
# the StreamAccumulator extracts think tags statefully across deltas.
|
|
667
|
+
# (Previously called ThinkingExtractor.extract_from_content, which is
|
|
668
|
+
# private_class_method in lex-llm >= 0.5.0 and raised NoMethodError on
|
|
669
|
+
# every streamed text delta, silently killing all vLLM streaming.)
|
|
642
670
|
def parse_text_delta_with_thinking(content, request_id, data)
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
Canonical::Chunk.thinking_delta(delta: thinking_text, request_id: request_id)
|
|
649
|
-
else
|
|
650
|
-
Canonical::Chunk.text_delta(
|
|
651
|
-
delta: clean_text || content,
|
|
652
|
-
request_id: request_id,
|
|
653
|
-
index: data['index']
|
|
654
|
-
)
|
|
655
|
-
end
|
|
671
|
+
Canonical::Chunk.text_delta(
|
|
672
|
+
delta: content,
|
|
673
|
+
request_id: request_id,
|
|
674
|
+
index: data['index']
|
|
675
|
+
)
|
|
656
676
|
end
|
|
657
677
|
|
|
658
678
|
# Parse a canonical-form chunk (from conformance kit fixtures).
|
|
@@ -16,7 +16,7 @@ module Legion
|
|
|
16
16
|
extend Legion::Extensions::Llm::AutoRegistration
|
|
17
17
|
|
|
18
18
|
PROVIDER_FAMILY = :vllm
|
|
19
|
-
DEFAULT_INSTANCE_TIER = { tier: :direct, capabilities:
|
|
19
|
+
DEFAULT_INSTANCE_TIER = { tier: :direct, capabilities: {}, provider_capabilities: { streaming: true } }.freeze
|
|
20
20
|
|
|
21
21
|
def self.default_settings
|
|
22
22
|
::Legion::Extensions::Llm.provider_settings(
|
|
@@ -32,10 +32,7 @@ module Legion
|
|
|
32
32
|
fleet: {
|
|
33
33
|
enabled: false,
|
|
34
34
|
respond_to_requests: false,
|
|
35
|
-
capabilities: %i[chat stream_chat embed]
|
|
36
|
-
lanes: [],
|
|
37
|
-
concurrency: 1,
|
|
38
|
-
queue_suffix: nil
|
|
35
|
+
capabilities: %i[chat stream_chat embed]
|
|
39
36
|
}
|
|
40
37
|
}
|
|
41
38
|
)
|
|
@@ -74,10 +71,19 @@ module Legion
|
|
|
74
71
|
def self.normalize_instance_config(config)
|
|
75
72
|
normalized = config.to_h.transform_keys(&:to_sym)
|
|
76
73
|
resolve_api_base_aliases(normalized)
|
|
74
|
+
resolve_credentials(normalized)
|
|
77
75
|
normalized[:tier] ||= infer_tier_from_endpoint(normalized[:vllm_api_base])
|
|
78
76
|
normalized
|
|
79
77
|
end
|
|
80
78
|
|
|
79
|
+
def self.resolve_credentials(normalized)
|
|
80
|
+
creds = normalized.delete(:credentials)
|
|
81
|
+
return unless creds.is_a?(Hash)
|
|
82
|
+
|
|
83
|
+
creds = creds.transform_keys(&:to_sym)
|
|
84
|
+
normalized[:vllm_api_key] ||= creds[:api_key]
|
|
85
|
+
end
|
|
86
|
+
|
|
81
87
|
def self.resolve_api_base_aliases(normalized)
|
|
82
88
|
normalized[:vllm_api_base] ||= normalized.delete(:base_url)
|
|
83
89
|
normalized[:vllm_api_base] ||= normalized.delete(:api_base)
|
|
@@ -93,12 +99,15 @@ module Legion
|
|
|
93
99
|
return :direct if url.nil? || url.to_s.empty?
|
|
94
100
|
|
|
95
101
|
require 'uri'
|
|
102
|
+
require_relative 'vllm/actors/discovery_refresh'
|
|
96
103
|
host = URI.parse(url.to_s).host.to_s.downcase
|
|
97
104
|
%w[localhost 127.0.0.1 ::1].include?(host) ? :local : :direct
|
|
98
105
|
rescue URI::InvalidURIError => e
|
|
99
|
-
handle_exception(e, level: :
|
|
106
|
+
handle_exception(e, level: :warn, handled: true, operation: 'vllm.infer_tier_from_endpoint')
|
|
100
107
|
:direct
|
|
101
108
|
end
|
|
109
|
+
|
|
110
|
+
Legion::Extensions::Llm::Configuration.register_provider_options(Provider.configuration_options)
|
|
102
111
|
end
|
|
103
112
|
end
|
|
104
113
|
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: lex-llm-vllm
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.3.
|
|
4
|
+
version: 0.3.11
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- LegionIO
|
|
@@ -71,14 +71,14 @@ dependencies:
|
|
|
71
71
|
requirements:
|
|
72
72
|
- - ">="
|
|
73
73
|
- !ruby/object:Gem::Version
|
|
74
|
-
version: 0.
|
|
74
|
+
version: 0.6.0
|
|
75
75
|
type: :runtime
|
|
76
76
|
prerelease: false
|
|
77
77
|
version_requirements: !ruby/object:Gem::Requirement
|
|
78
78
|
requirements:
|
|
79
79
|
- - ">="
|
|
80
80
|
- !ruby/object:Gem::Version
|
|
81
|
-
version: 0.
|
|
81
|
+
version: 0.6.0
|
|
82
82
|
description: vLLM provider integration for the LegionIO LLM routing framework.
|
|
83
83
|
email:
|
|
84
84
|
- matthewdiverson@gmail.com
|