legion-llm 0.8.24 → 0.8.26

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 1c7a3d39cf4e2e31494ee7e354680e57ae1c6c1feabe2f39e8707a06701c6a15
-  data.tar.gz: c8733d7f96801aa19c35458f4e527574815909dd87e89ed80d589bf565a8467c
+  metadata.gz: 942f34663b8d915ee982996b5b2e63e26a7edf79a7aac17f8ce71ed1829dff01
+  data.tar.gz: dd78dd3bd79c9f1cf19d170f4ee2905fc92865dd3e21b107856c973eaf752fb5
 SHA512:
-  metadata.gz: 37049fdb4a5dc838fecc0d3b6c57e48bbcb72d490b8e8460e04cdd7a19728d82e8c2c3f48ab0bfc059bc2245d56f40f279d79b082e31c7f4c85fa57d86270e42
-  data.tar.gz: 06ae35daf7458e38b4990c2a230a5f72bffd65bc99bc3dd2bbb16be4ff60aa3665c3b807c3b19637f407fc1e55b6f88148c433d39474411bfb222fe9e07d412c
+  metadata.gz: bfc1f55dce2a3eda78b5b6ab2405b6ce5d4e58fa841a81bb304af3bbe9a5b52851023c845d898713cfa87d9e292cd5fd1545464a7e0937eadde6f8668595ccc2
+  data.tar.gz: 4cad8eb9c6b6cfc79c1ffce687b7fddbb7b47d4e22ec9bca424f2dbb061ed83fff97d4ab2bbec441d3b319922316a238b057752210f1d7908e0d7169380485e9
data/CHANGELOG.md CHANGED
@@ -1,5 +1,23 @@
 # Legion LLM Changelog
 
+## [0.8.26] - 2026-04-24
+
+### Added
+- First-class vLLM provider support. vLLM exposes an OpenAI-compatible API and is registered as a new RubyLLM provider (`:vllm`). Configured via `providers.vllm.base_url` in settings (a configuration sketch follows this changelog section). Mapped to the `:fleet` tier in the router.
+- vLLM model discovery via the `/v1/models` endpoint. Caches the model list, including each model's `max_model_len` (context window size), with the same TTL as Ollama discovery. Health checks hit the `/health` endpoint.
+- Context overflow escalation: when vLLM rejects a request because of context length limits (32k on V100 hardware), the executor automatically falls back to cloud/frontier providers.
+
+### Changed
+- `find_fallback_provider` in `Executor` now skips all local providers (`:ollama` and `:vllm`) when searching for fallbacks, not just `:ollama`, ensuring context overflow escalates to cloud/frontier providers.
+- `Router::PROVIDER_ORDER` updated: `:vllm` inserted after `:ollama` and before `:bedrock`.
+- `default_provider_for_tier(:fleet)` returns `:vllm` when vLLM is enabled and falls back to `:ollama` otherwise.
+
+## [0.8.25] - 2026-04-24
+
+### Fixed
+- `StructuredOutput.generate`, `handle_parse_error`, and `retry_with_instruction` used hash-style access (`result[:content]`, `result[:model]`) on the return value of `chat_single`, but `chat_single` returns a `RubyLLM::Message`, which supports only method access (`.content`, `.model_id`). All access sites now use `respond_to?` duck-typing so both hashes and `Message` objects work. The bug surfaced as `undefined method '[]' for an instance of RubyLLM::Message` in Apollo's `llm_detects_conflict?` and in any structured-output caller using non-schema-capable models (e.g. ollama/qwen).
+- `Call::Embeddings.generate` crashed with a `NoMethodError` on `.size` when `response.vectors` came back as a flat array (`[0.007, ...]`) instead of a nested one (`[[0.007, ...]]`); RubyLLM's OpenAI provider unwraps single-input embedding responses. Added `normalize_vectors_first` to detect and handle both formats before dimension enforcement.
+
 ## [0.8.24] - 2026-04-23
 
 ### Fixed
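
The 0.8.26 entry above mentions configuring vLLM via `providers.vllm.base_url`. A minimal sketch of an enabled provider block, mirroring the defaults added in this release's settings hunk further down; the enclosing structure, host, and model name are illustrative:

```ruby
# Hypothetical settings fragment enabling the new :vllm provider.
# Keys mirror the 0.8.26 defaults; host and model are illustrative.
{
  llm: {
    providers: {
      vllm: {
        enabled: true,                            # defaults to false
        default_model: 'qwen3.6-27b',             # model served by the vLLM instance
        base_url: 'http://vllm.internal:8000/v1', # default: http://localhost:8000/v1
        api_key: nil                              # optional; sent as a Bearer token when set
      }
    }
  }
}
```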
@@ -27,7 +27,8 @@ module Legion
 
   response = RubyLLM.embed(text, **build_opts(model, provider, dimensions))
   emit_embedding_metering(provider: provider, model: model, tokens: response.input_tokens)
-  vector = apply_dimension_enforcement(response.vectors.first, provider)
+  vector = normalize_vectors_first(response.vectors)
+  vector = apply_dimension_enforcement(vector, provider)
   return dimension_error(model, provider, vector) if vector.is_a?(String)
 
   { vector: vector, model: model, provider: provider, dimensions: vector&.size || 0, tokens: response.input_tokens }
@@ -101,6 +102,16 @@ module Legion
   opts
 end
 
+def normalize_vectors_first(vectors)
+  return nil if vectors.nil? || (vectors.is_a?(Array) && vectors.empty?)
+
+  first = vectors.first
+  return first if first.is_a?(Array)
+  return vectors if vectors.is_a?(Array) && vectors.first.is_a?(Numeric)
+
+  first
+end
+
 def apply_dimension_enforcement(vector, provider)
   return vector unless enforce_dimension? && vector.is_a?(Array)
 
@@ -76,6 +76,8 @@ module Legion
   config[:api_base] && (usable_setting?(config[:api_key]) || usable_setting?(config[:auth_token]))
 when :ollama
   ollama_running?(config)
+when :vllm
+  vllm_running?(config)
 else
   usable_setting?(config[:api_key])
 end
@@ -106,6 +108,22 @@ module Legion
   false
 end
 
+def vllm_running?(config)
+  require 'faraday'
+  url = config[:base_url] || 'http://localhost:8000/v1'
+  base = url.sub(%r{/+\z}, '').sub(%r{/v1\z}, '')
+  log.debug "[llm][providers] vllm_running? url=#{base}/health"
+  response = Faraday.new(url: base) do |f|
+    f.options.timeout = 2
+    f.options.open_timeout = 2
+    f.adapter Faraday.default_adapter
+  end.get('/health')
+  response.success?
+rescue StandardError => e
+  handle_exception(e, level: :debug, operation: 'llm.providers.vllm_running', base_url: url)
+  false
+end
+
 def apply_provider_config(provider, config)
   case provider
   when :bedrock then configure_bedrock(config)
@@ -114,6 +132,7 @@ module Legion
   when :gemini then configure_gemini(config)
   when :azure then configure_azure(config)
   when :ollama then configure_ollama(config)
+  when :vllm then configure_vllm(config)
   else
     log.warn "[llm][providers] unknown provider=#{provider}"
   end
@@ -214,6 +233,15 @@ module Legion
   log.info "[llm][providers] configured ollama base_url=#{config[:base_url].inspect}"
 end
 
+def configure_vllm(config)
+  base_url = config[:base_url] || 'http://localhost:8000/v1'
+  RubyLLM.configure do |c|
+    c.vllm_api_base = base_url
+    c.vllm_api_key = config[:api_key] if config[:api_key]
+  end
+  log.info "[llm][providers] configured vllm base_url=#{base_url.inspect}"
+end
+
 SAAS_PROVIDERS = %i[bedrock anthropic openai gemini azure].freeze
 
 def verify_providers
@@ -15,8 +15,11 @@ module Legion
   result = call_with_schema(messages, schema, model, provider: provider, **)
   log.info "[llm][structured_output] model=#{model} provider=#{provider} valid=true"
 
-  parsed = Legion::JSON.load(result[:content])
-  { data: parsed, raw: result[:content], model: result[:model], valid: true }
+  content = result.respond_to?(:content) ? result.content : result[:content]
+  raw_model = result.respond_to?(:model_id) ? result.model_id : result[:model]
+
+  parsed = Legion::JSON.load(content)
+  { data: parsed, raw: content, model: raw_model, valid: true }
 rescue ::JSON::ParserError => e
   log.warn "[llm][structured_output] model=#{model} provider=#{provider} parse_error=#{e.message}"
   handle_parse_error(e, messages, schema, model, provider, result, **)
@@ -49,7 +52,8 @@ module Legion
   if retry_enabled? && attempt < max_retries
     retry_with_instruction(messages, schema, model, provider: provider, attempt: attempt + 1, **opts)
   else
-    { data: nil, error: "JSON parse failed: #{error.message}", raw: result&.dig(:content), valid: false }
+    raw = result.respond_to?(:content) ? result&.content : result&.dig(:content)
+    { data: nil, error: "JSON parse failed: #{error.message}", raw: raw, valid: false }
   end
 end
 
@@ -60,8 +64,11 @@ module Legion
   model: model, provider: provider, intent: nil, tier: nil,
   message: user_content, **opts.except(:attempt))
 
-  parsed = Legion::JSON.load(result[:content])
-  { data: parsed, raw: result[:content], model: result[:model], valid: true, retried: true }
+  retry_content = result.respond_to?(:content) ? result.content : result[:content]
+  retry_model = result.respond_to?(:model_id) ? result.model_id : result[:model]
+
+  parsed = Legion::JSON.load(retry_content)
+  { data: parsed, raw: retry_content, model: retry_model, valid: true, retried: true }
 rescue StandardError => e
   handle_exception(e, level: :warn)
   { data: nil, error: e.message, valid: false }
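
The `respond_to?` pattern in these hunks can be exercised in isolation. A minimal sketch using a stand-in `Struct` rather than RubyLLM's actual `Message` class (names and values illustrative):

```ruby
# Stand-in for a method-access result object like RubyLLM::Message.
Message = Struct.new(:content, :model_id, keyword_init: true)

def extract_content(result)
  # Message-like objects answer #content; plain hashes use #[] access.
  result.respond_to?(:content) ? result.content : result[:content]
end

puts extract_content(Message.new(content: '{"ok":true}', model_id: 'qwen3')) # {"ok":true}
puts extract_content({ content: '{"ok":true}', model: 'qwen3' })             # {"ok":true}
```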
@@ -0,0 +1,114 @@
+# frozen_string_literal: true
+
+require 'faraday'
+
+require 'legion/logging/helper'
+require 'legion/json'
+
+module Legion
+  module LLM
+    module Discovery
+      module Vllm
+        extend Legion::Logging::Helper
+
+        class << self
+          def models
+            ensure_fresh
+            @models || []
+          end
+
+          def model_names
+            models.map { |m| m[:id] }
+          end
+
+          def model_available?(name)
+            model_names.any? { |n| n == name }
+          end
+
+          def max_context(name)
+            model = models.find { |m| m[:id] == name }
+            model&.dig(:max_model_len)
+          end
+
+          def healthy?
+            response = health_connection.get('/health')
+            response.success?
+          rescue StandardError => e
+            handle_exception(e, level: :debug, operation: 'llm.discovery.vllm.healthy')
+            false
+          end
+
+          def refresh!
+            response = connection.get('/v1/models')
+            if response.success?
+              parsed = Legion::JSON.load(response.body)
+              @models = parsed[:data] || []
+              log.debug "[llm][discovery][vllm] model list refreshed count=#{@models.size}"
+            else
+              log.warn "[llm][discovery][vllm] HTTP failure status=#{response.status}"
+              @models ||= []
+            end
+          rescue StandardError => e
+            handle_exception(e, level: :warn, operation: 'llm.discovery.vllm.refresh')
+            @models ||= []
+          ensure
+            @last_refreshed_at = Time.now
+          end
+
+          def reset!
+            @models = nil
+            @last_refreshed_at = nil
+          end
+
+          def stale?
+            return true if @last_refreshed_at.nil?
+
+            ttl = discovery_settings[:refresh_seconds] || 60
+            Time.now - @last_refreshed_at > ttl
+          end
+
+          private
+
+          def ensure_fresh
+            refresh! if stale?
+          end
+
+          def connection
+            Faraday.new(url: vllm_base_url) do |f|
+              f.options.timeout = 3
+              f.options.open_timeout = 2
+              f.adapter Faraday.default_adapter
+            end
+          end
+
+          def health_connection
+            base = vllm_base_url.sub(%r{/+\z}, '').sub(%r{/v1\z}, '')
+            Faraday.new(url: base) do |f|
+              f.options.timeout = 2
+              f.options.open_timeout = 2
+              f.adapter Faraday.default_adapter
+            end
+          end
+
+          def vllm_base_url
+            return 'http://localhost:8000/v1' unless Legion.const_defined?('Settings', false)
+
+            Legion::Settings[:llm].dig(:providers, :vllm, :base_url) || 'http://localhost:8000/v1'
+          rescue StandardError => e
+            handle_exception(e, level: :debug, operation: 'llm.discovery.vllm.base_url')
+            'http://localhost:8000/v1'
+          end
+
+          def discovery_settings
+            return {} unless Legion.const_defined?('Settings', false)
+
+            Legion::Settings[:llm][:discovery] || {}
+          rescue StandardError => e
+            handle_exception(e, level: :debug, operation: 'llm.discovery.vllm.settings')
+            {}
+          end
+        end
+      end
+    end
+  end
+end
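
`refresh!` above stores `parsed[:data]`, and `max_context` reads `:max_model_len` from each entry. For reference, a sketch of the parsed `/v1/models` payload this expects, assuming vLLM's OpenAI-style model list and symbolized keys from `Legion::JSON.load` (values illustrative):

```ruby
# Approximate parsed shape of vLLM's GET /v1/models response.
# vLLM extends the OpenAI model object with max_model_len; values are illustrative.
{
  object: 'list',
  data: [
    {
      id: 'qwen3.6-27b',     # served model name; matched by model_available?/max_context
      object: 'model',
      owned_by: 'vllm',
      max_model_len: 32_768  # context window cached by discovery
    }
  ]
}
```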
@@ -2,6 +2,7 @@
 
 require 'legion/logging/helper'
 require_relative 'discovery/ollama'
+require_relative 'discovery/vllm'
 require_relative 'discovery/system'
 
 module Legion
@@ -23,15 +24,21 @@ module Legion
 
 def run
   log.debug '[llm][discovery] run.enter'
-  return unless Legion::LLM.settings.dig(:providers, :ollama, :enabled)
 
-  Ollama.refresh!
-  System.refresh!
+  if Legion::LLM.settings.dig(:providers, :ollama, :enabled)
+    Ollama.refresh!
+    System.refresh!
+    names = Ollama.model_names
+    log.info "[llm][discovery] ollama model_count=#{names.size} models=#{names.join(', ')}"
+    log.info "[llm][discovery] system total_mb=#{System.total_memory_mb} available_mb=#{System.available_memory_mb}"
+  end
 
-  names = Ollama.model_names
-  count = names.size
-  log.info "[llm][discovery] ollama model_count=#{count} models=#{names.join(', ')}"
-  log.info "[llm][discovery] system total_mb=#{System.total_memory_mb} available_mb=#{System.available_memory_mb}"
+  if Legion::LLM.settings.dig(:providers, :vllm, :enabled)
+    Vllm.refresh!
+    names = Vllm.model_names
+    contexts = names.map { |n| "#{n}(#{Vllm.max_context(n)})" }
+    log.info "[llm][discovery] vllm model_count=#{names.size} models=#{contexts.join(', ')}"
+  end
 rescue StandardError => e
   handle_exception(e, level: :warn, operation: 'llm.discovery.run')
 end
@@ -1030,7 +1030,7 @@ module Legion
   providers.each do |name, config|
     next unless config.is_a?(Hash) && config[:enabled]
     next if exclude.include?(name) || exclude.include?(name.to_s)
-    next if name == :ollama
+    next if %i[ollama vllm].include?(name)
    next unless config[:default_model]
 
     return { provider: name, model: config[:default_model] }
@@ -0,0 +1,37 @@
+# frozen_string_literal: true
+
+module RubyLLM
+  module Providers
+    class Vllm < OpenAI
+      def api_base
+        @config.vllm_api_base
+      end
+
+      def headers
+        return {} unless @config.vllm_api_key
+
+        { 'Authorization' => "Bearer #{@config.vllm_api_key}" }
+      end
+
+      class << self
+        def configuration_options
+          %i[vllm_api_base vllm_api_key]
+        end
+
+        def configuration_requirements
+          %i[vllm_api_base]
+        end
+
+        def local?
+          true
+        end
+
+        def capabilities
+          nil
+        end
+      end
+    end
+  end
+end
+
+RubyLLM::Provider.register :vllm, RubyLLM::Providers::Vllm
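
Once this patch is loaded, the provider can be targeted like any other. A rough usage sketch, assuming RubyLLM's `provider:` and `assume_model_exists:` chat options apply to the patched provider as they do to built-in ones (model name and server illustrative):

```ruby
require 'ruby_llm'
require 'legion/llm/patches/ruby_llm_vllm'

# Point the patched provider at a local vLLM server (assumed running).
RubyLLM.configure do |c|
  c.vllm_api_base = 'http://localhost:8000/v1'
end

# assume_model_exists skips RubyLLM's model-registry lookup for models
# it doesn't know about, such as locally served ones.
chat = RubyLLM.chat(model: 'qwen3.6-27b', provider: :vllm, assume_model_exists: true)
puts chat.ask('Reply with one word: ready?').content
```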
@@ -15,8 +15,8 @@ module Legion
 extend Legion::Logging::Helper
 
 PROVIDER_TIER = { bedrock: :cloud, anthropic: :frontier, openai: :frontier,
-                  gemini: :cloud, azure: :cloud, ollama: :local }.freeze
-PROVIDER_ORDER = %i[ollama bedrock azure gemini anthropic openai].freeze
+                  gemini: :cloud, azure: :cloud, ollama: :local, vllm: :local }.freeze
+PROVIDER_ORDER = %i[ollama vllm bedrock azure gemini anthropic openai].freeze
 
 class << self
   # Resolve an LLM routing intent to a tier/provider/model decision.
@@ -296,8 +296,11 @@ module Legion
 
 def default_provider_for_tier(tier)
   case tier.to_sym
-  when :local, :fleet
+  when :local
     :ollama
+  when :fleet
+    vllm_config = Legion::Settings[:llm].dig(:providers, :vllm)
+    vllm_config.is_a?(Hash) && vllm_config[:enabled] ? :vllm : :ollama
   when :openai_compat
     :openai
   when :cloud
@@ -316,7 +319,13 @@ module Legion
   ollama = Legion::Settings[:llm].dig(:providers, :ollama) || {}
   ollama[:default_model] || 'llama3'
 when :fleet
-  'llama4:70b'
+  vllm_config = Legion::Settings[:llm].dig(:providers, :vllm) || {}
+  if vllm_config[:enabled]
+    vllm_config[:default_model] || 'qwen3.6-27b'
+  else
+    ollama = Legion::Settings[:llm].dig(:providers, :ollama) || {}
+    ollama[:default_model] || 'llama3'
+  end
 when :openai_compat
   'gpt-4o'
 when :cloud
@@ -375,6 +375,12 @@ module Legion
     enabled: false,
     default_model: 'qwen3.5:latest',
     base_url: 'http://localhost:11434'
+  },
+  vllm: {
+    enabled: false,
+    default_model: 'qwen3.6-27b',
+    base_url: 'http://localhost:8000/v1',
+    api_key: nil
   }
 }
 end
@@ -2,6 +2,6 @@
 
 module Legion
   module LLM
-    VERSION = '0.8.24'
+    VERSION = '0.8.26'
   end
 end
data/lib/legion/llm.rb CHANGED
@@ -4,6 +4,7 @@ require 'legion/logging/helper'
 
 require 'ruby_llm'
 require_relative 'llm/patches/ruby_llm_parallel_tools'
+require_relative 'llm/patches/ruby_llm_vllm'
 require_relative 'llm/version'
 require_relative 'llm/errors'
 require_relative 'llm/settings'
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: legion-llm
 version: !ruby/object:Gem::Version
-  version: 0.8.24
+  version: 0.8.26
 platform: ruby
 authors:
 - Esity
@@ -262,6 +262,7 @@ files:
 - lib/legion/llm/discovery.rb
 - lib/legion/llm/discovery/ollama.rb
 - lib/legion/llm/discovery/system.rb
+- lib/legion/llm/discovery/vllm.rb
 - lib/legion/llm/errors.rb
 - lib/legion/llm/fleet.rb
 - lib/legion/llm/fleet/dispatcher.rb
@@ -323,6 +324,7 @@ files:
 - lib/legion/llm/metering/tracker.rb
 - lib/legion/llm/metering/usage.rb
 - lib/legion/llm/patches/ruby_llm_parallel_tools.rb
+- lib/legion/llm/patches/ruby_llm_vllm.rb
 - lib/legion/llm/quality.rb
 - lib/legion/llm/quality/checker.rb
 - lib/legion/llm/quality/confidence/score.rb