legion-llm 0.8.24 → 0.8.26

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 1c7a3d39cf4e2e31494ee7e354680e57ae1c6c1feabe2f39e8707a06701c6a15
-  data.tar.gz: c8733d7f96801aa19c35458f4e527574815909dd87e89ed80d589bf565a8467c
+  metadata.gz: 942f34663b8d915ee982996b5b2e63e26a7edf79a7aac17f8ce71ed1829dff01
+  data.tar.gz: dd78dd3bd79c9f1cf19d170f4ee2905fc92865dd3e21b107856c973eaf752fb5
 SHA512:
-  metadata.gz: 37049fdb4a5dc838fecc0d3b6c57e48bbcb72d490b8e8460e04cdd7a19728d82e8c2c3f48ab0bfc059bc2245d56f40f279d79b082e31c7f4c85fa57d86270e42
-  data.tar.gz: 06ae35daf7458e38b4990c2a230a5f72bffd65bc99bc3dd2bbb16be4ff60aa3665c3b807c3b19637f407fc1e55b6f88148c433d39474411bfb222fe9e07d412c
+  metadata.gz: bfc1f55dce2a3eda78b5b6ab2405b6ce5d4e58fa841a81bb304af3bbe9a5b52851023c845d898713cfa87d9e292cd5fd1545464a7e0937eadde6f8668595ccc2
+  data.tar.gz: 4cad8eb9c6b6cfc79c1ffce687b7fddbb7b47d4e22ec9bca424f2dbb061ed83fff97d4ab2bbec441d3b319922316a238b057752210f1d7908e0d7169380485e9
data/CHANGELOG.md CHANGED
@@ -1,5 +1,23 @@
 # Legion LLM Changelog
 
+## [0.8.26] - 2026-04-24
+
+### Added
+- First-class vLLM provider support. vLLM exposes an OpenAI-compatible API and is registered as a new RubyLLM provider (`:vllm`). Configured via `providers.vllm.base_url` in settings (a configuration sketch follows this changelog section). Mapped to the `:fleet` tier in the router.
+- vLLM model discovery via the `/v1/models` endpoint. Caches the model list, including each model's `max_model_len` (context window size), with the same TTL as Ollama discovery. Health checks hit the `/health` endpoint.
+- Context overflow escalation: when vLLM rejects a request because of context length limits (32k on V100 hardware), the executor automatically falls back to cloud/frontier providers.
+
+### Changed
+- `find_fallback_provider` in `Executor` now skips all local providers (`:ollama` and `:vllm`) when searching for fallbacks, not just `:ollama`, ensuring context overflow escalates to cloud/frontier providers.
+- `Router::PROVIDER_ORDER` updated: `:vllm` inserted after `:ollama` and before `:bedrock`.
+- `default_provider_for_tier(:fleet)` returns `:vllm` when vLLM is enabled and falls back to `:ollama` otherwise.
+
+## [0.8.25] - 2026-04-24
+
+### Fixed
+- `StructuredOutput.generate`, `handle_parse_error`, and `retry_with_instruction` used hash-style access (`result[:content]`, `result[:model]`) on the return value of `chat_single`, but `chat_single` returns a `RubyLLM::Message`, which supports only method access (`.content`, `.model_id`). All access sites now use `respond_to?` duck-typing so both hashes and `Message` objects work. The bug surfaced as `undefined method '[]' for an instance of RubyLLM::Message` in Apollo's `llm_detects_conflict?` and in any structured-output caller using non-schema-capable models (e.g. ollama/qwen).
+- `Call::Embeddings.generate` crashed with a `NoMethodError` on `.size` when `response.vectors` came back as a flat array (`[0.007, ...]`) instead of a nested one (`[[0.007, ...]]`); RubyLLM's OpenAI provider unwraps single-input embedding responses. Added `normalize_vectors_first` to detect and handle both formats before dimension enforcement.
+
 ## [0.8.24] - 2026-04-23
 
 ### Fixed
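
The 0.8.26 entry above mentions configuring vLLM via `providers.vllm.base_url`. A minimal sketch of an enabled provider block, mirroring the defaults added in this release's settings hunk further down; the enclosing structure, host, and model name are illustrative:

```ruby
# Hypothetical settings fragment enabling the new :vllm provider.
# Keys mirror the 0.8.26 defaults; host and model are illustrative.
{
  llm: {
    providers: {
      vllm: {
        enabled: true,                            # defaults to false
        default_model: 'qwen3.6-27b',             # model served by the vLLM instance
        base_url: 'http://vllm.internal:8000/v1', # default: http://localhost:8000/v1
        api_key: nil                              # optional; sent as a Bearer token when set
      }
    }
  }
}
```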
@@ -27,7 +27,8 @@ module Legion
 
   response = RubyLLM.embed(text, **build_opts(model, provider, dimensions))
   emit_embedding_metering(provider: provider, model: model, tokens: response.input_tokens)
-  vector = apply_dimension_enforcement(response.vectors.first, provider)
+  vector = normalize_vectors_first(response.vectors)
+  vector = apply_dimension_enforcement(vector, provider)
   return dimension_error(model, provider, vector) if vector.is_a?(String)
 
   { vector: vector, model: model, provider: provider, dimensions: vector&.size || 0, tokens: response.input_tokens }
@@ -101,6 +102,16 @@ module Legion
   opts
 end
 
+def normalize_vectors_first(vectors)
+  return nil if vectors.nil? || (vectors.is_a?(Array) && vectors.empty?)
+
+  first = vectors.first
+  return first if first.is_a?(Array)
+  return vectors if vectors.is_a?(Array) && vectors.first.is_a?(Numeric)
+
+  first
+end
+
 def apply_dimension_enforcement(vector, provider)
   return vector unless enforce_dimension? && vector.is_a?(Array)
 
@@ -76,6 +76,8 @@ module Legion
   config[:api_base] && (usable_setting?(config[:api_key]) || usable_setting?(config[:auth_token]))
 when :ollama
   ollama_running?(config)
+when :vllm
+  vllm_running?(config)
 else
   usable_setting?(config[:api_key])
 end
@@ -106,6 +108,22 @@ module Legion
   false
 end
 
+def vllm_running?(config)
+  require 'faraday'
+  url = config[:base_url] || 'http://localhost:8000/v1'
+  base = url.sub(%r{/+\z}, '').sub(%r{/v1\z}, '')
+  log.debug "[llm][providers] vllm_running? url=#{base}/health"
+  response = Faraday.new(url: base) do |f|
+    f.options.timeout = 2
+    f.options.open_timeout = 2
+    f.adapter Faraday.default_adapter
+  end.get('/health')
+  response.success?
+rescue StandardError => e
+  handle_exception(e, level: :debug, operation: 'llm.providers.vllm_running', base_url: url)
+  false
+end
+
 def apply_provider_config(provider, config)
   case provider
   when :bedrock then configure_bedrock(config)
@@ -114,6 +132,7 @@ module Legion
   when :gemini then configure_gemini(config)
   when :azure then configure_azure(config)
   when :ollama then configure_ollama(config)
+  when :vllm then configure_vllm(config)
   else
     log.warn "[llm][providers] unknown provider=#{provider}"
   end
@@ -214,6 +233,15 @@ module Legion
   log.info "[llm][providers] configured ollama base_url=#{config[:base_url].inspect}"
 end
 
+def configure_vllm(config)
+  base_url = config[:base_url] || 'http://localhost:8000/v1'
+  RubyLLM.configure do |c|
+    c.vllm_api_base = base_url
+    c.vllm_api_key = config[:api_key] if config[:api_key]
+  end
+  log.info "[llm][providers] configured vllm base_url=#{base_url.inspect}"
+end
+
 SAAS_PROVIDERS = %i[bedrock anthropic openai gemini azure].freeze
 
 def verify_providers
@@ -15,8 +15,11 @@ module Legion
   result = call_with_schema(messages, schema, model, provider: provider, **)
   log.info "[llm][structured_output] model=#{model} provider=#{provider} valid=true"
 
-  parsed = Legion::JSON.load(result[:content])
-  { data: parsed, raw: result[:content], model: result[:model], valid: true }
+  content = result.respond_to?(:content) ? result.content : result[:content]
+  raw_model = result.respond_to?(:model_id) ? result.model_id : result[:model]
+
+  parsed = Legion::JSON.load(content)
+  { data: parsed, raw: content, model: raw_model, valid: true }
 rescue ::JSON::ParserError => e
   log.warn "[llm][structured_output] model=#{model} provider=#{provider} parse_error=#{e.message}"
   handle_parse_error(e, messages, schema, model, provider, result, **)
@@ -49,7 +52,8 @@ module Legion
   if retry_enabled? && attempt < max_retries
     retry_with_instruction(messages, schema, model, provider: provider, attempt: attempt + 1, **opts)
   else
-    { data: nil, error: "JSON parse failed: #{error.message}", raw: result&.dig(:content), valid: false }
+    raw = result.respond_to?(:content) ? result&.content : result&.dig(:content)
+    { data: nil, error: "JSON parse failed: #{error.message}", raw: raw, valid: false }
   end
 end
 
@@ -60,8 +64,11 @@ module Legion
   model: model, provider: provider, intent: nil, tier: nil,
   message: user_content, **opts.except(:attempt))
 
-  parsed = Legion::JSON.load(result[:content])
-  { data: parsed, raw: result[:content], model: result[:model], valid: true, retried: true }
+  retry_content = result.respond_to?(:content) ? result.content : result[:content]
+  retry_model = result.respond_to?(:model_id) ? result.model_id : result[:model]
+
+  parsed = Legion::JSON.load(retry_content)
+  { data: parsed, raw: retry_content, model: retry_model, valid: true, retried: true }
 rescue StandardError => e
   handle_exception(e, level: :warn)
   { data: nil, error: e.message, valid: false }
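
The `respond_to?` pattern in these hunks can be exercised in isolation. A minimal sketch using a stand-in `Struct` rather than RubyLLM's actual `Message` class (names and values illustrative):

```ruby
# Stand-in for a method-access result object like RubyLLM::Message.
Message = Struct.new(:content, :model_id, keyword_init: true)

def extract_content(result)
  # Message-like objects answer #content; plain hashes use #[] access.
  result.respond_to?(:content) ? result.content : result[:content]
end

puts extract_content(Message.new(content: '{"ok":true}', model_id: 'qwen3')) # {"ok":true}
puts extract_content({ content: '{"ok":true}', model: 'qwen3' })             # {"ok":true}
```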
@@ -0,0 +1,114 @@
+# frozen_string_literal: true
+
+require 'faraday'
+
+require 'legion/logging/helper'
+require 'legion/json'
+
+module Legion
+  module LLM
+    module Discovery
+      module Vllm
+        extend Legion::Logging::Helper
+
+        class << self
+          def models
+            ensure_fresh
+            @models || []
+          end
+
+          def model_names
+            models.map { |m| m[:id] }
+          end
+
+          def model_available?(name)
+            model_names.any? { |n| n == name }
+          end
+
+          def max_context(name)
+            model = models.find { |m| m[:id] == name }
+            model&.dig(:max_model_len)
+          end
+
+          def healthy?
+            response = health_connection.get('/health')
+            response.success?
+          rescue StandardError => e
+            handle_exception(e, level: :debug, operation: 'llm.discovery.vllm.healthy')
+            false
+          end
+
+          def refresh!
+            response = connection.get('/v1/models')
+            if response.success?
+              parsed = Legion::JSON.load(response.body)
+              @models = parsed[:data] || []
+              log.debug "[llm][discovery][vllm] model list refreshed count=#{@models.size}"
+            else
+              log.warn "[llm][discovery][vllm] HTTP failure status=#{response.status}"
+              @models ||= []
+            end
+          rescue StandardError => e
+            handle_exception(e, level: :warn, operation: 'llm.discovery.vllm.refresh')
+            @models ||= []
+          ensure
+            @last_refreshed_at = Time.now
+          end
+
+          def reset!
+            @models = nil
+            @last_refreshed_at = nil
+          end
+
+          def stale?
+            return true if @last_refreshed_at.nil?
+
+            ttl = discovery_settings[:refresh_seconds] || 60
+            Time.now - @last_refreshed_at > ttl
+          end
+
+          private
+
+          def ensure_fresh
+            refresh! if stale?
+          end
+
+          def connection
+            Faraday.new(url: vllm_base_url) do |f|
+              f.options.timeout = 3
+              f.options.open_timeout = 2
+              f.adapter Faraday.default_adapter
+            end
+          end
+
+          def health_connection
+            base = vllm_base_url.sub(%r{/+\z}, '').sub(%r{/v1\z}, '')
+            Faraday.new(url: base) do |f|
+              f.options.timeout = 2
+              f.options.open_timeout = 2
+              f.adapter Faraday.default_adapter
+            end
+          end
+
+          def vllm_base_url
+            return 'http://localhost:8000/v1' unless Legion.const_defined?('Settings', false)
+
+            Legion::Settings[:llm].dig(:providers, :vllm, :base_url) || 'http://localhost:8000/v1'
+          rescue StandardError => e
+            handle_exception(e, level: :debug, operation: 'llm.discovery.vllm.base_url')
+            'http://localhost:8000/v1'
+          end
+
+          def discovery_settings
+            return {} unless Legion.const_defined?('Settings', false)
+
+            Legion::Settings[:llm][:discovery] || {}
+          rescue StandardError => e
+            handle_exception(e, level: :debug, operation: 'llm.discovery.vllm.settings')
+            {}
+          end
+        end
+      end
+    end
+  end
+end
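
`refresh!` above stores `parsed[:data]`, and `max_context` reads `:max_model_len` from each entry. For reference, a sketch of the parsed `/v1/models` payload this expects, assuming vLLM's OpenAI-style model list and symbolized keys from `Legion::JSON.load` (values illustrative):

```ruby
# Approximate parsed shape of vLLM's GET /v1/models response.
# vLLM extends the OpenAI model object with max_model_len; values are illustrative.
{
  object: 'list',
  data: [
    {
      id: 'qwen3.6-27b',     # served model name; matched by model_available?/max_context
      object: 'model',
      owned_by: 'vllm',
      max_model_len: 32_768  # context window cached by discovery
    }
  ]
}
```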
@@ -2,6 +2,7 @@
 
 require 'legion/logging/helper'
 require_relative 'discovery/ollama'
+require_relative 'discovery/vllm'
 require_relative 'discovery/system'
 
 module Legion
@@ -23,15 +24,21 @@ module Legion
 
 def run
   log.debug '[llm][discovery] run.enter'
-  return unless Legion::LLM.settings.dig(:providers, :ollama, :enabled)
 
-  Ollama.refresh!
-  System.refresh!
+  if Legion::LLM.settings.dig(:providers, :ollama, :enabled)
+    Ollama.refresh!
+    System.refresh!
+    names = Ollama.model_names
+    log.info "[llm][discovery] ollama model_count=#{names.size} models=#{names.join(', ')}"
+    log.info "[llm][discovery] system total_mb=#{System.total_memory_mb} available_mb=#{System.available_memory_mb}"
+  end
 
-  names = Ollama.model_names
-  count = names.size
-  log.info "[llm][discovery] ollama model_count=#{count} models=#{names.join(', ')}"
-  log.info "[llm][discovery] system total_mb=#{System.total_memory_mb} available_mb=#{System.available_memory_mb}"
+  if Legion::LLM.settings.dig(:providers, :vllm, :enabled)
+    Vllm.refresh!
+    names = Vllm.model_names
+    contexts = names.map { |n| "#{n}(#{Vllm.max_context(n)})" }
+    log.info "[llm][discovery] vllm model_count=#{names.size} models=#{contexts.join(', ')}"
+  end
 rescue StandardError => e
   handle_exception(e, level: :warn, operation: 'llm.discovery.run')
 end
@@ -1030,7 +1030,7 @@ module Legion
   providers.each do |name, config|
     next unless config.is_a?(Hash) && config[:enabled]
     next if exclude.include?(name) || exclude.include?(name.to_s)
-    next if name == :ollama
+    next if %i[ollama vllm].include?(name)
    next unless config[:default_model]
 
     return { provider: name, model: config[:default_model] }
@@ -0,0 +1,37 @@
+# frozen_string_literal: true
+
+module RubyLLM
+  module Providers
+    class Vllm < OpenAI
+      def api_base
+        @config.vllm_api_base
+      end
+
+      def headers
+        return {} unless @config.vllm_api_key
+
+        { 'Authorization' => "Bearer #{@config.vllm_api_key}" }
+      end
+
+      class << self
+        def configuration_options
+          %i[vllm_api_base vllm_api_key]
+        end
+
+        def configuration_requirements
+          %i[vllm_api_base]
+        end
+
+        def local?
+          true
+        end
+
+        def capabilities
+          nil
+        end
+      end
+    end
+  end
+end
+
+RubyLLM::Provider.register :vllm, RubyLLM::Providers::Vllm
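
Once this patch is loaded, the provider can be targeted like any other. A rough usage sketch, assuming RubyLLM's `provider:` and `assume_model_exists:` chat options apply to the patched provider as they do to built-in ones (model name and server illustrative):

```ruby
require 'ruby_llm'
require 'legion/llm/patches/ruby_llm_vllm'

# Point the patched provider at a local vLLM server (assumed running).
RubyLLM.configure do |c|
  c.vllm_api_base = 'http://localhost:8000/v1'
end

# assume_model_exists skips RubyLLM's model-registry lookup for models
# it doesn't know about, such as locally served ones.
chat = RubyLLM.chat(model: 'qwen3.6-27b', provider: :vllm, assume_model_exists: true)
puts chat.ask('Reply with one word: ready?').content
```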
@@ -15,8 +15,8 @@ module Legion
 extend Legion::Logging::Helper
 
 PROVIDER_TIER = { bedrock: :cloud, anthropic: :frontier, openai: :frontier,
-                  gemini: :cloud, azure: :cloud, ollama: :local }.freeze
-PROVIDER_ORDER = %i[ollama bedrock azure gemini anthropic openai].freeze
+                  gemini: :cloud, azure: :cloud, ollama: :local, vllm: :local }.freeze
+PROVIDER_ORDER = %i[ollama vllm bedrock azure gemini anthropic openai].freeze
 
 class << self
   # Resolve an LLM routing intent to a tier/provider/model decision.
@@ -296,8 +296,11 @@ module Legion
 
 def default_provider_for_tier(tier)
   case tier.to_sym
-  when :local, :fleet
+  when :local
     :ollama
+  when :fleet
+    vllm_config = Legion::Settings[:llm].dig(:providers, :vllm)
+    vllm_config.is_a?(Hash) && vllm_config[:enabled] ? :vllm : :ollama
   when :openai_compat
     :openai
   when :cloud
@@ -316,7 +319,13 @@ module Legion
   ollama = Legion::Settings[:llm].dig(:providers, :ollama) || {}
   ollama[:default_model] || 'llama3'
 when :fleet
-  'llama4:70b'
+  vllm_config = Legion::Settings[:llm].dig(:providers, :vllm) || {}
+  if vllm_config[:enabled]
+    vllm_config[:default_model] || 'qwen3.6-27b'
+  else
+    ollama = Legion::Settings[:llm].dig(:providers, :ollama) || {}
+    ollama[:default_model] || 'llama3'
+  end
 when :openai_compat
   'gpt-4o'
 when :cloud
@@ -375,6 +375,12 @@ module Legion
     enabled: false,
     default_model: 'qwen3.5:latest',
     base_url: 'http://localhost:11434'
+  },
+  vllm: {
+    enabled: false,
+    default_model: 'qwen3.6-27b',
+    base_url: 'http://localhost:8000/v1',
+    api_key: nil
   }
 }
 end
@@ -2,6 +2,6 @@
 
 module Legion
   module LLM
-    VERSION = '0.8.24'
+    VERSION = '0.8.26'
   end
 end
data/lib/legion/llm.rb CHANGED
@@ -4,6 +4,7 @@ require 'legion/logging/helper'
 
 require 'ruby_llm'
 require_relative 'llm/patches/ruby_llm_parallel_tools'
+require_relative 'llm/patches/ruby_llm_vllm'
 require_relative 'llm/version'
 require_relative 'llm/errors'
 require_relative 'llm/settings'
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: legion-llm
 version: !ruby/object:Gem::Version
-  version: 0.8.24
+  version: 0.8.26
 platform: ruby
 authors:
 - Esity
@@ -262,6 +262,7 @@ files:
 - lib/legion/llm/discovery.rb
 - lib/legion/llm/discovery/ollama.rb
 - lib/legion/llm/discovery/system.rb
+- lib/legion/llm/discovery/vllm.rb
 - lib/legion/llm/errors.rb
 - lib/legion/llm/fleet.rb
 - lib/legion/llm/fleet/dispatcher.rb
@@ -323,6 +324,7 @@ files:
 - lib/legion/llm/metering/tracker.rb
 - lib/legion/llm/metering/usage.rb
 - lib/legion/llm/patches/ruby_llm_parallel_tools.rb
+- lib/legion/llm/patches/ruby_llm_vllm.rb
 - lib/legion/llm/quality.rb
 - lib/legion/llm/quality/checker.rb
 - lib/legion/llm/quality/confidence/score.rb