RubyGems - lex-llm-vllm - Versions diffs - 0.3.0 → 0.3.5 - Mend

lex-llm-vllm 0.3.0 → 0.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

checksums.yaml +4 -4
data/CHANGELOG.md +20 -0
data/Gemfile +0 -6
data/lib/legion/extensions/llm/vllm/actors/discovery_refresh.rb +6 -0
data/lib/legion/extensions/llm/vllm/provider.rb +114 -5
data/lib/legion/extensions/llm/vllm/translator.rb +44 -37
data/lib/legion/extensions/llm/vllm/version.rb +1 -1
data/lib/legion/extensions/llm/vllm.rb +14 -5
metadata +1 -1

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 172c35debe332979f48575e43bd59c04828449a41a195f3d899bc15afa18bdb2
-  data.tar.gz: c423c24ff7a5e4b33f1b6e562b50c196d2870b347bbcad61b38cd228d54ee318
+  metadata.gz: 20aa357958da5d294b132bdb5d3bea065d0192b42051deae734a3e1b8af592e5
+  data.tar.gz: 203a81b2aa087bc2cdabbe26cd71ea88e1bd7b1aaa9e44a95a5b43c35ab2c344
 SHA512:
-  metadata.gz: dbf9166b8302c7dc786562b7e2e17381b4cf33b570825570cc153dc438bfbef4991e53b0e86811821f08b2a4a00e3b6522bac903764a6bfe3e90f04be4d556ea
-  data.tar.gz: 5ee6cda495f98e9f68c4b3ea79b1a0fa28ab833113a00c3aa9a43e9a67e94c4aa79995faf8d8745e063d7c480d77dde246940d8d6b9c3570d678c39be19b496f
+  metadata.gz: 4ab5021f00c1f6652147297c4f9fc56a2f501eaa1468c0f098aff78d83426706069e4cca7184c96647135ff951dc4965d871352144826d4cf6c8961e60616862
+  data.tar.gz: 1e52f17e28c52ddae6317c3fd6ea6fd58b54a71a46d77c80f00544fb35333bf0340c1498b2f9db6419c811765c9926e0bd20228c7bc44badae04dd68a9fd178c

data/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,25 @@
 # Changelog
+## 0.3.5 - 2026-06-16
+- Extract `vllm_api_key` from `credentials: { api_key: ... }` in instance settings so Bearer auth works with the standard settings layout.
+- Fix `Dalli::RingError` crash in `offering_from_model` when cache server is unavailable; cache write is now best-effort.
+## 0.3.3 - 2026-06-16
+- Dependency updates (concurrent-ruby 1.3.7, faraday 2.14.3, rubocop 1.88.0) and code quality improvements.
+## 0.3.2 - 2026-06-15
+- **CapabilityPolicy integration** — Optional capabilities default false; use `CapabilityPolicy.resolve` for offerings. Static all-true predicates no longer used for routing truth. Settings overrides at provider/instance/model level supported.
+## 0.3.1 - 2026-06-13
+- **Gemfile cleanup** — Remove local path overrides; dependencies resolve from gemspec via rubygems.
+- **Bug fix** — Restore vLLM streaming; private `ThinkingExtractor` call was killing every text delta.
+- **Canonical tool normalization** — Use canonical normalization for tool parameter schemas.
+- 155 examples, 0 failures; 17 files, 0 rubocop offenses.
 ## 0.3.0 - 2026-06-10
 - Add canonical provider translator (`Translator`) implementing `render_request`,

data/Gemfile CHANGED Viewed

@@ -2,12 +2,6 @@
 source 'https://rubygems.org'
-group :test do
-  transport_path = ENV.fetch('LEGION_TRANSPORT_PATH', File.expand_path('../../legion-transport', __dir__))
-  gem 'legion-transport', path: transport_path if File.directory?(transport_path)
-  # lex-llm >= 0.5.0 carries canonical types + conformance kit (released on rubygems.org)
-end
 gemspec
 group :development do

data/lib/legion/extensions/llm/vllm/actors/discovery_refresh.rb CHANGED Viewed

@@ -37,6 +37,12 @@ module Legion
               return unless defined?(Legion::LLM::Discovery)
               Legion::LLM::Discovery.refresh_discovered_models!(provider: :vllm)
+              if defined?(Legion::LLM::Router) && Legion::LLM::Router.respond_to?(:populate_auto_rules)
+                Legion::LLM::Router.populate_auto_rules(Legion::LLM::Discovery.discovered_instances)
+              end
+              if defined?(Legion::LLM::Inventory) && Legion::LLM::Inventory.respond_to?(:invalidate_offerings_cache!)
+                Legion::LLM::Inventory.invalidate_offerings_cache!
+              end
             rescue StandardError => e
               handle_exception(e, level: :warn, handled: true, operation: 'vllm.actor.discovery_refresh')
             end

data/lib/legion/extensions/llm/vllm/provider.rb CHANGED Viewed

@@ -163,7 +163,29 @@ module Legion
           def offering_from_model(model_info)
             ctx = model_info.context_length
-            cache_set(model_detail_cache_key(model_info.id), { context_window: ctx }, ttl: 86_400) if ctx
+            if ctx
+              begin
+                cache_set(model_detail_cache_key(model_info.id), { context_window: ctx }, ttl: 86_400)
+              rescue StandardError => e
+                handle_exception(e, level: :debug, handled: true, operation: 'vllm.cache_model_detail')
+              end
+            end
+            policy = Legion::Extensions::Llm::CapabilityPolicy.resolve(
+              real: extract_real_capabilities(model_info),
+              provider_catalog: {},
+              probe: {},
+              provider_envelope: provider_envelope_capabilities,
+              provider_config: provider_capability_config,
+              instance_config: instance_capability_config,
+              model_config: model_capability_config(model_info.id)
+            )
+            build_offering(model_info, policy, ctx)
+          end
+          def build_offering(model_info, policy, ctx) # rubocop:disable Metrics/AbcSize
+            max_out = model_info.respond_to?(:max_output_tokens) ? model_info.max_output_tokens : nil
             Legion::Extensions::Llm::Routing::ModelOffering.new(
               provider_family: :vllm,
@@ -171,13 +193,82 @@ module Legion
               transport: offering_transport,
               tier: offering_tier,
               model: model_info.id,
+              canonical_model_alias: model_info.respond_to?(:name) ? model_info.name : nil,
+              model_family: model_info.respond_to?(:family) ? model_info.family : nil,
               usage_type: model_info.embedding? ? :embedding : :inference,
-              capabilities: model_info.capabilities.map(&:to_s),
-              limits: { context_window: ctx }.compact,
-              metadata: { context_length: ctx }
+              capabilities: policy[:capabilities],
+              capability_sources: policy[:sources],
+              limits: { context_window: ctx, max_output_tokens: max_out }.compact,
+              metadata: offering_metadata_for(model_info).merge(capability_sources: policy[:sources])
             )
           end
+          def extract_real_capabilities(model_info)
+            return {} unless model_info.respond_to?(:metadata)
+            meta = model_info.metadata
+            meta_caps = meta.is_a?(Hash) ? meta[:capabilities] : nil
+            meta_caps.is_a?(Hash) ? meta_caps : {}
+          end
+          def provider_envelope_capabilities
+            { streaming: true }
+          end
+          def provider_capability_config
+            return {} unless defined?(Legion::Extensions::Llm::CredentialSources)
+            conf = Legion::Extensions::Llm::CredentialSources.setting(:extensions, :llm, :vllm)
+            conf.is_a?(Hash) ? conf.to_h.except(:instances, 'instances') : {}
+          rescue StandardError => e
+            handle_exception(e, level: :debug, handled: true, operation: 'vllm.provider_capability_config')
+            {}
+          end
+          def instance_capability_config
+            cfg = config
+            result = {}
+            %i[capabilities enable_thinking enable_tools enable_streaming enable_vision enable_embeddings
+               thinking_flag tools_flag streaming_flag vision_flag embedding_flag embeddings_flag
+               tool_flag images_flag image_flag].each do |key|
+              next unless cfg.respond_to?(key)
+              val = cfg.send(key)
+              result[key] = val unless val.nil?
+            rescue StandardError
+              next
+            end
+            result
+          end
+          def model_capability_config(model_id)
+            models_conf = resolve_models_config
+            return {} unless models_conf.respond_to?(:to_h)
+            hash = models_conf.to_h
+            hash[model_id.to_s] || hash[model_id.to_sym] || {}
+          rescue StandardError => e
+            handle_exception(e, level: :debug, handled: true, operation: 'vllm.model_capability_config')
+            {}
+          end
+          def resolve_models_config
+            return config.models if config.respond_to?(:models)
+            return config[:models] if config.respond_to?(:[])
+            nil
+          end
+          def offering_metadata_for(model_info)
+            {
+              raw_model: model_info.id,
+              parameter_count: model_info.respond_to?(:parameter_count) ? model_info.parameter_count : nil,
+              parameter_size: model_info.respond_to?(:parameter_size) ? model_info.parameter_size : nil,
+              quantization: model_info.respond_to?(:quantization) ? model_info.quantization : nil,
+              size_bytes: model_info.respond_to?(:size_bytes) ? model_info.size_bytes : nil
+            }.compact
+          end
           # ── Canonical bridge: legacy provider API → Canonical::Request ──
           # rubocop:disable Metrics/ParameterLists, Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity -- bridge method can be complex
@@ -281,7 +372,7 @@ module Legion
               role: :assistant,
               content: content,
               model_id: raw_data['model'],
-              tool_calls: nil,
+              tool_calls: legacy_chunk_tool_calls(canonical),
               thinking: thinking,
               input_tokens: usage.respond_to?(:input_tokens) ? usage.input_tokens : nil,
               output_tokens: usage.respond_to?(:output_tokens) ? usage.output_tokens : nil,
@@ -289,6 +380,24 @@ module Legion
             )
           end
+          # Map a canonical tool_call_delta onto the legacy chunk tool_calls hash.
+          # Fragment semantics matter: an entry with a non-nil id starts a new tool
+          # call in the StreamAccumulator; a nil id appends the raw arguments
+          # fragment to the most recently started call.
+          def legacy_chunk_tool_calls(canonical)
+            return nil unless canonical.type == :tool_call_delta && canonical.tool_call
+            tc = canonical.tool_call
+            key = (tc.id || tc.name || :fragment).to_s.to_sym
+            {
+              key => Legion::Extensions::Llm::ToolCall.new(
+                id: tc.id,
+                name: tc.name,
+                arguments: tc.arguments
+              )
+            }
+          end
           # ── Tool choice helpers ──
           def format_tool_choice_from_prefs(tool_prefs)

data/lib/legion/extensions/llm/vllm/translator.rb CHANGED Viewed

@@ -164,24 +164,8 @@ module Legion
               )
             end
-            tool_calls = delta['tool_calls']
-            unless Array(tool_calls).empty?
-              first_call = tool_calls.first
-              function = first_call.fetch('function', {})
-              tc = Canonical::ToolCall.build(
-                id: (first_call['id'] || function['name'] || 'synthesized').to_s,
-                name: function['name'].to_s,
-                arguments: parse_tool_arguments(function['arguments']),
-                source: :client
-              )
-              return Canonical::Chunk.tool_call_delta(
-                tool_call: tc,
-                request_id: request_id,
-                block_index: first_call['index']
-              )
-            end
+            tool_calls = Array(delta['tool_calls'])
+            return build_tool_call_delta_chunk(tool_calls.first, request_id) unless tool_calls.empty?
             # Thinking delta from reasoning_content
             reasoning_content = delta['reasoning_content'] || delta['reasoning']
@@ -227,7 +211,8 @@ module Legion
           # ── Message formatting ──
           def format_messages(request)
-            messages = format_request_messages(request.messages)
+            non_system = request.messages&.reject { |m| m.role.to_s == 'system' } || []
+            messages = format_request_messages(non_system)
             if request.system.to_s.strip.empty?
               messages
@@ -345,7 +330,8 @@ module Legion
           def format_message_tool_calls(tool_calls)
             return [] if tool_calls.empty?
-            tool_calls.map { |tc| format_tool_call_for_history(tc) }
+            tc_array = tool_calls.is_a?(Hash) ? tool_calls.values : Array(tool_calls)
+            tc_array.map { |tc| format_tool_call_for_history(tc) }
           end
           def format_tool_call_for_history(tool_call_entry)
@@ -387,10 +373,9 @@ module Legion
               name = tool_hash[:name] || tool_hash['name']
               description = (tool_hash[:description] || tool_hash['description'] || '').to_s
-              parameters = tool_hash[:parameters] || tool_hash[:input_schema] ||
-                           { type: 'object', properties: {} }
-              parameters = parameters.to_h if parameters.respond_to?(:to_h) && !parameters.is_a?(Hash)
-              parameters = { type: 'object', properties: {} } unless parameters.is_a?(Hash)
+              raw_params = tool_hash[:parameters] || tool_hash[:input_schema]
+              raw_params = raw_params.to_h if raw_params.respond_to?(:to_h) && !raw_params.is_a?(Hash)
+              parameters = Legion::Extensions::Llm::Canonical::ToolDefinition.normalize_parameters(raw_params)
               {
                 type: 'function',
@@ -633,26 +618,48 @@ module Legion
             )
           end
+          # Build a tool_call_delta chunk preserving OpenAI streaming fragment
+          # semantics: the opening fragment carries id + name; continuation
+          # fragments carry id: nil and a raw partial-JSON arguments string.
+          # The StreamAccumulator keys off a nil id to append fragments to the
+          # current tool call, so the id must NOT be synthesized here.
+          def build_tool_call_delta_chunk(first_call, request_id)
+            function = first_call.fetch('function', {})
+            tc = Canonical::ToolCall.new(
+              id: first_call['id'], exchange_id: nil,
+              name: function['name'], arguments: function['arguments'].to_s,
+              source: :client, status: nil, duration_ms: nil, result: nil,
+              error: nil, started_at: nil, finished_at: nil, category: nil,
+              data_handling_classification: nil, policy_decision: nil
+            )
+            Canonical::Chunk.tool_call_delta(
+              tool_call: tc,
+              request_id: request_id,
+              block_index: first_call['index']
+            )
+          end
           def empty_delta?(delta)
             (delta['content'].nil? || delta['content'].to_s.empty?) &&
               (delta['tool_calls'].nil? || Array(delta['tool_calls']).empty?) &&
               (delta['reasoning_content'].nil? || delta['reasoning_content'].to_s.empty?)
           end
+          # Per-chunk think-tag extraction is structurally impossible while streaming:
+          # tags arrive split across SSE chunks, and ThinkingExtractor strips per-chunk
+          # whitespace, corrupting reassembled text. Emit the raw delta unmodified —
+          # the StreamAccumulator extracts think tags statefully across deltas.
+          # (Previously called ThinkingExtractor.extract_from_content, which is
+          # private_class_method in lex-llm >= 0.5.0 and raised NoMethodError on
+          # every streamed text delta, silently killing all vLLM streaming.)
           def parse_text_delta_with_thinking(content, request_id, data)
-            extraction = Responses::ThinkingExtractor.extract_from_content(content)
-            clean_text = extraction[0]
-            thinking_text = extraction[1]
-            if thinking_text && !thinking_text.empty?
-              Canonical::Chunk.thinking_delta(delta: thinking_text, request_id: request_id)
-            else
-              Canonical::Chunk.text_delta(
-                delta: clean_text || content,
-                request_id: request_id,
-                index: data['index']
-              )
-            end
+            Canonical::Chunk.text_delta(
+              delta: content,
+              request_id: request_id,
+              index: data['index']
+            )
           end
           # Parse a canonical-form chunk (from conformance kit fixtures).

data/lib/legion/extensions/llm/vllm/version.rb CHANGED Viewed

@@ -4,7 +4,7 @@ module Legion
   module Extensions
     module Llm
       module Vllm
-        VERSION = '0.3.0'
+        VERSION = '0.3.5'
       end
     end
   end

data/lib/legion/extensions/llm/vllm.rb CHANGED Viewed

@@ -16,7 +16,7 @@ module Legion
         extend Legion::Extensions::Llm::AutoRegistration
         PROVIDER_FAMILY = :vllm
-        DEFAULT_INSTANCE_TIER = { tier: :direct, capabilities: %i[completion streaming vision tools] }.freeze
+        DEFAULT_INSTANCE_TIER = { tier: :direct, capabilities: {}, provider_capabilities: { streaming: true } }.freeze
         def self.default_settings
           ::Legion::Extensions::Llm.provider_settings(
@@ -32,10 +32,7 @@ module Legion
               fleet: {
                 enabled: false,
                 respond_to_requests: false,
-                capabilities: %i[chat stream_chat embed],
-                lanes: [],
-                concurrency: 1,
-                queue_suffix: nil
+                capabilities: %i[chat stream_chat embed]
               }
             }
           )
@@ -74,10 +71,19 @@ module Legion
         def self.normalize_instance_config(config)
           normalized = config.to_h.transform_keys(&:to_sym)
           resolve_api_base_aliases(normalized)
+          resolve_credentials(normalized)
           normalized[:tier] ||= infer_tier_from_endpoint(normalized[:vllm_api_base])
           normalized
         end
+        def self.resolve_credentials(normalized)
+          creds = normalized.delete(:credentials)
+          return unless creds.is_a?(Hash)
+          creds = creds.transform_keys(&:to_sym)
+          normalized[:vllm_api_key] ||= creds[:api_key]
+        end
         def self.resolve_api_base_aliases(normalized)
           normalized[:vllm_api_base] ||= normalized.delete(:base_url)
           normalized[:vllm_api_base] ||= normalized.delete(:api_base)
@@ -93,12 +99,15 @@ module Legion
           return :direct if url.nil? || url.to_s.empty?
           require 'uri'
+          require_relative 'vllm/actors/discovery_refresh'
           host = URI.parse(url.to_s).host.to_s.downcase
           %w[localhost 127.0.0.1 ::1].include?(host) ? :local : :direct
         rescue URI::InvalidURIError => e
           handle_exception(e, level: :debug, handled: true, operation: 'vllm.infer_tier_from_endpoint')
           :direct
         end
+        Legion::Extensions::Llm::Configuration.register_provider_options(Provider.configuration_options)
       end
     end
   end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: lex-llm-vllm
 version: !ruby/object:Gem::Version
-  version: 0.3.0
+  version: 0.3.5
 platform: ruby
 authors:
 - LegionIO