RubyGems - lex-llm - Versions diffs - 0.4.10 → 0.4.13 - Mend

lex-llm 0.4.10 → 0.4.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

checksums.yaml +4 -4
data/CHANGELOG.md +12 -0
data/lib/legion/extensions/llm/error.rb +42 -0
data/lib/legion/extensions/llm/provider.rb +2 -0
data/lib/legion/extensions/llm/responses/thinking_extractor.rb +89 -17
data/lib/legion/extensions/llm/stream_accumulator.rb +88 -18
data/lib/legion/extensions/llm/streaming.rb +40 -2
data/lib/legion/extensions/llm/version.rb +1 -1
metadata +1 -1

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: a203372d751b290a71cc289e382d80a49fafcc0687925c02594b8c5cfe6ef7aa
-  data.tar.gz: 95cfd5a03c002a16da80bac58914f1fb808a940db378d99035f97b6256240863
+  metadata.gz: 6d60f78c459fb43344897e6fdba10730b881f698229058a50a1c1be2564539cf
+  data.tar.gz: d7fcedadb69266af972caf1a51d1153bd5270f1fd5e9b45f65d51076fafa07aa
 SHA512:
-  metadata.gz: 645bde1f8e4b6701fa5092f2b92e2867f63d243c239341e691921efa9cc74a861b3382f00665efd8f4d1420976c462f9c47eaa89bdae93ae60655983681bcddc
-  data.tar.gz: f180a90275c427970e6129ae3f0ef285fabb68fa92a97059687fb37fcf9282f5e083b159f3757293a79ae4bf71f263a54fb9387469f55da25a23959e295d2371
+  metadata.gz: c60726bfac3eff11cf37d8035ad78c7437b627f465bad31efdac1be3061fe410dc176d805bd168389859fa94773cd994578d265a6534a8e3feed1d37db517988
+  data.tar.gz: 40439ec46e06530b9e5d287fe8d5980d57b87c2700343b3282c30deb9cd1b241862812e4264a9842d6a1fea20aa9bcb4f580cf08ed31cc61b73c00c2c753c9ce

data/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,17 @@
 # Changelog
+## 0.4.13 - 2026-05-15
+- Strip provider thinking from OpenAI-compatible responses when local models emit `<thinking>` tags or untagged initial reasoning preambles, and keep those hidden from live streaming content deltas.
+## 0.4.12 - 2026-05-15
+- Preserve streamed provider error bodies in a custom Faraday env key so Faraday Net::HTTP finalization cannot replace the buffered body with an empty string before `ErrorMiddleware` parses it.
+## 0.4.11 - 2026-05-15
+- Fix `handle_failed_response` to preserve non-200 streaming error bodies across chunks instead of swallowing `ParseError` and falling through to a generic "An unknown error occurred". Complete JSON error bodies still raise typed provider errors immediately; incomplete bodies are buffered onto the Faraday response env for final middleware parsing, with regex fallback extraction for vLLM-style partial `message` fields when the env cannot carry the buffered body.
 ## 0.4.10 - 2026-05-13
 - Add cache-backed `model_detail` lookup with 24-hour TTL; nil results are not cached; `fetch_model_detail` hook for subclasses to override with live API calls.

data/lib/legion/extensions/llm/error.rb CHANGED Viewed

@@ -54,6 +54,8 @@ module Legion
       # Faraday middleware that maps provider-specific API errors to Legion::Extensions::Llm errors.
       class ErrorMiddleware < Faraday::Middleware
+        STREAM_ERROR_BODY_KEY = :legion_llm_stream_error_body
         def initialize(app, options = {})
           super(app)
           @provider = options[:provider]
@@ -79,6 +81,7 @@ module Legion
           ].freeze
           def parse_error(provider:, response:) # rubocop:disable Metrics/PerceivedComplexity
+            response = response_with_stream_error_body(response)
             message = provider&.parse_error(response)
             case response.status
@@ -116,12 +119,51 @@ module Legion
           private
+          def response_with_stream_error_body(response)
+            return response unless empty_body?(response)
+            stream_body = preserved_stream_error_body(response)
+            return response if stream_body.to_s.empty?
+            ResponseWithBody.new(response, stream_body)
+          end
+          def empty_body?(response)
+            !response.respond_to?(:body) || response.body.to_s.empty?
+          end
+          def preserved_stream_error_body(response)
+            return unless response.respond_to?(:[])
+            response[STREAM_ERROR_BODY_KEY]
+          rescue StandardError
+            nil
+          end
           def context_length_exceeded?(message)
             return false if message.to_s.empty?
             CONTEXT_LENGTH_PATTERNS.any? { |pattern| message.match?(pattern) }
           end
         end
+        ResponseWithBody = Struct.new(:response, :body) do
+          def status = response.status
+          def [](key)
+            response[key] if response.respond_to?(:[])
+          end
+          def method_missing(method_name, ...)
+            return response.public_send(method_name, ...) if response.respond_to?(method_name)
+            super
+          end
+          def respond_to_missing?(method_name, include_private = false)
+            response.respond_to?(method_name, include_private) || super
+          end
+        end
       end
     end
   end

data/lib/legion/extensions/llm/provider.rb CHANGED Viewed

@@ -264,6 +264,8 @@ module Legion
               error = part['error']
               error.is_a?(String) ? error : part.dig('error', 'message')
             end.join('. ')
+          when String
+            body[/"message"\s*:\s*"([^"]{1,500})/, 1] || body
           else
             body
           end

data/lib/legion/extensions/llm/responses/thinking_extractor.rb CHANGED Viewed

@@ -8,9 +8,39 @@ module Legion
         module ThinkingExtractor
           Extraction = Struct.new(:content, :thinking, :signature, :metadata, keyword_init: true)
-          THINK_OPEN = '<think>'
-          THINK_CLOSE = '</think>'
-          THINK_PATTERN = %r{<think>(.*?)</think>}m
+          THINK_TAG_PAIRS = [
+            ['<thinking>', '</thinking>'],
+            ['<think>',    '</think>']
+          ].freeze
+          UNTAGGED_PREAMBLE_MAX_LENGTH = 4_000
+          UNTAGGED_PREAMBLE_STARTS = [
+            'the user',
+            'the request',
+            'the prompt',
+            'the question',
+            'i need',
+            'i should',
+            'i will',
+            "i'll",
+            'i can',
+            'we need',
+            'we should',
+            'we will',
+            "we'll",
+            'we can',
+            'let me'
+          ].freeze
+          UNTAGGED_PREAMBLE_PATTERNS = [
+            /
+              \AThe\s+(?:user|request|prompt|question)\b.*\b
+              (?:let\s+me|i'll|i\s+will|i\s+should|i\s+need|i\s+can|respond|answer|reply)\b
+            /imx,
+            /
+              \A(?:I|We)\s+(?:need|should|will|can)\s+(?:to\s+)?
+              (?:answer|respond|reply|confirm|provide|explain|help)\b
+            /imx,
+            /\ALet me\s+(?:answer|respond|reply|confirm|provide|explain|help)\b/im
+          ].freeze
           THINKING_METADATA_KEYS = %i[
             reasoning_content reasoning thinking thinking_text thinking_signature reasoning_signature thought_signature
           ].freeze
@@ -42,20 +72,45 @@ module Legion
             remaining = content.dup
             remaining = consume_next_segment(remaining, clean, thinking_parts) until remaining.empty?
+            clean, untagged_thinking = extract_untagged_preamble(clean.strip)
+            thinking_parts << untagged_thinking
-            [clean.strip, compact_thinking(thinking_parts)]
+            [clean, compact_thinking(thinking_parts)]
           end
           private_class_method :extract_from_content
+          def extract_untagged_preamble(content)
+            return [content, nil] unless content.is_a?(String)
+            match = content.match(/\A(?<preamble>.+?)\n{2,}(?<visible>.+)\z/m)
+            return [content, nil] unless match
+            preamble = match[:preamble].strip
+            return [content, nil] unless untagged_reasoning_preamble?(preamble)
+            [match[:visible].sub(/\A[[:space:]]+/, '').strip, preamble]
+          end
+          def untagged_reasoning_preamble_candidate?(content)
+            return false unless content.is_a?(String)
+            text = content.lstrip.downcase
+            return false if text.empty?
+            UNTAGGED_PREAMBLE_STARTS.any? do |start|
+              start.start_with?(text) || text.start_with?(start)
+            end
+          end
           def consume_next_segment(remaining, clean, thinking_parts)
-            close_index = remaining.index(THINK_CLOSE)
-            open_index = remaining.index(THINK_OPEN)
-            if close_index && (open_index.nil? || close_index < open_index)
-              thinking_parts << remaining.slice(0, close_index)
-              remaining.slice((close_index + THINK_CLOSE.length)..).to_s.sub(/\A[[:space:]]+/, '')
-            elsif open_index
-              consume_open_think_segment(remaining, open_index, clean, thinking_parts)
+            close_match = next_tag_match(remaining, :close)
+            open_match = next_tag_match(remaining, :open)
+            if close_match && (open_match.nil? || close_match[:index] < open_match[:index])
+              thinking_parts << remaining.slice(0, close_match[:index])
+              remaining.slice((close_match[:index] + close_match[:tag].length)..).to_s.sub(/\A[[:space:]]+/, '')
+            elsif open_match
+              consume_open_think_segment(remaining, open_match, clean, thinking_parts)
             else
               clean << remaining
               +''
@@ -63,20 +118,37 @@ module Legion
           end
           private_class_method :consume_next_segment
-          def consume_open_think_segment(remaining, open_index, clean, thinking_parts)
-            clean << remaining.slice(0, open_index)
-            after_open = remaining.slice((open_index + THINK_OPEN.length)..).to_s
-            close_index = after_open.index(THINK_CLOSE)
+          def consume_open_think_segment(remaining, open_match, clean, thinking_parts)
+            clean << remaining.slice(0, open_match[:index])
+            after_open = remaining.slice((open_match[:index] + open_match[:tag].length)..).to_s
+            close_index = after_open.index(open_match[:close_tag])
             unless close_index
               thinking_parts << after_open
               return +''
             end
             thinking_parts << after_open.slice(0, close_index)
-            after_open.slice((close_index + THINK_CLOSE.length)..).to_s
+            after_open.slice((close_index + open_match[:close_tag].length)..).to_s
           end
           private_class_method :consume_open_think_segment
+          def next_tag_match(text, type)
+            matches = THINK_TAG_PAIRS.filter_map do |open_tag, close_tag|
+              tag = type == :open ? open_tag : close_tag
+              index = text.index(tag)
+              { index: index, tag: tag, close_tag: close_tag } if index
+            end
+            matches.min_by { |match| match[:index] }
+          end
+          private_class_method :next_tag_match
+          def untagged_reasoning_preamble?(preamble)
+            return false if preamble.length > UNTAGGED_PREAMBLE_MAX_LENGTH
+            UNTAGGED_PREAMBLE_PATTERNS.any? { |pattern| preamble.match?(pattern) }
+          end
+          private_class_method :untagged_reasoning_preamble?
           def extract_metadata_thinking(metadata)
             compact_thinking(
               [

data/lib/legion/extensions/llm/stream_accumulator.rb CHANGED Viewed

@@ -21,6 +21,9 @@ module Legion
           @thinking_tokens = nil
           @inside_think_tag = false
           @pending_think_tag = +''
+          @active_think_close_tag = nil
+          @untagged_preamble_pending = true
+          @untagged_preamble_buffer = +''
           @latest_tool_call_id = nil
         end
@@ -55,6 +58,8 @@ module Legion
         end
         def to_message(response)
+          flush_pending_untagged_preamble
           Message.new(
             role: :assistant,
             content: content.empty? ? nil : content,
@@ -171,14 +176,63 @@ module Legion
         def append_text_with_thinking(text)
           content_chunk, thinking_chunk = extract_think_tags(text)
+          content_chunk, untagged_thinking = extract_untagged_preamble(content_chunk)
           @content << content_chunk
           @last_content_delta << content_chunk
+          if untagged_thinking
+            @thinking_text << untagged_thinking
+            @last_thinking_delta << untagged_thinking
+          end
           return unless thinking_chunk
           @thinking_text << thinking_chunk
           @last_thinking_delta << thinking_chunk
         end
+        def extract_untagged_preamble(content_chunk)
+          return [content_chunk, nil] unless @untagged_preamble_pending
+          return [content_chunk, nil] unless @content.empty? && @thinking_text.empty?
+          return [content_chunk, nil] if content_chunk.empty?
+          candidate = @untagged_preamble_buffer + content_chunk
+          return release_untagged_preamble(candidate) unless candidate_untagged_preamble?(candidate)
+          content, thinking = Responses::ThinkingExtractor.extract_untagged_preamble(candidate)
+          return release_untagged_preamble(content, thinking) if thinking
+          return release_untagged_preamble(candidate) if complete_untagged_preamble_candidate?(candidate)
+          @untagged_preamble_buffer = candidate
+          ['', nil]
+        end
+        def candidate_untagged_preamble?(candidate)
+          Responses::ThinkingExtractor.untagged_reasoning_preamble_candidate?(candidate)
+        end
+        def complete_untagged_preamble_candidate?(candidate)
+          candidate.match?(/\n{2,}/) || candidate.length > Responses::ThinkingExtractor::UNTAGGED_PREAMBLE_MAX_LENGTH
+        end
+        def release_untagged_preamble(content, thinking = nil)
+          @untagged_preamble_pending = false
+          @untagged_preamble_buffer = +''
+          [content, thinking]
+        end
+        def flush_pending_untagged_preamble
+          return if @untagged_preamble_buffer.empty?
+          content, thinking = Responses::ThinkingExtractor.extract_untagged_preamble(@untagged_preamble_buffer)
+          if thinking
+            @content << content
+            @thinking_text << thinking
+          else
+            @content << @untagged_preamble_buffer
+          end
+          @untagged_preamble_buffer = +''
+          @untagged_preamble_pending = false
+        end
         def append_thinking_from_chunk(chunk)
           thinking = chunk.thinking
           return unless thinking
@@ -191,8 +245,6 @@ module Legion
         end
         def extract_think_tags(text)
-          start_tag = '<think>'
-          end_tag = '</think>'
           remaining = @pending_think_tag + text
           @pending_think_tag = +''
@@ -201,9 +253,9 @@ module Legion
           until remaining.empty?
             remaining = if @inside_think_tag
-                          consume_think_content(remaining, end_tag, thinking)
+                          consume_think_content(remaining, @active_think_close_tag, thinking)
                         else
-                          consume_non_think_content(remaining, start_tag, output)
+                          consume_non_think_content(remaining, output)
                         end
           end
@@ -215,41 +267,59 @@ module Legion
           if end_index
             thinking << remaining.slice(0, end_index)
             @inside_think_tag = false
+            @active_think_close_tag = nil
             remaining.slice((end_index + end_tag.length)..) || +''
           else
-            suffix_len = longest_suffix_prefix(remaining, end_tag)
+            suffix_len = longest_suffix_prefix(remaining, [end_tag])
             thinking << remaining.slice(0, remaining.length - suffix_len)
             @pending_think_tag = remaining.slice(-suffix_len, suffix_len)
             +''
           end
         end
-        def consume_non_think_content(remaining, start_tag, output)
-          unmatched_close = remaining.index('</think>')
-          start_index = remaining.index(start_tag)
-          if unmatched_close && (start_index.nil? || unmatched_close < start_index)
+        def consume_non_think_content(remaining, output)
+          unmatched_close = next_stream_tag_match(remaining, :close)
+          start_match = next_stream_tag_match(remaining, :open)
+          if unmatched_close && (start_match.nil? || unmatched_close[:index] < start_match[:index])
             consume_unmatched_think_close(remaining, unmatched_close)
-          elsif start_index
-            output << remaining.slice(0, start_index)
+          elsif start_match
+            output << remaining.slice(0, start_match[:index])
             @inside_think_tag = true
-            remaining.slice((start_index + start_tag.length)..) || +''
+            @active_think_close_tag = start_match[:close_tag]
+            remaining.slice((start_match[:index] + start_match[:tag].length)..) || +''
           else
-            suffix_len = longest_suffix_prefix(remaining, start_tag)
+            suffix_len = longest_suffix_prefix(remaining, stream_tag_tokens)
             output << remaining.slice(0, remaining.length - suffix_len)
             @pending_think_tag = remaining.slice(-suffix_len, suffix_len)
             +''
           end
         end
-        def consume_unmatched_think_close(remaining, close_index)
-          end_tag = '</think>'
-          thinking = remaining.slice(0, close_index)
+        def consume_unmatched_think_close(remaining, close_match)
+          thinking = remaining.slice(0, close_match[:index])
           @thinking_text << thinking
           @last_thinking_delta << thinking
-          remaining.slice((close_index + end_tag.length)..).to_s.sub(/\A[[:space:]]+/, '')
+          remaining.slice((close_match[:index] + close_match[:tag].length)..).to_s.sub(/\A[[:space:]]+/, '')
+        end
+        def next_stream_tag_match(text, type)
+          matches = Responses::ThinkingExtractor::THINK_TAG_PAIRS.filter_map do |open_tag, close_tag|
+            tag = type == :open ? open_tag : close_tag
+            index = text.index(tag)
+            { index: index, tag: tag, close_tag: close_tag } if index
+          end
+          matches.min_by { |match| match[:index] }
+        end
+        def stream_tag_tokens
+          Responses::ThinkingExtractor::THINK_TAG_PAIRS.flat_map { |open_tag, close_tag| [open_tag, close_tag] }
+        end
+        def longest_suffix_prefix(text, tags)
+          tags.map { |tag| longest_suffix_prefix_for_tag(text, tag) }.max || 0
         end
-        def longest_suffix_prefix(text, tag)
+        def longest_suffix_prefix_for_tag(text, tag)
           max = [text.length, tag.length - 1].min
           max.downto(1) do |len|
             return len if text.end_with?(tag[0, len])

data/lib/legion/extensions/llm/streaming.rb CHANGED Viewed

@@ -93,10 +93,48 @@ module Legion
         def handle_failed_response(chunk, buffer, env)
           buffer << chunk
+          body_persisted = persist_failed_response_body(buffer, env)
           error_data = Legion::JSON.parse(buffer, symbolize_names: false)
           handle_parsed_error(error_data, env)
-        rescue Legion::JSON::ParseError => e
-          handle_exception(e, level: :warn, handled: true, operation: 'llm.streaming.handle_failed_response')
+        rescue Legion::JSON::ParseError
+          return if body_persisted
+          raise_partial_streaming_error(buffer, env)
+        end
+        def persist_failed_response_body(buffer, env)
+          custom_persisted = persist_failed_response_custom_body?(buffer, env)
+          body_persisted = persist_failed_response_env_body?(buffer, env)
+          custom_persisted || body_persisted
+        end
+        def persist_failed_response_env_body?(buffer, env)
+          return false unless env.respond_to?(:body=)
+          env.body = buffer.dup
+          true
+        end
+        def persist_failed_response_custom_body?(buffer, env)
+          return false unless env.respond_to?(:[]=)
+          env[ErrorMiddleware::STREAM_ERROR_BODY_KEY] = buffer.dup
+          true
+        rescue StandardError
+          false
+        end
+        def raise_partial_streaming_error(buffer, env)
+          partial = buffer[/"message"\s*:\s*"([^"]{1,200})/, 1]
+          status  = env&.status || 0
+          msg     = if partial
+                      "Provider error (status #{status}): #{partial}"
+                    else
+                      "Provider error (status #{status}) - response body incomplete"
+                    end
+          log.warn "[llm][streaming] action=handle_failed_response status=#{status} " \
+                   "partial_body=#{buffer.length}b msg=#{partial.inspect}"
+          raise Legion::Extensions::Llm::ServerError, msg
         end
         def handle_sse(chunk, parser, env, &)

data/lib/legion/extensions/llm/version.rb CHANGED Viewed

@@ -3,7 +3,7 @@
 module Legion
   module Extensions
     module Llm
-      VERSION = '0.4.10'
+      VERSION = '0.4.13'
     end
   end
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: lex-llm
 version: !ruby/object:Gem::Version
-  version: 0.4.10
+  version: 0.4.13
 platform: ruby
 authors:
 - LegionIO