lex-llm 0.4.10 → 0.4.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a203372d751b290a71cc289e382d80a49fafcc0687925c02594b8c5cfe6ef7aa
4
- data.tar.gz: 95cfd5a03c002a16da80bac58914f1fb808a940db378d99035f97b6256240863
3
+ metadata.gz: 6d60f78c459fb43344897e6fdba10730b881f698229058a50a1c1be2564539cf
4
+ data.tar.gz: d7fcedadb69266af972caf1a51d1153bd5270f1fd5e9b45f65d51076fafa07aa
5
5
  SHA512:
6
- metadata.gz: 645bde1f8e4b6701fa5092f2b92e2867f63d243c239341e691921efa9cc74a861b3382f00665efd8f4d1420976c462f9c47eaa89bdae93ae60655983681bcddc
7
- data.tar.gz: f180a90275c427970e6129ae3f0ef285fabb68fa92a97059687fb37fcf9282f5e083b159f3757293a79ae4bf71f263a54fb9387469f55da25a23959e295d2371
6
+ metadata.gz: c60726bfac3eff11cf37d8035ad78c7437b627f465bad31efdac1be3061fe410dc176d805bd168389859fa94773cd994578d265a6534a8e3feed1d37db517988
7
+ data.tar.gz: 40439ec46e06530b9e5d287fe8d5980d57b87c2700343b3282c30deb9cd1b241862812e4264a9842d6a1fea20aa9bcb4f580cf08ed31cc61b73c00c2c753c9ce
data/CHANGELOG.md CHANGED
@@ -1,5 +1,17 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.4.13 - 2026-05-15
4
+
5
+ - Strip provider thinking from OpenAI-compatible responses when local models emit `<thinking>` tags or untagged initial reasoning preambles, and keep those hidden from live streaming content deltas.
6
+
7
+ ## 0.4.12 - 2026-05-15
8
+
9
+ - Preserve streamed provider error bodies in a custom Faraday env key so Faraday Net::HTTP finalization cannot replace the buffered body with an empty string before `ErrorMiddleware` parses it.
10
+
11
+ ## 0.4.11 - 2026-05-15
12
+
13
+ - Fix `handle_failed_response` to preserve non-200 streaming error bodies across chunks instead of swallowing `ParseError` and falling through to a generic "An unknown error occurred". Complete JSON error bodies still raise typed provider errors immediately; incomplete bodies are buffered onto the Faraday response env for final middleware parsing, with regex fallback extraction for vLLM-style partial `message` fields when the env cannot carry the buffered body.
14
+
3
15
  ## 0.4.10 - 2026-05-13
4
16
 
5
17
  - Add cache-backed `model_detail` lookup with 24-hour TTL; nil results are not cached; `fetch_model_detail` hook for subclasses to override with live API calls.
@@ -54,6 +54,8 @@ module Legion
54
54
 
55
55
  # Faraday middleware that maps provider-specific API errors to Legion::Extensions::Llm errors.
56
56
  class ErrorMiddleware < Faraday::Middleware
57
+ STREAM_ERROR_BODY_KEY = :legion_llm_stream_error_body
58
+
57
59
  def initialize(app, options = {})
58
60
  super(app)
59
61
  @provider = options[:provider]
@@ -79,6 +81,7 @@ module Legion
79
81
  ].freeze
80
82
 
81
83
  def parse_error(provider:, response:) # rubocop:disable Metrics/PerceivedComplexity
84
+ response = response_with_stream_error_body(response)
82
85
  message = provider&.parse_error(response)
83
86
 
84
87
  case response.status
@@ -116,12 +119,51 @@ module Legion
116
119
 
117
120
  private
118
121
 
122
+ def response_with_stream_error_body(response)
123
+ return response unless empty_body?(response)
124
+
125
+ stream_body = preserved_stream_error_body(response)
126
+ return response if stream_body.to_s.empty?
127
+
128
+ ResponseWithBody.new(response, stream_body)
129
+ end
130
+
131
+ def empty_body?(response)
132
+ !response.respond_to?(:body) || response.body.to_s.empty?
133
+ end
134
+
135
+ def preserved_stream_error_body(response)
136
+ return unless response.respond_to?(:[])
137
+
138
+ response[STREAM_ERROR_BODY_KEY]
139
+ rescue StandardError
140
+ nil
141
+ end
142
+
119
143
  def context_length_exceeded?(message)
120
144
  return false if message.to_s.empty?
121
145
 
122
146
  CONTEXT_LENGTH_PATTERNS.any? { |pattern| message.match?(pattern) }
123
147
  end
124
148
  end
149
+
150
+ ResponseWithBody = Struct.new(:response, :body) do
151
+ def status = response.status
152
+
153
+ def [](key)
154
+ response[key] if response.respond_to?(:[])
155
+ end
156
+
157
+ def method_missing(method_name, ...)
158
+ return response.public_send(method_name, ...) if response.respond_to?(method_name)
159
+
160
+ super
161
+ end
162
+
163
+ def respond_to_missing?(method_name, include_private = false)
164
+ response.respond_to?(method_name, include_private) || super
165
+ end
166
+ end
125
167
  end
126
168
  end
127
169
  end
@@ -264,6 +264,8 @@ module Legion
264
264
  error = part['error']
265
265
  error.is_a?(String) ? error : part.dig('error', 'message')
266
266
  end.join('. ')
267
+ when String
268
+ body[/"message"\s*:\s*"([^"]{1,500})/, 1] || body
267
269
  else
268
270
  body
269
271
  end
@@ -8,9 +8,39 @@ module Legion
8
8
  module ThinkingExtractor
9
9
  Extraction = Struct.new(:content, :thinking, :signature, :metadata, keyword_init: true)
10
10
 
11
- THINK_OPEN = '<think>'
12
- THINK_CLOSE = '</think>'
13
- THINK_PATTERN = %r{<think>(.*?)</think>}m
11
+ THINK_TAG_PAIRS = [
12
+ ['<thinking>', '</thinking>'],
13
+ ['<think>', '</think>']
14
+ ].freeze
15
+ UNTAGGED_PREAMBLE_MAX_LENGTH = 4_000
16
+ UNTAGGED_PREAMBLE_STARTS = [
17
+ 'the user',
18
+ 'the request',
19
+ 'the prompt',
20
+ 'the question',
21
+ 'i need',
22
+ 'i should',
23
+ 'i will',
24
+ "i'll",
25
+ 'i can',
26
+ 'we need',
27
+ 'we should',
28
+ 'we will',
29
+ "we'll",
30
+ 'we can',
31
+ 'let me'
32
+ ].freeze
33
+ UNTAGGED_PREAMBLE_PATTERNS = [
34
+ /
35
+ \AThe\s+(?:user|request|prompt|question)\b.*\b
36
+ (?:let\s+me|i'll|i\s+will|i\s+should|i\s+need|i\s+can|respond|answer|reply)\b
37
+ /imx,
38
+ /
39
+ \A(?:I|We)\s+(?:need|should|will|can)\s+(?:to\s+)?
40
+ (?:answer|respond|reply|confirm|provide|explain|help)\b
41
+ /imx,
42
+ /\ALet me\s+(?:answer|respond|reply|confirm|provide|explain|help)\b/im
43
+ ].freeze
14
44
  THINKING_METADATA_KEYS = %i[
15
45
  reasoning_content reasoning thinking thinking_text thinking_signature reasoning_signature thought_signature
16
46
  ].freeze
@@ -42,20 +72,45 @@ module Legion
42
72
  remaining = content.dup
43
73
 
44
74
  remaining = consume_next_segment(remaining, clean, thinking_parts) until remaining.empty?
75
+ clean, untagged_thinking = extract_untagged_preamble(clean.strip)
76
+ thinking_parts << untagged_thinking
45
77
 
46
- [clean.strip, compact_thinking(thinking_parts)]
78
+ [clean, compact_thinking(thinking_parts)]
47
79
  end
48
80
  private_class_method :extract_from_content
49
81
 
82
+ def extract_untagged_preamble(content)
83
+ return [content, nil] unless content.is_a?(String)
84
+
85
+ match = content.match(/\A(?<preamble>.+?)\n{2,}(?<visible>.+)\z/m)
86
+ return [content, nil] unless match
87
+
88
+ preamble = match[:preamble].strip
89
+ return [content, nil] unless untagged_reasoning_preamble?(preamble)
90
+
91
+ [match[:visible].sub(/\A[[:space:]]+/, '').strip, preamble]
92
+ end
93
+
94
+ def untagged_reasoning_preamble_candidate?(content)
95
+ return false unless content.is_a?(String)
96
+
97
+ text = content.lstrip.downcase
98
+ return false if text.empty?
99
+
100
+ UNTAGGED_PREAMBLE_STARTS.any? do |start|
101
+ start.start_with?(text) || text.start_with?(start)
102
+ end
103
+ end
104
+
50
105
  def consume_next_segment(remaining, clean, thinking_parts)
51
- close_index = remaining.index(THINK_CLOSE)
52
- open_index = remaining.index(THINK_OPEN)
53
-
54
- if close_index && (open_index.nil? || close_index < open_index)
55
- thinking_parts << remaining.slice(0, close_index)
56
- remaining.slice((close_index + THINK_CLOSE.length)..).to_s.sub(/\A[[:space:]]+/, '')
57
- elsif open_index
58
- consume_open_think_segment(remaining, open_index, clean, thinking_parts)
106
+ close_match = next_tag_match(remaining, :close)
107
+ open_match = next_tag_match(remaining, :open)
108
+
109
+ if close_match && (open_match.nil? || close_match[:index] < open_match[:index])
110
+ thinking_parts << remaining.slice(0, close_match[:index])
111
+ remaining.slice((close_match[:index] + close_match[:tag].length)..).to_s.sub(/\A[[:space:]]+/, '')
112
+ elsif open_match
113
+ consume_open_think_segment(remaining, open_match, clean, thinking_parts)
59
114
  else
60
115
  clean << remaining
61
116
  +''
@@ -63,20 +118,37 @@ module Legion
63
118
  end
64
119
  private_class_method :consume_next_segment
65
120
 
66
- def consume_open_think_segment(remaining, open_index, clean, thinking_parts)
67
- clean << remaining.slice(0, open_index)
68
- after_open = remaining.slice((open_index + THINK_OPEN.length)..).to_s
69
- close_index = after_open.index(THINK_CLOSE)
121
+ def consume_open_think_segment(remaining, open_match, clean, thinking_parts)
122
+ clean << remaining.slice(0, open_match[:index])
123
+ after_open = remaining.slice((open_match[:index] + open_match[:tag].length)..).to_s
124
+ close_index = after_open.index(open_match[:close_tag])
70
125
  unless close_index
71
126
  thinking_parts << after_open
72
127
  return +''
73
128
  end
74
129
 
75
130
  thinking_parts << after_open.slice(0, close_index)
76
- after_open.slice((close_index + THINK_CLOSE.length)..).to_s
131
+ after_open.slice((close_index + open_match[:close_tag].length)..).to_s
77
132
  end
78
133
  private_class_method :consume_open_think_segment
79
134
 
135
+ def next_tag_match(text, type)
136
+ matches = THINK_TAG_PAIRS.filter_map do |open_tag, close_tag|
137
+ tag = type == :open ? open_tag : close_tag
138
+ index = text.index(tag)
139
+ { index: index, tag: tag, close_tag: close_tag } if index
140
+ end
141
+ matches.min_by { |match| match[:index] }
142
+ end
143
+ private_class_method :next_tag_match
144
+
145
+ def untagged_reasoning_preamble?(preamble)
146
+ return false if preamble.length > UNTAGGED_PREAMBLE_MAX_LENGTH
147
+
148
+ UNTAGGED_PREAMBLE_PATTERNS.any? { |pattern| preamble.match?(pattern) }
149
+ end
150
+ private_class_method :untagged_reasoning_preamble?
151
+
80
152
  def extract_metadata_thinking(metadata)
81
153
  compact_thinking(
82
154
  [
@@ -21,6 +21,9 @@ module Legion
21
21
  @thinking_tokens = nil
22
22
  @inside_think_tag = false
23
23
  @pending_think_tag = +''
24
+ @active_think_close_tag = nil
25
+ @untagged_preamble_pending = true
26
+ @untagged_preamble_buffer = +''
24
27
  @latest_tool_call_id = nil
25
28
  end
26
29
 
@@ -55,6 +58,8 @@ module Legion
55
58
  end
56
59
 
57
60
  def to_message(response)
61
+ flush_pending_untagged_preamble
62
+
58
63
  Message.new(
59
64
  role: :assistant,
60
65
  content: content.empty? ? nil : content,
@@ -171,14 +176,63 @@ module Legion
171
176
 
172
177
  def append_text_with_thinking(text)
173
178
  content_chunk, thinking_chunk = extract_think_tags(text)
179
+ content_chunk, untagged_thinking = extract_untagged_preamble(content_chunk)
174
180
  @content << content_chunk
175
181
  @last_content_delta << content_chunk
182
+ if untagged_thinking
183
+ @thinking_text << untagged_thinking
184
+ @last_thinking_delta << untagged_thinking
185
+ end
176
186
  return unless thinking_chunk
177
187
 
178
188
  @thinking_text << thinking_chunk
179
189
  @last_thinking_delta << thinking_chunk
180
190
  end
181
191
 
192
+ def extract_untagged_preamble(content_chunk)
193
+ return [content_chunk, nil] unless @untagged_preamble_pending
194
+ return [content_chunk, nil] unless @content.empty? && @thinking_text.empty?
195
+ return [content_chunk, nil] if content_chunk.empty?
196
+
197
+ candidate = @untagged_preamble_buffer + content_chunk
198
+ return release_untagged_preamble(candidate) unless candidate_untagged_preamble?(candidate)
199
+
200
+ content, thinking = Responses::ThinkingExtractor.extract_untagged_preamble(candidate)
201
+ return release_untagged_preamble(content, thinking) if thinking
202
+ return release_untagged_preamble(candidate) if complete_untagged_preamble_candidate?(candidate)
203
+
204
+ @untagged_preamble_buffer = candidate
205
+ ['', nil]
206
+ end
207
+
208
+ def candidate_untagged_preamble?(candidate)
209
+ Responses::ThinkingExtractor.untagged_reasoning_preamble_candidate?(candidate)
210
+ end
211
+
212
+ def complete_untagged_preamble_candidate?(candidate)
213
+ candidate.match?(/\n{2,}/) || candidate.length > Responses::ThinkingExtractor::UNTAGGED_PREAMBLE_MAX_LENGTH
214
+ end
215
+
216
+ def release_untagged_preamble(content, thinking = nil)
217
+ @untagged_preamble_pending = false
218
+ @untagged_preamble_buffer = +''
219
+ [content, thinking]
220
+ end
221
+
222
+ def flush_pending_untagged_preamble
223
+ return if @untagged_preamble_buffer.empty?
224
+
225
+ content, thinking = Responses::ThinkingExtractor.extract_untagged_preamble(@untagged_preamble_buffer)
226
+ if thinking
227
+ @content << content
228
+ @thinking_text << thinking
229
+ else
230
+ @content << @untagged_preamble_buffer
231
+ end
232
+ @untagged_preamble_buffer = +''
233
+ @untagged_preamble_pending = false
234
+ end
235
+
182
236
  def append_thinking_from_chunk(chunk)
183
237
  thinking = chunk.thinking
184
238
  return unless thinking
@@ -191,8 +245,6 @@ module Legion
191
245
  end
192
246
 
193
247
  def extract_think_tags(text)
194
- start_tag = '<think>'
195
- end_tag = '</think>'
196
248
  remaining = @pending_think_tag + text
197
249
  @pending_think_tag = +''
198
250
 
@@ -201,9 +253,9 @@ module Legion
201
253
 
202
254
  until remaining.empty?
203
255
  remaining = if @inside_think_tag
204
- consume_think_content(remaining, end_tag, thinking)
256
+ consume_think_content(remaining, @active_think_close_tag, thinking)
205
257
  else
206
- consume_non_think_content(remaining, start_tag, output)
258
+ consume_non_think_content(remaining, output)
207
259
  end
208
260
  end
209
261
 
@@ -215,41 +267,59 @@ module Legion
215
267
  if end_index
216
268
  thinking << remaining.slice(0, end_index)
217
269
  @inside_think_tag = false
270
+ @active_think_close_tag = nil
218
271
  remaining.slice((end_index + end_tag.length)..) || +''
219
272
  else
220
- suffix_len = longest_suffix_prefix(remaining, end_tag)
273
+ suffix_len = longest_suffix_prefix(remaining, [end_tag])
221
274
  thinking << remaining.slice(0, remaining.length - suffix_len)
222
275
  @pending_think_tag = remaining.slice(-suffix_len, suffix_len)
223
276
  +''
224
277
  end
225
278
  end
226
279
 
227
- def consume_non_think_content(remaining, start_tag, output)
228
- unmatched_close = remaining.index('</think>')
229
- start_index = remaining.index(start_tag)
230
- if unmatched_close && (start_index.nil? || unmatched_close < start_index)
280
+ def consume_non_think_content(remaining, output)
281
+ unmatched_close = next_stream_tag_match(remaining, :close)
282
+ start_match = next_stream_tag_match(remaining, :open)
283
+ if unmatched_close && (start_match.nil? || unmatched_close[:index] < start_match[:index])
231
284
  consume_unmatched_think_close(remaining, unmatched_close)
232
- elsif start_index
233
- output << remaining.slice(0, start_index)
285
+ elsif start_match
286
+ output << remaining.slice(0, start_match[:index])
234
287
  @inside_think_tag = true
235
- remaining.slice((start_index + start_tag.length)..) || +''
288
+ @active_think_close_tag = start_match[:close_tag]
289
+ remaining.slice((start_match[:index] + start_match[:tag].length)..) || +''
236
290
  else
237
- suffix_len = longest_suffix_prefix(remaining, start_tag)
291
+ suffix_len = longest_suffix_prefix(remaining, stream_tag_tokens)
238
292
  output << remaining.slice(0, remaining.length - suffix_len)
239
293
  @pending_think_tag = remaining.slice(-suffix_len, suffix_len)
240
294
  +''
241
295
  end
242
296
  end
243
297
 
244
- def consume_unmatched_think_close(remaining, close_index)
245
- end_tag = '</think>'
246
- thinking = remaining.slice(0, close_index)
298
+ def consume_unmatched_think_close(remaining, close_match)
299
+ thinking = remaining.slice(0, close_match[:index])
247
300
  @thinking_text << thinking
248
301
  @last_thinking_delta << thinking
249
- remaining.slice((close_index + end_tag.length)..).to_s.sub(/\A[[:space:]]+/, '')
302
+ remaining.slice((close_match[:index] + close_match[:tag].length)..).to_s.sub(/\A[[:space:]]+/, '')
303
+ end
304
+
305
+ def next_stream_tag_match(text, type)
306
+ matches = Responses::ThinkingExtractor::THINK_TAG_PAIRS.filter_map do |open_tag, close_tag|
307
+ tag = type == :open ? open_tag : close_tag
308
+ index = text.index(tag)
309
+ { index: index, tag: tag, close_tag: close_tag } if index
310
+ end
311
+ matches.min_by { |match| match[:index] }
312
+ end
313
+
314
+ def stream_tag_tokens
315
+ Responses::ThinkingExtractor::THINK_TAG_PAIRS.flat_map { |open_tag, close_tag| [open_tag, close_tag] }
316
+ end
317
+
318
+ def longest_suffix_prefix(text, tags)
319
+ tags.map { |tag| longest_suffix_prefix_for_tag(text, tag) }.max || 0
250
320
  end
251
321
 
252
- def longest_suffix_prefix(text, tag)
322
+ def longest_suffix_prefix_for_tag(text, tag)
253
323
  max = [text.length, tag.length - 1].min
254
324
  max.downto(1) do |len|
255
325
  return len if text.end_with?(tag[0, len])
@@ -93,10 +93,48 @@ module Legion
93
93
 
94
94
  def handle_failed_response(chunk, buffer, env)
95
95
  buffer << chunk
96
+ body_persisted = persist_failed_response_body(buffer, env)
96
97
  error_data = Legion::JSON.parse(buffer, symbolize_names: false)
97
98
  handle_parsed_error(error_data, env)
98
- rescue Legion::JSON::ParseError => e
99
- handle_exception(e, level: :warn, handled: true, operation: 'llm.streaming.handle_failed_response')
99
+ rescue Legion::JSON::ParseError
100
+ return if body_persisted
101
+
102
+ raise_partial_streaming_error(buffer, env)
103
+ end
104
+
105
+ def persist_failed_response_body(buffer, env)
106
+ custom_persisted = persist_failed_response_custom_body?(buffer, env)
107
+ body_persisted = persist_failed_response_env_body?(buffer, env)
108
+ custom_persisted || body_persisted
109
+ end
110
+
111
+ def persist_failed_response_env_body?(buffer, env)
112
+ return false unless env.respond_to?(:body=)
113
+
114
+ env.body = buffer.dup
115
+ true
116
+ end
117
+
118
+ def persist_failed_response_custom_body?(buffer, env)
119
+ return false unless env.respond_to?(:[]=)
120
+
121
+ env[ErrorMiddleware::STREAM_ERROR_BODY_KEY] = buffer.dup
122
+ true
123
+ rescue StandardError
124
+ false
125
+ end
126
+
127
+ def raise_partial_streaming_error(buffer, env)
128
+ partial = buffer[/"message"\s*:\s*"([^"]{1,200})/, 1]
129
+ status = env&.status || 0
130
+ msg = if partial
131
+ "Provider error (status #{status}): #{partial}"
132
+ else
133
+ "Provider error (status #{status}) - response body incomplete"
134
+ end
135
+ log.warn "[llm][streaming] action=handle_failed_response status=#{status} " \
136
+ "partial_body=#{buffer.length}b msg=#{partial.inspect}"
137
+ raise Legion::Extensions::Llm::ServerError, msg
100
138
  end
101
139
 
102
140
  def handle_sse(chunk, parser, env, &)
@@ -3,7 +3,7 @@
3
3
  module Legion
4
4
  module Extensions
5
5
  module Llm
6
- VERSION = '0.4.10'
6
+ VERSION = '0.4.13'
7
7
  end
8
8
  end
9
9
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lex-llm
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.10
4
+ version: 0.4.13
5
5
  platform: ruby
6
6
  authors:
7
7
  - LegionIO