lex-llm 0.4.9 → 0.4.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e2b7839d1fcb47e176aa62970f3ec40d04b9e25095418285d3cf6a5d492ccb8d
4
- data.tar.gz: f444cd5054325a007f05749d6d4a2ca0933bf1f5b7004b2585daaa97745eb337
3
+ metadata.gz: 6d60f78c459fb43344897e6fdba10730b881f698229058a50a1c1be2564539cf
4
+ data.tar.gz: d7fcedadb69266af972caf1a51d1153bd5270f1fd5e9b45f65d51076fafa07aa
5
5
  SHA512:
6
- metadata.gz: 7412bc0234b379941ae045fa826f267d97b1577dba15ee7268957d62c999808e27392d9acf65d0bf464c31aa7365e3912642a4b0041eb92294f108c4203d8f38
7
- data.tar.gz: a90e99a7c61f6fda2ffc4d2e4fd6b2fcba6f69bf75d5da526cacef550c345aadd157ea1ee6efeae59e3f69160a8e6ad92237a31997e9ef259c69dc29a6db01e6
6
+ metadata.gz: c60726bfac3eff11cf37d8035ad78c7437b627f465bad31efdac1be3061fe410dc176d805bd168389859fa94773cd994578d265a6534a8e3feed1d37db517988
7
+ data.tar.gz: 40439ec46e06530b9e5d287fe8d5980d57b87c2700343b3282c30deb9cd1b241862812e4264a9842d6a1fea20aa9bcb4f580cf08ed31cc61b73c00c2c753c9ce
data/CHANGELOG.md CHANGED
@@ -1,5 +1,29 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.4.13 - 2026-05-15
4
+
5
+ - Strip provider thinking from OpenAI-compatible responses when local models emit `<thinking>` tags or untagged initial reasoning preambles, and keep those hidden from live streaming content deltas.
6
+
7
+ ## 0.4.12 - 2026-05-15
8
+
9
+ - Preserve streamed provider error bodies in a custom Faraday env key so Faraday Net::HTTP finalization cannot replace the buffered body with an empty string before `ErrorMiddleware` parses it.
10
+
11
+ ## 0.4.11 - 2026-05-15
12
+
13
+ - Fix `handle_failed_response` to preserve non-200 streaming error bodies across chunks instead of swallowing `ParseError` and falling through to a generic "An unknown error occurred". Complete JSON error bodies still raise typed provider errors immediately; incomplete bodies are buffered onto the Faraday response env for final middleware parsing, with regex fallback extraction for vLLM-style partial `message` fields when the env cannot carry the buffered body.
14
+
15
+ ## 0.4.10 - 2026-05-13
16
+
17
+ - Add cache-backed `model_detail` lookup with a 24-hour TTL (nil results are not cached), plus a `fetch_model_detail` hook that subclasses can override to make live API calls.
18
+ - Build `model_detail_cache_key` from tier, slug, instance, and credential fingerprint so remote providers never share model detail entries across credentials.
19
+ - Add `credential_cache_fragment` — includes an 8-char SHA-256 credential fingerprint in cache keys for non-local providers.
20
+ - Add `source_tag`, `credential_fingerprint`, and `config_fingerprint` to `CredentialSources` for provenance tracking across discovered instances.
21
+ - Suppress Faraday raw stacktrace dumps on connection failures by setting `errors: false` on the response logger middleware.
22
+ - Rescue `Faraday::ConnectionFailed` in `discover_offerings` and return an empty list with a concise warning instead of propagating the exception.
23
+ - Wire `model_allowed?` filtering into `discover_offerings` so whitelist/blacklist settings are enforced during live discovery (was dead code before).
24
+ - Check instance config first for `model_whitelist`/`model_blacklist` before falling back to provider settings, enabling per-instance override.
25
+ - Add `legion-cache >= 1.3.0` as a runtime dependency and include `Legion::Cache::Helper` in the base `Provider` class.
26
+
3
27
  ## 0.4.9 - 2026-05-13
4
28
 
5
29
  - Route provider, tool, streaming, model, attachment, connection, credential, and fleet diagnostics through `Legion::Logging::Helper`.
data/lex-llm.gemspec CHANGED
@@ -35,6 +35,7 @@ Gem::Specification.new do |spec|
35
35
  spec.add_dependency 'faraday-multipart', '>= 1'
36
36
  spec.add_dependency 'faraday-net_http', '>= 1'
37
37
  spec.add_dependency 'faraday-retry', '>= 1'
38
+ spec.add_dependency 'legion-cache', '>= 1.3.0'
38
39
  spec.add_dependency 'legion-crypt', '>= 1.5.1'
39
40
  spec.add_dependency 'legion-json', '>= 1.2.1'
40
41
  spec.add_dependency 'legion-logging', '>= 1.3.2'
@@ -79,7 +79,7 @@ module Legion
79
79
  faraday.response :logger,
80
80
  logger,
81
81
  bodies: debug_logger?(logger),
82
- errors: true,
82
+ errors: false,
83
83
  headers: false,
84
84
  log_level: :debug do |logger|
85
85
  logger.filter(logging_regexp('[A-Za-z0-9+/=]{100,}'), '[BASE64 DATA]')
@@ -167,6 +167,30 @@ module Legion
167
167
  Digest::SHA256.hexdigest(val.to_s)
168
168
  end
169
169
 
170
+ # Build a human-readable source tag describing where a credential was found.
171
+ # Format: "type:location[:key]" (key is optional), e.g. "env:ANTHROPIC_API_KEY" or "file:~/.claude/settings.json:anthropicApiKey"
172
+ def source_tag(type, location, key = nil)
173
+ parts = [type.to_s, location.to_s]
174
+ parts << key.to_s if key && !key.to_s.empty?
175
+ parts.join(':')
176
+ end
177
+
178
+ # Generate a short fingerprint (first 8 chars of SHA-256) for a credential value.
179
+ # Stable for the lifetime of the credential; safe to log and include in audit events.
180
+ def credential_fingerprint(value)
181
+ return nil if value.nil? || value.to_s.strip.empty?
182
+
183
+ Digest::SHA256.hexdigest(value.to_s)[0, 8]
184
+ end
185
+
186
+ # Extract fingerprint from a config hash by finding the first credential field.
187
+ def config_fingerprint(config)
188
+ val = config[:api_key] || config['api_key'] ||
189
+ config[:bearer_token] || config['bearer_token'] ||
190
+ config[:access_token] || config['access_token']
191
+ credential_fingerprint(val)
192
+ end
193
+
170
194
  # Returns true when the URL points to localhost / 127.0.0.1 / ::1.
171
195
  def localhost?(url)
172
196
  return false if url.nil?
@@ -185,7 +209,9 @@ module Legion
185
209
  module_function :env, :claude_config, :claude_config_value,
186
210
  :claude_env_value, :codex_token, :codex_openai_key,
187
211
  :setting, :socket_open?, :http_ok?,
188
- :dedup_credentials, :credential_hash, :localhost?
212
+ :dedup_credentials, :credential_hash,
213
+ :source_tag, :credential_fingerprint, :config_fingerprint,
214
+ :localhost?
189
215
 
190
216
  # --- private helpers -----------------------------------------------
191
217
 
@@ -54,6 +54,8 @@ module Legion
54
54
 
55
55
  # Faraday middleware that maps provider-specific API errors to Legion::Extensions::Llm errors.
56
56
  class ErrorMiddleware < Faraday::Middleware
57
+ STREAM_ERROR_BODY_KEY = :legion_llm_stream_error_body
58
+
57
59
  def initialize(app, options = {})
58
60
  super(app)
59
61
  @provider = options[:provider]
@@ -79,6 +81,7 @@ module Legion
79
81
  ].freeze
80
82
 
81
83
  def parse_error(provider:, response:) # rubocop:disable Metrics/PerceivedComplexity
84
+ response = response_with_stream_error_body(response)
82
85
  message = provider&.parse_error(response)
83
86
 
84
87
  case response.status
@@ -116,12 +119,51 @@ module Legion
116
119
 
117
120
  private
118
121
 
122
+ def response_with_stream_error_body(response)
123
+ return response unless empty_body?(response)
124
+
125
+ stream_body = preserved_stream_error_body(response)
126
+ return response if stream_body.to_s.empty?
127
+
128
+ ResponseWithBody.new(response, stream_body)
129
+ end
130
+
131
+ def empty_body?(response)
132
+ !response.respond_to?(:body) || response.body.to_s.empty?
133
+ end
134
+
135
+ def preserved_stream_error_body(response)
136
+ return unless response.respond_to?(:[])
137
+
138
+ response[STREAM_ERROR_BODY_KEY]
139
+ rescue StandardError
140
+ nil
141
+ end
142
+
119
143
  def context_length_exceeded?(message)
120
144
  return false if message.to_s.empty?
121
145
 
122
146
  CONTEXT_LENGTH_PATTERNS.any? { |pattern| message.match?(pattern) }
123
147
  end
124
148
  end
149
+
150
+ ResponseWithBody = Struct.new(:response, :body) do
151
+ def status = response.status
152
+
153
+ def [](key)
154
+ response[key] if response.respond_to?(:[])
155
+ end
156
+
157
+ def method_missing(method_name, ...)
158
+ return response.public_send(method_name, ...) if response.respond_to?(method_name)
159
+
160
+ super
161
+ end
162
+
163
+ def respond_to_missing?(method_name, include_private = false)
164
+ response.respond_to?(method_name, include_private) || super
165
+ end
166
+ end
125
167
  end
126
168
  end
127
169
  end
@@ -28,6 +28,7 @@ module Legion
28
28
  class Provider
29
29
  include Streaming
30
30
  include Legion::Logging::Helper
31
+ include Legion::Cache::Helper
31
32
 
32
33
  attr_reader :config, :connection
33
34
 
@@ -123,10 +124,14 @@ module Legion
123
124
  provider_health = health(live:)
124
125
  @cached_offerings = Array(list_models(live:, **filters)).filter_map do |model|
125
126
  next unless model_matches_filters?(model, filters)
127
+ next unless model_allowed?(model.id)
126
128
 
127
129
  offering_from_model(model, health: provider_health)
128
130
  end
129
131
  @cached_offerings
132
+ rescue Faraday::ConnectionFailed => e
133
+ log.warn("[#{slug}] instance=#{provider_instance_id} unreachable: #{e.message}")
134
+ []
130
135
  end
131
136
 
132
137
  def health(live: false)
@@ -259,6 +264,8 @@ module Legion
259
264
  error = part['error']
260
265
  error.is_a?(String) ? error : part.dig('error', 'message')
261
266
  end.join('. ')
267
+ when String
268
+ body[/"message"\s*:\s*"([^"]{1,500})/, 1] || body
262
269
  else
263
270
  body
264
271
  end
@@ -284,12 +291,14 @@ module Legion
284
291
  # ── Model allow-list / deny-list filtering ────────────────────────
285
292
 
286
293
  def model_whitelist
287
- wl = settings[:model_whitelist] if respond_to?(:settings)
294
+ wl = config.model_whitelist if config.respond_to?(:model_whitelist)
295
+ wl ||= settings[:model_whitelist] if respond_to?(:settings)
288
296
  Array(wl).map { |p| p.to_s.downcase }
289
297
  end
290
298
 
291
299
  def model_blacklist
292
- bl = settings[:model_blacklist] if respond_to?(:settings)
300
+ bl = config.model_blacklist if config.respond_to?(:model_blacklist)
301
+ bl ||= settings[:model_blacklist] if respond_to?(:settings)
293
302
  Array(bl).map { |p| p.to_s.downcase }
294
303
  end
295
304
 
@@ -371,21 +380,24 @@ module Legion
371
380
  nil
372
381
  end
373
382
 
374
- def model_cache_set(key, value, ttl:)
375
- return unless defined?(Legion::Cache)
383
+ def model_detail(model_name)
384
+ key = model_detail_cache_key(model_name)
385
+ cached = cache_get(key)
386
+ return cached if cached
376
387
 
377
- cache_local_instance? ? local_cache_set(key, value, ttl: ttl) : cache_set(key, value, ttl: ttl)
388
+ result = fetch_model_detail(model_name)
389
+ cache_set(key, result, ttl: 86_400) if result
390
+ result
378
391
  rescue StandardError => e
379
- handle_exception(e, level: :debug, handled: true, operation: 'lex.provider.model_cache_set')
392
+ handle_exception(e, level: :warn, handled: true, operation: 'llm.provider.model_detail',
393
+ model: model_name)
394
+ nil
380
395
  end
381
396
 
382
- def model_cache_fetch(key, ttl:, &)
383
- return yield unless defined?(Legion::Cache)
384
-
385
- cache_local_instance? ? local_cache_fetch(key, ttl: ttl, &) : cache_fetch(key, ttl: ttl, &)
386
- rescue StandardError => e
387
- handle_exception(e, level: :debug, handled: true, operation: 'llm.provider.model_cache_fetch', key:)
388
- yield
397
+ # Override in subclasses to make a live API call for model detail.
398
+ # Must return a Hash with symbol keys (e.g. { context_window: 128000 }).
399
+ def fetch_model_detail(_model_name)
400
+ nil
389
401
  end
390
402
 
391
403
  def cache_instance_key
@@ -448,6 +460,26 @@ module Legion
448
460
 
449
461
  private
450
462
 
463
+ def model_detail_cache_key(model_name)
464
+ tier = offering_tier
465
+ instance_key = cache_instance_key
466
+ cred_fp = credential_cache_fragment
467
+ key_parts = ['model_info', tier, slug, instance_key, cred_fp, model_name].compact
468
+ key_parts.join('.')
469
+ end
470
+
471
+ def credential_cache_fragment
472
+ return nil if cache_local_instance?
473
+
474
+ cred = config.respond_to?(:bearer_token) && config.bearer_token
475
+ cred ||= config.respond_to?(:api_key) && config.api_key
476
+ cred ||= config.respond_to?(:bedrock_access_key_id) && config.bedrock_access_key_id
477
+ return nil unless cred
478
+
479
+ require 'digest'
480
+ Digest::SHA256.hexdigest(cred.to_s)[0, 8]
481
+ end
482
+
451
483
  def validate_paint_inputs!(with:, mask:)
452
484
  return if with.nil? && mask.nil?
453
485
 
@@ -8,9 +8,39 @@ module Legion
8
8
  module ThinkingExtractor
9
9
  Extraction = Struct.new(:content, :thinking, :signature, :metadata, keyword_init: true)
10
10
 
11
- THINK_OPEN = '<think>'
12
- THINK_CLOSE = '</think>'
13
- THINK_PATTERN = %r{<think>(.*?)</think>}m
11
+ THINK_TAG_PAIRS = [
12
+ ['<thinking>', '</thinking>'],
13
+ ['<think>', '</think>']
14
+ ].freeze
15
+ UNTAGGED_PREAMBLE_MAX_LENGTH = 4_000
16
+ UNTAGGED_PREAMBLE_STARTS = [
17
+ 'the user',
18
+ 'the request',
19
+ 'the prompt',
20
+ 'the question',
21
+ 'i need',
22
+ 'i should',
23
+ 'i will',
24
+ "i'll",
25
+ 'i can',
26
+ 'we need',
27
+ 'we should',
28
+ 'we will',
29
+ "we'll",
30
+ 'we can',
31
+ 'let me'
32
+ ].freeze
33
+ UNTAGGED_PREAMBLE_PATTERNS = [
34
+ /
35
+ \AThe\s+(?:user|request|prompt|question)\b.*\b
36
+ (?:let\s+me|i'll|i\s+will|i\s+should|i\s+need|i\s+can|respond|answer|reply)\b
37
+ /imx,
38
+ /
39
+ \A(?:I|We)\s+(?:need|should|will|can)\s+(?:to\s+)?
40
+ (?:answer|respond|reply|confirm|provide|explain|help)\b
41
+ /imx,
42
+ /\ALet me\s+(?:answer|respond|reply|confirm|provide|explain|help)\b/im
43
+ ].freeze
14
44
  THINKING_METADATA_KEYS = %i[
15
45
  reasoning_content reasoning thinking thinking_text thinking_signature reasoning_signature thought_signature
16
46
  ].freeze
@@ -42,20 +72,45 @@ module Legion
42
72
  remaining = content.dup
43
73
 
44
74
  remaining = consume_next_segment(remaining, clean, thinking_parts) until remaining.empty?
75
+ clean, untagged_thinking = extract_untagged_preamble(clean.strip)
76
+ thinking_parts << untagged_thinking
45
77
 
46
- [clean.strip, compact_thinking(thinking_parts)]
78
+ [clean, compact_thinking(thinking_parts)]
47
79
  end
48
80
  private_class_method :extract_from_content
49
81
 
82
+ def extract_untagged_preamble(content)
83
+ return [content, nil] unless content.is_a?(String)
84
+
85
+ match = content.match(/\A(?<preamble>.+?)\n{2,}(?<visible>.+)\z/m)
86
+ return [content, nil] unless match
87
+
88
+ preamble = match[:preamble].strip
89
+ return [content, nil] unless untagged_reasoning_preamble?(preamble)
90
+
91
+ [match[:visible].sub(/\A[[:space:]]+/, '').strip, preamble]
92
+ end
93
+
94
+ def untagged_reasoning_preamble_candidate?(content)
95
+ return false unless content.is_a?(String)
96
+
97
+ text = content.lstrip.downcase
98
+ return false if text.empty?
99
+
100
+ UNTAGGED_PREAMBLE_STARTS.any? do |start|
101
+ start.start_with?(text) || text.start_with?(start)
102
+ end
103
+ end
104
+
50
105
  def consume_next_segment(remaining, clean, thinking_parts)
51
- close_index = remaining.index(THINK_CLOSE)
52
- open_index = remaining.index(THINK_OPEN)
53
-
54
- if close_index && (open_index.nil? || close_index < open_index)
55
- thinking_parts << remaining.slice(0, close_index)
56
- remaining.slice((close_index + THINK_CLOSE.length)..).to_s.sub(/\A[[:space:]]+/, '')
57
- elsif open_index
58
- consume_open_think_segment(remaining, open_index, clean, thinking_parts)
106
+ close_match = next_tag_match(remaining, :close)
107
+ open_match = next_tag_match(remaining, :open)
108
+
109
+ if close_match && (open_match.nil? || close_match[:index] < open_match[:index])
110
+ thinking_parts << remaining.slice(0, close_match[:index])
111
+ remaining.slice((close_match[:index] + close_match[:tag].length)..).to_s.sub(/\A[[:space:]]+/, '')
112
+ elsif open_match
113
+ consume_open_think_segment(remaining, open_match, clean, thinking_parts)
59
114
  else
60
115
  clean << remaining
61
116
  +''
@@ -63,20 +118,37 @@ module Legion
63
118
  end
64
119
  private_class_method :consume_next_segment
65
120
 
66
- def consume_open_think_segment(remaining, open_index, clean, thinking_parts)
67
- clean << remaining.slice(0, open_index)
68
- after_open = remaining.slice((open_index + THINK_OPEN.length)..).to_s
69
- close_index = after_open.index(THINK_CLOSE)
121
+ def consume_open_think_segment(remaining, open_match, clean, thinking_parts)
122
+ clean << remaining.slice(0, open_match[:index])
123
+ after_open = remaining.slice((open_match[:index] + open_match[:tag].length)..).to_s
124
+ close_index = after_open.index(open_match[:close_tag])
70
125
  unless close_index
71
126
  thinking_parts << after_open
72
127
  return +''
73
128
  end
74
129
 
75
130
  thinking_parts << after_open.slice(0, close_index)
76
- after_open.slice((close_index + THINK_CLOSE.length)..).to_s
131
+ after_open.slice((close_index + open_match[:close_tag].length)..).to_s
77
132
  end
78
133
  private_class_method :consume_open_think_segment
79
134
 
135
+ def next_tag_match(text, type)
136
+ matches = THINK_TAG_PAIRS.filter_map do |open_tag, close_tag|
137
+ tag = type == :open ? open_tag : close_tag
138
+ index = text.index(tag)
139
+ { index: index, tag: tag, close_tag: close_tag } if index
140
+ end
141
+ matches.min_by { |match| match[:index] }
142
+ end
143
+ private_class_method :next_tag_match
144
+
145
+ def untagged_reasoning_preamble?(preamble)
146
+ return false if preamble.length > UNTAGGED_PREAMBLE_MAX_LENGTH
147
+
148
+ UNTAGGED_PREAMBLE_PATTERNS.any? { |pattern| preamble.match?(pattern) }
149
+ end
150
+ private_class_method :untagged_reasoning_preamble?
151
+
80
152
  def extract_metadata_thinking(metadata)
81
153
  compact_thinking(
82
154
  [
@@ -21,6 +21,9 @@ module Legion
21
21
  @thinking_tokens = nil
22
22
  @inside_think_tag = false
23
23
  @pending_think_tag = +''
24
+ @active_think_close_tag = nil
25
+ @untagged_preamble_pending = true
26
+ @untagged_preamble_buffer = +''
24
27
  @latest_tool_call_id = nil
25
28
  end
26
29
 
@@ -55,6 +58,8 @@ module Legion
55
58
  end
56
59
 
57
60
  def to_message(response)
61
+ flush_pending_untagged_preamble
62
+
58
63
  Message.new(
59
64
  role: :assistant,
60
65
  content: content.empty? ? nil : content,
@@ -171,14 +176,63 @@ module Legion
171
176
 
172
177
  def append_text_with_thinking(text)
173
178
  content_chunk, thinking_chunk = extract_think_tags(text)
179
+ content_chunk, untagged_thinking = extract_untagged_preamble(content_chunk)
174
180
  @content << content_chunk
175
181
  @last_content_delta << content_chunk
182
+ if untagged_thinking
183
+ @thinking_text << untagged_thinking
184
+ @last_thinking_delta << untagged_thinking
185
+ end
176
186
  return unless thinking_chunk
177
187
 
178
188
  @thinking_text << thinking_chunk
179
189
  @last_thinking_delta << thinking_chunk
180
190
  end
181
191
 
192
+ def extract_untagged_preamble(content_chunk)
193
+ return [content_chunk, nil] unless @untagged_preamble_pending
194
+ return [content_chunk, nil] unless @content.empty? && @thinking_text.empty?
195
+ return [content_chunk, nil] if content_chunk.empty?
196
+
197
+ candidate = @untagged_preamble_buffer + content_chunk
198
+ return release_untagged_preamble(candidate) unless candidate_untagged_preamble?(candidate)
199
+
200
+ content, thinking = Responses::ThinkingExtractor.extract_untagged_preamble(candidate)
201
+ return release_untagged_preamble(content, thinking) if thinking
202
+ return release_untagged_preamble(candidate) if complete_untagged_preamble_candidate?(candidate)
203
+
204
+ @untagged_preamble_buffer = candidate
205
+ ['', nil]
206
+ end
207
+
208
+ def candidate_untagged_preamble?(candidate)
209
+ Responses::ThinkingExtractor.untagged_reasoning_preamble_candidate?(candidate)
210
+ end
211
+
212
+ def complete_untagged_preamble_candidate?(candidate)
213
+ candidate.match?(/\n{2,}/) || candidate.length > Responses::ThinkingExtractor::UNTAGGED_PREAMBLE_MAX_LENGTH
214
+ end
215
+
216
+ def release_untagged_preamble(content, thinking = nil)
217
+ @untagged_preamble_pending = false
218
+ @untagged_preamble_buffer = +''
219
+ [content, thinking]
220
+ end
221
+
222
+ def flush_pending_untagged_preamble
223
+ return if @untagged_preamble_buffer.empty?
224
+
225
+ content, thinking = Responses::ThinkingExtractor.extract_untagged_preamble(@untagged_preamble_buffer)
226
+ if thinking
227
+ @content << content
228
+ @thinking_text << thinking
229
+ else
230
+ @content << @untagged_preamble_buffer
231
+ end
232
+ @untagged_preamble_buffer = +''
233
+ @untagged_preamble_pending = false
234
+ end
235
+
182
236
  def append_thinking_from_chunk(chunk)
183
237
  thinking = chunk.thinking
184
238
  return unless thinking
@@ -191,8 +245,6 @@ module Legion
191
245
  end
192
246
 
193
247
  def extract_think_tags(text)
194
- start_tag = '<think>'
195
- end_tag = '</think>'
196
248
  remaining = @pending_think_tag + text
197
249
  @pending_think_tag = +''
198
250
 
@@ -201,9 +253,9 @@ module Legion
201
253
 
202
254
  until remaining.empty?
203
255
  remaining = if @inside_think_tag
204
- consume_think_content(remaining, end_tag, thinking)
256
+ consume_think_content(remaining, @active_think_close_tag, thinking)
205
257
  else
206
- consume_non_think_content(remaining, start_tag, output)
258
+ consume_non_think_content(remaining, output)
207
259
  end
208
260
  end
209
261
 
@@ -215,41 +267,59 @@ module Legion
215
267
  if end_index
216
268
  thinking << remaining.slice(0, end_index)
217
269
  @inside_think_tag = false
270
+ @active_think_close_tag = nil
218
271
  remaining.slice((end_index + end_tag.length)..) || +''
219
272
  else
220
- suffix_len = longest_suffix_prefix(remaining, end_tag)
273
+ suffix_len = longest_suffix_prefix(remaining, [end_tag])
221
274
  thinking << remaining.slice(0, remaining.length - suffix_len)
222
275
  @pending_think_tag = remaining.slice(-suffix_len, suffix_len)
223
276
  +''
224
277
  end
225
278
  end
226
279
 
227
- def consume_non_think_content(remaining, start_tag, output)
228
- unmatched_close = remaining.index('</think>')
229
- start_index = remaining.index(start_tag)
230
- if unmatched_close && (start_index.nil? || unmatched_close < start_index)
280
+ def consume_non_think_content(remaining, output)
281
+ unmatched_close = next_stream_tag_match(remaining, :close)
282
+ start_match = next_stream_tag_match(remaining, :open)
283
+ if unmatched_close && (start_match.nil? || unmatched_close[:index] < start_match[:index])
231
284
  consume_unmatched_think_close(remaining, unmatched_close)
232
- elsif start_index
233
- output << remaining.slice(0, start_index)
285
+ elsif start_match
286
+ output << remaining.slice(0, start_match[:index])
234
287
  @inside_think_tag = true
235
- remaining.slice((start_index + start_tag.length)..) || +''
288
+ @active_think_close_tag = start_match[:close_tag]
289
+ remaining.slice((start_match[:index] + start_match[:tag].length)..) || +''
236
290
  else
237
- suffix_len = longest_suffix_prefix(remaining, start_tag)
291
+ suffix_len = longest_suffix_prefix(remaining, stream_tag_tokens)
238
292
  output << remaining.slice(0, remaining.length - suffix_len)
239
293
  @pending_think_tag = remaining.slice(-suffix_len, suffix_len)
240
294
  +''
241
295
  end
242
296
  end
243
297
 
244
- def consume_unmatched_think_close(remaining, close_index)
245
- end_tag = '</think>'
246
- thinking = remaining.slice(0, close_index)
298
+ def consume_unmatched_think_close(remaining, close_match)
299
+ thinking = remaining.slice(0, close_match[:index])
247
300
  @thinking_text << thinking
248
301
  @last_thinking_delta << thinking
249
- remaining.slice((close_index + end_tag.length)..).to_s.sub(/\A[[:space:]]+/, '')
302
+ remaining.slice((close_match[:index] + close_match[:tag].length)..).to_s.sub(/\A[[:space:]]+/, '')
303
+ end
304
+
305
+ def next_stream_tag_match(text, type)
306
+ matches = Responses::ThinkingExtractor::THINK_TAG_PAIRS.filter_map do |open_tag, close_tag|
307
+ tag = type == :open ? open_tag : close_tag
308
+ index = text.index(tag)
309
+ { index: index, tag: tag, close_tag: close_tag } if index
310
+ end
311
+ matches.min_by { |match| match[:index] }
312
+ end
313
+
314
+ def stream_tag_tokens
315
+ Responses::ThinkingExtractor::THINK_TAG_PAIRS.flat_map { |open_tag, close_tag| [open_tag, close_tag] }
316
+ end
317
+
318
+ def longest_suffix_prefix(text, tags)
319
+ tags.map { |tag| longest_suffix_prefix_for_tag(text, tag) }.max || 0
250
320
  end
251
321
 
252
- def longest_suffix_prefix(text, tag)
322
+ def longest_suffix_prefix_for_tag(text, tag)
253
323
  max = [text.length, tag.length - 1].min
254
324
  max.downto(1) do |len|
255
325
  return len if text.end_with?(tag[0, len])
@@ -93,10 +93,48 @@ module Legion
93
93
 
94
94
  def handle_failed_response(chunk, buffer, env)
95
95
  buffer << chunk
96
+ body_persisted = persist_failed_response_body(buffer, env)
96
97
  error_data = Legion::JSON.parse(buffer, symbolize_names: false)
97
98
  handle_parsed_error(error_data, env)
98
- rescue Legion::JSON::ParseError => e
99
- handle_exception(e, level: :warn, handled: true, operation: 'llm.streaming.handle_failed_response')
99
+ rescue Legion::JSON::ParseError
100
+ return if body_persisted
101
+
102
+ raise_partial_streaming_error(buffer, env)
103
+ end
104
+
105
+ def persist_failed_response_body(buffer, env)
106
+ custom_persisted = persist_failed_response_custom_body?(buffer, env)
107
+ body_persisted = persist_failed_response_env_body?(buffer, env)
108
+ custom_persisted || body_persisted
109
+ end
110
+
111
+ def persist_failed_response_env_body?(buffer, env)
112
+ return false unless env.respond_to?(:body=)
113
+
114
+ env.body = buffer.dup
115
+ true
116
+ end
117
+
118
+ def persist_failed_response_custom_body?(buffer, env)
119
+ return false unless env.respond_to?(:[]=)
120
+
121
+ env[ErrorMiddleware::STREAM_ERROR_BODY_KEY] = buffer.dup
122
+ true
123
+ rescue StandardError
124
+ false
125
+ end
126
+
127
+ def raise_partial_streaming_error(buffer, env)
128
+ partial = buffer[/"message"\s*:\s*"([^"]{1,200})/, 1]
129
+ status = env&.status || 0
130
+ msg = if partial
131
+ "Provider error (status #{status}): #{partial}"
132
+ else
133
+ "Provider error (status #{status}) - response body incomplete"
134
+ end
135
+ log.warn "[llm][streaming] action=handle_failed_response status=#{status} " \
136
+ "partial_body=#{buffer.length}b msg=#{partial.inspect}"
137
+ raise Legion::Extensions::Llm::ServerError, msg
100
138
  end
101
139
 
102
140
  def handle_sse(chunk, parser, env, &)
@@ -3,7 +3,7 @@
3
3
  module Legion
4
4
  module Extensions
5
5
  module Llm
6
- VERSION = '0.4.9'
6
+ VERSION = '0.4.13'
7
7
  end
8
8
  end
9
9
  end
@@ -9,6 +9,15 @@ require 'faraday/multipart'
9
9
  require 'faraday/retry'
10
10
  require 'legion/json'
11
11
  require 'legion/logging'
12
+ # legion/cache writes DEBUG lines to $stdout on first load; suppress them here
13
+ # so callers that capture our stdout (e.g. Open3-based integration tests) are unaffected.
14
+ begin
15
+ old_stdout = $stdout
16
+ $stdout = File.open(File::NULL, 'w')
17
+ require 'legion/cache'
18
+ ensure
19
+ $stdout = old_stdout
20
+ end
12
21
  require 'logger'
13
22
  require 'marcel'
14
23
  require 'ruby_llm/schema'
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lex-llm
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.9
4
+ version: 0.4.13
5
5
  platform: ruby
6
6
  authors:
7
7
  - LegionIO
@@ -108,6 +108,20 @@ dependencies:
108
108
  - - ">="
109
109
  - !ruby/object:Gem::Version
110
110
  version: '1'
111
+ - !ruby/object:Gem::Dependency
112
+ name: legion-cache
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: 1.3.0
118
+ type: :runtime
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: 1.3.0
111
125
  - !ruby/object:Gem::Dependency
112
126
  name: legion-crypt
113
127
  requirement: !ruby/object:Gem::Requirement