lex-llm-vllm 0.2.13 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6adc86b9d3286821c0efa59e4c820f3d99ee0acb5327f133a96010383d154505
4
- data.tar.gz: 73ecff7ccc309eb0469a79edc3970fdf3d766199a6df31edde4cbaf2016dc970
3
+ metadata.gz: 172c35debe332979f48575e43bd59c04828449a41a195f3d899bc15afa18bdb2
4
+ data.tar.gz: c423c24ff7a5e4b33f1b6e562b50c196d2870b347bbcad61b38cd228d54ee318
5
5
  SHA512:
6
- metadata.gz: 4b2c498e26f09fa27edfa7abf08bf6fae656313cf6e2ce625772a9ce809ff1fcfae55a8746261b943b6b41111a46784fa97d9d5004f4be69f58761de05c6383d
7
- data.tar.gz: 94b867bd099f8e062f23aee550be30d4549bbc3d4f10d40b6e4e9b3dcabca7c8837e246198a6ea9a208035da3dcfaaf41be626709de9c8e642e3a4035f6681b0
6
+ metadata.gz: dbf9166b8302c7dc786562b7e2e17381b4cf33b570825570cc153dc438bfbef4991e53b0e86811821f08b2a4a00e3b6522bac903764a6bfe3e90f04be4d556ea
7
+ data.tar.gz: 5ee6cda495f98e9f68c4b3ea79b1a0fa28ab833113a00c3aa9a43e9a67e94c4aa79995faf8d8745e063d7c480d77dde246940d8d6b9c3570d678c39be19b496f
data/.rubocop.yml CHANGED
@@ -21,3 +21,12 @@ RSpec/ExampleLength:
21
21
  Max: 8
22
22
  RSpec/MultipleExpectations:
23
23
  Enabled: false
24
+
25
+ RSpec/ExampleLength:
26
+ Max: 10
27
+
28
+
29
+ Layout/LineLength:
30
+ Exclude:
31
+ - spec/**/*
32
+
data/CHANGELOG.md CHANGED
@@ -1,5 +1,19 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.3.0 - 2026-06-10
4
+
5
+ - Add canonical provider translator (`Translator`) implementing `render_request`,
6
+ `parse_response`, `parse_chunk`, and `capabilities` per N×N routing design
7
+ - Wire provider `render_payload`, `parse_completion_response`, `build_chunk` to
8
+ delegate to translator with legacy Message/Chunk bridge for backward compat
9
+ - Declare vLLM quirks: `tool_calls_as_text`, `forced_tool_choice`, `thinking_tags`,
10
+ `streaming_token_usage`
11
+ - G18 parameter mapping: max_tokens, temperature, top_p, top_k, stop_sequences,
12
+ seed, frequency_penalty, presence_penalty, response_format
13
+ - Qwen-style </think> tag extraction and tool-call synthesis from content text
14
+ - Adopt conformance kit (`it_behaves_like 'a canonical provider translator'`)
15
+ - Bump lex-llm dependency floor to >= 0.5.0
16
+
3
17
  ## 0.2.13 - 2026-06-05
4
18
 
5
19
  - Fix missing documentation comment on `DiscoveryRefresh` actor (RuboCop Style/Documentation)
data/Gemfile CHANGED
@@ -3,10 +3,9 @@
3
3
  source 'https://rubygems.org'
4
4
 
5
5
  group :test do
6
- llm_base_path = ENV.fetch('LEX_LLM_PATH', File.expand_path('../lex-llm', __dir__))
7
6
  transport_path = ENV.fetch('LEGION_TRANSPORT_PATH', File.expand_path('../../legion-transport', __dir__))
8
7
  gem 'legion-transport', path: transport_path if File.directory?(transport_path)
9
- gem 'lex-llm', path: llm_base_path if File.directory?(llm_base_path)
8
+ # lex-llm >= 0.5.0 carries canonical types + conformance kit (released on rubygems.org)
10
9
  end
11
10
 
12
11
  gemspec
data/lex-llm-vllm.gemspec CHANGED
@@ -27,5 +27,5 @@ Gem::Specification.new do |spec|
27
27
  spec.add_dependency 'legion-logging', '>= 1.3.2'
28
28
  spec.add_dependency 'legion-settings', '>= 1.3.14'
29
29
  spec.add_dependency 'legion-transport', '>= 1.4.14'
30
- spec.add_dependency 'lex-llm', '>= 0.4.3'
30
+ spec.add_dependency 'lex-llm', '>= 0.5.0'
31
31
  end
@@ -53,6 +53,11 @@ module Legion
53
53
  Vllm.default_settings
54
54
  end
55
55
 
56
# Returns the canonical Translator for this provider, building it on first
# use with the provider's config and reusing it afterwards.
def translator
  return @translator if @translator

  @translator = Translator.new(config: config)
end
60
+
56
61
  def api_base
57
62
  normalize_url(config.vllm_api_base || settings[:endpoint] || 'http://localhost:8000')
58
63
  end
@@ -173,15 +178,159 @@ module Legion
173
178
  )
174
179
  end
175
180
 
181
+ # ── Canonical bridge: legacy provider API → Canonical::Request ──
182
+
183
+ # rubocop:disable Metrics/ParameterLists, Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity -- bridge method can be complex
184
# Builds a Canonical::Request from the legacy provider call parameters.
#
# messages    - legacy messages; entries responding to #to_h are converted,
#               anything else is skipped.
# tools       - tool collection (anything #to_h-able); values that are not
#               already Canonical::ToolDefinition are converted via from_hash.
# temperature - forwarded as a canonical param.
# model       - model object (responding to #id) or a model name.
# stream      - forwarded unchanged.
# schema      - when present, forwarded as the response_format param.
# thinking    - object responding to #enabled?/#effort, or a Hash with
#               effort/budget keys; anything else yields no thinking config.
# tool_prefs  - optional Hash whose :choice/'choice' entry selects the tool choice.
def build_canonical_request(
  messages:, tools:, temperature:, model:, stream:, schema:, thinking:, tool_prefs:
)
  model_id = model.respond_to?(:id) ? model.id : model.to_s

  # A leading system message is lifted into `system:`. Keeping it in the
  # conversation as well would carry the same prompt twice in the request,
  # so it is dropped from the message list once extracted.
  system_prompt = extract_system_prompt(messages)
  conversation = system_prompt ? messages.drop(1) : messages

  canonical_messages = conversation.filter_map do |msg|
    Canonical::Message.from_hash(msg.to_h) if msg.respond_to?(:to_h)
  end

  canonical_tools = tools.to_h.transform_values do |tool|
    if tool.is_a?(Canonical::ToolDefinition)
      tool
    else
      Canonical::ToolDefinition.from_hash(tool.respond_to?(:to_h) ? tool.to_h : tool)
    end
  end

  params_hash = { temperature: temperature }
  params_hash[:response_format] = schema if schema
  canonical_params = Canonical::Params.from_hash(params_hash)

  canonical_thinking = if thinking.respond_to?(:enabled?) && thinking.enabled?
                         Canonical::Thinking::Config.new(
                           effort: thinking.respond_to?(:effort) ? thinking.effort : nil
                         )
                       elsif thinking.is_a?(Hash)
                         Canonical::Thinking::Config.new(
                           effort: thinking[:effort] || thinking['effort'],
                           budget: thinking[:budget] || thinking['budget']
                         )
                       end

  Canonical::Request.build(
    messages: canonical_messages,
    system: system_prompt,
    tools: canonical_tools,
    tool_choice: format_tool_choice_from_prefs(tool_prefs),
    params: canonical_params,
    thinking: canonical_thinking,
    stream: stream,
    metadata: { model: model_id }
  )
end
230
+ # rubocop:enable Metrics/ParameterLists, Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
231
+
232
+ # ── Canonical bridge: Canonical→legacy Message/Chunk ──
233
+
234
+ # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity -- verbose bridge
235
# Converts a canonical response into the legacy assistant Message.
#
# canonical     - object exposing thinking, tool_calls, usage, text and model.
# raw_body      - original response body, stored on the message as `raw`.
# _raw_response - unused; kept for signature compatibility.
#
# Tool calls are keyed by name (falling back to id) as before. When two calls
# would share a key, later ones are re-keyed by id, and failing that by a
# positional suffix, so that none is silently overwritten.
def to_legacy_message(canonical, raw_body, _raw_response)
  thinking = nil
  if canonical.thinking
    thinking = Thinking.build(
      text: canonical.thinking.content,
      signature: canonical.thinking.signature
    )
  end

  tool_calls = canonical.tool_calls.each_with_object({}) do |tc, calls|
    key = (tc.name || tc.id).to_s.to_sym
    key = tc.id.to_s.to_sym if calls.key?(key) && !tc.id.to_s.empty?
    key = :"#{key}_#{calls.size}" if calls.key?(key)

    calls[key] = Legion::Extensions::Llm::ToolCall.new(
      id: tc.id,
      name: tc.name,
      arguments: tc.arguments
    )
  end

  usage = canonical.usage || {}
  # Usage may be a plain Hash (no token readers), in which case counts stay nil.
  read_usage = ->(field) { usage.respond_to?(field) ? usage.public_send(field) : nil }

  Legion::Extensions::Llm::Message.new(
    role: :assistant,
    content: canonical.text,
    model_id: canonical.model,
    tool_calls: tool_calls.empty? ? nil : tool_calls,
    thinking: thinking,
    input_tokens: read_usage.call(:input_tokens),
    output_tokens: read_usage.call(:output_tokens),
    reasoning_tokens: read_usage.call(:thinking_tokens),
    raw: raw_body
  )
end
268
+ # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
269
+
270
# Converts a canonical stream chunk into the legacy Chunk shape.
#
# A :thinking_delta chunk carries its delta as thinking with no content;
# every other chunk type carries the delta as content.
# NOTE(review): tool_calls is always nil here, so tool-call deltas do not
# appear to reach legacy consumers through this bridge — confirm intended.
def to_legacy_chunk(canonical, raw_data)
  usage = canonical&.usage || {}
  delta = canonical.delta
  reasoning = canonical.type == :thinking_delta

  token_count = lambda do |field|
    usage.respond_to?(field) ? usage.public_send(field) : nil
  end

  Legion::Extensions::Llm::Chunk.new(
    role: :assistant,
    content: reasoning ? nil : delta,
    model_id: raw_data['model'],
    tool_calls: nil,
    thinking: reasoning ? Thinking.build(text: delta) : nil,
    input_tokens: token_count.call(:input_tokens),
    output_tokens: token_count.call(:output_tokens),
    raw: raw_data
  )
end
291
+
292
+ # ── Tool choice helpers ──
293
+
294
# Derives the canonical tool choice from the legacy tool preferences.
#
# Returns nil when no preferences or no choice are given, the choice as a
# Symbol for auto/none/required, and `{ name: "<choice>" }` for anything else.
def format_tool_choice_from_prefs(tool_prefs)
  choice = tool_prefs && (tool_prefs[:choice] || tool_prefs['choice'])
  return nil unless choice

  label = choice.to_s
  case label
  when 'auto', 'none', 'required' then choice.to_sym
  else { name: label }
  end
end
303
+
304
+ # rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity -- multibranch guard chain for system parsing
305
# Returns the text of a leading system message, or nil.
#
# Only the first entry of an Array is inspected. It may be an object exposing
# #role/#content or a Hash with symbol or string keys. The role is compared by
# its string form, so Symbol and String roles are both accepted and a missing
# (nil) role yields nil instead of raising. Non-String content yields nil.
def extract_system_prompt(messages)
  return nil unless messages.is_a?(Array)

  first = messages.first
  return nil unless first

  role = first.respond_to?(:role) ? first.role : (first[:role] || first['role'])
  return nil unless role.to_s == 'system'

  content = first.respond_to?(:content) ? first.content : (first[:content] || first['content'])
  content.is_a?(String) ? content : nil
end
318
+ # rubocop:enable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
319
+
176
320
  def render_payload(messages, tools:, temperature:, model:, stream:, schema:, thinking:, tool_prefs:) # rubocop:disable Metrics/ParameterLists
177
- payload = super
178
- payload.delete(:reasoning_effort)
179
- payload[:chat_template_kwargs] = { enable_thinking: true } if thinking_enabled?(thinking)
321
+ # Build a canonical request from provider call parameters,
322
+ # then delegate to the translator for wire-format rendering.
323
+ canonical_req = build_canonical_request(
324
+ messages:, tools:, temperature:, model:, stream:,
325
+ schema:, thinking:, tool_prefs:
326
+ )
327
+ wire = translator.render_request(canonical_req)
328
+
180
329
  log.debug do
181
- "rendered vLLM payload model=#{model.respond_to?(:id) ? model.id : model} stream=#{stream} " \
182
- "tools=#{tools.respond_to?(:size) ? tools.size : 0} thinking=#{payload.key?(:chat_template_kwargs)}"
330
+ "vLLM provider rendered wire payload model=#{wire[:model]} stream=#{wire[:stream]} " \
331
+ "messages=#{(wire[:messages] || []).size} keys=#{wire.keys.join(', ')}"
183
332
  end
184
- payload
333
+ wire
185
334
  end
186
335
 
187
336
  def thinking_enabled?(thinking)
@@ -214,6 +363,24 @@ module Legion
214
363
  vllm[:enable_thinking] == true || vllm['enable_thinking'] == true
215
364
  end
216
365
 
366
# Override: parses a completion response through the canonical translator,
# then bridges the canonical result back to the legacy Message shape so
# existing callers keep working.
def parse_completion_response(response)
  raw_body = response.body
  to_legacy_message(translator.parse_response(raw_body), raw_body, response)
end
375
+
376
# Override: parses one SSE payload through the canonical translator and
# bridges it to a legacy Chunk. Returns nil when the translator yields nothing.
def build_chunk(data)
  parsed = translator.parse_chunk(data)
  parsed.nil? ? nil : to_legacy_chunk(parsed, data)
end
383
+
217
384
  def parse_list_models_response(response, provider, capabilities)
218
385
  response.body.fetch('data', []).map do |model|
219
386
  critical_capabilities = critical_capabilities_for(capabilities, model)
@@ -0,0 +1,696 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'legion/extensions/llm/canonical'
4
+ require 'legion/extensions/llm/responses/thinking_extractor'
5
+ require 'legion/json'
6
+ require 'legion/logging'
7
+
8
+ module Legion
9
+ module Extensions
10
+ module Llm
11
+ module Vllm
12
+ # Canonical provider translator for vLLM (OpenAI-compatible wire format).
13
+ #
14
+ # Implements render_request, parse_response, parse_chunk, and capabilities.
15
+ # Extracted from existing format_openai_*/parse_* methods in OpenAICompatible mixin
16
+ # and vLLM-specific render_payload override in Provider.
17
+ #
18
+ # vLLM quirks (declared in capabilities):
19
+ # - tool_calls_as_text: true — some model configurations output tool calls
20
+ # as JSON text in the content field rather than structured tool_calls.
21
+ # - forced_tool_choice: true — vLLM's tool_choice handling is strict;
22
+ # named tool choices must be explicit function references.
23
+ # - thinking_tags: ['think', 'thinking'] — Qwen-style models emit reasoning
24
+ # in <think> or <thinking> tags within content text.
25
+ # rubocop:disable Metrics/ClassLength, Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity -- translator implementation
26
+ class Translator
27
+ include Legion::Logging::Helper
28
+
29
# Maps wire finish_reason values to canonical stop reasons.
# 'tool_calls' is the finish_reason an OpenAI-compatible chat completion
# reports when the model invoked tools; 'tool_use' is kept for inputs that
# already use the canonical spelling. Unknown values use FALLBACK_STOP_REASON.
VLLM_STOP_REASON_MAP = {
  'stop' => :end_turn,
  'tool_use' => :tool_use,
  'tool_calls' => :tool_use,
  'length' => :max_tokens
}.freeze
FALLBACK_STOP_REASON = :end_turn
36
+
37
# G18 parameter mapping: canonical params this translator forwards.
SUPPORTED_PARAMS = %i[
  max_tokens temperature top_p top_k stop_sequences
  seed frequency_penalty presence_penalty response_format
].freeze

# Canonical param name => wire key. Every param keeps its own name on the
# wire except stop_sequences, which is sent as `stop`.
PARAM_WIRE_KEYS = SUPPORTED_PARAMS.to_h do |param|
  [param, param == :stop_sequences ? :stop : param]
end.freeze
55
+
56
# config - optional provider configuration; stored as-is and consulted only
#          as a fallback when a request carries no thinking setting.
def initialize(config: nil)
  @config = config
end
59
+
60
# Renders a canonical request as an OpenAI-compatible chat payload.
#
# The payload always carries model (from request metadata, else 'default'),
# messages and stream. Tools, tool_choice, mapped params, the thinking
# chat-template switch and response_format are added only when present.
def render_request(request)
  model = request.metadata&.dig(:model) || 'default'
  messages = format_messages(request)
  params = request.params

  payload = { model: model, messages: messages, stream: request.stream }
  payload[:tools] = format_tools(request.tools) if request.tools.to_h.any?
  payload[:tool_choice] = format_tool_choice(request.tool_choice) if request.tool_choice
  payload.merge!(map_params_to_wire(params)) if params
  apply_thinking_config(payload, request)
  payload[:response_format] = format_response_format(params) if formatted_response_format?(params)

  log.debug do
    "vLLM translator rendered request model=#{model} stream=#{request.stream} " \
      "messages=#{messages.size} tools=#{request.tools&.size || 0} params=#{payload.keys.size}"
  end

  payload
end
86
+
87
# Parses an OpenAI-compatible completion body into a Canonical::Response.
#
# Non-Hash input yields an error response. Bodies already in canonical shape
# are rebuilt via from_hash. Otherwise the first choice's message is used:
# thinking is split from the content, structured tool calls are parsed, and
# when none exist they are synthesized from the content text.
# JSON parse errors are reported and turned into an error response; any
# other error is reported and re-raised.
def parse_response(wire)
  return canonical_error_response(wire) unless wire.is_a?(Hash)
  # Canonical-form response (from conformance kit): already in canonical shape
  return Canonical::Response.from_hash(wire) if canonical_response?(wire)

  choice = Array(wire['choices']).first || {}
  message = choice['message'] || {}
  thinking_meta = extract_thinking_metadata(message)
  extraction = Responses::ThinkingExtractor.extract(message['content'] || '', metadata: thinking_meta)

  calls = parse_tool_calls(message['tool_calls'])
  # vLLM quirk: tool_calls_as_text — synthesize from content if none found.
  calls.concat(synthesize_tool_calls_from_content(extraction.content, message)) if calls.empty?

  Canonical::Response.build(
    text: (extraction.content || '').to_s,
    thinking: build_canonical_thinking(extraction),
    tool_calls: calls,
    usage: Canonical::Usage.from_hash(wire['usage'] || {}),
    stop_reason: map_stop_reason(choice['finish_reason']),
    model: wire['model'],
    metadata: wire_metadata(wire, message, thinking_meta)
  )
rescue Legion::JSON::ParseError => e
  handle_exception(e, level: :warn, handled: true, operation: 'vllm.translator.parse_response')
  canonical_error_response(wire)
rescue StandardError => e
  handle_exception(e, level: :error, handled: false, operation: 'vllm.translator.parse_response')
  raise
end
132
+
133
# Parses one SSE payload into a Canonical::Chunk, or nil when there is
# nothing to emit.
#
# raw may be a Hash or a JSON String; nil, blank strings and '[DONE]' yield
# nil. Checks run in this order: canonical-form chunk, error chunk,
# usage-only chunk, finished chunk with an empty delta, tool-call delta
# (first call only), reasoning delta, then text delta.
# JSON parse errors are reported and yield nil; other errors are re-raised.
def parse_chunk(raw)
  return nil if raw.nil?
  return nil if raw.is_a?(String) && (raw == '[DONE]' || raw.strip.empty?)

  data = raw.is_a?(Hash) ? raw : parse_json_safely(raw)
  return nil if data.nil?

  # Handle canonical-form chunks (from conformance fixtures or other translators)
  return handle_canonical_chunk(data) if data['type']
  return Canonical::Chunk.error_chunk(error: data['error'], request_id: data['id']) if data['error']

  choice = Array(data['choices']).first
  unless choice
    # A choice-less payload that still carries usage closes the stream.
    return build_done_chunk(data) if choice.nil? && data['usage']

    return nil
  end

  delta = choice['delta'] || {}
  finish_reason = choice['finish_reason']
  request_id = data['request_id'] || data['id']

  if finish_reason && empty_delta?(delta)
    return Canonical::Chunk.done(
      request_id: request_id,
      usage: Canonical::Usage.from_hash(data['usage']),
      stop_reason: map_stop_reason(finish_reason)
    )
  end

  tool_calls = delta['tool_calls']
  reasoning = delta['reasoning_content'] || delta['reasoning']
  content = delta['content']

  if !Array(tool_calls).empty?
    first_call = tool_calls.first
    function = first_call.fetch('function', {})

    Canonical::Chunk.tool_call_delta(
      tool_call: Canonical::ToolCall.build(
        id: (first_call['id'] || function['name'] || 'synthesized').to_s,
        name: function['name'].to_s,
        arguments: parse_tool_arguments(function['arguments']),
        source: :client
      ),
      request_id: request_id,
      block_index: first_call['index']
    )
  elsif !reasoning.to_s.empty?
    Canonical::Chunk.thinking_delta(
      delta: reasoning,
      request_id: request_id,
      block_index: delta.dig('content_block', 'index'),
      item_id: delta['content_block_start']&.dig('id')
    )
  elsif !content.to_s.empty?
    # Text delta — may still contain embedded think tags.
    parse_text_delta_with_thinking(content, request_id, data)
  end
rescue Legion::JSON::ParseError => e
  handle_exception(e, level: :warn, handled: true, operation: 'vllm.translator.parse_chunk')
  nil
rescue StandardError => e
  handle_exception(e, level: :error, handled: false, operation: 'vllm.translator.parse_chunk')
  raise
end
209
+
210
# Declared capabilities for the vLLM provider, returned as a frozen Hash:
# provider identity, wire format, the vLLM quirk flags, the stop-reason map
# and whether token usage is reported while streaming.
def capabilities
  quirks = {
    tool_calls_as_text: true,
    forced_tool_choice: true,
    thinking_tags: %w[think thinking]
  }

  { provider: 'vllm', wire_format: 'openai_compatible' }
    .merge(quirks)
    .merge(stop_reason_map: VLLM_STOP_REASON_MAP, streaming_token_usage: true)
    .freeze
end
222
+
223
+ private
224
+
225
+ attr_reader :config
226
+
227
+ # ── Message formatting ──
228
+
229
# Renders the request's messages for the wire, putting the stripped system
# prompt first as a system message when the prompt is not blank.
def format_messages(request)
  rendered = format_request_messages(request.messages)
  return rendered if request.system.to_s.strip.empty?

  [{ role: 'system', content: request.system.strip }, *rendered]
end
238
+
239
# Formats each canonical message for the wire; nil or empty input gives [].
def format_request_messages(messages)
  return [] if messages.nil? || messages.empty?

  messages.each_with_object([]) { |msg, rendered| rendered << format_message(msg) }
end
244
+
245
# Renders one canonical message as a wire message Hash.
#
# role is always present; content, tool_call_id and tool_calls are emitted
# only when non-nil, and name only when it is neither nil nor empty.
def format_message(msg)
  wire = {
    role: msg.role.to_s,
    content: format_message_content(msg),
    tool_call_id: msg.tool_call_id
  }

  calls = msg.tool_calls
  wire[:tool_calls] = format_message_tool_calls(calls) if calls&.any?

  label = msg.name
  wire[:name] = label unless label.nil? || label.to_s.empty?

  wire.compact
end
260
+
261
# Renders a message's content for the wire.
#
# A non-empty String passes through unchanged. Arrays, single content
# blocks and Hashes are rendered as content parts. Anything else is
# stringified, with nil staying nil.
def format_message_content(msg)
  content = msg.content
  return content if content.is_a?(String) && !content.empty?

  if content.is_a?(Array)
    format_content_blocks(content)
  elsif content.is_a?(Canonical::ContentBlock)
    format_content_blocks([content])
  elsif content.is_a?(Hash)
    format_content_blocks_from_hash(content)
  else
    content&.to_s
  end
end
276
+
277
# Renders a list of content blocks as wire content parts.
# Canonical blocks and Hashes go through their own formatters; any other
# entry becomes a text part. An empty list yields '' rather than [].
def format_content_blocks(blocks)
  parts = blocks.map do |block|
    case block
    when Canonical::ContentBlock then format_content_block(block)
    when Hash then format_content_block_from_hash(block)
    else { type: 'text', text: block.to_s }
    end
  end

  parts.empty? ? '' : parts
end
289
+
290
# Renders one canonical content block as a wire content part.
# :tool_use becomes a text part holding the JSON of its input, :image goes
# through build_image_block, and every other type (text, thinking,
# tool_result, unknown) becomes a text part of block.text.
def format_content_block(block)
  case block.type
  when :tool_use
    { type: 'text', text: Legion::JSON.generate(block.input || {}) }
  when :image
    build_image_block(block)
  else
    { type: 'text', text: block.text.to_s }
  end
end
304
+
305
# Renders hash-shaped content as a list of wire content parts: a single Hash
# gives a one-element list, an Array is mapped entry by entry, anything else
# gives [].
def format_content_blocks_from_hash(hash_input)
  return [format_content_block_from_hash(hash_input)] if hash_input.is_a?(Hash)
  return [] unless hash_input.is_a?(Array)

  hash_input.map { |entry| format_content_block_from_hash(entry) }
end
315
+
316
# Renders one hash-shaped content block as a wire content part.
# Keys are symbolized first; a missing type counts as :text. :tool_use
# becomes a text part with the JSON of :input, :image/:image_url becomes an
# image_url part (url from :data, then :url, else ''), and every other type
# becomes a text part of :text.
def format_content_block_from_hash(block_hash)
  fields = block_hash.transform_keys(&:to_sym)

  case (fields[:type] || :text).to_sym
  when :tool_use
    { type: 'text', text: Legion::JSON.generate(fields[:input] || {}) }
  when :image, :image_url
    { type: 'image_url', image_url: { url: fields[:data] || fields[:url] || '' } }
  else
    { type: 'text', text: fields[:text].to_s }
  end
end
333
+
334
# Renders an image block as an image_url content part.
# Base64 sources with a media type become a data: URL; otherwise block.data
# is used as the URL. A block with neither data nor source_type gives {}.
# NOTE(review): the {} result ends up as an empty content part in the
# message — confirm the server tolerates that.
def build_image_block(block)
  return {} if !block.data && !block.source_type

  inline = block.source_type == :base64 && block.media_type
  url = inline ? "data:#{block.media_type};base64,#{block.data}" : block.data

  { type: 'image_url', image_url: { url: url } }
end
344
+
345
# Renders the tool calls attached to a history message, one wire entry each.
def format_message_tool_calls(tool_calls)
  tool_calls.empty? ? [] : tool_calls.map { |entry| format_tool_call_for_history(entry) }
end
350
+
351
# Renders one prior tool call as an OpenAI-style function call entry.
# Accepts a Canonical::ToolCall, a Hash (symbol or string keys), or any
# object indexable by :name/:id/:arguments. Hash arguments are serialized to
# JSON; other argument values are stringified.
def format_tool_call_for_history(tool_call_entry)
  fields =
    if tool_call_entry.is_a?(Canonical::ToolCall)
      { name: tool_call_entry&.name&.to_s, id: tool_call_entry&.id&.to_s,
        arguments: tool_call_entry&.arguments || {} }
    elsif tool_call_entry.is_a?(Hash)
      tool_call_entry.transform_keys(&:to_sym)
    else
      tool_call_entry
    end

  lookup = ->(key) { fields[key] || fields[key.to_s] }

  args = lookup.call(:arguments) || {}
  serialized = args.is_a?(Hash) ? Legion::JSON.generate(args) : args.to_s

  {
    id: lookup.call(:id).to_s,
    type: 'function',
    function: { name: lookup.call(:name).to_s, arguments: serialized }
  }
end
373
+
374
+ # ── Tool formatting ──
375
+
376
# Renders the tool collection as OpenAI-style function tool entries.
# Each value may be a Canonical::ToolDefinition, a Hash, or another
# indexable object. Parameters come from :parameters, then :input_schema;
# whatever is not (convertible to) a Hash is replaced by an empty object
# schema.
def format_tools(tools)
  definitions = tools.to_h
  return [] if definitions.empty?

  definitions.each_value.map do |tool|
    spec = case tool
           when Canonical::ToolDefinition
             { name: tool.name, description: tool.description, parameters: tool.parameters }
           when Hash
             tool.transform_keys(&:to_sym)
           else
             tool
           end

    schema = spec[:parameters] || spec[:input_schema] || { type: 'object', properties: {} }
    schema = schema.to_h if schema.respond_to?(:to_h) && !schema.is_a?(Hash)
    schema = { type: 'object', properties: {} } unless schema.is_a?(Hash)

    {
      type: 'function',
      function: {
        name: (spec[:name] || spec['name']).to_s,
        description: (spec[:description] || spec['description'] || '').to_s,
        parameters: schema
      }
    }
  end
end
405
+
406
# Renders the canonical tool choice for the wire.
# auto/none/required (Symbol or String) become their string form; a Hash or
# any other Symbol/String names a specific function. Other values give nil.
def format_tool_choice(choice)
  return nil unless choice

  case choice
  when :auto, 'auto', :none, 'none', :required, 'required'
    choice.to_s
  when Hash
    { type: 'function', function: { name: (choice[:name] || choice['name']).to_s } }
  when Symbol, String
    { type: 'function', function: { name: choice.to_s } }
  end
end
423
+
424
+ # ── Parameter mapping (G18) ──
425
+
426
# Maps canonical params onto wire keys.
# Only SUPPORTED_PARAMS with a non-nil value are emitted, under the key given
# by PARAM_WIRE_KEYS. stop_sequences and response_format are reshaped on the
# way. max_thinking_tokens has no wire key here; it is only logged as dropped.
# Returns {} when params is not a Canonical::Params.
def map_params_to_wire(params)
  return {} unless params.is_a?(Canonical::Params)

  wire = SUPPORTED_PARAMS.each_with_object({}) do |param_key, mapped|
    value = params.public_send(param_key)
    next if value.nil?

    mapped[PARAM_WIRE_KEYS[param_key]] =
      if param_key == :stop_sequences
        format_stop_sequences(value)
      elsif param_key == :response_format
        format_response_format_value(value)
      else
        value
      end
  end

  dropped = params.max_thinking_tokens ? [:max_thinking_tokens] : []
  unless dropped.empty?
    log.debug do
      "vLLM translator dropping unsupported params: #{dropped.join(', ')} " \
        '(handled via vLLM-specific render paths)'
    end
  end

  wire
end
457
+
458
# Ensures stop sequences go out as an Array, wrapping a single value.
def format_stop_sequences(sequences)
  return sequences if sequences.is_a?(Array)

  [sequences]
end
461
+
462
# Returns the wire response_format for the given params, or nil when the
# params carry none.
def format_response_format(params)
  formatted_response_format?(params) ? format_response_format_value(params.response_format) : nil
end
467
+
468
# Truthy (the response_format itself) when params is a Canonical::Params
# carrying a response_format; false for any other params object.
def formatted_response_format?(params)
  params.is_a?(Canonical::Params) ? params.response_format : false
end
471
+
472
# Shapes a response_format value for the wire.
# Strings pass through. A Hash typed json_schema is rewritten to
# `{ type:, json_schema: }` (taking :schema, then :json_schema), one typed
# json_object to `{ type: 'json_object' }`. Everything else is returned
# untouched.
def format_response_format_value(value)
  return value if value.is_a?(String)

  spec = value.is_a?(Hash) ? value.transform_keys(&:to_sym) : {}
  kind = spec[:type]&.to_s

  if kind == 'json_schema'
    { type: 'json_schema', json_schema: spec[:schema] || spec[:json_schema] }
  elsif kind == 'json_object'
    { type: 'json_object' }
  else
    value
  end
end
488
+
489
+ # ── Thinking configuration ──
490
+
491
# Switches thinking on in the payload through chat_template_kwargs when the
# request (or config fallback) enables it. A positive max_thinking_tokens is
# only logged; nothing is added to the payload for it.
def apply_thinking_config(payload, request)
  if enable_thinking?(request)
    payload[:chat_template_kwargs] = { enable_thinking: true }
    budget = request.params&.max_thinking_tokens
    log.debug { "vLLM translator thinking max_thinking_tokens=#{budget} via chat template" } if budget&.positive?
  end
end
500
+
501
# Decides whether thinking should be enabled for the request.
# - Canonical::Thinking::Config: enabled when its #enabled? is truthy.
# - Hash: enabled unless :enabled is explicitly false.
# - nil: falls back to config's enable_thinking (reader or [] lookup), which
#   must be exactly true.
# - anything else: disabled.
def enable_thinking?(request)
  requested = request.thinking

  case requested
  when Canonical::Thinking::Config
    requested.enabled? ? true : false
  when Hash
    requested[:enabled] != false
  when nil
    return false unless config

    flag = if config.respond_to?(:enable_thinking)
             config.enable_thinking
           elsif config.respond_to?(:[])
             config[:enable_thinking]
           end
    flag == true
  else
    false
  end
end
516
+
517
+ # ── Response parsing ──
518
+
519
# Builds an error-flavoured Canonical::Response (empty text, no tool calls,
# stop_reason :error). When wire is a Hash its 'error', 'usage' and 'model'
# entries are carried over; otherwise a generic parse_error is reported.
def canonical_error_response(wire)
  body = wire.is_a?(Hash) ? wire : {}
  fallback_error = { type: 'parse_error', message: 'Failed to parse response' }

  Canonical::Response.build(
    text: '',
    tool_calls: [],
    usage: Canonical::Usage.from_hash(body['usage'] || {}),
    stop_reason: :error,
    model: body['model'],
    metadata: { error: body['error'] || fallback_error }
  )
end
532
+
533
# Collects the reasoning/thinking fields present on a wire message into a
# symbol-keyed Hash, leaving out fields whose value is nil.
def extract_thinking_metadata(message)
  fields = %w[
    reasoning_content reasoning thinking thinking_text
    thinking_signature reasoning_signature
  ]

  fields.each_with_object({}) do |field, meta|
    value = message[field]
    meta[field.to_sym] = value unless value.nil?
  end
end
543
+
544
# Wraps extracted thinking text and signature in a Canonical::Thinking, or
# returns nil when the extraction carries neither.
def build_canonical_thinking(extraction)
  reasoning = extraction.thinking
  return nil unless reasoning || extraction.signature

  Canonical::Thinking.new(content: reasoning, signature: extraction.signature)
end
552
+
553
# Parses structured wire tool calls into canonical tool calls.
# The id falls back to the function name, then the call's index. An entry
# that raises while being parsed is reported and skipped, not propagated.
# Returns [] unless given a non-empty Array.
def parse_tool_calls(tool_calls)
  return [] if !tool_calls.is_a?(Array) || tool_calls.empty?

  tool_calls.filter_map do |call|
    function = call.fetch('function', {})
    name = function['name']

    Canonical::ToolCall.build(
      id: (call['id'] || name || call['index']).to_s,
      name: name.to_s,
      arguments: parse_tool_arguments(function['arguments']),
      source: :client
    )
  rescue StandardError => e
    handle_exception(e, level: :warn, handled: true, operation: 'vllm.translator.parse_tool_call')
    nil
  end
end
573
+
574
# Normalizes tool-call arguments: nil or '' gives {}, a Hash passes through,
# and anything else is parsed as JSON, with unparseable input giving {}.
def parse_tool_arguments(arguments)
  if arguments.nil? || arguments == ''
    {}
  elsif arguments.is_a?(Hash)
    arguments
  else
    Legion::JSON.load(arguments)
  end
rescue Legion::JSON::ParseError
  {}
end
582
+
583
# vLLM quirk: recovers a tool call the model wrote as JSON text.
# The whole content is tried first; failing that, the first flat (brace-free
# inside) JSON-looking fragment mentioning tool/function/name/arguments is
# tried. Returns a list with at most one tool call.
def synthesize_tool_calls_from_content(content, _message)
  return [] unless content.is_a?(String) && !content.empty?

  whole = try_parse_tool_call_from_text(content)
  return [whole] if whole

  fragment = content[/\{[^{}]*(?:tool|function|name|arguments)[^{}]*\}/m]
  embedded = fragment && try_parse_tool_call_from_text(fragment)
  embedded ? [embedded] : []
end
596
+
597
# Tries to read `text` as a JSON object describing a tool call.
# The name comes from :name or :function_name; the arguments from
# :arguments, :parameters or :input (a String there is parsed as JSON again).
# Returns nil when the text is not a JSON object, has no name, or fails to
# parse.
# NOTE(review): only symbol keys are read, so this relies on
# Legion::JSON.load returning symbol-keyed hashes — confirm.
def try_parse_tool_call_from_text(text)
  parsed = Legion::JSON.load(text)
  return nil unless parsed.is_a?(Hash)

  name = parsed[:name] || parsed[:function_name]
  args = %i[arguments parameters input].lazy.map { |key| parsed[key] }.find(&:itself) || {}
  args = Legion::JSON.load(args) if args.is_a?(String)
  return nil if name.nil? || name.to_s.empty?

  Canonical::ToolCall.build(
    name: name.to_s,
    arguments: args.is_a?(Hash) ? args : {},
    source: :client
  )
rescue Legion::JSON::ParseError
  nil
end
615
+
616
# Collects response metadata worth keeping from the wire: the message's
# reasoning_content and the usage's completion_tokens_details, each only
# when present.
def wire_metadata(wire, message, _thinking_meta)
  reasoning = message['reasoning_content']
  usage = wire['usage']
  details = usage['completion_tokens_details'] if usage.is_a?(Hash)

  meta = {}
  meta[:reasoning_content] = reasoning if reasoning
  meta[:completion_tokens_details] = details if details
  meta
end
625
+
626
+ # ── Chunk helpers ──
627
+
628
# Builds the terminal chunk for a payload that carries usage but no choice.
# The stop reason is left nil because such a payload has no finish_reason.
def build_done_chunk(data)
  request_id = data['request_id'] || data['id']
  usage = Canonical::Usage.from_hash(data['usage'])

  Canonical::Chunk.done(request_id: request_id, usage: usage, stop_reason: nil)
end
635
+
636
# True when a stream delta carries nothing to emit: no content, no tool
# calls and no reasoning text.
#
# Reasoning is checked under both 'reasoning_content' and 'reasoning',
# because the chunk parser reads either key. Checking only the first would
# class a final chunk whose sole payload is 'reasoning' as empty and lose
# that text.
def empty_delta?(delta)
  reasoning = delta['reasoning_content'] || delta['reasoning']

  delta['content'].to_s.empty? &&
    Array(delta['tool_calls']).empty? &&
    reasoning.to_s.empty?
end
641
+
642
# Turns a content delta into a chunk, splitting out embedded thinking.
# When the extractor finds thinking text, a thinking delta is emitted (the
# clean text from the same delta is not); otherwise a text delta is emitted
# with the cleaned text, or the raw content if there is none.
def parse_text_delta_with_thinking(content, request_id, data)
  clean_text, thinking_text = Responses::ThinkingExtractor.extract_from_content(content).values_at(0, 1)

  unless thinking_text.nil? || thinking_text.empty?
    return Canonical::Chunk.thinking_delta(delta: thinking_text, request_id: request_id)
  end

  Canonical::Chunk.text_delta(
    delta: clean_text || content,
    request_id: request_id,
    index: data['index']
  )
end
657
+
658
+ # Parse a canonical-form chunk (from conformance kit fixtures).
659
+
660
# Detects a body that is already in canonical response shape (as produced by
# conformance fixtures) instead of the OpenAI wire shape: it has a top-level
# text or stop_reason key. Both keys are recognised in String and Symbol form.
def canonical_response?(wire)
  ['text', :text, 'stop_reason', :stop_reason].any? { |key| wire.key?(key) }
end
664
+
665
# Rebuilds a chunk that already arrives in canonical form (it has a 'type').
# Any failure to rebuild is logged at debug level and the chunk is dropped
# by returning nil.
def handle_canonical_chunk(data)
  begin
    Canonical::Chunk.from_hash(data)
  rescue StandardError => e
    log.debug { "vLLM translator canonical chunk parse error: #{e.message}" }
    nil
  end
end
671
+
672
+ # ── Stop reason mapping ──
673
+
674
# Maps a wire finish_reason to a canonical stop reason. Missing, blank and
# unknown reasons all resolve to FALLBACK_STOP_REASON.
def map_stop_reason(raw)
  reason = raw.to_s
  reason.empty? ? FALLBACK_STOP_REASON : VLLM_STOP_REASON_MAP.fetch(reason, FALLBACK_STOP_REASON)
end
679
+
680
+ # ── JSON helpers ──
681
+ # Never use bare ::JSON inside the Legion namespace.
682
+
683
# Parses a JSON String via Legion::JSON, returning nil for non-String input
# and for unparseable text (the parse error is logged at debug level).
def parse_json_safely(raw)
  Legion::JSON.load(raw) if raw.is_a?(String)
rescue Legion::JSON::ParseError => e
  log.debug { "vLLM translator chunk parse error: #{e.message}" }
  nil
end
691
+ end
692
+ # rubocop:enable Metrics/ClassLength, Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
693
+ end
694
+ end
695
+ end
696
+ end
@@ -4,7 +4,7 @@ module Legion
4
4
  module Extensions
5
5
  module Llm
6
6
  module Vllm
7
- VERSION = '0.2.13'
7
+ VERSION = '0.3.0'
8
8
  end
9
9
  end
10
10
  end
@@ -1,6 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'legion/extensions/llm'
4
+ require 'legion/extensions/llm/vllm/translator'
4
5
  require 'legion/extensions/llm/vllm/provider'
5
6
  require 'legion/extensions/llm/vllm/version'
6
7
  require 'legion/logging'
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lex-llm-vllm
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.13
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - LegionIO
@@ -71,14 +71,14 @@ dependencies:
71
71
  requirements:
72
72
  - - ">="
73
73
  - !ruby/object:Gem::Version
74
- version: 0.4.3
74
+ version: 0.5.0
75
75
  type: :runtime
76
76
  prerelease: false
77
77
  version_requirements: !ruby/object:Gem::Requirement
78
78
  requirements:
79
79
  - - ">="
80
80
  - !ruby/object:Gem::Version
81
- version: 0.4.3
81
+ version: 0.5.0
82
82
  description: vLLM provider integration for the LegionIO LLM routing framework.
83
83
  email:
84
84
  - matthewdiverson@gmail.com
@@ -101,6 +101,7 @@ files:
101
101
  - lib/legion/extensions/llm/vllm/actors/fleet_worker.rb
102
102
  - lib/legion/extensions/llm/vllm/provider.rb
103
103
  - lib/legion/extensions/llm/vllm/runners/fleet_worker.rb
104
+ - lib/legion/extensions/llm/vllm/translator.rb
104
105
  - lib/legion/extensions/llm/vllm/version.rb
105
106
  homepage: https://github.com/LegionIO/lex-llm-vllm
106
107
  licenses: