lex-ollama 0.3.4 → 0.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 978c53ff8a178c003a5bb593a934536c20b616500d80ea0624f97014f9a88213
4
- data.tar.gz: d700e31e6f38fe2b9c6cac3da627d67ce1ccab9a75d3d6d741cdc04f5cc614bf
3
+ metadata.gz: c764634fdcad6f8a0d70a8221eb26979cd4bc0338e68b61b072d552574da5ced
4
+ data.tar.gz: cab333e0e78243cddfbc291fc9d243a652ecd4ff5fbcecf942a442e76ab171fc
5
5
  SHA512:
6
- metadata.gz: 6f44dcfc98336bcd0d28e6985ed468f7676b156d5135ff642256120db59563e161d46615b9acab0e3cdac6b578144121d60a23efc17c31f6f6c686349519f076
7
- data.tar.gz: b191eacce0844eb0be9b6b4b22f12969007b37f335da700f6ea6bd4936b22fd6aa2eec945dbdfbb419f5b1a9f9f1b0c9e15c004d772e18a1a98c059c133e83e8
6
+ metadata.gz: 6efd9372da01f35c6e23a81156e209937b594674c32255a3f15e3bde04196108d734478735bdbe0e081191354269be5df212e7635df0c5511aafa6f24e56c4fb
7
+ data.tar.gz: cc829781858f0786b3a29353b372e792b45363b3b294a604f6496b3f0779b698f229db99996c8f8ebeae9461f0cb150a725dfdf49209e87b8e7f89a06206d233
data/CHANGELOG.md CHANGED
@@ -1,5 +1,11 @@
1
1
  # Changelog
2
2
 
3
+ ## [0.3.5] - 2026-04-25
4
+
5
+ ### Added
6
+ - Fleet model workers now bind transient classic queues to shared `llm.fleet` model lanes, with configurable consumer priority, queue expiration, and message TTL.
7
+ - Subscription entries can provide a context window so inference workers bind lanes like `llm.fleet.inference.qwen3-5-27b.ctx32768`.
8
+
3
9
  ## [0.3.4] - 2026-04-24
4
10
 
5
11
  ### Fixed
@@ -0,0 +1,230 @@
1
+ # frozen_string_literal: true
2
+
3
module Legion
  module Extensions
    module Ollama
      module Actor
        # Polls configured fleet queues with basic_get so endpoint machines choose
        # when they are ready for work instead of holding prefetched messages.
        #
        # On every tick (+action+) the configured subscriptions are visited in
        # priority order (embed lanes first, then ascending context size, then
        # model name). Each lane is drained one message at a time; a lane that
        # yields nothing is skipped until its empty-lane backoff has elapsed.
        class EndpointPuller < Legion::Extensions::Actors::Every
          # Request types that are treated as embed lanes when ordering subscriptions.
          EMBED_TYPES = %w[embed embedding embeddings].freeze

          # @return [Class] this actor class; it acts as its own runner.
          def runner_class
            self.class
          end

          # @return [String] name of the method invoked on each tick.
          def runner_function
            'action'
          end

          # Framework switches inherited from the base actor; all are turned off
          # for this actor. NOTE(review): exact semantics are defined by the
          # base actor class, which is not visible here.
          def use_runner?
            false
          end

          def check_subtask?
            false
          end

          def generate_task?
            false
          end

          # @return [Boolean] true only when the fleet scheduler is :basic_get, the
          #   endpoint is explicitly enabled, and at least one valid subscription
          #   is configured. Any StandardError raised while reading settings is
          #   reported through handle_exception and treated as "disabled".
          def enabled?
            fleet_scheduler == :basic_get && endpoint_enabled? && subscriptions.any?
          rescue StandardError => e
            handle_exception(e, level: :warn, handled: true)
            false
          end

          # @return [Float] the endpoint's idle_backoff_ms setting in seconds
          #   (defaults to 1000 ms). Presumably the tick interval used by the
          #   Every base class — confirm against the base actor.
          def time
            (setting_value(endpoint_settings, :idle_backoff_ms) || 1_000).to_f / 1000
          end

          # One polling pass: drain every lane that is not currently backed off and
          # back off the lanes that produced no message.
          #
          # @return [void]
          def action
            return unless enabled?

            # A single timestamp is used for the whole pass so every lane is
            # judged against the same instant.
            now = monotonic_time
            ordered_subscriptions.each do |sub|
              next if lane_backed_off?(sub, now)

              pulled = drain_lane(sub)
              mark_lane_empty(sub) if pulled.zero?
            end
          end

          # @return [Array<Hash>] subscriptions sorted embed-first, then by
          #   ascending context limit (no limit sorts last), then by model name.
          def ordered_subscriptions
            subscriptions.sort_by do |sub|
              type = sub[:type].to_s
              [embed_type?(type) ? 0 : 1, context_limit(sub), sub[:model].to_s]
            end
          end

          # Pulls messages from one lane until the lane reports no message or the
          # configured per-lane cap is reached.
          #
          # @param subscription [Hash] normalized subscription (symbol keys)
          # @return [Integer] number of messages pulled from the lane
          def drain_lane(subscription)
            pulls = 0
            queue = queue_for(subscription)
            # Read the cap once per drain; 0 (the default) means "no cap".
            limit = max_consecutive_pulls_per_lane

            loop do
              break if limit.positive? && pulls >= limit
              break unless pull_one(queue, subscription)

              pulls += 1
            end
            pulls
          end

          # Fetches at most one message, hands it to the fleet runner and
          # acknowledges it. A message whose processing raises is rejected
          # without requeue.
          #
          # @param queue [Object] lane queue instance returned by queue_for
          # @param subscription [Hash] normalized subscription the queue belongs to
          # @return [Boolean] true when a message was fetched (whether it was
          #   acknowledged or rejected); false when the queue was empty or the
          #   fetch itself failed
          def pull_one(queue, subscription)
            delivery_info, metadata, payload = queue.pop(manual_ack: true)
            return false unless delivery_info

            message = process_payload(payload, metadata, delivery_info, subscription)
            Legion::Extensions::Ollama::Runners::Fleet.handle_request(**message)
            queue.acknowledge(delivery_info.delivery_tag)
            true
          rescue StandardError => e
            handle_exception(e, lex: lex_name, routing_key: delivery_info&.routing_key)
            # The pop itself failed, so nothing was fetched. Report "no message"
            # so drain_lane stops instead of spinning on a failing fetch (the
            # default pull cap is unlimited) and the caller backs the lane off.
            return false unless delivery_info

            queue.reject(delivery_info.delivery_tag, requeue: false)
            true
          end

          # Returns the memoized queue instance for a subscription's lane, building
          # the queue class through ModelWorker.queue_class_for on first use.
          #
          # @param subscription [Hash] normalized subscription
          # @return [Object] queue instance for the lane
          def queue_for(subscription)
            @queues ||= {}
            @queues[lane_key(subscription)] ||= ModelWorker.queue_class_for(
              request_type:   subscription[:type],
              model:          subscription[:model],
              context_window: finite_context_limit(subscription),
              queue_config:   queue_config
            ).new
          end

          # Builds the keyword hash passed to the fleet runner: decoded payload,
          # overlaid with the AMQP headers (header values win on key clashes),
          # plus the routing key and subscription-derived defaults.
          #
          # @param payload [String] raw message body
          # @param metadata [Object, nil] message properties (headers, content type, encoding)
          # @param delivery_info [Object] delivery information for the message
          # @param subscription [Hash] normalized subscription the message came from
          # @return [Hash] message with symbol keys
          def process_payload(payload, metadata, delivery_info, subscription)
            message = decode_payload(payload, metadata)
            message = message.merge(metadata.headers.transform_keys(&:to_sym)) if metadata&.headers
            message[:routing_key] = delivery_info.routing_key if delivery_info.respond_to?(:routing_key)
            message[:request_type] ||= subscription[:type].to_s
            message[:model] ||= subscription[:model].to_s
            message[:message_context] ||= {}
            message
          end

          # Decrypts the payload according to its content encoding and parses it
          # when the content type is JSON.
          #
          # @param payload [String] raw message body
          # @param metadata [Object, nil] message properties
          # @return [Object] result of Legion::JSON.load for JSON content,
          #   otherwise +{ value: decoded_body }+
          def decode_payload(payload, metadata)
            decoded = case metadata&.content_encoding
                      when 'encrypted/cs'
                        Legion::Crypt.decrypt(payload, metadata_header(metadata, :iv))
                      when 'encrypted/pk'
                        Legion::Crypt.decrypt_from_keypair(metadata_header(metadata, :public_key), payload)
                      else
                        payload
                      end

            if metadata&.content_type == 'application/json'
              Legion::JSON.load(decoded)
            else
              { value: decoded }
            end
          end

          # @return [Array<Hash>] configured subscriptions with symbolized keys;
          #   entries that are not hashes or lack :type / :model are dropped.
          def subscriptions
            configured = setting_value(settings, :subscriptions)
            return [] unless configured.is_a?(Array)

            configured.filter_map do |sub|
              next unless sub.is_a?(Hash)

              normalized = sub.transform_keys(&:to_sym)
              next unless normalized[:type] && normalized[:model]

              normalized
            end
          end

          # @return [Hash] explicitly configured fleet queue settings; keys whose
          #   value is nil are omitted so the queue builder can apply its defaults.
          def queue_config
            {
              queue_expires_ms:        nested_setting(settings, :fleet, :queue_expires_ms),
              message_ttl_ms:          nested_setting(settings, :fleet, :message_ttl_ms),
              queue_max_length:        nested_setting(settings, :fleet, :queue_max_length),
              delivery_limit:          nested_setting(settings, :fleet, :delivery_limit),
              consumer_ack_timeout_ms: nested_setting(settings, :fleet, :consumer_ack_timeout_ms)
            }.compact
          end

          # @return [Hash] the fleet.endpoint settings, or {} when absent.
          def endpoint_settings
            nested_setting(settings, :fleet, :endpoint) || {}
          end

          # @return [Boolean] true only when fleet.endpoint.enabled is literally true.
          def endpoint_enabled?
            setting_value(endpoint_settings, :enabled) == true
          end

          # @return [Integer] per-lane pull cap for a single drain; 0 disables the cap.
          def max_consecutive_pulls_per_lane
            setting_value(endpoint_settings, :max_consecutive_pulls_per_lane) || 0
          end

          # @return [Float] empty_lane_backoff_ms in seconds (defaults to 250 ms).
          def empty_lane_backoff_seconds
            (setting_value(endpoint_settings, :empty_lane_backoff_ms) || 250).to_f / 1000
          end

          # @param subscription [Hash] normalized subscription
          # @param now [Float] monotonic timestamp of the current pass
          # @return [Boolean] true while the lane's backoff deadline is still in the future
          def lane_backed_off?(subscription, now)
            (@empty_lanes ||= {}).fetch(lane_key(subscription), 0) > now
          end

          # Records a backoff deadline for the lane, measured on the monotonic clock.
          #
          # @param subscription [Hash] normalized subscription
          # @return [Float] the new deadline
          def mark_lane_empty(subscription)
            (@empty_lanes ||= {})[lane_key(subscription)] = monotonic_time + empty_lane_backoff_seconds
          end

          # @param subscription [Hash] normalized subscription
          # @return [String] cache key "type:model" with a ":ctx<N>" suffix when a
          #   finite context limit is configured
          def lane_key(subscription)
            type = subscription[:type]
            model = subscription[:model]
            context = context_limit(subscription)
            context.finite? ? "#{type}:#{model}:ctx#{context}" : "#{type}:#{model}"
          end

          # @return [Float] current monotonic clock reading in seconds.
          def monotonic_time
            Process.clock_gettime(Process::CLOCK_MONOTONIC)
          end

          # @return [Symbol] configured fleet scheduler, :basic_get when unset.
          def fleet_scheduler
            (nested_setting(settings, :fleet, :scheduler) || :basic_get).to_sym
          end

          # Resolves a subscription's context limit from the first truthy value
          # among several accepted key names.
          #
          # @param subscription [Hash] normalized subscription
          # @return [Integer, Float] the limit as an Integer, or Float::INFINITY
          #   when none is configured or the value cannot be converted
          def context_limit(subscription)
            raw = setting_value(subscription, :max_context_size) ||
                  setting_value(subscription, :context_window) ||
                  setting_value(subscription, :max_input_tokens) ||
                  setting_value(subscription, :context) ||
                  setting_value(subscription, :ctx)
            return Float::INFINITY unless raw

            Integer(raw)
          rescue ArgumentError, TypeError, FloatDomainError
            Float::INFINITY
          end

          # @param subscription [Hash] normalized subscription
          # @return [Integer, nil] the context limit, or nil when it is unbounded
          def finite_context_limit(subscription)
            context = context_limit(subscription)
            context.finite? ? context : nil
          end

          # @param type [String] request type
          # @return [Boolean] true for embed / embedding / embeddings
          def embed_type?(type)
            EMBED_TYPES.include?(type)
          end

          # @param metadata [Object, nil] message properties
          # @param key [Symbol, String] header name
          # @return [Object, nil] header value looked up by string or symbol key
          def metadata_header(metadata, key)
            setting_value(metadata&.headers || {}, key)
          end

          # Walks nested settings one key at a time.
          #
          # @param hash [Object] settings container
          # @param keys [Array<Symbol, String>] path to follow
          # @return [Object, nil] the value at the path, or nil as soon as a level
          #   does not respond to key? or lacks the key
          def nested_setting(hash, *keys)
            keys.reduce(hash) do |current, key|
              return nil unless current.respond_to?(:key?)

              setting_value(current, key)
            end
          end

          # Reads a key from a hash-like object, preferring the string form of the
          # key over the key as given.
          #
          # @param hash [Object] anything responding to key? and []
          # @param key [Symbol, String] key to read
          # @return [Object, nil] the stored value, or nil when the key is absent
          #   or +hash+ is not hash-like
          def setting_value(hash, key)
            return nil unless hash.respond_to?(:key?)

            string_key = key.to_s
            return hash[string_key] if hash.key?(string_key)

            hash[key] if hash.key?(key)
          end
        end
      end
    end
  end
end
@@ -19,15 +19,17 @@ module Legion
19
19
  # - type: chat
20
20
  # model: "qwen3.5:27b"
21
21
  #
22
- # The queue name and routing key both follow the schema:
23
- # llm.request.ollama.<type>.<model>
24
- # where model colons are converted to dots (AMQP topic word separator).
22
+ # The queue name and routing key both follow shared fleet lane schemas:
23
+ # llm.fleet.embed.<model>
24
+ # llm.fleet.inference.<model>.ctx<context_window>
25
+ # when an inference context window is known.
25
26
  class ModelWorker < Legion::Extensions::Actors::Subscription
26
- attr_reader :request_type, :model_name
27
+ attr_reader :request_type, :model_name, :context_window
27
28
 
28
- def initialize(request_type:, model:, **)
29
- @request_type = request_type.to_s
30
- @model_name = model.to_s
29
+ def initialize(request_type:, model:, context_window: nil, **)
30
+ @request_type = request_type.to_s
31
+ @model_name = model.to_s
32
+ @context_window = context_window&.to_i
31
33
  super(**)
32
34
  end
33
35
 
@@ -59,7 +61,27 @@ module Legion
59
61
  # Standard scale: GPU server = 10, Mac Studio = 5, developer laptop = 1.
60
62
  # Defaults to 0 (equal priority) if not configured.
61
63
  def consumer_priority
62
- settings.dig(:fleet, :consumer_priority) || 0
64
+ setting_value(fleet_settings, :consumer_priority) || 0
65
+ end
66
+
67
+ def queue_expires_ms
68
+ setting_value(fleet_settings, :queue_expires_ms) || 60_000
69
+ end
70
+
71
+ def message_ttl_ms
72
+ setting_value(fleet_settings, :message_ttl_ms) || 120_000
73
+ end
74
+
75
+ def queue_max_length
76
+ setting_value(fleet_settings, :queue_max_length) || 100
77
+ end
78
+
79
+ def delivery_limit
80
+ setting_value(fleet_settings, :delivery_limit) || 3
81
+ end
82
+
83
+ def consumer_ack_timeout_ms
84
+ setting_value(fleet_settings, :consumer_ack_timeout_ms) || 300_000
63
85
  end
64
86
 
65
87
  # Subscribe options include x-priority argument so RabbitMQ can honour
@@ -73,14 +95,45 @@ module Legion
73
95
  base.merge(arguments: { 'x-priority' => consumer_priority })
74
96
  end
75
97
 
76
- # Returns a queue CLASS (not instance) bound to the llm.request exchange
77
- # with the routing key for this worker's (type, model) pair.
98
+ # Returns a queue CLASS (not instance) bound to the llm.fleet exchange
99
+ # with the routing key for this worker's model offering lane.
78
100
  # The Subscription base class calls queue.new in initialize, so this must
79
101
  # return a class, not an instance.
80
102
  def queue
81
103
  @queue ||= build_queue_class
82
104
  end
83
105
 
106
+ def self.queue_class_for(request_type:, model:, context_window: nil, queue_config: {})
107
+ worker = allocate
108
+ worker.instance_variable_set(:@request_type, request_type.to_s)
109
+ worker.instance_variable_set(:@model_name, model.to_s)
110
+ worker.instance_variable_set(:@context_window, context_window&.to_i)
111
+ worker.send(:build_queue_class, queue_config)
112
+ end
113
+
114
+ def self.fallback_queue_options(settings)
115
+ {
116
+ durable: true,
117
+ auto_delete: false,
118
+ arguments: {
119
+ 'x-queue-type' => 'quorum',
120
+ 'x-queue-leader-locator' => 'balanced',
121
+ 'x-expires' => settings.fetch(:queue_expires_ms),
122
+ 'x-message-ttl' => settings.fetch(:message_ttl_ms),
123
+ 'x-overflow' => 'reject-publish',
124
+ 'x-max-length' => settings.fetch(:queue_max_length),
125
+ 'x-delivery-limit' => settings.fetch(:delivery_limit),
126
+ 'x-consumer-timeout' => settings.fetch(:consumer_ack_timeout_ms)
127
+ }
128
+ }
129
+ end
130
+
131
+ def routing_key
132
+ parts = ['llm.fleet', lane_kind, sanitized_model]
133
+ parts << "ctx#{@context_window}" if lane_kind == 'inference' && @context_window
134
+ parts.join('.')
135
+ end
136
+
84
137
  # Enrich every inbound message with the worker's own request_type and model
85
138
  # so Runners::Fleet#handle_request always has them, even if the sender omitted
86
139
  # them. Also defaults message_context to {} if absent.
@@ -94,23 +147,62 @@ module Legion
94
147
 
95
148
  private
96
149
 
97
- def build_queue_class
98
- sanitised_model = @model_name.tr(':', '.')
99
- routing_key = "llm.request.ollama.#{@request_type}.#{sanitised_model}"
150
+ def build_queue_class(queue_config = {})
151
+ lane_key = routing_key
100
152
  exchange_class = Transport::Exchanges::LlmRequest
153
+ queue_settings = {
154
+ queue_expires_ms: queue_expires_ms,
155
+ message_ttl_ms: message_ttl_ms,
156
+ queue_max_length: queue_max_length,
157
+ delivery_limit: delivery_limit,
158
+ consumer_ack_timeout_ms: consumer_ack_timeout_ms
159
+ }.merge((queue_config || {}).compact)
160
+
161
+ if defined?(::Legion::Extensions::Llm::Transport::FleetLane)
162
+ return ::Legion::Extensions::Llm::Transport::FleetLane.build_queue_class(
163
+ queue_name: lane_key,
164
+ exchange_class: exchange_class,
165
+ routing_key: lane_key,
166
+ base_queue_class: Legion::Transport::Queue,
167
+ settings: queue_settings
168
+ )
169
+ end
170
+
171
+ queue_options = self.class.fallback_queue_options(queue_settings)
101
172
 
102
173
  Class.new(Legion::Transport::Queue) do
103
- define_method(:queue_name) { routing_key }
104
- define_method(:queue_options) do
105
- { durable: false, auto_delete: true, arguments: { 'x-max-priority' => 10 } }
106
- end
174
+ define_method(:queue_name) { lane_key }
175
+ define_method(:queue_options) { queue_options }
107
176
  define_method(:dlx_enabled) { false }
108
177
  define_method(:initialize) do
109
178
  super()
110
- bind(exchange_class.new, routing_key: routing_key)
179
+ bind(exchange_class.new, routing_key: lane_key)
111
180
  end
112
181
  end
113
182
  end
183
+
184
+ def fleet_settings
185
+ setting_value(settings, :fleet) || {}
186
+ rescue NameError
187
+ {}
188
+ end
189
+
190
+ def setting_value(hash, key)
191
+ return nil unless hash.respond_to?(:key?)
192
+
193
+ string_key = key.to_s
194
+ return hash[string_key] if hash.key?(string_key)
195
+
196
+ hash[key] if hash.key?(key)
197
+ end
198
+
199
+ def lane_kind
200
+ %w[embed embedding embeddings].include?(@request_type) ? 'embed' : 'inference'
201
+ end
202
+
203
+ def sanitized_model
204
+ @model_name.downcase.gsub(/[^a-z0-9]+/, '-').gsub(/\A-+|-+\z/, '').squeeze('-')
205
+ end
114
206
  end
115
207
  end
116
208
  end
@@ -3,7 +3,7 @@
3
3
  module Legion
4
4
  module Extensions
5
5
  module Ollama
6
- VERSION = '0.3.4'
6
+ VERSION = '0.3.5'
7
7
  end
8
8
  end
9
9
  end
@@ -14,6 +14,12 @@ require 'legion/extensions/ollama/runners/version'
14
14
  require 'legion/extensions/ollama/runners/fleet'
15
15
  require 'legion/extensions/ollama/client'
16
16
 
17
+ begin
18
+ require 'legion/extensions/llm'
19
+ rescue LoadError
20
+ nil
21
+ end
22
+
17
23
  # Fleet transport and actor wiring — only loaded when Legion::Extensions::Core is present
18
24
  # so the gem still works as a standalone HTTP client without any AMQP runtime.
19
25
  if Legion::Extensions.const_defined?(:Core, false)
@@ -21,6 +27,7 @@ if Legion::Extensions.const_defined?(:Core, false)
21
27
  require 'legion/extensions/ollama/transport/messages/llm_response'
22
28
  require 'legion/extensions/ollama/transport'
23
29
  require 'legion/extensions/ollama/actors/model_worker'
30
+ require 'legion/extensions/ollama/actors/endpoint_puller'
24
31
  require 'legion/extensions/ollama/actors/model_sync'
25
32
  end
26
33
 
@@ -32,7 +39,22 @@ module Legion
32
39
  def self.default_settings
33
40
  {
34
41
  s3: {},
35
- fleet: {}
42
+ fleet: {
43
+ consumer_priority: 0,
44
+ scheduler: :basic_get,
45
+ queue_expires_ms: 60_000,
46
+ message_ttl_ms: 120_000,
47
+ queue_max_length: 100,
48
+ delivery_limit: 3,
49
+ consumer_ack_timeout_ms: 300_000,
50
+ endpoint: {
51
+ enabled: false,
52
+ empty_lane_backoff_ms: 250,
53
+ idle_backoff_ms: 1_000,
54
+ max_consecutive_pulls_per_lane: 0,
55
+ accept_when: []
56
+ }
57
+ }
36
58
  }
37
59
  end
38
60
 
@@ -43,17 +65,29 @@ module Legion
43
65
  super
44
66
  @actors.delete(:model_worker)
45
67
 
46
- subs = settings[:subscriptions]
68
+ subs = setting_value(settings, :subscriptions)
69
+ valid_subscriptions = valid_fleet_subscriptions(subs)
70
+ endpoint_configured = fleet_scheduler == :basic_get &&
71
+ nested_setting(settings, :fleet, :endpoint, :enabled) == true &&
72
+ valid_subscriptions.any?
73
+ @actors.delete(:endpoint_puller) unless endpoint_configured
74
+
47
75
  return unless subs.is_a?(Array)
76
+ return if fleet_scheduler == :basic_get
48
77
 
49
78
  subs.each do |sub|
50
- request_type = sub[:type]&.to_s
51
- model = sub[:model]&.to_s
79
+ request_type = setting_value(sub, :type)&.to_s
80
+ model = setting_value(sub, :model)&.to_s
81
+ limits = setting_value(sub, :limits) || {}
82
+ context_window = setting_value(sub, :context_window) ||
83
+ setting_value(limits, :context_window)
52
84
  next unless request_type && model
53
85
 
54
86
  actor_name = :"model_worker_#{request_type}_#{model.tr(':.', '__')}"
55
87
  worker_class = Class.new(Legion::Extensions::Ollama::Actor::ModelWorker) do
56
- define_method(:initialize) { super(request_type: request_type, model: model) }
88
+ define_method(:initialize) do
89
+ super(request_type: request_type, model: model, context_window: context_window)
90
+ end
57
91
  end
58
92
 
59
93
  @actors[actor_name] = {
@@ -65,6 +99,35 @@ module Legion
65
99
  }
66
100
  end
67
101
  end
102
+
103
+ def self.fleet_scheduler
104
+ (nested_setting(settings, :fleet, :scheduler) || :basic_get).to_sym
105
+ end
106
+
107
+ def self.valid_fleet_subscriptions(subscriptions)
108
+ return [] unless subscriptions.is_a?(Array)
109
+
110
+ subscriptions.select do |sub|
111
+ setting_value(sub, :type) && setting_value(sub, :model)
112
+ end
113
+ end
114
+
115
+ def self.setting_value(hash, key)
116
+ return nil unless hash.respond_to?(:key?)
117
+
118
+ string_key = key.to_s
119
+ return hash[string_key] if hash.key?(string_key)
120
+
121
+ hash[key] if hash.key?(key)
122
+ end
123
+
124
+ def self.nested_setting(hash, *keys)
125
+ keys.reduce(hash) do |current, key|
126
+ return nil unless current.respond_to?(:key?)
127
+
128
+ setting_value(current, key)
129
+ end
130
+ end
68
131
  end
69
132
  end
70
133
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lex-ollama
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.4
4
+ version: 0.3.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Esity
@@ -56,6 +56,7 @@ files:
56
56
  - README.md
57
57
  - lex-ollama.gemspec
58
58
  - lib/legion/extensions/ollama.rb
59
+ - lib/legion/extensions/ollama/actors/endpoint_puller.rb
59
60
  - lib/legion/extensions/ollama/actors/model_sync.rb
60
61
  - lib/legion/extensions/ollama/actors/model_worker.rb
61
62
  - lib/legion/extensions/ollama/client.rb