lex-ollama 0.3.4 → 0.3.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +3 -0
- data/CHANGELOG.md +38 -0
- data/Gemfile +6 -0
- data/README.md +31 -6
- data/lex-ollama.gemspec +5 -0
- data/lib/legion/extensions/ollama/actors/endpoint_puller.rb +230 -0
- data/lib/legion/extensions/ollama/actors/model_worker.rb +480 -21
- data/lib/legion/extensions/ollama/transport/exchanges/llm_registry.rb +18 -0
- data/lib/legion/extensions/ollama/transport/messages/registry_event.rb +41 -0
- data/lib/legion/extensions/ollama/version.rb +1 -1
- data/lib/legion/extensions/ollama.rb +166 -15
- metadata +74 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 382e4c199c00f2ddd25fe9eea874e9e06340cefe26cc384e7e92a65504263241
|
|
4
|
+
data.tar.gz: cc45f497e285427b21b899ef1a282c5254fdf19c2d71ebb3a52457440b2892b3
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 867a24f1bb87195ef55dfeede679eafdf612c5eac98e25fd733dbebb75e79e3e210459694de8feeb068a7808cf0ce55a52a45f35b2254cebfba6af4eeb8c890e
|
|
7
|
+
data.tar.gz: 3e4b46cbba3ce845379f62209c7ced95879134294e171226b06263d76b1aaa196dc440c34f6c6a6ebbccc5059d796225555da385a3d2ec350ae157365943084e
|
data/.gitignore
CHANGED
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,43 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [0.3.10] - 2026-04-28
|
|
4
|
+
|
|
5
|
+
### Fixed
|
|
6
|
+
- Require `lex-llm >= 0.1.6` so registry availability publishing always has the shared `RegistryEvent` envelope implementation it depends on.
|
|
7
|
+
|
|
8
|
+
## [0.3.9] - 2026-04-28
|
|
9
|
+
|
|
10
|
+
### Fixed
|
|
11
|
+
- Declare explicit shared Legion JSON, logging, and settings dependencies used by the legacy Ollama extension during the LLM uplift transition.
|
|
12
|
+
|
|
13
|
+
## [0.3.8] - 2026-04-28
|
|
14
|
+
|
|
15
|
+
### Added
|
|
16
|
+
- Publish nonblocking `llm.registry` availability, unavailability, heartbeat, and degraded events from `Actor::ModelWorker` using `lex-llm` `Legion::Extensions::Llm::Routing::RegistryEvent` envelopes when transport is available.
|
|
17
|
+
- Add local `Transport::Exchanges::LlmRegistry` and `Transport::Messages::RegistryEvent` wrappers for `llm.registry` topic publishing without requiring a database.
|
|
18
|
+
|
|
19
|
+
## [0.3.7] - 2026-04-28
|
|
20
|
+
|
|
21
|
+
### Fixed
|
|
22
|
+
- Declare the `legion-llm` runtime dependency required by the fleet exchange, response, and error classes inherited by lex-ollama fleet workers.
|
|
23
|
+
|
|
24
|
+
## [0.3.6] - 2026-04-28
|
|
25
|
+
|
|
26
|
+
### Added
|
|
27
|
+
- `Actor::ModelWorker` can now bind opt-in exact offering lanes compatible with legion-llm's `llm.fleet.offering.<instance>.<model>.<operation>` routing style while preserving the existing shared `llm.fleet.*` lanes.
|
|
28
|
+
- `legion.ollama.fleet.offering_lanes` settings default to disabled with no instance ID, so existing shared-lane fleet workers keep their current behavior unless exact offering lanes are explicitly enabled.
|
|
29
|
+
|
|
30
|
+
## [0.3.5] - 2026-04-28
|
|
31
|
+
|
|
32
|
+
### Added
|
|
33
|
+
- Fleet model workers now bind transient classic queues to shared `llm.fleet` model lanes, with configurable consumer priority, queue expiration, and message TTL.
|
|
34
|
+
- Subscription entries can provide a context window so inference workers bind lanes like `llm.fleet.inference.qwen3-5-27b.ctx32768`.
|
|
35
|
+
|
|
36
|
+
### Changed
|
|
37
|
+
- `Actor::ModelWorker` now defaults endpoint fleet workers to explicit `basic_get` polling with a process-wide lane lock so local one-model-at-a-time devices do not reserve messages from multiple model queues; GPU/datacenter workers can opt back into RabbitMQ subscriptions with `legion.ollama.fleet.scheduler: :subscription`.
|
|
38
|
+
- Fleet worker queue names and routing keys now use shared `llm.fleet.*` lanes (`llm.fleet.embed.<model>` and `llm.fleet.inference.<model>.ctx<context>`) instead of legacy `llm.request.ollama.*` keys.
|
|
39
|
+
- `Ollama.build_actors` now orders generated model workers with embedding workers first, then inference/chat workers from smallest to largest configured context window.
|
|
40
|
+
|
|
3
41
|
## [0.3.4] - 2026-04-24
|
|
4
42
|
|
|
5
43
|
### Fixed
|
data/Gemfile
CHANGED
|
@@ -3,6 +3,12 @@
|
|
|
3
3
|
source 'https://rubygems.org'
|
|
4
4
|
gemspec
|
|
5
5
|
|
|
6
|
+
legion_llm_path = File.expand_path('../../legion-llm', __dir__)
|
|
7
|
+
gem 'legion-llm', path: legion_llm_path if Dir.exist?(legion_llm_path)
|
|
8
|
+
|
|
9
|
+
lex_llm_path = File.expand_path('../lex-llm', __dir__)
|
|
10
|
+
gem 'lex-llm', path: lex_llm_path if Dir.exist?(lex_llm_path)
|
|
11
|
+
|
|
6
12
|
group :test do
|
|
7
13
|
gem 'rake'
|
|
8
14
|
gem 'rspec'
|
data/README.md
CHANGED
|
@@ -49,11 +49,19 @@ gem install lex-ollama
|
|
|
49
49
|
- `handle_request` - Dispatch inbound fleet AMQP messages to the appropriate runner (chat/embed/generate)
|
|
50
50
|
|
|
51
51
|
When `Legion::Extensions::Core` is present, lex-ollama subscribes to model-scoped queues on the
|
|
52
|
-
`llm.
|
|
52
|
+
`llm.fleet` topic exchange, accepting routed LLM inference work from other Legion fleet members.
|
|
53
53
|
|
|
54
|
-
Each configured `(type, model)` pair gets its own
|
|
55
|
-
`llm.
|
|
56
|
-
|
|
54
|
+
Each configured `(type, model)` pair gets its own durable quorum lane queue. Shared lanes use
|
|
55
|
+
`llm.fleet.embed.<model>` for embeddings and `llm.fleet.inference.<model>.ctx<context>` for
|
|
56
|
+
generation/chat subscriptions with a configured context window. Endpoint workers default to
|
|
57
|
+
explicit `basic_get` polling with a process-wide lane lock, so local one-model-at-a-time
|
|
58
|
+
devices do not reserve work from multiple model queues. GPU or datacenter workers can opt into
|
|
59
|
+
RabbitMQ consumer subscriptions with `legion.ollama.fleet.scheduler: :subscription`.
|
|
60
|
+
|
|
61
|
+
When offering lanes are enabled, workers also bind exact `legion-llm`-compatible lanes in the
|
|
62
|
+
form `llm.fleet.offering.<instance>.<model>.<operation>`. Workers publish nonblocking
|
|
63
|
+
availability, unavailability, heartbeat, and degraded events to `llm.registry` when the transport
|
|
64
|
+
runtime is loaded.
|
|
57
65
|
|
|
58
66
|
```yaml
|
|
59
67
|
legion:
|
|
@@ -67,12 +75,29 @@ legion:
|
|
|
67
75
|
- "qwen3.5:4b"
|
|
68
76
|
- "nomic-embed-text:latest"
|
|
69
77
|
fleet:
|
|
70
|
-
|
|
78
|
+
scheduler: basic_get
|
|
79
|
+
consumer_priority: 10
|
|
80
|
+
queue_expires_ms: 60000
|
|
81
|
+
message_ttl_ms: 120000
|
|
82
|
+
queue_max_length: 100
|
|
83
|
+
delivery_limit: 3
|
|
84
|
+
consumer_ack_timeout_ms: 300000
|
|
85
|
+
endpoint:
|
|
86
|
+
enabled: false
|
|
87
|
+
empty_lane_backoff_ms: 250
|
|
88
|
+
idle_backoff_ms: 1000
|
|
89
|
+
max_consecutive_pulls_per_lane: 0
|
|
90
|
+
offering_lanes:
|
|
91
|
+
enabled: false
|
|
92
|
+
instance_id: "macbook-m4"
|
|
93
|
+
registry:
|
|
94
|
+
heartbeat_interval_seconds: 30
|
|
71
95
|
subscriptions:
|
|
72
96
|
- type: embed
|
|
73
97
|
model: nomic-embed-text
|
|
74
98
|
- type: chat
|
|
75
99
|
model: "qwen3.5:27b"
|
|
100
|
+
context_window: 32768
|
|
76
101
|
```
|
|
77
102
|
|
|
78
103
|
**Auto-provisioning**: When `s3` and `default_models` are configured, the `ModelSync` actor
|
|
@@ -163,7 +188,7 @@ result[:usage] # => { input_tokens: 1, output_tokens: 5, total_duration: ..., .
|
|
|
163
188
|
|
|
164
189
|
## Version
|
|
165
190
|
|
|
166
|
-
0.3.
|
|
191
|
+
0.3.10
|
|
167
192
|
|
|
168
193
|
## License
|
|
169
194
|
|
data/lex-ollama.gemspec
CHANGED
|
@@ -27,5 +27,10 @@ Gem::Specification.new do |spec|
|
|
|
27
27
|
spec.require_paths = ['lib']
|
|
28
28
|
|
|
29
29
|
spec.add_dependency 'faraday', '>= 2.0'
|
|
30
|
+
spec.add_dependency 'legion-json', '>= 1.2.1'
|
|
31
|
+
spec.add_dependency 'legion-llm', '>= 0.8.32'
|
|
32
|
+
spec.add_dependency 'legion-logging', '>= 1.3.2'
|
|
33
|
+
spec.add_dependency 'legion-settings', '>= 1.3.14'
|
|
34
|
+
spec.add_dependency 'lex-llm', '>= 0.1.6'
|
|
30
35
|
spec.add_dependency 'lex-s3', '>= 0.2'
|
|
31
36
|
end
|
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Legion
|
|
4
|
+
module Extensions
|
|
5
|
+
module Ollama
|
|
6
|
+
module Actor
|
|
7
|
+
# Polls configured fleet queues with basic_get so endpoint machines choose
|
|
8
|
+
# when they are ready for work instead of holding prefetched messages.
|
|
9
|
+
class EndpointPuller < Legion::Extensions::Actors::Every
|
|
10
|
+
def runner_class
|
|
11
|
+
self.class
|
|
12
|
+
end
|
|
13
|
+
|
|
14
|
+
def runner_function
|
|
15
|
+
'action'
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
def use_runner?
|
|
19
|
+
false
|
|
20
|
+
end
|
|
21
|
+
|
|
22
|
+
def check_subtask?
|
|
23
|
+
false
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
def generate_task?
|
|
27
|
+
false
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
def enabled?
|
|
31
|
+
fleet_scheduler == :basic_get && endpoint_enabled? && subscriptions.any?
|
|
32
|
+
rescue StandardError => e
|
|
33
|
+
handle_exception(e, level: :warn, handled: true)
|
|
34
|
+
false
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
def time
|
|
38
|
+
(setting_value(endpoint_settings, :idle_backoff_ms) || 1_000).to_f / 1000
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
def action
|
|
42
|
+
return unless enabled?
|
|
43
|
+
|
|
44
|
+
now = monotonic_time
|
|
45
|
+
ordered_subscriptions.each do |sub|
|
|
46
|
+
next if lane_backed_off?(sub, now)
|
|
47
|
+
|
|
48
|
+
pulled = drain_lane(sub)
|
|
49
|
+
mark_lane_empty(sub) if pulled.zero?
|
|
50
|
+
end
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
def ordered_subscriptions
|
|
54
|
+
subscriptions.sort_by do |sub|
|
|
55
|
+
type = sub[:type].to_s
|
|
56
|
+
[embed_type?(type) ? 0 : 1, context_limit(sub), sub[:model].to_s]
|
|
57
|
+
end
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
def drain_lane(subscription)
|
|
61
|
+
pulls = 0
|
|
62
|
+
queue = queue_for(subscription)
|
|
63
|
+
|
|
64
|
+
loop do
|
|
65
|
+
break if max_consecutive_pulls_per_lane.positive? && pulls >= max_consecutive_pulls_per_lane
|
|
66
|
+
break unless pull_one(queue, subscription)
|
|
67
|
+
|
|
68
|
+
pulls += 1
|
|
69
|
+
end
|
|
70
|
+
pulls
|
|
71
|
+
end
|
|
72
|
+
|
|
73
|
+
def pull_one(queue, subscription)
|
|
74
|
+
delivery_info, metadata, payload = queue.pop(manual_ack: true)
|
|
75
|
+
return false unless delivery_info
|
|
76
|
+
|
|
77
|
+
message = process_payload(payload, metadata, delivery_info, subscription)
|
|
78
|
+
Legion::Extensions::Ollama::Runners::Fleet.handle_request(**message)
|
|
79
|
+
queue.acknowledge(delivery_info.delivery_tag)
|
|
80
|
+
true
|
|
81
|
+
rescue StandardError => e
|
|
82
|
+
handle_exception(e, lex: lex_name, routing_key: delivery_info&.routing_key)
|
|
83
|
+
queue.reject(delivery_info.delivery_tag, requeue: false) if delivery_info
|
|
84
|
+
true
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
def queue_for(subscription)
|
|
88
|
+
@queues ||= {}
|
|
89
|
+
@queues[lane_key(subscription)] ||= ModelWorker.queue_class_for(
|
|
90
|
+
request_type: subscription[:type],
|
|
91
|
+
model: subscription[:model],
|
|
92
|
+
context_window: finite_context_limit(subscription),
|
|
93
|
+
queue_config: queue_config
|
|
94
|
+
).new
|
|
95
|
+
end
|
|
96
|
+
|
|
97
|
+
def process_payload(payload, metadata, delivery_info, subscription)
|
|
98
|
+
message = decode_payload(payload, metadata)
|
|
99
|
+
message = message.merge(metadata.headers.transform_keys(&:to_sym)) if metadata&.headers
|
|
100
|
+
message[:routing_key] = delivery_info.routing_key if delivery_info.respond_to?(:routing_key)
|
|
101
|
+
message[:request_type] ||= subscription[:type].to_s
|
|
102
|
+
message[:model] ||= subscription[:model].to_s
|
|
103
|
+
message[:message_context] ||= {}
|
|
104
|
+
message
|
|
105
|
+
end
|
|
106
|
+
|
|
107
|
+
def decode_payload(payload, metadata)
|
|
108
|
+
decoded = if metadata&.content_encoding == 'encrypted/cs'
|
|
109
|
+
Legion::Crypt.decrypt(payload, metadata_header(metadata, :iv))
|
|
110
|
+
elsif metadata&.content_encoding == 'encrypted/pk'
|
|
111
|
+
Legion::Crypt.decrypt_from_keypair(metadata_header(metadata, :public_key), payload)
|
|
112
|
+
else
|
|
113
|
+
payload
|
|
114
|
+
end
|
|
115
|
+
|
|
116
|
+
if metadata&.content_type == 'application/json'
|
|
117
|
+
Legion::JSON.load(decoded)
|
|
118
|
+
else
|
|
119
|
+
{ value: decoded }
|
|
120
|
+
end
|
|
121
|
+
end
|
|
122
|
+
|
|
123
|
+
def subscriptions
|
|
124
|
+
configured = setting_value(settings, :subscriptions)
|
|
125
|
+
return [] unless configured.is_a?(Array)
|
|
126
|
+
|
|
127
|
+
configured.filter_map do |sub|
|
|
128
|
+
next unless sub.is_a?(Hash)
|
|
129
|
+
|
|
130
|
+
normalized = sub.transform_keys(&:to_sym)
|
|
131
|
+
next unless normalized[:type] && normalized[:model]
|
|
132
|
+
|
|
133
|
+
normalized
|
|
134
|
+
end
|
|
135
|
+
end
|
|
136
|
+
|
|
137
|
+
def queue_config
|
|
138
|
+
{
|
|
139
|
+
queue_expires_ms: nested_setting(settings, :fleet, :queue_expires_ms),
|
|
140
|
+
message_ttl_ms: nested_setting(settings, :fleet, :message_ttl_ms),
|
|
141
|
+
queue_max_length: nested_setting(settings, :fleet, :queue_max_length),
|
|
142
|
+
delivery_limit: nested_setting(settings, :fleet, :delivery_limit),
|
|
143
|
+
consumer_ack_timeout_ms: nested_setting(settings, :fleet, :consumer_ack_timeout_ms)
|
|
144
|
+
}.compact
|
|
145
|
+
end
|
|
146
|
+
|
|
147
|
+
def endpoint_settings
|
|
148
|
+
nested_setting(settings, :fleet, :endpoint) || {}
|
|
149
|
+
end
|
|
150
|
+
|
|
151
|
+
def endpoint_enabled?
|
|
152
|
+
setting_value(endpoint_settings, :enabled) == true
|
|
153
|
+
end
|
|
154
|
+
|
|
155
|
+
def max_consecutive_pulls_per_lane
|
|
156
|
+
setting_value(endpoint_settings, :max_consecutive_pulls_per_lane) || 0
|
|
157
|
+
end
|
|
158
|
+
|
|
159
|
+
def empty_lane_backoff_seconds
|
|
160
|
+
(setting_value(endpoint_settings, :empty_lane_backoff_ms) || 250).to_f / 1000
|
|
161
|
+
end
|
|
162
|
+
|
|
163
|
+
def lane_backed_off?(subscription, now)
|
|
164
|
+
(@empty_lanes ||= {}).fetch(lane_key(subscription), 0) > now
|
|
165
|
+
end
|
|
166
|
+
|
|
167
|
+
def mark_lane_empty(subscription)
|
|
168
|
+
(@empty_lanes ||= {})[lane_key(subscription)] = monotonic_time + empty_lane_backoff_seconds
|
|
169
|
+
end
|
|
170
|
+
|
|
171
|
+
def lane_key(subscription)
|
|
172
|
+
type = subscription[:type]
|
|
173
|
+
model = subscription[:model]
|
|
174
|
+
context = context_limit(subscription)
|
|
175
|
+
context.finite? ? "#{type}:#{model}:ctx#{context}" : "#{type}:#{model}"
|
|
176
|
+
end
|
|
177
|
+
|
|
178
|
+
def monotonic_time
|
|
179
|
+
Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
180
|
+
end
|
|
181
|
+
|
|
182
|
+
def fleet_scheduler
|
|
183
|
+
(nested_setting(settings, :fleet, :scheduler) || :basic_get).to_sym
|
|
184
|
+
end
|
|
185
|
+
|
|
186
|
+
def context_limit(subscription)
|
|
187
|
+
raw = setting_value(subscription, :max_context_size) ||
|
|
188
|
+
setting_value(subscription, :context_window) ||
|
|
189
|
+
setting_value(subscription, :max_input_tokens) ||
|
|
190
|
+
setting_value(subscription, :context) ||
|
|
191
|
+
setting_value(subscription, :ctx)
|
|
192
|
+
Integer(raw || Float::INFINITY)
|
|
193
|
+
rescue ArgumentError, TypeError, FloatDomainError
|
|
194
|
+
Float::INFINITY
|
|
195
|
+
end
|
|
196
|
+
|
|
197
|
+
def finite_context_limit(subscription)
|
|
198
|
+
context = context_limit(subscription)
|
|
199
|
+
context.finite? ? context : nil
|
|
200
|
+
end
|
|
201
|
+
|
|
202
|
+
def embed_type?(type)
|
|
203
|
+
%w[embed embedding embeddings].include?(type)
|
|
204
|
+
end
|
|
205
|
+
|
|
206
|
+
def metadata_header(metadata, key)
|
|
207
|
+
setting_value(metadata&.headers || {}, key)
|
|
208
|
+
end
|
|
209
|
+
|
|
210
|
+
def nested_setting(hash, *keys)
|
|
211
|
+
keys.reduce(hash) do |current, key|
|
|
212
|
+
return nil unless current.respond_to?(:key?)
|
|
213
|
+
|
|
214
|
+
setting_value(current, key)
|
|
215
|
+
end
|
|
216
|
+
end
|
|
217
|
+
|
|
218
|
+
def setting_value(hash, key)
|
|
219
|
+
return nil unless hash.respond_to?(:key?)
|
|
220
|
+
|
|
221
|
+
string_key = key.to_s
|
|
222
|
+
return hash[string_key] if hash.key?(string_key)
|
|
223
|
+
|
|
224
|
+
hash[key] if hash.key?(key)
|
|
225
|
+
end
|
|
226
|
+
end
|
|
227
|
+
end
|
|
228
|
+
end
|
|
229
|
+
end
|
|
230
|
+
end
|
|
@@ -4,8 +4,12 @@ module Legion
|
|
|
4
4
|
module Extensions
|
|
5
5
|
module Ollama
|
|
6
6
|
module Actor
|
|
7
|
-
#
|
|
7
|
+
# Fleet actor that listens on a model-scoped queue and forwards
|
|
8
8
|
# inbound LLM request messages to Runners::Fleet#handle_request.
|
|
9
|
+
# Endpoint workers default to explicit basic_get polling so a local
|
|
10
|
+
# one-model-at-a-time device does not reserve messages from every lane.
|
|
11
|
+
# Set legion.ollama.fleet.scheduler to :subscription for GPU/datacenter
|
|
12
|
+
# workers that should use RabbitMQ consumer priority and prefetch.
|
|
9
13
|
#
|
|
10
14
|
# One instance is created per (request_type, model) entry in settings:
|
|
11
15
|
#
|
|
@@ -19,15 +23,27 @@ module Legion
|
|
|
19
23
|
# - type: chat
|
|
20
24
|
# model: "qwen3.5:27b"
|
|
21
25
|
#
|
|
22
|
-
#
|
|
23
|
-
# llm.
|
|
24
|
-
#
|
|
26
|
+
# Queue names and routing keys follow the shared fleet lane schema:
|
|
27
|
+
# llm.fleet.embed.<model-slug>
|
|
28
|
+
# llm.fleet.inference.<model-slug>.ctx<context-window>
|
|
29
|
+
# or, when explicitly enabled, exact offering lanes:
|
|
30
|
+
# llm.fleet.offering.<instance>.<model-slug>.<operation>
|
|
25
31
|
class ModelWorker < Legion::Extensions::Actors::Subscription
|
|
26
|
-
|
|
32
|
+
POLLING_SCHEDULERS = %i[basic_get poll polling].freeze
|
|
33
|
+
SUBSCRIPTION_SCHEDULERS = %i[subscribe subscription basic_consume consumer].freeze
|
|
34
|
+
POLL_LOCK = Mutex.new
|
|
35
|
+
REGISTRY_HEARTBEAT_INTERVAL = 30.0
|
|
27
36
|
|
|
28
|
-
|
|
37
|
+
attr_reader :request_type, :model_name, :context_window, :offering_instance_id
|
|
38
|
+
|
|
39
|
+
def initialize(request_type:, model:, context_window: nil, lane_style: :shared,
|
|
40
|
+
offering_instance_id: nil, **)
|
|
29
41
|
@request_type = request_type.to_s
|
|
30
|
-
@model_name
|
|
42
|
+
@model_name = model.to_s
|
|
43
|
+
@context_window = normalize_context_window(context_window)
|
|
44
|
+
@lane_style = lane_style.to_s
|
|
45
|
+
@offering_instance_id = offering_instance_id&.to_s
|
|
46
|
+
@polling = false
|
|
31
47
|
super(**)
|
|
32
48
|
end
|
|
33
49
|
|
|
@@ -59,7 +75,27 @@ module Legion
|
|
|
59
75
|
# Standard scale: GPU server = 10, Mac Studio = 5, developer laptop = 1.
|
|
60
76
|
# Defaults to 0 (equal priority) if not configured.
|
|
61
77
|
def consumer_priority
|
|
62
|
-
|
|
78
|
+
setting_value(fleet_settings, :consumer_priority) || 0
|
|
79
|
+
end
|
|
80
|
+
|
|
81
|
+
def queue_expires_ms
|
|
82
|
+
setting_value(fleet_settings, :queue_expires_ms) || 60_000
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
def message_ttl_ms
|
|
86
|
+
setting_value(fleet_settings, :message_ttl_ms) || 120_000
|
|
87
|
+
end
|
|
88
|
+
|
|
89
|
+
def queue_max_length
|
|
90
|
+
setting_value(fleet_settings, :queue_max_length) || 100
|
|
91
|
+
end
|
|
92
|
+
|
|
93
|
+
def delivery_limit
|
|
94
|
+
setting_value(fleet_settings, :delivery_limit) || 3
|
|
95
|
+
end
|
|
96
|
+
|
|
97
|
+
def consumer_ack_timeout_ms
|
|
98
|
+
setting_value(fleet_settings, :consumer_ack_timeout_ms) || 300_000
|
|
63
99
|
end
|
|
64
100
|
|
|
65
101
|
# Subscribe options include x-priority argument so RabbitMQ can honour
|
|
@@ -73,44 +109,467 @@ module Legion
|
|
|
73
109
|
base.merge(arguments: { 'x-priority' => consumer_priority })
|
|
74
110
|
end
|
|
75
111
|
|
|
76
|
-
|
|
77
|
-
|
|
112
|
+
def prepare
|
|
113
|
+
return super unless endpoint_polling?
|
|
114
|
+
|
|
115
|
+
@queue = queue.new
|
|
116
|
+
@polling = true
|
|
117
|
+
log.info "[ModelWorker] prepared polling lane #{lane_key}" if defined?(log)
|
|
118
|
+
rescue StandardError => e
|
|
119
|
+
handle_exception(e, level: :fatal)
|
|
120
|
+
end
|
|
121
|
+
|
|
122
|
+
def activate
|
|
123
|
+
result = if endpoint_polling?
|
|
124
|
+
@polling = true
|
|
125
|
+
@poll_task = async.run_basic_get_loop
|
|
126
|
+
log.info "[ModelWorker] activated polling lane #{lane_key}" if defined?(log)
|
|
127
|
+
@poll_task
|
|
128
|
+
else
|
|
129
|
+
super
|
|
130
|
+
end
|
|
131
|
+
publish_registry_event_async(:available)
|
|
132
|
+
start_registry_heartbeat
|
|
133
|
+
result
|
|
134
|
+
rescue StandardError => e
|
|
135
|
+
publish_registry_event_async(:degraded, error: e)
|
|
136
|
+
handle_exception(e, level: :fatal)
|
|
137
|
+
end
|
|
138
|
+
|
|
139
|
+
def cancel
|
|
140
|
+
@polling = false
|
|
141
|
+
stop_registry_heartbeat
|
|
142
|
+
publish_registry_event_async(:unavailable)
|
|
143
|
+
return true unless instance_variable_defined?(:@consumer) && @consumer
|
|
144
|
+
|
|
145
|
+
super
|
|
146
|
+
end
|
|
147
|
+
|
|
148
|
+
def endpoint_polling?
|
|
149
|
+
scheduler = fleet_scheduler
|
|
150
|
+
return true if POLLING_SCHEDULERS.include?(scheduler)
|
|
151
|
+
return false if SUBSCRIPTION_SCHEDULERS.include?(scheduler)
|
|
152
|
+
|
|
153
|
+
nested_setting(settings, :fleet, :endpoint, :enabled) == true
|
|
154
|
+
rescue StandardError
|
|
155
|
+
false
|
|
156
|
+
end
|
|
157
|
+
|
|
158
|
+
def lane_key
|
|
159
|
+
@lane_key ||= offering_lane? ? offering_lane_key : shared_lane_key
|
|
160
|
+
end
|
|
161
|
+
alias routing_key lane_key
|
|
162
|
+
|
|
163
|
+
def run_basic_get_loop
|
|
164
|
+
consecutive_pulls = 0
|
|
165
|
+
while @polling && !shutting_down?
|
|
166
|
+
pulled = POLL_LOCK.synchronize { pull_one_message }
|
|
167
|
+
consecutive_pulls = pulled ? consecutive_pulls + 1 : 0
|
|
168
|
+
sleep(pulled ? post_pull_backoff(consecutive_pulls) : empty_lane_backoff)
|
|
169
|
+
end
|
|
170
|
+
end
|
|
171
|
+
|
|
172
|
+
def pull_one_message
|
|
173
|
+
delivery_info, metadata, payload = @queue.pop(manual_ack: manual_ack)
|
|
174
|
+
return false unless delivery_info
|
|
175
|
+
|
|
176
|
+
handle_delivery(delivery_info, metadata, payload)
|
|
177
|
+
true
|
|
178
|
+
rescue StandardError => e
|
|
179
|
+
handle_exception(e)
|
|
180
|
+
reject_or_retry(delivery_info, metadata, payload) if manual_ack && delivery_info
|
|
181
|
+
true
|
|
182
|
+
end
|
|
183
|
+
|
|
184
|
+
# Returns a queue CLASS (not instance) bound to the llm.fleet exchange
|
|
185
|
+
# with the routing key for this worker's model lane.
|
|
78
186
|
# The Subscription base class calls queue.new in initialize, so this must
|
|
79
187
|
# return a class, not an instance.
|
|
80
188
|
def queue
|
|
81
189
|
@queue ||= build_queue_class
|
|
82
190
|
end
|
|
83
191
|
|
|
192
|
+
def self.queue_class_for(request_type:, model:, context_window: nil, queue_config: {},
|
|
193
|
+
lane_style: :shared, offering_instance_id: nil)
|
|
194
|
+
worker = allocate
|
|
195
|
+
worker.instance_variable_set(:@request_type, request_type.to_s)
|
|
196
|
+
worker.instance_variable_set(:@model_name, model.to_s)
|
|
197
|
+
worker.instance_variable_set(:@context_window, context_window&.to_i)
|
|
198
|
+
worker.instance_variable_set(:@lane_style, lane_style.to_s)
|
|
199
|
+
worker.instance_variable_set(:@offering_instance_id, offering_instance_id&.to_s)
|
|
200
|
+
worker.send(:build_queue_class, queue_config)
|
|
201
|
+
end
|
|
202
|
+
|
|
203
|
+
def self.fallback_queue_options(settings)
|
|
204
|
+
{
|
|
205
|
+
durable: true,
|
|
206
|
+
auto_delete: false,
|
|
207
|
+
arguments: {
|
|
208
|
+
'x-queue-type' => 'quorum',
|
|
209
|
+
'x-queue-leader-locator' => 'balanced',
|
|
210
|
+
'x-expires' => settings.fetch(:queue_expires_ms),
|
|
211
|
+
'x-message-ttl' => settings.fetch(:message_ttl_ms),
|
|
212
|
+
'x-overflow' => 'reject-publish',
|
|
213
|
+
'x-max-length' => settings.fetch(:queue_max_length),
|
|
214
|
+
'x-delivery-limit' => settings.fetch(:delivery_limit),
|
|
215
|
+
'x-consumer-timeout' => settings.fetch(:consumer_ack_timeout_ms)
|
|
216
|
+
}
|
|
217
|
+
}
|
|
218
|
+
end
|
|
219
|
+
|
|
84
220
|
# Enrich every inbound message with the worker's own request_type and model
|
|
85
221
|
# so Runners::Fleet#handle_request always has them, even if the sender omitted
|
|
86
222
|
# them. Also defaults message_context to {} if absent.
|
|
87
223
|
def process_message(payload, metadata, delivery_info)
|
|
88
224
|
msg = super
|
|
89
|
-
msg[:request_type]
|
|
90
|
-
msg[:model]
|
|
225
|
+
msg[:request_type] ||= @request_type
|
|
226
|
+
msg[:model] ||= @model_name
|
|
91
227
|
msg[:message_context] ||= {}
|
|
92
228
|
msg
|
|
93
229
|
end
|
|
94
230
|
|
|
95
231
|
private
|
|
96
232
|
|
|
97
|
-
def
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
exchange_class = Transport::Exchanges::LlmRequest
|
|
233
|
+
def start_registry_heartbeat
|
|
234
|
+
return unless registry_publishing_available?
|
|
235
|
+
return if @registry_heartbeat_thread&.alive?
|
|
101
236
|
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
237
|
+
@registry_heartbeat_running = true
|
|
238
|
+
@registry_heartbeat_thread = Thread.new do
|
|
239
|
+
Thread.current.abort_on_exception = false
|
|
240
|
+
while @registry_heartbeat_running && !shutting_down?
|
|
241
|
+
sleep registry_heartbeat_interval
|
|
242
|
+
publish_registry_event(:heartbeat) if @registry_heartbeat_running && !shutting_down?
|
|
106
243
|
end
|
|
244
|
+
rescue StandardError => e
|
|
245
|
+
log_registry_publish_failure(e, level: :debug)
|
|
246
|
+
end
|
|
247
|
+
rescue StandardError => e
|
|
248
|
+
log_registry_publish_failure(e, level: :debug)
|
|
249
|
+
end
|
|
250
|
+
|
|
251
|
+
def stop_registry_heartbeat
|
|
252
|
+
@registry_heartbeat_running = false
|
|
253
|
+
@registry_heartbeat_thread&.kill if @registry_heartbeat_thread&.alive?
|
|
254
|
+
end
|
|
255
|
+
|
|
256
|
+
def registry_heartbeat_interval
|
|
257
|
+
configured = nested_setting(settings, :fleet, :registry, :heartbeat_interval_seconds) ||
|
|
258
|
+
nested_setting(settings, :fleet, :registry_heartbeat_interval_seconds)
|
|
259
|
+
interval = configured.nil? ? REGISTRY_HEARTBEAT_INTERVAL : Float(configured)
|
|
260
|
+
interval.positive? ? interval : REGISTRY_HEARTBEAT_INTERVAL
|
|
261
|
+
rescue StandardError
|
|
262
|
+
REGISTRY_HEARTBEAT_INTERVAL
|
|
263
|
+
end
|
|
264
|
+
|
|
265
|
+
def publish_registry_event_async(kind, error: nil)
|
|
266
|
+
return unless registry_publishing_available?
|
|
267
|
+
|
|
268
|
+
Thread.new do
|
|
269
|
+
Thread.current.abort_on_exception = false
|
|
270
|
+
publish_registry_event(kind, error: error)
|
|
271
|
+
rescue StandardError => e
|
|
272
|
+
log_registry_publish_failure(e, level: :debug)
|
|
273
|
+
end
|
|
274
|
+
rescue StandardError => e
|
|
275
|
+
log_registry_publish_failure(e, level: :debug)
|
|
276
|
+
end
|
|
277
|
+
|
|
278
|
+
def publish_registry_event(kind, error: nil)
|
|
279
|
+
event = registry_event_for(kind, error: error)
|
|
280
|
+
Transport::Messages::RegistryEvent.new(event: event).publish(spool: false)
|
|
281
|
+
rescue StandardError => e
|
|
282
|
+
log_registry_publish_failure(e)
|
|
283
|
+
end
|
|
284
|
+
|
|
285
|
+
def registry_event_for(kind, error: nil)
|
|
286
|
+
registry_event_class.public_send(
|
|
287
|
+
registry_event_method(kind),
|
|
288
|
+
registry_offering,
|
|
289
|
+
runtime: registry_runtime,
|
|
290
|
+
capacity: registry_capacity,
|
|
291
|
+
health: registry_health(kind, error: error),
|
|
292
|
+
lane: lane_key,
|
|
293
|
+
metadata: registry_metadata
|
|
294
|
+
)
|
|
295
|
+
end
|
|
296
|
+
|
|
297
|
+
def registry_event_method(kind)
|
|
298
|
+
case kind.to_sym
|
|
299
|
+
when :available then :available
|
|
300
|
+
when :unavailable then :unavailable
|
|
301
|
+
when :heartbeat then :heartbeat
|
|
302
|
+
else :degraded
|
|
303
|
+
end
|
|
304
|
+
end
|
|
305
|
+
|
|
306
|
+
def registry_offering
|
|
307
|
+
limits = {}
|
|
308
|
+
limits[:context_window] = @context_window if @context_window
|
|
309
|
+
{
|
|
310
|
+
provider_family: :ollama,
|
|
311
|
+
provider_instance: registry_provider_instance,
|
|
312
|
+
transport: :rabbitmq,
|
|
313
|
+
model: @model_name,
|
|
314
|
+
usage_type: registry_usage_type,
|
|
315
|
+
capabilities: registry_capabilities,
|
|
316
|
+
limits: limits,
|
|
317
|
+
routing_metadata: { lane: lane_key },
|
|
318
|
+
metadata: { lex: :ollama, lane_style: @lane_style || 'shared' }
|
|
319
|
+
}
|
|
320
|
+
end
|
|
321
|
+
|
|
322
|
+
def registry_runtime
|
|
323
|
+
{
|
|
324
|
+
node: registry_provider_instance,
|
|
325
|
+
scheduler: fleet_scheduler,
|
|
326
|
+
polling: endpoint_polling?
|
|
327
|
+
}
|
|
328
|
+
end
|
|
329
|
+
|
|
330
|
+
def registry_capacity
|
|
331
|
+
{
|
|
332
|
+
concurrency: 1,
|
|
333
|
+
consumer_priority: consumer_priority,
|
|
334
|
+
queue_max_length: queue_max_length
|
|
335
|
+
}
|
|
336
|
+
end
|
|
337
|
+
|
|
338
|
+
def registry_health(kind, error: nil)
|
|
339
|
+
health = {
|
|
340
|
+
ready: %i[available heartbeat].include?(kind.to_sym),
|
|
341
|
+
status: registry_health_status(kind)
|
|
342
|
+
}
|
|
343
|
+
health[:error_class] = error.class.name if error
|
|
344
|
+
health[:error] = error.message if error
|
|
345
|
+
health
|
|
346
|
+
end
|
|
347
|
+
|
|
348
|
+
def registry_health_status(kind)
|
|
349
|
+
case kind.to_sym
|
|
350
|
+
when :available, :heartbeat then :available
|
|
351
|
+
when :unavailable then :unavailable
|
|
352
|
+
else :degraded
|
|
353
|
+
end
|
|
354
|
+
end
|
|
355
|
+
|
|
356
|
+
def registry_metadata
|
|
357
|
+
{
|
|
358
|
+
extension: :lex_ollama,
|
|
359
|
+
request_type: @request_type,
|
|
360
|
+
lane_kind: lane_kind
|
|
361
|
+
}
|
|
362
|
+
end
|
|
363
|
+
|
|
364
|
+
def registry_usage_type
|
|
365
|
+
lane_kind == 'embed' ? :embedding : :inference
|
|
366
|
+
end
|
|
367
|
+
|
|
368
|
+
def registry_capabilities
|
|
369
|
+
return %i[embedding] if lane_kind == 'embed'
|
|
370
|
+
return %i[completion] if @request_type == 'generate'
|
|
371
|
+
|
|
372
|
+
%i[chat]
|
|
373
|
+
end
|
|
374
|
+
|
|
375
|
+
def registry_provider_instance
|
|
376
|
+
@offering_instance_id || node_identity
|
|
377
|
+
end
|
|
378
|
+
|
|
379
|
+
def node_identity
|
|
380
|
+
return Legion::Settings.dig(:node, :canonical_name).to_s if defined?(Legion::Settings) &&
|
|
381
|
+
Legion::Settings.dig(:node, :canonical_name)
|
|
382
|
+
|
|
383
|
+
'unknown'
|
|
384
|
+
rescue StandardError
|
|
385
|
+
'unknown'
|
|
386
|
+
end
|
|
387
|
+
|
|
388
|
+
def registry_publishing_available?
|
|
389
|
+
defined?(::Legion::Transport) &&
|
|
390
|
+
defined?(::Legion::Extensions::Llm::Routing::RegistryEvent) &&
|
|
391
|
+
defined?(Transport::Messages::RegistryEvent)
|
|
392
|
+
end
|
|
393
|
+
|
|
394
|
+
def registry_event_class
|
|
395
|
+
::Legion::Extensions::Llm::Routing::RegistryEvent
|
|
396
|
+
end
|
|
397
|
+
|
|
398
|
+
def log_registry_publish_failure(error, level: :warn)
|
|
399
|
+
message = "[ModelWorker] llm.registry publish failed lane=#{lane_key}: #{error.class}: #{error.message}"
|
|
400
|
+
if defined?(log) && log.respond_to?(level)
|
|
401
|
+
log.public_send(level, message)
|
|
402
|
+
elsif defined?(log) && log.respond_to?(:debug)
|
|
403
|
+
log.debug(message)
|
|
404
|
+
end
|
|
405
|
+
rescue StandardError
|
|
406
|
+
nil
|
|
407
|
+
end
|
|
408
|
+
|
|
409
|
+
# Builds the queue class this worker consumes from.
#
# Worker-level queue settings are merged with any non-nil overrides from
# +queue_config+. When the lex-llm FleetLane helper is loaded it builds the
# class; otherwise an anonymous Legion::Transport::Queue subclass is created
# and bound to the LlmRequest exchange using the lane key as routing key.
#
# @param queue_config [Hash, nil] optional overrides; nil values are ignored
# @return [Class] a queue class for this worker's lane
def build_queue_class(queue_config = {})
  # Locals are captured by the define_method closures below.
  lane = lane_key
  request_exchange = Transport::Exchanges::LlmRequest
  defaults = {
    queue_expires_ms: queue_expires_ms,
    message_ttl_ms: message_ttl_ms,
    queue_max_length: queue_max_length,
    delivery_limit: delivery_limit,
    consumer_ack_timeout_ms: consumer_ack_timeout_ms
  }
  overrides = (queue_config || {}).compact
  lane_settings = defaults.merge(overrides)

  if defined?(::Legion::Extensions::Llm::Transport::FleetLane)
    return ::Legion::Extensions::Llm::Transport::FleetLane.build_queue_class(
      queue_name: lane,
      exchange_class: request_exchange,
      routing_key: lane,
      base_queue_class: Legion::Transport::Queue,
      settings: lane_settings
    )
  end

  fallback_options = self.class.fallback_queue_options(lane_settings)

  Class.new(Legion::Transport::Queue) do
    define_method(:queue_name) { lane }
    define_method(:queue_options) { fallback_options }
    define_method(:dlx_enabled) { false }
    define_method(:initialize) do
      super()
      bind(request_exchange.new, routing_key: lane)
    end
  end
end
|
|
442
|
+
|
|
443
|
+
# Processes one delivered message: decodes it, applies the region-affinity
# check, runs the target function and acknowledges when manual ack is on.
#
# @param delivery_info [Object] delivery details; its delivery_tag is used for ack/reject
# @param metadata [Object] message metadata passed to process_message
# @param payload [Object] raw message payload
# @return [Object, nil] nil after a rejection, otherwise the acknowledge result (or nil)
def handle_delivery(delivery_info, metadata, payload)
  message = process_message(payload, metadata, delivery_info)
  fn = find_function(message)
  log.debug "[ModelWorker] basic_get message received: #{lex_name}/#{fn}" if defined?(log)

  case check_region_affinity(message)
  when :reject
    log.warn '[ModelWorker] nack: region affinity mismatch' if defined?(log)
    @queue.reject(delivery_info.delivery_tag) if manual_ack
    return
  when :remote
    record_cross_region_metric(message)
  end

  if use_runner?
    dispatch_runner(message, runner_class, fn, check_subtask?, generate_task?)
  else
    runner_class.send(fn, **message)
  end

  # Acknowledge only after the work above completed without raising.
  @queue.acknowledge(delivery_info.delivery_tag) if manual_ack
end
|
|
464
|
+
|
|
465
|
+
# Fleet section of the extension settings.
#
# @return [Object] the configured :fleet value, or an empty hash when it is
#   missing or when looking it up raises NameError
def fleet_settings
  configured = setting_value(settings, :fleet)
  configured || {}
rescue NameError
  {}
end
|
|
470
|
+
|
|
471
|
+
# Scheduler mode configured for the fleet.
#
# @return [Symbol] the configured scheduler, defaulting to :basic_get
def fleet_scheduler
  configured = setting_value(fleet_settings, :scheduler)
  configured = :basic_get unless configured
  configured.to_sym
end
|
|
474
|
+
|
|
475
|
+
# Reads +key+ from a hash-like object, accepting string or symbol keys.
#
# The string form of the key is tried first, then the key as given.
#
# @param hash [#key?, Object] container to read from
# @param key [Symbol, String] key to look up
# @return [Object, nil] the stored value, or nil when +hash+ does not respond
#   to #key? or holds neither form of the key
def setting_value(hash, key)
  return nil unless hash.respond_to?(:key?)

  [key.to_s, key].each do |candidate|
    return hash[candidate] if hash.key?(candidate)
  end
  nil
end
|
|
483
|
+
|
|
484
|
+
# Walks a nested settings structure one key at a time via #setting_value.
#
# @param hash [Object] root container
# @param keys [Array<Symbol, String>] path to follow
# @return [Object, nil] the value at the end of the path, or nil as soon as a
#   level that does not respond to #key? is reached
def nested_setting(hash, *keys)
  current = hash
  keys.each do |key|
    return nil unless current.respond_to?(:key?)

    current = setting_value(current, key)
  end
  current
end
|
|
491
|
+
|
|
492
|
+
# Classifies this worker's request type into a lane kind.
#
# @return [String] 'embed' for the embed/embedding/embeddings request types,
#   'inference' for anything else
def lane_kind
  embed_types = %w[embed embedding embeddings]
  return 'embed' if embed_types.include?(@request_type)

  'inference'
end
|
|
495
|
+
|
|
496
|
+
# Model name run through #sanitize_segment, for use as a lane key segment.
#
# @return [String]
def sanitized_model
  sanitize_segment(@model_name)
end
|
|
499
|
+
|
|
500
|
+
# Whether this worker was configured with the 'offering' lane style.
#
# @return [Boolean]
def offering_lane?
  @lane_style == 'offering'
end
|
|
503
|
+
|
|
504
|
+
# Lane key for the shared lane: llm.fleet.<kind>.<model>[.ctx<window>].
#
# The context-window suffix is added only for inference lanes that have a
# context window set.
#
# @return [String]
def shared_lane_key
  kind = lane_kind
  segments = ['llm.fleet', kind, sanitized_model]
  segments.push("ctx#{@context_window}") if kind == 'inference' && @context_window
  segments.join('.')
end
|
|
509
|
+
|
|
510
|
+
# Lane key for an offering lane:
# llm.fleet.offering.<instance>.<model>.<kind>.
#
# @return [String]
# @raise [ArgumentError] when the offering instance id fails #public_segment validation
def offering_lane_key
  instance_segment = public_segment(:offering_instance_id, @offering_instance_id)
  model_segment = sanitized_model
  "llm.fleet.offering.#{instance_segment}.#{model_segment}.#{lane_kind}"
end
|
|
520
|
+
|
|
521
|
+
# Normalises a value into a lane-key segment: lower-cased, with every run of
# characters outside a-z/0-9 collapsed to a single '-', and no leading or
# trailing '-'.
#
# @param value [#to_s] raw value
# @return [String] sanitised segment (may be empty)
def sanitize_segment(value)
  # Keeping only the alphanumeric runs and re-joining them with '-' yields
  # the same string as replacing separators and trimming the edges.
  value.to_s.downcase.scan(/[a-z0-9]+/).join('-')
end
|
|
524
|
+
|
|
525
|
+
# Sanitises +value+ and validates it for use as a public lane-key segment.
#
# @param label [Symbol, String] name used in error messages
# @param value [#to_s] raw value
# @return [String] the sanitised segment
# @raise [ArgumentError] when the segment is empty or longer than 64 characters
def public_segment(label, value)
  sanitize_segment(value).tap do |segment|
    raise ArgumentError, "#{label} is empty after sanitization" if segment.empty?
    raise ArgumentError, "#{label} exceeds 64 characters" if segment.length > 64
  end
end
|
|
532
|
+
|
|
533
|
+
# Converts a configured context window to an Integer.
#
# @param value [Object] raw setting
# @return [Integer, nil] the integer value, or nil when the value is nil,
#   blank, or rejected by Integer() with ArgumentError/TypeError
def normalize_context_window(value)
  blank = value.nil? || value.to_s.empty?
  blank ? nil : Integer(value)
rescue ArgumentError, TypeError
  nil
end
|
|
540
|
+
|
|
541
|
+
# Backoff, in seconds, read from fleet.endpoint.empty_lane_backoff_ms.
#
# @return [Float] configured milliseconds / 1000 (default 250 ms), or 0.25
#   when reading the setting raises
def empty_lane_backoff
  configured_ms = nested_setting(settings, :fleet, :endpoint, :empty_lane_backoff_ms)
  (configured_ms || 250).to_f / 1000.0
rescue StandardError
  0.25
end
|
|
547
|
+
|
|
548
|
+
# Backoff, in seconds, read from fleet.endpoint.idle_backoff_ms.
#
# @return [Float] configured milliseconds / 1000 (default 1000 ms), or 1.0
#   when reading the setting raises
def idle_backoff
  configured_ms = nested_setting(settings, :fleet, :endpoint, :idle_backoff_ms)
  (configured_ms || 1_000).to_f / 1000.0
rescue StandardError
  1.0
end
|
|
554
|
+
|
|
555
|
+
# Integer value of fleet.endpoint.max_consecutive_pulls_per_lane.
#
# @return [Integer] the configured limit, or 0 when it is unset, not
#   convertible by Integer(), or reading the setting raises
def max_consecutive_pulls_per_lane
  configured = nested_setting(settings, :fleet, :endpoint, :max_consecutive_pulls_per_lane)
  Integer(configured || 0)
rescue StandardError
  0
end
|
|
560
|
+
|
|
561
|
+
# Backoff to apply after a successful pull from a lane.
#
# No backoff is applied while the consecutive-pull limit is disabled or not
# yet reached. Any non-positive limit counts as "disabled": previously only 0
# did, so a negative setting made every pull (even the first) incur the idle
# backoff.
#
# @param consecutive_pulls [Integer] pulls taken from the lane in a row
# @return [Numeric] 0 when no backoff is needed, otherwise #idle_backoff
def post_pull_backoff(consecutive_pulls)
  max_pulls = max_consecutive_pulls_per_lane
  return 0 unless max_pulls.positive?
  return 0 if consecutive_pulls < max_pulls

  idle_backoff
end
|
|
567
|
+
|
|
568
|
+
# Whether the client has flagged itself as shutting down.
#
# @return [Object, nil] the value of Legion::Settings client.shutting_down,
#   nil when Legion::Settings is not loaded, false when the lookup raises
def shutting_down?
  return nil unless defined?(Legion::Settings)

  Legion::Settings.dig(:client, :shutting_down)
rescue StandardError
  false
end
|
|
114
573
|
end
|
|
115
574
|
end
|
|
116
575
|
end
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Legion
  module Extensions
    module Ollama
      module Transport
        module Exchanges
          # Exchange used for provider availability events that LLM routing
          # registries consume.
          class LlmRegistry < Legion::Transport::Exchange
            # @return [String] name of the exchange
            def exchange_name
              'llm.registry'
            end
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'legion/extensions/ollama/transport/exchanges/llm_registry'
|
|
4
|
+
|
|
5
|
+
module Legion
  module Extensions
    module Ollama
      module Transport
        module Messages
          # Message wrapper that publishes lex-llm RegistryEvent envelopes to
          # the llm.registry exchange.
          class RegistryEvent < Legion::Transport::Message
            # @param event [#to_h] registry event whose hash form becomes the message options
            # @param options [Hash] extra options; these win over envelope keys on conflict
            def initialize(event:, **options)
              super(**event.to_h, **options)
            end

            # @return [Class] exchange class the message is published to
            def exchange
              Transport::Exchanges::LlmRegistry
            end

            # @return [String] explicit :routing_key option when given, otherwise
            #   "llm.registry.<event_type>"
            # @raise [KeyError] when neither :routing_key nor :event_type is present
            def routing_key
              explicit = @options[:routing_key]
              explicit || "llm.registry.#{@options.fetch(:event_type)}"
            end

            # @return [String] message type
            def type
              'llm.registry.event'
            end

            # @return [String] publishing application id
            def app_id
              'lex-ollama'
            end

            # @return [Boolean] always false
            def persistent
              false
            end
          end
        end
      end
    end
  end
end
|
|
@@ -14,13 +14,22 @@ require 'legion/extensions/ollama/runners/version'
|
|
|
14
14
|
require 'legion/extensions/ollama/runners/fleet'
|
|
15
15
|
require 'legion/extensions/ollama/client'
|
|
16
16
|
|
|
17
|
+
begin
|
|
18
|
+
require 'legion/extensions/llm'
|
|
19
|
+
rescue LoadError
|
|
20
|
+
nil
|
|
21
|
+
end
|
|
22
|
+
|
|
17
23
|
# Fleet transport and actor wiring — only loaded when Legion::Extensions::Core is present
|
|
18
24
|
# so the gem still works as a standalone HTTP client without any AMQP runtime.
|
|
19
25
|
if Legion::Extensions.const_defined?(:Core, false)
|
|
20
26
|
require 'legion/extensions/ollama/transport/exchanges/llm_request'
|
|
27
|
+
require 'legion/extensions/ollama/transport/exchanges/llm_registry'
|
|
21
28
|
require 'legion/extensions/ollama/transport/messages/llm_response'
|
|
29
|
+
require 'legion/extensions/ollama/transport/messages/registry_event'
|
|
22
30
|
require 'legion/extensions/ollama/transport'
|
|
23
31
|
require 'legion/extensions/ollama/actors/model_worker'
|
|
32
|
+
require 'legion/extensions/ollama/actors/endpoint_puller'
|
|
24
33
|
require 'legion/extensions/ollama/actors/model_sync'
|
|
25
34
|
end
|
|
26
35
|
|
|
@@ -32,7 +41,26 @@ module Legion
|
|
|
32
41
|
# Default settings for the extension.
#
# @return [Hash] a fresh hash with the :s3 and :fleet defaults (queue limits,
#   scheduler, endpoint puller and offering-lane options)
def self.default_settings
  endpoint_defaults = {
    enabled: false,
    empty_lane_backoff_ms: 250,
    idle_backoff_ms: 1_000,
    max_consecutive_pulls_per_lane: 0,
    accept_when: []
  }
  offering_lane_defaults = {
    enabled: false,
    instance_id: nil
  }
  fleet_defaults = {
    consumer_priority: 0,
    scheduler: :basic_get,
    queue_expires_ms: 60_000,
    message_ttl_ms: 120_000,
    queue_max_length: 100,
    delivery_limit: 3,
    consumer_ack_timeout_ms: 300_000,
    endpoint: endpoint_defaults,
    offering_lanes: offering_lane_defaults
  }

  { s3: {}, fleet: fleet_defaults }
end
|
|
38
66
|
|
|
@@ -43,26 +71,149 @@ module Legion
|
|
|
43
71
|
super
|
|
44
72
|
@actors.delete(:model_worker)
|
|
45
73
|
|
|
46
|
-
subs = settings
|
|
74
|
+
subs = setting_value(settings, :subscriptions)
|
|
75
|
+
valid_subscriptions = valid_fleet_subscriptions(subs)
|
|
76
|
+
endpoint_configured = fleet_scheduler == :basic_get &&
|
|
77
|
+
nested_setting(settings, :fleet, :endpoint, :enabled) == true &&
|
|
78
|
+
valid_subscriptions.any?
|
|
79
|
+
@actors.delete(:endpoint_puller) unless endpoint_configured
|
|
80
|
+
|
|
47
81
|
return unless subs.is_a?(Array)
|
|
82
|
+
return if fleet_scheduler == :basic_get
|
|
48
83
|
|
|
49
|
-
subs.each do |sub|
|
|
50
|
-
request_type = sub
|
|
51
|
-
model
|
|
84
|
+
sorted_subscriptions(subs).each do |sub|
|
|
85
|
+
request_type = setting_value(sub, :type)&.to_s
|
|
86
|
+
model = setting_value(sub, :model)&.to_s
|
|
87
|
+
context_window = context_window_for(sub)
|
|
52
88
|
next unless request_type && model
|
|
53
89
|
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
90
|
+
register_model_worker(request_type: request_type, model: model, context_window: context_window)
|
|
91
|
+
|
|
92
|
+
offering_instance_id = offering_instance_for(sub)
|
|
93
|
+
next unless offering_instance_id
|
|
94
|
+
|
|
95
|
+
register_model_worker(request_type: request_type, model: model, context_window: context_window,
|
|
96
|
+
lane_style: :offering, offering_instance_id: offering_instance_id)
|
|
97
|
+
end
|
|
98
|
+
end
|
|
99
|
+
|
|
100
|
+
# Orders fleet subscriptions deterministically: embedding subscriptions
# first, then by ascending context window (subscriptions without one last),
# then by model name.
#
# Every embed spelling ('embed', 'embedding', 'embeddings') counts as an
# embedding subscription. Previously only the literal 'embed' was sorted
# first, so the other spellings were ordered with the inference lanes.
#
# @param subscriptions [Array<Hash>] subscription entries
# @return [Array<Hash>] a new, sorted array
def self.sorted_subscriptions(subscriptions)
  embed_types = %w[embed embedding embeddings]

  subscriptions.sort_by do |sub|
    type = setting_value(sub, :type).to_s
    [
      embed_types.include?(type) ? 0 : 1,
      context_window_for(sub) || Float::INFINITY,
      setting_value(sub, :model).to_s
    ]
  end
end
|
|
110
|
+
|
|
111
|
+
# Extracts a subscription's context window as an Integer.
#
# Sources are consulted in order: the subscription's :context_window,
# :max_context and :max_input_tokens, then :context_window and
# :max_input_tokens under its :limits entry. The first truthy value is used.
#
# @param subscription [Hash, Object] subscription entry
# @return [Integer, nil] the context window, or nil when none is set, it is
#   blank, or Integer() rejects it with ArgumentError/TypeError
def self.context_window_for(subscription)
  limits = setting_value(subscription, :limits) || {}
  lookups = [
    [subscription, :context_window],
    [subscription, :max_context],
    [subscription, :max_input_tokens],
    [limits, :context_window],
    [limits, :max_input_tokens]
  ]

  raw = nil
  lookups.each do |source, key|
    raw = setting_value(source, key)
    break if raw
  end
  return nil if raw.nil? || raw.to_s.empty?

  Integer(raw)
rescue ArgumentError, TypeError
  nil
end
|
|
124
|
+
|
|
125
|
+
# Registers a ModelWorker actor for one subscription lane.
#
# An anonymous ModelWorker subclass is created whose initializer passes the
# given lane parameters to the parent, and it is stored in @actors under the
# generated actor name.
#
# @param request_type [String] request type handled by the worker
# @param model [String] model name
# @param context_window [Integer, nil] context window for the lane
# @param lane_style [Symbol, String] lane style (defaults to :shared)
# @param offering_instance_id [String, nil] instance id for offering lanes
# @return [Hash] the actor definition stored in @actors
def self.register_model_worker(request_type:, model:, context_window:, lane_style: :shared,
                               offering_instance_id: nil)
  name = model_worker_actor_name(
    request_type: request_type,
    model: model,
    lane_style: lane_style,
    offering_instance_id: offering_instance_id
  )
  # Captured by the initializer closure of the generated class.
  worker_options = {
    request_type: request_type,
    model: model,
    context_window: context_window,
    lane_style: lane_style,
    offering_instance_id: offering_instance_id
  }
  generated_class = Class.new(Legion::Extensions::Ollama::Actor::ModelWorker) do
    define_method(:initialize) do
      super(**worker_options)
    end
  end

  @actors[name] = {
    extension: 'lex-ollama',
    extension_name: :ollama,
    actor_name: name,
    actor_class: generated_class,
    type: 'literal'
  }
end
|
|
153
|
+
|
|
154
|
+
# Resolves the offering instance id for a subscription.
#
# Subscription keys (:offering_instance_id, :provider_instance, :instance_id)
# are consulted first, then the fleet offering-lane settings (:instance_id,
# :provider_instance, :offering_instance_id); the first truthy value is used.
#
# @param subscription [Hash, Object] subscription entry
# @return [String, nil] the instance id as a string, or nil when offering
#   lanes are disabled or no non-empty id is configured
def self.offering_instance_for(subscription)
  return nil unless offering_lanes_enabled?

  raw = nil
  %i[offering_instance_id provider_instance instance_id].each do |key|
    raw = setting_value(subscription, key)
    break if raw
  end
  unless raw
    %i[instance_id provider_instance offering_instance_id].each do |key|
      raw = fleet_offering_lane_setting(key)
      break if raw
    end
  end

  text = raw&.to_s
  text unless text.nil? || text.empty?
end
|
|
168
|
+
|
|
169
|
+
# Whether offering lanes are switched on in the fleet settings.
#
# @return [Boolean] true only when the :enabled setting is exactly true;
#   false when it is anything else or the lookup raises
def self.offering_lanes_enabled?
  enabled = fleet_offering_lane_setting(:enabled)
  enabled == true
rescue StandardError
  false
end
|
|
174
|
+
|
|
175
|
+
# Reads one key from the fleet.offering_lanes settings section.
#
# @param key [Symbol, String] setting name
# @return [Object, nil] the configured value, or nil when absent
def self.fleet_offering_lane_setting(key)
  lanes_config = nested_setting(settings, :fleet, :offering_lanes)
  setting_value(lanes_config || {}, key)
end
|
|
179
|
+
|
|
180
|
+
# Builds the actor name under which a model worker is registered.
#
# Shared lanes use "model_worker_<type>_<model>" with ':' and '.' in the
# model replaced by '_'. Other lane styles join lane style, request type,
# model and instance id (nil parts dropped) and pass the result through
# .actor_suffix.
#
# @param request_type [String]
# @param model [String]
# @param lane_style [Symbol, String]
# @param offering_instance_id [String, nil]
# @return [Symbol]
def self.model_worker_actor_name(request_type:, model:, lane_style:, offering_instance_id:)
  if lane_style.to_s == 'shared'
    "model_worker_#{request_type}_#{model.tr(':.', '__')}".to_sym
  else
    parts = [lane_style, request_type, model, offering_instance_id].compact
    "model_worker_#{actor_suffix(parts.join('_'))}".to_sym
  end
end
|
|
186
|
+
|
|
187
|
+
# Normalises a value into an actor-name suffix: lower-cased, with every run
# of characters outside a-z/0-9 collapsed to a single '_', and no leading or
# trailing '_'.
#
# @param value [#to_s] raw value
# @return [String]
def self.actor_suffix(value)
  # Joining the alphanumeric runs with '_' is equivalent to replacing the
  # separator runs and trimming the edges.
  value.to_s.downcase.scan(/[a-z0-9]+/).join('_')
end
|
|
190
|
+
|
|
191
|
+
# Scheduler mode configured under fleet.scheduler.
#
# @return [Symbol] the configured scheduler, defaulting to :basic_get
def self.fleet_scheduler
  configured = nested_setting(settings, :fleet, :scheduler)
  configured = :basic_get unless configured
  configured.to_sym
end
|
|
194
|
+
|
|
195
|
+
# Filters subscriptions down to those that define both a type and a model.
#
# @param subscriptions [Array, Object] configured subscriptions
# @return [Array] matching entries; empty when +subscriptions+ is not an Array
def self.valid_fleet_subscriptions(subscriptions)
  return [] unless subscriptions.is_a?(Array)

  required_keys = %i[type model]
  subscriptions.select do |sub|
    required_keys.all? { |key| setting_value(sub, key) }
  end
end
|
|
202
|
+
|
|
203
|
+
# Reads +key+ from a hash-like object, accepting string or symbol keys.
#
# The string form of the key is tried first, then the key as given.
#
# @param hash [#key?, Object] container to read from
# @param key [Symbol, String] key to look up
# @return [Object, nil] the stored value, or nil when +hash+ does not respond
#   to #key? or holds neither form of the key
def self.setting_value(hash, key)
  return nil unless hash.respond_to?(:key?)

  [key.to_s, key].each do |candidate|
    return hash[candidate] if hash.key?(candidate)
  end
  nil
end
|
|
211
|
+
|
|
212
|
+
# Walks a nested settings structure one key at a time via .setting_value.
#
# @param hash [Object] root container
# @param keys [Array<Symbol, String>] path to follow
# @return [Object, nil] the value at the end of the path, or nil as soon as a
#   level that does not respond to #key? is reached
def self.nested_setting(hash, *keys)
  current = hash
  keys.each do |key|
    return nil unless current.respond_to?(:key?)

    current = setting_value(current, key)
  end
  current
end
|
|
68
219
|
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: lex-ollama
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.3.
|
|
4
|
+
version: 0.3.10
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Esity
|
|
@@ -23,6 +23,76 @@ dependencies:
|
|
|
23
23
|
- - ">="
|
|
24
24
|
- !ruby/object:Gem::Version
|
|
25
25
|
version: '2.0'
|
|
26
|
+
- !ruby/object:Gem::Dependency
|
|
27
|
+
name: legion-json
|
|
28
|
+
requirement: !ruby/object:Gem::Requirement
|
|
29
|
+
requirements:
|
|
30
|
+
- - ">="
|
|
31
|
+
- !ruby/object:Gem::Version
|
|
32
|
+
version: 1.2.1
|
|
33
|
+
type: :runtime
|
|
34
|
+
prerelease: false
|
|
35
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
36
|
+
requirements:
|
|
37
|
+
- - ">="
|
|
38
|
+
- !ruby/object:Gem::Version
|
|
39
|
+
version: 1.2.1
|
|
40
|
+
- !ruby/object:Gem::Dependency
|
|
41
|
+
name: legion-llm
|
|
42
|
+
requirement: !ruby/object:Gem::Requirement
|
|
43
|
+
requirements:
|
|
44
|
+
- - ">="
|
|
45
|
+
- !ruby/object:Gem::Version
|
|
46
|
+
version: 0.8.32
|
|
47
|
+
type: :runtime
|
|
48
|
+
prerelease: false
|
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
50
|
+
requirements:
|
|
51
|
+
- - ">="
|
|
52
|
+
- !ruby/object:Gem::Version
|
|
53
|
+
version: 0.8.32
|
|
54
|
+
- !ruby/object:Gem::Dependency
|
|
55
|
+
name: legion-logging
|
|
56
|
+
requirement: !ruby/object:Gem::Requirement
|
|
57
|
+
requirements:
|
|
58
|
+
- - ">="
|
|
59
|
+
- !ruby/object:Gem::Version
|
|
60
|
+
version: 1.3.2
|
|
61
|
+
type: :runtime
|
|
62
|
+
prerelease: false
|
|
63
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
64
|
+
requirements:
|
|
65
|
+
- - ">="
|
|
66
|
+
- !ruby/object:Gem::Version
|
|
67
|
+
version: 1.3.2
|
|
68
|
+
- !ruby/object:Gem::Dependency
|
|
69
|
+
name: legion-settings
|
|
70
|
+
requirement: !ruby/object:Gem::Requirement
|
|
71
|
+
requirements:
|
|
72
|
+
- - ">="
|
|
73
|
+
- !ruby/object:Gem::Version
|
|
74
|
+
version: 1.3.14
|
|
75
|
+
type: :runtime
|
|
76
|
+
prerelease: false
|
|
77
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
78
|
+
requirements:
|
|
79
|
+
- - ">="
|
|
80
|
+
- !ruby/object:Gem::Version
|
|
81
|
+
version: 1.3.14
|
|
82
|
+
- !ruby/object:Gem::Dependency
|
|
83
|
+
name: lex-llm
|
|
84
|
+
requirement: !ruby/object:Gem::Requirement
|
|
85
|
+
requirements:
|
|
86
|
+
- - ">="
|
|
87
|
+
- !ruby/object:Gem::Version
|
|
88
|
+
version: 0.1.6
|
|
89
|
+
type: :runtime
|
|
90
|
+
prerelease: false
|
|
91
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
92
|
+
requirements:
|
|
93
|
+
- - ">="
|
|
94
|
+
- !ruby/object:Gem::Version
|
|
95
|
+
version: 0.1.6
|
|
26
96
|
- !ruby/object:Gem::Dependency
|
|
27
97
|
name: lex-s3
|
|
28
98
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -56,6 +126,7 @@ files:
|
|
|
56
126
|
- README.md
|
|
57
127
|
- lex-ollama.gemspec
|
|
58
128
|
- lib/legion/extensions/ollama.rb
|
|
129
|
+
- lib/legion/extensions/ollama/actors/endpoint_puller.rb
|
|
59
130
|
- lib/legion/extensions/ollama/actors/model_sync.rb
|
|
60
131
|
- lib/legion/extensions/ollama/actors/model_worker.rb
|
|
61
132
|
- lib/legion/extensions/ollama/client.rb
|
|
@@ -71,8 +142,10 @@ files:
|
|
|
71
142
|
- lib/legion/extensions/ollama/runners/s3_models.rb
|
|
72
143
|
- lib/legion/extensions/ollama/runners/version.rb
|
|
73
144
|
- lib/legion/extensions/ollama/transport.rb
|
|
145
|
+
- lib/legion/extensions/ollama/transport/exchanges/llm_registry.rb
|
|
74
146
|
- lib/legion/extensions/ollama/transport/exchanges/llm_request.rb
|
|
75
147
|
- lib/legion/extensions/ollama/transport/messages/llm_response.rb
|
|
148
|
+
- lib/legion/extensions/ollama/transport/messages/registry_event.rb
|
|
76
149
|
- lib/legion/extensions/ollama/version.rb
|
|
77
150
|
homepage: https://github.com/LegionIO/lex-ollama
|
|
78
151
|
licenses:
|