lex-ollama 0.3.3 → 0.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/CLAUDE.md +31 -6
- data/README.md +13 -1
- data/lib/legion/extensions/ollama/actors/endpoint_puller.rb +230 -0
- data/lib/legion/extensions/ollama/actors/model_sync.rb +21 -62
- data/lib/legion/extensions/ollama/actors/model_worker.rb +109 -19
- data/lib/legion/extensions/ollama/runners/s3_models.rb +23 -0
- data/lib/legion/extensions/ollama/version.rb +1 -1
- data/lib/legion/extensions/ollama.rb +100 -0
- metadata +2 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: c764634fdcad6f8a0d70a8221eb26979cd4bc0338e68b61b072d552574da5ced
|
|
4
|
+
data.tar.gz: cab333e0e78243cddfbc291fc9d243a652ecd4ff5fbcecf942a442e76ab171fc
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 6efd9372da01f35c6e23a81156e209937b594674c32255a3f15e3bde04196108d734478735bdbe0e081191354269be5df212e7635df0c5511aafa6f24e56c4fb
|
|
7
|
+
data.tar.gz: cc829781858f0786b3a29353b372e792b45363b3b294a604f6496b3f0779b698f229db99996c8f8ebeae9461f0cb150a725dfdf49209e87b8e7f89a06206d233
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,16 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [0.3.5] - 2026-04-25
|
|
4
|
+
|
|
5
|
+
### Added
|
|
6
|
+
- Fleet model workers now bind transient classic queues to shared `llm.fleet` model lanes, with configurable consumer priority, queue expiration, and message TTL.
|
|
7
|
+
- Subscription entries can provide a context window so inference workers bind lanes like `llm.fleet.inference.qwen3-5-27b.ctx32768`.
|
|
8
|
+
|
|
9
|
+
## [0.3.4] - 2026-04-24
|
|
10
|
+
|
|
11
|
+
### Fixed
|
|
12
|
+
- `Ollama.build_actors` and `Ollama.default_settings` were absent from the installed 0.3.3 gem (gem was packaged before `bafb124` landed) — `Actor::ModelWorker` (requires `request_type:` and `model:` kwargs) was reaching the subscription actor pool with no zero-arg initializer, raising `ArgumentError: missing keywords: :request_type, :model` on every boot when running under the Homebrew legionio install
|
|
13
|
+
|
|
3
14
|
## [0.3.3] - 2026-04-16
|
|
4
15
|
|
|
5
16
|
### Added
|
data/CLAUDE.md
CHANGED
|
@@ -12,8 +12,8 @@ reporting, and **fleet queue subscription** for receiving routed LLM requests fr
|
|
|
12
12
|
|
|
13
13
|
**GitHub**: https://github.com/LegionIO/lex-ollama
|
|
14
14
|
**License**: MIT
|
|
15
|
-
**Version**: 0.3.
|
|
16
|
-
**Specs**:
|
|
15
|
+
**Version**: 0.3.3
|
|
16
|
+
**Specs**: 154 examples (16 spec files)
|
|
17
17
|
|
|
18
18
|
---
|
|
19
19
|
|
|
@@ -28,7 +28,8 @@ Legion::Extensions::Ollama
|
|
|
28
28
|
│ │ # pull_model, push_model, list_running
|
|
29
29
|
│ ├── Embeddings # embed
|
|
30
30
|
│ ├── Blobs # check_blob, push_blob
|
|
31
|
-
│ ├── S3Models # list_s3_models, import_from_s3, sync_from_s3, import_default_models
|
|
31
|
+
│ ├── S3Models # list_s3_models, import_from_s3, sync_from_s3, import_default_models,
|
|
32
|
+
│ │ # sync_configured_models
|
|
32
33
|
│ ├── Version # server_version
|
|
33
34
|
│ └── Fleet # handle_request (fleet dispatcher — chat/embed/generate)
|
|
34
35
|
├── Helpers/
|
|
@@ -44,7 +45,8 @@ Legion::Extensions::Ollama
|
|
|
44
45
|
│ └── Messages/
|
|
45
46
|
│ └── LlmResponse # Legion::LLM::Fleet::Response subclass, reply via default exchange
|
|
46
47
|
└── Actor/
|
|
47
|
-
|
|
48
|
+
├── ModelWorker # subscription actor — one per registered model/type
|
|
49
|
+
└── ModelSync # once actor — fires 5s after boot, pulls default models from S3
|
|
48
50
|
```
|
|
49
51
|
|
|
50
52
|
---
|
|
@@ -93,6 +95,15 @@ RabbitMQ policies (applied externally via Terraform) set `max-length` and
|
|
|
93
95
|
legion:
|
|
94
96
|
ollama:
|
|
95
97
|
host: "http://localhost:11434"
|
|
98
|
+
s3:
|
|
99
|
+
bucket: "legion"
|
|
100
|
+
prefix: "ollama/models"
|
|
101
|
+
endpoint: "https://s3.example.internal"
|
|
102
|
+
default_models:
|
|
103
|
+
- "qwen3.5:4b"
|
|
104
|
+
- "nomic-embed-text:latest"
|
|
105
|
+
fleet:
|
|
106
|
+
consumer_priority: 10 # H100: 10, Mac Studio: 5, MacBook: 1
|
|
96
107
|
subscriptions:
|
|
97
108
|
- type: embed
|
|
98
109
|
model: nomic-embed-text
|
|
@@ -104,7 +115,15 @@ legion:
|
|
|
104
115
|
model: llama3.2
|
|
105
116
|
```
|
|
106
117
|
|
|
107
|
-
|
|
118
|
+
**`s3` + `default_models`**: `Actor::ModelSync` fires 5 seconds after extension load and calls
|
|
119
|
+
`Runners::S3Models#sync_configured_models` to import any listed models not already present
|
|
120
|
+
locally. All download logic lives in the runner; the actor is only the trigger. Uses the
|
|
121
|
+
inherited `Actors::Base#manual` path (not `Legion::Runner`) so errors surface via
|
|
122
|
+
`handle_exception` rather than being silently swallowed by `Concurrent::ScheduledTask`.
|
|
123
|
+
|
|
124
|
+
**`subscriptions`**: `Ollama.build_actors` replaces the base `ModelWorker` actor entry with one
|
|
125
|
+
dynamically generated subclass per subscription entry (each with a zero-arg `initialize`).
|
|
126
|
+
The extension spawns one `Actor::ModelWorker` per entry at boot.
|
|
108
127
|
|
|
109
128
|
### Data Flow
|
|
110
129
|
|
|
@@ -154,6 +173,12 @@ The gem still works as a pure HTTP client library without AMQP, exactly as befor
|
|
|
154
173
|
- `request_type: 'generate'` → `Client#generate`.
|
|
155
174
|
- anything else (including `'chat'` or unknown) → `Client#chat`.
|
|
156
175
|
- **`Actor::ModelWorker#use_runner?` is `false`** — bypasses `Legion::Runner` / task DB entirely.
|
|
176
|
+
- **`Actor::ModelSync#use_runner?` is `false`** — uses inherited `Actors::Base#manual` which calls
|
|
177
|
+
`runner_class.send(runner_function, **{})` with proper `handle_exception` error handling.
|
|
178
|
+
- **`Ollama.build_actors`** dynamically generates one `ModelWorker` subclass per subscription
|
|
179
|
+
entry, each with a zero-arg `initialize` that passes the frozen `request_type` and `model`.
|
|
180
|
+
- **`Ollama.default_settings`** returns `{ s3: {}, fleet: {} }` so `settings[:s3]` and
|
|
181
|
+
`settings[:fleet]` are always hashes even without user configuration.
|
|
157
182
|
- **Reply publishing** never raises — errors are swallowed so the AMQP ack is not blocked.
|
|
158
183
|
- **Colon sanitisation** — `qwen3.5:27b` becomes `qwen3.5.27b` in queue/routing-key strings.
|
|
159
184
|
|
|
@@ -211,4 +236,4 @@ bundle exec rubocop
|
|
|
211
236
|
---
|
|
212
237
|
|
|
213
238
|
**Maintained By**: Matthew Iverson (@Esity)
|
|
214
|
-
**Last Updated**: 2026-04-
|
|
239
|
+
**Last Updated**: 2026-04-17
|
data/README.md
CHANGED
|
@@ -40,6 +40,7 @@ gem install lex-ollama
|
|
|
40
40
|
- `import_from_s3` - Download model from S3 directly to Ollama's filesystem (works before Ollama starts)
|
|
41
41
|
- `sync_from_s3` - Download model from S3, push blobs through Ollama's API, write manifest to filesystem
|
|
42
42
|
- `import_default_models` - Import a list of models from S3 (fleet provisioning)
|
|
43
|
+
- `sync_configured_models` - Import all `default_models` from S3 that aren't already present locally
|
|
43
44
|
|
|
44
45
|
### Version
|
|
45
46
|
- `server_version` - Retrieve the Ollama server version (GET /api/version)
|
|
@@ -58,6 +59,13 @@ via RabbitMQ round-robin with consumer priority.
|
|
|
58
59
|
legion:
|
|
59
60
|
ollama:
|
|
60
61
|
host: "http://localhost:11434"
|
|
62
|
+
s3:
|
|
63
|
+
bucket: "legion"
|
|
64
|
+
prefix: "ollama/models"
|
|
65
|
+
endpoint: "https://s3.example.internal"
|
|
66
|
+
default_models:
|
|
67
|
+
- "qwen3.5:4b"
|
|
68
|
+
- "nomic-embed-text:latest"
|
|
61
69
|
fleet:
|
|
62
70
|
consumer_priority: 10 # H100: 10, Mac Studio: 5, MacBook: 1
|
|
63
71
|
subscriptions:
|
|
@@ -67,6 +75,10 @@ legion:
|
|
|
67
75
|
model: "qwen3.5:27b"
|
|
68
76
|
```
|
|
69
77
|
|
|
78
|
+
**Auto-provisioning**: When `s3` and `default_models` are configured, the `ModelSync` actor
|
|
79
|
+
fires 5 seconds after boot and imports any listed models not already present on disk from the
|
|
80
|
+
S3 mirror. No manual pull step needed for fleet nodes.
|
|
81
|
+
|
|
70
82
|
Fleet messages use the wire protocol defined in `legion-llm`: typed AMQP messages
|
|
71
83
|
(`llm.fleet.request` / `llm.fleet.response` / `llm.fleet.error`) with `message_context`
|
|
72
84
|
propagation for end-to-end tracing.
|
|
@@ -151,7 +163,7 @@ result[:usage] # => { input_tokens: 1, output_tokens: 5, total_duration: ..., .
|
|
|
151
163
|
|
|
152
164
|
## Version
|
|
153
165
|
|
|
154
|
-
0.3.
|
|
166
|
+
0.3.3
|
|
155
167
|
|
|
156
168
|
## License
|
|
157
169
|
|
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Legion
|
|
4
|
+
module Extensions
|
|
5
|
+
module Ollama
|
|
6
|
+
module Actor
|
|
7
|
+
# Polls configured fleet queues with basic_get so endpoint machines choose
|
|
8
|
+
# when they are ready for work instead of holding prefetched messages.
|
|
9
|
+
class EndpointPuller < Legion::Extensions::Actors::Every
|
|
10
|
+
def runner_class
|
|
11
|
+
self.class
|
|
12
|
+
end
|
|
13
|
+
|
|
14
|
+
def runner_function
|
|
15
|
+
'action'
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
def use_runner?
|
|
19
|
+
false
|
|
20
|
+
end
|
|
21
|
+
|
|
22
|
+
def check_subtask?
|
|
23
|
+
false
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
def generate_task?
|
|
27
|
+
false
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
def enabled?
|
|
31
|
+
fleet_scheduler == :basic_get && endpoint_enabled? && subscriptions.any?
|
|
32
|
+
rescue StandardError => e
|
|
33
|
+
handle_exception(e, level: :warn, handled: true)
|
|
34
|
+
false
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
def time
|
|
38
|
+
(setting_value(endpoint_settings, :idle_backoff_ms) || 1_000).to_f / 1000
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
def action
|
|
42
|
+
return unless enabled?
|
|
43
|
+
|
|
44
|
+
now = monotonic_time
|
|
45
|
+
ordered_subscriptions.each do |sub|
|
|
46
|
+
next if lane_backed_off?(sub, now)
|
|
47
|
+
|
|
48
|
+
pulled = drain_lane(sub)
|
|
49
|
+
mark_lane_empty(sub) if pulled.zero?
|
|
50
|
+
end
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
def ordered_subscriptions
|
|
54
|
+
subscriptions.sort_by do |sub|
|
|
55
|
+
type = sub[:type].to_s
|
|
56
|
+
[embed_type?(type) ? 0 : 1, context_limit(sub), sub[:model].to_s]
|
|
57
|
+
end
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
def drain_lane(subscription)
|
|
61
|
+
pulls = 0
|
|
62
|
+
queue = queue_for(subscription)
|
|
63
|
+
|
|
64
|
+
loop do
|
|
65
|
+
break if max_consecutive_pulls_per_lane.positive? && pulls >= max_consecutive_pulls_per_lane
|
|
66
|
+
break unless pull_one(queue, subscription)
|
|
67
|
+
|
|
68
|
+
pulls += 1
|
|
69
|
+
end
|
|
70
|
+
pulls
|
|
71
|
+
end
|
|
72
|
+
|
|
73
|
+
def pull_one(queue, subscription)
|
|
74
|
+
delivery_info, metadata, payload = queue.pop(manual_ack: true)
|
|
75
|
+
return false unless delivery_info
|
|
76
|
+
|
|
77
|
+
message = process_payload(payload, metadata, delivery_info, subscription)
|
|
78
|
+
Legion::Extensions::Ollama::Runners::Fleet.handle_request(**message)
|
|
79
|
+
queue.acknowledge(delivery_info.delivery_tag)
|
|
80
|
+
true
|
|
81
|
+
rescue StandardError => e
|
|
82
|
+
handle_exception(e, lex: lex_name, routing_key: delivery_info&.routing_key)
|
|
83
|
+
queue.reject(delivery_info.delivery_tag, requeue: false) if delivery_info
|
|
84
|
+
true
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
def queue_for(subscription)
|
|
88
|
+
@queues ||= {}
|
|
89
|
+
@queues[lane_key(subscription)] ||= ModelWorker.queue_class_for(
|
|
90
|
+
request_type: subscription[:type],
|
|
91
|
+
model: subscription[:model],
|
|
92
|
+
context_window: finite_context_limit(subscription),
|
|
93
|
+
queue_config: queue_config
|
|
94
|
+
).new
|
|
95
|
+
end
|
|
96
|
+
|
|
97
|
+
def process_payload(payload, metadata, delivery_info, subscription)
|
|
98
|
+
message = decode_payload(payload, metadata)
|
|
99
|
+
message = message.merge(metadata.headers.transform_keys(&:to_sym)) if metadata&.headers
|
|
100
|
+
message[:routing_key] = delivery_info.routing_key if delivery_info.respond_to?(:routing_key)
|
|
101
|
+
message[:request_type] ||= subscription[:type].to_s
|
|
102
|
+
message[:model] ||= subscription[:model].to_s
|
|
103
|
+
message[:message_context] ||= {}
|
|
104
|
+
message
|
|
105
|
+
end
|
|
106
|
+
|
|
107
|
+
def decode_payload(payload, metadata)
|
|
108
|
+
decoded = if metadata&.content_encoding == 'encrypted/cs'
|
|
109
|
+
Legion::Crypt.decrypt(payload, metadata_header(metadata, :iv))
|
|
110
|
+
elsif metadata&.content_encoding == 'encrypted/pk'
|
|
111
|
+
Legion::Crypt.decrypt_from_keypair(metadata_header(metadata, :public_key), payload)
|
|
112
|
+
else
|
|
113
|
+
payload
|
|
114
|
+
end
|
|
115
|
+
|
|
116
|
+
if metadata&.content_type == 'application/json'
|
|
117
|
+
Legion::JSON.load(decoded)
|
|
118
|
+
else
|
|
119
|
+
{ value: decoded }
|
|
120
|
+
end
|
|
121
|
+
end
|
|
122
|
+
|
|
123
|
+
def subscriptions
|
|
124
|
+
configured = setting_value(settings, :subscriptions)
|
|
125
|
+
return [] unless configured.is_a?(Array)
|
|
126
|
+
|
|
127
|
+
configured.filter_map do |sub|
|
|
128
|
+
next unless sub.is_a?(Hash)
|
|
129
|
+
|
|
130
|
+
normalized = sub.transform_keys(&:to_sym)
|
|
131
|
+
next unless normalized[:type] && normalized[:model]
|
|
132
|
+
|
|
133
|
+
normalized
|
|
134
|
+
end
|
|
135
|
+
end
|
|
136
|
+
|
|
137
|
+
def queue_config
|
|
138
|
+
{
|
|
139
|
+
queue_expires_ms: nested_setting(settings, :fleet, :queue_expires_ms),
|
|
140
|
+
message_ttl_ms: nested_setting(settings, :fleet, :message_ttl_ms),
|
|
141
|
+
queue_max_length: nested_setting(settings, :fleet, :queue_max_length),
|
|
142
|
+
delivery_limit: nested_setting(settings, :fleet, :delivery_limit),
|
|
143
|
+
consumer_ack_timeout_ms: nested_setting(settings, :fleet, :consumer_ack_timeout_ms)
|
|
144
|
+
}.compact
|
|
145
|
+
end
|
|
146
|
+
|
|
147
|
+
def endpoint_settings
|
|
148
|
+
nested_setting(settings, :fleet, :endpoint) || {}
|
|
149
|
+
end
|
|
150
|
+
|
|
151
|
+
def endpoint_enabled?
|
|
152
|
+
setting_value(endpoint_settings, :enabled) == true
|
|
153
|
+
end
|
|
154
|
+
|
|
155
|
+
def max_consecutive_pulls_per_lane
|
|
156
|
+
setting_value(endpoint_settings, :max_consecutive_pulls_per_lane) || 0
|
|
157
|
+
end
|
|
158
|
+
|
|
159
|
+
def empty_lane_backoff_seconds
|
|
160
|
+
(setting_value(endpoint_settings, :empty_lane_backoff_ms) || 250).to_f / 1000
|
|
161
|
+
end
|
|
162
|
+
|
|
163
|
+
def lane_backed_off?(subscription, now)
|
|
164
|
+
(@empty_lanes ||= {}).fetch(lane_key(subscription), 0) > now
|
|
165
|
+
end
|
|
166
|
+
|
|
167
|
+
def mark_lane_empty(subscription)
|
|
168
|
+
(@empty_lanes ||= {})[lane_key(subscription)] = monotonic_time + empty_lane_backoff_seconds
|
|
169
|
+
end
|
|
170
|
+
|
|
171
|
+
def lane_key(subscription)
|
|
172
|
+
type = subscription[:type]
|
|
173
|
+
model = subscription[:model]
|
|
174
|
+
context = context_limit(subscription)
|
|
175
|
+
context.finite? ? "#{type}:#{model}:ctx#{context}" : "#{type}:#{model}"
|
|
176
|
+
end
|
|
177
|
+
|
|
178
|
+
def monotonic_time
|
|
179
|
+
Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
180
|
+
end
|
|
181
|
+
|
|
182
|
+
def fleet_scheduler
|
|
183
|
+
(nested_setting(settings, :fleet, :scheduler) || :basic_get).to_sym
|
|
184
|
+
end
|
|
185
|
+
|
|
186
|
+
def context_limit(subscription)
|
|
187
|
+
raw = setting_value(subscription, :max_context_size) ||
|
|
188
|
+
setting_value(subscription, :context_window) ||
|
|
189
|
+
setting_value(subscription, :max_input_tokens) ||
|
|
190
|
+
setting_value(subscription, :context) ||
|
|
191
|
+
setting_value(subscription, :ctx)
|
|
192
|
+
Integer(raw || Float::INFINITY)
|
|
193
|
+
rescue ArgumentError, TypeError, FloatDomainError
|
|
194
|
+
Float::INFINITY
|
|
195
|
+
end
|
|
196
|
+
|
|
197
|
+
def finite_context_limit(subscription)
|
|
198
|
+
context = context_limit(subscription)
|
|
199
|
+
context.finite? ? context : nil
|
|
200
|
+
end
|
|
201
|
+
|
|
202
|
+
def embed_type?(type)
|
|
203
|
+
%w[embed embedding embeddings].include?(type)
|
|
204
|
+
end
|
|
205
|
+
|
|
206
|
+
def metadata_header(metadata, key)
|
|
207
|
+
setting_value(metadata&.headers || {}, key)
|
|
208
|
+
end
|
|
209
|
+
|
|
210
|
+
def nested_setting(hash, *keys)
|
|
211
|
+
keys.reduce(hash) do |current, key|
|
|
212
|
+
return nil unless current.respond_to?(:key?)
|
|
213
|
+
|
|
214
|
+
setting_value(current, key)
|
|
215
|
+
end
|
|
216
|
+
end
|
|
217
|
+
|
|
218
|
+
def setting_value(hash, key)
|
|
219
|
+
return nil unless hash.respond_to?(:key?)
|
|
220
|
+
|
|
221
|
+
string_key = key.to_s
|
|
222
|
+
return hash[string_key] if hash.key?(string_key)
|
|
223
|
+
|
|
224
|
+
hash[key] if hash.key?(key)
|
|
225
|
+
end
|
|
226
|
+
end
|
|
227
|
+
end
|
|
228
|
+
end
|
|
229
|
+
end
|
|
230
|
+
end
|
|
@@ -4,84 +4,43 @@ module Legion
|
|
|
4
4
|
module Extensions
|
|
5
5
|
module Ollama
|
|
6
6
|
module Actor
|
|
7
|
-
# Once actor —
|
|
8
|
-
#
|
|
9
|
-
#
|
|
7
|
+
# Once actor — fires 5s after extension load and calls
|
|
8
|
+
# Runners::S3Models#sync_configured_models to pull any configured
|
|
9
|
+
# default models from S3 that are not already present locally.
|
|
10
10
|
#
|
|
11
|
-
#
|
|
12
|
-
# {
|
|
13
|
-
# "legion": {
|
|
14
|
-
# "ollama": {
|
|
15
|
-
# "s3": {
|
|
16
|
-
# "bucket": "legion",
|
|
17
|
-
# "prefix": "ollama/models",
|
|
18
|
-
# "endpoint": "https://s3.example.internal"
|
|
19
|
-
# },
|
|
20
|
-
# "default_models": ["qwen3.5:4b", "nomic-embed-text:latest"]
|
|
21
|
-
# }
|
|
22
|
-
# }
|
|
23
|
-
# }
|
|
11
|
+
# All download logic lives in the runner. This actor is only the trigger.
|
|
24
12
|
class ModelSync < Legion::Extensions::Actors::Once
|
|
25
|
-
include Legion::Logging::Helper
|
|
26
|
-
|
|
27
|
-
# Run 5 seconds after extension load to allow the rest of startup to complete.
|
|
28
13
|
def delay
|
|
29
14
|
5.0
|
|
30
15
|
end
|
|
31
16
|
|
|
32
|
-
def use_runner?
|
|
33
|
-
false
|
|
34
|
-
end
|
|
35
|
-
|
|
36
17
|
def runner_class
|
|
37
|
-
|
|
18
|
+
Legion::Extensions::Ollama::Runners::S3Models
|
|
38
19
|
end
|
|
39
20
|
|
|
40
|
-
def
|
|
41
|
-
|
|
21
|
+
def runner_function
|
|
22
|
+
'sync_configured_models'
|
|
23
|
+
end
|
|
42
24
|
|
|
43
|
-
|
|
44
|
-
s3_cfg = Legion::Settings.dig(:ollama, :s3)
|
|
45
|
-
models.is_a?(Array) && !models.empty? && s3_cfg.is_a?(Hash) && s3_cfg[:bucket]
|
|
46
|
-
rescue StandardError => e
|
|
47
|
-
handle_exception(e, level: :warn, handled: true)
|
|
25
|
+
def use_runner?
|
|
48
26
|
false
|
|
49
27
|
end
|
|
50
28
|
|
|
51
|
-
def
|
|
52
|
-
|
|
53
|
-
s3_cfg = Legion::Settings.dig(:ollama, :s3)
|
|
54
|
-
bucket = s3_cfg[:bucket]
|
|
55
|
-
s3_opts = s3_cfg.except(:bucket)
|
|
56
|
-
|
|
57
|
-
client = Object.new.extend(Legion::Extensions::Ollama::Runners::S3Models)
|
|
58
|
-
models_path = ENV.fetch('OLLAMA_MODELS', File.join(Dir.home, '.ollama', 'models'))
|
|
59
|
-
|
|
60
|
-
models.each do |model|
|
|
61
|
-
if model_present_locally?(model, models_path)
|
|
62
|
-
log.debug "[ModelSync] #{model} already present locally, skipping"
|
|
63
|
-
next
|
|
64
|
-
end
|
|
65
|
-
|
|
66
|
-
log.info "[ModelSync] importing #{model} from S3"
|
|
67
|
-
result = client.import_from_s3(model: model, bucket: bucket, models_path: models_path, **s3_opts)
|
|
68
|
-
if result[:status] == 200
|
|
69
|
-
log.info "[ModelSync] imported #{model} (blobs_downloaded=#{result[:blobs_downloaded]}, blobs_skipped=#{result[:blobs_skipped]})"
|
|
70
|
-
else
|
|
71
|
-
log.warn "[ModelSync] failed to import #{model}: #{result.inspect}"
|
|
72
|
-
end
|
|
73
|
-
rescue StandardError => e
|
|
74
|
-
handle_exception(e, level: :error, handled: true, model: model)
|
|
75
|
-
end
|
|
29
|
+
def check_subtask?
|
|
30
|
+
false
|
|
76
31
|
end
|
|
77
32
|
|
|
78
|
-
|
|
33
|
+
def generate_task?
|
|
34
|
+
false
|
|
35
|
+
end
|
|
79
36
|
|
|
80
|
-
def
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
37
|
+
def enabled?
|
|
38
|
+
s3_cfg = settings[:s3]
|
|
39
|
+
models = settings[:default_models]
|
|
40
|
+
s3_cfg.is_a?(Hash) && !s3_cfg[:bucket].nil? && models.is_a?(Array) && !models.empty?
|
|
41
|
+
rescue StandardError => e
|
|
42
|
+
handle_exception(e, level: :warn, handled: true)
|
|
43
|
+
false
|
|
85
44
|
end
|
|
86
45
|
end
|
|
87
46
|
end
|
|
@@ -19,15 +19,17 @@ module Legion
|
|
|
19
19
|
# - type: chat
|
|
20
20
|
# model: "qwen3.5:27b"
|
|
21
21
|
#
|
|
22
|
-
# The queue name and routing key both follow
|
|
23
|
-
# llm.
|
|
24
|
-
#
|
|
22
|
+
# The queue name and routing key both follow shared fleet lane schemas:
|
|
23
|
+
# llm.fleet.embed.<model>
|
|
24
|
+
# llm.fleet.inference.<model>.ctx<context_window>
|
|
25
|
+
# when an inference context window is known.
|
|
25
26
|
class ModelWorker < Legion::Extensions::Actors::Subscription
|
|
26
|
-
attr_reader :request_type, :model_name
|
|
27
|
+
attr_reader :request_type, :model_name, :context_window
|
|
27
28
|
|
|
28
|
-
def initialize(request_type:, model:, **)
|
|
29
|
-
@request_type
|
|
30
|
-
@model_name
|
|
29
|
+
def initialize(request_type:, model:, context_window: nil, **)
|
|
30
|
+
@request_type = request_type.to_s
|
|
31
|
+
@model_name = model.to_s
|
|
32
|
+
@context_window = context_window&.to_i
|
|
31
33
|
super(**)
|
|
32
34
|
end
|
|
33
35
|
|
|
@@ -59,9 +61,27 @@ module Legion
|
|
|
59
61
|
# Standard scale: GPU server = 10, Mac Studio = 5, developer laptop = 1.
|
|
60
62
|
# Defaults to 0 (equal priority) if not configured.
|
|
61
63
|
def consumer_priority
|
|
62
|
-
|
|
64
|
+
setting_value(fleet_settings, :consumer_priority) || 0
|
|
65
|
+
end
|
|
66
|
+
|
|
67
|
+
def queue_expires_ms
|
|
68
|
+
setting_value(fleet_settings, :queue_expires_ms) || 60_000
|
|
69
|
+
end
|
|
70
|
+
|
|
71
|
+
def message_ttl_ms
|
|
72
|
+
setting_value(fleet_settings, :message_ttl_ms) || 120_000
|
|
73
|
+
end
|
|
74
|
+
|
|
75
|
+
def queue_max_length
|
|
76
|
+
setting_value(fleet_settings, :queue_max_length) || 100
|
|
77
|
+
end
|
|
63
78
|
|
|
64
|
-
|
|
79
|
+
def delivery_limit
|
|
80
|
+
setting_value(fleet_settings, :delivery_limit) || 3
|
|
81
|
+
end
|
|
82
|
+
|
|
83
|
+
def consumer_ack_timeout_ms
|
|
84
|
+
setting_value(fleet_settings, :consumer_ack_timeout_ms) || 300_000
|
|
65
85
|
end
|
|
66
86
|
|
|
67
87
|
# Subscribe options include x-priority argument so RabbitMQ can honour
|
|
@@ -75,14 +95,45 @@ module Legion
|
|
|
75
95
|
base.merge(arguments: { 'x-priority' => consumer_priority })
|
|
76
96
|
end
|
|
77
97
|
|
|
78
|
-
# Returns a queue CLASS (not instance) bound to the llm.
|
|
79
|
-
# with the routing key for this worker's
|
|
98
|
+
# Returns a queue CLASS (not instance) bound to the llm.fleet exchange
|
|
99
|
+
# with the routing key for this worker's model offering lane.
|
|
80
100
|
# The Subscription base class calls queue.new in initialize, so this must
|
|
81
101
|
# return a class, not an instance.
|
|
82
102
|
def queue
|
|
83
103
|
@queue ||= build_queue_class
|
|
84
104
|
end
|
|
85
105
|
|
|
106
|
+
def self.queue_class_for(request_type:, model:, context_window: nil, queue_config: {})
|
|
107
|
+
worker = allocate
|
|
108
|
+
worker.instance_variable_set(:@request_type, request_type.to_s)
|
|
109
|
+
worker.instance_variable_set(:@model_name, model.to_s)
|
|
110
|
+
worker.instance_variable_set(:@context_window, context_window&.to_i)
|
|
111
|
+
worker.send(:build_queue_class, queue_config)
|
|
112
|
+
end
|
|
113
|
+
|
|
114
|
+
def self.fallback_queue_options(settings)
|
|
115
|
+
{
|
|
116
|
+
durable: true,
|
|
117
|
+
auto_delete: false,
|
|
118
|
+
arguments: {
|
|
119
|
+
'x-queue-type' => 'quorum',
|
|
120
|
+
'x-queue-leader-locator' => 'balanced',
|
|
121
|
+
'x-expires' => settings.fetch(:queue_expires_ms),
|
|
122
|
+
'x-message-ttl' => settings.fetch(:message_ttl_ms),
|
|
123
|
+
'x-overflow' => 'reject-publish',
|
|
124
|
+
'x-max-length' => settings.fetch(:queue_max_length),
|
|
125
|
+
'x-delivery-limit' => settings.fetch(:delivery_limit),
|
|
126
|
+
'x-consumer-timeout' => settings.fetch(:consumer_ack_timeout_ms)
|
|
127
|
+
}
|
|
128
|
+
}
|
|
129
|
+
end
|
|
130
|
+
|
|
131
|
+
def routing_key
|
|
132
|
+
parts = ['llm.fleet', lane_kind, sanitized_model]
|
|
133
|
+
parts << "ctx#{@context_window}" if lane_kind == 'inference' && @context_window
|
|
134
|
+
parts.join('.')
|
|
135
|
+
end
|
|
136
|
+
|
|
86
137
|
# Enrich every inbound message with the worker's own request_type and model
|
|
87
138
|
# so Runners::Fleet#handle_request always has them, even if the sender omitted
|
|
88
139
|
# them. Also defaults message_context to {} if absent.
|
|
@@ -96,23 +147,62 @@ module Legion
|
|
|
96
147
|
|
|
97
148
|
private
|
|
98
149
|
|
|
99
|
-
def build_queue_class
|
|
100
|
-
|
|
101
|
-
routing_key = "llm.request.ollama.#{@request_type}.#{sanitised_model}"
|
|
150
|
+
def build_queue_class(queue_config = {})
|
|
151
|
+
lane_key = routing_key
|
|
102
152
|
exchange_class = Transport::Exchanges::LlmRequest
|
|
153
|
+
queue_settings = {
|
|
154
|
+
queue_expires_ms: queue_expires_ms,
|
|
155
|
+
message_ttl_ms: message_ttl_ms,
|
|
156
|
+
queue_max_length: queue_max_length,
|
|
157
|
+
delivery_limit: delivery_limit,
|
|
158
|
+
consumer_ack_timeout_ms: consumer_ack_timeout_ms
|
|
159
|
+
}.merge((queue_config || {}).compact)
|
|
160
|
+
|
|
161
|
+
if defined?(::Legion::Extensions::Llm::Transport::FleetLane)
|
|
162
|
+
return ::Legion::Extensions::Llm::Transport::FleetLane.build_queue_class(
|
|
163
|
+
queue_name: lane_key,
|
|
164
|
+
exchange_class: exchange_class,
|
|
165
|
+
routing_key: lane_key,
|
|
166
|
+
base_queue_class: Legion::Transport::Queue,
|
|
167
|
+
settings: queue_settings
|
|
168
|
+
)
|
|
169
|
+
end
|
|
170
|
+
|
|
171
|
+
queue_options = self.class.fallback_queue_options(queue_settings)
|
|
103
172
|
|
|
104
173
|
Class.new(Legion::Transport::Queue) do
|
|
105
|
-
define_method(:queue_name) {
|
|
106
|
-
define_method(:queue_options)
|
|
107
|
-
{ durable: false, auto_delete: true, arguments: { 'x-max-priority' => 10 } }
|
|
108
|
-
end
|
|
174
|
+
define_method(:queue_name) { lane_key }
|
|
175
|
+
define_method(:queue_options) { queue_options }
|
|
109
176
|
define_method(:dlx_enabled) { false }
|
|
110
177
|
define_method(:initialize) do
|
|
111
178
|
super()
|
|
112
|
-
bind(exchange_class.new, routing_key:
|
|
179
|
+
bind(exchange_class.new, routing_key: lane_key)
|
|
113
180
|
end
|
|
114
181
|
end
|
|
115
182
|
end
|
|
183
|
+
|
|
184
|
+
def fleet_settings
|
|
185
|
+
setting_value(settings, :fleet) || {}
|
|
186
|
+
rescue NameError
|
|
187
|
+
{}
|
|
188
|
+
end
|
|
189
|
+
|
|
190
|
+
def setting_value(hash, key)
|
|
191
|
+
return nil unless hash.respond_to?(:key?)
|
|
192
|
+
|
|
193
|
+
string_key = key.to_s
|
|
194
|
+
return hash[string_key] if hash.key?(string_key)
|
|
195
|
+
|
|
196
|
+
hash[key] if hash.key?(key)
|
|
197
|
+
end
|
|
198
|
+
|
|
199
|
+
def lane_kind
|
|
200
|
+
%w[embed embedding embeddings].include?(@request_type) ? 'embed' : 'inference'
|
|
201
|
+
end
|
|
202
|
+
|
|
203
|
+
def sanitized_model
|
|
204
|
+
@model_name.downcase.gsub(/[^a-z0-9]+/, '-').gsub(/\A-+|-+\z/, '').squeeze('-')
|
|
205
|
+
end
|
|
116
206
|
end
|
|
117
207
|
end
|
|
118
208
|
end
|
|
@@ -145,6 +145,29 @@ module Legion
|
|
|
145
145
|
{ result: results, status: 200 }
|
|
146
146
|
end
|
|
147
147
|
|
|
148
|
+
def sync_configured_models(**)
|
|
149
|
+
s3_cfg = settings[:s3]
|
|
150
|
+
models = settings[:default_models]
|
|
151
|
+
|
|
152
|
+
return { result: false, status: 412, error: 'no s3 config' } unless s3_cfg.is_a?(Hash) && s3_cfg[:bucket]
|
|
153
|
+
return { result: false, status: 412, error: 'no default_models configured' } unless models.is_a?(Array) && !models.empty?
|
|
154
|
+
|
|
155
|
+
bucket = s3_cfg[:bucket]
|
|
156
|
+
s3_opts = s3_cfg.except(:bucket)
|
|
157
|
+
models_path = ENV.fetch('OLLAMA_MODELS', File.join(Dir.home, '.ollama', 'models'))
|
|
158
|
+
|
|
159
|
+
results = models.filter_map do |model|
|
|
160
|
+
name, tag = model.split(':')
|
|
161
|
+
tag ||= 'latest'
|
|
162
|
+
manifest = File.join(models_path, 'manifests', 'registry.ollama.ai', 'library', name, tag)
|
|
163
|
+
next if File.exist?(manifest)
|
|
164
|
+
|
|
165
|
+
import_from_s3(model: model, bucket: bucket, models_path: models_path, **s3_opts)
|
|
166
|
+
end
|
|
167
|
+
|
|
168
|
+
{ result: results, status: 200 }
|
|
169
|
+
end
|
|
170
|
+
|
|
148
171
|
private
|
|
149
172
|
|
|
150
173
|
def default_models_path
|
|
@@ -14,6 +14,12 @@ require 'legion/extensions/ollama/runners/version'
|
|
|
14
14
|
require 'legion/extensions/ollama/runners/fleet'
|
|
15
15
|
require 'legion/extensions/ollama/client'
|
|
16
16
|
|
|
17
|
+
begin
|
|
18
|
+
require 'legion/extensions/llm'
|
|
19
|
+
rescue LoadError
|
|
20
|
+
nil
|
|
21
|
+
end
|
|
22
|
+
|
|
17
23
|
# Fleet transport and actor wiring — only loaded when Legion::Extensions::Core is present
|
|
18
24
|
# so the gem still works as a standalone HTTP client without any AMQP runtime.
|
|
19
25
|
if Legion::Extensions.const_defined?(:Core, false)
|
|
@@ -21,6 +27,7 @@ if Legion::Extensions.const_defined?(:Core, false)
|
|
|
21
27
|
require 'legion/extensions/ollama/transport/messages/llm_response'
|
|
22
28
|
require 'legion/extensions/ollama/transport'
|
|
23
29
|
require 'legion/extensions/ollama/actors/model_worker'
|
|
30
|
+
require 'legion/extensions/ollama/actors/endpoint_puller'
|
|
24
31
|
require 'legion/extensions/ollama/actors/model_sync'
|
|
25
32
|
end
|
|
26
33
|
|
|
@@ -28,6 +35,99 @@ module Legion
|
|
|
28
35
|
module Extensions
|
|
29
36
|
module Ollama
|
|
30
37
|
extend Legion::Extensions::Core if Legion::Extensions.const_defined?(:Core, false)
|
|
38
|
+
|
|
39
|
+
def self.default_settings
|
|
40
|
+
{
|
|
41
|
+
s3: {},
|
|
42
|
+
fleet: {
|
|
43
|
+
consumer_priority: 0,
|
|
44
|
+
scheduler: :basic_get,
|
|
45
|
+
queue_expires_ms: 60_000,
|
|
46
|
+
message_ttl_ms: 120_000,
|
|
47
|
+
queue_max_length: 100,
|
|
48
|
+
delivery_limit: 3,
|
|
49
|
+
consumer_ack_timeout_ms: 300_000,
|
|
50
|
+
endpoint: {
|
|
51
|
+
enabled: false,
|
|
52
|
+
empty_lane_backoff_ms: 250,
|
|
53
|
+
idle_backoff_ms: 1_000,
|
|
54
|
+
max_consecutive_pulls_per_lane: 0,
|
|
55
|
+
accept_when: []
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
}
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
# Called by the framework during autobuild. Runs normal actor discovery,
|
|
62
|
+
# then replaces the single ModelWorker entry with one concrete subclass
|
|
63
|
+
# per subscription entry in settings (each has a zero-arg initialize).
|
|
64
|
+
def self.build_actors
|
|
65
|
+
super
|
|
66
|
+
@actors.delete(:model_worker)
|
|
67
|
+
|
|
68
|
+
subs = setting_value(settings, :subscriptions)
|
|
69
|
+
valid_subscriptions = valid_fleet_subscriptions(subs)
|
|
70
|
+
endpoint_configured = fleet_scheduler == :basic_get &&
|
|
71
|
+
nested_setting(settings, :fleet, :endpoint, :enabled) == true &&
|
|
72
|
+
valid_subscriptions.any?
|
|
73
|
+
@actors.delete(:endpoint_puller) unless endpoint_configured
|
|
74
|
+
|
|
75
|
+
return unless subs.is_a?(Array)
|
|
76
|
+
return if fleet_scheduler == :basic_get
|
|
77
|
+
|
|
78
|
+
subs.each do |sub|
|
|
79
|
+
request_type = setting_value(sub, :type)&.to_s
|
|
80
|
+
model = setting_value(sub, :model)&.to_s
|
|
81
|
+
limits = setting_value(sub, :limits) || {}
|
|
82
|
+
context_window = setting_value(sub, :context_window) ||
|
|
83
|
+
setting_value(limits, :context_window)
|
|
84
|
+
next unless request_type && model
|
|
85
|
+
|
|
86
|
+
actor_name = :"model_worker_#{request_type}_#{model.tr(':.', '__')}"
|
|
87
|
+
worker_class = Class.new(Legion::Extensions::Ollama::Actor::ModelWorker) do
|
|
88
|
+
define_method(:initialize) do
|
|
89
|
+
super(request_type: request_type, model: model, context_window: context_window)
|
|
90
|
+
end
|
|
91
|
+
end
|
|
92
|
+
|
|
93
|
+
@actors[actor_name] = {
|
|
94
|
+
extension: 'lex-ollama',
|
|
95
|
+
extension_name: :ollama,
|
|
96
|
+
actor_name: actor_name,
|
|
97
|
+
actor_class: worker_class,
|
|
98
|
+
type: 'literal'
|
|
99
|
+
}
|
|
100
|
+
end
|
|
101
|
+
end
|
|
102
|
+
|
|
103
|
+
def self.fleet_scheduler
|
|
104
|
+
(nested_setting(settings, :fleet, :scheduler) || :basic_get).to_sym
|
|
105
|
+
end
|
|
106
|
+
|
|
107
|
+
def self.valid_fleet_subscriptions(subscriptions)
|
|
108
|
+
return [] unless subscriptions.is_a?(Array)
|
|
109
|
+
|
|
110
|
+
subscriptions.select do |sub|
|
|
111
|
+
setting_value(sub, :type) && setting_value(sub, :model)
|
|
112
|
+
end
|
|
113
|
+
end
|
|
114
|
+
|
|
115
|
+
def self.setting_value(hash, key)
|
|
116
|
+
return nil unless hash.respond_to?(:key?)
|
|
117
|
+
|
|
118
|
+
string_key = key.to_s
|
|
119
|
+
return hash[string_key] if hash.key?(string_key)
|
|
120
|
+
|
|
121
|
+
hash[key] if hash.key?(key)
|
|
122
|
+
end
|
|
123
|
+
|
|
124
|
+
def self.nested_setting(hash, *keys)
|
|
125
|
+
keys.reduce(hash) do |current, key|
|
|
126
|
+
return nil unless current.respond_to?(:key?)
|
|
127
|
+
|
|
128
|
+
setting_value(current, key)
|
|
129
|
+
end
|
|
130
|
+
end
|
|
31
131
|
end
|
|
32
132
|
end
|
|
33
133
|
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: lex-ollama
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.3.
|
|
4
|
+
version: 0.3.5
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Esity
|
|
@@ -56,6 +56,7 @@ files:
|
|
|
56
56
|
- README.md
|
|
57
57
|
- lex-ollama.gemspec
|
|
58
58
|
- lib/legion/extensions/ollama.rb
|
|
59
|
+
- lib/legion/extensions/ollama/actors/endpoint_puller.rb
|
|
59
60
|
- lib/legion/extensions/ollama/actors/model_sync.rb
|
|
60
61
|
- lib/legion/extensions/ollama/actors/model_worker.rb
|
|
61
62
|
- lib/legion/extensions/ollama/client.rb
|