embedding_util 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/README.md +7 -1
- data/lib/embedding_util/cli.rb +4 -0
- data/lib/embedding_util/configuration.rb +3 -1
- data/lib/embedding_util/profiles.rb +1 -1
- data/lib/embedding_util/providers/endpoint.rb +13 -1
- data/lib/embedding_util/providers/self_hosted.rb +16 -1
- data/lib/embedding_util/runtime_command.rb +5 -4
- data/lib/embedding_util/server_manager.rb +65 -1
- data/lib/embedding_util/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: fb8276720e51a283fb4b8c70507ae54d25ea968caffe8142f636a6e6ea948b71
|
|
4
|
+
data.tar.gz: eccb6e6d11006238a1af1bfc25fb3b5146325bf78a1deb0f447a73541dec128d
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: e625a2389828218a8f39cc4dc69d6ea965f59e032d190407668029df227f3c7ffee32ce30b64141d8a38654573ac591315c08696f051db3f68772efaf1b7bf34
|
|
7
|
+
data.tar.gz: 5da07a5487e7167128b67d8b6e39009d6a2fd4c0949daf1e61ae2cb9e98fa668beecd618713b1d0545daa47a86ac6f4e77f66d3429093c658852b594b4aa3cc5
|
data/CHANGELOG.md
CHANGED
|
@@ -1,3 +1,11 @@
|
|
|
1
|
+
## [0.1.2] - 2026-06-10
|
|
2
|
+
|
|
3
|
+
- Add self-hosted reranker recovery for llama.cpp physical batch-size failures
|
|
4
|
+
- Start managed reranker servers with `--ubatch-size 1024`
|
|
5
|
+
- Restart managed reranker servers once with `--ubatch-size 4096` when larger rerank requests require it
|
|
6
|
+
- Add configuration and CLI options for reranker ubatch defaults and maximums
|
|
7
|
+
- Add clearer guidance for app-managed reranker endpoints that need a larger `--ubatch-size`
|
|
8
|
+
|
|
1
9
|
## [0.1.1] - 2026-06-08
|
|
2
10
|
|
|
3
11
|
- Fix local server lifecycle cleanup for Ramalama and direct `llama-server`
|
data/README.md
CHANGED
|
@@ -118,7 +118,9 @@ Reranker model:
|
|
|
118
118
|
|
|
119
119
|
- repo: `ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF`
|
|
120
120
|
- file: `qwen3-reranker-0.6b-q8_0.gguf`
|
|
121
|
-
- server flags: `--reranking`
|
|
121
|
+
- server flags: `--reranking --ubatch-size 1024`
|
|
122
|
+
|
|
123
|
+
For self-hosted rerankers, `embedding_util` starts with `--ubatch-size 1024`. If a larger rerank request exceeds llama.cpp's physical batch size, the managed reranker is restarted once with `config.reranker_max_ubatch_size`, which defaults to `4096`, and the request is retried.
|
|
122
124
|
|
|
123
125
|
Do not combine embedding and reranking flags for this profile. Run separate local servers.
|
|
124
126
|
|
|
@@ -133,6 +135,8 @@ EmbeddingUtil.configure do |config|
|
|
|
133
135
|
config.reranker_port = 18081
|
|
134
136
|
config.startup_timeout = 3600
|
|
135
137
|
config.shutdown_idle = 300
|
|
138
|
+
config.reranker_ubatch_size = 1024
|
|
139
|
+
config.reranker_max_ubatch_size = 4096
|
|
136
140
|
config.timeout = 60
|
|
137
141
|
end
|
|
138
142
|
```
|
|
@@ -159,6 +163,8 @@ Environment variables are also supported:
|
|
|
159
163
|
- `EMBEDDING_UTIL_VERBOSE`
|
|
160
164
|
- `EMBEDDING_UTIL_EMBEDDING_PORT`
|
|
161
165
|
- `EMBEDDING_UTIL_RERANKER_PORT`
|
|
166
|
+
- `EMBEDDING_UTIL_RERANKER_UBATCH_SIZE`
|
|
167
|
+
- `EMBEDDING_UTIL_RERANKER_MAX_UBATCH_SIZE`
|
|
162
168
|
|
|
163
169
|
## Development
|
|
164
170
|
|
data/lib/embedding_util/cli.rb
CHANGED
|
@@ -14,6 +14,8 @@ module EmbeddingUtil
|
|
|
14
14
|
timeout: ->(value) { value },
|
|
15
15
|
startup_timeout: ->(value) { value },
|
|
16
16
|
shutdown_idle: :to_i.to_proc,
|
|
17
|
+
reranker_ubatch_size: :to_i.to_proc,
|
|
18
|
+
reranker_max_ubatch_size: :to_i.to_proc,
|
|
17
19
|
verbose: ->(value) { value }
|
|
18
20
|
}.freeze
|
|
19
21
|
|
|
@@ -25,6 +27,8 @@ module EmbeddingUtil
|
|
|
25
27
|
class_option :timeout, type: :numeric, desc: "HTTP timeout in seconds"
|
|
26
28
|
class_option :startup_timeout, type: :numeric, desc: "Seconds to wait for self-hosted server startup"
|
|
27
29
|
class_option :shutdown_idle, type: :numeric, desc: "Stop self-hosted server after this many seconds without stdout/stderr activity"
|
|
30
|
+
class_option :reranker_ubatch_size, type: :numeric, desc: "llama.cpp physical batch size for self-hosted reranker servers"
|
|
31
|
+
class_option :reranker_max_ubatch_size, type: :numeric, desc: "Largest reranker physical batch size for automatic retry"
|
|
28
32
|
class_option :verbose, type: :boolean, desc: "Print self-hosting diagnostics"
|
|
29
33
|
|
|
30
34
|
desc "support", "Display configured provider support"
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
module EmbeddingUtil
|
|
4
4
|
class Configuration
|
|
5
5
|
attr_accessor :profile, :provider, :endpoint, :embedding_endpoint, :reranker_endpoint, :timeout, :startup_timeout, :shutdown_idle, :host,
|
|
6
|
-
:embedding_port, :reranker_port, :state_dir, :verbose
|
|
6
|
+
:embedding_port, :reranker_port, :state_dir, :verbose, :reranker_ubatch_size, :reranker_max_ubatch_size
|
|
7
7
|
attr_reader :runtime
|
|
8
8
|
|
|
9
9
|
def initialize
|
|
@@ -19,6 +19,8 @@ module EmbeddingUtil
|
|
|
19
19
|
@host = ENV.fetch("EMBEDDING_UTIL_HOST", "127.0.0.1")
|
|
20
20
|
@embedding_port = Integer(ENV.fetch("EMBEDDING_UTIL_EMBEDDING_PORT", "18080"))
|
|
21
21
|
@reranker_port = Integer(ENV.fetch("EMBEDDING_UTIL_RERANKER_PORT", "18081"))
|
|
22
|
+
@reranker_ubatch_size = Integer(ENV.fetch("EMBEDDING_UTIL_RERANKER_UBATCH_SIZE", "1024"))
|
|
23
|
+
@reranker_max_ubatch_size = Integer(ENV.fetch("EMBEDDING_UTIL_RERANKER_MAX_UBATCH_SIZE", "4096"))
|
|
22
24
|
@state_dir = ENV.fetch("EMBEDDING_UTIL_STATE_DIR", File.expand_path("~/.local/state/embedding_util"))
|
|
23
25
|
@verbose = ENV.fetch("EMBEDDING_UTIL_VERBOSE", "false").match?(/\A(?:1|true|yes|on)\z/i)
|
|
24
26
|
end
|
|
@@ -118,7 +118,7 @@ module EmbeddingUtil
|
|
|
118
118
|
end
|
|
119
119
|
|
|
120
120
|
raise EndpointNotFoundError.new(uri, path: path, body: response.body) if response.code.to_i == 404 && route_missing_response?(response.body)
|
|
121
|
-
raise EndpointError,
|
|
121
|
+
raise EndpointError, endpoint_error_message(uri, response, path) unless response.is_a?(Net::HTTPSuccess)
|
|
122
122
|
|
|
123
123
|
JSON.parse(response.body)
|
|
124
124
|
rescue JSON::ParserError => e
|
|
@@ -148,6 +148,18 @@ module EmbeddingUtil
|
|
|
148
148
|
def fallback_rerank_not_found?(error)
|
|
149
149
|
error.path == "/v1/rerank"
|
|
150
150
|
end
|
|
151
|
+
|
|
152
|
+
def endpoint_error_message(uri, response, path)
|
|
153
|
+
message = "#{uri} returned #{response.code}: #{response.body}"
|
|
154
|
+
return message unless reranker_batch_size_error?(path, response.body)
|
|
155
|
+
|
|
156
|
+
"#{message}. Restart the reranker server with a larger llama.cpp --ubatch-size; " \
|
|
157
|
+
"embedding_util-managed reranker servers use --ubatch-size 1024 by default."
|
|
158
|
+
end
|
|
159
|
+
|
|
160
|
+
def reranker_batch_size_error?(path, body)
|
|
161
|
+
path.end_with?("/rerank") && body.to_s.include?("increase the physical batch size")
|
|
162
|
+
end
|
|
151
163
|
end
|
|
152
164
|
end
|
|
153
165
|
end
|
|
@@ -27,7 +27,14 @@ module EmbeddingUtil
|
|
|
27
27
|
end
|
|
28
28
|
|
|
29
29
|
def rerank(query, documents, profile: config.resolved_profile)
|
|
30
|
-
|
|
30
|
+
manager = ServerManager.new(config: config)
|
|
31
|
+
endpoint = manager.ensure_server(:reranker, profile: profile)
|
|
32
|
+
endpoint_provider(reranker_endpoint: endpoint).rerank(query, documents, profile: profile)
|
|
33
|
+
rescue EndpointError => e
|
|
34
|
+
raise unless reranker_batch_size_error?(e) && can_escalate_reranker_ubatch?
|
|
35
|
+
|
|
36
|
+
config.reranker_ubatch_size = config.reranker_max_ubatch_size
|
|
37
|
+
endpoint = manager.restart_server(:reranker, profile: profile)
|
|
31
38
|
endpoint_provider(reranker_endpoint: endpoint).rerank(query, documents, profile: profile)
|
|
32
39
|
end
|
|
33
40
|
|
|
@@ -39,6 +46,14 @@ module EmbeddingUtil
|
|
|
39
46
|
endpoint_config.reranker_endpoint = reranker_endpoint
|
|
40
47
|
Endpoint.new(config: endpoint_config)
|
|
41
48
|
end
|
|
49
|
+
|
|
50
|
+
def reranker_batch_size_error?(error)
|
|
51
|
+
error.message.include?("increase the physical batch size")
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
def can_escalate_reranker_ubatch?
|
|
55
|
+
config.reranker_ubatch_size < config.reranker_max_ubatch_size
|
|
56
|
+
end
|
|
42
57
|
end
|
|
43
58
|
end
|
|
44
59
|
end
|
|
@@ -2,13 +2,14 @@
|
|
|
2
2
|
|
|
3
3
|
module EmbeddingUtil
|
|
4
4
|
class RuntimeCommand
|
|
5
|
-
attr_reader :runtime, :server_model, :host, :port
|
|
5
|
+
attr_reader :runtime, :server_model, :host, :port, :server_flags
|
|
6
6
|
|
|
7
|
-
def initialize(runtime:, server_model:, host:, port:)
|
|
7
|
+
def initialize(runtime:, server_model:, host:, port:, server_flags: nil)
|
|
8
8
|
@runtime = self.class.normalize_runtime(runtime)
|
|
9
9
|
@server_model = server_model
|
|
10
10
|
@host = host
|
|
11
11
|
@port = port
|
|
12
|
+
@server_flags = server_flags || server_model.settings.fetch(:server_flags)
|
|
12
13
|
end
|
|
13
14
|
|
|
14
15
|
def self.available?(runtime)
|
|
@@ -86,7 +87,7 @@ module EmbeddingUtil
|
|
|
86
87
|
"--name", server_name,
|
|
87
88
|
"--host", host,
|
|
88
89
|
"--port", port.to_s,
|
|
89
|
-
"--runtime-args=#{
|
|
90
|
+
"--runtime-args=#{server_flags.join(' ')}",
|
|
90
91
|
huggingface_model
|
|
91
92
|
]
|
|
92
93
|
end
|
|
@@ -98,7 +99,7 @@ module EmbeddingUtil
|
|
|
98
99
|
"--port", port.to_s,
|
|
99
100
|
"-hf", server_model.settings.fetch(:repo),
|
|
100
101
|
"-hff", server_model.settings.fetch(:file),
|
|
101
|
-
*
|
|
102
|
+
*server_flags
|
|
102
103
|
]
|
|
103
104
|
end
|
|
104
105
|
|
|
@@ -37,7 +37,7 @@ module EmbeddingUtil
|
|
|
37
37
|
server_model = model.is_a?(ServerModel) ? model : ServerModel.parse(model)
|
|
38
38
|
resolved_runtime = RuntimeCommand.resolve(runtime)
|
|
39
39
|
selected_port = selected_port_for(server_model, host: host, port: port)
|
|
40
|
-
command =
|
|
40
|
+
command = runtime_command(resolved_runtime, server_model, host, selected_port)
|
|
41
41
|
last_output_at = Time.now
|
|
42
42
|
|
|
43
43
|
FileUtils.mkdir_p(config.state_dir)
|
|
@@ -61,6 +61,17 @@ module EmbeddingUtil
|
|
|
61
61
|
end
|
|
62
62
|
end
|
|
63
63
|
|
|
64
|
+
def restart_server(capability, profile: config.resolved_profile)
|
|
65
|
+
server_model = ServerModel.for(capability, profile)
|
|
66
|
+
|
|
67
|
+
with_lock(server_model) do
|
|
68
|
+
stop_server(server_model)
|
|
69
|
+
start_background(server_model)
|
|
70
|
+
end
|
|
71
|
+
|
|
72
|
+
wait_for_healthy(server_model, log_path: server_log_path(server_model))
|
|
73
|
+
end
|
|
74
|
+
|
|
64
75
|
private
|
|
65
76
|
|
|
66
77
|
def start_background(server_model)
|
|
@@ -75,6 +86,8 @@ module EmbeddingUtil
|
|
|
75
86
|
"--port", selected_port.to_s
|
|
76
87
|
]
|
|
77
88
|
argv.push("--shutdown-idle", config.shutdown_idle.to_s) unless config.shutdown_idle.nil?
|
|
89
|
+
argv.push("--reranker-ubatch-size", config.reranker_ubatch_size.to_s)
|
|
90
|
+
argv.push("--reranker-max-ubatch-size", config.reranker_max_ubatch_size.to_s)
|
|
78
91
|
warn "starting #{server_model.name} in background: #{argv.join(' ')}" if config.verbose
|
|
79
92
|
warn "#{server_model.name} log: #{log_path}" if config.verbose
|
|
80
93
|
pid = Process.spawn(*argv, out: [log_path, "a"], err: %i[child out], pgroup: true)
|
|
@@ -100,6 +113,42 @@ module EmbeddingUtil
|
|
|
100
113
|
available_port(host, server_model.default_port(config))
|
|
101
114
|
end
|
|
102
115
|
|
|
116
|
+
def runtime_command(runtime, server_model, host, port)
|
|
117
|
+
RuntimeCommand.new(
|
|
118
|
+
runtime: runtime,
|
|
119
|
+
server_model: server_model,
|
|
120
|
+
host: host,
|
|
121
|
+
port: port,
|
|
122
|
+
server_flags: server_flags(server_model)
|
|
123
|
+
)
|
|
124
|
+
end
|
|
125
|
+
|
|
126
|
+
def server_flags(server_model)
|
|
127
|
+
flags = server_model.settings.fetch(:server_flags)
|
|
128
|
+
return flags unless server_model.capability == :reranker
|
|
129
|
+
|
|
130
|
+
with_ubatch_size(flags, config.reranker_ubatch_size)
|
|
131
|
+
end
|
|
132
|
+
|
|
133
|
+
def with_ubatch_size(flags, size)
|
|
134
|
+
filtered = []
|
|
135
|
+
skip_next = false
|
|
136
|
+
flags.each do |flag|
|
|
137
|
+
if skip_next
|
|
138
|
+
skip_next = false
|
|
139
|
+
next
|
|
140
|
+
end
|
|
141
|
+
|
|
142
|
+
if ["--ubatch-size", "-ub"].include?(flag)
|
|
143
|
+
skip_next = true
|
|
144
|
+
next
|
|
145
|
+
end
|
|
146
|
+
|
|
147
|
+
filtered << flag
|
|
148
|
+
end
|
|
149
|
+
filtered + ["--ubatch-size", size.to_s]
|
|
150
|
+
end
|
|
151
|
+
|
|
103
152
|
def required_port(host, port)
|
|
104
153
|
return port if port_available?(host, port)
|
|
105
154
|
|
|
@@ -213,6 +262,21 @@ module EmbeddingUtil
|
|
|
213
262
|
end
|
|
214
263
|
end
|
|
215
264
|
|
|
265
|
+
def stop_server(server_model)
|
|
266
|
+
state = read_state(server_model)
|
|
267
|
+
return delete_state(server_model) unless state
|
|
268
|
+
|
|
269
|
+
runtime = state.fetch("runtime", config.runtime)
|
|
270
|
+
port = state.fetch("port", server_model.default_port(config))
|
|
271
|
+
command = runtime_command(runtime, server_model, config.host, port)
|
|
272
|
+
if command.detached_server?
|
|
273
|
+
stop_detached_server(command)
|
|
274
|
+
else
|
|
275
|
+
terminate_runtime_process(command, state["pid"])
|
|
276
|
+
end
|
|
277
|
+
delete_state(server_model)
|
|
278
|
+
end
|
|
279
|
+
|
|
216
280
|
def cleanup_runtime(command, wait_thread)
|
|
217
281
|
return unless command
|
|
218
282
|
|