embedding_util 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f388bd90069b46caca18046622511f7fe2bd5c25f2a56a8d02fc01cc673bc682
4
- data.tar.gz: 75b484781ae6689e6bf471007257c1e63f4e8aa2485b558e0d41b2f03643749f
3
+ metadata.gz: fb8276720e51a283fb4b8c70507ae54d25ea968caffe8142f636a6e6ea948b71
4
+ data.tar.gz: eccb6e6d11006238a1af1bfc25fb3b5146325bf78a1deb0f447a73541dec128d
5
5
  SHA512:
6
- metadata.gz: 4bd65f54bc228373056843d55aa866d510660361e294bff33b18ec8440f02ae807f4bf5773e6250eb2a5152e53d3656dadef0f624b50f0c1ec32fab689f78367
7
- data.tar.gz: 9f51238b20d2aabfda68aace11f276e79cbdeb291468bd888933330869a5717c52f33db2ceb7a46b870a8f138a1b9fcbdbc94e4e5108f529ba7d4aa5ab299e6a
6
+ metadata.gz: e625a2389828218a8f39cc4dc69d6ea965f59e032d190407668029df227f3c7ffee32ce30b64141d8a38654573ac591315c08696f051db3f68772efaf1b7bf34
7
+ data.tar.gz: 5da07a5487e7167128b67d8b6e39009d6a2fd4c0949daf1e61ae2cb9e98fa668beecd618713b1d0545daa47a86ac6f4e77f66d3429093c658852b594b4aa3cc5
data/CHANGELOG.md CHANGED
@@ -1,3 +1,11 @@
1
+ ## [0.1.2] - 2026-06-10
2
+
3
+ - Add self-hosted reranker recovery for llama.cpp physical batch-size failures
4
+ - Start managed reranker servers with `--ubatch-size 1024`
5
+ - Restart managed reranker servers once with `--ubatch-size 4096` when larger rerank requests require it
6
+ - Add configuration and CLI options for reranker ubatch defaults and maximums
7
+ - Add clearer guidance for app-managed reranker endpoints that need a larger `--ubatch-size`
8
+
1
9
  ## [0.1.1] - 2026-06-08
2
10
 
3
11
  - Fix local server lifecycle cleanup for Ramalama and direct `llama-server`
data/README.md CHANGED
@@ -118,7 +118,9 @@ Reranker model:
118
118
 
119
119
  - repo: `ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF`
120
120
  - file: `qwen3-reranker-0.6b-q8_0.gguf`
121
- - server flags: `--reranking`
121
+ - server flags: `--reranking --ubatch-size 1024`
122
+
123
+ For self-hosted rerankers, `embedding_util` starts with `--ubatch-size 1024`. If a larger rerank request exceeds llama.cpp's physical batch size, the managed reranker is restarted once with `config.reranker_max_ubatch_size`, which defaults to `4096`, and the request is retried.
122
124
 
123
125
  Do not combine embedding and reranking flags for this profile. Run separate local servers.
124
126
 
@@ -133,6 +135,8 @@ EmbeddingUtil.configure do |config|
133
135
  config.reranker_port = 18081
134
136
  config.startup_timeout = 3600
135
137
  config.shutdown_idle = 300
138
+ config.reranker_ubatch_size = 1024
139
+ config.reranker_max_ubatch_size = 4096
136
140
  config.timeout = 60
137
141
  end
138
142
  ```
@@ -159,6 +163,8 @@ Environment variables are also supported:
159
163
  - `EMBEDDING_UTIL_VERBOSE`
160
164
  - `EMBEDDING_UTIL_EMBEDDING_PORT`
161
165
  - `EMBEDDING_UTIL_RERANKER_PORT`
166
+ - `EMBEDDING_UTIL_RERANKER_UBATCH_SIZE`
167
+ - `EMBEDDING_UTIL_RERANKER_MAX_UBATCH_SIZE`
162
168
 
163
169
  ## Development
164
170
 
@@ -14,6 +14,8 @@ module EmbeddingUtil
14
14
  timeout: ->(value) { value },
15
15
  startup_timeout: ->(value) { value },
16
16
  shutdown_idle: :to_i.to_proc,
17
+ reranker_ubatch_size: :to_i.to_proc,
18
+ reranker_max_ubatch_size: :to_i.to_proc,
17
19
  verbose: ->(value) { value }
18
20
  }.freeze
19
21
 
@@ -25,6 +27,8 @@ module EmbeddingUtil
25
27
  class_option :timeout, type: :numeric, desc: "HTTP timeout in seconds"
26
28
  class_option :startup_timeout, type: :numeric, desc: "Seconds to wait for self-hosted server startup"
27
29
  class_option :shutdown_idle, type: :numeric, desc: "Stop self-hosted server after this many seconds without stdout/stderr activity"
30
+ class_option :reranker_ubatch_size, type: :numeric, desc: "llama.cpp physical batch size for self-hosted reranker servers"
31
+ class_option :reranker_max_ubatch_size, type: :numeric, desc: "Largest reranker physical batch size for automatic retry"
28
32
  class_option :verbose, type: :boolean, desc: "Print self-hosting diagnostics"
29
33
 
30
34
  desc "support", "Display configured provider support"
@@ -3,7 +3,7 @@
3
3
  module EmbeddingUtil
4
4
  class Configuration
5
5
  attr_accessor :profile, :provider, :endpoint, :embedding_endpoint, :reranker_endpoint, :timeout, :startup_timeout, :shutdown_idle, :host,
6
- :embedding_port, :reranker_port, :state_dir, :verbose
6
+ :embedding_port, :reranker_port, :state_dir, :verbose, :reranker_ubatch_size, :reranker_max_ubatch_size
7
7
  attr_reader :runtime
8
8
 
9
9
  def initialize
@@ -19,6 +19,8 @@ module EmbeddingUtil
19
19
  @host = ENV.fetch("EMBEDDING_UTIL_HOST", "127.0.0.1")
20
20
  @embedding_port = Integer(ENV.fetch("EMBEDDING_UTIL_EMBEDDING_PORT", "18080"))
21
21
  @reranker_port = Integer(ENV.fetch("EMBEDDING_UTIL_RERANKER_PORT", "18081"))
22
+ @reranker_ubatch_size = Integer(ENV.fetch("EMBEDDING_UTIL_RERANKER_UBATCH_SIZE", "1024"))
23
+ @reranker_max_ubatch_size = Integer(ENV.fetch("EMBEDDING_UTIL_RERANKER_MAX_UBATCH_SIZE", "4096"))
22
24
  @state_dir = ENV.fetch("EMBEDDING_UTIL_STATE_DIR", File.expand_path("~/.local/state/embedding_util"))
23
25
  @verbose = ENV.fetch("EMBEDDING_UTIL_VERBOSE", "false").match?(/\A(?:1|true|yes|on)\z/i)
24
26
  end
@@ -19,7 +19,7 @@ module EmbeddingUtil
19
19
  repo: "ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF",
20
20
  file: "qwen3-reranker-0.6b-q8_0.gguf",
21
21
  model: "qwen3-reranker-0.6b",
22
- server_flags: ["--reranking"]
22
+ server_flags: ["--reranking", "--ubatch-size", "1024"]
23
23
  }
24
24
  )
25
25
 
@@ -118,7 +118,7 @@ module EmbeddingUtil
118
118
  end
119
119
 
120
120
  raise EndpointNotFoundError.new(uri, path: path, body: response.body) if response.code.to_i == 404 && route_missing_response?(response.body)
121
- raise EndpointError, "#{uri} returned #{response.code}: #{response.body}" unless response.is_a?(Net::HTTPSuccess)
121
+ raise EndpointError, endpoint_error_message(uri, response, path) unless response.is_a?(Net::HTTPSuccess)
122
122
 
123
123
  JSON.parse(response.body)
124
124
  rescue JSON::ParserError => e
@@ -148,6 +148,18 @@ module EmbeddingUtil
148
148
  def fallback_rerank_not_found?(error)
149
149
  error.path == "/v1/rerank"
150
150
  end
151
+
152
# Builds the text used when an endpoint answers with a non-success response.
#
# The base text is "<uri> returned <code>: <body>". When
# reranker_batch_size_error? recognises the path/body pair, a hint about
# restarting the reranker with a larger llama.cpp --ubatch-size is appended.
#
# uri      - interpolated into the message as-is.
# response - object responding to #code and #body.
# path     - request path, forwarded to reranker_batch_size_error?.
#
# Returns the message String.
def endpoint_error_message(uri, response, path)
  summary = "#{uri} returned #{response.code}: #{response.body}"

  if reranker_batch_size_error?(path, response.body)
    "#{summary}. Restart the reranker server with a larger llama.cpp --ubatch-size; " \
      "embedding_util-managed reranker servers use --ubatch-size 1024 by default."
  else
    summary
  end
end
159
+
160
# Tells whether a failed response looks like a rerank batch-size failure.
#
# True only when +path+ ends with "/rerank" and the body, converted with
# #to_s, contains the phrase "increase the physical batch size".
def reranker_batch_size_error?(path, body)
  return false unless path.end_with?("/rerank")

  body.to_s.include?("increase the physical batch size")
end
151
163
  end
152
164
  end
153
165
  end
@@ -27,7 +27,14 @@ module EmbeddingUtil
27
27
  end
28
28
 
29
29
  def rerank(query, documents, profile: config.resolved_profile)
30
- endpoint = ServerManager.new(config: config).ensure_server(:reranker, profile: profile)
30
+ manager = ServerManager.new(config: config)
31
+ endpoint = manager.ensure_server(:reranker, profile: profile)
32
+ endpoint_provider(reranker_endpoint: endpoint).rerank(query, documents, profile: profile)
33
+ rescue EndpointError => e
34
+ raise unless reranker_batch_size_error?(e) && can_escalate_reranker_ubatch?
35
+
36
+ config.reranker_ubatch_size = config.reranker_max_ubatch_size
37
+ endpoint = manager.restart_server(:reranker, profile: profile)
31
38
  endpoint_provider(reranker_endpoint: endpoint).rerank(query, documents, profile: profile)
32
39
  end
33
40
 
@@ -39,6 +46,14 @@ module EmbeddingUtil
39
46
  endpoint_config.reranker_endpoint = reranker_endpoint
40
47
  Endpoint.new(config: endpoint_config)
41
48
  end
49
+
50
# True when the error's message contains the phrase
# "increase the physical batch size".
def reranker_batch_size_error?(error)
  reported = error.message
  reported.include?("increase the physical batch size")
end
53
+
54
# True while config.reranker_ubatch_size is still below
# config.reranker_max_ubatch_size.
def can_escalate_reranker_ubatch?
  current = config.reranker_ubatch_size
  ceiling = config.reranker_max_ubatch_size
  current < ceiling
end
42
57
  end
43
58
  end
44
59
  end
@@ -2,13 +2,14 @@
2
2
 
3
3
  module EmbeddingUtil
4
4
  class RuntimeCommand
5
- attr_reader :runtime, :server_model, :host, :port
5
+ attr_reader :runtime, :server_model, :host, :port, :server_flags
6
6
 
7
# Stores the normalized runtime, the server model, host and port.
#
# server_flags - optional flags to use instead of the model's own; when it is
#                nil or false the value of server_model.settings.fetch(:server_flags)
#                is used.
def initialize(runtime:, server_model:, host:, port:, server_flags: nil)
  @runtime = self.class.normalize_runtime(runtime)
  @server_model = server_model
  @host = host
  @port = port
  @server_flags = server_flags
  @server_flags ||= server_model.settings.fetch(:server_flags)
end
13
14
 
14
15
  def self.available?(runtime)
@@ -86,7 +87,7 @@ module EmbeddingUtil
86
87
  "--name", server_name,
87
88
  "--host", host,
88
89
  "--port", port.to_s,
89
- "--runtime-args=#{server_model.settings.fetch(:server_flags).join(' ')}",
90
+ "--runtime-args=#{server_flags.join(' ')}",
90
91
  huggingface_model
91
92
  ]
92
93
  end
@@ -98,7 +99,7 @@ module EmbeddingUtil
98
99
  "--port", port.to_s,
99
100
  "-hf", server_model.settings.fetch(:repo),
100
101
  "-hff", server_model.settings.fetch(:file),
101
- *server_model.settings.fetch(:server_flags)
102
+ *server_flags
102
103
  ]
103
104
  end
104
105
 
@@ -37,7 +37,7 @@ module EmbeddingUtil
37
37
  server_model = model.is_a?(ServerModel) ? model : ServerModel.parse(model)
38
38
  resolved_runtime = RuntimeCommand.resolve(runtime)
39
39
  selected_port = selected_port_for(server_model, host: host, port: port)
40
- command = RuntimeCommand.new(runtime: resolved_runtime, server_model: server_model, host: host, port: selected_port)
40
+ command = runtime_command(resolved_runtime, server_model, host, selected_port)
41
41
  last_output_at = Time.now
42
42
 
43
43
  FileUtils.mkdir_p(config.state_dir)
@@ -61,6 +61,17 @@ module EmbeddingUtil
61
61
  end
62
62
  end
63
63
 
64
# Stops and starts the server for +capability+ again, then waits for it.
#
# The stop/start pair runs inside with_lock for the resolved server model;
# the wait happens after the lock block has finished.
#
# Returns the result of wait_for_healthy.
def restart_server(capability, profile: config.resolved_profile)
  model = ServerModel.for(capability, profile)

  with_lock(model) do
    stop_server(model)
    start_background(model)
  end

  log_path = server_log_path(model)
  wait_for_healthy(model, log_path: log_path)
end
+
64
75
  private
65
76
 
66
77
  def start_background(server_model)
@@ -75,6 +86,8 @@ module EmbeddingUtil
75
86
  "--port", selected_port.to_s
76
87
  ]
77
88
  argv.push("--shutdown-idle", config.shutdown_idle.to_s) unless config.shutdown_idle.nil?
89
+ argv.push("--reranker-ubatch-size", config.reranker_ubatch_size.to_s)
90
+ argv.push("--reranker-max-ubatch-size", config.reranker_max_ubatch_size.to_s)
78
91
  warn "starting #{server_model.name} in background: #{argv.join(' ')}" if config.verbose
79
92
  warn "#{server_model.name} log: #{log_path}" if config.verbose
80
93
  pid = Process.spawn(*argv, out: [log_path, "a"], err: %i[child out], pgroup: true)
@@ -100,6 +113,42 @@ module EmbeddingUtil
100
113
  available_port(host, server_model.default_port(config))
101
114
  end
102
115
 
116
# Builds a RuntimeCommand for the given runtime, model, host and port, using
# the flags that server_flags computes for the model.
def runtime_command(runtime, server_model, host, port)
  flags = server_flags(server_model)
  RuntimeCommand.new(runtime: runtime, server_model: server_model, host: host, port: port, server_flags: flags)
end
125
+
126
# Returns the server flags to launch +server_model+ with.
#
# Models whose capability is not :reranker get their configured
# settings[:server_flags] untouched. For rerankers the flags are passed
# through with_ubatch_size together with config.reranker_ubatch_size.
def server_flags(server_model)
  configured = server_model.settings.fetch(:server_flags)

  if server_model.capability == :reranker
    with_ubatch_size(configured, config.reranker_ubatch_size)
  else
    configured
  end
end
132
+
133
# Returns a new flag list in which any existing ubatch option is dropped and
# "--ubatch-size <size>" is appended at the end.
#
# An element equal to "--ubatch-size" or "-ub" is removed together with the
# element that follows it, whatever that element is. +flags+ itself is not
# modified.
def with_ubatch_size(flags, size)
  ubatch_switches = ["--ubatch-size", "-ub"]
  kept = []
  position = 0

  # Walk by index so a switch can jump over its value in a single step.
  while position < flags.length
    current = flags[position]
    if ubatch_switches.include?(current)
      position += 2
    else
      kept << current
      position += 1
    end
  end

  kept + ["--ubatch-size", size.to_s]
end
151
+
103
152
  def required_port(host, port)
104
153
  return port if port_available?(host, port)
105
154
 
@@ -213,6 +262,21 @@ module EmbeddingUtil
213
262
  end
214
263
  end
215
264
 
265
# Stops the server recorded in the saved state for +server_model+, then
# deletes that state.
#
# When read_state returns nothing, only the state is deleted. Otherwise the
# runtime and port come from the state, falling back to config.runtime and
# server_model.default_port(config). A command built from them decides between
# stop_detached_server and terminate_runtime_process with the stored "pid".
#
# Returns the result of delete_state.
def stop_server(server_model)
  saved = read_state(server_model)

  if saved
    saved_runtime = saved.fetch("runtime", config.runtime)
    saved_port = saved.fetch("port", server_model.default_port(config))
    command = runtime_command(saved_runtime, server_model, config.host, saved_port)

    if command.detached_server?
      stop_detached_server(command)
    else
      terminate_runtime_process(command, saved["pid"])
    end
  end

  delete_state(server_model)
end
279
+
216
280
  def cleanup_runtime(command, wait_thread)
217
281
  return unless command
218
282
 
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module EmbeddingUtil
4
- VERSION = "0.1.1"
4
+ VERSION = "0.1.2"
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embedding_util
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - hmdne