RubyGems - embedding_util - Versions diffs - 0.1.3 → 0.1.4 - Mend

embedding_util 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10)

checksums.yaml +4 -4
data/CHANGELOG.md +8 -0
data/README.md +6 -2
data/lib/embedding_util/cli.rb +2 -0
data/lib/embedding_util/configuration.rb +2 -1
data/lib/embedding_util/providers/self_hosted.rb +22 -5
data/lib/embedding_util/runtime_command.rb +9 -3
data/lib/embedding_util/server_manager.rb +96 -19
data/lib/embedding_util/version.rb +1 -1
metadata +1 -1

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: d7fe54bc241ead7c4f0aab2f252dfedff61627a1e6cbc3814e02096e455849bb
-  data.tar.gz: 540daea5a8b3ac13123b879f4644622bda81697981a4270867510be752bcb3c8
+  metadata.gz: 25117f1c8eb2df3a23d26b45a76df8f850af2e96a9dc7c5ecfc9f2820a8cd913
+  data.tar.gz: a1d173a4933740d65cadbc6dad877cf605859029a0bd351c3123a7899dd49fe3
 SHA512:
-  metadata.gz: 8fd953d5fe00539c084aed32136ab17c3645a401ef768778baa2534b3125f8fcd57771839b11e1d1890896adbc60850bfb82e74c3b07708df04f9efff82eb5b5
-  data.tar.gz: 54d4812b7425a12ee64841e43604375267ef25ca1f6d0f9a661fd1e1e4d710fc8208904b9b836a233c10820be624ec248df063ab98cfb81709f8b6c8efd88693
+  metadata.gz: 5b61c1d0e518af3aa80acbb19db28314412ff24611681dc61ce1a8b2d1a989da0bbe181e73c488ac0ab409d2ab579b598e2ca41b23871667ed19faec59e6e9a1
+  data.tar.gz: 9eb004c7a36be9a82638bd7883d865219f6f09dfe443d016bd73267c8124cac7f41d9f94960d86ea34d02828f7c186b49d5220a00808f0eec1195ff7c40769de

data/CHANGELOG.md CHANGED Viewed

@@ -1,3 +1,11 @@
+## [0.1.4] - 2026-06-10
+- Stabilize managed Ramalama reranker startup, restart, and idle cleanup
+- Add request-aware idle tracking so active embedding/reranking requests are not stopped during quiet server output
+- Add `ramalama_device` configuration, CLI, and environment support for hosts that need `--device none`
+- Retry managed reranker EOF/connection-drop failures once after escalating batch settings
+- Verify stability through repeated `index_util/examples/ruby_api` queries
 ## [0.1.3] - 2026-06-10
 - Set self-hosted reranker `--batch-size` and `--ubatch-size` together

data/README.md CHANGED Viewed

@@ -67,7 +67,7 @@ embedding_util serve --model embedding-small_multilingual_v1
 embedding_util serve --model reranker-small_multilingual_v1
 ```
-`serve` starts one model server per command and runs until stopped. Add `--shutdown-idle SECONDS` only when you want that manually managed server to stop itself after idle output; omit it, set it to `nil`, or pass `0` to disable idle shutdown.
+`serve` starts one model server per command and runs until stopped. Add `--shutdown-idle SECONDS` only when you want that manually managed server to stop itself after it is idle; omit it, set it to `nil`, or pass `0` to disable idle shutdown.
 ## CLI
@@ -86,12 +86,14 @@ embedding_util rerank \
 `embed` prints a JSON array. `rerank` prints JSON objects with `index`, `document`, `score`, and `metadata`.
-`serve` starts one local model server. The default model is `embedding-small_multilingual_v1`; use `reranker-small_multilingual_v1` for the reranker server. By default, `serve` uses Ramalama when available and falls back to direct `llama-server`. It runs until stopped unless a positive `--shutdown-idle` value is provided.
+`serve` starts one local model server. The default model is `embedding-small_multilingual_v1`; use `reranker-small_multilingual_v1` for the reranker server. By default, `serve` uses Ramalama when available and falls back to direct `llama-server`. It runs until stopped unless a positive `--shutdown-idle` value is provided. Idle shutdown is request-aware for `embedding_util`-managed calls, so long-running embedding or reranking requests are not stopped just because the model server is temporarily quiet.
 Explicit `serve --port PORT` requires that exact port to be free. Without `--port`, `serve` prefers the profile default port and chooses the next free local port if needed.
 Use `--verbose` on `embed` or `rerank` to print self-hosting diagnostics, including the background `serve` command and log path. First-time model downloads are expected to work with the default startup timeout; use `--startup-timeout` only when you explicitly want to shorten or extend that wait.
+If Ramalama's automatic device passthrough does not work on a host, pass `--ramalama-device none` or set `EMBEDDING_UTIL_RAMALAMA_DEVICE=none` to force CPU/container-only serving.
 ## API
 - `EmbeddingUtil.embed(text)` returns one embedding array.
@@ -137,6 +139,7 @@ EmbeddingUtil.configure do |config|
   config.shutdown_idle = 300
   config.reranker_ubatch_size = 1024
   config.reranker_max_ubatch_size = 4096
+  config.ramalama_device = nil
   config.timeout = 60
 end
 ```
@@ -165,6 +168,7 @@ Environment variables are also supported:
 - `EMBEDDING_UTIL_RERANKER_PORT`
 - `EMBEDDING_UTIL_RERANKER_UBATCH_SIZE`
 - `EMBEDDING_UTIL_RERANKER_MAX_UBATCH_SIZE`
+- `EMBEDDING_UTIL_RAMALAMA_DEVICE`
 ## Development

data/lib/embedding_util/cli.rb CHANGED Viewed

@@ -16,6 +16,7 @@ module EmbeddingUtil
       shutdown_idle: :to_i.to_proc,
       reranker_ubatch_size: :to_i.to_proc,
       reranker_max_ubatch_size: :to_i.to_proc,
+      ramalama_device: ->(value) { value },
       verbose: ->(value) { value }
     }.freeze
@@ -29,6 +30,7 @@ module EmbeddingUtil
     class_option :shutdown_idle, type: :numeric, desc: "Stop self-hosted server after this many seconds without stdout/stderr activity"
     class_option :reranker_ubatch_size, type: :numeric, desc: "llama.cpp physical batch size for self-hosted reranker servers"
     class_option :reranker_max_ubatch_size, type: :numeric, desc: "Largest reranker physical batch size for automatic retry"
+    class_option :ramalama_device, type: :string, desc: "Ramalama device option, for example none"
     class_option :verbose, type: :boolean, desc: "Print self-hosting diagnostics"
     desc "support", "Display configured provider support"

data/lib/embedding_util/configuration.rb CHANGED Viewed

@@ -3,7 +3,7 @@
 module EmbeddingUtil
   class Configuration
     attr_accessor :profile, :provider, :endpoint, :embedding_endpoint, :reranker_endpoint, :timeout, :startup_timeout, :shutdown_idle, :host,
-                  :embedding_port, :reranker_port, :state_dir, :verbose, :reranker_ubatch_size, :reranker_max_ubatch_size
+                  :embedding_port, :reranker_port, :state_dir, :verbose, :reranker_ubatch_size, :reranker_max_ubatch_size, :ramalama_device
     attr_reader :runtime
     def initialize
@@ -21,6 +21,7 @@ module EmbeddingUtil
       @reranker_port = Integer(ENV.fetch("EMBEDDING_UTIL_RERANKER_PORT", "18081"))
       @reranker_ubatch_size = Integer(ENV.fetch("EMBEDDING_UTIL_RERANKER_UBATCH_SIZE", "1024"))
       @reranker_max_ubatch_size = Integer(ENV.fetch("EMBEDDING_UTIL_RERANKER_MAX_UBATCH_SIZE", "4096"))
+      @ramalama_device = ENV["EMBEDDING_UTIL_RAMALAMA_DEVICE"]
       @state_dir = ENV.fetch("EMBEDDING_UTIL_STATE_DIR", File.expand_path("~/.local/state/embedding_util"))
       @verbose = ENV.fetch("EMBEDDING_UTIL_VERBOSE", "false").match?(/\A(?:1|true|yes|on)\z/i)
     end

data/lib/embedding_util/providers/self_hosted.rb CHANGED Viewed

@@ -22,20 +22,23 @@ module EmbeddingUtil
       end
       def embed(texts, profile: config.resolved_profile)
-        endpoint = ServerManager.new(config: config).ensure_server(:embedding, profile: profile)
-        endpoint_provider(embedding_endpoint: endpoint).embed(texts, profile: profile)
+        manager = ServerManager.new(config: config)
+        endpoint = manager.ensure_server(:embedding, profile: profile)
+        manager.track_activity(:embedding, profile: profile) do
+          endpoint_provider(embedding_endpoint: endpoint).embed(texts, profile: profile)
+        end
       end
       def rerank(query, documents, profile: config.resolved_profile)
         manager = ServerManager.new(config: config)
         endpoint = manager.ensure_server(:reranker, profile: profile)
-        endpoint_provider(reranker_endpoint: endpoint).rerank(query, documents, profile: profile)
+        rerank_with_activity(manager, endpoint, query, documents, profile)
       rescue EndpointError => e
-        raise unless reranker_batch_size_error?(e) && can_escalate_reranker_ubatch?
+        raise unless retryable_reranker_error?(e) && can_escalate_reranker_ubatch?
         config.reranker_ubatch_size = config.reranker_max_ubatch_size
         endpoint = manager.restart_server(:reranker, profile: profile)
-        endpoint_provider(reranker_endpoint: endpoint).rerank(query, documents, profile: profile)
+        rerank_with_activity(manager, endpoint, query, documents, profile)
       end
       private
@@ -47,10 +50,24 @@ module EmbeddingUtil
         Endpoint.new(config: endpoint_config)
       end
+      def rerank_with_activity(manager, endpoint, query, documents, profile)
+        manager.track_activity(:reranker, profile: profile) do
+          endpoint_provider(reranker_endpoint: endpoint).rerank(query, documents, profile: profile)
+        end
+      end
       def reranker_batch_size_error?(error)
         error.message.include?("increase the physical batch size")
       end
+      def retryable_reranker_error?(error)
+        reranker_batch_size_error?(error) || reranker_connection_dropped?(error)
+      end
+      def reranker_connection_dropped?(error)
+        error.message.match?(%r{could not reach http://[^ ]+/v1/rerank: (?:end of file reached|Connection reset|connection reset|stream closed)})
+      end
       def can_escalate_reranker_ubatch?
         config.reranker_ubatch_size < config.reranker_max_ubatch_size
       end

data/lib/embedding_util/runtime_command.rb CHANGED Viewed

@@ -2,14 +2,15 @@
 module EmbeddingUtil
   class RuntimeCommand
-    attr_reader :runtime, :server_model, :host, :port, :server_flags
+    attr_reader :runtime, :server_model, :host, :port, :server_flags, :ramalama_device
-    def initialize(runtime:, server_model:, host:, port:, server_flags: nil)
+    def initialize(runtime:, server_model:, host:, port:, **options)
       @runtime = self.class.normalize_runtime(runtime)
       @server_model = server_model
       @host = host
       @port = port
-      @server_flags = server_flags || server_model.settings.fetch(:server_flags)
+      @server_flags = options[:server_flags] || server_model.settings.fetch(:server_flags)
+      @ramalama_device = options[:ramalama_device]
     end
     def self.available?(runtime)
@@ -85,6 +86,7 @@ module EmbeddingUtil
       [
         "ramalama", "--runtime=llama.cpp", "serve",
         "--name", server_name,
+        *ramalama_device_args,
         "--host", host,
         "--port", port.to_s,
         "--runtime-args=#{server_flags.join(' ')}",
@@ -106,5 +108,9 @@ module EmbeddingUtil
     def huggingface_model
       "hf://#{server_model.settings.fetch(:repo)}/#{server_model.settings.fetch(:file)}"
     end
+    def ramalama_device_args
+      ramalama_device.to_s.empty? ? [] : ["--device", ramalama_device.to_s]
+    end
   end
 end

data/lib/embedding_util/server_manager.rb CHANGED Viewed

@@ -11,6 +11,8 @@ require "uri"
 module EmbeddingUtil
   class ServerManager
+    STOP_TIMEOUT = 30
     attr_reader :config
     def initialize(config: EmbeddingUtil.configuration)
@@ -47,10 +49,10 @@ module EmbeddingUtil
       previous_traps = install_interrupt_traps
       Open3.popen2e(*command.argv) do |_stdin, output, wait_thread|
         url = "http://#{host}:#{selected_port}"
-        write_state(server_model, pid: wait_thread.pid, url: url, runtime: command.label, port: selected_port)
+        write_state(server_model, pid: state_pid(command, wait_thread), url: url, runtime: command.label, port: selected_port)
         last_output_at_mutex = Mutex.new
         reader = stream_output(output) { last_output_at_mutex.synchronize { last_output_at = Time.now } }
-        wait_for_runtime_serving(command, server_model, url, wait_thread.pid)
+        wait_for_runtime_serving(command, server_model, url, wait_thread)
         supervise_runtime(command, wait_thread, shutdown_idle) { last_output_at_mutex.synchronize { last_output_at } }
       ensure
         cleanup_runtime(command, wait_thread)
@@ -65,13 +67,22 @@ module EmbeddingUtil
       server_model = ServerModel.for(capability, profile)
       with_lock(server_model) do
-        stop_server(server_model)
+        stopped_url = stop_server(server_model)
+        wait_for_stopped(server_model, stopped_url)
         start_background(server_model)
       end
       wait_for_healthy(server_model, log_path: server_log_path(server_model))
     end
+    def track_activity(capability, profile: config.resolved_profile)
+      server_model = ServerModel.for(capability, profile)
+      update_activity(server_model, 1)
+      yield
+    ensure
+      update_activity(server_model, -1) if server_model
+    end
     private
     def start_background(server_model)
@@ -88,6 +99,7 @@ module EmbeddingUtil
       argv.push("--shutdown-idle", config.shutdown_idle.to_s) unless config.shutdown_idle.nil?
       argv.push("--reranker-ubatch-size", config.reranker_ubatch_size.to_s)
       argv.push("--reranker-max-ubatch-size", config.reranker_max_ubatch_size.to_s)
+      argv.push("--ramalama-device", config.ramalama_device.to_s) unless config.ramalama_device.to_s.empty?
       warn "starting #{server_model.name} in background: #{argv.join(' ')}" if config.verbose
       warn "#{server_model.name} log: #{log_path}" if config.verbose
       pid = Process.spawn(*argv, out: [log_path, "a"], err: %i[child out], pgroup: true)
@@ -119,7 +131,8 @@ module EmbeddingUtil
         server_model: server_model,
         host: host,
         port: port,
-        server_flags: server_flags(server_model)
+        server_flags: server_flags(server_model),
+        ramalama_device: config.ramalama_device
       )
     end
@@ -210,12 +223,16 @@ module EmbeddingUtil
       end
     end
-    def wait_for_runtime_serving(command, server_model, url, pid)
+    def wait_for_runtime_serving(command, server_model, url, wait_thread)
       warn "waiting for #{server_model.name} at #{url}" if config.verbose
-      wait_for_serving(server_model, url, pid, check_process: !command.detached_server?)
+      wait_for_serving(server_model, url, wait_thread.pid, wait_thread: wait_thread, check_process: !command.detached_server?)
       warn "#{server_model.name} is healthy" if config.verbose
     end
+    def state_pid(command, wait_thread)
+      command.detached_server? ? Process.pid : wait_thread.pid
+    end
     def supervise_runtime(command, wait_thread, shutdown_idle, &last_output_at)
       warn "supervising #{command.server_name}" if config.verbose && command.detached_server?
       return supervise_detached_server(command, shutdown_idle, &last_output_at) if command.detached_server?
@@ -226,10 +243,11 @@ module EmbeddingUtil
       watchdog&.kill
     end
-    def wait_for_serving(server_model, url, pid, check_process: true)
+    def wait_for_serving(server_model, url, pid, wait_thread: nil, check_process: true)
       deadline = Time.now + config.startup_timeout
       loop do
         return if healthy_url?(url)
+        raise UnsupportedProviderError, "#{server_model.name} runtime launcher exited before server became healthy" if launcher_failed?(wait_thread)
         raise UnsupportedProviderError, "#{server_model.name} server process exited before becoming healthy" if check_process && !process_running?(pid)
         raise UnsupportedProviderError, "timed out after #{config.startup_timeout}s waiting for #{server_model.name} to become healthy" if Time.now >= deadline
@@ -237,9 +255,15 @@ module EmbeddingUtil
       end
     end
+    def launcher_failed?(wait_thread)
+      return false unless wait_thread && !wait_thread.alive?
+      !wait_thread.value.success?
+    end
     def supervise_detached_server(command, shutdown_idle)
       loop do
-        if idle_expired?(shutdown_idle, yield)
+        if idle_expired?(shutdown_idle, command.server_model, yield)
           warn "stopping #{command.server_name} after #{shutdown_idle}s idle" if config.verbose
           stop_detached_server(command)
           return 0
@@ -252,8 +276,29 @@ module EmbeddingUtil
       130
     end
-    def idle_expired?(shutdown_idle, last_output_at)
-      shutdown_idle&.positive? && Time.now - last_output_at >= shutdown_idle
+    def idle_expired?(shutdown_idle, server_model, last_output_at)
+      return false unless shutdown_idle&.positive?
+      activity = activity_state(server_model, last_output_at)
+      activity.fetch(:active_requests).zero? && Time.now - activity.fetch(:last_activity_at) >= shutdown_idle
+    end
+    def activity_state(server_model, fallback_time)
+      state = read_state(server_model)
+      last_activity_at = parse_state_time(state&.fetch("last_activity_at", nil)) || fallback_time
+      last_output_at = [fallback_time, last_activity_at].max
+      {
+        active_requests: Integer(state&.fetch("active_requests", 0) || 0),
+        last_activity_at: last_output_at
+      }
+    rescue ArgumentError
+      { active_requests: 0, last_activity_at: fallback_time }
+    end
+    def parse_state_time(value)
+      Time.iso8601(value) if value
+    rescue ArgumentError
+      nil
     end
     def stop_detached_server(command)
@@ -268,13 +313,28 @@ module EmbeddingUtil
       runtime = state.fetch("runtime", config.runtime)
       port = state.fetch("port", server_model.default_port(config))
+      url = state["url"]
       command = runtime_command(runtime, server_model, config.host, port)
       if command.detached_server?
         stop_detached_server(command)
       else
         terminate_runtime_process(command, state["pid"])
+        stop_detached_server(runtime_command(:ramalama, server_model, config.host, port))
       end
       delete_state(server_model)
+      url
+    end
+    def wait_for_stopped(server_model, url)
+      return unless url
+      deadline = Time.now + STOP_TIMEOUT
+      loop do
+        return unless healthy_url?(url)
+        raise UnsupportedProviderError, "#{server_model.name} did not stop before restart" if Time.now >= deadline
+        sleep 0.25
+      end
     end
     def cleanup_runtime(command, wait_thread)
@@ -382,15 +442,32 @@ module EmbeddingUtil
     end
     def write_state(server_model, pid:, url:, runtime:, port:)
-      File.write(state_path(server_model), JSON.pretty_generate({
-                                                                  pid: pid,
-                                                                  url: url,
-                                                                  profile: server_model.profile.name,
-                                                                  capability: server_model.capability,
-                                                                  runtime: runtime,
-                                                                  port: port,
-                                                                  updated_at: Time.now.utc.iso8601
-                                                                }))
+      state = {
+        pid: pid,
+        url: url,
+        profile: server_model.profile.name,
+        capability: server_model.capability,
+        runtime: runtime,
+        port: port,
+        active_requests: 0,
+        last_activity_at: Time.now.utc.iso8601,
+        updated_at: Time.now.utc.iso8601
+      }
+      File.write(state_path(server_model), JSON.pretty_generate(state))
+    end
+    def update_activity(server_model, delta)
+      with_lock(server_model) do
+        state = read_state(server_model)
+        next unless state
+        state["active_requests"] = [Integer(state.fetch("active_requests", 0)) + delta, 0].max
+        state["last_activity_at"] = Time.now.utc.iso8601
+        state["updated_at"] = Time.now.utc.iso8601
+        File.write(state_path(server_model), JSON.pretty_generate(state))
+      end
+    rescue ArgumentError
+      nil
     end
     def read_state(server_model)

data/lib/embedding_util/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module EmbeddingUtil
-  VERSION = "0.1.3"
+  VERSION = "0.1.4"
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: embedding_util
 version: !ruby/object:Gem::Version
-  version: 0.1.3
+  version: 0.1.4
 platform: ruby
 authors:
 - hmdne