embedding_util 0.1.4 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/lib/embedding_util/providers/self_hosted.rb +18 -1
- data/lib/embedding_util/runtime_command.rb +9 -1
- data/lib/embedding_util/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 5042d545b7971013889232394972d9f444de89cc14f459c2b852b60049f08741
|
|
4
|
+
data.tar.gz: 305c3b11b11ae626ee14e04950052c02900c502ff6e477616e1641ee01aea0c1
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 0d960182799ed19de3510589def9203bdb0b3caa3b0fd477eb46cd2b8077705c1e55304021a9c5473199316d1750047ef42cb03a43dbbde0a38fef13b8088c6b
|
|
7
|
+
data.tar.gz: 5b7438b8c8f163723728e16dfcbe90c8b46025d18f93831890a5f6d6b3484e554bd7bd8ad86468a3f5179a1809ec6731b2446dd1e929f2780a7aeb1bb035ba1d
|
data/CHANGELOG.md
CHANGED
|
@@ -1,3 +1,9 @@
|
|
|
1
|
+
## [0.1.5] - 2026-06-10
|
|
2
|
+
|
|
3
|
+
- Add lean Ramalama defaults with bounded `--ctx-size 4096` and disabled llama.cpp prompt cache via `--cache-ram 0`
|
|
4
|
+
- Split self-hosted embedding arrays into smaller endpoint batches to keep local document indexing within bounded context
|
|
5
|
+
- Preserve embedding output order across self-hosted batches
|
|
6
|
+
|
|
1
7
|
## [0.1.4] - 2026-06-10
|
|
2
8
|
|
|
3
9
|
- Stabilize managed Ramalama reranker startup, restart, and idle cleanup
|
data/lib/embedding_util/providers/self_hosted.rb
CHANGED
|
@@ -7,6 +7,8 @@ require_relative "endpoint"
|
|
|
7
7
|
module EmbeddingUtil
|
|
8
8
|
module Providers
|
|
9
9
|
class SelfHosted < Provider
|
|
10
|
+
EMBEDDING_BATCH_SIZE = 32
|
|
11
|
+
|
|
10
12
|
def supported?
|
|
11
13
|
ServerManager.supported?(config)
|
|
12
14
|
end
|
|
@@ -25,7 +27,7 @@ module EmbeddingUtil
|
|
|
25
27
|
manager = ServerManager.new(config: config)
|
|
26
28
|
endpoint = manager.ensure_server(:embedding, profile: profile)
|
|
27
29
|
manager.track_activity(:embedding, profile: profile) do
|
|
28
|
-
|
|
30
|
+
embed_batches(endpoint, texts, profile)
|
|
29
31
|
end
|
|
30
32
|
end
|
|
31
33
|
|
|
@@ -50,6 +52,21 @@ module EmbeddingUtil
|
|
|
50
52
|
Endpoint.new(config: endpoint_config)
|
|
51
53
|
end
|
|
52
54
|
|
|
55
|
+
def embed_batches(endpoint, texts, profile)
|
|
56
|
+
results = texts.each_slice(EMBEDDING_BATCH_SIZE).map do |batch|
|
|
57
|
+
endpoint_provider(embedding_endpoint: endpoint).embed(batch, profile: profile)
|
|
58
|
+
end
|
|
59
|
+
return results.fetch(0) if results.size == 1
|
|
60
|
+
|
|
61
|
+
EmbeddingResult.new(
|
|
62
|
+
embedding: results.flat_map(&:embedding),
|
|
63
|
+
model: results.fetch(0).model,
|
|
64
|
+
profile: profile.name,
|
|
65
|
+
provider: provider_name,
|
|
66
|
+
metadata: { batches: results.size }
|
|
67
|
+
)
|
|
68
|
+
end
|
|
69
|
+
|
|
53
70
|
def rerank_with_activity(manager, endpoint, query, documents, profile)
|
|
54
71
|
manager.track_activity(:reranker, profile: profile) do
|
|
55
72
|
endpoint_provider(reranker_endpoint: endpoint).rerank(query, documents, profile: profile)
|
data/lib/embedding_util/runtime_command.rb
CHANGED
|
@@ -2,6 +2,9 @@
|
|
|
2
2
|
|
|
3
3
|
module EmbeddingUtil
|
|
4
4
|
class RuntimeCommand
|
|
5
|
+
RAMALAMA_CONTEXT_SIZE = "4096"
|
|
6
|
+
RAMALAMA_RUNTIME_FLAGS = ["--cache-ram", "0"].freeze
|
|
7
|
+
|
|
5
8
|
attr_reader :runtime, :server_model, :host, :port, :server_flags, :ramalama_device
|
|
6
9
|
|
|
7
10
|
def initialize(runtime:, server_model:, host:, port:, **options)
|
|
@@ -87,13 +90,18 @@ module EmbeddingUtil
|
|
|
87
90
|
"ramalama", "--runtime=llama.cpp", "serve",
|
|
88
91
|
"--name", server_name,
|
|
89
92
|
*ramalama_device_args,
|
|
93
|
+
"--ctx-size", RAMALAMA_CONTEXT_SIZE,
|
|
90
94
|
"--host", host,
|
|
91
95
|
"--port", port.to_s,
|
|
92
|
-
"--runtime-args=#{server_flags.join(' ')}",
|
|
96
|
+
"--runtime-args=#{ramalama_runtime_flags.join(' ')}",
|
|
93
97
|
huggingface_model
|
|
94
98
|
]
|
|
95
99
|
end
|
|
96
100
|
|
|
101
|
+
def ramalama_runtime_flags
|
|
102
|
+
server_flags + RAMALAMA_RUNTIME_FLAGS
|
|
103
|
+
end
|
|
104
|
+
|
|
97
105
|
def llama_server_argv
|
|
98
106
|
[
|
|
99
107
|
"llama-server",
|