llm_optimizer 0.1.5 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +26 -0
- data/README.md +1 -13
- data/lib/generators/llm_optimizer/templates/initializer.rb +18 -12
- data/lib/llm_optimizer/configuration.rb +8 -2
- data/lib/llm_optimizer/optimize_result.rb +39 -4
- data/lib/llm_optimizer/pipeline.rb +54 -52
- data/lib/llm_optimizer/semantic_cache.rb +21 -16
- data/lib/llm_optimizer/version.rb +1 -1
- data/lib/llm_optimizer.rb +5 -4
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 853ac31b04c7eec26e0976f8691c7c053305c6b10192ae33ece7c5cab93e71f2
+  data.tar.gz: 8ead9117a7fea7093166f43566932222946a682b3c1ace2268891b461e88cea0
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c9393955027ac023f82b3afa2e7604460b4526aa470549f1ca1d71cac7091544af5fc2b8edff54c98852c573b70edf50f3c4f074a266f86e8abc27d6d5d20281
+  data.tar.gz: 2bf9b96778fc0fdaec64f3cf86bab4346488a33cd083021a788a408d9d6c304ced23b2961ae1f8f63002739fd44010fec99d194607f46734262e5ffd540b7b4b
data/CHANGELOG.md
CHANGED
@@ -7,6 +7,32 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.1.7] - 2026-05-05
+
+### Added
+- `schema` configuration option for structured output support
+
+### Fixed
+- Prevented empty or nil LLM responses from being stored in the semantic cache
+- Fixed `NoMethodError` in the caching pipeline by replacing ActiveSupport-dependent `.blank?` with standard Ruby checks
+- Cleaned up duplicated keys in `LlmOptimizer::Configuration`
+
+## [0.1.6] - 2026-05-04
+
+### Added
+- `with_tools` configuration option (aliased as `tools`) — allows passing function/tool definitions to LLM calls via the `optimize` method
+- Tool support for both `llm_caller` and `messages_caller` — `tools:` keyword argument is now passed to all underlying LLM callers
+- `with_tools` examples in the README and Rails initializer template
+- `cache_scope` configuration option — isolates semantic cache entries into separate namespaces; useful for ensuring cache hits only occur within specific contexts (e.g., user IDs, account types, or dynamic categories)
+
+### Changed
+- `Pipeline#raw_llm_call` refactored to handle global and per-call tools consistently
+- Refactored `Pipeline` to remove duplicate internal method definitions (`semantic_cache_lookup`, `store_in_cache`)
+- `SemanticCache#lookup` return format updated to `[response, token_info]` to support better metadata tracking
+
+### Fixed
+- RuboCop `Metrics/ParameterLists` offense in `OptimizeResult#initialize` by adding targeted override for the necessary result fields
+
 ## [0.1.5] - 2026-04-22
 
 ### Added
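Taken together, the 0.1.6 and 0.1.7 entries introduce three new configuration surfaces: tools, cache scoping, and structured output. A minimal sketch of how they might compose in one configure block, based only on the option names above — the weather tool, the `"tenant-42"` scope, and the schema hash are illustrative placeholders, not values shipped with the gem:

```ruby
require "llm_optimizer"

LlmOptimizer.configure do |config|
  # 0.1.6: tool definitions forwarded as the tools: keyword to every caller
  config.with_tools = [
    {
      type: "function",
      function: {
        name: "get_weather", # hypothetical tool, for illustration only
        description: "Look up current weather for a city",
        parameters: { type: "object", properties: { city: { type: "string" } } }
      }
    }
  ]

  # 0.1.6: namespace semantic-cache entries so hits stay within one context
  config.cache_scope = "tenant-42" # e.g. a user or account identifier

  # 0.1.7: structured output support
  config.schema = { type: "object", properties: { answer: { type: "string" } } }
end
```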
data/README.md
CHANGED
@@ -200,6 +200,7 @@ end
 | `messages_caller` | Lambda | `nil` | `(messages, model:) -> String` — used when `conversation_id` is present; receives full history including current user turn |
 | `system_prompt` | String | `nil` | Seeded as the first system message when a new conversation is created via `conversation_id` |
 | `conversation_ttl` | Integer | `86400` | TTL in seconds for Redis-backed conversation history (`0` for no expiry) |
+| `with_tools` | Array | `nil` | Tools (functions) available to the LLM; passed as `tools:` keyword to callers |
 
 ## Per-call configuration
 
@@ -212,19 +213,6 @@ result = LlmOptimizer.optimize(prompt) do |config|
 end
 ```
 
-## Conversation history
-
-Pass a `messages` array to enable history management:
-
-```ruby
-messages = [
-  { role: "user", content: "Tell me about Redis" },
-  { role: "assistant", content: "Redis is an in-memory data store..." },
-  # ... more messages
-]
-
-result = LlmOptimizer.optimize("What else can it do?", messages: messages)
-```
 
 ## OptimizeResult
 
 Every call returns an `OptimizeResult` struct:
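The new `with_tools` row describes the same configuration object used by the per-call block the README documents, and the 0.1.6 changelog says `raw_llm_call` handles global and per-call tools consistently, so tools should also be settable for a single call. A hedged sketch (the tool hash is a placeholder):

```ruby
# Per-call override, following the README's documented block form;
# the tool definition here is illustrative only.
result = LlmOptimizer.optimize("What's the weather in Paris?") do |config|
  config.with_tools = [{ type: "function", function: { name: "get_weather" } }]
end
```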
data/lib/generators/llm_optimizer/templates/initializer.rb
CHANGED
@@ -27,6 +27,9 @@ LlmOptimizer.configure do |config|
   config.cache_ttl = 86_400 # cache entry TTL in seconds (default: 24h)
   config.timeout_seconds = 5 # timeout for embedding / external API calls
 
+  # --- Tools ---
+  # config.with_tools = [] # Array of tool definitions (OpenAI/Anthropic format)
+
   # --- Logging ---
   config.logger = Rails.logger
   config.debug_logging = Rails.env.development?
@@ -81,36 +84,39 @@ LlmOptimizer.configure do |config|
   # Messages caller for history manager/conversation summary - Optional
   # config.system_prompt = "You are a helpful person who gives responses in a non harmful way. " \
   #                        "If any serious question is asked, handle it in effectively."
-  # OpenAI
-  # config.messages_caller = ->(messages, model:) {
-  #
-  #
-  #
-  #
-  #
-  #
+  # OpenAI implementation -
+  # config.messages_caller = ->(messages, model:, tools: nil) {
+  #   parameters = {
+  #     model: model,
+  #     messages: messages.map { |m| { role: m[:role], content: m[:content] } }
+  #   }
+  #   parameters[:tools] = tools if tools&.any?
+  #
+  #   response = $openai.chat(parameters: parameters)
   #   response.dig("choices", 0, "message", "content")
   # }
 
   # RubyLLM implementation -
-  # config.messages_caller = ->(messages, model:) {
+  # config.messages_caller = ->(messages, model:, tools: nil) {
   #   chat = RubyLLM.chat(model: model)
+  #   chat.with_tools(*tools) if tools&.any?
   #   messages[0..-2].each { |m| chat.add_message(role: m[:role], content: m[:content]) }
   #   chat.ask(messages.last[:content]).content
   # }
 
   # Anthropic implementation -
-  # config.messages_caller = ->(messages, model:) {
+  # config.messages_caller = ->(messages, model:, tools: nil) {
   #   # Anthropic separates system messages from the messages array
   #   system_msg = messages.find { |m| m[:role] == "system" }&.dig(:content)
   #   chat_msgs = messages.reject { |m| m[:role] == "system" }
   #                       .map { |m| { role: m[:role], content: m[:content] } }
-
+  #
   #   response = $anthropic.messages(
   #     model: model,
   #     max_tokens: 1024,
   #     system: system_msg,
-  #     messages: chat_msgs
+  #     messages: chat_msgs,
+  #     tools: tools
   #   )
   #   response["content"].first["text"]
   # }
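The template only shows `messages_caller` variants; per the 0.1.6 changelog, `llm_caller` now receives the same `tools:` keyword (the pipeline calls `llm.call(prompt, model: model, tools: tools)`). A sketch of a matching single-prompt caller, assuming the same `$openai` client global used in the commented examples above; this block is not part of the shipped template:

```ruby
LlmOptimizer.configure do |config|
  # Single-prompt caller mirroring the OpenAI messages_caller above.
  config.llm_caller = ->(prompt, model:, tools: nil) {
    parameters = { model: model, messages: [{ role: "user", content: prompt }] }
    parameters[:tools] = tools if tools&.any?

    response = $openai.chat(parameters: parameters)
    response.dig("choices", 0, "message", "content")
  }
end
```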
data/lib/llm_optimizer/configuration.rb
CHANGED
@@ -22,9 +22,14 @@ module LlmOptimizer
       llm_caller
       embedding_caller
       classifier_caller
-      conversation_ttl
-      system_prompt
       messages_caller
+      system_prompt
+      conversation_ttl
+      cache_scope
+      tools
+      with_tools
+      tools_caller
+      schema
     ].freeze
 
     # Define readers for all known keys (setters below track explicit sets)
@@ -52,6 +57,7 @@ module LlmOptimizer
       @classifier_caller = nil
       @conversation_ttl = 86_400
       @system_prompt = nil
+      @with_tools = nil
     end
 
     # Copies only explicitly set keys from other_config without resetting unmentioned keys.
data/lib/llm_optimizer/optimize_result.rb
CHANGED
@@ -1,8 +1,43 @@
 # frozen_string_literal: true
 
 module LlmOptimizer
-  OptimizeResult
-    :response, :model, :model_tier, :cache_status,
-
-
+  class OptimizeResult
+    attr_accessor :response, :model, :model_tier, :cache_status,
+                  :original_tokens, :compressed_tokens, :input_tokens,
+                  :output_tokens, :cached_tokens, :latency_ms, :messages
+
+    # rubocop:disable Metrics/ParameterLists
+    def initialize(response: nil, model: nil, model_tier: nil, cache_status: nil,
+                   original_tokens: 0, compressed_tokens: 0, input_tokens: 0,
+                   output_tokens: 0, cached_tokens: 0, latency_ms: 0, messages: [])
+      @response = response
+      @model = model
+      @model_tier = model_tier
+      @cache_status = cache_status
+      @original_tokens = original_tokens
+      @compressed_tokens = compressed_tokens
+      @input_tokens = input_tokens
+      @output_tokens = output_tokens
+      @cached_tokens = cached_tokens
+      @latency_ms = latency_ms
+      @messages = messages
+    end
+    # rubocop:enable Metrics/ParameterLists
+
+    def to_h
+      {
+        response: @response,
+        model: @model,
+        model_tier: @model_tier,
+        cache_status: @cache_status,
+        original_tokens: @original_tokens,
+        compressed_tokens: @compressed_tokens,
+        input_tokens: @input_tokens,
+        output_tokens: @output_tokens,
+        cached_tokens: @cached_tokens,
+        latency_ms: @latency_ms,
+        messages: @messages
+      }
+    end
+  end
 end
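Because the rewritten class keeps reader methods for every field the old struct exposed, existing call sites are unchanged; the token fields and `to_h` are simply additional surface. A usage sketch, assuming the gem is already configured (the prompt is illustrative):

```ruby
result = LlmOptimizer.optimize("Summarize the quarterly report")

result.response      # => the LLM output (String)
result.cache_status  # => :hit or :miss
result.input_tokens  # => 0 unless the caller reports token usage
result.to_h          # => Hash of all eleven fields, convenient for logging
```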
data/lib/llm_optimizer/pipeline.rb
CHANGED
@@ -44,25 +44,6 @@ module LlmOptimizer
       [model_tier, model]
     end
 
-    def semantic_cache_lookup(prompt, model, model_tier, original_tokens,
-                              compressed_tokens, original_prompt, start, config)
-      return [nil, nil] unless config.use_semantic_cache
-
-      emb_client = EmbeddingClient.new(
-        model: config.embedding_model,
-        timeout_seconds: config.timeout_seconds,
-        embedding_caller: config.embedding_caller
-      )
-      embedding = emb_client.embed(prompt)
-      embedding, result = check_cache_hit(embedding, prompt, model, model_tier,
-                                          original_tokens, compressed_tokens,
-                                          original_prompt, start, config)
-      [embedding, result]
-    rescue EmbeddingError => e
-      config.logger.warn("[llm_optimizer] EmbeddingError (treating as cache miss): #{e.message}")
-      [nil, nil]
-    end
-
     def load_conversation(conversation_id, options, config)
       return [options[:messages], nil] unless conversation_id
@@ -93,41 +74,42 @@ module LlmOptimizer
       store.save(conversation_id, messages, prompt, response) || messages
     end
 
-    def store_in_cache(embedding, response, config)
-      return unless config.use_semantic_cache && embedding && config.redis_url
-
-      redis = build_redis(config.redis_url)
-      cache = SemanticCache.new(redis, threshold: config.similarity_threshold, ttl: config.cache_ttl)
-      cache.store(embedding, response)
-    rescue StandardError => e
-      config.logger.warn("[llm_optimizer] SemanticCache store failed: #{e.message}")
-    end
-
     def build_result(response, model, model_tier, cache_status,
-                     original_tokens, compressed_tokens, latency_ms, messages)
+                     original_tokens, compressed_tokens, latency_ms, messages, token_info = {})
       OptimizeResult.new(
         response: response, model: model, model_tier: model_tier,
         cache_status: cache_status, original_tokens: original_tokens,
-        compressed_tokens: compressed_tokens,
+        compressed_tokens: compressed_tokens,
+        input_tokens: token_info[:input_tokens] || compressed_tokens || original_tokens,
+        output_tokens: token_info[:output_tokens],
+        cached_tokens: token_info[:cached_tokens],
+        latency_ms: latency_ms,
         messages: messages
       )
     end
 
     def fallback_result(original_prompt, original_tokens, options, start)
       latency_ms = elapsed_ms(start)
-      response
+      response, _token_info = raw_llm_call(original_prompt, model: nil, config: configuration)
       build_result(response, nil, nil, :miss, original_tokens || 0, nil,
                    latency_ms, options[:messages])
     end
 
     def raw_llm_call(prompt, model:, messages: nil, config: nil)
-
-
+      tools = config&.with_tools || config&.tools
+      result = if messages && !messages.empty? && config&.messages_caller
+                 config.messages_caller.call(messages + [{ role: "user", content: prompt }], model: model, tools: tools)
+               else
+                 llm = config&.llm_caller || @_current_llm_caller
+                 raise ConfigurationError, "No llm_caller configured." unless llm
+
+                 llm.call(prompt, model: model, tools: tools)
+               end
+
+      if result.is_a?(Hash)
+        [result[:content], result]
       else
-
-        raise ConfigurationError, "No llm_caller configured." unless llm
-
-        llm.call(prompt, model: model)
+        [result, {}]
       end
     end
@@ -152,22 +134,42 @@ module LlmOptimizer
       Redis.new(url: redis_url)
     end
 
-    def
-
-      return [
+    def semantic_cache_lookup(prompt, model, model_tier, original_tokens,
+                              compressed_tokens, original_prompt, start, config)
+      return [nil, nil] unless config.use_semantic_cache
+
+      embedding = config.embedding_caller.call(prompt)
+      cache = SemanticCache.new(build_redis(config.redis_url),
+                                threshold: config.similarity_threshold,
+                                ttl: config.cache_ttl,
+                                cache_scope: config.cache_scope)
+      cached, token_info = cache.lookup(embedding)
+
+      if cached
+        latency_ms = elapsed_ms(start)
+        emit_log(config.logger, config,
+                 cache_status: :hit, model_tier: model_tier,
+                 original_tokens: original_tokens, compressed_tokens: compressed_tokens,
+                 latency_ms: latency_ms, prompt: original_prompt, response: cached)
+
+        [embedding, build_result(cached, model, model_tier, :hit,
+                                 original_tokens, compressed_tokens, latency_ms, nil, token_info)]
+      else
+        [embedding, nil]
+      end
+    rescue StandardError => e
+      config.logger.warn("[llm_optimizer] semantic_cache_lookup failed: #{e.message}")
+      [nil, nil]
+    end
 
-
-
-
-      return [embedding, nil] unless cached
+    def store_in_cache(embedding, response, config, token_info = {})
+      return unless config.use_semantic_cache && embedding
+      return if response.nil? || response.to_s.strip.empty? # Don't cache empty or nil responses
 
-
-
-
-
-      latency_ms: latency_ms, prompt: original_prompt, response: cached)
-      [embedding, build_result(cached, model, model_tier, :hit,
-                               original_tokens, compressed_tokens, latency_ms, nil)]
+      SemanticCache.new(build_redis(config.redis_url),
+                        threshold: config.similarity_threshold,
+                        ttl: config.cache_ttl,
+                        cache_scope: config.cache_scope).store(embedding, response, token_info)
     end
   end
 end
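The `result.is_a?(Hash)` branch in `raw_llm_call` means a caller can now return token metadata alongside the text. A sketch of a Hash-returning `llm_caller` shaped to feed that branch: the `:content` key is what the pipeline reads, the `:input_tokens`/`:output_tokens` keys match the `token_info` reads in `build_result`, and the OpenAI-style `usage` extraction is an assumption, not part of the gem:

```ruby
LlmOptimizer.configure do |config|
  # Returning a Hash lets raw_llm_call capture token metadata.
  config.llm_caller = ->(prompt, model:, tools: nil) {
    response = $openai.chat(
      parameters: { model: model, messages: [{ role: "user", content: prompt }] }
    )
    {
      content: response.dig("choices", 0, "message", "content"),
      input_tokens: response.dig("usage", "prompt_tokens"),
      output_tokens: response.dig("usage", "completion_tokens")
    }
  }
end
```

A plain String return still works: `raw_llm_call` wraps it as `[result, {}]`, so existing callers need no change.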
data/lib/llm_optimizer/semantic_cache.rb
CHANGED
@@ -7,20 +7,19 @@ module LlmOptimizer
   class SemanticCache
     KEY_NAMESPACE = "llm_optimizer:cache:"
 
-    def initialize(redis_client, threshold:, ttl:)
-      @redis
-      @threshold
-      @ttl
+    def initialize(redis_client, threshold:, ttl:, cache_scope: nil)
+      @redis = redis_client
+      @threshold = threshold
+      @ttl = ttl
+      @cache_scope = cache_scope
     end
 
-    def store(embedding, response)
+    def store(embedding, response, token_info = {})
       key = cache_key(embedding)
-      # Serialize embedding as raw 64-bit big-endian doubles to preserve full
-      # Float precision. MessagePack silently downcasts Ruby Float to 32-bit,
-      # which corrupts cosine similarity on deserialization.
       payload = MessagePack.pack({
-        "embedding" => embedding.pack("G*"),
-        "response" => response
+        "embedding" => embedding.pack("G*"),
+        "response" => response,
+        "token_info" => token_info
       })
       @redis.set(key, payload, ex: @ttl)
     rescue ::Redis::BaseError => e
@@ -28,28 +27,32 @@ module LlmOptimizer
     end
 
     def lookup(embedding)
-
+      prefix = KEY_NAMESPACE
+      prefix += "#{@cache_scope}:" if @cache_scope
+      keys = @redis.keys("#{prefix}*")
+
+      keys.reject! { |k| k.count(":") > 2 } unless @cache_scope
+
       return nil if keys.empty?
 
       best_score = -Float::INFINITY
-
+      best_entry = nil
 
       keys.each do |key|
         raw = @redis.get(key)
         next unless raw
 
         entry = MessagePack.unpack(raw)
-        # Unpack the binary string back to 64-bit doubles
         stored_embedding = entry["embedding"].unpack("G*")
         score = cosine_similarity(embedding, stored_embedding)
 
         if score > best_score
           best_score = score
-
+          best_entry = entry
         end
       end
 
-      best_score >= @threshold
+      [best_entry["response"], best_entry["token_info"] || {}] if best_score >= @threshold
     rescue ::Redis::BaseError => e
       warn "[llm_optimizer] SemanticCache lookup failed: #{e.message}"
       nil
@@ -70,7 +73,9 @@ module LlmOptimizer
       # Use "G*" (64-bit big-endian double) to match Ruby's native Float precision.
       # "f*" (32-bit) truncates precision and produces inconsistent hashes for the
       # same embedding across serialize/deserialize round trips.
-
+      prefix = KEY_NAMESPACE
+      prefix += "#{@cache_scope}:" if @cache_scope
+      prefix + Digest::SHA256.hexdigest(embedding.pack("G*"))
     end
   end
 end
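With `cache_scope` set, `cache_key` inserts the scope between the namespace and the digest; an unscoped lookup then rejects any key with more than two colons, so scoped entries stay invisible to it. Illustrative key shapes and construction (the digests are shortened and the Redis URL is an assumption):

```ruby
require "redis"
require "llm_optimizer"

# unscoped key: "llm_optimizer:cache:a94a8fe5..."            (two colons)
# scoped key:   "llm_optimizer:cache:tenant-42:a94a8fe5..."  (three colons)
cache = LlmOptimizer::SemanticCache.new(
  Redis.new(url: "redis://localhost:6379"), # assumes a local Redis
  threshold: 0.9, ttl: 3_600, cache_scope: "tenant-42"
)
```

Note the colon-count filter assumes scope values contain no colons themselves; a scope like `"a:b"` would yield keys that unscoped lookups also reject, which errs on the side of isolation.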
data/lib/llm_optimizer.rb
CHANGED
@@ -103,17 +103,18 @@ module LlmOptimizer
 
       messages, store = load_conversation(conversation_id, options, call_config)
       messages = apply_history_manager(messages, call_config)
-      response
-      messages
-      store_in_cache(embedding, response, call_config)
+      response, token_info = raw_llm_call(prompt, messages: messages, model: model, config: call_config)
+      messages = persist_conversation(store, conversation_id, messages, prompt, response)
+      store_in_cache(embedding, response, call_config, token_info)
 
       latency_ms = elapsed_ms(start)
       emit_log(call_config.logger, call_config,
                cache_status: :miss, model_tier: model_tier,
                original_tokens: original_tokens, compressed_tokens: compressed_tokens,
                latency_ms: latency_ms, prompt: original_prompt, response: response)
+
       build_result(response, model, model_tier, :miss, original_tokens, compressed_tokens,
-                   latency_ms, messages)
+                   latency_ms, messages, token_info)
     rescue EmbeddingError => e
       configuration.logger.warn("[llm_optimizer] EmbeddingError (outer rescue): #{e.message}")
       fallback_result(original_prompt, original_tokens, options, start)