llm_optimizer 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -1
- data/README.md +3 -3
- data/lib/llm_optimizer/version.rb +1 -1
- data/lib/llm_optimizer.rb +76 -4
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: c6903d2d4c2163d93ffe8d0d5ad9708d64a8472a430ed9f266c9237e468c8585
|
|
4
|
+
data.tar.gz: c7270f4717ece6778976f46f1601f9e5d45939e3e7926ea7e3ed05b3b641f413
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 858cad7443f7adcbe42b3d5ce62b4e815081d2238b7711066276ee2a7c0fb6a506d267ccb48dbe611a2ed08b2eab29139057dcddc2d033155561499a0d6f5421
|
|
7
|
+
data.tar.gz: b3afc392e8fb2ef5b7baa468f74f9def34a15db9f6df898fd738503638d32f5dda9b04a6c8f2e005cd94aa893eca864111f3be0f2e8bfa1cc0aeef6391e0ae2c
|
data/CHANGELOG.md
CHANGED
|
@@ -7,6 +7,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [0.1.4] - 2026-04-13
|
|
11
|
+
|
|
12
|
+
### Fixed
|
|
13
|
+
- `WrapperModule#chat` (used by `wrap_client`) incorrectly called `LlmOptimizer.optimize` internally, which required `llm_caller` to be configured and caused a `ConfigurationError` for users who only called `wrap_client`. Refactored into `optimize_pre_call` / `optimize_post_call` so the wrapped client handles the actual LLM call via `super`. `llm_caller` is no longer needed when using `wrap_client`.
|
|
14
|
+
|
|
15
|
+
### Added
|
|
16
|
+
- `LlmOptimizer.optimize_pre_call(prompt, config)` — runs compress → route → cache lookup without making an LLM call; used internally by `WrapperModule` and available for advanced integrations
|
|
17
|
+
- `LlmOptimizer.optimize_post_call(pre_call_result, response, config)` — stores a response in the semantic cache after an LLM call; used internally by `WrapperModule`
|
|
18
|
+
|
|
10
19
|
## [0.1.3] - 2026-04-10
|
|
11
20
|
|
|
12
21
|
### Added
|
|
@@ -70,7 +79,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
70
79
|
- `OptimizeResult` struct with `response`, `model`, `model_tier`, `cache_status`, `original_tokens`, `compressed_tokens`, `latency_ms`, `messages`
|
|
71
80
|
- Unit test suite covering all components with positive and negative scenarios using Minitest + Mocha
|
|
72
81
|
|
|
73
|
-
[Unreleased]: https://github.com/arunkumarry/llm_optimizer/compare/v0.1.
|
|
82
|
+
[Unreleased]: https://github.com/arunkumarry/llm_optimizer/compare/v0.1.4...HEAD
|
|
83
|
+
[0.1.4]: https://github.com/arunkumarry/llm_optimizer/compare/v0.1.3...v0.1.4
|
|
84
|
+
[0.1.3]: https://github.com/arunkumarry/llm_optimizer/compare/v0.1.2...v0.1.3
|
|
74
85
|
[0.1.2]: https://github.com/arunkumarry/llm_optimizer/compare/v0.1.1...v0.1.2
|
|
75
86
|
[0.1.1]: https://github.com/arunkumarry/llm_optimizer/compare/v0.1.0...v0.1.1
|
|
76
87
|
[0.1.0]: https://github.com/arunkumarry/llm_optimizer/releases/tag/v0.1.0
|
data/README.md
CHANGED
|
@@ -31,9 +31,9 @@ Routing uses a three-layer decision chain:
|
|
|
31
31
|
3. **LLM classifier** (optional) — for ambiguous prompts, calls a cheap model with a classification prompt; falls back to word-count heuristic if not configured or if the call fails
|
|
32
32
|
|
|
33
33
|
This hybrid approach fixes the core weakness of pure heuristics:
|
|
34
|
-
- `"Fix this bug"` → 3 words but `:complex` via classifier
|
|
35
|
-
- `"Explain Ruby blocks simply"` → long but `:simple` via classifier
|
|
36
|
-
- `"analyze this code"` → keyword fast-path → `:complex` instantly (no classifier call)
|
|
34
|
+
- `"Fix this bug"` → 3 words but `:complex` via classifier
|
|
35
|
+
- `"Explain Ruby blocks simply"` → long but `:simple` via classifier
|
|
36
|
+
- `"analyze this code"` → keyword fast-path → `:complex` instantly (no classifier call)
|
|
37
37
|
|
|
38
38
|
Configure the classifier with any cheap model your app already uses:
|
|
39
39
|
|
data/lib/llm_optimizer.rb
CHANGED
|
@@ -58,12 +58,37 @@ module LlmOptimizer
|
|
|
58
58
|
end
|
|
59
59
|
|
|
60
60
|
# Opt-in client wrapping
|
|
61
|
+
# WrapperModule intercepts `chat` on the wrapped client, runs the pre-call
# optimization pipeline (compress, route, cache lookup), and delegates the
# actual LLM call to the original client via `super` — so llm_caller is NOT
# required when using wrap_client.
module WrapperModule
  # Optimized stand-in for the wrapped client's `chat`.
  #
  # @param params [Hash] request options; the prompt is taken from
  #   `:messages` when present, otherwise from `:prompt`
  # @yield forwarded unchanged to the original `chat`; not invoked on a cache hit
  # @return the cached response on a cache hit, otherwise whatever the
  #   original `chat` returns
  def chat(params, &block)
    prompt_key = %i[messages prompt].find { |key| params[key] }
    # Nothing to optimize: hand the call to the original client untouched
    # rather than feeding nil into the pipeline.
    return super(params, &block) unless prompt_key

    config = LlmOptimizer.configuration

    # Run pre-call pipeline: compress, route, cache lookup
    result = LlmOptimizer.optimize_pre_call(params[prompt_key], config)

    # Cache hit — return immediately without calling the LLM
    return result[:response] if result[:cache_status] == :hit

    # Apply the (possibly compressed) prompt under the key the caller used.
    optimized_params = params.merge(prompt_key => result[:prompt])
    # Only override the model when routing produced one; otherwise keep the
    # caller's own :model instead of clobbering it with nil.
    optimized_params[:model] = result[:model] if result[:model]

    response = super(optimized_params, &block)

    # Store in cache after successful LLM call
    LlmOptimizer.optimize_post_call(result, response, config)

    response
  end
end
|
|
69
94
|
|
|
@@ -231,6 +256,53 @@ module LlmOptimizer
|
|
|
231
256
|
)
|
|
232
257
|
end
|
|
233
258
|
|
|
259
|
+
# Pre-call pipeline for wrap_client: compress, route, cache lookup.
# Does NOT make an LLM call — the wrapped client handles that via super.
#
# The semantic cache is treated as best-effort: an embedding failure or a
# cache/Redis failure is logged as a warning and reported as a cache miss,
# so the caller can still fall through to the real LLM call.
#
# @param prompt the prompt (or messages) to optimize
# @param config configuration object; defaults to the global configuration
# @return [Hash] with keys :prompt, :model, :model_tier, :embedding,
#   :cache_status (:hit or :miss) and :response (nil on a miss)
def self.optimize_pre_call(prompt, config = configuration)
  prompt = Compressor.new.compress(prompt) if config.compress_prompt

  model_tier = ModelRouter.new(config).route(prompt)
  model = model_tier == :simple ? config.simple_model : config.complex_model

  result = { prompt: prompt, model: model, model_tier: model_tier,
             embedding: nil, cache_status: :miss, response: nil }
  return result unless config.use_semantic_cache && config.redis_url

  begin
    emb_client = EmbeddingClient.new(
      model: config.embedding_model,
      timeout_seconds: config.timeout_seconds,
      embedding_caller: config.embedding_caller
    )
    result[:embedding] = emb_client.embed(prompt)
  rescue EmbeddingError => e
    config.logger.warn("[llm_optimizer] wrap_client EmbeddingError (cache miss): #{e.message}")
    return result
  end

  begin
    redis = build_redis(config.redis_url)
    cache = SemanticCache.new(redis, threshold: config.similarity_threshold, ttl: config.cache_ttl)
    cached = cache.lookup(result[:embedding])
    result.merge!(cache_status: :hit, response: cached) if cached
  rescue StandardError => e
    # Mirror optimize_post_call: a broken cache must not break the LLM call.
    # The embedding is kept so the post-call step can still attempt a store.
    config.logger.warn("[llm_optimizer] wrap_client cache lookup failed (cache miss): #{e.message}")
  end

  result
end
|
|
293
|
+
|
|
294
|
+
# Post-call: store the LLM response in the semantic cache if applicable.
#
# Does nothing when the semantic cache is not configured, when the pre-call
# step produced no embedding, or when there is no response to store. Any
# failure while storing is logged as a warning and swallowed, so a cache
# problem never turns a successful LLM call into an error.
#
# @param pre_call_result [Hash] result of optimize_pre_call; only
#   :embedding is read here
# @param response the value returned by the wrapped client's LLM call
# @param config configuration object; defaults to the global configuration
def self.optimize_post_call(pre_call_result, response, config = configuration)
  return unless config.use_semantic_cache && config.redis_url

  embedding = pre_call_result[:embedding]
  return unless embedding
  # A nil response can never be served as a cache hit, so don't store it.
  return if response.nil?

  redis = build_redis(config.redis_url)
  cache = SemanticCache.new(redis, threshold: config.similarity_threshold, ttl: config.cache_ttl)
  cache.store(embedding, response)
rescue StandardError => e
  config.logger.warn("[llm_optimizer] wrap_client cache store failed: #{e.message}")
end
|
|
305
|
+
|
|
234
306
|
# Private helpers
|
|
235
307
|
|
|
236
308
|
class << self
|