semantic-cache 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +19 -0
- data/Gemfile +9 -0
- data/LICENSE +21 -0
- data/README.md +307 -0
- data/Rakefile +8 -0
- data/lib/semantic_cache/cache.rb +187 -0
- data/lib/semantic_cache/client_wrapper.rb +66 -0
- data/lib/semantic_cache/configuration.rb +50 -0
- data/lib/semantic_cache/embedding.rb +45 -0
- data/lib/semantic_cache/entry.rb +62 -0
- data/lib/semantic_cache/rails.rb +76 -0
- data/lib/semantic_cache/similarity.rb +31 -0
- data/lib/semantic_cache/stats.rb +108 -0
- data/lib/semantic_cache/stores/memory.rb +78 -0
- data/lib/semantic_cache/stores/redis.rb +127 -0
- data/lib/semantic_cache/version.rb +5 -0
- data/lib/semantic_cache.rb +39 -0
- data/semantic_cache.gemspec +43 -0
- metadata +162 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA256:
+  metadata.gz: 0a16f649e20d3989d12d16c4e52b809c7f91fc81383d83cdd9e4b86306ceb8d4
+  data.tar.gz: 8e1a0cf3bfdec291aa24141ffdd4c32f31f4d33802e10b4bc08e36b5576daec1
+SHA512:
+  metadata.gz: 339b424cff6654e37aa888b22ef9a9ab8f06d69fcda0d3ed42f62ded02d852d29ac18a3a1ee570539231e8cbdd83922ef909cdc24eb11e301859f73906ca774c
+  data.tar.gz: 857a0e0013fb7d93c006c87418c9adad1fbd0cfb79faa92cadcbf8a35ed9c690e3c26f02df2247d62e6a48254959e29bffc8457a6bef55e3c6807f624d54502c
data/CHANGELOG.md
ADDED
@@ -0,0 +1,19 @@
+# Changelog
+
+All notable changes to this project will be documented in this file.
+
+## [0.1.0] - 2025-01-29
+
+### Added
+
+- Core semantic caching with cosine similarity matching
+- In-memory and Redis cache stores
+- Embedding generation via OpenAI `text-embedding-3-small`
+- Configurable similarity threshold (default: 0.85)
+- TTL-based and tag-based cache invalidation
+- Cost tracking and savings reports
+- Multi-model support (OpenAI, Anthropic, Gemini)
+- Client wrapper / middleware pattern
+- Rails integration (concern + around_action helper)
+- Thread-safe statistics tracking
+- Comprehensive test suite
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025 stokry
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,307 @@
+# SemanticCache
+
+**Semantic caching for LLM API calls. Save 70%+ on costs.**
+
+Cache LLM responses using semantic similarity matching. Similar questions return cached answers instantly, cutting API costs dramatically.
+
+```ruby
+cache = SemanticCache.new
+
+# First call — hits the API
+response = cache.fetch("What's the capital of France?") do
+  openai.chat(messages: [{ role: "user", content: "What's the capital of France?" }])
+end
+
+# Second call — semantically similar, returns cached response instantly
+response = cache.fetch("What is France's capital city?") do
+  openai.chat(messages: [{ role: "user", content: "What is France's capital city?" }])
+end
+# => CACHE HIT! No API call.
+```
+
+## Installation
+
+Add to your Gemfile:
+
+```ruby
+gem "semantic-cache"
+```
+
+Then:
+
+```bash
+bundle install
+```
+
+Or install directly:
+
+```bash
+gem install semantic-cache
+```
+
+## Quick Start
+
+```ruby
+require "semantic_cache"
+
+# Configure (or set OPENAI_API_KEY env var)
+SemanticCache.configure do |c|
+  c.openai_api_key = "sk-..."
+  c.similarity_threshold = 0.85 # How similar queries must be to match (0.0-1.0)
+end
+
+cache = SemanticCache.new
+
+response = cache.fetch("What is Ruby?", model: "gpt-4o") do
+  openai.chat(parameters: {
+    model: "gpt-4o",
+    messages: [{ role: "user", content: "What is Ruby?" }]
+  })
+end
+
+# Check stats
+puts cache.current_stats
+# => { hits: 0, misses: 1, hit_rate: 0.0, savings: "$0.00", ... }
+```
+
+## How It Works
+
+1. Your query is converted to an embedding vector via OpenAI's `text-embedding-3-small`
+2. The cache searches stored entries for one with high cosine similarity (see the sketch below)
+3. If a match exceeds the threshold (default 0.85), the cached response is returned
+4. If nothing matches, the block executes and the result is cached for future queries
+
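The matching step in (2) is plain cosine similarity over the embedding vectors. A minimal standalone sketch of that comparison, assuming embeddings are arrays of Floats (an illustration, not the gem's `Similarity` module verbatim):

```ruby
# Cosine similarity: dot product normalized by the two vector magnitudes.
def cosine(a, b)
  dot = a.zip(b).sum { |x, y| x * y }
  dot / (Math.sqrt(a.sum { |x| x * x }) * Math.sqrt(b.sum { |x| x * x }))
end

cosine([1.0, 0.0], [1.0, 0.0]) # => 1.0 (identical direction: always a hit)
cosine([1.0, 0.0], [0.8, 0.6]) # => 0.8 (below the 0.85 default: a miss)
```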
+## Configuration
+
+```ruby
+SemanticCache.configure do |c|
+  # Similarity threshold (0.0 to 1.0). Higher = stricter matching.
+  c.similarity_threshold = 0.85
+
+  # Embedding model
+  c.embedding_model = "text-embedding-3-small"
+
+  # OpenAI API key
+  c.openai_api_key = ENV["OPENAI_API_KEY"]
+
+  # Default TTL for cached entries (nil = no expiry)
+  c.default_ttl = 3600 # 1 hour
+
+  # Cache store: :memory or :redis
+  c.store = :memory
+  c.store_options = {} # passed to Redis.new if store is :redis
+
+  # Cost tracking
+  c.track_costs = true
+end
+```
+
+## Cache Stores
+
+### In-Memory (default)
+
+Thread-safe, no dependencies. Good for development and single-process apps.
+
+```ruby
+cache = SemanticCache.new(store: :memory)
+```
+
+### Redis
+
+For production, multi-process, and distributed apps. Requires the `redis` gem.
+
+```ruby
+gem "redis"
+```
+
+```ruby
+cache = SemanticCache.new(
+  store: :redis,
+  store_options: { url: "redis://localhost:6379/0" }
+)
+```
+
+### Custom Store
+
+Any object that responds to `write`, `entries`, `delete`, `invalidate_by_tags`, `clear`, and `size` (see the sketch below):
+
+```ruby
+cache = SemanticCache.new(store: MyCustomStore.new)
+```
+
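A hypothetical store satisfying that interface, backed by a plain Hash. The method names come from the list above; the Hash-based body and the `tags` reader on entries are assumptions for illustration:

```ruby
class MyCustomStore
  def initialize
    @data = {}
  end

  def write(key, entry)
    @data[key] = entry
  end

  def entries
    @data.values
  end

  def delete(key)
    @data.delete(key)
  end

  # Drop every entry carrying at least one of the given tags
  # (assumes entries expose a `tags` array, as the gem's Entry appears to).
  def invalidate_by_tags(tags)
    @data.delete_if { |_key, entry| (entry.tags & tags).any? }
  end

  def clear
    @data.clear
  end

  def size
    @data.size
  end
end
```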
+## TTL & Tag-Based Invalidation
+
+```ruby
+# TTL — auto-expires after 1 hour
+cache.fetch("Latest news?", ttl: 3600) do
+  fetch_news
+end
+
+# Tags — group related entries for bulk invalidation
+cache.fetch("Ruby version?", tags: [:ruby, :versions]) do
+  "3.3.0"
+end
+
+cache.fetch("Best framework?", tags: [:ruby, :frameworks]) do
+  "Rails"
+end
+
+# Invalidate all entries tagged :versions
+cache.invalidate(tags: [:versions])
+```
+
+## Multi-Model Support
+
+Convenience methods for different LLM providers:
+
+```ruby
+cache.fetch_openai("query", model: "gpt-4o") do
+  openai.chat(...)
+end
+
+cache.fetch_anthropic("query", model: "claude-sonnet-4-20250514") do
+  anthropic.messages(...)
+end
+
+cache.fetch_gemini("query", model: "gemini-pro") do
+  gemini.generate(...)
+end
+```
+
+## Client Wrapper
+
+Wrap an existing OpenAI client to cache all chat calls automatically:
+
+```ruby
+require "openai"
+
+client = OpenAI::Client.new(access_token: "sk-...")
+cached_client = SemanticCache.wrap(client)
+
+# All chat calls are now cached
+response = cached_client.chat(parameters: {
+  model: "gpt-4o",
+  messages: [{ role: "user", content: "What is Ruby?" }]
+})
+
+# Access cache stats
+cached_client.semantic_cache.current_stats
+
+# Other methods are delegated to the original client
+cached_client.models # => calls client.models directly
+```
+
+## Cost Tracking & Stats
+
+```ruby
+cache = SemanticCache.new
+
+# After some usage...
+cache.current_stats
+# => {
+#      hits: 156,
+#      misses: 44,
+#      total_queries: 200,
+#      hit_rate: 78.0,
+#      savings: "$23.45",
+#      ...
+#    }
+
+puts cache.detailed_stats
+# Total queries: 200
+# Cache hits: 156
+# Cache misses: 44
+# Hit rate: 78.0%
+# Total savings: $23.45
+
+puts cache.savings_report
+# Total saved: $23.45 (156 cached calls)
+```
+
+Custom model costs:
+
+```ruby
+SemanticCache.configure do |c|
+  c.model_costs["my-custom-model"] = { input: 0.01, output: 0.03 }
+end
+```
+
+## Rails Integration
+
+```ruby
+# Gemfile
+gem "semantic-cache"
+```
+
+```ruby
+# config/initializers/semantic_cache.rb
+require "semantic_cache/rails"
+
+SemanticCache.configure do |c|
+  c.openai_api_key = Rails.application.credentials.openai_api_key
+  c.store = :redis
+  c.store_options = { url: ENV["REDIS_URL"] }
+end
+```
+
+### Using the Concern
+
+```ruby
+class ChatController < ApplicationController
+  include SemanticCache::Cacheable
+
+  cache_ai_calls only: [:create], ttl: 1.hour
+
+  def create
+    response = SemanticCache.current.fetch(params[:message], model: "gpt-4o") do
+      openai_client.chat(parameters: {
+        model: "gpt-4o",
+        messages: [{ role: "user", content: params[:message] }]
+      })
+    end
+
+    render json: { response: response }
+  end
+end
+```
+
+### Per-User Namespacing
+
+```ruby
+class ApplicationController < ActionController::Base
+  around_action :with_semantic_cache
+
+  private
+
+  def with_semantic_cache
+    SemanticCache.with_cache(namespace: "user_#{current_user.id}") do
+      yield
+    end
+  end
+end
+```
+
+## Demo
+
+Run the built-in demo (no API key needed):
+
+```bash
+ruby examples/demo.rb --simulate
+```
+
+Or with a real API key:
+
+```bash
+OPENAI_API_KEY=sk-... ruby examples/demo.rb
+```
+
+## Development
+
+```bash
+bundle install
+bundle exec rspec
+```
+
+## License
+
+MIT License. See [LICENSE](LICENSE).
data/Rakefile
ADDED
data/lib/semantic_cache/cache.rb
ADDED
@@ -0,0 +1,187 @@
+# frozen_string_literal: true
+
+require "digest"
+
+module SemanticCache
+  class Cache
+    attr_reader :stats
+
+    def initialize(
+      similarity_threshold: nil,
+      embedding_model: nil,
+      store: nil,
+      store_options: {},
+      default_ttl: nil,
+      namespace: nil,
+      track_costs: nil
+    )
+      config = SemanticCache.configuration
+
+      @threshold = similarity_threshold || config.similarity_threshold
+      @default_ttl = default_ttl || config.default_ttl
+      @track_costs = track_costs.nil? ? config.track_costs : track_costs
+      @namespace = namespace || config.namespace
+
+      @embedding = Embedding.new(
+        model: embedding_model || config.embedding_model,
+        api_key: config.openai_api_key
+      )
+
+      @store = build_store(store || config.store, store_options.empty? ? config.store_options : store_options)
+      @stats = Stats.new
+    end
+
+    # Fetch a cached response or execute the block and cache the result.
+    #
+    #   cache.fetch("What is Ruby?") do
+    #     openai.chat(messages: [{ role: "user", content: "What is Ruby?" }])
+    #   end
+    #
+    # Options:
+    #   ttl:   - Time-to-live in seconds (overrides default)
+    #   tags:  - Array of tags for grouped invalidation
+    #   model: - Model name for cost tracking
+    def fetch(query, ttl: nil, tags: [], model: nil, metadata: {}, &block)
+      raise ArgumentError, "A block is required" unless block_given?
+
+      start_time = Time.now
+
+      # Generate embedding for the query
+      query_embedding = @embedding.generate(query)
+
+      # Search for a semantically similar cached entry
+      match = find_similar(query_embedding)
+
+      if match
+        elapsed = ((Time.now - start_time) * 1000).round(2)
+        saved_cost = estimate_cost(model)
+        @stats.record_hit(saved_cost: saved_cost, response_time: elapsed)
+        return match.response
+      end
+
+      # Cache miss — execute the block
+      response = block.call
+
+      elapsed = ((Time.now - start_time) * 1000).round(2)
+      @stats.record_miss(response_time: elapsed)
+
+      # Store the new entry
+      entry = Entry.new(
+        query: query,
+        embedding: query_embedding,
+        response: response,
+        model: model,
+        tags: Array(tags),
+        ttl: ttl || @default_ttl,
+        metadata: metadata
+      )
+
+      key = generate_key(query)
+      @store.write(key, entry)
+
+      response
+    end
+
+    # Convenience methods for specific providers
+
+    def fetch_openai(query, model: "gpt-4o", **options, &block)
+      fetch(query, model: model, **options, &block)
+    end
+
+    def fetch_anthropic(query, model: "claude-sonnet-4-20250514", **options, &block)
+      fetch(query, model: model, **options, &block)
+    end
+
+    def fetch_gemini(query, model: "gemini-pro", **options, &block)
+      fetch(query, model: model, **options, &block)
+    end
+
+    # Invalidate cached entries by tags.
+    #
+    #   cache.invalidate(tags: [:product_info])
+    #   cache.invalidate(tags: "user_data")
+    def invalidate(tags:)
+      @store.invalidate_by_tags(Array(tags))
+    end
+
+    # Clear all cached entries.
+    def clear
+      @store.clear
+      @stats.reset!
+    end
+
+    # Return current cache statistics as a Hash.
+    def current_stats
+      @stats.to_h
+    end
+
+    # Return a formatted stats report string.
+    def detailed_stats
+      @stats.report
+    end
+
+    # Savings report string.
+    def savings_report
+      s = @stats
+      "Total saved: #{format("$%.2f", s.total_savings)} (#{s.hits} cached calls)"
+    end
+
+    # Number of entries currently in the cache.
+    def size
+      @store.size
+    end
+
+    private
+
+    def find_similar(query_embedding)
+      entries = @store.entries
+      return nil if entries.empty?
+
+      best_match = nil
+      best_score = -1.0
+
+      entries.each do |entry|
+        next if entry.expired?
+
+        score = Similarity.cosine(query_embedding, entry.embedding)
+        if score > best_score
+          best_score = score
+          best_match = entry
+        end
+      end
+
+      return nil if best_match.nil? || best_score < @threshold
+
+      best_match
+    end
+
+    def estimate_cost(model)
+      return 0.0 unless @track_costs && model
+
+      costs = SemanticCache.configuration.cost_for(model)
+      # Rough estimate: average request ~500 input tokens, ~200 output tokens
+      ((costs[:input] * 0.5) + (costs[:output] * 0.2)).round(6)
+    end
+
+    def generate_key(query)
+      Digest::SHA256.hexdigest("#{@namespace}:#{query}")[0, 16]
+    end
+
+    def build_store(type, options)
+      case type
+      when :memory, "memory"
+        Stores::Memory.new(**options)
+      when :redis, "redis"
+        Stores::Redis.new(**options)
+      when Stores::Memory, Stores::Redis
+        type # Already instantiated
+      else
+        if type.respond_to?(:write) && type.respond_to?(:entries)
+          type # Duck-typed custom store
+        else
+          raise ConfigurationError, "Unknown store type: #{type}. Use :memory or :redis."
+        end
+      end
+    end
+  end
+end
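The `estimate_cost` heuristic above prices a cache hit at roughly 500 input and 200 output tokens. A quick worked check using the gpt-4o rates from `DEFAULT_MODEL_COSTS` (per 1K tokens):

```ruby
# input: $0.005/1K * 0.5K tokens, plus output: $0.015/1K * 0.2K tokens
(0.005 * 0.5) + (0.015 * 0.2) # => 0.0055, about half a cent saved per hit
```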
data/lib/semantic_cache/client_wrapper.rb
ADDED
@@ -0,0 +1,66 @@
+# frozen_string_literal: true
+
+module SemanticCache
+  # Wraps an OpenAI client (or compatible) to automatically cache chat calls.
+  #
+  #   client = OpenAI::Client.new
+  #   cached = SemanticCache::ClientWrapper.new(client)
+  #
+  #   # All chat calls are automatically cached
+  #   cached.chat(parameters: { model: "gpt-4o", messages: [...] })
+  #
+  # Or use the shorthand:
+  #
+  #   cached = SemanticCache.wrap(client)
+  #
+  class ClientWrapper
+    def initialize(client, cache: nil, **cache_options)
+      @client = client
+      @cache = cache || Cache.new(**cache_options)
+    end
+
+    def chat(parameters: {}, **kwargs)
+      messages = parameters[:messages] || parameters["messages"] || []
+      model = parameters[:model] || parameters["model"]
+
+      # Use the last user message as the cache key
+      user_message = messages.reverse.find { |m| m[:role] == "user" || m["role"] == "user" }
+      query = user_message && (user_message[:content] || user_message["content"])
+
+      if query
+        @cache.fetch(query, model: model) do
+          @client.chat(parameters: parameters, **kwargs)
+        end
+      else
+        @client.chat(parameters: parameters, **kwargs)
+      end
+    end
+
+    # Delegate everything else to the wrapped client
+    def method_missing(method, ...)
+      if @client.respond_to?(method)
+        @client.send(method, ...)
+      else
+        super
+      end
+    end
+
+    def respond_to_missing?(method, include_private = false)
+      @client.respond_to?(method, include_private) || super
+    end
+
+    # Access the underlying cache for stats, invalidation, etc.
+    def semantic_cache
+      @cache
+    end
+  end
+
+  class << self
+    # Convenience: wrap a client with semantic caching.
+    #
+    #   cached_client = SemanticCache.wrap(openai_client)
+    def wrap(client, **options)
+      ClientWrapper.new(client, **options)
+    end
+  end
+end
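`ClientWrapper#chat` keys the cache on the last user message in the conversation. A standalone illustration of that extraction, assuming string-keyed messages (the symbol-keyed form is handled the same way):

```ruby
messages = [
  { "role" => "system",    "content" => "You are terse." },
  { "role" => "user",      "content" => "Hi" },
  { "role" => "assistant", "content" => "Hello." },
  { "role" => "user",      "content" => "What is Ruby?" }
]

user_message = messages.reverse.find { |m| m["role"] == "user" }
user_message["content"] # => "What is Ruby?" (this string becomes the cache query)
```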
data/lib/semantic_cache/configuration.rb
ADDED
@@ -0,0 +1,50 @@
+# frozen_string_literal: true
+
+module SemanticCache
+  class Configuration
+    attr_accessor :similarity_threshold,
+                  :embedding_model,
+                  :openai_api_key,
+                  :default_ttl,
+                  :store,
+                  :store_options,
+                  :track_costs,
+                  :model_costs,
+                  :namespace
+
+    # Cost per 1K tokens (USD)
+    DEFAULT_MODEL_COSTS = {
+      # OpenAI
+      "gpt-4" => { input: 0.03, output: 0.06 },
+      "gpt-4-turbo" => { input: 0.01, output: 0.03 },
+      "gpt-4o" => { input: 0.005, output: 0.015 },
+      "gpt-4o-mini" => { input: 0.00015, output: 0.0006 },
+      "gpt-3.5-turbo" => { input: 0.0005, output: 0.0015 },
+      # Anthropic
+      "claude-sonnet-4-20250514" => { input: 0.003, output: 0.015 },
+      "claude-3-5-haiku-20241022" => { input: 0.001, output: 0.005 },
+      # Gemini
+      "gemini-pro" => { input: 0.0005, output: 0.0015 },
+      "gemini-1.5-pro" => { input: 0.00125, output: 0.005 },
+      # Embedding (cost per 1K tokens)
+      "text-embedding-3-small" => { input: 0.00002, output: 0.0 },
+      "text-embedding-3-large" => { input: 0.00013, output: 0.0 }
+    }.freeze
+
+    def initialize
+      @similarity_threshold = 0.85
+      @embedding_model = "text-embedding-3-small"
+      @openai_api_key = ENV["OPENAI_API_KEY"]
+      @default_ttl = nil # No expiry by default
+      @store = :memory
+      @store_options = {}
+      @track_costs = true
+      @model_costs = DEFAULT_MODEL_COSTS.dup
+      @namespace = "semantic_cache"
+    end
+
+    def cost_for(model)
+      model_costs[model] || { input: 0.001, output: 0.002 }
+    end
+  end
+end
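`cost_for` falls back to a generic rate for unknown models. A short usage sketch; the values are the ones hard-coded above:

```ruby
config = SemanticCache::Configuration.new
config.cost_for("gpt-4o")         # => { input: 0.005, output: 0.015 }
config.cost_for("some-new-model") # => { input: 0.001, output: 0.002 } (fallback)
```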