@betterdb/semantic-cache 0.1.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +233 -124
- package/dist/SemanticCache.d.ts +127 -7
- package/dist/SemanticCache.js +867 -48
- package/dist/adapters/ai.js +6 -1
- package/dist/adapters/anthropic.d.ts +32 -0
- package/dist/adapters/anthropic.js +94 -0
- package/dist/adapters/langchain.js +6 -1
- package/dist/adapters/langgraph.d.ts +104 -0
- package/dist/adapters/langgraph.js +271 -0
- package/dist/adapters/llamaindex.d.ts +32 -0
- package/dist/adapters/llamaindex.js +76 -0
- package/dist/adapters/openai-responses.d.ts +31 -0
- package/dist/adapters/openai-responses.js +112 -0
- package/dist/adapters/openai.d.ts +42 -0
- package/dist/adapters/openai.js +97 -0
- package/dist/analytics.d.ts +24 -0
- package/dist/analytics.js +116 -0
- package/dist/cluster.d.ts +10 -0
- package/dist/cluster.js +43 -0
- package/dist/defaultCostTable.d.ts +11 -0
- package/dist/defaultCostTable.js +1976 -0
- package/dist/discovery.d.ts +67 -0
- package/dist/discovery.js +140 -0
- package/dist/embed/bedrock.d.ts +32 -0
- package/dist/embed/bedrock.js +109 -0
- package/dist/embed/cohere.d.ts +34 -0
- package/dist/embed/cohere.js +37 -0
- package/dist/embed/ollama.d.ts +30 -0
- package/dist/embed/ollama.js +24 -0
- package/dist/embed/openai.d.ts +31 -0
- package/dist/embed/openai.js +66 -0
- package/dist/embed/voyage.d.ts +31 -0
- package/dist/embed/voyage.js +32 -0
- package/dist/index.d.ts +8 -1
- package/dist/index.js +13 -1
- package/dist/normalizer.d.ts +68 -0
- package/dist/normalizer.js +102 -0
- package/dist/telemetry.d.ts +5 -0
- package/dist/telemetry.js +30 -0
- package/dist/types.d.ts +128 -7
- package/dist/utils.d.ts +58 -0
- package/dist/utils.js +30 -0
- package/package.json +81 -6
package/README.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# @betterdb/semantic-cache
|
|
2
2
|
|
|
3
|
-
A standalone, framework-agnostic semantic cache for LLM applications backed by [Valkey](https://valkey.io/)
|
|
3
|
+
A standalone, framework-agnostic semantic cache for LLM applications backed by [Valkey](https://valkey.io/). Uses Valkey's vector search (`valkey-search` module) for similarity matching with built-in [OpenTelemetry](https://opentelemetry.io/) tracing and [Prometheus](https://prometheus.io/) metrics. Full adapter parity with [`@betterdb/agent-cache`](../agent-cache/).
|
|
4
4
|
|
|
5
5
|
## Prerequisites
|
|
6
6
|
|
|
@@ -12,153 +12,172 @@ A standalone, framework-agnostic semantic cache for LLM applications backed by [
|
|
|
12
12
|
## Installation
|
|
13
13
|
|
|
14
14
|
```bash
|
|
15
|
-
npm install @betterdb/semantic-cache
|
|
15
|
+
npm install @betterdb/semantic-cache iovalkey
|
|
16
16
|
```
|
|
17
17
|
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
```bash
|
|
21
|
-
npm install iovalkey
|
|
22
|
-
```
|
|
18
|
+
`iovalkey` is a required peer dependency.
|
|
23
19
|
|
|
24
20
|
## Why @betterdb/semantic-cache
|
|
25
21
|
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
| Library / Service | Valkey-native | Standalone | Built-in OTel + Prometheus |
|
|
29
|
-
|---|---|---|---|
|
|
30
|
-
| **@betterdb/semantic-cache** | ✅ | ✅ | ✅ |
|
|
31
|
-
| RedisVL `SemanticCache` | ❌ Redis only | ✅ | ❌ |
|
|
32
|
-
| LangChain `RedisSemanticCache` | ❌ Redis only | ❌ Requires LangChain | ❌ |
|
|
33
|
-
| LiteLLM `redis-semantic` | ❌ Redis only | ❌ Requires LiteLLM | ❌ Partial (no cache metrics) |
|
|
34
|
-
| `langgraph-checkpoint-aws` `ValkeyCache` | ✅ | ❌ Requires AWS + LangGraph | ❌ |
|
|
35
|
-
| Mem0 + Valkey | ✅ | ❌ Full memory framework | ❌ |
|
|
36
|
-
| Redis LangCache | ❌ Redis Cloud only | ❌ Managed service | ✅ Dashboard only |
|
|
37
|
-
| Upstash `semantic-cache` | ❌ Upstash Vector only | ✅ | ❌ |
|
|
38
|
-
| GPTCache | ❌ Abandoned (2023) | ✅ | ❌ |
|
|
39
|
-
|
|
40
|
-
- **Valkey-native**: `valkey-search` has API differences from Redis's RediSearch that require explicit handling (see [Valkey Search 1.2 compatibility notes](#valkey-search-12-compatibility-notes) in the changelog). Libraries targeting Redis are not guaranteed to work correctly against self-hosted Valkey or managed Valkey services (ElastiCache, Memorystore).
|
|
41
|
-
- **Standalone**: no dependency on a specific AI framework means you can use this with any LLM client — OpenAI SDK, Anthropic SDK, a local model, or a custom inference endpoint — and swap it out without changing your cache layer.
|
|
42
|
-
- **Built-in OTel + Prometheus**: every `check()` and `store()` call emits a span and increments counters. You get hit rate, similarity score distribution, and latency percentiles in Grafana or any OTel-compatible backend without writing any instrumentation code. If you use [BetterDB Monitor](https://betterdb.com), these metrics are surfaced automatically alongside your other Valkey observability data.
|
|
22
|
+
The only semantic cache library that is simultaneously Valkey-native (explicit handling of `valkey-search` API differences), standalone (no coupling to any AI framework), and instrumented out of the box (built-in OpenTelemetry + Prometheus instrumentation at the cache operation level).
|
|
43
23
|
|
|
44
24
|
## Quick Start
|
|
45
25
|
|
|
46
26
|
```typescript
|
|
47
27
|
import Valkey from 'iovalkey';
|
|
48
28
|
import { SemanticCache } from '@betterdb/semantic-cache';
|
|
29
|
+
import { createOpenAIEmbed } from '@betterdb/semantic-cache/embed/openai';
|
|
49
30
|
|
|
50
31
|
const client = new Valkey({ host: 'localhost', port: 6399 });
|
|
51
32
|
|
|
52
33
|
const cache = new SemanticCache({
|
|
53
34
|
client,
|
|
54
|
-
embedFn:
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
method: 'POST',
|
|
58
|
-
headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${process.env.VOYAGE_API_KEY}` },
|
|
59
|
-
body: JSON.stringify({ model: 'voyage-3-lite', input: [text] }),
|
|
60
|
-
});
|
|
61
|
-
const json = await res.json();
|
|
62
|
-
return json.data[0].embedding;
|
|
63
|
-
},
|
|
35
|
+
embedFn: createOpenAIEmbed(), // or createVoyageEmbed(), createOllamaEmbed(), etc.
|
|
36
|
+
defaultThreshold: 0.15, // loosen slightly to catch paraphrases with high confidence
|
|
37
|
+
defaultTtl: 3600,
|
|
64
38
|
});
|
|
65
39
|
|
|
66
40
|
await cache.initialize();
|
|
67
41
|
|
|
68
|
-
// Store
|
|
69
|
-
await cache.store('What is the capital of France?', 'Paris'
|
|
42
|
+
// Store with cost tracking
|
|
43
|
+
await cache.store('What is the capital of France?', 'Paris', {
|
|
44
|
+
model: 'gpt-4o-mini',
|
|
45
|
+
inputTokens: 20,
|
|
46
|
+
outputTokens: 5,
|
|
47
|
+
});
|
|
70
48
|
|
|
71
|
-
//
|
|
72
|
-
const
|
|
73
|
-
//
|
|
49
|
+
// Exact match - always high confidence
|
|
50
|
+
const exact = await cache.check('What is the capital of France?');
|
|
51
|
+
// exact.hit === true
|
|
52
|
+
// exact.confidence === 'high'
|
|
53
|
+
// exact.similarity === 0.0000
|
|
54
|
+
// exact.costSaved ~= 0.000006   // 20 input + 5 output tokens at gpt-4o-mini rates
|
|
55
|
+
|
|
56
|
+
// Paraphrase - typically 'uncertain' at threshold 0.1, 'high' at threshold 0.15
|
|
57
|
+
const paraphrase = await cache.check('What city is the capital of France?');
|
|
58
|
+
// paraphrase.hit === true
|
|
59
|
+
// paraphrase.confidence === 'high' // at threshold 0.15
|
|
60
|
+
// paraphrase.similarity ~= 0.087 // observed with text-embedding-3-small
|
|
61
|
+
// paraphrase.costSaved ~= 0.000006   // 20 input + 5 output tokens at gpt-4o-mini rates
|
|
74
62
|
```
|
|
75
63
|
|
|
76
|
-
##
|
|
64
|
+
## Threshold and Confidence
|
|
77
65
|
|
|
78
|
-
|
|
66
|
+
This library uses **cosine distance** (0-2 scale, lower = more similar):
|
|
79
67
|
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
68
|
+
| Distance | Meaning |
|
|
69
|
+
|----------|---------|
|
|
70
|
+
| 0.00 | Identical vectors |
|
|
71
|
+
| 0.05-0.10 | Strong paraphrase |
|
|
72
|
+
| 0.10-0.20 | Loose paraphrase / related topic |
|
|
73
|
+
| 1.00 | Orthogonal (unrelated) |
|
|
83
74
|
|
|
84
|
-
|
|
75
|
+
A lookup is a **hit** when `score <= threshold`. The default threshold is `0.1`.
|
|
85
76
|
|
|
86
|
-
|
|
87
|
-
await client.quit();
|
|
88
|
-
```
|
|
77
|
+
### Confidence levels
|
|
89
78
|
|
|
90
|
-
|
|
79
|
+
| `confidence` | When | What to do |
|
|
80
|
+
|---|---|---|
|
|
81
|
+
| `high` | `score <= threshold - uncertaintyBand` (e.g. `<= 0.05`) | Return the cached response directly |
|
|
82
|
+
| `uncertain` | `threshold - uncertaintyBand < score <= threshold` (e.g. `0.05–0.10`) | Return the response but consider flagging for review |
|
|
83
|
+
| `miss` | `score > threshold` | No hit - call the LLM |
|
|
91
84
|
|
|
92
|
-
|
|
85
|
+
**With real embeddings (`text-embedding-3-small`):**
|
|
86
|
+
- Exact same phrasing: `~0.000` - always `high`
|
|
87
|
+
- Close paraphrase ("Which city is the capital of France?"): `~0.08–0.09` - `uncertain` at default `0.1` threshold, `high` at `0.15`
|
|
88
|
+
- Loose paraphrase ("France's capital?"): `~0.10–0.15` - typically `miss` at `0.1`, `uncertain` at `0.15`
|
|
93
89
|
|
|
94
|
-
|
|
95
|
-
|----------|---------|
|
|
96
|
-
| 0 | Identical vectors |
|
|
97
|
-
| 1 | Orthogonal (unrelated) |
|
|
98
|
-
| 2 | Opposite vectors |
|
|
90
|
+
**Recommended thresholds by use case:**
|
|
99
91
|
|
|
100
|
-
|
|
92
|
+
| Use case | Threshold | Notes |
|
|
93
|
+
|---|---|---|
|
|
94
|
+
| FAQ / exact match only | `0.05` | Very strict, near-zero false positives |
|
|
95
|
+
| Standard Q&A | `0.10` | Default - paraphrases land as `uncertain` |
|
|
96
|
+
| Conversational / RAG | `0.15` | Paraphrases hit as `high` confidence |
|
|
97
|
+
| Broad search / recall | `0.20` | High hit rate, review uncertain hits |
|
|
101
98
|
|
|
102
|
-
|
|
99
|
+
## Configuration Reference
|
|
103
100
|
|
|
104
|
-
|
|
101
|
+
| Option | Type | Default | Description |
|
|
102
|
+
|--------|------|---------|-------------|
|
|
103
|
+
| `name` | `string` | `'betterdb_scache'` | Key prefix |
|
|
104
|
+
| `client` | `Valkey` | - | iovalkey client (required) |
|
|
105
|
+
| `embedFn` | `EmbedFn` | - | Embedding function (required) |
|
|
106
|
+
| `defaultThreshold` | `number` | `0.1` | Cosine distance threshold (0-2) |
|
|
107
|
+
| `defaultTtl` | `number` | `undefined` | Default TTL in seconds |
|
|
108
|
+
| `categoryThresholds` | `Record<string, number>` | `{}` | Per-category threshold overrides |
|
|
109
|
+
| `uncertaintyBand` | `number` | `0.05` | Width of uncertainty band below threshold |
|
|
110
|
+
| `costTable` | `Record<string, ModelCost>` | `undefined` | Per-model pricing overrides |
|
|
111
|
+
| `useDefaultCostTable` | `boolean` | `true` | Use bundled LiteLLM price table (1,971 models) |
|
|
112
|
+
| `normalizer` | `BinaryNormalizer` | `defaultNormalizer` | Binary content normalizer |
|
|
113
|
+
| `embeddingCache.enabled` | `boolean` | `true` | Cache computed embeddings in Valkey |
|
|
114
|
+
| `embeddingCache.ttl` | `number` | `86400` | Embedding cache TTL (seconds) |
|
|
115
|
+
| `telemetry.tracerName` | `string` | `'@betterdb/semantic-cache'` | OTel tracer name |
|
|
116
|
+
| `telemetry.metricsPrefix` | `string` | `'semantic_cache'` | Prometheus prefix |
|
|
117
|
+
| `telemetry.registry` | `Registry` | default | prom-client Registry |
|
|
105
118
|
|
|
106
|
-
|
|
107
|
-
the similarity threshold but close to the boundary. Three common patterns:
|
|
119
|
+
## Cost Tracking
|
|
108
120
|
|
|
109
|
-
|
|
110
|
-
separately via the `result: 'uncertain_hit'` Prometheus label. Review them
|
|
111
|
-
periodically to decide if the threshold needs adjustment.
|
|
121
|
+
Pass the model name and token counts at store time to get per-hit cost savings:
|
|
112
122
|
|
|
113
|
-
|
|
114
|
-
|
|
123
|
+
```typescript
|
|
124
|
+
await cache.store('What is the capital of France?', 'Paris', {
|
|
125
|
+
model: 'claude-haiku-4-5', // looked up in bundled LiteLLM price table
|
|
126
|
+
inputTokens: 42,
|
|
127
|
+
outputTokens: 12,
|
|
128
|
+
});
|
|
115
129
|
|
|
116
|
-
|
|
117
|
-
|
|
130
|
+
const result = await cache.check('Capital of France?');
|
|
131
|
+
console.log(result.costSaved); // e.g. 0.000064 (dollars saved on this hit)
|
|
118
132
|
|
|
119
|
-
|
|
120
|
-
|
|
133
|
+
const stats = await cache.stats();
|
|
134
|
+
console.log(stats.costSavedMicros); // cumulative microdollars saved
|
|
135
|
+
```
|
|
121
136
|
|
|
122
|
-
|
|
137
|
+
Cost savings scale with the model. Observed values from live examples:
|
|
138
|
+
- `gpt-4o-mini`: ~`$0.000006` per hit (cheap model, short responses)
|
|
139
|
+
- `claude-haiku-4-5`: ~`$0.000064` per hit (~10x more expensive)
|
|
140
|
+
- `gpt-4o`: ~`$0.000100` per hit at 20 input / 5 output tokens
|
|
123
141
|
|
|
124
|
-
|
|
125
|
-
|--------|------|---------|-------------|
|
|
126
|
-
| `name` | `string` | `'betterdb_scache'` | Index name prefix for Valkey keys |
|
|
127
|
-
| `client` | `Valkey` | — | iovalkey client instance (required) |
|
|
128
|
-
| `embedFn` | `(text: string) => Promise<number[]>` | — | Embedding function (required) |
|
|
129
|
-
| `defaultThreshold` | `number` | `0.1` | Cosine distance threshold (0–2) |
|
|
130
|
-
| `defaultTtl` | `number` | `undefined` | Default TTL in seconds for entries |
|
|
131
|
-
| `categoryThresholds` | `Record<string, number>` | `{}` | Per-category threshold overrides |
|
|
132
|
-
| `uncertaintyBand` | `number` | `0.05` | Width of the uncertainty band below threshold |
|
|
133
|
-
| `telemetry.tracerName` | `string` | `'@betterdb/semantic-cache'` | OpenTelemetry tracer name |
|
|
134
|
-
| `telemetry.metricsPrefix` | `string` | `'semantic_cache'` | Prometheus metric name prefix |
|
|
135
|
-
| `telemetry.registry` | `Registry` | default registry | prom-client Registry for metrics |
|
|
142
|
+
## Adapters
|
|
136
143
|
|
|
137
|
-
|
|
144
|
+
| Import | Class/Function | Description |
|
|
145
|
+
|---|---|---|
|
|
146
|
+
| `@betterdb/semantic-cache/langchain` | `BetterDBSemanticCache` | LangChain `BaseCache` |
|
|
147
|
+
| `@betterdb/semantic-cache/ai` | `createSemanticCacheMiddleware` | Vercel AI SDK middleware |
|
|
148
|
+
| `@betterdb/semantic-cache/openai` | `prepareSemanticParams` | OpenAI Chat Completions |
|
|
149
|
+
| `@betterdb/semantic-cache/openai-responses` | `prepareSemanticParams` | OpenAI Responses API |
|
|
150
|
+
| `@betterdb/semantic-cache/anthropic` | `prepareSemanticParams` | Anthropic Messages API |
|
|
151
|
+
| `@betterdb/semantic-cache/llamaindex` | `prepareSemanticParams` | LlamaIndex ChatMessage[] |
|
|
152
|
+
| `@betterdb/semantic-cache/langgraph` | `BetterDBSemanticStore` | LangGraph BaseStore |
|
|
138
153
|
|
|
139
|
-
|
|
154
|
+
## Embedding Helpers
|
|
140
155
|
|
|
141
|
-
|
|
156
|
+
| Import | Default model | Dimensions |
|
|
157
|
+
|---|---|---|
|
|
158
|
+
| `@betterdb/semantic-cache/embed/openai` | `text-embedding-3-small` | 1536 |
|
|
159
|
+
| `@betterdb/semantic-cache/embed/bedrock` | `amazon.titan-embed-text-v2:0` | 1024 |
|
|
160
|
+
| `@betterdb/semantic-cache/embed/voyage` | `voyage-3-lite` | 512 |
|
|
161
|
+
| `@betterdb/semantic-cache/embed/cohere` | `embed-english-v3.0` | 1024 |
|
|
162
|
+
| `@betterdb/semantic-cache/embed/ollama` | `nomic-embed-text` | 768 |
|
|
142
163
|
|
|
143
|
-
|
|
144
|
-
|--------|------|--------|-------------|
|
|
145
|
-
| `semantic_cache_requests_total` | Counter | `cache_name`, `result`, `category` | Total cache requests. `result` is `hit`, `miss`, or `uncertain_hit` |
|
|
146
|
-
| `semantic_cache_similarity_score` | Histogram | `cache_name`, `category` | Cosine distance scores for lookups with candidates |
|
|
147
|
-
| `semantic_cache_operation_duration_seconds` | Histogram | `cache_name`, `operation` | Duration of cache operations (`check`, `store`, `invalidate`, `initialize`) |
|
|
148
|
-
| `semantic_cache_embedding_duration_seconds` | Histogram | `cache_name` | Duration of embedding function calls |
|
|
164
|
+
### Discovery markers
|
|
149
165
|
|
|
150
|
-
|
|
166
|
+
Starting in `0.2.0`, `initialize()` writes a small advisory record to a shared `__betterdb:caches` hash on the Valkey instance so Monitor (and other tooling) can enumerate caches without configuration. A 60s-TTL heartbeat key is refreshed every 30s; `flush()` and `dispose()` remove the heartbeat immediately. No sensitive data is ever written — only cache metadata (type, prefix, version, capabilities, configured thresholds).
|
|
151
167
|
|
|
152
|
-
|
|
168
|
+
Opt out by passing `discovery: { enabled: false }`. See `SemanticCacheOptions.discovery` for the full set of knobs.
|
|
153
169
|
|
|
154
|
-
|
|
170
|
+
If your Valkey runs with ACLs, grant the library's user access to the `__betterdb:*` prefix:
|
|
155
171
|
|
|
156
|
-
|
|
172
|
+
```
|
|
173
|
+
ACL SETUSER <user> +@write +@read ~__betterdb:* ~<your-cache-prefix>:*
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
Discovery writes are best-effort — if the ACL denies them, the cache still functions and the `semantic_cache_discovery_write_failed_total` counter increments so operators can alert.
|
|
157
177
|
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
- Cost savings estimates based on cache hit rates
|
|
178
|
+
### `cache.dispose()`
|
|
179
|
+
|
|
180
|
+
Graceful shutdown: stops the heartbeat and deletes this instance's heartbeat key so Monitor marks the cache offline immediately. Does not drop the index or delete entries. Call from your SIGTERM handler alongside `client.quit()`.
|
|
162
181
|
|
|
163
182
|
## API
|
|
164
183
|
|
|
@@ -168,55 +187,145 @@ Creates or reconnects to the Valkey search index. Must be called before `check()
|
|
|
168
187
|
|
|
169
188
|
### `cache.check(prompt, options?)`
|
|
170
189
|
|
|
171
|
-
|
|
190
|
+
`prompt` is `string | ContentBlock[]`. Returns `CacheCheckResult`:
|
|
191
|
+
|
|
192
|
+
| Field | Description |
|
|
193
|
+
|---|---|
|
|
194
|
+
| `hit` | Whether the nearest neighbour's distance was `<= threshold` |
|
|
195
|
+
| `response` | Cached response text. Present on hit |
|
|
196
|
+
| `similarity` | Cosine distance (0-2, lower = more similar) of the nearest neighbour. Present when a candidate was found |
|
|
197
|
+
| `confidence` | `'high'` / `'uncertain'` / `'miss'` |
|
|
198
|
+
| `costSaved` | Dollars saved on this hit. Present when cost was recorded at store time |
|
|
199
|
+
| `contentBlocks` | Structured response blocks. Present when stored via `storeMultipart()` |
|
|
200
|
+
| `nearestMiss` | On miss with a candidate: `{ similarity, deltaToThreshold }` |
|
|
201
|
+
|
|
202
|
+
**Options:** `threshold`, `category`, `filter`, `k`, `staleAfterModelChange`, `currentModel`, `rerank`
|
|
172
203
|
|
|
173
204
|
### `cache.store(prompt, response, options?)`
|
|
174
205
|
|
|
175
|
-
|
|
206
|
+
`prompt` is `string | ContentBlock[]`. Returns the Valkey key.
|
|
207
|
+
|
|
208
|
+
**Options:** `ttl`, `category`, `model`, `metadata`, `inputTokens`, `outputTokens`, `temperature`, `topP`, `seed`
|
|
209
|
+
|
|
210
|
+
### `cache.storeMultipart(prompt, blocks, options?)`
|
|
211
|
+
|
|
212
|
+
Stores structured `ContentBlock[]` as the response. On hit, `check()` returns `contentBlocks`.
|
|
213
|
+
|
|
214
|
+
### `cache.checkBatch(prompts[], options?)`
|
|
215
|
+
|
|
216
|
+
Pipelined multi-prompt lookups. ~50-70% faster than sequential `check()` calls. Returns results in input order.
|
|
176
217
|
|
|
177
218
|
### `cache.invalidate(filter)`
|
|
178
219
|
|
|
179
|
-
|
|
220
|
+
Delete entries matching a `valkey-search` filter (e.g. `'@model:{gpt-4o}'`).
|
|
221
|
+
|
|
222
|
+
### `cache.invalidateByModel(model)` / `cache.invalidateByCategory(category)`
|
|
223
|
+
|
|
224
|
+
Convenience wrappers around `invalidate()`.
|
|
180
225
|
|
|
181
226
|
### `cache.stats()`
|
|
182
227
|
|
|
183
|
-
Returns `{ hits, misses, total, hitRate }
|
|
228
|
+
Returns `{ hits, misses, total, hitRate, costSavedMicros }`.
|
|
184
229
|
|
|
185
230
|
### `cache.indexInfo()`
|
|
186
231
|
|
|
187
|
-
Returns
|
|
232
|
+
Returns `{ name, numDocs, dimension, indexingState }`.
|
|
188
233
|
|
|
189
234
|
### `cache.flush()`
|
|
190
235
|
|
|
191
|
-
Drops the index and all entries. Call `initialize()` again to rebuild.
|
|
236
|
+
Drops the index and all entries. Call `initialize()` again to rebuild. Also stops the discovery heartbeat and deletes its heartbeat key, but preserves the registry entry in `__betterdb:caches` so Monitor retains history.
|
|
192
237
|
|
|
193
|
-
|
|
238
|
+
### `cache.shutdown()`
|
|
194
239
|
|
|
195
|
-
|
|
240
|
+
Stops the analytics client, cancels the stats snapshot timer, and disposes the discovery heartbeat. Safe to call multiple times.
|
|
196
241
|
|
|
197
|
-
|
|
198
|
-
single-endpoint services (Amazon ElastiCache for Valkey, Google Cloud Memorystore
|
|
199
|
-
for Valkey). It does not fully support Valkey in cluster mode.
|
|
242
|
+
### `cache.dispose()`
|
|
200
243
|
|
|
201
|
-
|
|
202
|
-
but `SCAN` in cluster mode only iterates keys on the node it is sent to. In a
|
|
203
|
-
multi-node cluster, `flush()` will silently leave entry keys on other nodes
|
|
204
|
-
(the FT index itself is dropped correctly).
|
|
244
|
+
Graceful shutdown of the discovery layer for in-process caches without destroying data. Stops the discovery heartbeat and deletes the heartbeat key; does not touch the index or entries.
|
|
205
245
|
|
|
206
|
-
|
|
207
|
-
`FT.SEARCH`, `HSET`, `DEL`, and `HINCRBY` which route correctly in cluster mode
|
|
208
|
-
via the key hash slot.
|
|
246
|
+
### `cache.thresholdEffectiveness(options?)`
|
|
209
247
|
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
248
|
+
Analyzes the rolling similarity score window (last 10,000 entries, up to 7 days) and returns:
|
|
249
|
+
|
|
250
|
+
```typescript
|
|
251
|
+
{
|
|
252
|
+
recommendation: 'tighten_threshold' | 'loosen_threshold' | 'optimal' | 'insufficient_data',
|
|
253
|
+
recommendedThreshold?: number, // present when recommendation is tighten/loosen
|
|
254
|
+
reasoning: string, // human-readable explanation
|
|
255
|
+
hitRate: number,
|
|
256
|
+
uncertainHitRate: number, // >20% triggers tighten recommendation
|
|
257
|
+
nearMissRate: number, // >30% with avg delta <0.03 triggers loosen
|
|
258
|
+
// ...
|
|
259
|
+
}
|
|
260
|
+
```
|
|
261
|
+
|
|
262
|
+
### `cache.thresholdEffectivenessAll(options?)`
|
|
263
|
+
|
|
264
|
+
Returns one result per category seen in the window, plus one aggregate `'all'` result.
|
|
265
|
+
|
|
266
|
+
## Observability
|
|
267
|
+
|
|
268
|
+
### Prometheus Metrics
|
|
269
|
+
|
|
270
|
+
| Metric | Type | Labels | Description |
|
|
271
|
+
|--------|------|--------|-------------|
|
|
272
|
+
| `{prefix}_requests_total` | Counter | `cache_name`, `result`, `category` | `result`: `hit`, `miss`, `uncertain_hit` |
|
|
273
|
+
| `{prefix}_similarity_score` | Histogram | `cache_name`, `category` | Cosine distance per lookup |
|
|
274
|
+
| `{prefix}_operation_duration_seconds` | Histogram | `cache_name`, `operation` | End-to-end latency |
|
|
275
|
+
| `{prefix}_embedding_duration_seconds` | Histogram | `cache_name` | Time in `embedFn` |
|
|
276
|
+
| `{prefix}_cost_saved_total` | Counter | `cache_name`, `category` | Dollars saved from hits |
|
|
277
|
+
| `{prefix}_embedding_cache_total` | Counter | `cache_name`, `result` | Embedding cache hit/miss |
|
|
278
|
+
| `{prefix}_stale_model_evictions_total` | Counter | `cache_name` | Evictions from `staleAfterModelChange` |
|
|
279
|
+
|
|
280
|
+
### OpenTelemetry
|
|
281
|
+
|
|
282
|
+
Every public method emits an OTel span. Requires an OpenTelemetry SDK in the host application.
|
|
283
|
+
|
|
284
|
+
## Examples
|
|
285
|
+
|
|
286
|
+
Runnable examples in [examples/](./examples/). All examples connect to `localhost:6399` by default (override via `VALKEY_HOST` / `VALKEY_PORT`).
|
|
287
|
+
|
|
288
|
+
| Example | API key needed | What it shows |
|
|
289
|
+
|---|---|---|
|
|
290
|
+
| `basic/` | Voyage AI (or `--mock`) | Core store/check/invalidate |
|
|
291
|
+
| `openai/` | OpenAI | Chat Completions + cost tracking |
|
|
292
|
+
| `openai-responses/` | OpenAI | Responses API adapter |
|
|
293
|
+
| `anthropic/` | Anthropic + OpenAI | Messages API, high cost savings (~$0.000064/hit) |
|
|
294
|
+
| `llamaindex/` | OpenAI | ChatMessage[] adapter |
|
|
295
|
+
| `langchain/` | OpenAI | BetterDBSemanticCache + ChatOpenAI |
|
|
296
|
+
| `vercel-ai-sdk/` | OpenAI | createSemanticCacheMiddleware |
|
|
297
|
+
| `langgraph/` | None | BetterDBSemanticStore memory |
|
|
298
|
+
| `multimodal/` | None | ContentBlock[] with text + image |
|
|
299
|
+
| `cost-tracking/` | None | Cost savings with mock embedder |
|
|
300
|
+
| `threshold-tuning/` | None | thresholdEffectiveness() |
|
|
301
|
+
| `embedding-cache/` | None | Embedding cache on/off comparison |
|
|
302
|
+
| `batch-check/` | None | checkBatch() vs sequential |
|
|
303
|
+
| `rerank/` | None | Top-k rerank hook |
|
|
304
|
+
|
|
305
|
+
## Client Lifecycle
|
|
306
|
+
|
|
307
|
+
SemanticCache does **not** own the iovalkey client — you create it, and you are responsible for closing it when you are done with the cache:
|
|
308
|
+
|
|
309
|
+
```typescript
|
|
310
|
+
const client = new Valkey({ host: 'localhost', port: 6399 });
|
|
311
|
+
const cache = new SemanticCache({ client, embedFn });
|
|
312
|
+
// ... use cache ...
|
|
313
|
+
await client.quit();
|
|
314
|
+
```
|
|
315
|
+
|
|
316
|
+
## Known Limitations
|
|
317
|
+
|
|
318
|
+
### Cluster mode
|
|
319
|
+
|
|
320
|
+
`flush()` fans out via `clusterScan()` across all master nodes. `FT.SEARCH` routes correctly via hash slots. `FT.CREATE` only creates the index on the receiving node - in a full cluster, create the index on each node separately.
|
|
213
321
|
|
|
214
322
|
### Streaming
|
|
215
323
|
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
324
|
+
`store()` requires a complete response string. The Vercel AI SDK adapter does not implement `wrapStream`. Accumulate the full streamed response before calling `store()`.
|
|
325
|
+
|
|
326
|
+
### Schema migration (v0.1 -> v0.2)
|
|
327
|
+
|
|
328
|
+
v0.2.0 added `binary_refs`, `temperature`, `top_p`, `seed` fields to the index schema. Existing v0.1.0 indexes operate in text-only mode until `flush()` + `initialize()` rebuilds the schema.
|
|
220
329
|
|
|
221
330
|
## License
|
|
222
331
|
|
package/dist/SemanticCache.d.ts
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import type { SemanticCacheOptions, CacheCheckOptions, CacheStoreOptions, CacheCheckResult, CacheStats, IndexInfo, InvalidateResult } from './types';
|
|
2
|
+
import { type ContentBlock } from './utils';
|
|
2
3
|
export declare class SemanticCache {
|
|
3
4
|
private readonly client;
|
|
4
5
|
private readonly embedFn;
|
|
@@ -6,15 +7,34 @@ export declare class SemanticCache {
|
|
|
6
7
|
private readonly indexName;
|
|
7
8
|
private readonly entryPrefix;
|
|
8
9
|
private readonly statsKey;
|
|
9
|
-
private readonly
|
|
10
|
+
private readonly similarityWindowKey;
|
|
11
|
+
private readonly configKey;
|
|
12
|
+
private defaultThreshold;
|
|
10
13
|
private readonly defaultTtl;
|
|
11
|
-
private
|
|
14
|
+
private categoryThresholds;
|
|
12
15
|
private readonly uncertaintyBand;
|
|
13
16
|
private readonly telemetry;
|
|
17
|
+
private readonly costTable;
|
|
18
|
+
private readonly embeddingCacheEnabled;
|
|
19
|
+
private readonly embeddingCacheTtl;
|
|
20
|
+
private readonly embedKeyPrefix;
|
|
21
|
+
private readonly discoveryOptions;
|
|
22
|
+
private readonly _initialDefaultThreshold;
|
|
23
|
+
private readonly _initialCategoryThresholds;
|
|
24
|
+
private readonly configRefreshOptions;
|
|
25
|
+
private configRefreshTimer;
|
|
26
|
+
private discovery;
|
|
14
27
|
private _initialized;
|
|
15
28
|
private _dimension;
|
|
29
|
+
private _hasBinaryRefs;
|
|
16
30
|
private _initPromise;
|
|
17
31
|
private _initGeneration;
|
|
32
|
+
private readonly analyticsOpts;
|
|
33
|
+
private readonly usesDefaultCostTable;
|
|
34
|
+
private analytics;
|
|
35
|
+
private statsTimer;
|
|
36
|
+
private shutdownCalled;
|
|
37
|
+
private analyticsInitiated;
|
|
18
38
|
/**
|
|
19
39
|
* Creates a new SemanticCache instance.
|
|
20
40
|
*
|
|
@@ -27,33 +47,133 @@ export declare class SemanticCache {
|
|
|
27
47
|
constructor(options: SemanticCacheOptions);
|
|
28
48
|
initialize(): Promise<void>;
|
|
29
49
|
flush(): Promise<void>;
|
|
30
|
-
|
|
31
|
-
|
|
50
|
+
/**
|
|
51
|
+
* Shut down the analytics client, cancel the stats timer, and stop the
|
|
52
|
+
* discovery heartbeat. Safe to call multiple times.
|
|
53
|
+
*/
|
|
54
|
+
shutdown(): Promise<void>;
|
|
55
|
+
/**
|
|
56
|
+
* Graceful shutdown of the discovery layer — stops the heartbeat and
|
|
57
|
+
* deletes this instance's heartbeat key so Monitor marks the cache offline
|
|
58
|
+
* immediately. Does NOT touch the registry hash, the FT index, or any
|
|
59
|
+
* entries. Safe to call multiple times.
|
|
60
|
+
*/
|
|
61
|
+
dispose(): Promise<void>;
|
|
62
|
+
check(prompt: string | ContentBlock[], options?: CacheCheckOptions): Promise<CacheCheckResult>;
|
|
63
|
+
store(prompt: string | ContentBlock[], response: string, options?: CacheStoreOptions): Promise<string>;
|
|
64
|
+
/**
|
|
65
|
+
* Store structured content blocks as the cached response.
|
|
66
|
+
* Populates both the response field (from TextBlock text) and content_blocks (full JSON).
|
|
67
|
+
*/
|
|
68
|
+
storeMultipart(prompt: string | ContentBlock[], blocks: ContentBlock[], options?: CacheStoreOptions): Promise<string>;
|
|
69
|
+
/**
|
|
70
|
+
* Check multiple prompts in parallel, using pipelined FT.SEARCH calls.
|
|
71
|
+
* Returns results in input order.
|
|
72
|
+
*/
|
|
73
|
+
checkBatch(prompts: (string | ContentBlock[])[], options?: CacheCheckOptions): Promise<CacheCheckResult[]>;
|
|
32
74
|
/**
|
|
33
75
|
* Deletes all entries matching a valkey-search filter expression.
|
|
34
76
|
*
|
|
35
77
|
* **Security note:** `filter` is passed directly to FT.SEARCH. Only pass
|
|
36
|
-
* trusted, programmatically-constructed expressions
|
|
78
|
+
* trusted, programmatically-constructed expressions - never unsanitised
|
|
37
79
|
* user input.
|
|
38
80
|
*/
|
|
39
81
|
invalidate(filter: string): Promise<InvalidateResult>;
|
|
82
|
+
/** Delete all entries tagged with the given model name. */
|
|
83
|
+
invalidateByModel(model: string): Promise<number>;
|
|
84
|
+
/** Delete all entries tagged with the given category. */
|
|
85
|
+
invalidateByCategory(category: string): Promise<number>;
|
|
40
86
|
stats(): Promise<CacheStats>;
|
|
41
87
|
indexInfo(): Promise<IndexInfo>;
|
|
88
|
+
/**
|
|
89
|
+
* Analyze the rolling similarity score window and recommend threshold adjustments.
|
|
90
|
+
*/
|
|
91
|
+
thresholdEffectiveness(options?: {
|
|
92
|
+
category?: string;
|
|
93
|
+
minSamples?: number;
|
|
94
|
+
}): Promise<ThresholdEffectivenessResult>;
|
|
95
|
+
/**
|
|
96
|
+
* Returns threshold effectiveness results for every category seen in the
|
|
97
|
+
* rolling window, plus one aggregate result for all categories combined.
|
|
98
|
+
*/
|
|
99
|
+
thresholdEffectivenessAll(options?: {
|
|
100
|
+
minSamples?: number;
|
|
101
|
+
}): Promise<ThresholdEffectivenessResult[]>;
|
|
102
|
+
/**
|
|
103
|
+
* Refresh threshold config from Valkey. Returns true on a successful HGETALL,
|
|
104
|
+
* false if the call threw.
|
|
105
|
+
*
|
|
106
|
+
* Field semantics:
|
|
107
|
+
* - "threshold" -> updates defaultThreshold
|
|
108
|
+
* - "threshold:{category}" -> updates categoryThresholds[category]
|
|
109
|
+
* - "threshold:" (empty) -> ignored
|
|
110
|
+
* - non-numeric values -> ignored
|
|
111
|
+
* - out-of-range values -> ignored (must be 0 <= x <= 2)
|
|
112
|
+
*
|
|
113
|
+
* Categories present in memory but absent from the hash fall back to their
|
|
114
|
+
* constructor values (or are removed if no constructor override existed).
|
|
115
|
+
* The default threshold likewise falls back to its constructor value if
|
|
116
|
+
* `threshold` is absent from the hash.
|
|
117
|
+
*/
|
|
118
|
+
refreshConfig(): Promise<boolean>;
|
|
119
|
+
/** @internal Default similarity threshold. */
|
|
120
|
+
get _defaultThreshold(): number;
|
|
121
|
+
/** @internal Test-only getter. */
|
|
122
|
+
get _categoryThresholds(): Readonly<Record<string, number>>;
|
|
123
|
+
/** @internal Test-only getter. */
|
|
124
|
+
get _configRefreshIntervalMs(): number;
|
|
125
|
+
/**
|
|
126
|
+
* Execute a stable FT.SEARCH for use by adapters (e.g. LangGraph).
|
|
127
|
+
* SORTBY inserted_at ASC gives stable ordering across paginated calls.
|
|
128
|
+
* @internal
|
|
129
|
+
*/
|
|
130
|
+
_searchEntries(filterExpr: string, limit: number, offset: number): Promise<unknown>;
|
|
131
|
+
/**
|
|
132
|
+
* Embed text for use by adapters (e.g. LangGraph semantic search).
|
|
133
|
+
* @internal
|
|
134
|
+
*/
|
|
135
|
+
_embedText(text: string): Promise<{
|
|
136
|
+
vector: number[];
|
|
137
|
+
durationSec: number;
|
|
138
|
+
}>;
|
|
139
|
+
private startConfigRefresh;
|
|
42
140
|
private _doInitialize;
|
|
141
|
+
private registerDiscovery;
|
|
142
|
+
private initAnalyticsSafe;
|
|
143
|
+
private captureStatsSnapshot;
|
|
43
144
|
private ensureIndexAndGetDimension;
|
|
44
|
-
/**
|
|
145
|
+
/** Check if the index schema has a binary_refs field. */
|
|
146
|
+
private parseHasBinaryRefsFromInfo;
|
|
147
|
+
/** Resolve a prompt (string or ContentBlock[]) into text + binary refs. */
|
|
148
|
+
private resolvePrompt;
|
|
149
|
+
/** Wraps embedFn with error handling, duration tracking, and optional embedding cache. */
|
|
45
150
|
private embed;
|
|
46
151
|
/**
|
|
47
152
|
* Wraps a method body in an OTel span with automatic status, end, and
|
|
48
153
|
* operation duration metric. The span is passed to fn so callers can
|
|
49
|
-
* set attributes
|
|
154
|
+
* set attributes - but callers must NOT call span.end() or span.setStatus(),
|
|
50
155
|
* as traced() handles both.
|
|
51
156
|
*/
|
|
52
157
|
private traced;
|
|
53
158
|
/** Increment stats counters via pipeline. */
|
|
54
159
|
private recordStat;
|
|
160
|
+
/** Append to the rolling similarity window sorted set and trim to 10,000 entries or 7 days. */
|
|
161
|
+
private recordSimilarityWindow;
|
|
55
162
|
private assertInitialized;
|
|
56
163
|
private assertDimension;
|
|
57
164
|
private isIndexNotFoundError;
|
|
58
165
|
private parseDimensionFromInfo;
|
|
59
166
|
}
|
|
167
|
+
export interface ThresholdEffectivenessResult {
|
|
168
|
+
category: string;
|
|
169
|
+
sampleCount: number;
|
|
170
|
+
currentThreshold: number;
|
|
171
|
+
hitRate: number;
|
|
172
|
+
uncertainHitRate: number;
|
|
173
|
+
nearMissRate: number;
|
|
174
|
+
avgHitSimilarity: number;
|
|
175
|
+
avgMissSimilarity: number;
|
|
176
|
+
recommendation: 'tighten_threshold' | 'loosen_threshold' | 'optimal' | 'insufficient_data';
|
|
177
|
+
recommendedThreshold?: number;
|
|
178
|
+
reasoning: string;
|
|
179
|
+
}
|