semblend 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. semblend-0.1.0/LICENSE +61 -0
  2. semblend-0.1.0/PKG-INFO +285 -0
  3. semblend-0.1.0/README.md +172 -0
  4. semblend-0.1.0/pyproject.toml +89 -0
  5. semblend-0.1.0/semblend/__init__.py +126 -0
  6. semblend-0.1.0/semblend/core/__init__.py +14 -0
  7. semblend-0.1.0/semblend/integration/__init__.py +1 -0
  8. semblend-0.1.0/semblend/integration/sglang/__init__.py +22 -0
  9. semblend-0.1.0/semblend/integration/sglang/hicache_backend.py +324 -0
  10. semblend-0.1.0/semblend/integration/sglang/launch_semblend_sglang.py +92 -0
  11. semblend-0.1.0/semblend/integration/sglang/radix_backend.py +539 -0
  12. semblend-0.1.0/semblend/integration/sglang/radix_patcher.py +55 -0
  13. semblend-0.1.0/semblend/integration/vllm/__init__.py +12 -0
  14. semblend-0.1.0/semblend/integration/vllm/connector_v1.py +20 -0
  15. semblend-0.1.0/semblend/integration/vllm/model_runner_hook.py +2 -0
  16. semblend-0.1.0/semblend/py.typed +0 -0
  17. semblend-0.1.0/semblend.egg-info/PKG-INFO +285 -0
  18. semblend-0.1.0/semblend.egg-info/SOURCES.txt +52 -0
  19. semblend-0.1.0/semblend.egg-info/dependency_links.txt +1 -0
  20. semblend-0.1.0/semblend.egg-info/entry_points.txt +2 -0
  21. semblend-0.1.0/semblend.egg-info/requires.txt +33 -0
  22. semblend-0.1.0/semblend.egg-info/top_level.txt +3 -0
  23. semblend-0.1.0/semblend_core/__init__.py +137 -0
  24. semblend-0.1.0/semblend_core/alignment.py +817 -0
  25. semblend-0.1.0/semblend_core/backend.py +117 -0
  26. semblend-0.1.0/semblend_core/bathtub.py +197 -0
  27. semblend-0.1.0/semblend_core/donor_store.py +408 -0
  28. semblend-0.1.0/semblend_core/embedder.py +419 -0
  29. semblend-0.1.0/semblend_core/kv_tensor_store.py +281 -0
  30. semblend-0.1.0/semblend_core/metrics.py +173 -0
  31. semblend-0.1.0/semblend_core/partial_attention.py +305 -0
  32. semblend-0.1.0/semblend_core/pipeline.py +593 -0
  33. semblend-0.1.0/semblend_core/rope_correction.py +838 -0
  34. semblend-0.1.0/semblend_core/segmentation.py +431 -0
  35. semblend-0.1.0/semblend_core/simhash.py +91 -0
  36. semblend-0.1.0/semblend_core/triton_kernels.py +929 -0
  37. semblend-0.1.0/setup.cfg +4 -0
  38. semblend-0.1.0/synapse_kv_connector/__init__.py +86 -0
  39. semblend-0.1.0/synapse_kv_connector/alignment.py +24 -0
  40. semblend-0.1.0/synapse_kv_connector/attention_patch.py +297 -0
  41. semblend-0.1.0/synapse_kv_connector/bathtub.py +14 -0
  42. semblend-0.1.0/synapse_kv_connector/cagra_donor_store.py +378 -0
  43. semblend-0.1.0/synapse_kv_connector/client.py +270 -0
  44. semblend-0.1.0/synapse_kv_connector/connector.py +483 -0
  45. semblend-0.1.0/synapse_kv_connector/donor_store.py +10 -0
  46. semblend-0.1.0/synapse_kv_connector/embedder.py +13 -0
  47. semblend-0.1.0/synapse_kv_connector/model_runner_hook.py +670 -0
  48. semblend-0.1.0/synapse_kv_connector/partial_attention.py +14 -0
  49. semblend-0.1.0/synapse_kv_connector/pipeline.py +47 -0
  50. semblend-0.1.0/synapse_kv_connector/rope_correction.py +13 -0
  51. semblend-0.1.0/synapse_kv_connector/segment_client.py +564 -0
  52. semblend-0.1.0/synapse_kv_connector/semblend_connector.py +1898 -0
  53. semblend-0.1.0/synapse_kv_connector/simhash.py +11 -0
  54. semblend-0.1.0/synapse_kv_connector/triton_kernels.py +6 -0
semblend-0.1.0/LICENSE ADDED
@@ -0,0 +1,61 @@
1
+ Business Source License 1.1
2
+
3
+ Parameters
4
+
5
+ Licensor: WorldFlow AI, Inc.
6
+ Licensed Work: SemBlend 0.1.0 and later
7
+ The Licensed Work is (c) 2026 WorldFlow AI, Inc.
8
+ Additional Use Grant: You may use the Licensed Work for non-production
9
+ purposes, including testing, development, evaluation,
10
+ and academic research. Production use requires a
11
+ commercial license from WorldFlow AI.
12
+ Change Date: 2030-03-16
13
+ Change License: Apache License, Version 2.0
14
+
15
+ For information about alternative licensing arrangements for the Licensed Work,
16
+ please contact licensing@worldflow.ai.
17
+
18
+ Notice
19
+
20
+ Business Source License 1.1
21
+
22
+ Terms
23
+
24
+ The Licensor hereby grants you the right to copy, modify, create derivative
25
+ works, redistribute, and make non-production use of the Licensed Work. The
26
+ Licensor may make an Additional Use Grant, above, permitting limited production
27
+ use.
28
+
29
+ Effective on the Change Date, or the fourth anniversary of the first publicly
30
+ available distribution of a specific version of the Licensed Work under this
31
+ License, whichever comes first, the Licensor hereby grants you rights under the
32
+ terms of the Change License, and the rights granted in the paragraph above
33
+ terminate.
34
+
35
+ If your use of the Licensed Work does not comply with the requirements currently
36
+ in effect as described in this License, you must purchase a commercial license
37
+ from the Licensor, its affiliated entities, or authorized resellers, or you must
38
+ refrain from using the Licensed Work.
39
+
40
+ All copies of the original and modified Licensed Work, and derivative works of
41
+ the Licensed Work, are subject to this License. This License applies separately
42
+ for each version of the Licensed Work and the Change Date may vary for each
43
+ version of the Licensed Work released by Licensor.
44
+
45
+ You must conspicuously display this License on each original or modified copy of
46
+ the Licensed Work. If you receive the Licensed Work in original or modified form
47
+ from a third party, the terms and conditions set forth in this License apply to
48
+ your use of that work.
49
+
50
+ Any use of the Licensed Work in violation of this License will automatically
51
+ terminate your rights under this License for the current and all other versions
52
+ of the Licensed Work.
53
+
54
+ This License does not grant you any right in any trademark or logo of Licensor
55
+ or its affiliates (provided that you may use a trademark or logo of Licensor as
56
+ expressly required by this License).
57
+
58
+ TO THE EXTENT PERMITTED BY APPLICABLE LAW, THE LICENSED WORK IS PROVIDED ON AN
59
+ "AS IS" BASIS. LICENSOR HEREBY DISCLAIMS ALL WARRANTIES AND CONDITIONS, EXPRESS
60
+ OR IMPLIED, INCLUDING (WITHOUT LIMITATION) WARRANTIES OF MERCHANTABILITY,
61
+ FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, AND TITLE.
@@ -0,0 +1,285 @@
1
+ Metadata-Version: 2.4
2
+ Name: semblend
3
+ Version: 0.1.0
4
+ Summary: Semantic KV cache reuse for LLM inference engines (vLLM, SGLang)
5
+ Author: WorldFlow AI
6
+ License: Business Source License 1.1
7
+
8
+ Parameters
9
+
10
+ Licensor: WorldFlow AI, Inc.
11
+ Licensed Work: SemBlend 0.1.0 and later
12
+ The Licensed Work is (c) 2026 WorldFlow AI, Inc.
13
+ Additional Use Grant: You may use the Licensed Work for non-production
14
+ purposes, including testing, development, evaluation,
15
+ and academic research. Production use requires a
16
+ commercial license from WorldFlow AI.
17
+ Change Date: 2030-03-16
18
+ Change License: Apache License, Version 2.0
19
+
20
+ For information about alternative licensing arrangements for the Licensed Work,
21
+ please contact licensing@worldflow.ai.
22
+
23
+ Notice
24
+
25
+ Business Source License 1.1
26
+
27
+ Terms
28
+
29
+ The Licensor hereby grants you the right to copy, modify, create derivative
30
+ works, redistribute, and make non-production use of the Licensed Work. The
31
+ Licensor may make an Additional Use Grant, above, permitting limited production
32
+ use.
33
+
34
+ Effective on the Change Date, or the fourth anniversary of the first publicly
35
+ available distribution of a specific version of the Licensed Work under this
36
+ License, whichever comes first, the Licensor hereby grants you rights under the
37
+ terms of the Change License, and the rights granted in the paragraph above
38
+ terminate.
39
+
40
+ If your use of the Licensed Work does not comply with the requirements currently
41
+ in effect as described in this License, you must purchase a commercial license
42
+ from the Licensor, its affiliated entities, or authorized resellers, or you must
43
+ refrain from using the Licensed Work.
44
+
45
+ All copies of the original and modified Licensed Work, and derivative works of
46
+ the Licensed Work, are subject to this License. This License applies separately
47
+ for each version of the Licensed Work and the Change Date may vary for each
48
+ version of the Licensed Work released by Licensor.
49
+
50
+ You must conspicuously display this License on each original or modified copy of
51
+ the Licensed Work. If you receive the Licensed Work in original or modified form
52
+ from a third party, the terms and conditions set forth in this License apply to
53
+ your use of that work.
54
+
55
+ Any use of the Licensed Work in violation of this License will automatically
56
+ terminate your rights under this License for the current and all other versions
57
+ of the Licensed Work.
58
+
59
+ This License does not grant you any right in any trademark or logo of Licensor
60
+ or its affiliates (provided that you may use a trademark or logo of Licensor as
61
+ expressly required by this License).
62
+
63
+ TO THE EXTENT PERMITTED BY APPLICABLE LAW, THE LICENSED WORK IS PROVIDED ON AN
64
+ "AS IS" BASIS. LICENSOR HEREBY DISCLAIMS ALL WARRANTIES AND CONDITIONS, EXPRESS
65
+ OR IMPLIED, INCLUDING (WITHOUT LIMITATION) WARRANTIES OF MERCHANTABILITY,
66
+ FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, AND TITLE.
67
+
68
+ Project-URL: Homepage, https://worldflow.ai/semblend
69
+ Project-URL: Repository, https://github.com/worldflowai/semblend
70
+ Project-URL: Documentation, https://docs.worldflow.ai/semblend
71
+ Project-URL: Bug Tracker, https://github.com/worldflowai/semblend/issues
72
+ Keywords: llm,inference,kv-cache,vllm,sglang,semantic-cache
73
+ Classifier: Development Status :: 4 - Beta
74
+ Classifier: Intended Audience :: Developers
75
+ Classifier: Intended Audience :: Science/Research
76
+ Classifier: License :: Other/Proprietary License
77
+ Classifier: Programming Language :: Python :: 3
78
+ Classifier: Programming Language :: Python :: 3.10
79
+ Classifier: Programming Language :: Python :: 3.11
80
+ Classifier: Programming Language :: Python :: 3.12
81
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
82
+ Requires-Python: >=3.10
83
+ Description-Content-Type: text/markdown
84
+ License-File: LICENSE
85
+ Requires-Dist: numpy>=1.24
86
+ Requires-Dist: rapidfuzz>=3.0
87
+ Provides-Extra: embedder
88
+ Requires-Dist: sentence-transformers>=3.0; extra == "embedder"
89
+ Requires-Dist: onnxruntime>=1.15; extra == "embedder"
90
+ Provides-Extra: gpu
91
+ Requires-Dist: torch>=2.0; extra == "gpu"
92
+ Requires-Dist: triton>=2.0; extra == "gpu"
93
+ Provides-Extra: vllm
94
+ Requires-Dist: vllm>=0.8.0; extra == "vllm"
95
+ Requires-Dist: torch>=2.0; extra == "vllm"
96
+ Requires-Dist: sentence-transformers>=3.0; extra == "vllm"
97
+ Provides-Extra: sglang
98
+ Requires-Dist: sglang>=0.4.0; extra == "sglang"
99
+ Requires-Dist: torch>=2.0; extra == "sglang"
100
+ Requires-Dist: sentence-transformers>=3.0; extra == "sglang"
101
+ Provides-Extra: dev
102
+ Requires-Dist: pytest>=8.0; extra == "dev"
103
+ Requires-Dist: pytest-asyncio>=0.23; extra == "dev"
104
+ Requires-Dist: ruff>=0.4; extra == "dev"
105
+ Provides-Extra: benchmarks
106
+ Requires-Dist: aiohttp>=3.9; extra == "benchmarks"
107
+ Requires-Dist: datasets>=2.16; extra == "benchmarks"
108
+ Requires-Dist: tqdm>=4.66; extra == "benchmarks"
109
+ Requires-Dist: rich>=13.7; extra == "benchmarks"
110
+ Requires-Dist: rouge-score>=0.1.2; extra == "benchmarks"
111
+ Requires-Dist: transformers>=4.40; extra == "benchmarks"
112
+ Dynamic: license-file
113
+
114
+ # SemBlend
115
+
116
+ **Semantic KV cache reuse for LLM inference engines.**
117
+
118
+ SemBlend extends exact-prefix KV caching (LMCache, vLLM prefix cache, SGLang RadixAttention) with *semantic* donor discovery. When a new prompt is semantically similar to a prior one but lexically different — different instruction wording, sentence ordering, or template fields — SemBlend finds and reuses the cached KV tensors, eliminating redundant prefill computation.
119
+
120
+ ```
121
+ Without SemBlend: vLLM + LMCache → 0% hit → full prefill every request
122
+ With SemBlend: vLLM + LMCache → 30–88% hit → reuse KV from similar past requests
123
+ ```
124
+
125
+ ## What It Does
126
+
127
+ | System | Hit Condition | Semantically Similar Prompts |
128
+ |--------|--------------|------------------------------|
129
+ | vLLM prefix cache | Exact token-level prefix match | Full prefill (0% hit) |
130
+ | LMCache alone | Exact 256-token chunk match | Full prefill (0% hit) |
131
+ | **LMCache + SemBlend** | Semantic similarity ≥ 0.60 | **Reuse donor KV (30–88% hit)** |
132
+
133
+ SemBlend runs ~8ms of in-process MiniLM embedding + cosine search on every request. On a miss it adds negligible overhead. On a hit it replaces a 2–17 second prefill with sub-second KV retrieval.
134
+
135
+ ## Performance
136
+
137
+ Empirical results on A10G GPU, Qwen2.5-7B-AWQ + vLLM 0.14.1 + LMCache.
138
+
139
+ ### TTFT speedup on hits vs. cold prefill (diverse real content)
140
+
141
+ | Context Length | Cold TTFT | SemBlend Miss (+overhead) | SemBlend Hit | Hit Speedup |
142
+ |---------------|-----------|--------------------------|--------------|-------------|
143
+ | 4K tokens | 1,859 ms | 1,864 ms (+0.3%) | 801 ms | **2.3x** |
144
+ | 8K tokens | 3,193 ms | 3,315 ms (+3.8%) | 817 ms | **3.9x** |
145
+ | 16K tokens | 5,852 ms | 6,064 ms (+3.6%) | 871 ms | **6.7x** |
146
+ | 32K tokens | 15,418 ms | — | 1,288 ms | **12.0x** |
147
+
148
+ *Hit TTFT stays roughly flat (~800 ms through 16K, ~1.3 s at 32K) — it's bounded by KV retrieval, not prefill length.*
149
+
150
+ Miss overhead grows with context length (5–212 ms with ~20 donors in the store). Break-even: at 8K context, any hit rate above 5% makes SemBlend net-positive.
151
+
152
+ ### Real-world hit rates (WildChat-1M user conversations)
153
+
154
+ | Workload | Requests | Hit Rate | Hit-only TTFT speedup |
155
+ |---------|----------|----------|----------------------|
156
+ | Short prompts (≥4K chars) | 250 | 29.2% | **1.63x** |
157
+ | Long prompts (≥8K chars) | 150 | 30.0% | **1.88x** |
158
+
159
+ Hit rate increases with cosine similarity: 17% at 0.50–0.60 → 60% at 0.90–1.00.
160
+
161
+ ### Cross-dataset hit rates and speedups (8K tokens, Qwen2.5-7B)
162
+
163
+ | Dataset | Hit Rate | Overall Speedup | Hit-only Speedup |
164
+ |---------|----------|----------------|-----------------|
165
+ | CNN/DailyMail (summarization) | 50% | 2.29x | 2.39x |
166
+ | MultiNews (multi-doc summary) | 75% | 2.23x | 2.23x |
167
+ | SAMSum (dialogue summary) | 88% | 2.37x | 2.37x |
168
+
169
+ *Speedups measured over cold vLLM+LMCache baseline (0% hit on semantically different requests).*
170
+
171
+ ### Multi-turn dialogue (4K context, 3 turns)
172
+
173
+ | Turn | Hit Rate | TTFT Speedup |
174
+ |------|----------|-------------|
175
+ | Turn 1 (cold) | — | 1.0x (baseline: 4019 ms) |
176
+ | Turn 2 | 99.5% | **5.1x** (791 ms) |
177
+ | Turn 3 | 99.5% | **5.1x** (787 ms) |
178
+
179
+ Multi-turn conversations naturally reuse the same prefix → near-perfect hit rates without any workload tuning.
180
+
181
+ ### SGLang comparison (8K, cross-instruction RAG workload)
182
+
183
+ | Engine | Hit Rate | p50 TTFT | Speedup |
184
+ |--------|----------|----------|---------|
185
+ | SGLang (RadixAttention, no SemBlend) | 0% | 3,850 ms | 0.98x |
186
+ | vLLM + LMCache | 0% | 3,193 ms | 1.0x |
187
+ | vLLM + LMCache + SemBlend | 50–88% | 817 ms | **3.9x** |
188
+ | SGLang + SemBlend | 50–88% | 624 ms | **3.7x** |
189
+
190
+ ### Quality
191
+
192
+ SemBlend injects donor KV with RoPE position correction. Output quality impact on high-hit workloads:
193
+
194
+ | Dataset | PPL ratio (SemBlend / cold) |
195
+ |---------|---------------------------|
196
+ | CNN/DailyMail | 1.006 |
197
+ | WikiHow | 1.012 |
198
+ | XSum | 1.025 |
199
+ | MultiNews (hit-only runs) | 1.007 |
200
+
201
+ PPL ratio ≤ 1.025 on most datasets; elevated ratios (≥1.1) on low-hit runs are due to miss-run variance, not KV injection.
202
+
203
+ ## Installation
204
+
205
+ ```bash
206
+ # Core (CPU-only: numpy + rapidfuzz)
207
+ pip install semblend
208
+
209
+ # With vLLM integration
210
+ pip install semblend[vllm]
211
+
212
+ # With SGLang integration
213
+ pip install semblend[sglang]
214
+
215
+ # With sentence-transformers embedder
216
+ pip install semblend[embedder]
217
+ ```
218
+
219
+ ## Quick Start: vLLM
220
+
221
+ ```bash
222
+ pip install semblend[vllm] vllm lmcache
223
+
224
+ vllm serve Qwen/Qwen2.5-7B-Instruct-AWQ \
225
+ --kv-transfer-config '{
226
+ "kv_connector": "SemBlendConnectorV1",
227
+ "kv_connector_module_path": "semblend.integration.vllm.connector_v1",
228
+ "kv_role": "kv_both"
229
+ }'
230
+ ```
231
+
232
+ Configure via environment variables:
233
+
234
+ | Variable | Default | Description |
235
+ |----------|---------|-------------|
236
+ | `SEMBLEND_ENABLED` | `1` | Enable semantic donor search |
237
+ | `SEMBLEND_MIN_SIMILARITY` | `0.60` | Cosine similarity threshold |
238
+ | `SEMBLEND_EMBEDDER` | `minilm` | Embedder type (`minilm`, `jaccard`, `onnx_gpu`) |
239
+ | `SEMBLEND_FUZZY_CHUNKS` | `0` | Enable fuzzy chunk matching |
240
+
241
+ ## Quick Start: SGLang
242
+
243
+ ```bash
244
+ pip install semblend[sglang] sglang
245
+
246
+ # Option 1: CLI launcher (patches RadixCache automatically)
247
+ semblend-sglang --model-path Qwen/Qwen2.5-7B-Instruct \
248
+ --host 0.0.0.0 --port 8000
249
+
250
+ # Option 2: Programmatic patching
251
+ from semblend.integration.sglang.radix_patcher import patch_radix_cache
252
+ patch_radix_cache() # Call before SGLang starts
253
+ ```
254
+
255
+ ## How It Works
256
+
257
+ ```
258
+ Request → MiniLM Embed (5ms) → Cosine Search (1ms) → Align (1ms) → Inject KV
259
+ ↓ ↓ ↓
260
+ 384-dim vector Find similar donor Match chunk boundaries
261
+ in donor store via MD5 hash alignment
262
+ ```
263
+
264
+ 1. **Embed**: Compute a 384-dim MiniLM-L6-v2 embedding (sliding-window for long docs)
265
+ 2. **Search**: Brute-force cosine similarity against the donor store (<1ms at 1K donors)
266
+ 3. **Align**: MD5 chunk hashing at 256-token boundaries finds reusable KV chunks
267
+ 4. **Inject**: Replace target token IDs with donor token IDs — LMCache/RadixCache finds cached KV
268
+
269
+ ## When SemBlend Helps
270
+
271
+ SemBlend is most effective for workloads where prompts share a large common context:
272
+
273
+ - **Document Q&A / RAG**: Same retrieved documents, different questions
274
+ - **Summarization**: Same article, different instruction phrasing
275
+ - **Multi-turn dialogue**: Conversation history grows across turns
276
+ - **Code completion**: Shared repository context across requests
277
+
278
+ SemBlend has minimal overhead on dissimilar workloads (e.g. code generation from scratch: measured 0% hit, 0.96x — 4% overhead from embedding + search).
279
+
280
+ ## License
281
+
282
+ Business Source License 1.1 (BSL-1.1). Free for non-production use including testing, development, evaluation, and academic research. Production use requires a commercial license from WorldFlow AI. Converts to Apache License 2.0 on 2030-03-16.
283
+
284
+ Contact: licensing@worldflow.ai
285
+
@@ -0,0 +1,172 @@
1
+ # SemBlend
2
+
3
+ **Semantic KV cache reuse for LLM inference engines.**
4
+
5
+ SemBlend extends exact-prefix KV caching (LMCache, vLLM prefix cache, SGLang RadixAttention) with *semantic* donor discovery. When a new prompt is semantically similar to a prior one but lexically different — different instruction wording, sentence ordering, or template fields — SemBlend finds and reuses the cached KV tensors, eliminating redundant prefill computation.
6
+
7
+ ```
8
+ Without SemBlend: vLLM + LMCache → 0% hit → full prefill every request
9
+ With SemBlend: vLLM + LMCache → 30–88% hit → reuse KV from similar past requests
10
+ ```
11
+
12
+ ## What It Does
13
+
14
+ | System | Hit Condition | Semantically Similar Prompts |
15
+ |--------|--------------|------------------------------|
16
+ | vLLM prefix cache | Exact token-level prefix match | Full prefill (0% hit) |
17
+ | LMCache alone | Exact 256-token chunk match | Full prefill (0% hit) |
18
+ | **LMCache + SemBlend** | Semantic similarity ≥ 0.60 | **Reuse donor KV (30–88% hit)** |
19
+
20
+ SemBlend runs ~8ms of in-process MiniLM embedding + cosine search on every request. On a miss it adds negligible overhead. On a hit it replaces a 2–17 second prefill with sub-second KV retrieval.
21
+
22
+ ## Performance
23
+
24
+ Empirical results on A10G GPU, Qwen2.5-7B-AWQ + vLLM 0.14.1 + LMCache.
25
+
26
+ ### TTFT speedup on hits vs. cold prefill (diverse real content)
27
+
28
+ | Context Length | Cold TTFT | SemBlend Miss (+overhead) | SemBlend Hit | Hit Speedup |
29
+ |---------------|-----------|--------------------------|--------------|-------------|
30
+ | 4K tokens | 1,859 ms | 1,864 ms (+0.3%) | 801 ms | **2.3x** |
31
+ | 8K tokens | 3,193 ms | 3,315 ms (+3.8%) | 817 ms | **3.9x** |
32
+ | 16K tokens | 5,852 ms | 6,064 ms (+3.6%) | 871 ms | **6.7x** |
33
+ | 32K tokens | 15,418 ms | — | 1,288 ms | **12.0x** |
34
+
35
+ *Hit TTFT stays roughly flat (~800 ms through 16K, ~1.3 s at 32K) — it's bounded by KV retrieval, not prefill length.*
36
+
37
+ Miss overhead grows with context length (5–212 ms with ~20 donors in the store). Break-even: at 8K context, any hit rate above 5% makes SemBlend net-positive.
38
+
39
+ ### Real-world hit rates (WildChat-1M user conversations)
40
+
41
+ | Workload | Requests | Hit Rate | Hit-only TTFT speedup |
42
+ |---------|----------|----------|----------------------|
43
+ | Short prompts (≥4K chars) | 250 | 29.2% | **1.63x** |
44
+ | Long prompts (≥8K chars) | 150 | 30.0% | **1.88x** |
45
+
46
+ Hit rate increases with cosine similarity: 17% at 0.50–0.60 → 60% at 0.90–1.00.
47
+
48
+ ### Cross-dataset hit rates and speedups (8K tokens, Qwen2.5-7B)
49
+
50
+ | Dataset | Hit Rate | Overall Speedup | Hit-only Speedup |
51
+ |---------|----------|----------------|-----------------|
52
+ | CNN/DailyMail (summarization) | 50% | 2.29x | 2.39x |
53
+ | MultiNews (multi-doc summary) | 75% | 2.23x | 2.23x |
54
+ | SAMSum (dialogue summary) | 88% | 2.37x | 2.37x |
55
+
56
+ *Speedups measured over cold vLLM+LMCache baseline (0% hit on semantically different requests).*
57
+
58
+ ### Multi-turn dialogue (4K context, 3 turns)
59
+
60
+ | Turn | Hit Rate | TTFT Speedup |
61
+ |------|----------|-------------|
62
+ | Turn 1 (cold) | — | 1.0x (baseline: 4019 ms) |
63
+ | Turn 2 | 99.5% | **5.1x** (791 ms) |
64
+ | Turn 3 | 99.5% | **5.1x** (787 ms) |
65
+
66
+ Multi-turn conversations naturally reuse the same prefix → near-perfect hit rates without any workload tuning.
67
+
68
+ ### SGLang comparison (8K, cross-instruction RAG workload)
69
+
70
+ | Engine | Hit Rate | p50 TTFT | Speedup |
71
+ |--------|----------|----------|---------|
72
+ | SGLang (RadixAttention, no SemBlend) | 0% | 3,850 ms | 0.98x |
73
+ | vLLM + LMCache | 0% | 3,193 ms | 1.0x |
74
+ | vLLM + LMCache + SemBlend | 50–88% | 817 ms | **3.9x** |
75
+ | SGLang + SemBlend | 50–88% | 624 ms | **3.7x** |
76
+
77
+ ### Quality
78
+
79
+ SemBlend injects donor KV with RoPE position correction. Output quality impact on high-hit workloads:
80
+
81
+ | Dataset | PPL ratio (SemBlend / cold) |
82
+ |---------|---------------------------|
83
+ | CNN/DailyMail | 1.006 |
84
+ | WikiHow | 1.012 |
85
+ | XSum | 1.025 |
86
+ | MultiNews (hit-only runs) | 1.007 |
87
+
88
+ PPL ratio ≤ 1.025 on most datasets; elevated ratios (≥1.1) on low-hit runs are due to miss-run variance, not KV injection.
89
+
90
+ ## Installation
91
+
92
+ ```bash
93
+ # Core (CPU-only: numpy + rapidfuzz)
94
+ pip install semblend
95
+
96
+ # With vLLM integration
97
+ pip install semblend[vllm]
98
+
99
+ # With SGLang integration
100
+ pip install semblend[sglang]
101
+
102
+ # With sentence-transformers embedder
103
+ pip install semblend[embedder]
104
+ ```
105
+
106
+ ## Quick Start: vLLM
107
+
108
+ ```bash
109
+ pip install semblend[vllm] vllm lmcache
110
+
111
+ vllm serve Qwen/Qwen2.5-7B-Instruct-AWQ \
112
+ --kv-transfer-config '{
113
+ "kv_connector": "SemBlendConnectorV1",
114
+ "kv_connector_module_path": "semblend.integration.vllm.connector_v1",
115
+ "kv_role": "kv_both"
116
+ }'
117
+ ```
118
+
119
+ Configure via environment variables:
120
+
121
+ | Variable | Default | Description |
122
+ |----------|---------|-------------|
123
+ | `SEMBLEND_ENABLED` | `1` | Enable semantic donor search |
124
+ | `SEMBLEND_MIN_SIMILARITY` | `0.60` | Cosine similarity threshold |
125
+ | `SEMBLEND_EMBEDDER` | `minilm` | Embedder type (`minilm`, `jaccard`, `onnx_gpu`) |
126
+ | `SEMBLEND_FUZZY_CHUNKS` | `0` | Enable fuzzy chunk matching |
127
+
128
+ ## Quick Start: SGLang
129
+
130
+ ```bash
131
+ pip install semblend[sglang] sglang
132
+
133
+ # Option 1: CLI launcher (patches RadixCache automatically)
134
+ semblend-sglang --model-path Qwen/Qwen2.5-7B-Instruct \
135
+ --host 0.0.0.0 --port 8000
136
+
137
+ # Option 2: Programmatic patching
138
+ from semblend.integration.sglang.radix_patcher import patch_radix_cache
139
+ patch_radix_cache() # Call before SGLang starts
140
+ ```
141
+
142
+ ## How It Works
143
+
144
+ ```
145
+ Request → MiniLM Embed (5ms) → Cosine Search (1ms) → Align (1ms) → Inject KV
146
+ ↓ ↓ ↓
147
+ 384-dim vector Find similar donor Match chunk boundaries
148
+ in donor store via MD5 hash alignment
149
+ ```
150
+
151
+ 1. **Embed**: Compute a 384-dim MiniLM-L6-v2 embedding (sliding-window for long docs)
152
+ 2. **Search**: Brute-force cosine similarity against the donor store (<1ms at 1K donors)
153
+ 3. **Align**: MD5 chunk hashing at 256-token boundaries finds reusable KV chunks
154
+ 4. **Inject**: Replace target token IDs with donor token IDs — LMCache/RadixCache finds cached KV
155
+
156
+ ## When SemBlend Helps
157
+
158
+ SemBlend is most effective for workloads where prompts share a large common context:
159
+
160
+ - **Document Q&A / RAG**: Same retrieved documents, different questions
161
+ - **Summarization**: Same article, different instruction phrasing
162
+ - **Multi-turn dialogue**: Conversation history grows across turns
163
+ - **Code completion**: Shared repository context across requests
164
+
165
+ SemBlend has minimal overhead on dissimilar workloads (e.g. code generation from scratch: measured 0% hit, 0.96x — 4% overhead from embedding + search).
166
+
167
+ ## License
168
+
169
+ Business Source License 1.1 (BSL-1.1). Free for non-production use including testing, development, evaluation, and academic research. Production use requires a commercial license from WorldFlow AI. Converts to Apache License 2.0 on 2030-03-16.
170
+
171
+ Contact: licensing@worldflow.ai
172
+
@@ -0,0 +1,89 @@
1
+ [project]
2
+ name = "semblend"
3
+ version = "0.1.0"
4
+ description = "Semantic KV cache reuse for LLM inference engines (vLLM, SGLang)"
5
+ readme = "README.md"
6
+ requires-python = ">=3.10"
7
+ license = {file = "LICENSE"}
8
+ authors = [
9
+ { name = "WorldFlow AI" },
10
+ ]
11
+ keywords = ["llm", "inference", "kv-cache", "vllm", "sglang", "semantic-cache"]
12
+ classifiers = [
13
+ "Development Status :: 4 - Beta",
14
+ "Intended Audience :: Developers",
15
+ "Intended Audience :: Science/Research",
16
+ "License :: Other/Proprietary License",
17
+ "Programming Language :: Python :: 3",
18
+ "Programming Language :: Python :: 3.10",
19
+ "Programming Language :: Python :: 3.11",
20
+ "Programming Language :: Python :: 3.12",
21
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
22
+ ]
23
+
24
+ dependencies = [
25
+ "numpy>=1.24",
26
+ "rapidfuzz>=3.0",
27
+ ]
28
+
29
+ [project.urls]
30
+ Homepage = "https://worldflow.ai/semblend"
31
+ Repository = "https://github.com/worldflowai/semblend"
32
+ Documentation = "https://docs.worldflow.ai/semblend"
33
+ "Bug Tracker" = "https://github.com/worldflowai/semblend/issues"
34
+
35
+ [project.scripts]
36
+ semblend-sglang = "semblend.integration.sglang.launch_semblend_sglang:main"
37
+
38
+ [project.optional-dependencies]
39
+ embedder = [
40
+ "sentence-transformers>=3.0",
41
+ "onnxruntime>=1.15",
42
+ ]
43
+ gpu = [
44
+ "torch>=2.0",
45
+ "triton>=2.0",
46
+ ]
47
+ vllm = [
48
+ "vllm>=0.8.0",
49
+ "torch>=2.0",
50
+ "sentence-transformers>=3.0",
51
+ ]
52
+ sglang = [
53
+ "sglang>=0.4.0",
54
+ "torch>=2.0",
55
+ "sentence-transformers>=3.0",
56
+ ]
57
+ dev = [
58
+ "pytest>=8.0",
59
+ "pytest-asyncio>=0.23",
60
+ "ruff>=0.4",
61
+ ]
62
+ benchmarks = [
63
+ "aiohttp>=3.9",
64
+ "datasets>=2.16",
65
+ "tqdm>=4.66",
66
+ "rich>=13.7",
67
+ "rouge-score>=0.1.2",
68
+ "transformers>=4.40",
69
+ ]
70
+
71
+ [build-system]
72
+ requires = ["setuptools>=68"]
73
+ build-backend = "setuptools.build_meta"
74
+
75
+ [tool.setuptools.packages.find]
76
+ include = ["semblend*", "semblend_core*", "synapse_kv_connector*"]
77
+ exclude = ["benchmarks*", "paper*", "patent*", "infra*", "literature*", "results*", "tests*", "synapse_kv_connector.tests*", "scripts*"]
78
+
79
+ [tool.pytest.ini_options]
80
+ testpaths = ["tests", "synapse_kv_connector/tests"]
81
+ addopts = "--ignore=synapse_kv_connector/tests/test_rope_correction.py --ignore=synapse_kv_connector/tests/test_triton_kernels.py"
82
+
83
+ [tool.ruff]
84
+ line-length = 100
85
+ target-version = "py310"
86
+
87
+ [tool.ruff.lint]
88
+ select = ["E", "F", "I", "W"]
89
+ ignore = ["E501"]