semblend 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- semblend-0.1.0/LICENSE +61 -0
- semblend-0.1.0/PKG-INFO +285 -0
- semblend-0.1.0/README.md +172 -0
- semblend-0.1.0/pyproject.toml +89 -0
- semblend-0.1.0/semblend/__init__.py +126 -0
- semblend-0.1.0/semblend/core/__init__.py +14 -0
- semblend-0.1.0/semblend/integration/__init__.py +1 -0
- semblend-0.1.0/semblend/integration/sglang/__init__.py +22 -0
- semblend-0.1.0/semblend/integration/sglang/hicache_backend.py +324 -0
- semblend-0.1.0/semblend/integration/sglang/launch_semblend_sglang.py +92 -0
- semblend-0.1.0/semblend/integration/sglang/radix_backend.py +539 -0
- semblend-0.1.0/semblend/integration/sglang/radix_patcher.py +55 -0
- semblend-0.1.0/semblend/integration/vllm/__init__.py +12 -0
- semblend-0.1.0/semblend/integration/vllm/connector_v1.py +20 -0
- semblend-0.1.0/semblend/integration/vllm/model_runner_hook.py +2 -0
- semblend-0.1.0/semblend/py.typed +0 -0
- semblend-0.1.0/semblend.egg-info/PKG-INFO +285 -0
- semblend-0.1.0/semblend.egg-info/SOURCES.txt +52 -0
- semblend-0.1.0/semblend.egg-info/dependency_links.txt +1 -0
- semblend-0.1.0/semblend.egg-info/entry_points.txt +2 -0
- semblend-0.1.0/semblend.egg-info/requires.txt +33 -0
- semblend-0.1.0/semblend.egg-info/top_level.txt +3 -0
- semblend-0.1.0/semblend_core/__init__.py +137 -0
- semblend-0.1.0/semblend_core/alignment.py +817 -0
- semblend-0.1.0/semblend_core/backend.py +117 -0
- semblend-0.1.0/semblend_core/bathtub.py +197 -0
- semblend-0.1.0/semblend_core/donor_store.py +408 -0
- semblend-0.1.0/semblend_core/embedder.py +419 -0
- semblend-0.1.0/semblend_core/kv_tensor_store.py +281 -0
- semblend-0.1.0/semblend_core/metrics.py +173 -0
- semblend-0.1.0/semblend_core/partial_attention.py +305 -0
- semblend-0.1.0/semblend_core/pipeline.py +593 -0
- semblend-0.1.0/semblend_core/rope_correction.py +838 -0
- semblend-0.1.0/semblend_core/segmentation.py +431 -0
- semblend-0.1.0/semblend_core/simhash.py +91 -0
- semblend-0.1.0/semblend_core/triton_kernels.py +929 -0
- semblend-0.1.0/setup.cfg +4 -0
- semblend-0.1.0/synapse_kv_connector/__init__.py +86 -0
- semblend-0.1.0/synapse_kv_connector/alignment.py +24 -0
- semblend-0.1.0/synapse_kv_connector/attention_patch.py +297 -0
- semblend-0.1.0/synapse_kv_connector/bathtub.py +14 -0
- semblend-0.1.0/synapse_kv_connector/cagra_donor_store.py +378 -0
- semblend-0.1.0/synapse_kv_connector/client.py +270 -0
- semblend-0.1.0/synapse_kv_connector/connector.py +483 -0
- semblend-0.1.0/synapse_kv_connector/donor_store.py +10 -0
- semblend-0.1.0/synapse_kv_connector/embedder.py +13 -0
- semblend-0.1.0/synapse_kv_connector/model_runner_hook.py +670 -0
- semblend-0.1.0/synapse_kv_connector/partial_attention.py +14 -0
- semblend-0.1.0/synapse_kv_connector/pipeline.py +47 -0
- semblend-0.1.0/synapse_kv_connector/rope_correction.py +13 -0
- semblend-0.1.0/synapse_kv_connector/segment_client.py +564 -0
- semblend-0.1.0/synapse_kv_connector/semblend_connector.py +1898 -0
- semblend-0.1.0/synapse_kv_connector/simhash.py +11 -0
- semblend-0.1.0/synapse_kv_connector/triton_kernels.py +6 -0
semblend-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
Business Source License 1.1
|
|
2
|
+
|
|
3
|
+
Parameters
|
|
4
|
+
|
|
5
|
+
Licensor: WorldFlow AI, Inc.
|
|
6
|
+
Licensed Work: SemBlend 0.1.0 and later
|
|
7
|
+
The Licensed Work is (c) 2026 WorldFlow AI, Inc.
|
|
8
|
+
Additional Use Grant: You may use the Licensed Work for non-production
|
|
9
|
+
purposes, including testing, development, evaluation,
|
|
10
|
+
and academic research. Production use requires a
|
|
11
|
+
commercial license from WorldFlow AI.
|
|
12
|
+
Change Date: 2030-03-16
|
|
13
|
+
Change License: Apache License, Version 2.0
|
|
14
|
+
|
|
15
|
+
For information about alternative licensing arrangements for the Licensed Work,
|
|
16
|
+
please contact licensing@worldflow.ai.
|
|
17
|
+
|
|
18
|
+
Notice
|
|
19
|
+
|
|
20
|
+
Business Source License 1.1
|
|
21
|
+
|
|
22
|
+
Terms
|
|
23
|
+
|
|
24
|
+
The Licensor hereby grants you the right to copy, modify, create derivative
|
|
25
|
+
works, redistribute, and make non-production use of the Licensed Work. The
|
|
26
|
+
Licensor may make an Additional Use Grant, above, permitting limited production
|
|
27
|
+
use.
|
|
28
|
+
|
|
29
|
+
Effective on the Change Date, or the fourth anniversary of the first publicly
|
|
30
|
+
available distribution of a specific version of the Licensed Work under this
|
|
31
|
+
License, whichever comes first, the Licensor hereby grants you rights under the
|
|
32
|
+
terms of the Change License, and the rights granted in the paragraph above
|
|
33
|
+
terminate.
|
|
34
|
+
|
|
35
|
+
If your use of the Licensed Work does not comply with the requirements currently
|
|
36
|
+
in effect as described in this License, you must purchase a commercial license
|
|
37
|
+
from the Licensor, its affiliated entities, or authorized resellers, or you must
|
|
38
|
+
refrain from using the Licensed Work.
|
|
39
|
+
|
|
40
|
+
All copies of the original and modified Licensed Work, and derivative works of
|
|
41
|
+
the Licensed Work, are subject to this License. This License applies separately
|
|
42
|
+
for each version of the Licensed Work and the Change Date may vary for each
|
|
43
|
+
version of the Licensed Work released by Licensor.
|
|
44
|
+
|
|
45
|
+
You must conspicuously display this License on each original or modified copy of
|
|
46
|
+
the Licensed Work. If you receive the Licensed Work in original or modified form
|
|
47
|
+
from a third party, the terms and conditions set forth in this License apply to
|
|
48
|
+
your use of that work.
|
|
49
|
+
|
|
50
|
+
Any use of the Licensed Work in violation of this License will automatically
|
|
51
|
+
terminate your rights under this License for the current and all other versions
|
|
52
|
+
of the Licensed Work.
|
|
53
|
+
|
|
54
|
+
This License does not grant you any right in any trademark or logo of Licensor
|
|
55
|
+
or its affiliates (provided that you may use a trademark or logo of Licensor as
|
|
56
|
+
expressly required by this License).
|
|
57
|
+
|
|
58
|
+
TO THE EXTENT PERMITTED BY APPLICABLE LAW, THE LICENSED WORK IS PROVIDED ON AN
|
|
59
|
+
"AS IS" BASIS. LICENSOR HEREBY DISCLAIMS ALL WARRANTIES AND CONDITIONS, EXPRESS
|
|
60
|
+
OR IMPLIED, INCLUDING (WITHOUT LIMITATION) WARRANTIES OF MERCHANTABILITY,
|
|
61
|
+
FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, AND TITLE.
|
semblend-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,285 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: semblend
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Semantic KV cache reuse for LLM inference engines (vLLM, SGLang)
|
|
5
|
+
Author: WorldFlow AI
|
|
6
|
+
License: Business Source License 1.1
|
|
7
|
+
|
|
8
|
+
Parameters
|
|
9
|
+
|
|
10
|
+
Licensor: WorldFlow AI, Inc.
|
|
11
|
+
Licensed Work: SemBlend 0.1.0 and later
|
|
12
|
+
The Licensed Work is (c) 2026 WorldFlow AI, Inc.
|
|
13
|
+
Additional Use Grant: You may use the Licensed Work for non-production
|
|
14
|
+
purposes, including testing, development, evaluation,
|
|
15
|
+
and academic research. Production use requires a
|
|
16
|
+
commercial license from WorldFlow AI.
|
|
17
|
+
Change Date: 2030-03-16
|
|
18
|
+
Change License: Apache License, Version 2.0
|
|
19
|
+
|
|
20
|
+
For information about alternative licensing arrangements for the Licensed Work,
|
|
21
|
+
please contact licensing@worldflow.ai.
|
|
22
|
+
|
|
23
|
+
Notice
|
|
24
|
+
|
|
25
|
+
Business Source License 1.1
|
|
26
|
+
|
|
27
|
+
Terms
|
|
28
|
+
|
|
29
|
+
The Licensor hereby grants you the right to copy, modify, create derivative
|
|
30
|
+
works, redistribute, and make non-production use of the Licensed Work. The
|
|
31
|
+
Licensor may make an Additional Use Grant, above, permitting limited production
|
|
32
|
+
use.
|
|
33
|
+
|
|
34
|
+
Effective on the Change Date, or the fourth anniversary of the first publicly
|
|
35
|
+
available distribution of a specific version of the Licensed Work under this
|
|
36
|
+
License, whichever comes first, the Licensor hereby grants you rights under the
|
|
37
|
+
terms of the Change License, and the rights granted in the paragraph above
|
|
38
|
+
terminate.
|
|
39
|
+
|
|
40
|
+
If your use of the Licensed Work does not comply with the requirements currently
|
|
41
|
+
in effect as described in this License, you must purchase a commercial license
|
|
42
|
+
from the Licensor, its affiliated entities, or authorized resellers, or you must
|
|
43
|
+
refrain from using the Licensed Work.
|
|
44
|
+
|
|
45
|
+
All copies of the original and modified Licensed Work, and derivative works of
|
|
46
|
+
the Licensed Work, are subject to this License. This License applies separately
|
|
47
|
+
for each version of the Licensed Work and the Change Date may vary for each
|
|
48
|
+
version of the Licensed Work released by Licensor.
|
|
49
|
+
|
|
50
|
+
You must conspicuously display this License on each original or modified copy of
|
|
51
|
+
the Licensed Work. If you receive the Licensed Work in original or modified form
|
|
52
|
+
from a third party, the terms and conditions set forth in this License apply to
|
|
53
|
+
your use of that work.
|
|
54
|
+
|
|
55
|
+
Any use of the Licensed Work in violation of this License will automatically
|
|
56
|
+
terminate your rights under this License for the current and all other versions
|
|
57
|
+
of the Licensed Work.
|
|
58
|
+
|
|
59
|
+
This License does not grant you any right in any trademark or logo of Licensor
|
|
60
|
+
or its affiliates (provided that you may use a trademark or logo of Licensor as
|
|
61
|
+
expressly required by this License).
|
|
62
|
+
|
|
63
|
+
TO THE EXTENT PERMITTED BY APPLICABLE LAW, THE LICENSED WORK IS PROVIDED ON AN
|
|
64
|
+
"AS IS" BASIS. LICENSOR HEREBY DISCLAIMS ALL WARRANTIES AND CONDITIONS, EXPRESS
|
|
65
|
+
OR IMPLIED, INCLUDING (WITHOUT LIMITATION) WARRANTIES OF MERCHANTABILITY,
|
|
66
|
+
FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, AND TITLE.
|
|
67
|
+
|
|
68
|
+
Project-URL: Homepage, https://worldflow.ai/semblend
|
|
69
|
+
Project-URL: Repository, https://github.com/worldflowai/semblend
|
|
70
|
+
Project-URL: Documentation, https://docs.worldflow.ai/semblend
|
|
71
|
+
Project-URL: Bug Tracker, https://github.com/worldflowai/semblend/issues
|
|
72
|
+
Keywords: llm,inference,kv-cache,vllm,sglang,semantic-cache
|
|
73
|
+
Classifier: Development Status :: 4 - Beta
|
|
74
|
+
Classifier: Intended Audience :: Developers
|
|
75
|
+
Classifier: Intended Audience :: Science/Research
|
|
76
|
+
Classifier: License :: Other/Proprietary License
|
|
77
|
+
Classifier: Programming Language :: Python :: 3
|
|
78
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
79
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
80
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
81
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
82
|
+
Requires-Python: >=3.10
|
|
83
|
+
Description-Content-Type: text/markdown
|
|
84
|
+
License-File: LICENSE
|
|
85
|
+
Requires-Dist: numpy>=1.24
|
|
86
|
+
Requires-Dist: rapidfuzz>=3.0
|
|
87
|
+
Provides-Extra: embedder
|
|
88
|
+
Requires-Dist: sentence-transformers>=3.0; extra == "embedder"
|
|
89
|
+
Requires-Dist: onnxruntime>=1.15; extra == "embedder"
|
|
90
|
+
Provides-Extra: gpu
|
|
91
|
+
Requires-Dist: torch>=2.0; extra == "gpu"
|
|
92
|
+
Requires-Dist: triton>=2.0; extra == "gpu"
|
|
93
|
+
Provides-Extra: vllm
|
|
94
|
+
Requires-Dist: vllm>=0.8.0; extra == "vllm"
|
|
95
|
+
Requires-Dist: torch>=2.0; extra == "vllm"
|
|
96
|
+
Requires-Dist: sentence-transformers>=3.0; extra == "vllm"
|
|
97
|
+
Provides-Extra: sglang
|
|
98
|
+
Requires-Dist: sglang>=0.4.0; extra == "sglang"
|
|
99
|
+
Requires-Dist: torch>=2.0; extra == "sglang"
|
|
100
|
+
Requires-Dist: sentence-transformers>=3.0; extra == "sglang"
|
|
101
|
+
Provides-Extra: dev
|
|
102
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
103
|
+
Requires-Dist: pytest-asyncio>=0.23; extra == "dev"
|
|
104
|
+
Requires-Dist: ruff>=0.4; extra == "dev"
|
|
105
|
+
Provides-Extra: benchmarks
|
|
106
|
+
Requires-Dist: aiohttp>=3.9; extra == "benchmarks"
|
|
107
|
+
Requires-Dist: datasets>=2.16; extra == "benchmarks"
|
|
108
|
+
Requires-Dist: tqdm>=4.66; extra == "benchmarks"
|
|
109
|
+
Requires-Dist: rich>=13.7; extra == "benchmarks"
|
|
110
|
+
Requires-Dist: rouge-score>=0.1.2; extra == "benchmarks"
|
|
111
|
+
Requires-Dist: transformers>=4.40; extra == "benchmarks"
|
|
112
|
+
Dynamic: license-file
|
|
113
|
+
|
|
114
|
+
# SemBlend
|
|
115
|
+
|
|
116
|
+
**Semantic KV cache reuse for LLM inference engines.**
|
|
117
|
+
|
|
118
|
+
SemBlend extends exact-prefix KV caching (LMCache, vLLM prefix cache, SGLang RadixAttention) with *semantic* donor discovery. When a new prompt is semantically similar to a prior one but lexically different — different instruction wording, sentence ordering, or template fields — SemBlend finds and reuses the cached KV tensors, eliminating redundant prefill computation.
|
|
119
|
+
|
|
120
|
+
```
|
|
121
|
+
Without SemBlend: vLLM + LMCache → 0% hit → full prefill every request
|
|
122
|
+
With SemBlend: vLLM + LMCache → 30–88% hit → reuse KV from similar past requests
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
## What It Does
|
|
126
|
+
|
|
127
|
+
| System | Hit Condition | Semantically Similar Prompts |
|
|
128
|
+
|--------|--------------|------------------------------|
|
|
129
|
+
| vLLM prefix cache | Exact token-level prefix match | Full prefill (0% hit) |
|
|
130
|
+
| LMCache alone | Exact 256-token chunk match | Full prefill (0% hit) |
|
|
131
|
+
| **LMCache + SemBlend** | Semantic similarity ≥ 0.60 | **Reuse donor KV (30–88% hit)** |
|
|
132
|
+
|
|
133
|
+
SemBlend runs ~8ms of in-process MiniLM embedding + cosine search on every request. On a miss it adds negligible overhead. On a hit it replaces a 2–17 second prefill with sub-second KV retrieval.
|
|
134
|
+
|
|
135
|
+
## Performance
|
|
136
|
+
|
|
137
|
+
Empirical results on A10G GPU, Qwen2.5-7B-AWQ + vLLM 0.14.1 + LMCache.
|
|
138
|
+
|
|
139
|
+
### TTFT speedup on hits vs. cold prefill (diverse real content)
|
|
140
|
+
|
|
141
|
+
| Context Length | Cold TTFT | SemBlend Miss (+overhead) | SemBlend Hit | Hit Speedup |
|
|
142
|
+
|---------------|-----------|--------------------------|--------------|-------------|
|
|
143
|
+
| 4K tokens | 1,859 ms | 1,864 ms (+0.3%) | 801 ms | **2.3x** |
|
|
144
|
+
| 8K tokens | 3,193 ms | 3,315 ms (+3.8%) | 817 ms | **3.9x** |
|
|
145
|
+
| 16K tokens | 5,852 ms | 6,064 ms (+3.6%) | 871 ms | **6.7x** |
|
|
146
|
+
| 32K tokens | 15,418 ms | — | 1,288 ms | **12.0x** |
|
|
147
|
+
|
|
148
|
+
*Hit TTFT is consistently ~800ms regardless of context length — it's bounded by KV retrieval, not prefill.*
|
|
149
|
+
|
|
150
|
+
Miss overhead grows with context length (5–212ms at ~20 donors in store). Break-even: P_hit > 5% at 8K is net-positive.
|
|
151
|
+
|
|
152
|
+
### Real-world hit rates (WildChat-1M user conversations)
|
|
153
|
+
|
|
154
|
+
| Workload | Requests | Hit Rate | Hit-only TTFT speedup |
|
|
155
|
+
|---------|----------|----------|----------------------|
|
|
156
|
+
| Short prompts (≥4K chars) | 250 | 29.2% | **1.63x** |
|
|
157
|
+
| Long prompts (≥8K chars) | 150 | 30.0% | **1.88x** |
|
|
158
|
+
|
|
159
|
+
Hit rate increases with cosine similarity: 17% at 0.50–0.60 → 60% at 0.90–1.00.
|
|
160
|
+
|
|
161
|
+
### Cross-dataset hit rates and speedups (8K tokens, Qwen2.5-7B)
|
|
162
|
+
|
|
163
|
+
| Dataset | Hit Rate | Overall Speedup | Hit-only Speedup |
|
|
164
|
+
|---------|----------|----------------|-----------------|
|
|
165
|
+
| CNN/DailyMail (summarization) | 50% | 2.29x | 2.39x |
|
|
166
|
+
| MultiNews (multi-doc summary) | 75% | 2.23x | 2.23x |
|
|
167
|
+
| SAMSum (dialogue summary) | 88% | 2.37x | 2.37x |
|
|
168
|
+
|
|
169
|
+
*Speedups measured over cold vLLM+LMCache baseline (0% hit on semantically different requests).*
|
|
170
|
+
|
|
171
|
+
### Multi-turn dialogue (4K context, 3 turns)
|
|
172
|
+
|
|
173
|
+
| Turn | Hit Rate | TTFT Speedup |
|
|
174
|
+
|------|----------|-------------|
|
|
175
|
+
| Turn 1 (cold) | — | 1.0x (baseline: 4019 ms) |
|
|
176
|
+
| Turn 2 | 99.5% | **5.1x** (791 ms) |
|
|
177
|
+
| Turn 3 | 99.5% | **5.1x** (787 ms) |
|
|
178
|
+
|
|
179
|
+
Multi-turn conversations naturally reuse the same prefix → near-perfect hit rates without any workload tuning.
|
|
180
|
+
|
|
181
|
+
### SGLang comparison (8K, cross-instruction RAG workload)
|
|
182
|
+
|
|
183
|
+
| Engine | Hit Rate | p50 TTFT | Speedup |
|
|
184
|
+
|--------|----------|----------|---------|
|
|
185
|
+
| SGLang (RadixAttention, no SemBlend) | 0% | 3,850 ms | 0.98x |
|
|
186
|
+
| vLLM + LMCache | 0% | 3,193 ms | 1.0x |
|
|
187
|
+
| vLLM + LMCache + SemBlend | 50–88% | 817 ms | **3.9x** |
|
|
188
|
+
| SGLang + SemBlend | 50–88% | 624 ms | **3.7x** |
|
|
189
|
+
|
|
190
|
+
### Quality
|
|
191
|
+
|
|
192
|
+
SemBlend injects donor KV with RoPE position correction. Output quality impact on high-hit workloads:
|
|
193
|
+
|
|
194
|
+
| Dataset | PPL ratio (SemBlend / cold) |
|
|
195
|
+
|---------|---------------------------|
|
|
196
|
+
| CNN/DailyMail | 1.006 |
|
|
197
|
+
| WikiHow | 1.012 |
|
|
198
|
+
| XSum | 1.025 |
|
|
199
|
+
| MultiNews (hit-only runs) | 1.007 |
|
|
200
|
+
|
|
201
|
+
PPL ratio ≤ 1.025 on most datasets; elevated ratios (≥1.1) on low-hit runs are due to miss-run variance, not KV injection.
|
|
202
|
+
|
|
203
|
+
## Installation
|
|
204
|
+
|
|
205
|
+
```bash
|
|
206
|
+
# Core (CPU-only: numpy + rapidfuzz)
|
|
207
|
+
pip install semblend
|
|
208
|
+
|
|
209
|
+
# With vLLM integration
|
|
210
|
+
pip install semblend[vllm]
|
|
211
|
+
|
|
212
|
+
# With SGLang integration
|
|
213
|
+
pip install semblend[sglang]
|
|
214
|
+
|
|
215
|
+
# With sentence-transformers embedder
|
|
216
|
+
pip install semblend[embedder]
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
## Quick Start: vLLM
|
|
220
|
+
|
|
221
|
+
```bash
|
|
222
|
+
pip install semblend[vllm] vllm lmcache
|
|
223
|
+
|
|
224
|
+
vllm serve Qwen/Qwen2.5-7B-Instruct-AWQ \
|
|
225
|
+
--kv-transfer-config '{
|
|
226
|
+
"kv_connector": "SemBlendConnectorV1",
|
|
227
|
+
"kv_connector_module_path": "semblend.integration.vllm.connector_v1",
|
|
228
|
+
"kv_role": "kv_both"
|
|
229
|
+
}'
|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
Configure via environment variables:
|
|
233
|
+
|
|
234
|
+
| Variable | Default | Description |
|
|
235
|
+
|----------|---------|-------------|
|
|
236
|
+
| `SEMBLEND_ENABLED` | `1` | Enable semantic donor search |
|
|
237
|
+
| `SEMBLEND_MIN_SIMILARITY` | `0.60` | Cosine similarity threshold |
|
|
238
|
+
| `SEMBLEND_EMBEDDER` | `minilm` | Embedder type (`minilm`, `jaccard`, `onnx_gpu`) |
|
|
239
|
+
| `SEMBLEND_FUZZY_CHUNKS` | `0` | Enable fuzzy chunk matching |
|
|
240
|
+
|
|
241
|
+
## Quick Start: SGLang
|
|
242
|
+
|
|
243
|
+
```bash
|
|
244
|
+
pip install semblend[sglang] sglang
|
|
245
|
+
|
|
246
|
+
# Option 1: CLI launcher (patches RadixCache automatically)
|
|
247
|
+
semblend-sglang --model-path Qwen/Qwen2.5-7B-Instruct \
|
|
248
|
+
--host 0.0.0.0 --port 8000
|
|
249
|
+
|
|
250
|
+
# Option 2: Programmatic patching
|
|
251
|
+
from semblend.integration.sglang.radix_patcher import patch_radix_cache
|
|
252
|
+
patch_radix_cache() # Call before SGLang starts
|
|
253
|
+
```
|
|
254
|
+
|
|
255
|
+
## How It Works
|
|
256
|
+
|
|
257
|
+
```
|
|
258
|
+
Request → MiniLM Embed (5ms) → Cosine Search (1ms) → Align (1ms) → Inject KV
|
|
259
|
+
↓ ↓ ↓
|
|
260
|
+
384-dim vector Find similar donor Match chunk boundaries
|
|
261
|
+
in donor store via MD5 hash alignment
|
|
262
|
+
```
|
|
263
|
+
|
|
264
|
+
1. **Embed**: Compute a 384-dim MiniLM-L6-v2 embedding (sliding-window for long docs)
|
|
265
|
+
2. **Search**: Brute-force cosine similarity against the donor store (<1ms at 1K donors)
|
|
266
|
+
3. **Align**: MD5 chunk hashing at 256-token boundaries finds reusable KV chunks
|
|
267
|
+
4. **Inject**: Replace target token IDs with donor token IDs — LMCache/RadixCache finds cached KV
|
|
268
|
+
|
|
269
|
+
## When SemBlend Helps
|
|
270
|
+
|
|
271
|
+
SemBlend is most effective for workloads where prompts share a large common context:
|
|
272
|
+
|
|
273
|
+
- **Document Q&A / RAG**: Same retrieved documents, different questions
|
|
274
|
+
- **Summarization**: Same article, different instruction phrasing
|
|
275
|
+
- **Multi-turn dialogue**: Conversation history grows across turns
|
|
276
|
+
- **Code completion**: Shared repository context across requests
|
|
277
|
+
|
|
278
|
+
SemBlend has minimal overhead on dissimilar workloads (e.g. code generation from scratch: measured 0% hit, 0.96x — 4% overhead from embedding + search).
|
|
279
|
+
|
|
280
|
+
## License
|
|
281
|
+
|
|
282
|
+
Business Source License 1.1 (BSL-1.1). Free for non-production use including testing, development, evaluation, and academic research. Production use requires a commercial license from WorldFlow AI. Converts to Apache License 2.0 on 2030-03-16.
|
|
283
|
+
|
|
284
|
+
Contact: licensing@worldflow.ai
|
|
285
|
+
|
semblend-0.1.0/README.md
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
# SemBlend
|
|
2
|
+
|
|
3
|
+
**Semantic KV cache reuse for LLM inference engines.**
|
|
4
|
+
|
|
5
|
+
SemBlend extends exact-prefix KV caching (LMCache, vLLM prefix cache, SGLang RadixAttention) with *semantic* donor discovery. When a new prompt is semantically similar to a prior one but lexically different — different instruction wording, sentence ordering, or template fields — SemBlend finds and reuses the cached KV tensors, eliminating redundant prefill computation.
|
|
6
|
+
|
|
7
|
+
```
|
|
8
|
+
Without SemBlend: vLLM + LMCache → 0% hit → full prefill every request
|
|
9
|
+
With SemBlend: vLLM + LMCache → 30–88% hit → reuse KV from similar past requests
|
|
10
|
+
```
|
|
11
|
+
|
|
12
|
+
## What It Does
|
|
13
|
+
|
|
14
|
+
| System | Hit Condition | Semantically Similar Prompts |
|
|
15
|
+
|--------|--------------|------------------------------|
|
|
16
|
+
| vLLM prefix cache | Exact token-level prefix match | Full prefill (0% hit) |
|
|
17
|
+
| LMCache alone | Exact 256-token chunk match | Full prefill (0% hit) |
|
|
18
|
+
| **LMCache + SemBlend** | Semantic similarity ≥ 0.60 | **Reuse donor KV (30–88% hit)** |
|
|
19
|
+
|
|
20
|
+
SemBlend runs ~8ms of in-process MiniLM embedding + cosine search on every request. On a miss it adds negligible overhead. On a hit it replaces a 2–17 second prefill with sub-second KV retrieval.
|
|
21
|
+
|
|
22
|
+
## Performance
|
|
23
|
+
|
|
24
|
+
Empirical results on A10G GPU, Qwen2.5-7B-AWQ + vLLM 0.14.1 + LMCache.
|
|
25
|
+
|
|
26
|
+
### TTFT speedup on hits vs. cold prefill (diverse real content)
|
|
27
|
+
|
|
28
|
+
| Context Length | Cold TTFT | SemBlend Miss (+overhead) | SemBlend Hit | Hit Speedup |
|
|
29
|
+
|---------------|-----------|--------------------------|--------------|-------------|
|
|
30
|
+
| 4K tokens | 1,859 ms | 1,864 ms (+0.3%) | 801 ms | **2.3x** |
|
|
31
|
+
| 8K tokens | 3,193 ms | 3,315 ms (+3.8%) | 817 ms | **3.9x** |
|
|
32
|
+
| 16K tokens | 5,852 ms | 6,064 ms (+3.6%) | 871 ms | **6.7x** |
|
|
33
|
+
| 32K tokens | 15,418 ms | — | 1,288 ms | **12.0x** |
|
|
34
|
+
|
|
35
|
+
*Hit TTFT is consistently ~800ms regardless of context length — it's bounded by KV retrieval, not prefill.*
|
|
36
|
+
|
|
37
|
+
Miss overhead grows with context length (5–212ms at ~20 donors in store). Break-even: P_hit > 5% at 8K is net-positive.
|
|
38
|
+
|
|
39
|
+
### Real-world hit rates (WildChat-1M user conversations)
|
|
40
|
+
|
|
41
|
+
| Workload | Requests | Hit Rate | Hit-only TTFT speedup |
|
|
42
|
+
|---------|----------|----------|----------------------|
|
|
43
|
+
| Short prompts (≥4K chars) | 250 | 29.2% | **1.63x** |
|
|
44
|
+
| Long prompts (≥8K chars) | 150 | 30.0% | **1.88x** |
|
|
45
|
+
|
|
46
|
+
Hit rate increases with cosine similarity: 17% at 0.50–0.60 → 60% at 0.90–1.00.
|
|
47
|
+
|
|
48
|
+
### Cross-dataset hit rates and speedups (8K tokens, Qwen2.5-7B)
|
|
49
|
+
|
|
50
|
+
| Dataset | Hit Rate | Overall Speedup | Hit-only Speedup |
|
|
51
|
+
|---------|----------|----------------|-----------------|
|
|
52
|
+
| CNN/DailyMail (summarization) | 50% | 2.29x | 2.39x |
|
|
53
|
+
| MultiNews (multi-doc summary) | 75% | 2.23x | 2.23x |
|
|
54
|
+
| SAMSum (dialogue summary) | 88% | 2.37x | 2.37x |
|
|
55
|
+
|
|
56
|
+
*Speedups measured over cold vLLM+LMCache baseline (0% hit on semantically different requests).*
|
|
57
|
+
|
|
58
|
+
### Multi-turn dialogue (4K context, 3 turns)
|
|
59
|
+
|
|
60
|
+
| Turn | Hit Rate | TTFT Speedup |
|
|
61
|
+
|------|----------|-------------|
|
|
62
|
+
| Turn 1 (cold) | — | 1.0x (baseline: 4019 ms) |
|
|
63
|
+
| Turn 2 | 99.5% | **5.1x** (791 ms) |
|
|
64
|
+
| Turn 3 | 99.5% | **5.1x** (787 ms) |
|
|
65
|
+
|
|
66
|
+
Multi-turn conversations naturally reuse the same prefix → near-perfect hit rates without any workload tuning.
|
|
67
|
+
|
|
68
|
+
### SGLang comparison (8K, cross-instruction RAG workload)
|
|
69
|
+
|
|
70
|
+
| Engine | Hit Rate | p50 TTFT | Speedup |
|
|
71
|
+
|--------|----------|----------|---------|
|
|
72
|
+
| SGLang (RadixAttention, no SemBlend) | 0% | 3,850 ms | 0.98x |
|
|
73
|
+
| vLLM + LMCache | 0% | 3,193 ms | 1.0x |
|
|
74
|
+
| vLLM + LMCache + SemBlend | 50–88% | 817 ms | **3.9x** |
|
|
75
|
+
| SGLang + SemBlend | 50–88% | 624 ms | **3.7x** |
|
|
76
|
+
|
|
77
|
+
### Quality
|
|
78
|
+
|
|
79
|
+
SemBlend injects donor KV with RoPE position correction. Output quality impact on high-hit workloads:
|
|
80
|
+
|
|
81
|
+
| Dataset | PPL ratio (SemBlend / cold) |
|
|
82
|
+
|---------|---------------------------|
|
|
83
|
+
| CNN/DailyMail | 1.006 |
|
|
84
|
+
| WikiHow | 1.012 |
|
|
85
|
+
| XSum | 1.025 |
|
|
86
|
+
| MultiNews (hit-only runs) | 1.007 |
|
|
87
|
+
|
|
88
|
+
PPL ratio ≤ 1.025 on most datasets; elevated ratios (≥1.1) on low-hit runs are due to miss-run variance, not KV injection.
|
|
89
|
+
|
|
90
|
+
## Installation
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
# Core (CPU-only: numpy + rapidfuzz)
|
|
94
|
+
pip install semblend
|
|
95
|
+
|
|
96
|
+
# With vLLM integration
|
|
97
|
+
pip install semblend[vllm]
|
|
98
|
+
|
|
99
|
+
# With SGLang integration
|
|
100
|
+
pip install semblend[sglang]
|
|
101
|
+
|
|
102
|
+
# With sentence-transformers embedder
|
|
103
|
+
pip install semblend[embedder]
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
## Quick Start: vLLM
|
|
107
|
+
|
|
108
|
+
```bash
|
|
109
|
+
pip install semblend[vllm] vllm lmcache
|
|
110
|
+
|
|
111
|
+
vllm serve Qwen/Qwen2.5-7B-Instruct-AWQ \
|
|
112
|
+
--kv-transfer-config '{
|
|
113
|
+
"kv_connector": "SemBlendConnectorV1",
|
|
114
|
+
"kv_connector_module_path": "semblend.integration.vllm.connector_v1",
|
|
115
|
+
"kv_role": "kv_both"
|
|
116
|
+
}'
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
Configure via environment variables:
|
|
120
|
+
|
|
121
|
+
| Variable | Default | Description |
|
|
122
|
+
|----------|---------|-------------|
|
|
123
|
+
| `SEMBLEND_ENABLED` | `1` | Enable semantic donor search |
|
|
124
|
+
| `SEMBLEND_MIN_SIMILARITY` | `0.60` | Cosine similarity threshold |
|
|
125
|
+
| `SEMBLEND_EMBEDDER` | `minilm` | Embedder type (`minilm`, `jaccard`, `onnx_gpu`) |
|
|
126
|
+
| `SEMBLEND_FUZZY_CHUNKS` | `0` | Enable fuzzy chunk matching |
|
|
127
|
+
|
|
128
|
+
## Quick Start: SGLang
|
|
129
|
+
|
|
130
|
+
```bash
|
|
131
|
+
pip install semblend[sglang] sglang
|
|
132
|
+
|
|
133
|
+
# Option 1: CLI launcher (patches RadixCache automatically)
|
|
134
|
+
semblend-sglang --model-path Qwen/Qwen2.5-7B-Instruct \
|
|
135
|
+
--host 0.0.0.0 --port 8000
|
|
136
|
+
|
|
137
|
+
# Option 2: Programmatic patching
|
|
138
|
+
from semblend.integration.sglang.radix_patcher import patch_radix_cache
|
|
139
|
+
patch_radix_cache() # Call before SGLang starts
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
## How It Works
|
|
143
|
+
|
|
144
|
+
```
|
|
145
|
+
Request → MiniLM Embed (5ms) → Cosine Search (1ms) → Align (1ms) → Inject KV
|
|
146
|
+
↓ ↓ ↓
|
|
147
|
+
384-dim vector Find similar donor Match chunk boundaries
|
|
148
|
+
in donor store via MD5 hash alignment
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
1. **Embed**: Compute a 384-dim MiniLM-L6-v2 embedding (sliding-window for long docs)
|
|
152
|
+
2. **Search**: Brute-force cosine similarity against the donor store (<1ms at 1K donors)
|
|
153
|
+
3. **Align**: MD5 chunk hashing at 256-token boundaries finds reusable KV chunks
|
|
154
|
+
4. **Inject**: Replace target token IDs with donor token IDs — LMCache/RadixCache finds cached KV
|
|
155
|
+
|
|
156
|
+
## When SemBlend Helps
|
|
157
|
+
|
|
158
|
+
SemBlend is most effective for workloads where prompts share a large common context:
|
|
159
|
+
|
|
160
|
+
- **Document Q&A / RAG**: Same retrieved documents, different questions
|
|
161
|
+
- **Summarization**: Same article, different instruction phrasing
|
|
162
|
+
- **Multi-turn dialogue**: Conversation history grows across turns
|
|
163
|
+
- **Code completion**: Shared repository context across requests
|
|
164
|
+
|
|
165
|
+
SemBlend has minimal overhead on dissimilar workloads (e.g. code generation from scratch: measured 0% hit, 0.96x — 4% overhead from embedding + search).
|
|
166
|
+
|
|
167
|
+
## License
|
|
168
|
+
|
|
169
|
+
Business Source License 1.1 (BSL-1.1). Free for non-production use including testing, development, evaluation, and academic research. Production use requires a commercial license from WorldFlow AI. Converts to Apache License 2.0 on 2030-03-16.
|
|
170
|
+
|
|
171
|
+
Contact: licensing@worldflow.ai
|
|
172
|
+
|
|
semblend-0.1.0/pyproject.toml
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "semblend"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Semantic KV cache reuse for LLM inference engines (vLLM, SGLang)"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.10"
|
|
7
|
+
license = {file = "LICENSE"}
|
|
8
|
+
authors = [
|
|
9
|
+
{ name = "WorldFlow AI" },
|
|
10
|
+
]
|
|
11
|
+
keywords = ["llm", "inference", "kv-cache", "vllm", "sglang", "semantic-cache"]
|
|
12
|
+
classifiers = [
|
|
13
|
+
"Development Status :: 4 - Beta",
|
|
14
|
+
"Intended Audience :: Developers",
|
|
15
|
+
"Intended Audience :: Science/Research",
|
|
16
|
+
"License :: Other/Proprietary License",
|
|
17
|
+
"Programming Language :: Python :: 3",
|
|
18
|
+
"Programming Language :: Python :: 3.10",
|
|
19
|
+
"Programming Language :: Python :: 3.11",
|
|
20
|
+
"Programming Language :: Python :: 3.12",
|
|
21
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
22
|
+
]
|
|
23
|
+
|
|
24
|
+
dependencies = [
|
|
25
|
+
"numpy>=1.24",
|
|
26
|
+
"rapidfuzz>=3.0",
|
|
27
|
+
]
|
|
28
|
+
|
|
29
|
+
[project.urls]
|
|
30
|
+
Homepage = "https://worldflow.ai/semblend"
|
|
31
|
+
Repository = "https://github.com/worldflowai/semblend"
|
|
32
|
+
Documentation = "https://docs.worldflow.ai/semblend"
|
|
33
|
+
"Bug Tracker" = "https://github.com/worldflowai/semblend/issues"
|
|
34
|
+
|
|
35
|
+
[project.scripts]
|
|
36
|
+
semblend-sglang = "semblend.integration.sglang.launch_semblend_sglang:main"
|
|
37
|
+
|
|
38
|
+
[project.optional-dependencies]
|
|
39
|
+
embedder = [
|
|
40
|
+
"sentence-transformers>=3.0",
|
|
41
|
+
"onnxruntime>=1.15",
|
|
42
|
+
]
|
|
43
|
+
gpu = [
|
|
44
|
+
"torch>=2.0",
|
|
45
|
+
"triton>=2.0",
|
|
46
|
+
]
|
|
47
|
+
vllm = [
|
|
48
|
+
"vllm>=0.8.0",
|
|
49
|
+
"torch>=2.0",
|
|
50
|
+
"sentence-transformers>=3.0",
|
|
51
|
+
]
|
|
52
|
+
sglang = [
|
|
53
|
+
"sglang>=0.4.0",
|
|
54
|
+
"torch>=2.0",
|
|
55
|
+
"sentence-transformers>=3.0",
|
|
56
|
+
]
|
|
57
|
+
dev = [
|
|
58
|
+
"pytest>=8.0",
|
|
59
|
+
"pytest-asyncio>=0.23",
|
|
60
|
+
"ruff>=0.4",
|
|
61
|
+
]
|
|
62
|
+
benchmarks = [
|
|
63
|
+
"aiohttp>=3.9",
|
|
64
|
+
"datasets>=2.16",
|
|
65
|
+
"tqdm>=4.66",
|
|
66
|
+
"rich>=13.7",
|
|
67
|
+
"rouge-score>=0.1.2",
|
|
68
|
+
"transformers>=4.40",
|
|
69
|
+
]
|
|
70
|
+
|
|
71
|
+
[build-system]
|
|
72
|
+
requires = ["setuptools>=68"]
|
|
73
|
+
build-backend = "setuptools.build_meta"
|
|
74
|
+
|
|
75
|
+
[tool.setuptools.packages.find]
|
|
76
|
+
include = ["semblend*", "semblend_core*", "synapse_kv_connector*"]
|
|
77
|
+
exclude = ["benchmarks*", "paper*", "patent*", "infra*", "literature*", "results*", "tests*", "synapse_kv_connector.tests*", "scripts*"]
|
|
78
|
+
|
|
79
|
+
[tool.pytest.ini_options]
|
|
80
|
+
testpaths = ["tests", "synapse_kv_connector/tests"]
|
|
81
|
+
addopts = "--ignore=synapse_kv_connector/tests/test_rope_correction.py --ignore=synapse_kv_connector/tests/test_triton_kernels.py"
|
|
82
|
+
|
|
83
|
+
[tool.ruff]
|
|
84
|
+
line-length = 100
|
|
85
|
+
target-version = "py310"
|
|
86
|
+
|
|
87
|
+
[tool.ruff.lint]
|
|
88
|
+
select = ["E", "F", "I", "W"]
|
|
89
|
+
ignore = ["E501"]
|