contextpilot 0.3.1__tar.gz → 0.3.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- contextpilot-0.3.2/PKG-INFO +239 -0
- contextpilot-0.3.2/README.md +198 -0
- {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/__init__.py +4 -1
- {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/context_index/index_construction.py +81 -9
- {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/server/http_client.py +31 -17
- {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/server/http_server.py +2 -1
- {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/server/live_index.py +324 -55
- contextpilot-0.3.2/contextpilot.egg-info/PKG-INFO +239 -0
- {contextpilot-0.3.1 → contextpilot-0.3.2}/pyproject.toml +1 -1
- contextpilot-0.3.1/PKG-INFO +0 -137
- contextpilot-0.3.1/README.md +0 -96
- contextpilot-0.3.1/contextpilot.egg-info/PKG-INFO +0 -137
- {contextpilot-0.3.1 → contextpilot-0.3.2}/LICENSE +0 -0
- {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/context_index/__init__.py +0 -0
- {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/context_index/compute_distance_cpu.py +0 -0
- {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/context_index/compute_distance_gpu.py +0 -0
- {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/context_index/tree_nodes.py +0 -0
- {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/context_ordering/__init__.py +0 -0
- {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/context_ordering/inter_scheduler.py +0 -0
- {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/context_ordering/intra_ordering.py +0 -0
- {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/pipeline/__init__.py +0 -0
- {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/pipeline/components.py +0 -0
- {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/pipeline/multi_turn.py +0 -0
- {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/pipeline/rag_pipeline.py +0 -0
- {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/retriever/__init__.py +0 -0
- {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/retriever/bm25.py +0 -0
- {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/retriever/faiss_embedding.py +0 -0
- {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/retriever/mem0_retriever.py +0 -0
- {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/retriever/pageindex_retriever.py +0 -0
- {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/server/__init__.py +0 -0
- {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/server/conversation_tracker.py +0 -0
- {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/server/eviction_heap.py +0 -0
- {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/server/metadata.py +0 -0
- {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/utils/__init__.py +0 -0
- {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/utils/eval_metrics.py +0 -0
- {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/utils/prompt_generator.py +0 -0
- {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/utils/tools.py +0 -0
- {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot.egg-info/SOURCES.txt +0 -0
- {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot.egg-info/dependency_links.txt +0 -0
- {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot.egg-info/requires.txt +0 -0
- {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot.egg-info/top_level.txt +0 -0
- {contextpilot-0.3.1 → contextpilot-0.3.2}/requirements.txt +0 -0
- {contextpilot-0.3.1 → contextpilot-0.3.2}/setup.cfg +0 -0
- {contextpilot-0.3.1 → contextpilot-0.3.2}/tests/test_context_index.py +0 -0
- {contextpilot-0.3.1 → contextpilot-0.3.2}/tests/test_context_ordering.py +0 -0
- {contextpilot-0.3.1 → contextpilot-0.3.2}/tests/test_cpu_distances.py +0 -0
- {contextpilot-0.3.1 → contextpilot-0.3.2}/tests/test_deduplication.py +0 -0
- {contextpilot-0.3.1 → contextpilot-0.3.2}/tests/test_gpu_distance_performance.py +0 -0
- {contextpilot-0.3.1 → contextpilot-0.3.2}/tests/test_gpu_distances.py +0 -0
- {contextpilot-0.3.1 → contextpilot-0.3.2}/tests/test_group_prefix_sharing.py +0 -0
- {contextpilot-0.3.1 → contextpilot-0.3.2}/tests/test_incremental_build.py +0 -0
- {contextpilot-0.3.1 → contextpilot-0.3.2}/tests/test_live_index.py +0 -0
- {contextpilot-0.3.1 → contextpilot-0.3.2}/tests/test_mem0_integration.py +0 -0
- {contextpilot-0.3.1 → contextpilot-0.3.2}/tests/test_multi_turn.py +0 -0
- {contextpilot-0.3.1 → contextpilot-0.3.2}/tests/test_multi_turn_e2e.py +0 -0
- {contextpilot-0.3.1 → contextpilot-0.3.2}/tests/test_pageindex_integration.py +0 -0
- {contextpilot-0.3.1 → contextpilot-0.3.2}/tests/test_performance.py +0 -0
- {contextpilot-0.3.1 → contextpilot-0.3.2}/tests/test_pipeline.py +0 -0
- {contextpilot-0.3.1 → contextpilot-0.3.2}/tests/test_server_integration.py +0 -0
- {contextpilot-0.3.1 → contextpilot-0.3.2}/tests/test_utils.py +0 -0
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: contextpilot
|
|
3
|
+
Version: 0.3.2
|
|
4
|
+
Summary: Efficient Retrieval-Augmented Generation with Accuracy-Preserving Context Reuse
|
|
5
|
+
Author: Yinsicheng Jiang, Chivier Humber
|
|
6
|
+
License: Apache-2.0
|
|
7
|
+
Project-URL: Homepage, https://github.com/SecretSettler/ContextPilot
|
|
8
|
+
Project-URL: Repository, https://github.com/SecretSettler/ContextPilot
|
|
9
|
+
Project-URL: Issues, https://github.com/SecretSettler/ContextPilot/issues
|
|
10
|
+
Keywords: rag,llm,context-reuse,kv-cache,retrieval-augmented-generation
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
20
|
+
Requires-Python: >=3.10
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Requires-Dist: datasets
|
|
24
|
+
Requires-Dist: transformers
|
|
25
|
+
Requires-Dist: elasticsearch==8.18.1
|
|
26
|
+
Requires-Dist: aiohttp
|
|
27
|
+
Requires-Dist: ujson
|
|
28
|
+
Requires-Dist: scipy
|
|
29
|
+
Requires-Dist: fastapi[all]
|
|
30
|
+
Requires-Dist: cupy-cuda12x
|
|
31
|
+
Requires-Dist: pytest
|
|
32
|
+
Provides-Extra: dev
|
|
33
|
+
Requires-Dist: black; extra == "dev"
|
|
34
|
+
Requires-Dist: bumpver; extra == "dev"
|
|
35
|
+
Requires-Dist: isort; extra == "dev"
|
|
36
|
+
Requires-Dist: pip-tools; extra == "dev"
|
|
37
|
+
Requires-Dist: pytest; extra == "dev"
|
|
38
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
39
|
+
Requires-Dist: ipython; extra == "dev"
|
|
40
|
+
Dynamic: license-file
|
|
41
|
+
|
|
42
|
+
<div align="center">
|
|
43
|
+
<img src="assets/about.png" alt="ContextPilot Logo" width="800"/>
|
|
44
|
+
|
|
45
|
+
<h1><strong>ContextPilot: Efficient Long Context Inference with Context Reuse</strong></h1>
|
|
46
|
+
|
|
47
|
+
[](https://www.python.org/)
|
|
48
|
+
[](https://pypi.org/project/contextpilot/)
|
|
49
|
+
[](LICENSE)
|
|
50
|
+
|
|
51
|
+
</div>
|
|
52
|
+
|
|
53
|
+
--------------------------------------------------------------------------------
|
|
54
|
+
|
|
55
|
+
| [**Documentation**](docs/README.md) | [**Examples**](examples/) | [**Benchmarks**](docs/reference/benchmarks.md) |
|
|
56
|
+
|
|
57
|
+
## News
|
|
58
|
+
|
|
59
|
+
- [2026/02] ContextPilot v0.3.2 released, supporting [PageIndex](https://github.com/VectifyAI/PageIndex) and [Mem0](https://github.com/mem0ai/mem0).
|
|
60
|
+
- [2026/01] ContextPilot has been accepted to MLSys 2026 🎉! See you in Bellevue, WA, USA.
|
|
61
|
+
- [2025/12] ContextPilot v0.2.0 released.
|
|
62
|
+
|
|
63
|
+
## About
|
|
64
|
+
|
|
65
|
+
ContextPilot is a fast optimization system on the context engineering layer for agentic workloads:
|
|
66
|
+
1. **High Throughput & Cache Hit Ratio**: Boosting prefill throughput and prefix cache hit ratio with intelligent context reuse.
|
|
67
|
+
2. **Strong Compatibility**: Strong compatibility with existing popular RAG libraries (PageIndex), Agentic memory layer (Mem0), KV cache optimization engine (LMCache), and Inference engines (vLLM and SGLang).
|
|
68
|
+
3. **Negligible Accuracy Loss**: Achieving significant performance improvements with minimal to no accuracy degradation across various benchmarks.
|
|
69
|
+
4. **Widely Tested**: Tested with a wide range of RAG and Agentic AI applications.
|
|
70
|
+
|
|
71
|
+
## Target Workloads
|
|
72
|
+
|
|
73
|
+
1. **Trending Topic QA** — Search and generation for breaking news and hot topics beyond model knowledge
|
|
74
|
+
2. **Closed-Domain Long-Context QA** — QA over specialized corpora (novels, financial reports, legal documents) with retrieval or in-context search
|
|
75
|
+
3. **Large-Batch Long-Context Execution** — High-throughput inference where many requests share overlapping contexts; ContextPilot maximizes prefix reuse regardless of the search method
|
|
76
|
+
4. **Multi-Turn Conversations with Long-Term Memory** — Persistent context reuse across turns (e.g. [Mem0](https://github.com/mem0ai/mem0))
|
|
77
|
+
|
|
78
|
+
## Benchmark and Performance
|
|
79
|
+
|
|
80
|
+
### System Performance
|
|
81
|
+
|
|
82
|
+
<div align="center">
|
|
83
|
+
<img src="assets/deepseek_r1_results.png" alt="Benchmark Results" width="600"/>
|
|
84
|
+
</div>
|
|
85
|
+
|
|
86
|
+
ContextPilot (Stateless) on DeepSeek-R1 maintains accuracy compared to SGLang, achieving 64.68% vs 64.15% F1 on MultihopRAG and 41.08% vs 40.20% F1 on NarrativeQA.
|
|
87
|
+
|
|
88
|
+
### Accuracy on MT-RAG Benchmark (Online Scheduling)
|
|
89
|
+
|
|
90
|
+
<div align="center">
|
|
91
|
+
|
|
92
|
+
| Method | Qwen3-4B | Llama3.1-8B | Qwen3-30B-A3B |
|
|
93
|
+
|--------|----------|-------------|-----------|
|
|
94
|
+
| LMCache | 62.56 | **68.46** | 75.12 |
|
|
95
|
+
| CacheBlend | 50.33 | 56.52 | X |
|
|
96
|
+
| RadixCache | 62.56 | **68.46** | 75.12 |
|
|
97
|
+
| **ContextPilot** | **64.27** | 68.12 | **75.81** |
|
|
98
|
+
|
|
99
|
+
</div>
|
|
100
|
+
|
|
101
|
+
ContextPilot delivers **4-13x** improvements in cache hit rates and **1.5-3.5x** reductions in prefill latency for large-batch RAG workloads, while maintaining or improving accuracy.
|
|
102
|
+
|
|
103
|
+
**Furthermore**, ContextPilot has been tested to reduce input token costs by around **36%** with GPT-5.2.
|
|
104
|
+
|
|
105
|
+
See [Benchmarks](docs/reference/benchmarks.md) in the documentation for GPU vs CPU performance analysis and detailed benchmark methodology.
|
|
106
|
+
|
|
107
|
+
## Getting Started
|
|
108
|
+
|
|
109
|
+
### Installation
|
|
110
|
+
|
|
111
|
+
**Requirements:** Python >= 3.10
|
|
112
|
+
|
|
113
|
+
```bash
|
|
114
|
+
pip install contextpilot
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
Or install from source:
|
|
118
|
+
```bash
|
|
119
|
+
git clone https://github.com/Edinburgh-AgenticAI/ContextPilot.git
|
|
120
|
+
cd ContextPilot
|
|
121
|
+
pip install -e .
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
More [detailed installation instructions](docs/getting_started/installation.md) are available in the docs.
|
|
125
|
+
|
|
126
|
+
### Quick Start
|
|
127
|
+
|
|
128
|
+
**Offline / Online Stateless** — build index & schedule in one shot:
|
|
129
|
+
|
|
130
|
+
```python
|
|
131
|
+
from openai import OpenAI
|
|
132
|
+
import contextpilot as cp
|
|
133
|
+
|
|
134
|
+
client = OpenAI(base_url="http://localhost:30000/v1", api_key="...") # Your inference engine URL and API key
|
|
135
|
+
|
|
136
|
+
queries = ["What is AI?", "Explain neural networks", "What is deep learning?"]
|
|
137
|
+
all_contexts = [
|
|
138
|
+
["Doc about AI", "Doc about ML", "Doc about computing"],
|
|
139
|
+
["Doc about neural nets", "Doc about deep learning"],
|
|
140
|
+
["Doc about ML", "Doc about AI", "Doc about deep learning basics"],
|
|
141
|
+
]
|
|
142
|
+
|
|
143
|
+
# Build index and schedule for prefix sharing
|
|
144
|
+
index = cp.build_context_index(all_contexts, use_gpu=False)
|
|
145
|
+
reordered, _, order, _ = cp.InterContextScheduler().schedule_contexts(index)
|
|
146
|
+
|
|
147
|
+
# Send in optimized order — shared prefixes hit KV cache
|
|
148
|
+
for ctx, orig_idx in zip(reordered, order):
|
|
149
|
+
docs_section = "\n".join(f"[{i+1}] {doc}" for i, doc in enumerate(ctx))
|
|
150
|
+
# Importance ranking restores original retrieval order for the model
|
|
151
|
+
importance_ranking = ">".join(
|
|
152
|
+
str(ctx.index(doc) + 1) for doc in all_contexts[orig_idx] if doc in ctx
|
|
153
|
+
)
|
|
154
|
+
response = client.chat.completions.create(
|
|
155
|
+
model="Qwen/Qwen3-4B",
|
|
156
|
+
messages=[
|
|
157
|
+
{"role": "system", "content": (
|
|
158
|
+
f"Answer the question based on the provided documents.\n\n"
|
|
159
|
+
f"<documents>\n{docs_section}\n</documents>\n\n"
|
|
160
|
+
f"Read the documents in this importance ranking: {importance_ranking}\n"
|
|
161
|
+
f"Prioritize information from higher-ranked documents."
|
|
162
|
+
)},
|
|
163
|
+
{"role": "user", "content": queries[orig_idx]},
|
|
164
|
+
],
|
|
165
|
+
)
|
|
166
|
+
print(f"Q: {queries[orig_idx]}\nA: {response.choices[0].message.content}\n")
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
> For online stateless scheduling via HTTP server, see the [online usage guide](docs/guides/online_usage.md).
|
|
170
|
+
|
|
171
|
+
**Stateful** — `LiveContextIndex` tracks cached state:
|
|
172
|
+
|
|
173
|
+
```python
|
|
174
|
+
from openai import OpenAI
|
|
175
|
+
import contextpilot as cp
|
|
176
|
+
|
|
177
|
+
client = OpenAI(base_url="http://localhost:30000/v1", api_key="...")
|
|
178
|
+
live = cp.LiveContextIndex(use_gpu=False)
|
|
179
|
+
|
|
180
|
+
# Simulate multi-turn: each turn has batch_size=1
|
|
181
|
+
turns = [
|
|
182
|
+
{
|
|
183
|
+
"query": "What is AI?",
|
|
184
|
+
"contexts": [["Doc about AI", "Doc about ML", "Doc about computing"]],
|
|
185
|
+
},
|
|
186
|
+
{
|
|
187
|
+
"query": "Compare supervised and unsupervised learning",
|
|
188
|
+
# 2 of 3 docs overlap with Turn 1 ("Doc about AI", "Doc about ML"), different order + 1 new doc
|
|
189
|
+
"contexts": [["Doc about ML", "Doc about clustering", "Doc about AI"]],
|
|
190
|
+
},
|
|
191
|
+
]
|
|
192
|
+
|
|
193
|
+
for turn_idx, turn in enumerate(turns):
|
|
194
|
+
contexts = turn["contexts"]
|
|
195
|
+
query = turn["query"]
|
|
196
|
+
|
|
197
|
+
# build_incremental handles both cold start and incremental turns
|
|
198
|
+
result = live.build_incremental(contexts)
|
|
199
|
+
reordered = result['reordered_contexts']
|
|
200
|
+
# Turn 2: reordered to ["Doc about AI", "Doc about ML", "Doc about clustering"]
|
|
201
|
+
# ^— shared prefix from Turn 1 —^ ^— new doc appended
|
|
202
|
+
|
|
203
|
+
ctx = reordered[0]
|
|
204
|
+
docs_section = "\n".join(f"[{i+1}] {doc}" for i, doc in enumerate(ctx))
|
|
205
|
+
importance_ranking = ">".join(
|
|
206
|
+
str(ctx.index(doc) + 1) for doc in contexts[0] if doc in ctx
|
|
207
|
+
)
|
|
208
|
+
response = client.chat.completions.create(
|
|
209
|
+
model="Qwen/Qwen3-4B",
|
|
210
|
+
messages=[
|
|
211
|
+
{"role": "system", "content": (
|
|
212
|
+
f"Answer the question based on the provided documents.\n\n"
|
|
213
|
+
f"<documents>\n{docs_section}\n</documents>\n\n"
|
|
214
|
+
f"Read the documents in this importance ranking: {importance_ranking}\n"
|
|
215
|
+
f"Prioritize information from higher-ranked documents."
|
|
216
|
+
)},
|
|
217
|
+
{"role": "user", "content": query},
|
|
218
|
+
],
|
|
219
|
+
)
|
|
220
|
+
print(f"[Turn {turn_idx+1}] Q: {query}")
|
|
221
|
+
print(f"A: {response.choices[0].message.content}\n")
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
> **Note:** Stateful mode works without eviction sync — `LiveContextIndex` tracks the previous ordering and reorders new contexts to maximize prefix cache hits. For production deployments with limited storage size where the KV cache may evict entries, install the [SGLang eviction patch](docs/guides/online_usage.md#sglang-integration) to keep the index in sync. See the [online usage guide](docs/guides/online_usage.md) for HTTP server setup.
|
|
225
|
+
|
|
226
|
+
## Documentation
|
|
227
|
+
|
|
228
|
+
Check out the ContextPilot [documentation](docs/README.md) for comprehensive guides.
|
|
229
|
+
|
|
230
|
+
## Examples
|
|
231
|
+
|
|
232
|
+
Go hands-on with our [examples](examples/), demonstrating how to address different use cases with ContextPilot.
|
|
233
|
+
|
|
234
|
+
## Contributing
|
|
235
|
+
|
|
236
|
+
We welcome and value all contributions! Please feel free to submit issues and pull requests.
|
|
237
|
+
|
|
238
|
+
## Citation
|
|
239
|
+
We will include the paper citation soon!
|
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
<div align="center">
|
|
2
|
+
<img src="assets/about.png" alt="ContextPilot Logo" width="800"/>
|
|
3
|
+
|
|
4
|
+
<h1><strong>ContextPilot: Efficient Long Context Inference with Context Reuse</strong></h1>
|
|
5
|
+
|
|
6
|
+
[](https://www.python.org/)
|
|
7
|
+
[](https://pypi.org/project/contextpilot/)
|
|
8
|
+
[](LICENSE)
|
|
9
|
+
|
|
10
|
+
</div>
|
|
11
|
+
|
|
12
|
+
--------------------------------------------------------------------------------
|
|
13
|
+
|
|
14
|
+
| [**Documentation**](docs/README.md) | [**Examples**](examples/) | [**Benchmarks**](docs/reference/benchmarks.md) |
|
|
15
|
+
|
|
16
|
+
## News
|
|
17
|
+
|
|
18
|
+
- [2026/02] ContextPilot v0.3.2 released, supporting [PageIndex](https://github.com/VectifyAI/PageIndex) and [Mem0](https://github.com/mem0ai/mem0).
|
|
19
|
+
- [2026/01] ContextPilot has been accepted to MLSys 2026 🎉! See you in Bellevue, WA, USA.
|
|
20
|
+
- [2025/12] ContextPilot v0.2.0 released.
|
|
21
|
+
|
|
22
|
+
## About
|
|
23
|
+
|
|
24
|
+
ContextPilot is a fast optimization system on the context engineering layer for agentic workloads:
|
|
25
|
+
1. **High Throughput & Cache Hit Ratio**: Boosting prefill throughput and prefix cache hit ratio with intelligent context reuse.
|
|
26
|
+
2. **Strong Compatibility**: Strong compatibility with existing popular RAG libraries (PageIndex), Agentic memory layer (Mem0), KV cache optimization engine (LMCache), and Inference engines (vLLM and SGLang).
|
|
27
|
+
3. **Negligible Accuracy Loss**: Achieving significant performance improvements with minimal to no accuracy degradation across various benchmarks.
|
|
28
|
+
4. **Widely Tested**: Tested with a wide range of RAG and Agentic AI applications.
|
|
29
|
+
|
|
30
|
+
## Target Workloads
|
|
31
|
+
|
|
32
|
+
1. **Trending Topic QA** — Search and generation for breaking news and hot topics beyond model knowledge
|
|
33
|
+
2. **Closed-Domain Long-Context QA** — QA over specialized corpora (novels, financial reports, legal documents) with retrieval or in-context search
|
|
34
|
+
3. **Large-Batch Long-Context Execution** — High-throughput inference where many requests share overlapping contexts; ContextPilot maximizes prefix reuse regardless of the search method
|
|
35
|
+
4. **Multi-Turn Conversations with Long-Term Memory** — Persistent context reuse across turns (e.g. [Mem0](https://github.com/mem0ai/mem0))
|
|
36
|
+
|
|
37
|
+
## Benchmark and Performance
|
|
38
|
+
|
|
39
|
+
### System Performance
|
|
40
|
+
|
|
41
|
+
<div align="center">
|
|
42
|
+
<img src="assets/deepseek_r1_results.png" alt="Benchmark Results" width="600"/>
|
|
43
|
+
</div>
|
|
44
|
+
|
|
45
|
+
ContextPilot (Stateless) on DeepSeek-R1 maintains accuracy compared to SGLang, achieving 64.68% vs 64.15% F1 on MultihopRAG and 41.08% vs 40.20% F1 on NarrativeQA.
|
|
46
|
+
|
|
47
|
+
### Accuracy on MT-RAG Benchmark (Online Scheduling)
|
|
48
|
+
|
|
49
|
+
<div align="center">
|
|
50
|
+
|
|
51
|
+
| Method | Qwen3-4B | Llama3.1-8B | Qwen3-30B-A3B |
|
|
52
|
+
|--------|----------|-------------|-----------|
|
|
53
|
+
| LMCache | 62.56 | **68.46** | 75.12 |
|
|
54
|
+
| CacheBlend | 50.33 | 56.52 | X |
|
|
55
|
+
| RadixCache | 62.56 | **68.46** | 75.12 |
|
|
56
|
+
| **ContextPilot** | **64.27** | 68.12 | **75.81** |
|
|
57
|
+
|
|
58
|
+
</div>
|
|
59
|
+
|
|
60
|
+
ContextPilot delivers **4-13x** improvements in cache hit rates and **1.5-3.5x** reductions in prefill latency for large-batch RAG workloads, while maintaining or improving accuracy.
|
|
61
|
+
|
|
62
|
+
**Furthermore**, ContextPilot has been tested to reduce input token costs by around **36%** with GPT-5.2.
|
|
63
|
+
|
|
64
|
+
See [Benchmarks](docs/reference/benchmarks.md) in the documentation for GPU vs CPU performance analysis and detailed benchmark methodology.
|
|
65
|
+
|
|
66
|
+
## Getting Started
|
|
67
|
+
|
|
68
|
+
### Installation
|
|
69
|
+
|
|
70
|
+
**Requirements:** Python >= 3.10
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
pip install contextpilot
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
Or install from source:
|
|
77
|
+
```bash
|
|
78
|
+
git clone https://github.com/Edinburgh-AgenticAI/ContextPilot.git
|
|
79
|
+
cd ContextPilot
|
|
80
|
+
pip install -e .
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
More [detailed installation instructions](docs/getting_started/installation.md) are available in the docs.
|
|
84
|
+
|
|
85
|
+
### Quick Start
|
|
86
|
+
|
|
87
|
+
**Offline / Online Stateless** — build index & schedule in one shot:
|
|
88
|
+
|
|
89
|
+
```python
|
|
90
|
+
from openai import OpenAI
|
|
91
|
+
import contextpilot as cp
|
|
92
|
+
|
|
93
|
+
client = OpenAI(base_url="http://localhost:30000/v1", api_key="...") # Your inference engine URL and API key
|
|
94
|
+
|
|
95
|
+
queries = ["What is AI?", "Explain neural networks", "What is deep learning?"]
|
|
96
|
+
all_contexts = [
|
|
97
|
+
["Doc about AI", "Doc about ML", "Doc about computing"],
|
|
98
|
+
["Doc about neural nets", "Doc about deep learning"],
|
|
99
|
+
["Doc about ML", "Doc about AI", "Doc about deep learning basics"],
|
|
100
|
+
]
|
|
101
|
+
|
|
102
|
+
# Build index and schedule for prefix sharing
|
|
103
|
+
index = cp.build_context_index(all_contexts, use_gpu=False)
|
|
104
|
+
reordered, _, order, _ = cp.InterContextScheduler().schedule_contexts(index)
|
|
105
|
+
|
|
106
|
+
# Send in optimized order — shared prefixes hit KV cache
|
|
107
|
+
for ctx, orig_idx in zip(reordered, order):
|
|
108
|
+
docs_section = "\n".join(f"[{i+1}] {doc}" for i, doc in enumerate(ctx))
|
|
109
|
+
# Importance ranking restores original retrieval order for the model
|
|
110
|
+
importance_ranking = ">".join(
|
|
111
|
+
str(ctx.index(doc) + 1) for doc in all_contexts[orig_idx] if doc in ctx
|
|
112
|
+
)
|
|
113
|
+
response = client.chat.completions.create(
|
|
114
|
+
model="Qwen/Qwen3-4B",
|
|
115
|
+
messages=[
|
|
116
|
+
{"role": "system", "content": (
|
|
117
|
+
f"Answer the question based on the provided documents.\n\n"
|
|
118
|
+
f"<documents>\n{docs_section}\n</documents>\n\n"
|
|
119
|
+
f"Read the documents in this importance ranking: {importance_ranking}\n"
|
|
120
|
+
f"Prioritize information from higher-ranked documents."
|
|
121
|
+
)},
|
|
122
|
+
{"role": "user", "content": queries[orig_idx]},
|
|
123
|
+
],
|
|
124
|
+
)
|
|
125
|
+
print(f"Q: {queries[orig_idx]}\nA: {response.choices[0].message.content}\n")
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
> For online stateless scheduling via HTTP server, see the [online usage guide](docs/guides/online_usage.md).
|
|
129
|
+
|
|
130
|
+
**Stateful** — `LiveContextIndex` tracks cached state:
|
|
131
|
+
|
|
132
|
+
```python
|
|
133
|
+
from openai import OpenAI
|
|
134
|
+
import contextpilot as cp
|
|
135
|
+
|
|
136
|
+
client = OpenAI(base_url="http://localhost:30000/v1", api_key="...")
|
|
137
|
+
live = cp.LiveContextIndex(use_gpu=False)
|
|
138
|
+
|
|
139
|
+
# Simulate multi-turn: each turn has batch_size=1
|
|
140
|
+
turns = [
|
|
141
|
+
{
|
|
142
|
+
"query": "What is AI?",
|
|
143
|
+
"contexts": [["Doc about AI", "Doc about ML", "Doc about computing"]],
|
|
144
|
+
},
|
|
145
|
+
{
|
|
146
|
+
"query": "Compare supervised and unsupervised learning",
|
|
147
|
+
# 2 of 3 docs overlap with Turn 1 ("Doc about AI", "Doc about ML"), different order + 1 new doc
|
|
148
|
+
"contexts": [["Doc about ML", "Doc about clustering", "Doc about AI"]],
|
|
149
|
+
},
|
|
150
|
+
]
|
|
151
|
+
|
|
152
|
+
for turn_idx, turn in enumerate(turns):
|
|
153
|
+
contexts = turn["contexts"]
|
|
154
|
+
query = turn["query"]
|
|
155
|
+
|
|
156
|
+
# build_incremental handles both cold start and incremental turns
|
|
157
|
+
result = live.build_incremental(contexts)
|
|
158
|
+
reordered = result['reordered_contexts']
|
|
159
|
+
# Turn 2: reordered to ["Doc about AI", "Doc about ML", "Doc about clustering"]
|
|
160
|
+
# ^— shared prefix from Turn 1 —^ ^— new doc appended
|
|
161
|
+
|
|
162
|
+
ctx = reordered[0]
|
|
163
|
+
docs_section = "\n".join(f"[{i+1}] {doc}" for i, doc in enumerate(ctx))
|
|
164
|
+
importance_ranking = ">".join(
|
|
165
|
+
str(ctx.index(doc) + 1) for doc in contexts[0] if doc in ctx
|
|
166
|
+
)
|
|
167
|
+
response = client.chat.completions.create(
|
|
168
|
+
model="Qwen/Qwen3-4B",
|
|
169
|
+
messages=[
|
|
170
|
+
{"role": "system", "content": (
|
|
171
|
+
f"Answer the question based on the provided documents.\n\n"
|
|
172
|
+
f"<documents>\n{docs_section}\n</documents>\n\n"
|
|
173
|
+
f"Read the documents in this importance ranking: {importance_ranking}\n"
|
|
174
|
+
f"Prioritize information from higher-ranked documents."
|
|
175
|
+
)},
|
|
176
|
+
{"role": "user", "content": query},
|
|
177
|
+
],
|
|
178
|
+
)
|
|
179
|
+
print(f"[Turn {turn_idx+1}] Q: {query}")
|
|
180
|
+
print(f"A: {response.choices[0].message.content}\n")
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
> **Note:** Stateful mode works without eviction sync — `LiveContextIndex` tracks the previous ordering and reorders new contexts to maximize prefix cache hits. For production deployments with limited storage size where the KV cache may evict entries, install the [SGLang eviction patch](docs/guides/online_usage.md#sglang-integration) to keep the index in sync. See the [online usage guide](docs/guides/online_usage.md) for HTTP server setup.
|
|
184
|
+
|
|
185
|
+
## Documentation
|
|
186
|
+
|
|
187
|
+
Check out the ContextPilot [documentation](docs/README.md) for comprehensive guides.
|
|
188
|
+
|
|
189
|
+
## Examples
|
|
190
|
+
|
|
191
|
+
Go hands-on with our [examples](examples/), demonstrating how to address different use cases with ContextPilot.
|
|
192
|
+
|
|
193
|
+
## Contributing
|
|
194
|
+
|
|
195
|
+
We welcome and value all contributions! Please feel free to submit issues and pull requests.
|
|
196
|
+
|
|
197
|
+
## Citation
|
|
198
|
+
We will include the paper citation soon!
|
|
@@ -38,6 +38,8 @@ from .context_ordering import (
|
|
|
38
38
|
InterContextScheduler,
|
|
39
39
|
)
|
|
40
40
|
|
|
41
|
+
from .server.live_index import LiveContextIndex
|
|
42
|
+
|
|
41
43
|
from .retriever import (
|
|
42
44
|
BM25Retriever,
|
|
43
45
|
FAISSRetriever,
|
|
@@ -47,7 +49,7 @@ from .retriever import (
|
|
|
47
49
|
MEM0_AVAILABLE,
|
|
48
50
|
)
|
|
49
51
|
|
|
50
|
-
__version__ = "0.3.
|
|
52
|
+
__version__ = "0.3.2"
|
|
51
53
|
|
|
52
54
|
__all__ = [
|
|
53
55
|
# High-level pipeline API
|
|
@@ -63,6 +65,7 @@ __all__ = [
|
|
|
63
65
|
'build_context_index',
|
|
64
66
|
'IntraContextOrderer',
|
|
65
67
|
'InterContextScheduler',
|
|
68
|
+
'LiveContextIndex',
|
|
66
69
|
|
|
67
70
|
# Retrievers
|
|
68
71
|
'BM25Retriever',
|
|
@@ -96,6 +96,12 @@ class ContextIndex:
|
|
|
96
96
|
self.node_manager = NodeManager()
|
|
97
97
|
self.context_orderer = IntraContextOrderer()
|
|
98
98
|
|
|
99
|
+
# String-to-int mapping (auto-populated when string inputs are given)
|
|
100
|
+
self._str_to_id: dict = {}
|
|
101
|
+
self._id_to_str: dict = {}
|
|
102
|
+
self._next_str_id: int = 0
|
|
103
|
+
self._is_string_input: bool = False
|
|
104
|
+
|
|
99
105
|
if self.use_gpu:
|
|
100
106
|
print("Using GPU for distance computation")
|
|
101
107
|
else:
|
|
@@ -104,16 +110,48 @@ class ContextIndex:
|
|
|
104
110
|
else:
|
|
105
111
|
print("Using CPU for distance computation")
|
|
106
112
|
|
|
107
|
-
def
|
|
113
|
+
def _convert_to_int(self, contexts):
|
|
114
|
+
"""Convert string contexts to integer IDs if needed."""
|
|
115
|
+
if not contexts or not contexts[0]:
|
|
116
|
+
return contexts
|
|
117
|
+
if isinstance(contexts[0][0], str):
|
|
118
|
+
self._is_string_input = True
|
|
119
|
+
converted = []
|
|
120
|
+
for ctx in contexts:
|
|
121
|
+
converted_ctx = []
|
|
122
|
+
for item in ctx:
|
|
123
|
+
sid = self._str_to_id.get(item)
|
|
124
|
+
if sid is None:
|
|
125
|
+
sid = self._next_str_id
|
|
126
|
+
self._str_to_id[item] = sid
|
|
127
|
+
self._id_to_str[sid] = item
|
|
128
|
+
self._next_str_id += 1
|
|
129
|
+
converted_ctx.append(sid)
|
|
130
|
+
converted.append(converted_ctx)
|
|
131
|
+
return converted
|
|
132
|
+
return contexts
|
|
133
|
+
|
|
134
|
+
def _convert_to_str(self, contexts):
|
|
135
|
+
"""Convert integer contexts back to strings if input was strings."""
|
|
136
|
+
if not self._is_string_input or not contexts:
|
|
137
|
+
return contexts
|
|
138
|
+
# Skip if already converted (e.g. from fit_transform output)
|
|
139
|
+
if contexts[0] and isinstance(contexts[0][0], str):
|
|
140
|
+
return contexts
|
|
141
|
+
return [[self._id_to_str[i] for i in ctx] for ctx in contexts]
|
|
142
|
+
|
|
143
|
+
def fit_transform(self, contexts) -> IndexResult:
|
|
108
144
|
"""
|
|
109
145
|
Perform clustering and return results.
|
|
110
146
|
|
|
111
147
|
Args:
|
|
112
|
-
contexts: List of contexts, where each
|
|
148
|
+
contexts: List of contexts, where each context is a list of chunk IDs (int) or strings.
|
|
149
|
+
String inputs are automatically converted to integer IDs.
|
|
113
150
|
|
|
114
151
|
Returns:
|
|
115
152
|
IndexResult object containing clustering results
|
|
116
153
|
"""
|
|
154
|
+
contexts = self._convert_to_int(contexts)
|
|
117
155
|
n = len(contexts)
|
|
118
156
|
|
|
119
157
|
if n < 2:
|
|
@@ -194,14 +232,41 @@ class ContextIndex:
|
|
|
194
232
|
)
|
|
195
233
|
|
|
196
234
|
def _handle_single_prompt(self, contexts: List[List[int]]) -> IndexResult:
|
|
197
|
-
"""Handle case with less than 2 contexts.
|
|
235
|
+
"""Handle case with less than 2 contexts.
|
|
236
|
+
|
|
237
|
+
Always creates an empty root node above the leaf(s) so that
|
|
238
|
+
leaf.is_root is never True. This prevents the root-exclusion
|
|
239
|
+
guard in build_incremental from skipping legitimate matches.
|
|
240
|
+
"""
|
|
198
241
|
for i, prompt in enumerate(contexts):
|
|
199
|
-
self.node_manager.create_leaf_node(i, prompt)
|
|
242
|
+
node = self.node_manager.create_leaf_node(i, prompt)
|
|
243
|
+
# ClusterNode.__init__ sets doc_ids = sorted(content).
|
|
244
|
+
# Override to preserve the original context order so that
|
|
245
|
+
# build_incremental can use it as a correct prefix for Turn 2.
|
|
246
|
+
node.doc_ids = list(prompt)
|
|
247
|
+
|
|
248
|
+
# Wrap leaf node(s) under an empty root so that no leaf is the root.
|
|
249
|
+
# This mirrors the virtual-root logic in update_search_paths for forests,
|
|
250
|
+
# but applies it even for a single leaf.
|
|
251
|
+
leaf_ids = list(self.node_manager.unique_nodes.keys())
|
|
252
|
+
virtual_root_id = max(leaf_ids) + 1 if leaf_ids else 0
|
|
253
|
+
virtual_root = ClusterNode(
|
|
254
|
+
node_id=virtual_root_id,
|
|
255
|
+
content=set(),
|
|
256
|
+
original_indices=set(),
|
|
257
|
+
distance=0.0,
|
|
258
|
+
children=leaf_ids,
|
|
259
|
+
parent=None,
|
|
260
|
+
frequency=sum(self.node_manager.unique_nodes[nid].frequency for nid in leaf_ids)
|
|
261
|
+
)
|
|
262
|
+
self.node_manager.unique_nodes[virtual_root_id] = virtual_root
|
|
263
|
+
for nid in leaf_ids:
|
|
264
|
+
self.node_manager.unique_nodes[nid].parent = virtual_root_id
|
|
200
265
|
|
|
201
|
-
# Update search paths
|
|
266
|
+
# Update search paths (now a proper rooted tree)
|
|
202
267
|
self.node_manager.update_search_paths()
|
|
203
268
|
|
|
204
|
-
# For single context, extract search paths
|
|
269
|
+
# For single context, extract search paths
|
|
205
270
|
search_paths = self.context_orderer.extract_search_paths(
|
|
206
271
|
self.node_manager.unique_nodes, len(contexts)
|
|
207
272
|
)
|
|
@@ -233,7 +298,7 @@ class ContextIndex:
|
|
|
233
298
|
|
|
234
299
|
|
|
235
300
|
# Convenience function for backward compatibility
|
|
236
|
-
def build_context_index(contexts
|
|
301
|
+
def build_context_index(contexts,
|
|
237
302
|
linkage_method: str = "average",
|
|
238
303
|
use_gpu: bool = True,
|
|
239
304
|
alpha: float = 0.005,
|
|
@@ -243,7 +308,7 @@ def build_context_index(contexts: List[List[int]],
|
|
|
243
308
|
Convenience function for building a context index.
|
|
244
309
|
|
|
245
310
|
Args:
|
|
246
|
-
contexts: List of contexts, where each
|
|
311
|
+
contexts: List of contexts, where each context is a list of chunk IDs (int) or strings
|
|
247
312
|
linkage_method: Linkage method for hierarchical clustering
|
|
248
313
|
use_gpu: Whether to use GPU for distance computation
|
|
249
314
|
alpha: Weight for position term in distance calculation
|
|
@@ -260,4 +325,11 @@ def build_context_index(contexts: List[List[int]],
|
|
|
260
325
|
num_workers=num_workers,
|
|
261
326
|
batch_size=batch_size
|
|
262
327
|
)
|
|
263
|
-
|
|
328
|
+
result = indexer.fit_transform(contexts)
|
|
329
|
+
# Convert back to strings at the API boundary if input was strings
|
|
330
|
+
if indexer._is_string_input:
|
|
331
|
+
result.reordered_contexts = indexer._convert_to_str(result.reordered_contexts)
|
|
332
|
+
result.original_contexts = indexer._convert_to_str(result.original_contexts)
|
|
333
|
+
result.reordered_prompts = result.reordered_contexts
|
|
334
|
+
result.original_prompts = result.original_contexts
|
|
335
|
+
return result
|
|
@@ -28,11 +28,11 @@ class ContextPilotIndexClient:
|
|
|
28
28
|
Example usage in SGLang:
|
|
29
29
|
# In scheduler initialization:
|
|
30
30
|
self.contextpilot_client = ContextPilotIndexClient("http://localhost:8765")
|
|
31
|
-
|
|
32
|
-
# In eviction
|
|
33
|
-
def
|
|
34
|
-
|
|
35
|
-
self.contextpilot_client.evict(
|
|
31
|
+
|
|
32
|
+
# In eviction callback:
|
|
33
|
+
def on_cache_evict(self, evicted_request_ids):
|
|
34
|
+
# Sync eviction with ContextPilot index
|
|
35
|
+
self.contextpilot_client.evict(evicted_request_ids)
|
|
36
36
|
"""
|
|
37
37
|
|
|
38
38
|
def __init__(
|
|
@@ -92,19 +92,23 @@ class ContextPilotIndexClient:
|
|
|
92
92
|
logger.warning(f"ContextPilot index request failed: {e}")
|
|
93
93
|
return None
|
|
94
94
|
|
|
95
|
-
def evict(self,
|
|
95
|
+
def evict(self, request_ids: List[str]) -> Optional[Dict[str, Any]]:
|
|
96
96
|
"""
|
|
97
|
-
Evict
|
|
98
|
-
|
|
97
|
+
Evict requests from the index.
|
|
98
|
+
|
|
99
99
|
THIS IS THE MAIN METHOD THAT SGLANG SHOULD CALL FOR EVICTION SYNC.
|
|
100
|
-
|
|
100
|
+
|
|
101
101
|
Args:
|
|
102
|
-
|
|
103
|
-
|
|
102
|
+
request_ids: List of request IDs to evict (from SGLang's cache eviction)
|
|
103
|
+
|
|
104
104
|
Returns:
|
|
105
|
-
Dictionary with eviction results
|
|
105
|
+
Dictionary with eviction results:
|
|
106
|
+
- removed_count: Number of requests successfully removed
|
|
107
|
+
- not_found: List of request IDs that were not in the index
|
|
108
|
+
- conversations_cleared: Number of conversation chains cleared
|
|
109
|
+
Returns None if request failed
|
|
106
110
|
"""
|
|
107
|
-
return self._post("/evict", {"
|
|
111
|
+
return self._post("/evict", {"request_ids": request_ids})
|
|
108
112
|
|
|
109
113
|
def search(
|
|
110
114
|
self,
|
|
@@ -327,16 +331,26 @@ class ContextPilotIndexClient:
|
|
|
327
331
|
|
|
328
332
|
# Convenience functions for simple usage
|
|
329
333
|
|
|
330
|
-
def
|
|
334
|
+
def evict_requests(
|
|
335
|
+
request_ids: List[str],
|
|
336
|
+
server_url: str = "http://localhost:8765"
|
|
337
|
+
) -> Optional[Dict[str, Any]]:
|
|
331
338
|
"""
|
|
332
|
-
Simple function to evict
|
|
333
|
-
|
|
339
|
+
Simple function to evict requests from the index.
|
|
340
|
+
|
|
334
341
|
For one-off calls without maintaining a client instance.
|
|
342
|
+
|
|
343
|
+
Args:
|
|
344
|
+
request_ids: List of request IDs to evict
|
|
345
|
+
server_url: ContextPilot server URL
|
|
346
|
+
|
|
347
|
+
Returns:
|
|
348
|
+
Dictionary with removed_count, not_found, conversations_cleared
|
|
335
349
|
"""
|
|
336
350
|
try:
|
|
337
351
|
response = requests.post(
|
|
338
352
|
f"{server_url}/evict",
|
|
339
|
-
json={"
|
|
353
|
+
json={"request_ids": request_ids},
|
|
340
354
|
timeout=1.0
|
|
341
355
|
)
|
|
342
356
|
response.raise_for_status()
|